diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:05:51 +0000 |
commit | 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch) | |
tree | a94efe259b9009378be6d90eb30d2b019d95c194 /fs/nfsd | |
parent | Initial commit. (diff) | |
download | linux-upstream.tar.xz linux-upstream.zip |
Adding upstream version 5.10.209.upstream/5.10.209upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/nfsd')
53 files changed, 41646 insertions, 0 deletions
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig new file mode 100644 index 000000000..248f1459c --- /dev/null +++ b/fs/nfsd/Kconfig @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NFSD + tristate "NFS server support" + depends on INET + depends on FILE_LOCKING + depends on FSNOTIFY + select LOCKD + select SUNRPC + select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V2_ACL + depends on MULTIUSER + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. + + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. + +config NFSD_V2_ACL + bool + depends on NFSD + +config NFSD_V3 + bool "NFS server support for NFS version 3" + depends on NFSD + help + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. + +config NFSD_V3_ACL + bool "NFS server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. + +config NFSD_V4 + bool "NFS server support for NFS version 4" + depends on NFSD && PROC_FS + select NFSD_V3 + select FS_POSIX_ACL + select SUNRPC_GSS + select CRYPTO + select CRYPTO_MD5 + select CRYPTO_SHA256 + select GRACE_PERIOD + help + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). + + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. + +config NFSD_PNFS + bool + +config NFSD_BLOCKLAYOUT + bool "NFSv4.1 server support for pNFS block layouts" + depends on NFSD_V4 && BLOCK + select NFSD_PNFS + select EXPORTFS_BLOCK_OPS + help + This option enables support for the exporting pNFS block layouts + in the kernel's NFS server. The pNFS block layout enables NFS + clients to directly perform I/O to block devices accesible to both + the server and the clients. See RFC 5663 for more details. + + If unsure, say N. + +config NFSD_SCSILAYOUT + bool "NFSv4.1 server support for pNFS SCSI layouts" + depends on NFSD_V4 && BLOCK + select NFSD_PNFS + select EXPORTFS_BLOCK_OPS + select BLK_SCSI_REQUEST + help + This option enables support for the exporting pNFS SCSI layouts + in the kernel's NFS server. The pNFS SCSI layout enables NFS + clients to directly perform I/O to SCSI devices accesible to both + the server and the clients. See draft-ietf-nfsv4-scsi-layout for + more details. + + If unsure, say N. + +config NFSD_FLEXFILELAYOUT + bool "NFSv4.1 server support for pNFS Flex File layouts" + depends on NFSD_V4 + select NFSD_PNFS + help + This option enables support for the exporting pNFS Flex File + layouts in the kernel's NFS server. The pNFS Flex File layout + enables NFS clients to directly perform I/O to NFSv3 devices + accesible to both the server and the clients. See + draft-ietf-nfsv4-flex-files for more details. + + Warning, this server implements the bare minimum functionality + to be a flex file server - it is for testing the client, + not for use in production. + + If unsure, say N. + +config NFSD_V4_2_INTER_SSC + bool "NFSv4.2 inter server to server COPY" + depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 + help + This option enables support for NFSv4.2 inter server to + server copy where the destination server calls the NFSv4.2 + client to read the data to copy from the source server. + + If unsure, say N. + +config NFSD_V4_SECURITY_LABEL + bool "Provide Security Label support for NFSv4 server" + depends on NFSD_V4 && SECURITY + help + + Say Y here if you want enable fine-grained security label attribute + support for NFS version 4. Security labels allow security modules like + SELinux and Smack to label files to facilitate enforcement of their policies. + Without this an NFSv4 mount will have the same label on each file. + + If you do not wish to enable fine-grained security labels SELinux or + Smack policies on NFSv4 files, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile new file mode 100644 index 000000000..3f0983e93 --- /dev/null +++ b/fs/nfsd/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux nfs server +# + +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_NFSD) += nfsd.o + +# this one should be compiled first, as the tracing macros can easily blow up +nfsd-y += trace.o + +nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o nfsxdr.o \ + stats.o filecache.o +nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o +nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o +nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o +nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h new file mode 100644 index 000000000..ba14d2f4b --- /dev/null +++ b/fs/nfsd/acl.h @@ -0,0 +1,51 @@ +/* + * Common NFSv4 ACL handling definitions. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFS4_ACL_H +#define LINUX_NFS4_ACL_H + +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; + +int nfs4_acl_bytes(int entries); +int nfs4_acl_get_whotype(char *, u32); +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); + +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); + +#endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c new file mode 100644 index 000000000..fdf2aad73 --- /dev/null +++ b/fs/nfsd/auth.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ + +#include <linux/sched.h> +#include "nfsd.h" +#include "auth.h" + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) + return f->flags; + } + return exp->ex_flags; + +} + +int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct group_info *rqgi; + struct group_info *gi; + struct cred *new; + int i; + int flags = nfsexp_flags(rqstp, exp); + + validate_process_creds(); + + /* discard any old override before preparing the new set */ + revert_creds(get_cred(current_real_cred())); + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; + + rqgi = rqstp->rq_cred.cr_group_info; + + if (flags & NFSEXP_ALLSQUASH) { + new->fsuid = exp->ex_anon_uid; + new->fsgid = exp->ex_anon_gid; + gi = groups_alloc(0); + if (!gi) + goto oom; + } else if (flags & NFSEXP_ROOTSQUASH) { + if (uid_eq(new->fsuid, GLOBAL_ROOT_UID)) + new->fsuid = exp->ex_anon_uid; + if (gid_eq(new->fsgid, GLOBAL_ROOT_GID)) + new->fsgid = exp->ex_anon_gid; + + gi = groups_alloc(rqgi->ngroups); + if (!gi) + goto oom; + + for (i = 0; i < rqgi->ngroups; i++) { + if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i])) + gi->gid[i] = exp->ex_anon_gid; + else + gi->gid[i] = rqgi->gid[i]; + } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); + } else { + gi = get_group_info(rqgi); + } + + if (uid_eq(new->fsuid, INVALID_UID)) + new->fsuid = exp->ex_anon_uid; + if (gid_eq(new->fsgid, INVALID_GID)) + new->fsgid = exp->ex_anon_gid; + + set_groups(new, gi); + put_group_info(gi); + + if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) + new->cap_effective = cap_drop_nfsd_set(new->cap_effective); + else + new->cap_effective = cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + validate_process_creds(); + put_cred(override_creds(new)); + put_cred(new); + validate_process_creds(); + return 0; + +oom: + abort_creds(new); + return -ENOMEM; +} + diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h new file mode 100644 index 000000000..dbd66424f --- /dev/null +++ b/fs/nfsd/auth.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * nfsd-specific authentication stuff. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_AUTH_H +#define LINUX_NFSD_AUTH_H + +/* + * Set the current process's fsuid/fsgid etc to those of the NFS + * client user + */ +int nfsd_setuser(struct svc_rqst *, struct svc_export *); + +#endif /* LINUX_NFSD_AUTH_H */ diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c new file mode 100644 index 000000000..a07c39c94 --- /dev/null +++ b/fs/nfsd/blocklayout.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2016 Christoph Hellwig. + */ +#include <linux/exportfs.h> +#include <linux/iomap.h> +#include <linux/genhd.h> +#include <linux/slab.h> +#include <linux/pr.h> + +#include <linux/nfsd/debug.h> +#include <scsi/scsi_proto.h> +#include <scsi/scsi_common.h> +#include <scsi/scsi_request.h> + +#include "blocklayoutxdr.h" +#include "pnfs.h" +#include "filecache.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +static __be32 +nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct super_block *sb = inode->i_sb; + u32 block_size = i_blocksize(inode); + struct pnfs_block_extent *bex; + struct iomap iomap; + u32 device_generation = 0; + int error; + + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_layoutunavailable; + } + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. + */ + error = -ENOMEM; + bex = kzalloc(sizeof(*bex), GFP_KERNEL); + if (!bex) + goto out_error; + args->lg_content = bex; + + error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, + &iomap, seg->iomode != IOMODE_READ, + &device_generation); + if (error) { + if (error == -ENXIO) + goto out_layoutunavailable; + goto out_error; + } + + if (iomap.length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_layoutunavailable; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (seg->iomode == IOMODE_READ) + bex->es = PNFS_BLOCK_READ_DATA; + else + bex->es = PNFS_BLOCK_READWRITE_DATA; + bex->soff = iomap.addr; + break; + case IOMAP_UNWRITTEN: + if (seg->iomode & IOMODE_RW) { + /* + * Crack monkey special case from section 2.3.1. + */ + if (args->lg_minlength == 0) { + dprintk("pnfsd: no soup for you!\n"); + goto out_layoutunavailable; + } + + bex->es = PNFS_BLOCK_INVALID_DATA; + bex->soff = iomap.addr; + break; + } + fallthrough; + case IOMAP_HOLE: + if (seg->iomode == IOMODE_READ) { + bex->es = PNFS_BLOCK_NONE_DATA; + break; + } + fallthrough; + case IOMAP_DELALLOC: + default: + WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); + goto out_layoutunavailable; + } + + error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); + if (error) + goto out_error; + bex->foff = iomap.offset; + bex->len = iomap.length; + + seg->offset = iomap.offset; + seg->length = iomap.length; + + dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +out_layoutunavailable: + seg->length = 0; + return nfserr_layoutunavailable; +} + +static __be32 +nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, + struct iomap *iomaps, int nr_iomaps) +{ + loff_t new_size = lcp->lc_last_wr + 1; + struct iattr iattr = { .ia_valid = 0 }; + int error; + + if (lcp->lc_mtime.tv_nsec == UTIME_NOW || + timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + lcp->lc_mtime = current_time(inode); + iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + + if (new_size > i_size_read(inode)) { + iattr.ia_valid |= ATTR_SIZE; + iattr.ia_size = new_size; + } + + error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, + nr_iomaps, &iattr); + kfree(iomaps); + return nfserrno(error); +} + +#ifdef CONFIG_NFSD_BLOCKLAYOUT +static int +nfsd4_block_get_device_info_simple(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SIMPLE; + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, + &b->simple.offset); +} + +static __be32 +nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + if (bdev_is_partition(sb->s_bdev)) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); +} + +static __be32 +nfsd4_block_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + struct iomap *iomaps; + int nr_iomaps; + + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, i_blocksize(inode)); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); +} + +const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_block_proc_layoutcommit, +}; +#endif /* CONFIG_NFSD_BLOCKLAYOUT */ + +#ifdef CONFIG_NFSD_SCSILAYOUT +static int nfsd4_scsi_identify_device(struct block_device *bdev, + struct pnfs_block_volume *b) +{ + struct request_queue *q = bdev->bd_disk->queue; + struct request *rq; + struct scsi_request *req; + /* + * The allocation length (passed in bytes 3 and 4 of the INQUIRY + * command descriptor block) specifies the number of bytes that have + * been allocated for the data-in buffer. + * 252 is the highest one-byte value that is a multiple of 4. + * 65532 is the highest two-byte value that is a multiple of 4. + */ + size_t bufflen = 252, maxlen = 65532, len, id_len; + u8 *buf, *d, type, assoc; + int retries = 1, error; + + if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) + return -EINVAL; + +again: + buf = kzalloc(bufflen, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); + if (IS_ERR(rq)) { + error = -ENOMEM; + goto out_free_buf; + } + req = scsi_req(rq); + + error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); + if (error) + goto out_put_request; + + req->cmd[0] = INQUIRY; + req->cmd[1] = 1; + req->cmd[2] = 0x83; + req->cmd[3] = bufflen >> 8; + req->cmd[4] = bufflen & 0xff; + req->cmd_len = COMMAND_SIZE(INQUIRY); + + blk_execute_rq(rq->q, NULL, rq, 1); + if (req->result) { + pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", + req->result); + error = -EIO; + goto out_put_request; + } + + len = (buf[2] << 8) + buf[3] + 4; + if (len > bufflen) { + if (len <= maxlen && retries--) { + blk_put_request(rq); + kfree(buf); + bufflen = len; + goto again; + } + pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", + len); + goto out_put_request; + } + + d = buf + 4; + for (d = buf + 4; d < buf + len; d += id_len + 4) { + id_len = d[3]; + type = d[1] & 0xf; + assoc = (d[1] >> 4) & 0x3; + + /* + * We only care about a EUI-64 and NAA designator types + * with LU association. + */ + if (assoc != 0x00) + continue; + if (type != 0x02 && type != 0x03) + continue; + if (id_len != 8 && id_len != 12 && id_len != 16) + continue; + + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type == 0x02 ? + PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; + b->scsi.designator_len = id_len; + memcpy(b->scsi.designator, d + 4, id_len); + + /* + * If we found a 8 or 12 byte descriptor continue on to + * see if a 16 byte one is available. If we find a + * 16 byte descriptor we're done. + */ + if (id_len == 16) + break; + } + +out_put_request: + blk_put_request(rq); +out_free_buf: + kfree(buf); + return error; +} + +#define NFSD_MDS_PR_KEY 0x0100000000000000ULL + +/* + * We use the client ID as a unique key for the reservations. + * This allows us to easily fence a client when recalls fail. + */ +static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) +{ + return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; +} + +static int +nfsd4_block_get_device_info_scsi(struct super_block *sb, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + const struct pr_ops *ops; + int error; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SCSI; + b->scsi.pr_key = nfsd4_scsi_pr_key(clp); + + error = nfsd4_scsi_identify_device(sb->s_bdev, b); + if (error) + return error; + + ops = sb->s_bdev->bd_disk->fops->pr_ops; + if (!ops) { + pr_err("pNFS: device %s does not support PRs.\n", + sb->s_id); + return -EINVAL; + } + + error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (error) { + pr_err("pNFS: failed to register key for device %s.\n", + sb->s_id); + return -EINVAL; + } + + error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); + if (error) { + pr_err("pNFS: failed to reserve device %s.\n", + sb->s_id); + return -EINVAL; + } + + return 0; +} + +static __be32 +nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdp) +{ + if (bdev_is_partition(sb->s_bdev)) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); +} +static __be32 +nfsd4_scsi_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + struct iomap *iomaps; + int nr_iomaps; + + nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, i_blocksize(inode)); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); +} + +static void +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; + + bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, + nfsd4_scsi_pr_key(clp), 0, true); +} + +const struct nfsd4_layout_ops scsi_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit, + .fence_client = nfsd4_scsi_fence_client, +}; +#endif /* CONFIG_NFSD_SCSILAYOUT */ diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c new file mode 100644 index 000000000..2455dc8be --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2016 Christoph Hellwig. + */ +#include <linux/sunrpc/svc.h> +#include <linux/exportfs.h> +#include <linux/iomap.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "blocklayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +__be32 +nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_block_extent *b = lgp->lg_content; + int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* we always return a single extent */ + + p = xdr_encode_opaque_fixed(p, &b->vol_id, + sizeof(struct nfsd4_deviceid)); + p = xdr_encode_hyper(p, b->foff); + p = xdr_encode_hyper(p, b->len); + p = xdr_encode_hyper(p, b->soff); + *p++ = cpu_to_be32(b->es); + return 0; +} + +static int +nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int len; + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2); + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(1); /* single signature */ + p = xdr_encode_hyper(p, b->simple.offset); + p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); + break; + case PNFS_BLOCK_VOLUME_SCSI: + len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8; + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->scsi.code_set); + *p++ = cpu_to_be32(b->scsi.designator_type); + p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len); + p = xdr_encode_hyper(p, b->scsi.pr_key); + break; + default: + return -ENOTSUPP; + } + + return len; +} + +__be32 +nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev = gdp->gd_device; + int len = sizeof(__be32), ret, i; + __be32 *p; + + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + for (i = 0; i < dev->nr_volumes; i++) { + ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]); + if (ret < 0) + return nfserrno(ret); + len += ret; + } + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. + */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(dev->nr_volumes); + return 0; +} + +int +nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + len -= sizeof(u32); + if (len % PNFS_BLOCK_EXTENT_SIZE) { + dprintk("%s: extent array invalid: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, nr_iomaps); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + struct pnfs_block_extent bex; + + memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); + p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + + p = xdr_decode_hyper(p, &bex.foff); + if (bex.foff & (block_size - 1)) { + dprintk("%s: unaligned offset 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.len); + if (bex.len & (block_size - 1)) { + dprintk("%s: unaligned length 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.soff); + if (bex.soff & (block_size - 1)) { + dprintk("%s: unaligned disk offset 0x%llx\n", + __func__, bex.soff); + goto fail; + } + bex.es = be32_to_cpup(p++); + if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + dprintk("%s: incorrect extent state %d\n", + __func__, bex.es); + goto fail; + } + + iomaps[i].offset = bex.foff; + iomaps[i].length = bex.len; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} + +int +nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, expected, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; + if (len != expected) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, expected); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + u64 val; + + p = xdr_decode_hyper(p, &val); + if (val & (block_size - 1)) { + dprintk("%s: unaligned offset 0x%llx\n", __func__, val); + goto fail; + } + iomaps[i].offset = val; + + p = xdr_decode_hyper(p, &val); + if (val & (block_size - 1)) { + dprintk("%s: unaligned length 0x%llx\n", __func__, val); + goto fail; + } + iomaps[i].length = val; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h new file mode 100644 index 000000000..bc5166bfe --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFSD_BLOCKLAYOUTXDR_H +#define _NFSD_BLOCKLAYOUTXDR_H 1 + +#include <linux/blkdev.h> +#include "xdr4.h" + +struct iomap; +struct xdr_stream; + +struct pnfs_block_extent { + struct nfsd4_deviceid vol_id; + u64 foff; + u64 len; + u64 soff; + enum pnfs_block_extent_state es; +}; + +struct pnfs_block_range { + u64 foff; + u64 len; +}; + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } simple; + struct { + enum scsi_code_set code_set; + enum scsi_designator_type designator_type; + int designator_len; + u8 designator[256]; + u64 pr_key; + } scsi; + }; +}; + +struct pnfs_block_deviceaddr { + u32 nr_volumes; + struct pnfs_block_volume volumes[]; +}; + +__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); +int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); +int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); + +#endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 000000000..65c331f75 --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include <linux/sunrpc/svc.h> +#include "netns.h" + +/* + * Representation of a reply cache entry. + * + * Note that we use a sockaddr_in6 to hold the address instead of the more + * typical sockaddr_storage. This is for space reasons, since sockaddr_storage + * is much larger than a sockaddr_in6. + */ +struct svc_cacherep { + struct { + /* Keep often-read xid, csum in the same cache line: */ + __be32 k_xid; + __wsum k_csum; + u32 k_proc; + u32 k_prot; + u32 k_vers; + unsigned int k_len; + struct sockaddr_in6 k_addr; + } c_key; + + struct rb_node c_node; + struct list_head c_lru; + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* Cache entries expire after this time period */ +#define RC_EXPIRE (120 * HZ) + +/* Checksum this amount of the request */ +#define RC_CSUMLEN (256U) + +int nfsd_drc_slab_create(void); +void nfsd_drc_slab_free(void); +int nfsd_reply_cache_init(struct nfsd_net *); +void nfsd_reply_cache_shutdown(struct nfsd_net *); +int nfsd_cache_lookup(struct svc_rqst *); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_reply_cache_stats_open(struct inode *, struct file *); + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/current_stateid.h b/fs/nfsd/current_stateid.h new file mode 100644 index 000000000..c28540d86 --- /dev/null +++ b/fs/nfsd/current_stateid.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFSD4_CURRENT_STATE_H +#define _NFSD4_CURRENT_STATE_H + +#include "state.h" +#include "xdr4.h" + +extern void clear_current_stateid(struct nfsd4_compound_state *cstate); +/* + * functions to set current state id + */ +extern void nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_set_openstateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_set_lockstateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_set_closestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); + +/* + * functions to consume current state id + */ +extern void nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_freestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_setattrstateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_closestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_lockustateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_readstateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); +extern void nfsd4_get_writestateid(struct nfsd4_compound_state *, + union nfsd4_op_u *); + +#endif /* _NFSD4_CURRENT_STATE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c new file mode 100644 index 000000000..21e404e7c --- /dev/null +++ b/fs/nfsd/export.c @@ -0,0 +1,1329 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NFS exporting and validation. + * + * We maintain a list of clients, each of which has a list of + * exports. To export an fs to a given client, you first have + * to create the client entry with NFSCTL_ADDCLIENT, which + * creates a client control block and adds it to the hash + * table. Then, you call NFSCTL_EXPORT for each fs. + * + * + * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> + */ + +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/module.h> +#include <linux/exportfs.h> +#include <linux/sunrpc/svc_xprt.h> + +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "pnfs.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_EXPORT + +/* + * We have two caches. + * One maps client+vfsmnt+dentry to export options - the export map + * The other maps client+filehandle-fragment to export options. - the expkey map + * + * The export options are actually stored in the first map, and the + * second map contains a reference to the entry in the first map. + */ + +#define EXPKEY_HASHBITS 8 +#define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) +#define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) + +static void expkey_put(struct kref *ref) +{ + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) + path_put(&key->ek_path); + auth_domain_put(key->ek_client); + kfree_rcu(key, ek_rcu); +} + +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + +static void expkey_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client fsidtype \xfsid */ + struct svc_expkey *ek = container_of(h, struct svc_expkey, h); + char type[5]; + + qword_add(bpp, blen, ek->ek_client->name); + snprintf(type, 5, "%d", ek->ek_fsidtype); + qword_add(bpp, blen, type); + qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); + (*bpp)[-1] = '\n'; +} + +static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, + struct svc_expkey *old); +static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); + +static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client fsidtype fsid expiry [path] */ + char *buf; + int len; + struct auth_domain *dom = NULL; + int err; + int fsidtype; + char *ep; + struct svc_expkey key; + struct svc_expkey *ek = NULL; + + if (mesg[mlen - 1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; + if (!buf) + goto out; + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + dprintk("found domain %s\n", buf); + + err = -EINVAL; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + fsidtype = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + dprintk("found fsidtype %d\n", fsidtype); + if (key_len(fsidtype)==0) /* invalid type */ + goto out; + if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out; + dprintk("found fsid length %d\n", len); + if (len != key_len(fsidtype)) + goto out; + + /* OK, we seem to have a valid key */ + key.h.flags = 0; + key.h.expiry_time = get_expiry(&mesg); + if (key.h.expiry_time == 0) + goto out; + + key.ek_client = dom; + key.ek_fsidtype = fsidtype; + memcpy(key.ek_fsid, buf, len); + + ek = svc_expkey_lookup(cd, &key); + err = -ENOMEM; + if (!ek) + goto out; + + /* now we want a pathname, or empty meaning NEGATIVE */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len < 0) + goto out; + dprintk("Path seems to be <%s>\n", buf); + err = 0; + if (len == 0) { + set_bit(CACHE_NEGATIVE, &key.h.flags); + ek = svc_expkey_update(cd, &key, ek); + if (ek) + trace_nfsd_expkey_update(ek, NULL); + else + err = -ENOMEM; + } else { + err = kern_path(buf, 0, &key.ek_path); + if (err) + goto out; + + dprintk("Found the path %s\n", buf); + + ek = svc_expkey_update(cd, &key, ek); + if (ek) + trace_nfsd_expkey_update(ek, buf); + else + err = -ENOMEM; + path_put(&key.ek_path); + } + cache_flush(); + out: + if (ek) + cache_put(&ek->h, cd); + if (dom) + auth_domain_put(dom); + kfree(buf); + return err; +} + +static int expkey_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_expkey *ek ; + int i; + + if (h ==NULL) { + seq_puts(m, "#domain fsidtype fsid [path]\n"); + return 0; + } + ek = container_of(h, struct svc_expkey, h); + seq_printf(m, "%s %d 0x", ek->ek_client->name, + ek->ek_fsidtype); + for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) + seq_printf(m, "%08x", ek->ek_fsid[i]); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + seq_printf(m, " "); + seq_path(m, &ek->ek_path, "\\ \t\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static inline int expkey_match (struct cache_head *a, struct cache_head *b) +{ + struct svc_expkey *orig = container_of(a, struct svc_expkey, h); + struct svc_expkey *new = container_of(b, struct svc_expkey, h); + + if (orig->ek_fsidtype != new->ek_fsidtype || + orig->ek_client != new->ek_client || + memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) + return 0; + return 1; +} + +static inline void expkey_init(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + kref_get(&item->ek_client->ref); + new->ek_client = item->ek_client; + new->ek_fsidtype = item->ek_fsidtype; + + memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); +} + +static inline void expkey_update(struct cache_head *cnew, + struct cache_head *citem) +{ + struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); + struct svc_expkey *item = container_of(citem, struct svc_expkey, h); + + new->ek_path = item->ek_path; + path_get(&item->ek_path); +} + +static struct cache_head *expkey_alloc(void) +{ + struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. + */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(current->nsproxy->net_ns); + mutex_unlock(&nfsd_mutex); +} + +static const struct cache_detail svc_expkey_cache_template = { + .owner = THIS_MODULE, + .hash_size = EXPKEY_HASHMAX, + .name = "nfsd.fh", + .cache_put = expkey_put, + .cache_upcall = expkey_upcall, + .cache_request = expkey_request, + .cache_parse = expkey_parse, + .cache_show = expkey_show, + .match = expkey_match, + .init = expkey_init, + .update = expkey_update, + .alloc = expkey_alloc, + .flush = expkey_flush, +}; + +static int +svc_expkey_hash(struct svc_expkey *item) +{ + int hash = item->ek_fsidtype; + char * cp = (char*)item->ek_fsid; + int len = key_len(item->ek_fsidtype); + + hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); + hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); + hash &= EXPKEY_HASHMASK; + return hash; +} + +static struct svc_expkey * +svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(item); + + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + +static struct svc_expkey * +svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, + struct svc_expkey *old) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(new); + + ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct svc_expkey, h); + else + return NULL; +} + + +#define EXPORT_HASHBITS 8 +#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) + +static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) +{ + struct nfsd4_fs_location *locations = fsloc->locations; + int i; + + if (!locations) + return; + + for (i = 0; i < fsloc->locations_count; i++) { + kfree(locations[i].path); + kfree(locations[i].hosts); + } + + kfree(locations); + fsloc->locations = NULL; +} + +static void svc_export_put(struct kref *ref) +{ + struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + path_put(&exp->ex_path); + auth_domain_put(exp->ex_client); + nfsd4_fslocs_free(&exp->ex_fslocs); + kfree(exp->ex_uuid); + kfree_rcu(exp, ex_rcu); +} + +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + +static void svc_export_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + /* client path */ + struct svc_export *exp = container_of(h, struct svc_export, h); + char *pth; + + qword_add(bpp, blen, exp->ex_client->name); + pth = d_path(&exp->ex_path, *bpp, *blen); + if (IS_ERR(pth)) { + /* is this correct? */ + (*bpp)[0] = '\n'; + return; + } + qword_add(bpp, blen, pth); + (*bpp)[-1] = '\n'; +} + +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); +static struct svc_export *svc_export_lookup(struct svc_export *); + +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +{ + + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. + */ + if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && + !S_ISREG(inode->i_mode)) + return -ENOTDIR; + + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + + /* There are two requirements on a filesystem to be exportable. + * 1: We must be able to identify the filesystem from a number. + * either a device number (so FS_REQUIRES_DEV needed) + * or an FSID number (so NFSEXP_FSID or ->uuid is needed). + * 2: We must be able to find an inode from a filehandle. + * This means that s_export_op must be set. + */ + if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && + !(*flags & NFSEXP_FSID) && + uuid == NULL) { + dprintk("exp_export: export of non-dev fs without fsid\n"); + return -EINVAL; + } + + if (!inode->i_sb->s_export_op || + !inode->i_sb->s_export_op->fh_to_dentry) { + dprintk("exp_export: export of invalid fs type.\n"); + return -EINVAL; + } + + return 0; + +} + +#ifdef CONFIG_NFSD_V4 + +static int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) +{ + int len; + int migrated, i, err; + + /* more than one fsloc */ + if (fsloc->locations) + return -EINVAL; + + /* listsize */ + err = get_uint(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) + return -EINVAL; + if (fsloc->locations_count == 0) + return 0; + + fsloc->locations = kcalloc(fsloc->locations_count, + sizeof(struct nfsd4_fs_location), + GFP_KERNEL); + if (!fsloc->locations) + return -ENOMEM; + for (i=0; i < fsloc->locations_count; i++) { + /* colon separated host list */ + err = -EINVAL; + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].hosts) + goto out_free_all; + err = -EINVAL; + /* slash separated path component list */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].path) + goto out_free_all; + } + /* migrated */ + err = get_int(mesg, &migrated); + if (err) + goto out_free_all; + err = -EINVAL; + if (migrated < 0 || migrated > 1) + goto out_free_all; + fsloc->migrated = migrated; + return 0; +out_free_all: + nfsd4_fslocs_free(fsloc); + return err; +} + +static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) +{ + struct exp_flavor_info *f; + u32 listsize; + int err; + + /* more than one secinfo */ + if (exp->ex_nflavors) + return -EINVAL; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > MAX_SECINFO_LIST) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { + err = get_uint(mesg, &f->pseudoflavor); + if (err) + return err; + /* + * XXX: It would be nice to also check whether this + * pseudoflavor is supported, so we can discover the + * problem at export time instead of when a client fails + * to authenticate. + */ + err = get_uint(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ + if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) + return -EINVAL; + } + exp->ex_nflavors = listsize; + return 0; +} + +#else /* CONFIG_NFSD_V4 */ +static inline int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} +static inline int +secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } +#endif + +static inline int +nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) +{ + int len; + + /* more than one uuid */ + if (*puuid) + return -EINVAL; + + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != EX_UUID_LEN) + return -EINVAL; + + *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); + if (*puuid == NULL) + return -ENOMEM; + + return 0; +} + +static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + /* client path expiry [flags anonuid anongid fsid] */ + char *buf; + int len; + int err; + struct auth_domain *dom = NULL; + struct svc_export exp = {}, *expp; + int an_int; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* client */ + err = -EINVAL; + len = qword_get(&mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out; + + err = -ENOENT; + dom = auth_domain_find(buf); + if (!dom) + goto out; + + /* path */ + err = -EINVAL; + if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + goto out1; + + err = kern_path(buf, 0, &exp.ex_path); + if (err) + goto out1; + + exp.ex_client = dom; + exp.cd = cd; + exp.ex_devid_map = NULL; + + /* expiry */ + err = -EINVAL; + exp.h.expiry_time = get_expiry(&mesg); + if (exp.h.expiry_time == 0) + goto out3; + + /* flags */ + err = get_int(&mesg, &an_int); + if (err == -ENOENT) { + err = 0; + set_bit(CACHE_NEGATIVE, &exp.h.flags); + } else { + if (err || an_int < 0) + goto out3; + exp.ex_flags= an_int; + + /* anon uid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); + + /* anon gid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); + + /* fsid */ + err = get_int(&mesg, &an_int); + if (err) + goto out3; + exp.ex_fsid = an_int; + + while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + if (strcmp(buf, "fsloc") == 0) + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + else if (strcmp(buf, "uuid") == 0) + err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); + else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else + /* quietly ignore unknown words and anything + * following. Newer user-space can try to set + * new values, then see what the result was. + */ + break; + if (err) + goto out4; + } + + err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, + exp.ex_uuid); + if (err) + goto out4; + /* + * No point caching this if it would immediately expire. + * Also, this protects exportfs's dummy export from the + * anon_uid/anon_gid checks: + */ + if (exp.h.expiry_time < seconds_since_boot()) + goto out4; + /* + * For some reason exportfs has been passing down an + * invalid (-1) uid & gid on the "dummy" export which it + * uses to test export support. To make sure exportfs + * sees errors from check_export we therefore need to + * delay these checks till after check_export: + */ + err = -EINVAL; + if (!uid_valid(exp.ex_anon_uid)) + goto out4; + if (!gid_valid(exp.ex_anon_gid)) + goto out4; + err = 0; + + nfsd4_setup_layout_type(&exp); + } + + expp = svc_export_lookup(&exp); + if (!expp) { + err = -ENOMEM; + goto out4; + } + expp = svc_export_update(&exp, expp); + if (expp) { + trace_nfsd_export_update(expp); + cache_flush(); + exp_put(expp); + } else + err = -ENOMEM; +out4: + nfsd4_fslocs_free(&exp.ex_fslocs); + kfree(exp.ex_uuid); +out3: + path_put(&exp.ex_path); +out1: + auth_domain_put(dom); +out: + kfree(buf); + return err; +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); +static void show_secinfo(struct seq_file *m, struct svc_export *exp); + +static int svc_export_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct svc_export *exp ; + + if (h ==NULL) { + seq_puts(m, "#path domain(flags)\n"); + return 0; + } + exp = container_of(h, struct svc_export, h); + seq_path(m, &exp->ex_path, " \t\n\\"); + seq_putc(m, '\t'); + seq_escape(m, exp->ex_client->name, " \t\n\\"); + seq_putc(m, '('); + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) { + exp_flags(m, exp->ex_flags, exp->ex_fsid, + exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); + if (exp->ex_uuid) { + int i; + seq_puts(m, ",uuid="); + for (i = 0; i < EX_UUID_LEN; i++) { + if ((i&3) == 0 && i) + seq_putc(m, ':'); + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); + return 0; +} +static int svc_export_match(struct cache_head *a, struct cache_head *b) +{ + struct svc_export *orig = container_of(a, struct svc_export, h); + struct svc_export *new = container_of(b, struct svc_export, h); + return orig->ex_client == new->ex_client && + path_equal(&orig->ex_path, &new->ex_path); +} + +static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + + kref_get(&item->ex_client->ref); + new->ex_client = item->ex_client; + new->ex_path = item->ex_path; + path_get(&item->ex_path); + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; + new->ex_layout_types = 0; + new->ex_uuid = NULL; + new->cd = item->cd; +} + +static void export_update(struct cache_head *cnew, struct cache_head *citem) +{ + struct svc_export *new = container_of(cnew, struct svc_export, h); + struct svc_export *item = container_of(citem, struct svc_export, h); + int i; + + new->ex_flags = item->ex_flags; + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_fslocs.locations = item->ex_fslocs.locations; + item->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; + item->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = item->ex_fslocs.migrated; + item->ex_fslocs.migrated = 0; + new->ex_layout_types = item->ex_layout_types; + new->ex_nflavors = item->ex_nflavors; + for (i = 0; i < MAX_SECINFO_LIST; i++) { + new->ex_flavors[i] = item->ex_flavors[i]; + } +} + +static struct cache_head *svc_export_alloc(void) +{ + struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); + if (i) + return &i->h; + else + return NULL; +} + +static const struct cache_detail svc_export_cache_template = { + .owner = THIS_MODULE, + .hash_size = EXPORT_HASHMAX, + .name = "nfsd.export", + .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, + .cache_request = svc_export_request, + .cache_parse = svc_export_parse, + .cache_show = svc_export_show, + .match = svc_export_match, + .init = svc_export_init, + .update = export_update, + .alloc = svc_export_alloc, +}; + +static int +svc_export_hash(struct svc_export *exp) +{ + int hash; + + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); + hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); + return hash; +} + +static struct svc_export * +svc_export_lookup(struct svc_export *exp) +{ + struct cache_head *ch; + int hash = svc_export_hash(exp); + + ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + +static struct svc_export * +svc_export_update(struct svc_export *new, struct svc_export *old) +{ + struct cache_head *ch; + int hash = svc_export_hash(old); + + ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct svc_export, h); + else + return NULL; +} + + +static struct svc_expkey * +exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_expkey key, *ek; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ek_client = clp; + key.ek_fsidtype = fsid_type; + memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); + + ek = svc_expkey_lookup(cd, &key); + if (ek == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(cd, &ek->h, reqp); + if (err) { + trace_nfsd_exp_find_key(&key, err); + return ERR_PTR(err); + } + return ek; +} + +static struct svc_export * +exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, + const struct path *path, struct cache_req *reqp) +{ + struct svc_export *exp, key; + int err; + + if (!clp) + return ERR_PTR(-ENOENT); + + key.ex_client = clp; + key.ex_path = *path; + key.cd = cd; + + exp = svc_export_lookup(&key); + if (exp == NULL) + return ERR_PTR(-ENOMEM); + err = cache_check(cd, &exp->h, reqp); + if (err) { + trace_nfsd_exp_get_by_name(&key, err); + return ERR_PTR(err); + } + return exp; +} + +/* + * Find the export entry for a given dentry. + */ +static struct svc_export * +exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = exp_get_by_name(cd, clp, path, NULL); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + + + +/* + * Obtain the root fh on behalf of a client. + * This could be done in user space, but I feel that it adds some safety + * since its harder to fool a kernel module than a user space program. + */ +int +exp_rootfh(struct net *net, struct auth_domain *clp, char *name, + struct knfsd_fh *f, int maxsize) +{ + struct svc_export *exp; + struct path path; + struct inode *inode; + struct svc_fh fh; + int err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + err = -EPERM; + /* NB: we probably ought to check that it's NUL-terminated */ + if (kern_path(name, 0, &path)) { + printk("nfsd: exp_rootfh path not found %s", name); + return err; + } + inode = d_inode(path.dentry); + + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + name, path.dentry, clp->name, + inode->i_sb->s_id, inode->i_ino); + exp = exp_parent(cd, clp, &path); + if (IS_ERR(exp)) { + err = PTR_ERR(exp); + goto out; + } + + /* + * fh must be initialized before calling fh_compose + */ + fh_init(&fh, maxsize); + if (fh_compose(&fh, exp, path.dentry, NULL)) + err = -EINVAL; + else + err = 0; + memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); + fh_put(&fh); + exp_put(exp); +out: + path_put(&path); + return err; +} + +static struct svc_export *exp_find(struct cache_detail *cd, + struct auth_domain *clp, int fsid_type, + u32 *fsidv, struct cache_req *reqp) +{ + struct svc_export *exp; + struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); + struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); + if (IS_ERR(ek)) + return ERR_CAST(ek); + + exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); + cache_put(&ek->h, nn->svc_expkey_cache); + + if (IS_ERR(exp)) + return ERR_CAST(exp); + return exp; +} + +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + + /* legacy gss-only clients are always OK: */ + if (exp->ex_client == rqstp->rq_gssclient) + return 0; + /* ip-address based client; check sec= export option: */ + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) + return 0; + } + /* defaults in absence of sec= options: */ + if (exp->ex_nflavors == 0) { + if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || + rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) + return 0; + } + + /* If the compound op contains a spo_must_allowed op, + * it will be sent with integrity/protection which + * will have to be expressly allowed on mounts that + * don't support it + */ + + if (nfsd4_spo_must_allow(rqstp)) + return 0; + + return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; +} + +/* + * Uses rq_client and rq_gssclient to find an export; uses rq_client (an + * auth_unix client) if it's available and has secinfo information; + * otherwise, will try to use rq_gssclient. + * + * Called from functions that handle requests; functions that do work on + * behalf of mountd are passed a single client name to use, and should + * use exp_get_by_name() or exp_find(). + */ +struct svc_export * +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +{ + struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct cache_detail *cd = nn->svc_export_cache; + + if (rqstp->rq_client == NULL) + goto gss; + + /* First try the auth_unix client: */ + exp = exp_find(cd, rqstp->rq_client, fsid_type, + fsidv, &rqstp->rq_chandle); + if (PTR_ERR(exp) == -ENOENT) + goto gss; + if (IS_ERR(exp)) + return exp; + /* If it has secinfo, assume there are no gss/... clients */ + if (exp->ex_nflavors > 0) + return exp; +gss: + /* Otherwise, try falling back on gss client */ + if (rqstp->rq_gssclient == NULL) + return exp; + gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv, + &rqstp->rq_chandle); + if (PTR_ERR(gssexp) == -ENOENT) + return exp; + if (!IS_ERR(exp)) + exp_put(exp); + return gssexp; +} + +struct svc_export * +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) +{ + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); + } + dput(path->dentry); + path->dentry = saved; + return exp; +} + +struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + return rqst_exp_find(rqstp, FSID_NUM, fsidv); +} + +/* + * Called when we need the filehandle for the root of the pseudofs, + * for a given NFSv4 client. The root is defined to be the + * export point with fsid==0 + */ +__be32 +exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct svc_export *exp; + __be32 rv; + + exp = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); + exp_put(exp); + return rv; +} + +static struct flags { + int flag; + char *name[2]; +} expflags[] = { + { NFSEXP_READONLY, {"ro", "rw"}}, + { NFSEXP_INSECURE_PORT, {"insecure", ""}}, + { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, + { NFSEXP_ALLSQUASH, {"all_squash", ""}}, + { NFSEXP_ASYNC, {"async", "sync"}}, + { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, + { NFSEXP_NOHIDE, {"nohide", ""}}, + { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, + { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, + { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, + { NFSEXP_PNFS, {"pnfs", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, + { 0, {"", ""}} +}; + +static void show_expflags(struct seq_file *m, int flags, int mask) +{ + struct flags *flg; + int state, first = 0; + + for (flg = expflags; flg->flag; flg++) { + if (flg->flag & ~mask) + continue; + state = (flg->flag & flags) ? 0 : 1; + if (*flg->name[state]) + seq_printf(m, "%s%s", first++?",":"", flg->name[state]); + } +} + +static void show_secinfo_flags(struct seq_file *m, int flags) +{ + seq_printf(m, ","); + show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); +} + +static bool secinfo_flags_equal(int f, int g) +{ + f &= NFSEXP_SECINFO_FLAGS; + g &= NFSEXP_SECINFO_FLAGS; + return f == g; +} + +static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) +{ + int flags; + + flags = (*fp)->flags; + seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); + (*fp)++; + while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { + seq_printf(m, ":%d", (*fp)->pseudoflavor); + (*fp)++; + } + return flags; +} + +static void show_secinfo(struct seq_file *m, struct svc_export *exp) +{ + struct exp_flavor_info *f; + struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + int flags; + + if (exp->ex_nflavors == 0) + return; + f = exp->ex_flavors; + flags = show_secinfo_run(m, &f, end); + if (!secinfo_flags_equal(flags, exp->ex_flags)) + show_secinfo_flags(m, flags); + while (f != end) { + flags = show_secinfo_run(m, &f, end); + show_secinfo_flags(m, flags); + } +} + +static void exp_flags(struct seq_file *m, int flag, int fsid, + kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) +{ + struct user_namespace *userns = m->file->f_cred->user_ns; + + show_expflags(m, flag, NFSEXP_ALLFLAGS); + if (flag & NFSEXP_FSID) + seq_printf(m, ",fsid=%d", fsid); + if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && + !uid_eq(anonu, make_kuid(userns, 0x10000-2))) + seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); + if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && + !gid_eq(anong, make_kgid(userns, 0x10000-2))) + seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); + if (fsloc && fsloc->locations_count > 0) { + char *loctype = (fsloc->migrated) ? "refer" : "replicas"; + int i; + + seq_printf(m, ",%s=", loctype); + seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); + for (i = 1; i < fsloc->locations_count; i++) { + seq_putc(m, ';'); + seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); + } + } +} + +static int e_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct svc_export *exp = container_of(cp, struct svc_export, h); + struct cache_detail *cd = m->private; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# Version 1.1\n"); + seq_puts(m, "# Path Client(Flags) # IPs\n"); + return 0; + } + + exp_get(exp); + if (cache_check(cd, &exp->h, NULL)) + return 0; + exp_put(exp); + return svc_export_show(m, cd, cp); +} + +const struct seq_operations nfs_exports_op = { + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, + .show = e_show, +}; + +/* + * Initialize the exports module. + */ +int +nfsd_export_init(struct net *net) +{ + int rv; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); + + nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); + if (IS_ERR(nn->svc_export_cache)) + return PTR_ERR(nn->svc_export_cache); + rv = cache_register_net(nn->svc_export_cache, net); + if (rv) + goto destroy_export_cache; + + nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); + if (IS_ERR(nn->svc_expkey_cache)) { + rv = PTR_ERR(nn->svc_expkey_cache); + goto unregister_export_cache; + } + rv = cache_register_net(nn->svc_expkey_cache, net); + if (rv) + goto destroy_expkey_cache; + return 0; + +destroy_expkey_cache: + cache_destroy_net(nn->svc_expkey_cache, net); +unregister_export_cache: + cache_unregister_net(nn->svc_export_cache, net); +destroy_export_cache: + cache_destroy_net(nn->svc_export_cache, net); + return rv; +} + +/* + * Flush exports table - called when last nfsd thread is killed + */ +void +nfsd_export_flush(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cache_purge(nn->svc_expkey_cache); + cache_purge(nn->svc_export_cache); +} + +/* + * Shutdown the exports module. + */ +void +nfsd_export_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); + + cache_unregister_net(nn->svc_expkey_cache, net); + cache_unregister_net(nn->svc_export_cache, net); + cache_destroy_net(nn->svc_expkey_cache, net); + cache_destroy_net(nn->svc_export_cache, net); + svcauth_unix_purge(net); + + dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); +} diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h new file mode 100644 index 000000000..e7daa1f24 --- /dev/null +++ b/fs/nfsd/export.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef NFSD_EXPORT_H +#define NFSD_EXPORT_H + +#include <linux/sunrpc/cache.h> +#include <uapi/linux/nfsd/export.h> +#include <linux/nfs4.h> + +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + +/* + * FS Locations + */ + +#define MAX_FS_LOCATIONS 128 + +struct nfsd4_fs_location { + char *hosts; /* colon separated list of hosts */ + char *path; /* slash separated list of path components */ +}; + +struct nfsd4_fs_locations { + uint32_t locations_count; + struct nfsd4_fs_location *locations; +/* If we're not actually serving this data ourselves (only providing a + * list of replicas that do serve it) then we set "migrated": */ + int migrated; +}; + +/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. For the foreseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, + * spkm3i, and spkm3p (and using all 8 at once should be rare). + */ +#define MAX_SECINFO_LIST 8 +#define EX_UUID_LEN 16 + +struct exp_flavor_info { + u32 pseudoflavor; + u32 flags; +}; + +struct svc_export { + struct cache_head h; + struct auth_domain * ex_client; + int ex_flags; + struct path ex_path; + kuid_t ex_anon_uid; + kgid_t ex_anon_gid; + int ex_fsid; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + uint32_t ex_nflavors; + struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + u32 ex_layout_types; + struct nfsd4_deviceid_map *ex_devid_map; + struct cache_detail *cd; + struct rcu_head ex_rcu; +}; + +/* an "export key" (expkey) maps a filehandlefragement to an + * svc_export for a given client. There can be several per export, + * for the different fsid types. + */ +struct svc_expkey { + struct cache_head h; + + struct auth_domain * ek_client; + int ek_fsidtype; + u32 ek_fsid[6]; + + struct path ek_path; + struct rcu_head ek_rcu; +}; + +#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) +#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) +#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); + +/* + * Function declarations + */ +int nfsd_export_init(struct net *); +void nfsd_export_shutdown(struct net *); +void nfsd_export_flush(struct net *); +struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, + struct path *); +struct svc_export * rqst_exp_parent(struct svc_rqst *, + struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); +int exp_rootfh(struct net *, struct auth_domain *, + char *path, struct knfsd_fh *, int maxsize); +__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); +__be32 nfserrno(int errno); + +static inline void exp_put(struct svc_export *exp) +{ + cache_put(&exp->h, exp->cd); +} + +static inline struct svc_export *exp_get(struct svc_export *exp) +{ + cache_get(&exp->h); + return exp; +} +struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); + +#endif /* NFSD_EXPORT_H */ diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c new file mode 100644 index 000000000..76bee0a0d --- /dev/null +++ b/fs/nfsd/fault_inject.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> + * + * Uses debugfs to create fault injection points for client testing + */ + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/nsproxy.h> +#include <linux/sunrpc/addr.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#include "state.h" +#include "netns.h" + +struct nfsd_fault_inject_op { + char *file; + u64 (*get)(void); + u64 (*set_val)(u64); + u64 (*set_clnt)(struct sockaddr_storage *, size_t); +}; + +static struct dentry *debug_dir; + +static ssize_t fault_inject_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + static u64 val; + char read_buf[25]; + size_t size; + loff_t pos = *ppos; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; + + if (!pos) + val = op->get(); + size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); + + return simple_read_from_buffer(buf, len, ppos, read_buf, size); +} + +static ssize_t fault_inject_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + char write_buf[INET6_ADDRSTRLEN]; + size_t size = min(sizeof(write_buf) - 1, len); + struct net *net = current->nsproxy->net_ns; + struct sockaddr_storage sa; + struct nfsd_fault_inject_op *op = file_inode(file)->i_private; + u64 val; + char *nl; + + if (copy_from_user(write_buf, buf, size)) + return -EFAULT; + write_buf[size] = '\0'; + + /* Deal with any embedded newlines in the string */ + nl = strchr(write_buf, '\n'); + if (nl) { + size = nl - write_buf; + *nl = '\0'; + } + + size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); + if (size > 0) { + val = op->set_clnt(&sa, size); + if (val) + pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", + op->file, write_buf, val); + } else { + val = simple_strtoll(write_buf, NULL, 0); + if (val == 0) + pr_info("NFSD Fault Injection: %s (all)", op->file); + else + pr_info("NFSD Fault Injection: %s (n = %llu)", + op->file, val); + val = op->set_val(val); + pr_info("NFSD: %s: found %llu", op->file, val); + } + return len; /* on success, claim we got the whole input */ +} + +static const struct file_operations fops_nfsd = { + .owner = THIS_MODULE, + .read = fault_inject_read, + .write = fault_inject_write, +}; + +void nfsd_fault_inject_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); +} + +static struct nfsd_fault_inject_op inject_ops[] = { + { + .file = "forget_clients", + .get = nfsd_inject_print_clients, + .set_val = nfsd_inject_forget_clients, + .set_clnt = nfsd_inject_forget_client, + }, + { + .file = "forget_locks", + .get = nfsd_inject_print_locks, + .set_val = nfsd_inject_forget_locks, + .set_clnt = nfsd_inject_forget_client_locks, + }, + { + .file = "forget_openowners", + .get = nfsd_inject_print_openowners, + .set_val = nfsd_inject_forget_openowners, + .set_clnt = nfsd_inject_forget_client_openowners, + }, + { + .file = "forget_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_forget_delegations, + .set_clnt = nfsd_inject_forget_client_delegations, + }, + { + .file = "recall_delegations", + .get = nfsd_inject_print_delegations, + .set_val = nfsd_inject_recall_delegations, + .set_clnt = nfsd_inject_recall_client_delegations, + }, +}; + +void nfsd_fault_inject_init(void) +{ + unsigned int i; + struct nfsd_fault_inject_op *op; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + + debug_dir = debugfs_create_dir("nfsd", NULL); + + for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { + op = &inject_ops[i]; + debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd); + } +} diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000..e30e1ddc1 --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,1092 @@ +/* + * Open file cache. + * + * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com> + */ + +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/list_lru.h> +#include <linux/fsnotify_backend.h> +#include <linux/fsnotify.h> +#include <linux/seq_file.h> + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + +/* FIXME: dynamically size this for the machine somehow? */ +#define NFSD_FILE_HASH_BITS 12 +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_LRU_THRESHOLD (4096UL) +#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) + +struct nfsd_fcache_bucket { + struct hlist_head nfb_head; + spinlock_t nfb_lock; + unsigned int nfb_count; + unsigned int nfb_maxcount; +}; + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); + +struct nfsd_fcache_disposal { + struct list_head list; + struct work_struct work; + struct net *net; + spinlock_t lock; + struct list_head freeme; + struct rcu_head rcu; +}; + +static struct workqueue_struct *nfsd_filecache_wq __read_mostly; + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; +static struct list_lru nfsd_file_lru; +static long nfsd_file_lru_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static atomic_long_t nfsd_filecache_count; +static struct delayed_work nfsd_filecache_laundrette; +static DEFINE_SPINLOCK(laundrette_lock); +static LIST_HEAD(laundrettes); + +static void nfsd_file_gc(void); + +static void +nfsd_file_schedule_laundrette(void) +{ + long count = atomic_long_read(&nfsd_filecache_count); + + if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + return; + + queue_delayed_work(system_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!refcount_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (refcount_dec_and_test(&nfm->nfm_ref)) { + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct nfsd_file *nf) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + struct inode *inode = nf->nf_inode; + + do { + mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + if (nfm) { + fsnotify_put_mark(mark); + break; + } + /* Avoid soft lockup race with nfsd_file_mark_put() */ + fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(mark); + } else + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + refcount_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, + struct net *net) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (nf) { + INIT_HLIST_NODE(&nf->nf_node); + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = 0; + nf->nf_inode = inode; + nf->nf_hashval = hashval; + refcount_set(&nf->nf_ref, 1); + nf->nf_may = may & NFSD_FILE_MAY_MASK; + if (may & NFSD_MAY_NOT_BREAK_LEASE) { + if (may & NFSD_MAY_WRITE) + __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); + if (may & NFSD_MAY_READ) + __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + } + nf->nf_mark = NULL; + trace_nfsd_file_alloc(nf); + } + return nf; +} + +static bool +nfsd_file_free(struct nfsd_file *nf) +{ + bool flush = false; + + trace_nfsd_file_put_final(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); + flush = true; + } + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); + return flush; +} + +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static void +nfsd_file_do_unhash(struct nfsd_file *nf) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash(nf); + + if (nfsd_file_check_write_error(nf)) + nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + hlist_del_rcu(&nf->nf_node); + atomic_long_dec(&nfsd_filecache_count); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_do_unhash(nf); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); + return true; + } + return false; +} + +/* + * Return true if the file was unhashed. + */ +static bool +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash_and_release_locked(nf); + if (!nfsd_file_unhash(nf)) + return false; + /* keep final reference for nfsd_file_lru_dispose */ + if (refcount_dec_not_one(&nf->nf_ref)) + return true; + + list_add(&nf->nf_lru, dispose); + return true; +} + +static void +nfsd_file_put_noref(struct nfsd_file *nf) +{ + trace_nfsd_file_put(nf); + + if (refcount_dec_and_test(&nf->nf_ref)) { + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_free(nf); + } +} + +void +nfsd_file_put(struct nfsd_file *nf) +{ + bool is_hashed; + + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) { + nfsd_file_put_noref(nf); + return; + } + + filemap_flush(nf->nf_file->f_mapping); + is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + nfsd_file_put_noref(nf); + if (is_hashed) + nfsd_file_schedule_laundrette(); + if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) + nfsd_file_gc(); +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (likely(refcount_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + nfsd_file_put_noref(nf); + } +} + +static void +nfsd_file_dispose_list_sync(struct list_head *dispose) +{ + bool flush = false; + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + if (!refcount_dec_and_test(&nf->nf_ref)) + continue; + if (nfsd_file_free(nf)) + flush = true; + } + if (flush) + flush_delayed_fput(); +} + +static void +nfsd_file_list_remove_disposal(struct list_head *dst, + struct nfsd_fcache_disposal *l) +{ + spin_lock(&l->lock); + list_splice_init(&l->freeme, dst); + spin_unlock(&l->lock); +} + +static void +nfsd_file_list_add_disposal(struct list_head *files, struct net *net) +{ + struct nfsd_fcache_disposal *l; + + rcu_read_lock(); + list_for_each_entry_rcu(l, &laundrettes, list) { + if (l->net == net) { + spin_lock(&l->lock); + list_splice_tail_init(files, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); + break; + } + } + rcu_read_unlock(); +} + +static void +nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src, + struct net *net) +{ + struct nfsd_file *nf, *tmp; + + list_for_each_entry_safe(nf, tmp, src, nf_lru) { + if (nf->nf_net == net) + list_move_tail(&nf->nf_lru, dst); + } +} + +static void +nfsd_file_dispose_list_delayed(struct list_head *dispose) +{ + LIST_HEAD(list); + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + nfsd_file_list_add_pernet(&list, dispose, nf->nf_net); + nfsd_file_list_add_disposal(&list, nf->nf_net); + } +} + +/* + * Note this can deadlock with nfsd_file_cache_purge. + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + spinlock_t *lock, void *arg) + __releases(lock) + __acquires(lock) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* + * Do a lockless refcount check. The hashtable holds one reference, so + * we look to see if anything else has a reference, or if any have + * been put since the shrinker last ran. Those don't get unhashed and + * released. + * + * Note that in the put path, we set the flag and then decrement the + * counter. Here we check the counter and then test and clear the flag. + * That order is deliberate to ensure that we can do this locklessly. + */ + if (refcount_read(&nf->nf_ref) > 1) + goto out_skip; + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) + goto out_skip; + + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) + goto out_skip; + + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + goto out_skip; + + list_lru_isolate_move(lru, &nf->nf_lru, head); + return LRU_REMOVED; +out_skip: + return LRU_SKIP; +} + +static unsigned long +nfsd_file_lru_walk_list(struct shrink_control *sc) +{ + LIST_HEAD(head); + struct nfsd_file *nf; + unsigned long ret; + + if (sc) + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, + nfsd_file_lru_cb, &head); + else + ret = list_lru_walk(&nfsd_file_lru, + nfsd_file_lru_cb, + &head, LONG_MAX); + list_for_each_entry(nf, &head, nf_lru) { + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + } + nfsd_file_dispose_list_delayed(&head); + return ret; +} + +static void +nfsd_file_gc(void) +{ + nfsd_file_lru_walk_list(NULL); +} + +static void +nfsd_file_gc_worker(struct work_struct *work) +{ + nfsd_file_gc(); + nfsd_file_schedule_laundrette(); +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + return nfsd_file_lru_walk_list(sc); +} + +static struct shrinker nfsd_file_shrinker = { + .scan_objects = nfsd_file_lru_scan, + .count_objects = nfsd_file_lru_count, + .seeks = 1, +}; + +static void +__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, + struct list_head *dispose) +{ + struct nfsd_file *nf; + struct hlist_node *tmp; + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { + if (inode == nf->nf_inode) + nfsd_file_unhash_and_release_locked(nf, dispose); + } + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. Also ensure that any of the + * fputs also have their final __fput done as well. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_sync(&dispose); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_delayed(&dispose); +} + +/** + * nfsd_file_delayed_close - close unused nfsd_files + * @work: dummy + * + * Walk the LRU list and close any entries that have not been used since + * the last scan. + * + * Note this can deadlock with nfsd_file_cache_purge. + */ +static void +nfsd_file_delayed_close(struct work_struct *work) +{ + LIST_HEAD(head); + struct nfsd_fcache_disposal *l = container_of(work, + struct nfsd_fcache_disposal, work); + + nfsd_file_list_remove_disposal(&head, l); + nfsd_file_dispose_list(&head); +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lock *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->fl_flags & FL_LEASE) + nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *name, u32 cookie) +{ + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_inode_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret = -ENOMEM; + unsigned int i; + + clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + if (nfsd_file_hashtbl) + return 0; + + nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); + if (!nfsd_filecache_wq) + goto out; + + nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE, + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); + if (!nfsd_file_hashtbl) { + pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); + goto out_err; + } + + nfsd_file_slab = kmem_cache_create("nfsd_file", + sizeof(struct nfsd_file), 0, 0, NULL); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", + sizeof(struct nfsd_file_mark), 0, 0, NULL); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + ret = register_shrinker(&nfsd_file_shrinker); + if (ret) { + pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + goto out_lru; + } + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); +out: + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + unregister_shrinker(&nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kvfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + destroy_workqueue(nfsd_filecache_wq); + nfsd_filecache_wq = NULL; + goto out; +} + +/* + * Note this can deadlock with nfsd_file_lru_cb. + */ +void +nfsd_file_cache_purge(struct net *net) +{ + unsigned int i; + struct nfsd_file *nf; + struct hlist_node *next; + LIST_HEAD(dispose); + bool del; + + if (!nfsd_file_hashtbl) + return; + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + + spin_lock(&nfb->nfb_lock); + hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + if (net && nf->nf_net != net) + continue; + del = nfsd_file_unhash_and_release_locked(nf, &dispose); + + /* + * Deadlock detected! Something marked this entry as + * unhased, but hasn't removed it from the hash list. + */ + WARN_ON_ONCE(!del); + } + spin_unlock(&nfb->nfb_lock); + nfsd_file_dispose_list(&dispose); + } +} + +static struct nfsd_fcache_disposal * +nfsd_alloc_fcache_disposal(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + l = kmalloc(sizeof(*l), GFP_KERNEL); + if (!l) + return NULL; + INIT_WORK(&l->work, nfsd_file_delayed_close); + l->net = net; + spin_lock_init(&l->lock); + INIT_LIST_HEAD(&l->freeme); + return l; +} + +static void +nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + rcu_assign_pointer(l->net, NULL); + cancel_work_sync(&l->work); + nfsd_file_dispose_list(&l->freeme); + kfree_rcu(l, rcu); +} + +static void +nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + spin_lock(&laundrette_lock); + list_add_tail_rcu(&l->list, &laundrettes); + spin_unlock(&laundrette_lock); +} + +static void +nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + spin_lock(&laundrette_lock); + list_del_rcu(&l->list); + spin_unlock(&laundrette_lock); +} + +static int +nfsd_alloc_fcache_disposal_net(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + l = nfsd_alloc_fcache_disposal(net); + if (!l) + return -ENOMEM; + nfsd_add_fcache_disposal(l); + return 0; +} + +static void +nfsd_free_fcache_disposal_net(struct net *net) +{ + struct nfsd_fcache_disposal *l; + + rcu_read_lock(); + list_for_each_entry_rcu(l, &laundrettes, list) { + if (l->net != net) + continue; + nfsd_del_fcache_disposal(l); + rcu_read_unlock(); + nfsd_free_fcache_disposal(l); + return; + } + rcu_read_unlock(); +} + +int +nfsd_file_cache_start_net(struct net *net) +{ + return nfsd_alloc_fcache_disposal_net(net); +} + +void +nfsd_file_cache_shutdown_net(struct net *net) +{ + nfsd_file_cache_purge(net); + nfsd_free_fcache_disposal_net(net); +} + +void +nfsd_file_cache_shutdown(void) +{ + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + lease_unregister_notifier(&nfsd_file_lease_notifier); + unregister_shrinker(&nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + nfsd_file_cache_purge(NULL); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kvfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + destroy_workqueue(nfsd_filecache_wq); + nfsd_filecache_wq = NULL; +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static struct nfsd_file * +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, + unsigned int hashval, struct net *net) +{ + struct nfsd_file *nf; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) { + if (nf->nf_may != need) + continue; + if (nf->nf_inode != inode) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, current_cred())) + continue; + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + continue; + if (nfsd_file_get(nf) != NULL) + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this fh? + * @inode: inode of the file to check + * + * Scan the hashtable for open files that match this fh. Returns true if there + * are any, and false if not. + */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + bool ret = false; + struct nfsd_file *nf; + unsigned int hashval; + + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); + + rcu_read_lock(); + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if (inode == nf->nf_inode) { + ret = true; + break; + } + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, hashval, (int)ret); + return ret; +} + +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + __be32 status; + struct net *net = SVC_NET(rqstp); + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; + bool retry = true; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + if (status != nfs_ok) + return status; + + inode = d_inode(fhp->fh_dentry); + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); +retry: + rcu_read_lock(); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + rcu_read_unlock(); + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(inode, may_flags, hashval, net); + if (!new) { + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, + NULL, nfserr_jukebox); + return nfserr_jukebox; + } + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + if (nf == NULL) + goto open_file; + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_slab_free(&new->nf_rcu); + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? */ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + if (!retry) { + status = nfserr_jukebox; + goto out; + } + retry = false; + nfsd_file_put_noref(nf); + goto retry; + } + + this_cpu_inc(nfsd_file_cache_hits); + + if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { + bool write = (may_flags & NFSD_MAY_WRITE); + + if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) || + (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) { + status = nfserrno(nfsd_open_break_lease( + file_inode(nf->nf_file), may_flags)); + if (status == nfs_ok) { + clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + if (write) + clear_bit(NFSD_FILE_BREAK_WRITE, + &nf->nf_flags); + } + } + } +out: + if (status == nfs_ok) { + *pnf = nf; + } else { + nfsd_file_put(nf); + nf = NULL; + } + + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); + return status; +open_file: + nf = new; + /* Take reference for the hashtable */ + refcount_inc(&nf->nf_ref); + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + list_lru_add(&nfsd_file_lru, &nf->nf_lru); + hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); + ++nfsd_file_hashtbl[hashval].nfb_count; + nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, + nfsd_file_hashtbl[hashval].nfb_count); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD) + nfsd_file_gc(); + + nf->nf_mark = nfsd_file_mark_find_or_create(nf); + if (nf->nf_mark) + status = nfsd_open_verified(rqstp, fhp, S_IFREG, + may_flags, &nf->nf_file); + else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. + */ + if (status != nfs_ok || inode->i_nlink == 0) { + bool do_free; + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + do_free = nfsd_file_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (do_free) + nfsd_file_put_noref(nf); + } + clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); + smp_mb__after_atomic(); + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + goto out; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned int i, count = 0, longest = 0; + unsigned long hits = 0; + + /* + * No need for spinlocks here since we're not terribly interested in + * accuracy. We do take the nfsd_mutex simply to ensure that we + * don't end up racing with server shutdown + */ + mutex_lock(&nfsd_mutex); + if (nfsd_file_hashtbl) { + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + count += nfsd_file_hashtbl[i].nfb_count; + longest = max(longest, nfsd_file_hashtbl[i].nfb_count); + } + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) + hits += per_cpu(nfsd_file_cache_hits, i); + + seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "cache hits: %lu\n", hits); + return 0; +} + +int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_file_cache_stats_show, NULL); +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000..435ceab27 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,63 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include <linux/fsnotify_backend.h> + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + refcount_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. + */ +struct nfsd_file { + struct hlist_node nf_node; + struct list_head nf_lru; + struct rcu_head nf_rcu; + struct file *nf_file; + const struct cred *nf_cred; + struct net *nf_net; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_BREAK_READ (2) +#define NFSD_FILE_BREAK_WRITE (3) +#define NFSD_FILE_REFERENCED (4) + unsigned long nf_flags; + struct inode *nf_inode; + unsigned int nf_hashval; + refcount_t nf_ref; + unsigned char nf_may; + struct nfsd_file_mark *nf_mark; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(struct net *); +void nfsd_file_cache_shutdown(void); +int nfsd_file_cache_start_net(struct net *net); +void nfsd_file_cache_shutdown_net(struct net *net); +void nfsd_file_put(struct nfsd_file *nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +int nfsd_file_cache_stats_open(struct inode *, struct file *); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c new file mode 100644 index 000000000..db7ef07ae --- /dev/null +++ b/fs/nfsd/flexfilelayout.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + * + * The following implements a super-simple flex-file server + * where the NFSv4.1 mds is also the ds. And the storage is + * the same. I.e., writing to the mds via a NFSv4.1 WRITE + * goes to the same location as the NFSv3 WRITE. + */ +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include <linux/sunrpc/addr.h> + +#include "flexfilelayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +static __be32 +nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + u32 device_generation = 0; + int error; + uid_t u; + + struct pnfs_ff_layout *fl; + + /* + * The super simple flex file server has 1 mirror, 1 data server, + * and 1 file handle. So instead of 4 allocs, do 1 for now. + * Zero it out for the stateid - don't want junk in there! + */ + error = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + goto out_error; + args->lg_content = fl; + + /* + * Avoid layout commit, try to force the I/O to the DS, + * and for fun, cause all IOMODE_RW layout segments to + * effectively be WRITE only. + */ + fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS | + FF_FLAGS_NO_READ_IO; + + /* Do not allow a IOMODE_READ segment to have write pemissions */ + if (seg->iomode == IOMODE_READ) { + u = from_kuid(&init_user_ns, inode->i_uid) + 1; + fl->uid = make_kuid(&init_user_ns, u); + } else + fl->uid = inode->i_uid; + fl->gid = inode->i_gid; + + error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation); + if (error) + goto out_error; + + fl->fh.size = fhp->fh_handle.fh_size; + memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size); + + /* Give whole file layout segments */ + seg->offset = 0; + seg->length = NFS4_MAX_UINT64; + + dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length, + seg->iomode); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +} + +static __be32 +nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, + struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da; + + u16 port; + char addr[INET6_ADDRSTRLEN]; + + da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL); + if (!da) + return nfserrno(-ENOMEM); + + gdp->gd_device = da; + + da->version = 3; + da->minor_version = 0; + + da->rsize = svc_max_payload(rqstp); + da->wsize = da->rsize; + + rpc_ntop((struct sockaddr *)&rqstp->rq_daddr, + addr, INET6_ADDRSTRLEN); + if (rqstp->rq_daddr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&rqstp->rq_daddr; + port = ntohs(sin->sin_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp"); + da->netaddr.netid_len = 3; + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr; + port = ntohs(sin6->sin6_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6"); + da->netaddr.netid_len = 4; + } + + da->netaddr.addr_len = + snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, + "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + + da->tightly_coupled = false; + + return 0; +} + +const struct nfsd4_layout_ops ff_layout_ops = { + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .disable_recalls = true, + .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, + .proc_layoutget = nfsd4_ff_proc_layoutget, + .encode_layoutget = nfsd4_ff_encode_layoutget, +}; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c new file mode 100644 index 000000000..bb205328e --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#include <linux/sunrpc/svc.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "flexfilelayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct ff_idmap { + char buf[11]; + int len; +}; + +__be32 +nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_ff_layout *fl = lgp->lg_content; + int len, mirror_len, ds_len, fh_len; + __be32 *p; + + /* + * Unlike nfsd4_encode_user, we know these will + * always be stringified. + */ + struct ff_idmap uid; + struct ff_idmap gid; + + fh_len = 4 + fl->fh.size; + + uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid)); + gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid)); + + /* 8 + len for recording the length, name, and padding */ + ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len + + 8 + uid.len + 8 + gid.len; + + mirror_len = 4 + ds_len; + + /* The layout segment */ + len = 20 + mirror_len; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */ + + *p++ = cpu_to_be32(1); /* single mirror */ + *p++ = cpu_to_be32(1); /* single data server */ + + p = xdr_encode_opaque_fixed(p, &fl->deviceid, + sizeof(struct nfsd4_deviceid)); + + *p++ = cpu_to_be32(1); /* efficiency */ + + *p++ = cpu_to_be32(fl->stateid.si_generation); + p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* single file handle */ + p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size); + + p = xdr_encode_opaque(p, uid.buf, uid.len); + p = xdr_encode_opaque(p, gid.buf, gid.len); + + *p++ = cpu_to_be32(fl->flags); + *p++ = cpu_to_be32(0); /* No stats collect hint */ + + return 0; +} + +__be32 +nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da = gdp->gd_device; + int len; + int ver_len; + int addr_len; + __be32 *p; + + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + + /* len + padding for two strings */ + addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; + ver_len = 20; + + len = 4 + ver_len + 4 + addr_len; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. + */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* 1 netaddr */ + p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len); + p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len); + + *p++ = cpu_to_be32(1); /* 1 versions */ + + *p++ = cpu_to_be32(da->version); + *p++ = cpu_to_be32(da->minor_version); + *p++ = cpu_to_be32(da->rsize); + *p++ = cpu_to_be32(da->wsize); + *p++ = cpu_to_be32(da->tightly_coupled); + + return 0; +} diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h new file mode 100644 index 000000000..8e195aeca --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#ifndef _NFSD_FLEXFILELAYOUTXDR_H +#define _NFSD_FLEXFILELAYOUTXDR_H 1 + +#include <linux/inet.h> +#include "xdr4.h" + +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 + +struct xdr_stream; + +#define FF_NETID_LEN (4) +#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8) +struct pnfs_ff_netaddr { + char netid[FF_NETID_LEN + 1]; + char addr[FF_ADDR_LEN + 1]; + u32 netid_len; + u32 addr_len; +}; + +struct pnfs_ff_device_addr { + struct pnfs_ff_netaddr netaddr; + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +struct pnfs_ff_layout { + u32 flags; + u32 stats_collect_hint; + kuid_t uid; + kgid_t gid; + struct nfsd4_deviceid deviceid; + stateid_t stateid; + struct nfs_fh fh; +}; + +__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); + +#endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h new file mode 100644 index 000000000..23cc85d1e --- /dev/null +++ b/fs/nfsd/idmap.h @@ -0,0 +1,60 @@ +/* + * Mapping of UID to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. +> * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LINUX_NFSD_IDMAP_H +#define LINUX_NFSD_IDMAP_H + +#include <linux/in.h> +#include <linux/sunrpc/svc.h> +#include <linux/nfs_idmap.h> + +#ifdef CONFIG_NFSD_V4 +int nfsd_idmap_init(struct net *); +void nfsd_idmap_shutdown(struct net *); +#else +static inline int nfsd_idmap_init(struct net *net) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(struct net *net) +{ +} +#endif + +__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); +__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); +__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t); +__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t); + +#endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c new file mode 100644 index 000000000..3f5b3d7b6 --- /dev/null +++ b/fs/nfsd/lockd.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains all the stubs needed when communicating with lockd. + * This level of indirection is necessary so we can run nfsd+lockd without + * requiring the nfs client to be compiled in/loaded, and vice versa. + * + * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/file.h> +#include <linux/lockd/bind.h> +#include "nfsd.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_LOCKD + +#ifdef CONFIG_LOCKD_V4 +#define nlm_stale_fh nlm4_stale_fh +#define nlm_failed nlm4_failed +#else +#define nlm_stale_fh nlm_lck_denied_nolocks +#define nlm_failed nlm_lck_denied_nolocks +#endif +/* + * Note: we hold the dentry use count while the file is open. + */ +static __be32 +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +{ + __be32 nfserr; + struct svc_fh fh; + + /* must initialize before using! but maxsize doesn't matter */ + fh_init(&fh,0); + fh.fh_handle.fh_size = f->size; + memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); + fh.fh_export = NULL; + + nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + fh_put(&fh); + /* We return nlm error codes as nlm doesn't know + * about nfsd, but nfsd does know about nlm.. + */ + switch (nfserr) { + case nfs_ok: + return 0; + case nfserr_dropit: + return nlm_drop_reply; + case nfserr_stale: + return nlm_stale_fh; + default: + return nlm_failed; + } +} + +static void +nlm_fclose(struct file *filp) +{ + fput(filp); +} + +static const struct nlmsvc_binding nfsd_nlm_ops = { + .fopen = nlm_fopen, /* open file for locking */ + .fclose = nlm_fclose, /* close file */ +}; + +void +nfsd_lockd_init(void) +{ + dprintk("nfsd: initializing lockd\n"); + nlmsvc_ops = &nfsd_nlm_ops; +} + +void +nfsd_lockd_shutdown(void) +{ + nlmsvc_ops = NULL; +} diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h new file mode 100644 index 000000000..02d3d2f0e --- /dev/null +++ b/fs/nfsd/netns.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * per net namespace data structures for nfsd + * + * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com> + */ + +#ifndef __NFSD_NETNS_H__ +#define __NFSD_NETNS_H__ + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +/* Hash tables for nfs4_clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + +#define SESSION_HASH_SIZE 512 + +struct cld_net; +struct nfsd4_client_tracking_ops; + +/* + * Represents a nfsd "container". With respect to nfsv4 state tracking, the + * fields of interest are the *_id_hashtbls and the *_name_tree. These track + * the nfs4_client objects by either short or long form clientid. + * + * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean + * up expired clients and delegations within the container. + */ +struct nfsd_net { + struct cld_net *cld_net; + + struct cache_detail *svc_expkey_cache; + struct cache_detail *svc_export_cache; + + struct cache_detail *idtoname_cache; + struct cache_detail *nametoid_cache; + + struct lock_manager nfsd4_manager; + bool grace_ended; + time64_t boot_time; + + struct dentry *nfsd_client_dir; + + /* + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * + * conf_id_hashtbl[], and conf_name_tree hold confirmed + * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed + * setclientid info. + */ + struct list_head *reclaim_str_hashtbl; + int reclaim_str_hashtbl_size; + struct list_head *conf_id_hashtbl; + struct rb_root conf_name_tree; + struct list_head *unconf_id_hashtbl; + struct rb_root unconf_name_tree; + struct list_head *sessionid_hashtbl; + /* + * client_lru holds client queue ordered by nfs4_client.cl_time + * for lease renewal. + * + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + * + * All of the above fields are protected by the client_mutex. + */ + struct list_head client_lru; + struct list_head close_lru; + struct list_head del_recall_lru; + + /* protected by blocked_locks_lock */ + struct list_head blocked_locks_lru; + + struct delayed_work laundromat_work; + + /* client_lock protects the client lru list and session hash table */ + spinlock_t client_lock; + + /* protects blocked_locks_lru */ + spinlock_t blocked_locks_lock; + + struct file *rec_file; + bool in_grace; + const struct nfsd4_client_tracking_ops *client_tracking_ops; + + time64_t nfsd4_lease; + time64_t nfsd4_grace; + bool somebody_reclaimed; + + bool track_reclaim_completes; + atomic_t nr_reclaim_complete; + + bool nfsd_net_up; + bool lockd_up; + + /* Time of server startup */ + struct timespec64 nfssvc_boot; + seqlock_t boot_lock; + + /* + * Max number of connections this nfsd container will allow. Defaults + * to '0' which is means that it bases this on the number of threads. + */ + unsigned int max_connections; + + u32 clientid_base; + u32 clientid_counter; + u32 clverifier_counter; + + struct svc_serv *nfsd_serv; + + wait_queue_head_t ntf_wq; + atomic_t ntf_refcnt; + + /* Allow umount to wait for nfsd state cleanup */ + struct completion nfsd_shutdown_complete; + + /* + * clientid and stateid data for construction of net unique COPY + * stateids. + */ + u32 s2s_cp_cl_id; + struct idr s2s_cp_stateids; + spinlock_t s2s_cp_lock; + + /* + * Version information + */ + bool *nfsd_versions; + bool *nfsd4_minorversions; + + /* + * Duplicate reply cache + */ + struct nfsd_drc_bucket *drc_hashtbl; + + /* max number of entries allowed in the cache */ + unsigned int max_drc_entries; + + /* number of significant bits in the hash value */ + unsigned int maskbits; + unsigned int drc_hashsize; + + /* + * Stats and other tracking of on the duplicate reply cache. + * These fields and the "rc" fields in nfsdstats are modified + * with only the per-bucket cache lock, which isn't really safe + * and should be fixed if we want the statistics to be + * completely accurate. + */ + + /* total number of entries */ + atomic_t num_drc_entries; + + /* cache misses due only to checksum comparison failures */ + unsigned int payload_misses; + + /* amount of memory (in bytes) currently consumed by the DRC */ + unsigned int drc_mem_usage; + + /* longest hash chain seen */ + unsigned int longest_chain; + + /* size of cache when we saw the longest hash chain */ + unsigned int longest_chain_cachesize; + + struct shrinker nfsd_reply_cache_shrinker; + /* utsname taken from the process that starts the server */ + char nfsd_name[UNX_MAXNODENAME+1]; +}; + +/* Simple check to find out if a given net was properly initialized */ +#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) + +extern void nfsd_netns_free_versions(struct nfsd_net *nn); + +extern unsigned int nfsd_net_id; + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_boot_verifier(struct nfsd_net *nn); +#endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c new file mode 100644 index 000000000..6a900f770 --- /dev/null +++ b/fs/nfsd/nfs2acl.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Process version 2 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include <linux/nfsacl.h> +#include <linux/gfp.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* + * NULL call. + */ +static __be32 +nfsacld_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) +{ + struct nfsd3_getaclargs *argp = rqstp->rq_argp; + struct nfsd3_getaclres *resp = rqstp->rq_resp; + struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; + + dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + + inode = d_inode(fh->fh_dentry); + + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } + resp->mask = argp->mask; + + resp->status = fh_getattr(fh, &resp->stat); + if (resp->status != nfs_ok) + goto out; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + if (IS_ERR(acl)) { + resp->status = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + acl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + resp->status = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ +out: + return rpc_success; + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + goto out; +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) +{ + struct nfsd3_setaclargs *argp = rqstp->rq_argp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + struct inode *inode; + svc_fh *fh; + int error; + + dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) + goto out; + + inode = d_inode(fh->fh_dentry); + + error = fh_want_write(fh); + if (error) + goto out_errno; + + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + if (error) + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + if (error) + goto out_drop_lock; + + fh_unlock(fh); + + fh_drop_write(fh); + + resp->status = fh_getattr(fh, &resp->stat); + +out: + /* argp->acl_{access,default} may have been allocated in + nfssvc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return rpc_success; + +out_drop_lock: + fh_unlock(fh); + fh_drop_write(fh); +out_errno: + resp->status = nfserrno(error); + goto out; +} + +/* + * Check file attributes + */ +static __be32 nfsacld_proc_getattr(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * Check file access + */ +static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) +{ + struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct nfsd3_accessres *resp = rqstp->rq_resp; + + dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * XDR decode functions + */ +static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_getaclargs *argp = rqstp->rq_argp; + + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_setaclargs *argp = rqstp->rq_argp; + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->mask = ntohl(*p++); + if (argp->mask & ~NFS_ACL_MASK || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL); + return (n > 0); +} + +static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_accessargs *argp = rqstp->rq_argp; + + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) + return 0; + argp->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETACL */ +static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_getaclres *resp = rqstp->rq_resp; + struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode; + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + + /* + * Since this is version 2, the check for nfserr in + * nfsd_dispatch actually ensures the following cannot happen. + * However, it seems fragile to depend on that. + */ + if (dentry == NULL || d_really_is_negative(dentry)) + return 0; + inode = d_inode(dentry); + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!*(rqstp->rq_next_page++)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + return (n > 0); +} + +static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_attrstat *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_accessres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->access); +out: + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp) +{ + struct nfsd3_getaclres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); +} + +static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp) +{ + struct nfsd_attrstat *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +static void nfsaclsvc_release_access(struct svc_rqst *rqstp) +{ + struct nfsd3_accessres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +struct nfsd3_voidargs { int dummy; }; + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static const struct svc_procedure nfsd_acl_procedures2[5] = { + [ACLPROC2_NULL] = { + .pc_func = nfsacld_proc_null, + .pc_decode = nfsaclsvc_decode_voidarg, + .pc_encode = nfsaclsvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC2_GETACL] = { + .pc_func = nfsacld_proc_getacl, + .pc_decode = nfsaclsvc_decode_getaclargs, + .pc_encode = nfsaclsvc_encode_getaclres, + .pc_release = nfsaclsvc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC2_SETACL] = { + .pc_func = nfsacld_proc_setacl, + .pc_decode = nfsaclsvc_decode_setaclargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_GETATTR] = { + .pc_func = nfsacld_proc_getattr, + .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_ACCESS] = { + .pc_func = nfsacld_proc_access, + .pc_decode = nfsaclsvc_decode_accessargs, + .pc_encode = nfsaclsvc_encode_accessres, + .pc_release = nfsaclsvc_release_access, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1, + }, +}; + +static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; +const struct svc_version nfsd_acl_version2 = { + .vs_vers = 2, + .vs_nproc = 5, + .vs_proc = nfsd_acl_procedures2, + .vs_count = nfsd_acl_count2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c new file mode 100644 index 000000000..34a394e50 --- /dev/null +++ b/fs/nfsd/nfs3acl.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Process version 3 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> + */ + +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ +#include <linux/nfsacl.h> +#include <linux/gfp.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) +{ + struct nfsd3_getaclargs *argp = rqstp->rq_argp; + struct nfsd3_getaclres *resp = rqstp->rq_resp; + struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; + + fh = fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + + inode = d_inode(fh->fh_dentry); + + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + if (IS_ERR(acl)) { + resp->status = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + acl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + resp->status = nfserrno(PTR_ERR(acl)); + goto fail; + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ +out: + return rpc_success; + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + goto out; +} + +/* + * Set the Access and/or Default ACL of a file. + */ +static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) +{ + struct nfsd3_setaclargs *argp = rqstp->rq_argp; + struct nfsd3_attrstat *resp = rqstp->rq_resp; + struct inode *inode; + svc_fh *fh; + int error; + + fh = fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) + goto out; + + inode = d_inode(fh->fh_dentry); + + error = fh_want_write(fh); + if (error) + goto out_errno; + + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + if (error) + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + +out_drop_lock: + fh_unlock(fh); + fh_drop_write(fh); +out_errno: + resp->status = nfserrno(error); +out: + /* argp->acl_{access,default} may have been allocated in + nfs3svc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return rpc_success; +} + +/* + * XDR decode functions + */ +static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_getaclargs *args = rqstp->rq_argp; + + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) + return 0; + args->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_setaclargs *args = rqstp->rq_argp; + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) + return 0; + args->mask = ntohl(*p++); + if (args->mask & ~NFS_ACL_MASK || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (args->mask & NFS_ACL) ? + &args->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (args->mask & NFS_DFACL) ? + &args->acl_default : NULL); + return (n > 0); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_getaclres *resp = rqstp->rq_resp; + struct dentry *dentry = resp->fh.fh_dentry; + + *p++ = resp->status; + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { + struct inode *inode = d_inode(dentry); + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + int w; + + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + while (w > 0) { + if (!*(rqstp->rq_next_page++)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + } else + if (!xdr_ressize_check(rqstp, p)) + return 0; + + return 1; +} + +/* SETACL */ +static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static void nfs3svc_release_getacl(struct svc_rqst *rqstp) +{ + struct nfsd3_getaclres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); +} + +struct nfsd3_voidargs { int dummy; }; + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static const struct svc_procedure nfsd_acl_procedures3[3] = { + [ACLPROC3_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_decode = nfs3svc_decode_voidarg, + .pc_encode = nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC3_GETACL] = { + .pc_func = nfsd3_proc_getacl, + .pc_decode = nfs3svc_decode_getaclargs, + .pc_encode = nfs3svc_encode_getaclres, + .pc_release = nfs3svc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC3_SETACL] = { + .pc_func = nfsd3_proc_setacl, + .pc_decode = nfs3svc_decode_setaclargs, + .pc_encode = nfs3svc_encode_setaclres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd3_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT, + }, +}; + +static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; +const struct svc_version nfsd_acl_version3 = { + .vs_vers = 3, + .vs_nproc = 3, + .vs_proc = nfsd_acl_procedures3, + .vs_count = nfsd_acl_count3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; + diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c new file mode 100644 index 000000000..981a4e4c9 --- /dev/null +++ b/fs/nfsd/nfs3proc.c @@ -0,0 +1,935 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Process version 3 NFS requests. + * + * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/fs.h> +#include <linux/ext2_fs.h> +#include <linux/magic.h> + +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static int nfs3_ftypes[] = { + 0, /* NF3NON */ + S_IFREG, /* NF3REG */ + S_IFDIR, /* NF3DIR */ + S_IFBLK, /* NF3BLK */ + S_IFCHR, /* NF3CHR */ + S_IFLNK, /* NF3LNK */ + S_IFSOCK, /* NF3SOCK */ + S_IFIFO, /* NF3FIFO */ +}; + +/* + * NULL call. + */ +static __be32 +nfsd3_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Get a file's attributes + */ +static __be32 +nfsd3_proc_getattr(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: GETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * Set a file's attributes + */ +static __be32 +nfsd3_proc_setattr(struct svc_rqst *rqstp) +{ + struct nfsd3_sattrargs *argp = rqstp->rq_argp; + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: SETATTR(3) %s\n", + SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, + argp->check_guard, argp->guardtime); + return rpc_success; +} + +/* + * Look up a path name component + */ +static __be32 +nfsd3_proc_lookup(struct svc_rqst *rqstp) +{ + struct nfsd3_diropargs *argp = rqstp->rq_argp; + struct nfsd3_diropres *resp = rqstp->rq_resp; + + dprintk("nfsd: LOOKUP(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + resp->status = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, argp->len, + &resp->fh); + return rpc_success; +} + +/* + * Check file access + */ +static __be32 +nfsd3_proc_access(struct svc_rqst *rqstp) +{ + struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct nfsd3_accessres *resp = rqstp->rq_resp; + + dprintk("nfsd: ACCESS(3) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + return rpc_success; +} + +/* + * Read a symlink. + */ +static __be32 +nfsd3_proc_readlink(struct svc_rqst *rqstp) +{ + struct nfsd3_readlinkargs *argp = rqstp->rq_argp; + struct nfsd3_readlinkres *resp = rqstp->rq_resp; + + dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + fh_copy(&resp->fh, &argp->fh); + resp->len = NFS3_MAXPATHLEN; + resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + return rpc_success; +} + +/* + * Read a portion of a file. + */ +static __be32 +nfsd3_proc_read(struct svc_rqst *rqstp) +{ + struct nfsd3_readargs *argp = rqstp->rq_argp; + struct nfsd3_readres *resp = rqstp->rq_resp; + u32 max_blocksize = svc_max_payload(rqstp); + unsigned long cnt = min(argp->count, max_blocksize); + + dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", + SVCFH_fmt(&argp->fh), + (unsigned long) argp->count, + (unsigned long long) argp->offset); + + /* Obtain buffer pointer for payload. + * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) + * + 1 (xdr opaque byte count) = 26 + */ + resp->count = cnt; + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, argp->vlen, &resp->count, + &resp->eof); + return rpc_success; +} + +/* + * Write data to a file + */ +static __be32 +nfsd3_proc_write(struct svc_rqst *rqstp) +{ + struct nfsd3_writeargs *argp = rqstp->rq_argp; + struct nfsd3_writeres *resp = rqstp->rq_resp; + unsigned long cnt = argp->len; + unsigned int nvecs; + + dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", + SVCFH_fmt(&argp->fh), + argp->len, + (unsigned long long) argp->offset, + argp->stable? " stable" : ""); + + resp->status = nfserr_fbig; + if (argp->offset > (u64)OFFSET_MAX || + argp->offset + argp->len > (u64)OFFSET_MAX) + return rpc_success; + + fh_copy(&resp->fh, &argp->fh); + resp->committed = argp->stable; + nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, + &argp->first, cnt); + if (!nvecs) { + resp->status = nfserr_io; + goto out; + } + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, nvecs, &cnt, + resp->committed, resp->verf); + resp->count = cnt; +out: + return rpc_success; +} + +/* + * With NFSv3, CREATE processing is a lot easier than with NFSv2. + * At least in theory; we'll see how it fares in practice when the + * first reports about SunOS compatibility problems start to pour in... + */ +static __be32 +nfsd3_proc_create(struct svc_rqst *rqstp) +{ + struct nfsd3_createargs *argp = rqstp->rq_argp; + struct nfsd3_diropres *resp = rqstp->rq_resp; + svc_fh *dirfhp, *newfhp = NULL; + struct iattr *attr; + + dprintk("nfsd: CREATE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + dirfhp = fh_copy(&resp->dirfh, &argp->fh); + newfhp = fh_init(&resp->fh, NFS3_FHSIZE); + attr = &argp->attrs; + + /* Unfudge the mode bits */ + attr->ia_mode &= ~S_IFMT; + if (!(attr->ia_valid & ATTR_MODE)) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = S_IFREG; + } else { + attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; + } + + /* Now create the file and set attributes */ + resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, + attr, newfhp, argp->createmode, + (u32 *)argp->verf, NULL, NULL); + return rpc_success; +} + +/* + * Make directory. This operation is not idempotent. + */ +static __be32 +nfsd3_proc_mkdir(struct svc_rqst *rqstp) +{ + struct nfsd3_createargs *argp = rqstp->rq_argp; + struct nfsd3_diropres *resp = rqstp->rq_resp; + + dprintk("nfsd: MKDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_unlock(&resp->dirfh); + return rpc_success; +} + +static __be32 +nfsd3_proc_symlink(struct svc_rqst *rqstp) +{ + struct nfsd3_symlinkargs *argp = rqstp->rq_argp; + struct nfsd3_diropres *resp = rqstp->rq_resp; + + if (argp->tlen == 0) { + resp->status = nfserr_inval; + goto out; + } + if (argp->tlen > NFS3_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + page_address(rqstp->rq_arg.pages[0]), + argp->tlen); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } + + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), + argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_copy(&resp->dirfh, &argp->ffh); + fh_init(&resp->fh, NFS3_FHSIZE); + resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, + argp->flen, argp->tname, &resp->fh); + kfree(argp->tname); +out: + return rpc_success; +} + +/* + * Make socket/fifo/device. + */ +static __be32 +nfsd3_proc_mknod(struct svc_rqst *rqstp) +{ + struct nfsd3_mknodargs *argp = rqstp->rq_argp; + struct nfsd3_diropres *resp = rqstp->rq_resp; + int type; + dev_t rdev = 0; + + dprintk("nfsd: MKNOD(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->dirfh, &argp->fh); + fh_init(&resp->fh, NFS3_FHSIZE); + + if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { + rdev = MKDEV(argp->major, argp->minor); + if (MAJOR(rdev) != argp->major || + MINOR(rdev) != argp->minor) { + resp->status = nfserr_inval; + goto out; + } + } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { + resp->status = nfserr_badtype; + goto out; + } + + type = nfs3_ftypes[argp->ftype]; + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &argp->attrs, type, rdev, &resp->fh); + fh_unlock(&resp->dirfh); +out: + return rpc_success; +} + +/* + * Remove file/fifo/socket etc. + */ +static __be32 +nfsd3_proc_remove(struct svc_rqst *rqstp) +{ + struct nfsd3_diropargs *argp = rqstp->rq_argp; + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: REMOVE(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + /* Unlink. -S_IFDIR means file must not be a directory */ + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, + argp->name, argp->len); + fh_unlock(&resp->fh); + return rpc_success; +} + +/* + * Remove a directory + */ +static __be32 +nfsd3_proc_rmdir(struct svc_rqst *rqstp) +{ + struct nfsd3_diropargs *argp = rqstp->rq_argp; + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: RMDIR(3) %s %.*s\n", + SVCFH_fmt(&argp->fh), + argp->len, + argp->name); + + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, + argp->name, argp->len); + fh_unlock(&resp->fh); + return rpc_success; +} + +static __be32 +nfsd3_proc_rename(struct svc_rqst *rqstp) +{ + struct nfsd3_renameargs *argp = rqstp->rq_argp; + struct nfsd3_renameres *resp = rqstp->rq_resp; + + dprintk("nfsd: RENAME(3) %s %.*s ->\n", + SVCFH_fmt(&argp->ffh), + argp->flen, + argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->ffh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + return rpc_success; +} + +static __be32 +nfsd3_proc_link(struct svc_rqst *rqstp) +{ + struct nfsd3_linkargs *argp = rqstp->rq_argp; + struct nfsd3_linkres *resp = rqstp->rq_resp; + + dprintk("nfsd: LINK(3) %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + fh_copy(&resp->fh, &argp->ffh); + fh_copy(&resp->tfh, &argp->tfh); + resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + return rpc_success; +} + +/* + * Read a portion of a directory. + */ +static __be32 +nfsd3_proc_readdir(struct svc_rqst *rqstp) +{ + struct nfsd3_readdirargs *argp = rqstp->rq_argp; + struct nfsd3_readdirres *resp = rqstp->rq_resp; + int count = 0; + struct page **p; + caddr_t page_addr = NULL; + + dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, (u32) argp->cookie); + + /* Make sure we've room for the NULL ptr & eof flag, and shrink to + * client read size */ + count = (argp->count >> 2) - 2; + + /* Read directory and encode entries on the fly */ + fh_copy(&resp->fh, &argp->fh); + + resp->buflen = count; + resp->common.err = nfs_ok; + resp->buffer = argp->buffer; + resp->rqstp = rqstp; + resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie, + &resp->common, nfs3svc_encode_entry); + memcpy(resp->verf, argp->verf, 8); + count = 0; + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } + + return rpc_success; +} + +/* + * Read a portion of a directory, including file handles and attrs. + * For now, we choose to ignore the dircount parameter. + */ +static __be32 +nfsd3_proc_readdirplus(struct svc_rqst *rqstp) +{ + struct nfsd3_readdirargs *argp = rqstp->rq_argp; + struct nfsd3_readdirres *resp = rqstp->rq_resp; + int count = 0; + loff_t offset; + struct page **p; + caddr_t page_addr = NULL; + + dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, (u32) argp->cookie); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->count = (argp->count >> 2) - 2; + + /* Read directory and encode entries on the fly */ + fh_copy(&resp->fh, &argp->fh); + + resp->common.err = nfs_ok; + resp->buffer = argp->buffer; + resp->buflen = resp->count; + resp->rqstp = rqstp; + offset = argp->cookie; + + resp->status = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) { + resp->status = nfserr_notsupp; + goto out; + } + + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, + &resp->common, nfs3svc_encode_entry_plus); + memcpy(resp->verf, argp->verf, 8); + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; + if (resp->offset) { + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } + +out: + return rpc_success; +} + +/* + * Get file system stats + */ +static __be32 +nfsd3_proc_fsstat(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd3_fsstatres *resp = rqstp->rq_resp; + + dprintk("nfsd: FSSTAT(3) %s\n", + SVCFH_fmt(&argp->fh)); + + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Get file system info + */ +static __be32 +nfsd3_proc_fsinfo(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd3_fsinfores *resp = rqstp->rq_resp; + u32 max_blocksize = svc_max_payload(rqstp); + + dprintk("nfsd: FSINFO(3) %s\n", + SVCFH_fmt(&argp->fh)); + + resp->f_rtmax = max_blocksize; + resp->f_rtpref = max_blocksize; + resp->f_rtmult = PAGE_SIZE; + resp->f_wtmax = max_blocksize; + resp->f_wtpref = max_blocksize; + resp->f_wtmult = PAGE_SIZE; + resp->f_dtpref = max_blocksize; + resp->f_maxfilesize = ~(u32) 0; + resp->f_properties = NFS3_FSF_DEFAULT; + + resp->status = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + + /* Check special features of the file system. May request + * different read/write sizes for file systems known to have + * problems with large blocks */ + if (resp->status == nfs_ok) { + struct super_block *sb = argp->fh.fh_dentry->d_sb; + + /* Note that we don't care for remote fs's here */ + if (sb->s_magic == MSDOS_SUPER_MAGIC) { + resp->f_properties = NFS3_FSF_BILLYBOY; + } + resp->f_maxfilesize = sb->s_maxbytes; + } + + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Get pathconf info for the specified file + */ +static __be32 +nfsd3_proc_pathconf(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd3_pathconfres *resp = rqstp->rq_resp; + + dprintk("nfsd: PATHCONF(3) %s\n", + SVCFH_fmt(&argp->fh)); + + /* Set default pathconf */ + resp->p_link_max = 255; /* at least */ + resp->p_name_max = 255; /* at least */ + resp->p_no_trunc = 0; + resp->p_chown_restricted = 1; + resp->p_case_insensitive = 0; + resp->p_case_preserving = 1; + + resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + + if (resp->status == nfs_ok) { + struct super_block *sb = argp->fh.fh_dentry->d_sb; + + /* Note that we don't care for remote fs's here */ + switch (sb->s_magic) { + case EXT2_SUPER_MAGIC: + resp->p_link_max = EXT2_LINK_MAX; + resp->p_name_max = EXT2_NAME_LEN; + break; + case MSDOS_SUPER_MAGIC: + resp->p_case_insensitive = 1; + resp->p_case_preserving = 0; + break; + } + } + + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Commit a file (range) to stable storage. + */ +static __be32 +nfsd3_proc_commit(struct svc_rqst *rqstp) +{ + struct nfsd3_commitargs *argp = rqstp->rq_argp; + struct nfsd3_commitres *resp = rqstp->rq_resp; + + dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", + SVCFH_fmt(&argp->fh), + argp->count, + (unsigned long long) argp->offset); + + if (argp->offset > NFS_OFFSET_MAX) { + resp->status = nfserr_inval; + goto out; + } + + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset, + argp->count, resp->verf); +out: + return rpc_success; +} + + +/* + * NFSv3 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle +#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat +#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat +#define nfsd3_mkdirargs nfsd3_createargs +#define nfsd3_readdirplusargs nfsd3_readdirargs +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_fhandleres nfsd3_attrstat +#define nfsd3_attrstatres nfsd3_attrstat +#define nfsd3_wccstatres nfsd3_attrstat +#define nfsd3_createres nfsd3_diropres +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define ST 1 /* status*/ +#define FH 17 /* filehandle with length */ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define WC (7+pAT) /* WCC attributes */ + +static const struct svc_procedure nfsd_procedures3[22] = { + [NFS3PROC_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_decode = nfs3svc_decode_voidarg, + .pc_encode = nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [NFS3PROC_GETATTR] = { + .pc_func = nfsd3_proc_getattr, + .pc_decode = nfs3svc_decode_fhandleargs, + .pc_encode = nfs3svc_encode_attrstatres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_attrstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFS3PROC_SETATTR] = { + .pc_func = nfsd3_proc_setattr, + .pc_decode = nfs3svc_decode_sattrargs, + .pc_encode = nfs3svc_encode_wccstatres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_LOOKUP] = { + .pc_func = nfsd3_proc_lookup, + .pc_decode = nfs3svc_decode_diropargs, + .pc_encode = nfs3svc_encode_diropres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+pAT+pAT, + }, + [NFS3PROC_ACCESS] = { + .pc_func = nfsd3_proc_access, + .pc_decode = nfs3svc_decode_accessargs, + .pc_encode = nfs3svc_encode_accessres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1, + }, + [NFS3PROC_READLINK] = { + .pc_func = nfsd3_proc_readlink, + .pc_decode = nfs3svc_decode_readlinkargs, + .pc_encode = nfs3svc_encode_readlinkres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_ressize = sizeof(struct nfsd3_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + }, + [NFS3PROC_READ] = { + .pc_func = nfsd3_proc_read, + .pc_decode = nfs3svc_decode_readargs, + .pc_encode = nfs3svc_encode_readres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_ressize = sizeof(struct nfsd3_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + }, + [NFS3PROC_WRITE] = { + .pc_func = nfsd3_proc_write, + .pc_decode = nfs3svc_decode_writeargs, + .pc_encode = nfs3svc_encode_writeres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_ressize = sizeof(struct nfsd3_writeres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+4, + }, + [NFS3PROC_CREATE] = { + .pc_func = nfsd3_proc_create, + .pc_decode = nfs3svc_decode_createargs, + .pc_encode = nfs3svc_encode_createres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKDIR] = { + .pc_func = nfsd3_proc_mkdir, + .pc_decode = nfs3svc_decode_mkdirargs, + .pc_encode = nfs3svc_encode_createres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_SYMLINK] = { + .pc_func = nfsd3_proc_symlink, + .pc_decode = nfs3svc_decode_symlinkargs, + .pc_encode = nfs3svc_encode_createres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_MKNOD] = { + .pc_func = nfsd3_proc_mknod, + .pc_decode = nfs3svc_decode_mknodargs, + .pc_encode = nfs3svc_encode_createres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_ressize = sizeof(struct nfsd3_createres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+(1+FH+pAT)+WC, + }, + [NFS3PROC_REMOVE] = { + .pc_func = nfsd3_proc_remove, + .pc_decode = nfs3svc_decode_diropargs, + .pc_encode = nfs3svc_encode_wccstatres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RMDIR] = { + .pc_func = nfsd3_proc_rmdir, + .pc_decode = nfs3svc_decode_diropargs, + .pc_encode = nfs3svc_encode_wccstatres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_ressize = sizeof(struct nfsd3_wccstatres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC, + }, + [NFS3PROC_RENAME] = { + .pc_func = nfsd3_proc_rename, + .pc_decode = nfs3svc_decode_renameargs, + .pc_encode = nfs3svc_encode_renameres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_ressize = sizeof(struct nfsd3_renameres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+WC+WC, + }, + [NFS3PROC_LINK] = { + .pc_func = nfsd3_proc_link, + .pc_decode = nfs3svc_decode_linkargs, + .pc_encode = nfs3svc_encode_linkres, + .pc_release = nfs3svc_release_fhandle2, + .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_ressize = sizeof(struct nfsd3_linkres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+pAT+WC, + }, + [NFS3PROC_READDIR] = { + .pc_func = nfsd3_proc_readdir, + .pc_decode = nfs3svc_decode_readdirargs, + .pc_encode = nfs3svc_encode_readdirres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_READDIRPLUS] = { + .pc_func = nfsd3_proc_readdirplus, + .pc_decode = nfs3svc_decode_readdirplusargs, + .pc_encode = nfs3svc_encode_readdirres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_ressize = sizeof(struct nfsd3_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFS3PROC_FSSTAT] = { + .pc_func = nfsd3_proc_fsstat, + .pc_decode = nfs3svc_decode_fhandleargs, + .pc_encode = nfs3svc_encode_fsstatres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsstatres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+2*6+1, + }, + [NFS3PROC_FSINFO] = { + .pc_func = nfsd3_proc_fsinfo, + .pc_decode = nfs3svc_decode_fhandleargs, + .pc_encode = nfs3svc_encode_fsinfores, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_fsinfores), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+12, + }, + [NFS3PROC_PATHCONF] = { + .pc_func = nfsd3_proc_pathconf, + .pc_decode = nfs3svc_decode_fhandleargs, + .pc_encode = nfs3svc_encode_pathconfres, + .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_ressize = sizeof(struct nfsd3_pathconfres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT+6, + }, + [NFS3PROC_COMMIT] = { + .pc_func = nfsd3_proc_commit, + .pc_decode = nfs3svc_decode_commitargs, + .pc_encode = nfs3svc_encode_commitres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_ressize = sizeof(struct nfsd3_commitres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+WC+2, + }, +}; + +static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]; +const struct svc_version nfsd_version3 = { + .vs_vers = 3, + .vs_nproc = 22, + .vs_proc = nfsd_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_count = nfsd_count3, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c new file mode 100644 index 000000000..716566da4 --- /dev/null +++ b/fs/nfsd/nfs3xdr.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XDR support for nfsd/protocol version 3. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + * + * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! + */ + +#include <linux/namei.h> +#include <linux/sunrpc/svc_xprt.h> +#include "xdr3.h" +#include "auth.h" +#include "netns.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs3_ftypes[] = { + NF3NON, NF3FIFO, NF3CHR, NF3BAD, + NF3DIR, NF3BAD, NF3BLK, NF3BAD, + NF3REG, NF3BAD, NF3LNK, NF3BAD, + NF3SOCK, NF3BAD, NF3LNK, NF3BAD, +}; + + +/* + * XDR functions for basic NFS types + */ +static __be32 * +encode_time3(__be32 *p, struct timespec64 *time) +{ + *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + return p; +} + +static __be32 * +decode_time3(__be32 *p, struct timespec64 *time) +{ + time->tv_sec = ntohl(*p++); + time->tv_nsec = ntohl(*p++); + return p; +} + +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size; + fh_init(fhp, NFS3_FHSIZE); + size = ntohl(*p++); + if (size > NFS3_FHSIZE) + return NULL; + + memcpy(&fhp->fh_handle.fh_base, p, size); + fhp->fh_handle.fh_size = size; + return p + XDR_QUADLEN(size); +} + +/* Helper function for NFSv3 ACL code */ +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + unsigned int size = fhp->fh_handle.fh_size; + *p++ = htonl(size); + if (size) p[XDR_QUADLEN(size)-1]=0; + memcpy(p, &fhp->fh_handle.fh_base, size); + return p + XDR_QUADLEN(size); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) +{ + u32 tmp; + + iap->ia_valid = 0; + + if (*p++) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = ntohl(*p++); + } + if (*p++) { + iap->ia_uid = make_kuid(userns, ntohl(*p++)); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + if (*p++) { + iap->ia_gid = make_kgid(userns, ntohl(*p++)); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + if (*p++) { + u64 newsize; + + iap->ia_valid |= ATTR_SIZE; + p = xdr_decode_hyper(p, &newsize); + iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_ATIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = ntohl(*p++); + iap->ia_atime.tv_nsec = ntohl(*p++); + } + if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + iap->ia_valid |= ATTR_MTIME; + } else if (tmp == 2) { /* set to client time */ + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = ntohl(*p++); + iap->ia_mtime.tv_nsec = ntohl(*p++); + } + return p; +} + +static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +{ + u64 f; + switch(fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + p = xdr_encode_hyper(p, (u64)huge_encode_dev + (fhp->fh_dentry->d_sb->s_dev)); + break; + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u64*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; + p = xdr_encode_hyper(p, f); + break; + } + return p; +} + +static __be32 * +encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + struct user_namespace *userns = nfsd_user_namespace(rqstp); + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) (stat->mode & S_IALLUGO)); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); + *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { + p = xdr_encode_hyper(p, (u64) stat->size); + } + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); + p = encode_fsid(p, fhp); + p = xdr_encode_hyper(p, stat->ino); + p = encode_time3(p, &stat->atime); + p = encode_time3(p, &stat->mtime); + p = encode_time3(p, &stat->ctime); + + return p; +} + +static __be32 * +encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + /* Attributes to follow */ + *p++ = xdr_one; + return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); +} + +/* + * Encode post-operation attributes. + * The inode may be NULL if the call failed because of a stale file + * handle. In this case, no attributes are returned. + */ +static __be32 * +encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + if (dentry && d_really_is_positive(dentry)) { + __be32 err; + struct kstat stat; + + err = fh_getattr(fhp, &stat); + if (!err) { + *p++ = xdr_one; /* attributes follow */ + lease_get_mtime(d_inode(dentry), &stat.mtime); + return encode_fattr3(rqstp, p, fhp, &stat); + } + } + *p++ = xdr_zero; + return p; +} + +/* Helper for NFSv3 ACLs */ +__be32 * +nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Enocde weak cache consistency data + */ +static __be32 * +encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +{ + struct dentry *dentry = fhp->fh_dentry; + + if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { + if (fhp->fh_pre_saved) { + *p++ = xdr_one; + p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); + p = encode_time3(p, &fhp->fh_pre_mtime); + p = encode_time3(p, &fhp->fh_pre_ctime); + } else { + *p++ = xdr_zero; + } + return encode_saved_post_attr(rqstp, p, fhp); + } + /* no pre- or post-attrs */ + *p++ = xdr_zero; + return encode_post_op_attr(rqstp, p, fhp); +} + +/* + * Fill in the pre_op attr for the wcc data + */ +void fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + struct kstat stat; + __be32 err; + + if (fhp->fh_pre_saved) + return; + + inode = d_inode(fhp->fh_dentry); + err = fh_getattr(fhp, &stat); + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + fhp->fh_pre_saved = true; +} + +/* + * Fill in the post_op attr for the wcc data + */ +void fill_post_wcc(struct svc_fh *fhp) +{ + __be32 err; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, + d_inode(fhp->fh_dentry)); + if (err) { + fhp->fh_post_saved = false; + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; + } else + fhp->fh_post_saved = true; +} + +/* + * XDR decode functions + */ +int +nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +int +nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_fhandle *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_sattrargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); + + if ((args->check_guard = ntohl(*p++)) != 0) { + struct timespec64 time; + p = decode_time3(p, &time); + args->guardtime = time.tv_sec; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_diropargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_accessargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readargs *args = rqstp->rq_argp; + unsigned int len; + int v; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + len = min(args->count, max_blocksize); + + /* set up the kvec */ + v=0; + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_writeargs *args = rqstp->rq_argp; + unsigned int len, hdr, dlen; + u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + + args->count = ntohl(*p++); + args->stable = ntohl(*p++); + len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; + /* + * The count must equal the amount of data passed. + */ + if (args->count != args->len) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. + */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + if (args->count > max_blocksize) { + args->count = max_blocksize; + len = args->len = max_blocksize; + } + + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; + return 1; +} + +int +nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_createargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + switch (args->createmode = ntohl(*p++)) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); + break; + case NFS3_CREATE_EXCLUSIVE: + args->verf = p; + p += 2; + break; + default: + return 0; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_createargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->fh)) || + !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_symlinkargs *args = rqstp->rq_argp; + char *base = (char *)p; + size_t dlen; + + if (!(p = decode_fh(p, &args->ffh)) || + !(p = decode_filename(p, &args->fname, &args->flen))) + return 0; + p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); + + args->tlen = ntohl(*p++); + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + dlen = args->first.iov_len + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + if (dlen < XDR_QUADLEN(args->tlen) << 2) + return 0; + return 1; +} + +int +nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_mknodargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + args->ftype = ntohl(*p++); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR + || args->ftype == NF3SOCK || args->ftype == NF3FIFO) + p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); + + if (args->ftype == NF3BLK || args->ftype == NF3CHR) { + args->major = ntohl(*p++); + args->minor = ntohl(*p++); + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_renameargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readlinkargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_linkargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readdirargs *args = rqstp->rq_argp; + int len; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ~0; + args->count = ntohl(*p++); + len = args->count = min_t(u32, args->count, max_blocksize); + + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readdirargs *args = rqstp->rq_argp; + int len; + u32 max_blocksize = svc_max_payload(rqstp); + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->cookie); + args->verf = p; p += 2; + args->dircount = ntohl(*p++); + args->count = ntohl(*p++); + + len = args->count = min(args->count, max_blocksize); + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } + + return xdr_argsize_check(rqstp, p); +} + +int +nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_commitargs *args = rqstp->rq_argp; + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = xdr_decode_hyper(p, &args->offset); + args->count = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +int +nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + +/* GETATTR */ +int +nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status == 0) { + lease_get_mtime(d_inode(resp->fh.fh_dentry), + &resp->stat.mtime); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + } + return xdr_ressize_check(rqstp, p); +} + +/* SETATTR, REMOVE, RMDIR */ +int +nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_wcc_data(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* LOOKUP */ +int +nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_diropres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status == 0) { + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_post_op_attr(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +int +nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_accessres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* READLINK */ +int +nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readlinkres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* READ */ +int +nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->eof); + *p++ = htonl(resp->count); /* xdr opaque count */ + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +/* WRITE */ +int +nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_writeres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_wcc_data(rqstp, p, &resp->fh); + if (resp->status == 0) { + *p++ = htonl(resp->count); + *p++ = htonl(resp->committed); + *p++ = resp->verf[0]; + *p++ = resp->verf[1]; + } + return xdr_ressize_check(rqstp, p); +} + +/* CREATE, MKDIR, SYMLINK, MKNOD */ +int +nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_diropres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status == 0) { + *p++ = xdr_one; + p = encode_fh(p, &resp->fh); + p = encode_post_op_attr(rqstp, p, &resp->fh); + } + p = encode_wcc_data(rqstp, p, &resp->dirfh); + return xdr_ressize_check(rqstp, p); +} + +/* RENAME */ +int +nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_renameres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_wcc_data(rqstp, p, &resp->ffh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* LINK */ +int +nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_linkres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_post_op_attr(rqstp, p, &resp->fh); + p = encode_wcc_data(rqstp, p, &resp->tfh); + return xdr_ressize_check(rqstp, p); +} + +/* READDIR */ +int +nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_readdirres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_post_op_attr(rqstp, p, &resp->fh); + + if (resp->status == 0) { + /* stupid readdir cookie */ + memcpy(p, resp->verf, 8); p += 2; + xdr_ressize_check(rqstp, p); + if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) + return 1; /*No room for trailer */ + rqstp->rq_res.page_len = (resp->count) << 2; + + /* add the 'tail' to the end of the 'head' page - page 0. */ + rqstp->rq_res.tail[0].iov_base = p; + *p++ = 0; /* no more entries */ + *p++ = htonl(resp->common.err == nfserr_eof); + rqstp->rq_res.tail[0].iov_len = 2<<2; + return 1; + } else + return xdr_ressize_check(rqstp, p); +} + +static __be32 * +encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, + int namlen, u64 ino) +{ + *p++ = xdr_one; /* mark entry present */ + p = xdr_encode_hyper(p, ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + + cd->offset = p; /* remember pointer */ + p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + + return p; +} + +static __be32 +compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, + const char *name, int namlen, u64 ino) +{ + struct svc_export *exp; + struct dentry *dparent, *dchild; + __be32 rv = nfserr_noent; + + dparent = cd->fh.fh_dentry; + exp = cd->fh.fh_export; + + if (isdotent(name, namlen)) { + if (namlen == 2) { + dchild = dget_parent(dparent); + /* + * Don't return filehandle for ".." if we're at + * the filesystem or export root: + */ + if (dchild == dparent) + goto out; + if (dparent == exp->ex_path.dentry) + goto out; + } else + dchild = dget(dparent); + } else + dchild = lookup_positive_unlocked(name, dparent, namlen); + if (IS_ERR(dchild)) + return rv; + if (d_mountpoint(dchild)) + goto out; + if (dchild->d_inode->i_ino != ino) + goto out; + rv = fh_compose(fhp, exp, dchild, &cd->fh); +out: + dput(dchild); + return rv; +} + +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) +{ + struct svc_fh *fh = &cd->scratch; + __be32 err; + + fh_init(fh, NFS3_FHSIZE); + err = compose_entry_fh(cd, fh, name, namlen, ino); + if (err) { + *p++ = 0; + *p++ = 0; + goto out; + } + p = encode_post_op_attr(cd->rqstp, p, fh); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, fh); +out: + fh_put(fh); + return p; +} + +/* + * Encode a directory entry. This one works for both normal readdir + * and readdirplus. + * The normal readdir reply requires 2 (fileid) + 1 (stringlen) + * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. + * + * The readdirplus baggage is 1+21 words for post_op_attr, plus the + * file handle. + */ + +#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) +#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) +static int +encode_entry(struct readdir_cd *ccd, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type, int plus) +{ + struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, + common); + __be32 *p = cd->buffer; + caddr_t curr_page_addr = NULL; + struct page ** page; + int slen; /* string (name) length */ + int elen; /* estimated entry length in words */ + int num_entry_words = 0; /* actual number of words */ + + if (cd->offset) { + u64 offset64 = offset; + + if (unlikely(cd->offset1)) { + /* we ended up with offset on a page boundary */ + *cd->offset = htonl(offset64 >> 32); + *cd->offset1 = htonl(offset64 & 0xffffffff); + cd->offset1 = NULL; + } else { + xdr_encode_hyper(cd->offset, offset64); + } + cd->offset = NULL; + } + + /* + dprintk("encode_entry(%.*s @%ld%s)\n", + namlen, name, (long) offset, plus? " plus" : ""); + */ + + /* truncate filename if too long */ + namlen = min(namlen, NFS3_MAXNAMLEN); + + slen = XDR_QUADLEN(namlen); + elen = slen + NFS3_ENTRY_BAGGAGE + + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + + if (cd->buflen < elen) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + /* determine which page in rq_respages[] we are currently filling */ + for (page = cd->rqstp->rq_respages + 1; + page < cd->rqstp->rq_next_page; page++) { + curr_page_addr = page_address(*page); + + if (((caddr_t)cd->buffer >= curr_page_addr) && + ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) + break; + } + + if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { + /* encode entry in current page */ + + p = encode_entry_baggage(cd, p, name, namlen, ino); + + if (plus) + p = encode_entryplus_baggage(cd, p, name, namlen, ino); + num_entry_words = p - cd->buffer; + } else if (*(page+1) != NULL) { + /* temporarily encode entry into next page, then move back to + * current and next page in rq_respages[] */ + __be32 *p1, *tmp; + int len1, len2; + + /* grab next page for temporary storage of entry */ + p1 = tmp = page_address(*(page+1)); + + p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + + if (plus) + p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); + + /* determine entry word length and lengths to go in pages */ + num_entry_words = p1 - tmp; + len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; + if ((num_entry_words << 2) < len1) { + /* the actual number of words in the entry is less + * than elen and can still fit in the current page + */ + memmove(p, tmp, num_entry_words << 2); + p += num_entry_words; + + /* update offset */ + cd->offset = cd->buffer + (cd->offset - tmp); + } else { + unsigned int offset_r = (cd->offset - tmp) << 2; + + /* update pointer to offset location. + * This is a 64bit quantity, so we need to + * deal with 3 cases: + * - entirely in first page + * - entirely in second page + * - 4 bytes in each page + */ + if (offset_r + 8 <= len1) { + cd->offset = p + (cd->offset - tmp); + } else if (offset_r >= len1) { + cd->offset -= len1 >> 2; + } else { + /* sitting on the fence */ + BUG_ON(offset_r != len1 - 4); + cd->offset = p + (cd->offset - tmp); + cd->offset1 = tmp; + } + + len2 = (num_entry_words << 2) - len1; + + /* move from temp page to current and next pages */ + memmove(p, tmp, len1); + memmove(tmp, (caddr_t)tmp+len1, len2); + + p = tmp + (len2 >> 2); + } + } + else { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + + cd->buflen -= num_entry_words; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; + +} + +int +nfs3svc_encode_entry(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 0); +} + +int +nfs3svc_encode_entry_plus(void *cd, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + return encode_entry(cd, name, namlen, offset, ino, d_type, 1); +} + +/* FSSTAT */ +int +nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_fsstatres *resp = rqstp->rq_resp; + struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + + *p++ = resp->status; + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p++ = htonl(resp->invarsec); /* mean unchanged time */ + } + return xdr_ressize_check(rqstp, p); +} + +/* FSINFO */ +int +nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_fsinfores *resp = rqstp->rq_resp; + + *p++ = resp->status; + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->f_rtmax); + *p++ = htonl(resp->f_rtpref); + *p++ = htonl(resp->f_rtmult); + *p++ = htonl(resp->f_wtmax); + *p++ = htonl(resp->f_wtpref); + *p++ = htonl(resp->f_wtmult); + *p++ = htonl(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + *p++ = xdr_one; + *p++ = xdr_zero; + *p++ = htonl(resp->f_properties); + } + + return xdr_ressize_check(rqstp, p); +} + +/* PATHCONF */ +int +nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_pathconfres *resp = rqstp->rq_resp; + + *p++ = resp->status; + *p++ = xdr_zero; /* no post_op_attr */ + + if (resp->status == 0) { + *p++ = htonl(resp->p_link_max); + *p++ = htonl(resp->p_name_max); + *p++ = htonl(resp->p_no_trunc); + *p++ = htonl(resp->p_chown_restricted); + *p++ = htonl(resp->p_case_insensitive); + *p++ = htonl(resp->p_case_preserving); + } + + return xdr_ressize_check(rqstp, p); +} + +/* COMMIT */ +int +nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd3_commitres *resp = rqstp->rq_resp; + + *p++ = resp->status; + p = encode_wcc_data(rqstp, p, &resp->fh); + /* Write verifier */ + if (resp->status == 0) { + *p++ = resp->verf[0]; + *p++ = resp->verf[1]; + } + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +void +nfs3svc_release_fhandle(struct svc_rqst *rqstp) +{ + struct nfsd3_attrstat *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void +nfs3svc_release_fhandle2(struct svc_rqst *rqstp) +{ + struct nfsd3_fhandle_pair *resp = rqstp->rq_resp; + + fh_put(&resp->fh1); + fh_put(&resp->fh2); +} diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c new file mode 100644 index 000000000..71292a0d6 --- /dev/null +++ b/fs/nfsd/nfs4acl.c @@ -0,0 +1,884 @@ +/* + * Common NFSv4 ACL handling code. + * + * Copyright (c) 2002, 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * Jeff Sedlak <jsedlak@umich.edu> + * J. Bruce Fields <bfields@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/posix_acl.h> + +#include "nfsfh.h" +#include "nfsd.h" +#include "acl.h" +#include "vfs.h" + +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 + +/* mode bit translations: */ +#define NFS4_READ_MODE (NFS4_ACE_READ_DATA) +#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_APPEND_DATA) +#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE +#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) +#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) + +/* flags used to simulate posix default ACLs */ +#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ + | NFS4_ACE_DIRECTORY_INHERIT_ACE) + +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS \ + | NFS4_ACE_INHERIT_ONLY_ACE \ + | NFS4_ACE_IDENTIFIER_GROUP) + +static u32 +mask_from_posix(unsigned short perm, unsigned int flags) +{ + int mask = NFS4_ANYONE_MODE; + + if (flags & NFS4_ACL_OWNER) + mask |= NFS4_OWNER_MODE; + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +static u32 +deny_mask_from_posix(unsigned short perm, u32 flags) +{ + u32 mask = 0; + + if (perm & ACL_READ) + mask |= NFS4_READ_MODE; + if (perm & ACL_WRITE) + mask |= NFS4_WRITE_MODE; + if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) + mask |= NFS4_ACE_DELETE_CHILD; + if (perm & ACL_EXECUTE) + mask |= NFS4_EXECUTE_MODE; + return mask; +} + +/* XXX: modify functions to return NFS errors; they're only ever + * used by nfs code, after all.... */ + +/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the + * side of being more restrictive, so the mode bit mapping below is + * pessimistic. An optimistic version would be needed to handle DENY's, + * but we expect to coalesce all ALLOWs and DENYs before mapping to mode + * bits. */ + +static void +low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) +{ + u32 write_mode = NFS4_WRITE_MODE; + + if (flags & NFS4_ACL_DIR) + write_mode |= NFS4_ACE_DELETE_CHILD; + *mode = 0; + if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) + *mode |= ACL_READ; + if ((perm & write_mode) == write_mode) + *mode |= ACL_WRITE; + if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) + *mode |= ACL_EXECUTE; +} + +static short ace2type(struct nfs4_ace *); +static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, + unsigned int); + +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) +{ + struct inode *inode = d_inode(dentry); + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + int size = 0; + + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(dpacl)) { + error = PTR_ERR(dpacl); + goto rel_pacl; + } + + if (dpacl) + size += 2 * dpacl->a_count; + } + + *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } + (*acl)->naces = 0; + + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + + if (dpacl) + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); + +out: + posix_acl_release(dpacl); +rel_pacl: + posix_acl_release(pacl); + return error; +} + +struct posix_acl_summary { + unsigned short owner; + unsigned short users; + unsigned short group; + unsigned short groups; + unsigned short other; + unsigned short mask; +}; + +static void +summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) +{ + struct posix_acl_entry *pa, *pe; + + /* + * Only pas.users and pas.groups need initialization; previous + * posix_acl_valid() calls ensure that the other fields will be + * initialized in the following loop. But, just to placate gcc: + */ + memset(pas, 0, sizeof(*pas)); + pas->mask = 07; + + pe = acl->a_entries + acl->a_count; + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pas->owner = pa->e_perm; + break; + case ACL_GROUP_OBJ: + pas->group = pa->e_perm; + break; + case ACL_USER: + pas->users |= pa->e_perm; + break; + case ACL_GROUP: + pas->groups |= pa->e_perm; + break; + case ACL_OTHER: + pas->other = pa->e_perm; + break; + case ACL_MASK: + pas->mask = pa->e_perm; + break; + } + } + /* We'll only care about effective permissions: */ + pas->users &= pas->mask; + pas->group &= pas->mask; + pas->groups &= pas->mask; +} + +/* We assume the acl has been verified with posix_acl_valid. */ +static void +_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, + unsigned int flags) +{ + struct posix_acl_entry *pa, *group_owner_entry; + struct nfs4_ace *ace; + struct posix_acl_summary pas; + unsigned short deny; + int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? + NFS4_INHERITANCE_FLAGS | NFS4_ACE_INHERIT_ONLY_ACE : 0); + + BUG_ON(pacl->a_count < 3); + summarize_posix_acl(pacl, &pas); + + pa = pacl->a_entries; + ace = acl->aces + acl->naces; + + /* We could deny everything not granted by the owner: */ + deny = ~pas.owner; + /* + * but it is equivalent (and simpler) to deny only what is not + * granted by later entries: + */ + deny &= pas.users | pas.group | pas.groups | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + } + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); + ace->whotype = NFS4_ACL_WHO_OWNER; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_USER) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.groups | pas.group | pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_uid = pa->e_uid; + ace++; + acl->naces++; + } + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_uid = pa->e_uid; + ace++; + acl->naces++; + pa++; + } + + /* In the case of groups, we apply allow ACEs first, then deny ACEs, + * since a user can be in more than one group. */ + + /* allow ACEs */ + + group_owner_entry = pa; + + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pas.group, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + pa++; + + while (pa->e_tag == ACL_GROUP) { + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = mask_from_posix(pa->e_perm & pas.mask, + flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_gid = pa->e_gid; + ace++; + acl->naces++; + pa++; + } + + /* deny ACEs */ + + pa = group_owner_entry; + + deny = ~pas.group & pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_GROUP; + ace++; + acl->naces++; + } + pa++; + + while (pa->e_tag == ACL_GROUP) { + deny = ~(pa->e_perm & pas.mask); + deny &= pas.other; + if (deny) { + ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; + ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; + ace->access_mask = deny_mask_from_posix(deny, flags); + ace->whotype = NFS4_ACL_WHO_NAMED; + ace->who_gid = pa->e_gid; + ace++; + acl->naces++; + } + pa++; + } + + if (pa->e_tag == ACL_MASK) + pa++; + ace->type = NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE; + ace->flag = eflag; + ace->access_mask = mask_from_posix(pa->e_perm, flags); + ace->whotype = NFS4_ACL_WHO_EVERYONE; + acl->naces++; +} + +static bool +pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) +{ + if (pace1->e_tag != pace2->e_tag) + return pace1->e_tag > pace2->e_tag; + if (pace1->e_tag == ACL_USER) + return uid_gt(pace1->e_uid, pace2->e_uid); + if (pace1->e_tag == ACL_GROUP) + return gid_gt(pace1->e_gid, pace2->e_gid); + return false; +} + +static void +sort_pacl_range(struct posix_acl *pacl, int start, int end) { + int sorted = 0, i; + + /* We just do a bubble sort; easy to do in place, and we're not + * expecting acl's to be long enough to justify anything more. */ + while (!sorted) { + sorted = 1; + for (i = start; i < end; i++) { + if (pace_gt(&pacl->a_entries[i], + &pacl->a_entries[i+1])) { + sorted = 0; + swap(pacl->a_entries[i], + pacl->a_entries[i + 1]); + } + } + } +} + +static void +sort_pacl(struct posix_acl *pacl) +{ + /* posix_acl_valid requires that users and groups be in order + * by uid/gid. */ + int i, j; + + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + + i = 1; + while (pacl->a_entries[i].e_tag == ACL_USER) + i++; + sort_pacl_range(pacl, 1, i-1); + + BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); + j = ++i; + while (pacl->a_entries[j].e_tag == ACL_GROUP) + j++; + sort_pacl_range(pacl, i, j-1); + return; +} + +/* + * While processing the NFSv4 ACE, this maintains bitmasks representing + * which permission bits have been allowed and which denied to a given + * entity: */ +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + union { + kuid_t uid; + kgid_t gid; + }; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * While processing the NFSv4 ACE, this maintains the partial permissions + * calculated so far: */ + +struct posix_acl_state { + int empty; + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* Deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + +static int +init_state(struct posix_acl_state *state, int cnt) +{ + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + state->empty = 1; + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't know which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt*sizeof(struct posix_user_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; +} + +static void +free_state(struct posix_acl_state *state) { + kfree(state->users); + kfree(state->groups); +} + +static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate) +{ + state->mask.allow |= astate->allow; +} + +static struct posix_acl * +posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) +{ + struct posix_acl_entry *pace; + struct posix_acl *pacl; + int nace; + int i; + + /* + * ACLs with no ACEs are treated differently in the inheritable + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. + */ + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + + /* + * When there are no effective ACEs, the following will end + * up setting a 3-element effective posix ACL with all + * permissions zero. + */ + if (!state->users->n && !state->groups->n) + nace = 3; + else /* Note we also include a MASK ACE in this case: */ + nace = 4 + state->users->n + state->groups->n; + pacl = posix_acl_alloc(nace, GFP_KERNEL); + if (!pacl) + return ERR_PTR(-ENOMEM); + + pace = pacl->a_entries; + pace->e_tag = ACL_USER_OBJ; + low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); + + for (i=0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + low_mode_from_nfs4(state->users->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_uid = state->users->aces[i].uid; + add_to_mask(state, &state->users->aces[i].perms); + } + + pace++; + pace->e_tag = ACL_GROUP_OBJ; + low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); + add_to_mask(state, &state->group); + + for (i=0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + low_mode_from_nfs4(state->groups->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_gid = state->groups->aces[i].gid; + add_to_mask(state, &state->groups->aces[i].perms); + } + + if (state->users->n || state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + } + + pace++; + pace->e_tag = ACL_OTHER; + low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); + + return pacl; +} + +static inline void allow_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Allow all bits in the mask not already denied: */ + astate->allow |= mask & ~astate->deny; +} + +static inline void deny_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Deny all bits in the mask not already allowed: */ + astate->deny |= mask & ~astate->allow; +} + +static int find_uid(struct posix_acl_state *state, kuid_t uid) +{ + struct posix_ace_state_array *a = state->users; + int i; + + for (i = 0; i < a->n; i++) + if (uid_eq(a->aces[i].uid, uid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].uid = uid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static int find_gid(struct posix_acl_state *state, kgid_t gid) +{ + struct posix_ace_state_array *a = state->groups; + int i; + + for (i = 0; i < a->n; i++) + if (gid_eq(a->aces[i].gid, gid)) + return i; + /* Not found: */ + a->n++; + a->aces[i].gid = gid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; + + return i; +} + +static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + deny_bits(&a->aces[i].perms, mask); +} + +static void allow_bits_array(struct posix_ace_state_array *a, u32 mask) +{ + int i; + + for (i=0; i < a->n; i++) + allow_bits(&a->aces[i].perms, mask); +} + +static void process_one_v4_ace(struct posix_acl_state *state, + struct nfs4_ace *ace) +{ + u32 mask = ace->access_mask; + int i; + + state->empty = 0; + + switch (ace2type(ace)) { + case ACL_USER_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + } else { + deny_bits(&state->owner, mask); + } + break; + case ACL_USER: + i = find_uid(state, ace->who_uid); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->users->aces[i].perms, mask); + } else { + deny_bits(&state->users->aces[i].perms, mask); + mask = state->users->aces[i].perms.deny; + deny_bits(&state->owner, mask); + } + break; + case ACL_GROUP_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->group, mask); + } else { + deny_bits(&state->group, mask); + mask = state->group.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_GROUP: + i = find_gid(state, ace->who_gid); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->groups->aces[i].perms, mask); + } else { + deny_bits(&state->groups->aces[i].perms, mask); + mask = state->groups->aces[i].perms.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_OTHER: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + allow_bits(&state->group, mask); + allow_bits(&state->other, mask); + allow_bits(&state->everyone, mask); + allow_bits_array(state->users, mask); + allow_bits_array(state->groups, mask); + } else { + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->other, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + } +} + +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) +{ + struct posix_acl_state effective_acl_state, default_acl_state; + struct nfs4_ace *ace; + int ret; + + ret = init_state(&effective_acl_state, acl->naces); + if (ret) + return ret; + ret = init_state(&default_acl_state, acl->naces); + if (ret) + goto out_estate; + ret = -EINVAL; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && + ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) + goto out_dstate; + if (ace->flag & ~NFS4_SUPPORTED_FLAGS) + goto out_dstate; + if ((ace->flag & NFS4_INHERITANCE_FLAGS) == 0) { + process_one_v4_ace(&effective_acl_state, ace); + continue; + } + if (!(flags & NFS4_ACL_DIR)) + goto out_dstate; + /* + * Note that when only one of FILE_INHERIT or DIRECTORY_INHERIT + * is set, we're effectively turning on the other. That's OK, + * according to rfc 3530. + */ + process_one_v4_ace(&default_acl_state, ace); + + if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) + process_one_v4_ace(&effective_acl_state, ace); + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); + if (IS_ERR(*pacl)) { + ret = PTR_ERR(*pacl); + *pacl = NULL; + goto out_dstate; + } + *dpacl = posix_state_to_acl(&default_acl_state, + flags | NFS4_ACL_TYPE_DEFAULT); + if (IS_ERR(*dpacl)) { + ret = PTR_ERR(*dpacl); + *dpacl = NULL; + posix_acl_release(*pacl); + *pacl = NULL; + goto out_dstate; + } + sort_pacl(*pacl); + sort_pacl(*dpacl); + ret = 0; +out_dstate: + free_state(&default_acl_state); +out_estate: + free_state(&effective_acl_state); + return ret; +} + +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + fh_lock(fhp); + + host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); + if (host_error < 0) + goto out_drop_lock; + + if (S_ISDIR(inode->i_mode)) { + host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); + } + +out_drop_lock: + fh_unlock(fhp); + + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + +static short +ace2type(struct nfs4_ace *ace) +{ + switch (ace->whotype) { + case NFS4_ACL_WHO_NAMED: + return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ? + ACL_GROUP : ACL_USER); + case NFS4_ACL_WHO_OWNER: + return ACL_USER_OBJ; + case NFS4_ACL_WHO_GROUP: + return ACL_GROUP_OBJ; + case NFS4_ACL_WHO_EVERYONE: + return ACL_OTHER; + } + BUG(); + return -1; +} + +/* + * return the size of the struct nfs4_acl required to represent an acl + * with @entries entries. + */ +int nfs4_acl_bytes(int entries) +{ + return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace); +} + +static struct { + char *string; + int stringlen; + int type; +} s2t_map[] = { + { + .string = "OWNER@", + .stringlen = sizeof("OWNER@") - 1, + .type = NFS4_ACL_WHO_OWNER, + }, + { + .string = "GROUP@", + .stringlen = sizeof("GROUP@") - 1, + .type = NFS4_ACL_WHO_GROUP, + }, + { + .string = "EVERYONE@", + .stringlen = sizeof("EVERYONE@") - 1, + .type = NFS4_ACL_WHO_EVERYONE, + }, +}; + +int +nfs4_acl_get_whotype(char *p, u32 len) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].stringlen == len && + 0 == memcmp(s2t_map[i].string, p, len)) + return s2t_map[i].type; + } + return NFS4_ACL_WHO_NAMED; +} + +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) +{ + __be32 *p; + int i; + + for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { + if (s2t_map[i].type != who) + continue; + p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, s2t_map[i].string, + s2t_map[i].stringlen); + return 0; + } + WARN_ON_ONCE(1); + return nfserr_serverfault; +} diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c new file mode 100644 index 000000000..f5b7ad084 --- /dev/null +++ b/fs/nfsd/nfs4callback.c @@ -0,0 +1,1381 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/slab.h> +#include "nfsd.h" +#include "state.h" +#include "netns.h" +#include "trace.h" +#include "xdr4cb.h" +#include "xdr4.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); + +#define NFSPROC4_CB_NULL 0 +#define NFSPROC4_CB_COMPOUND 1 + +/* Index of predefined Linux callback client operations */ + +struct nfs4_cb_compound_hdr { + /* args */ + u32 ident; /* minorversion 0 only */ + u32 nops; + __be32 *nops_p; + u32 minorversion; + /* res */ + int status; +}; + +static __be32 *xdr_encode_empty_array(__be32 *p) +{ + *p++ = xdr_zero; + return p; +} + +/* + * Encode/decode NFSv4 CB basic data types + * + * Basic NFSv4 callback data types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section + * 20 of RFC 5661: "Network File System (NFS) Version 4 Minor Version + * 1 Protocol" + */ + +/* + * nfs_cb_opnum4 + * + * enum nfs_cb_opnum4 { + * OP_CB_GETATTR = 3, + * ... + * }; + */ +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_OFFLOAD = 15, + OP_CB_ILLEGAL = 10044 +}; + +static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(op); +} + +/* + * nfs_fh4 + * + * typedef opaque nfs_fh4<NFS4_FHSIZE>; + */ +static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) +{ + u32 length = fh->fh_size; + __be32 *p; + + BUG_ON(length > NFS4_FHSIZE); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, &fh->fh_base, length); +} + +/* + * stateid4 + * + * struct stateid4 { + * uint32_t seqid; + * opaque other[12]; + * }; + */ +static void encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(sid->si_generation); + xdr_encode_opaque_fixed(p, &sid->si_opaque, NFS4_STATEID_OTHER_SIZE); +} + +/* + * sessionid4 + * + * typedef opaque sessionid4[NFS4_SESSIONID_SIZE]; + */ +static void encode_sessionid4(struct xdr_stream *xdr, + const struct nfsd4_session *session) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); +} + +/* + * nfsstat4 + */ +static const struct { + int stat; + int errno; +} nfs_cb_errtbl[] = { + { NFS4_OK, 0 }, + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -EIO }, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } +}; + +/* + * If we cannot translate the error, the recovery routines should + * handle it. + * + * Note: remaining NFSv4 error codes have values > 10000, so should + * not conflict with native Linux error codes. + */ +static int nfs_cb_stat_to_errno(int status) +{ + int i; + + for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { + if (nfs_cb_errtbl[i].stat == status) + return nfs_cb_errtbl[i].errno; + } + + dprintk("NFSD: Unrecognized NFS CB status value: %u\n", status); + return -status; +} + +static int decode_cb_op_status(struct xdr_stream *xdr, + enum nfs_cb_opnum4 expected, int *status) +{ + __be32 *p; + u32 op; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + op = be32_to_cpup(p++); + if (unlikely(op != expected)) + goto out_unexpected; + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); + return 0; +out_overflow: + return -EIO; +out_unexpected: + dprintk("NFSD: Callback server returned operation %d but " + "we issued a request for %d\n", op, expected); + return -EIO; +} + +/* + * CB_COMPOUND4args + * + * struct CB_COMPOUND4args { + * utf8str_cs tag; + * uint32_t minorversion; + * uint32_t callback_ident; + * nfs_cb_argop4 argarray<>; + * }; +*/ +static void encode_cb_compound4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 * p; + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + p = xdr_encode_empty_array(p); /* empty tag */ + *p++ = cpu_to_be32(hdr->minorversion); + *p++ = cpu_to_be32(hdr->ident); + + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); /* argarray element count */ +} + +/* + * Update argarray element count + */ +static void encode_cb_nops(struct nfs4_cb_compound_hdr *hdr) +{ + BUG_ON(hdr->nops > NFS4_MAX_BACK_CHANNEL_OPS); + *hdr->nops_p = cpu_to_be32(hdr->nops); +} + +/* + * CB_COMPOUND4res + * + * struct CB_COMPOUND4res { + * nfsstat4 status; + * utf8str_cs tag; + * nfs_cb_resop4 resarray<>; + * }; + */ +static int decode_cb_compound4res(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + /* Ignore the tag */ + length = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, length + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p += XDR_QUADLEN(length); + hdr->nops = be32_to_cpup(p); + return 0; +out_overflow: + return -EIO; +} + +/* + * CB_RECALL4args + * + * struct CB_RECALL4args { + * stateid4 stateid; + * bool truncate; + * nfs_fh4 fh; + * }; + */ +static void encode_cb_recall4args(struct xdr_stream *xdr, + const struct nfs4_delegation *dp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); + encode_stateid4(xdr, &dp->dl_stid.sc_stateid); + + p = xdr_reserve_space(xdr, 4); + *p++ = xdr_zero; /* truncate */ + + encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle); + + hdr->nops++; +} + +/* + * CB_SEQUENCE4args + * + * struct CB_SEQUENCE4args { + * sessionid4 csa_sessionid; + * sequenceid4 csa_sequenceid; + * slotid4 csa_slotid; + * slotid4 csa_highest_slotid; + * bool csa_cachethis; + * referring_call_list4 csa_referring_call_lists<>; + * }; + */ +static void encode_cb_sequence4args(struct xdr_stream *xdr, + const struct nfsd4_callback *cb, + struct nfs4_cb_compound_hdr *hdr) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + __be32 *p; + + if (hdr->minorversion == 0) + return; + + encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); + encode_sessionid4(xdr, session); + + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ + *p++ = xdr_zero; /* csa_slotid */ + *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = xdr_zero; /* csa_cachethis */ + xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + hdr->nops++; +} + +/* + * CB_SEQUENCE4resok + * + * struct CB_SEQUENCE4resok { + * sessionid4 csr_sessionid; + * sequenceid4 csr_sequenceid; + * slotid4 csr_slotid; + * slotid4 csr_highest_slotid; + * slotid4 csr_target_highest_slotid; + * }; + * + * union CB_SEQUENCE4res switch (nfsstat4 csr_status) { + * case NFS4_OK: + * CB_SEQUENCE4resok csr_resok4; + * default: + * void; + * }; + * + * Our current back channel implmentation supports a single backchannel + * with a single slot. + */ +static int decode_cb_sequence4resok(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + int status = -ESERVERFAULT; + __be32 *p; + u32 dummy; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + + if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { + dprintk("NFS: %s Invalid session id\n", __func__); + goto out; + } + p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); + + dummy = be32_to_cpup(p++); + if (dummy != session->se_cb_seq_nr) { + dprintk("NFS: %s Invalid sequence number\n", __func__); + goto out; + } + + dummy = be32_to_cpup(p++); + if (dummy != 0) { + dprintk("NFS: %s Invalid slotid\n", __func__); + goto out; + } + + /* + * FIXME: process highest slotid and target highest slotid + */ + status = 0; +out: + cb->cb_seq_status = status; + return status; +out_overflow: + status = -EIO; + goto out; +} + +static int decode_cb_sequence4res(struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + int status; + + if (cb->cb_clp->cl_minorversion == 0) + return 0; + + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); + if (unlikely(status || cb->cb_seq_status)) + return status; + + return decode_cb_sequence4resok(xdr, cb); +} + +/* + * NFSv4.0 and NFSv4.1 XDR encode functions + * + * NFSv4.0 callback argument types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +/* + * NB: Without this zero space reservation, callbacks over krb5p fail + */ +static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *__unused) +{ + xdr_reserve_space(xdr, 0); +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfs4_delegation *dp = cb_to_delegation(cb); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recall4args(xdr, dp, &hdr); + encode_cb_nops(&hdr); +} + + +/* + * NFSv4.0 and NFSv4.1 XDR decode functions + * + * NFSv4.0 callback result types are defined in section 15 of RFC + * 3530: "Network File System (NFS) version 4 Protocol" and section 20 + * of RFC 5661: "Network File System (NFS) Version 4 Minor Version 1 + * Protocol". + */ + +static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) +{ + return 0; +} + +/* + * 20.2. Operation 4: CB_RECALL - Recall a Delegation + */ +static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); +} + +#ifdef CONFIG_NFSD_PNFS +/* + * CB_LAYOUTRECALL4args + * + * struct layoutrecall_file4 { + * nfs_fh4 lor_fh; + * offset4 lor_offset; + * length4 lor_length; + * stateid4 lor_stateid; + * }; + * + * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { + * case LAYOUTRECALL4_FILE: + * layoutrecall_file4 lor_layout; + * case LAYOUTRECALL4_FSID: + * fsid4 lor_fsid; + * case LAYOUTRECALL4_ALL: + * void; + * }; + * + * struct CB_LAYOUTRECALL4args { + * layouttype4 clora_type; + * layoutiomode4 clora_iomode; + * bool clora_changed; + * layoutrecall4 clora_recall; + * }; + */ +static void encode_cb_layout4args(struct xdr_stream *xdr, + const struct nfs4_layout_stateid *ls, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + BUG_ON(hdr->minorversion == 0); + + p = xdr_reserve_space(xdr, 5 * 4); + *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); + *p++ = cpu_to_be32(ls->ls_layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(RETURN_FILE); + + encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); + + p = xdr_reserve_space(xdr, 2 * 8); + p = xdr_encode_hyper(p, 0); + xdr_encode_hyper(p, NFS4_MAX_UINT64); + + encode_stateid4(xdr, &ls->ls_recall_sid); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_layout4args(xdr, ls, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); +} +#endif /* CONFIG_NFSD_PNFS */ + +static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); + p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); + xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); +} + +static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfsd4_blocked_lock *nbl = + container_of(cb, struct nfsd4_blocked_lock, nbl_cb); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + __be32 *p; + + BUG_ON(hdr.minorversion == 0); + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); + encode_nfs_fh4(xdr, &nbl->nbl_fh); + encode_stateowner(xdr, &lo->lo_owner); + hdr.nops++; + + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); +} + +/* + * struct write_response4 { + * stateid4 wr_callback_id<1>; + * length4 wr_count; + * stable_how4 wr_committed; + * verifier4 wr_writeverf; + * }; + * union offload_info4 switch (nfsstat4 coa_status) { + * case NFS4_OK: + * write_response4 coa_resok4; + * default: + * length4 coa_bytes_copied; + * }; + * struct CB_OFFLOAD4args { + * nfs_fh4 coa_fh; + * stateid4 coa_stateid; + * offload_info4 coa_offload_info; + * }; + */ +static void encode_offload_info4(struct xdr_stream *xdr, + __be32 nfserr, + const struct nfsd4_copy *cp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = nfserr; + if (!nfserr) { + p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + p = xdr_encode_empty_array(p); + p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written); + *p++ = cpu_to_be32(cp->cp_res.wr_stable_how); + p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data, + NFS4_VERIFIER_SIZE); + } else { + p = xdr_reserve_space(xdr, 8); + /* We always return success if bytes were written */ + p = xdr_encode_hyper(p, 0); + } +} + +static void encode_cb_offload4args(struct xdr_stream *xdr, + __be32 nfserr, + const struct knfsd_fh *fh, + const struct nfsd4_copy *cp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = cpu_to_be32(OP_CB_OFFLOAD); + encode_nfs_fh4(xdr, fh); + encode_stateid4(xdr, &cp->cp_res.cb_stateid); + encode_offload_info4(xdr, nfserr, cp); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfsd4_copy *cp = + container_of(cb, struct nfsd4_copy, cp_cb); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); +} +/* + * RPC procedure tables + */ +#define PROC(proc, call, argtype, restype) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_proc = NFSPROC4_CB_##call, \ + .p_encode = nfs4_xdr_enc_##argtype, \ + .p_decode = nfs4_xdr_dec_##restype, \ + .p_arglen = NFS4_enc_##argtype##_sz, \ + .p_replen = NFS4_dec_##restype##_sz, \ + .p_statidx = NFSPROC4_CB_##call, \ + .p_name = #proc, \ +} + +static const struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, cb_null, cb_null), + PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +#ifdef CONFIG_NFSD_PNFS + PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), +#endif + PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), + PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), +}; + +static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; +static const struct rpc_version nfs_cb_version4 = { +/* + * Note on the callback rpc program version number: despite language in rfc + * 5661 section 18.36.3 requiring servers to use 4 in this field, the + * official xdr descriptions for both 4.0 and 4.1 specify version 1, and + * in practice that appears to be what implementations use. The section + * 18.36.3 language is expected to be fixed in an erratum. + */ + .number = 1, + .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), + .procs = nfs4_cb_procedures, + .counts = nfs4_cb_counts, +}; + +static const struct rpc_version *nfs_cb_version[2] = { + [1] = &nfs_cb_version4, +}; + +static const struct rpc_program cb_program; + +static struct rpc_stat cb_stats = { + .program = &cb_program +}; + +#define NFS4_CALLBACK 0x40000000 +static const struct rpc_program cb_program = { + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, + .pipe_dir_name = "nfsd4_cb", +}; + +static int max_cb_time(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + /* + * nfsd4_lease is set to at most one hour in __nfsd4_write_time, + * so we can use 32-bit math on it. Warn if that assumption + * ever stops being true. + */ + if (WARN_ON_ONCE(nn->nfsd4_lease > 3600)) + return 360 * HZ; + + return max(((u32)nn->nfsd4_lease)/10, 1u) * HZ; +} + +static struct workqueue_struct *callback_wq; + +static bool nfsd4_queue_cb(struct nfsd4_callback *cb) +{ + return queue_work(callback_wq, &cb->cb_work); +} + +static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) +{ + atomic_inc(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_end(struct nfs4_client *clp) +{ + + if (atomic_dec_and_test(&clp->cl_cb_inflight)) + wake_up_var(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) +{ + wait_var_event(&clp->cl_cb_inflight, + !atomic_read(&clp->cl_cb_inflight)); +} + +static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) +{ + if (clp->cl_minorversion == 0) { + client->cl_principal = clp->cl_cred.cr_targ_princ ? + clp->cl_cred.cr_targ_princ : "nfs"; + + return get_cred(rpc_machine_cred()); + } else { + struct cred *kcred; + + kcred = prepare_kernel_cred(NULL); + if (!kcred) + return NULL; + + kcred->fsuid = ses->se_cb_sec.uid; + kcred->fsgid = ses->se_cb_sec.gid; + return kcred; + } +} + +static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) +{ + int maxtime = max_cb_time(clp->net); + struct rpc_timeout timeparms = { + .to_initval = maxtime, + .to_retries = 0, + .to_maxval = maxtime, + }; + struct rpc_create_args args = { + .net = clp->net, + .address = (struct sockaddr *) &conn->cb_addr, + .addrsize = conn->cb_addrlen, + .saddress = (struct sockaddr *) &conn->cb_saddr, + .timeout = &timeparms, + .program = &cb_program, + .version = 1, + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .cred = current_cred(), + }; + struct rpc_clnt *client; + const struct cred *cred; + + if (clp->cl_minorversion == 0) { + if (!clp->cl_cred.cr_principal && + (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) { + trace_nfsd_cb_setup_err(clp, -EINVAL); + return -EINVAL; + } + args.client_name = clp->cl_cred.cr_principal; + args.prognumber = conn->cb_prog; + args.protocol = XPRT_TRANSPORT_TCP; + args.authflavor = clp->cl_cred.cr_flavor; + clp->cl_cb_ident = conn->cb_ident; + } else { + if (!conn->cb_xprt) + return -EINVAL; + clp->cl_cb_session = ses; + args.bc_xprt = conn->cb_xprt; + args.prognumber = clp->cl_cb_session->se_cb_prog; + args.protocol = conn->cb_xprt->xpt_class->xcl_ident | + XPRT_TRANSPORT_BC; + args.authflavor = ses->se_cb_sec.flavor; + } + /* Create RPC client */ + client = rpc_create(&args); + if (IS_ERR(client)) { + trace_nfsd_cb_setup_err(clp, PTR_ERR(client)); + return PTR_ERR(client); + } + cred = get_backchannel_cred(clp, client, ses); + if (!cred) { + trace_nfsd_cb_setup_err(clp, -ENOMEM); + rpc_shutdown_client(client); + return -ENOMEM; + } + + if (clp->cl_minorversion != 0) + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; + clp->cl_cb_client = client; + clp->cl_cb_cred = cred; + trace_nfsd_cb_setup(clp); + return 0; +} + +static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +{ + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; + clp->cl_cb_state = NFSD4_CB_DOWN; + trace_nfsd_cb_state(clp); +} + +static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +{ + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; + clp->cl_cb_state = NFSD4_CB_FAULT; + trace_nfsd_cb_state(clp); +} + +static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + trace_nfsd_cb_done(clp, task->tk_status); + if (task->tk_status) + nfsd4_mark_cb_down(clp, task->tk_status); + else { + clp->cl_cb_state = NFSD4_CB_UP; + trace_nfsd_cb_state(clp); + } +} + +static void nfsd4_cb_probe_release(void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + nfsd41_cb_inflight_end(clp); + +} + +static const struct rpc_call_ops nfsd4_cb_probe_ops = { + /* XXX: release method to ensure we set the cb channel down if + * necessary on early failure? */ + .rpc_call_done = nfsd4_cb_probe_done, + .rpc_release = nfsd4_cb_probe_release, +}; + +/* + * Poke the callback thread to process any updates to the callback + * parameters, and send a null probe. + */ +void nfsd4_probe_callback(struct nfs4_client *clp) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + trace_nfsd_cb_state(clp); + set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + nfsd4_run_cb(&clp->cl_cb_null); +} + +void nfsd4_probe_callback_sync(struct nfs4_client *clp) +{ + nfsd4_probe_callback(clp); + flush_workqueue(callback_wq); +} + +void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) +{ + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + spin_lock(&clp->cl_lock); + memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); + spin_unlock(&clp->cl_lock); + trace_nfsd_cb_state(clp); +} + +/* + * There's currently a single callback channel slot. + * If the slot is available, then mark it busy. Otherwise, set the + * thread for sleeping on the callback RPC wait queue. + */ +static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (!cb->cb_holds_slot && + test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); + /* Race breaker */ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + dprintk("%s slot is busy\n", __func__); + return false; + } + rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); + } + cb->cb_holds_slot = true; + return true; +} + +static void nfsd41_cb_release_slot(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (cb->cb_holds_slot) { + cb->cb_holds_slot = false; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + } +} + +static void nfsd41_destroy_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_release_slot(cb); + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); + nfsd41_cb_inflight_end(clp); +} + +/* + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. + */ +static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + u32 minorversion = clp->cl_minorversion; + + /* + * cb_seq_status is only set in decode_cb_sequence4res, + * and so will remain 1 if an rpc level failure occurs. + */ + cb->cb_seq_status = 1; + cb->cb_status = 0; + if (minorversion && !nfsd41_cb_get_slot(cb, task)) + return; + rpc_call_start(task); +} + +static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *session = clp->cl_cb_session; + bool ret = true; + + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (RPC_SIGNALLED(task)) + goto need_restart; + + return true; + } + + if (!cb->cb_holds_slot) + goto need_restart; + + switch (cb->cb_seq_status) { + case 0: + /* + * No need for lock, access serialized in nfsd4_cb_prepare + * + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + ++session->se_cb_seq_nr; + break; + case -ESERVERFAULT: + ++session->se_cb_seq_nr; + fallthrough; + case 1: + case -NFS4ERR_BADSESSION: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + ret = false; + break; + case -NFS4ERR_DELAY: + if (!rpc_restart_call(task)) + goto out; + + rpc_delay(task, 2 * HZ); + return false; + case -NFS4ERR_BADSLOT: + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + if (session->se_cb_seq_nr != 1) { + session->se_cb_seq_nr = 1; + goto retry_nowait; + } + break; + default: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + dprintk("%s: unprocessed error %d\n", __func__, + cb->cb_seq_status); + } + + nfsd41_cb_release_slot(cb); + dprintk("%s: freed slot, new seqid=%d\n", __func__, + clp->cl_cb_session->se_cb_seq_nr); + + if (RPC_SIGNALLED(task)) + goto need_restart; +out: + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) + ret = false; + goto out; +need_restart: + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + task->tk_status = 0; + cb->cb_need_restart = true; + } + return false; +} + +static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +{ + struct nfsd4_callback *cb = calldata; + struct nfs4_client *clp = cb->cb_clp; + + trace_nfsd_cb_done(clp, task->tk_status); + + if (!nfsd4_cb_sequence_done(task, cb)) + return; + + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } + + switch (cb->cb_ops->done(cb, task)) { + case 0: + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + case 1: + switch (task->tk_status) { + case -EIO: + case -ETIMEDOUT: + case -EACCES: + nfsd4_mark_cb_down(clp, task->tk_status); + } + break; + default: + BUG(); + } +} + +static void nfsd4_cb_release(void *calldata) +{ + struct nfsd4_callback *cb = calldata; + + if (cb->cb_need_restart) + nfsd4_queue_cb(cb); + else + nfsd41_destroy_cb(cb); + +} + +static const struct rpc_call_ops nfsd4_cb_ops = { + .rpc_call_prepare = nfsd4_cb_prepare, + .rpc_call_done = nfsd4_cb_done, + .rpc_release = nfsd4_cb_release, +}; + +int nfsd4_create_callback_queue(void) +{ + callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); + if (!callback_wq) + return -ENOMEM; + return 0; +} + +void nfsd4_destroy_callback_queue(void) +{ + destroy_workqueue(callback_wq); +} + +/* must be called under the state lock */ +void nfsd4_shutdown_callback(struct nfs4_client *clp) +{ + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); + /* + * Note this won't actually result in a null callback; + * instead, nfsd4_run_cb_null() will detect the killed + * client, destroy the rpc client, and stop: + */ + nfsd4_run_cb(&clp->cl_cb_null); + flush_workqueue(callback_wq); + nfsd41_cb_inflight_wait_complete(clp); +} + +/* requires cl_lock: */ +static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) +{ + struct nfsd4_session *s; + struct nfsd4_conn *c; + + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_flags & NFS4_CDFC4_BACK) + return c; + } + } + return NULL; +} + +/* + * Note there isn't a lot of locking in this code; instead we depend on + * the fact that it is run from the callback_wq, which won't run two + * work items at once. So, for example, callback_wq handles all access + * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. + */ +static void nfsd4_process_cb_update(struct nfsd4_callback *cb) +{ + struct nfs4_cb_conn conn; + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = NULL; + struct nfsd4_conn *c; + int err; + + /* + * This is either an update, or the client dying; in either case, + * kill the old client: + */ + if (clp->cl_cb_client) { + trace_nfsd_cb_shutdown(clp); + rpc_shutdown_client(clp->cl_cb_client); + clp->cl_cb_client = NULL; + put_cred(clp->cl_cb_cred); + clp->cl_cb_cred = NULL; + } + if (clp->cl_cb_conn.cb_xprt) { + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + clp->cl_cb_conn.cb_xprt = NULL; + } + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) + return; + spin_lock(&clp->cl_lock); + /* + * Only serialized callback code is allowed to clear these + * flags; main nfsd code can only set them: + */ + BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); + c = __nfsd4_find_backchannel(clp); + if (c) { + svc_xprt_get(c->cn_xprt); + conn.cb_xprt = c->cn_xprt; + ses = c->cn_session; + } + spin_unlock(&clp->cl_lock); + + err = setup_callback_client(clp, &conn, ses); + if (err) { + nfsd4_mark_cb_down(clp, err); + if (c) + svc_xprt_put(c->cn_xprt); + return; + } +} + +static void +nfsd4_run_cb_work(struct work_struct *work) +{ + struct nfsd4_callback *cb = + container_of(work, struct nfsd4_callback, cb_work); + struct nfs4_client *clp = cb->cb_clp; + struct rpc_clnt *clnt; + int flags; + + trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name); + + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } + + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) + nfsd4_process_cb_update(cb); + + clnt = clp->cl_cb_client; + if (!clnt) { + /* Callback channel broken, or client killed; give up: */ + nfsd41_destroy_cb(cb); + return; + } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + nfsd41_destroy_cb(cb); + return; + } + + cb->cb_msg.rpc_cred = clp->cl_cb_cred; + flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); +} + +void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) +{ + cb->cb_clp = clp; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; + cb->cb_msg.rpc_argp = cb; + cb->cb_msg.rpc_resp = cb; + cb->cb_ops = ops; + INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); + cb->cb_seq_status = 1; + cb->cb_status = 0; + cb->cb_need_restart = false; + cb->cb_holds_slot = false; +} + +void nfsd4_run_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_inflight_begin(clp); + if (!nfsd4_queue_cb(cb)) + nfsd41_cb_inflight_end(clp); +} diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c new file mode 100644 index 000000000..f92161ce1 --- /dev/null +++ b/fs/nfsd/nfs4idmap.c @@ -0,0 +1,686 @@ +/* + * Mapping of UID/GIDs to name and vice versa. + * + * Copyright (c) 2002, 2003 The Regents of the University of + * Michigan. All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sunrpc/svc_xprt.h> +#include <net/net_namespace.h> +#include "idmap.h" +#include "nfsd.h" +#include "netns.h" + +/* + * Turn off idmapping when using AUTH_SYS. + */ +static bool nfs4_disable_idmapping = true; +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off server's NFSv4 idmapping when using 'sec=sys'"); + +/* + * Cache entry + */ + +/* + * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on + * that. + */ + +struct ent { + struct cache_head h; + int type; /* User / Group */ + u32 id; + char name[IDMAP_NAMESZ]; + char authname[IDMAP_NAMESZ]; + struct rcu_head rcu_head; +}; + +/* Common entry handling */ + +#define ENT_HASHBITS 8 +#define ENT_HASHMAX (1 << ENT_HASHBITS) + +static void +ent_init(struct cache_head *cnew, struct cache_head *citm) +{ + struct ent *new = container_of(cnew, struct ent, h); + struct ent *itm = container_of(citm, struct ent, h); + + new->id = itm->id; + new->type = itm->type; + + strlcpy(new->name, itm->name, sizeof(new->name)); + strlcpy(new->authname, itm->authname, sizeof(new->authname)); +} + +static void +ent_put(struct kref *ref) +{ + struct ent *map = container_of(ref, struct ent, h.ref); + kfree_rcu(map, rcu_head); +} + +static struct cache_head * +ent_alloc(void) +{ + struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); + if (e) + return &e->h; + else + return NULL; +} + +/* + * ID -> Name cache + */ + +static uint32_t +idtoname_hash(struct ent *ent) +{ + uint32_t hash; + + hash = hash_str(ent->authname, ENT_HASHBITS); + hash = hash_long(hash ^ ent->id, ENT_HASHBITS); + + /* Flip LSB for user/group */ + if (ent->type == IDMAP_TYPE_GROUP) + hash ^= 1; + + return hash; +} + +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + +static void +idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + char idstr[11]; + + qword_add(bpp, blen, ent->authname); + snprintf(idstr, sizeof(idstr), "%u", ent->id); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, idstr); + + (*bpp)[-1] = '\n'; +} + +static int +idtoname_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->id == b->id && a->type == b->type && + strcmp(a->authname, b->authname) == 0); +} + +static int +idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type id [name]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %u", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->id); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %s", ent->name); + seq_putc(m, '\n'); + return 0; +} + +static void +warn_no_idmapd(struct cache_detail *detail, int has_died) +{ + printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", + has_died ? "died" : "not been started"); +} + + +static int idtoname_parse(struct cache_detail *, char *, int); +static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); +static struct ent *idtoname_update(struct cache_detail *, struct ent *, + struct ent *); + +static const struct cache_detail idtoname_cache_template = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .name = "nfs4.idtoname", + .cache_put = ent_put, + .cache_upcall = idtoname_upcall, + .cache_request = idtoname_request, + .cache_parse = idtoname_parse, + .cache_show = idtoname_show, + .warn_no_listener = warn_no_idmapd, + .match = idtoname_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +idtoname_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1, *bp; + int len; + int error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* ID */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.id = simple_strtoul(buf1, &bp, 10); + if (bp == buf1) + goto out; + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + error = -ENOMEM; + res = idtoname_lookup(cd, &ent); + if (!res) + goto out; + + /* Name */ + error = -EINVAL; + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len < 0 || len >= IDMAP_NAMESZ) + goto out; + if (len == 0) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + else + memcpy(ent.name, buf1, sizeof(ent.name)); + error = -ENOMEM; + res = idtoname_update(cd, &ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, cd); + error = 0; +out: + kfree(buf1); + return error; +} + +static struct ent * +idtoname_lookup(struct cache_detail *cd, struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + idtoname_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, + idtoname_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + + +/* + * Name -> ID cache + */ + +static inline int +nametoid_hash(struct ent *ent) +{ + return hash_str(ent->name, ENT_HASHBITS); +} + +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + +static void +nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, + int *blen) +{ + struct ent *ent = container_of(ch, struct ent, h); + + qword_add(bpp, blen, ent->authname); + qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); + qword_add(bpp, blen, ent->name); + + (*bpp)[-1] = '\n'; +} + +static int +nametoid_match(struct cache_head *ca, struct cache_head *cb) +{ + struct ent *a = container_of(ca, struct ent, h); + struct ent *b = container_of(cb, struct ent, h); + + return (a->type == b->type && strcmp(a->name, b->name) == 0 && + strcmp(a->authname, b->authname) == 0); +} + +static int +nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) +{ + struct ent *ent; + + if (h == NULL) { + seq_puts(m, "#domain type name [id]\n"); + return 0; + } + ent = container_of(h, struct ent, h); + seq_printf(m, "%s %s %s", ent->authname, + ent->type == IDMAP_TYPE_GROUP ? "group" : "user", + ent->name); + if (test_bit(CACHE_VALID, &h->flags)) + seq_printf(m, " %u", ent->id); + seq_putc(m, '\n'); + return 0; +} + +static struct ent *nametoid_lookup(struct cache_detail *, struct ent *); +static struct ent *nametoid_update(struct cache_detail *, struct ent *, + struct ent *); +static int nametoid_parse(struct cache_detail *, char *, int); + +static const struct cache_detail nametoid_cache_template = { + .owner = THIS_MODULE, + .hash_size = ENT_HASHMAX, + .name = "nfs4.nametoid", + .cache_put = ent_put, + .cache_upcall = nametoid_upcall, + .cache_request = nametoid_request, + .cache_parse = nametoid_parse, + .cache_show = nametoid_show, + .warn_no_listener = warn_no_idmapd, + .match = nametoid_match, + .init = ent_init, + .update = ent_init, + .alloc = ent_alloc, +}; + +static int +nametoid_parse(struct cache_detail *cd, char *buf, int buflen) +{ + struct ent ent, *res; + char *buf1; + int len, error = -EINVAL; + + if (buf[buflen - 1] != '\n') + return (-EINVAL); + buf[buflen - 1]= '\0'; + + buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (buf1 == NULL) + return (-ENOMEM); + + memset(&ent, 0, sizeof(ent)); + + /* Authentication name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.authname, buf1, sizeof(ent.authname)); + + /* Type */ + if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + goto out; + ent.type = strcmp(buf1, "user") == 0 ? + IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; + + /* Name */ + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) + goto out; + memcpy(ent.name, buf1, sizeof(ent.name)); + + /* expiry */ + ent.h.expiry_time = get_expiry(&buf); + if (ent.h.expiry_time == 0) + goto out; + + /* ID */ + error = get_int(&buf, &ent.id); + if (error == -EINVAL) + goto out; + if (error == -ENOENT) + set_bit(CACHE_NEGATIVE, &ent.h.flags); + + error = -ENOMEM; + res = nametoid_lookup(cd, &ent); + if (res == NULL) + goto out; + res = nametoid_update(cd, &ent, res); + if (res == NULL) + goto out; + + cache_put(&res->h, cd); + error = 0; +out: + kfree(buf1); + return (error); +} + + +static struct ent * +nametoid_lookup(struct cache_detail *cd, struct ent *item) +{ + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + nametoid_hash(item)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +static struct ent * +nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old) +{ + struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, + nametoid_hash(new)); + if (ch) + return container_of(ch, struct ent, h); + else + return NULL; +} + +/* + * Exported API + */ + +int +nfsd_idmap_init(struct net *net) +{ + int rv; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net); + if (IS_ERR(nn->idtoname_cache)) + return PTR_ERR(nn->idtoname_cache); + rv = cache_register_net(nn->idtoname_cache, net); + if (rv) + goto destroy_idtoname_cache; + nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); + if (IS_ERR(nn->nametoid_cache)) { + rv = PTR_ERR(nn->nametoid_cache); + goto unregister_idtoname_cache; + } + rv = cache_register_net(nn->nametoid_cache, net); + if (rv) + goto destroy_nametoid_cache; + return 0; + +destroy_nametoid_cache: + cache_destroy_net(nn->nametoid_cache, net); +unregister_idtoname_cache: + cache_unregister_net(nn->idtoname_cache, net); +destroy_idtoname_cache: + cache_destroy_net(nn->idtoname_cache, net); + return rv; +} + +void +nfsd_idmap_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cache_unregister_net(nn->idtoname_cache, net); + cache_unregister_net(nn->nametoid_cache, net); + cache_destroy_net(nn->idtoname_cache, net); + cache_destroy_net(nn->nametoid_cache, net); +} + +static int +idmap_lookup(struct svc_rqst *rqstp, + struct ent *(*lookup_fn)(struct cache_detail *, struct ent *), + struct ent *key, struct cache_detail *detail, struct ent **item) +{ + int ret; + + *item = lookup_fn(detail, key); + if (!*item) + return -ENOMEM; + retry: + ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); + + if (ret == -ETIMEDOUT) { + struct ent *prev_item = *item; + *item = lookup_fn(detail, key); + if (*item != prev_item) + goto retry; + cache_put(&(*item)->h, detail); + } + return ret; +} + +static char * +rqst_authname(struct svc_rqst *rqstp) +{ + struct auth_domain *clp; + + clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; + return clp->name; +} + +static __be32 +idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, + u32 *id) +{ + struct ent *item, key = { + .type = type, + }; + int ret; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (namelen + 1 > sizeof(key.name)) + return nfserr_badowner; + memcpy(key.name, name, namelen); + key.name[namelen] = '\0'; + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); + if (ret == -ENOENT) + return nfserr_badowner; + if (ret) + return nfserrno(ret); + *id = item->id; + cache_put(&item->h, nn->nametoid_cache); + return 0; +} + +static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) +{ + char buf[11]; + int len; + __be32 *p; + + len = sprintf(buf, "%u", id); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, buf, len); + return 0; +} + +static __be32 idmap_id_to_name(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) +{ + struct ent *item, key = { + .id = id, + .type = type, + }; + __be32 *p; + int ret; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); + if (ret == -ENOENT) + return encode_ascii_id(xdr, id); + if (ret) + return nfserrno(ret); + ret = strlen(item->name); + WARN_ON_ONCE(ret > IDMAP_NAMESZ); + p = xdr_reserve_space(xdr, ret + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, item->name, ret); + cache_put(&item->h, nn->idtoname_cache); + return 0; +} + +static bool +numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) +{ + int ret; + char buf[11]; + + if (namelen + 1 > sizeof(buf)) + /* too long to represent a 32-bit id: */ + return false; + /* Just to make sure it's null-terminated: */ + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + ret = kstrtouint(buf, 10, id); + return ret == 0; +} + +static __be32 +do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) +{ + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) + if (numeric_name_to_id(rqstp, type, name, namelen, id)) + return 0; + /* + * otherwise, fall through and try idmapping, for + * backwards compatibility with clients sending names: + */ + return idmap_name_to_id(rqstp, type, name, namelen, id); +} + +static __be32 encode_name_from_id(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) +{ + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) + return encode_ascii_id(xdr, id); + return idmap_id_to_name(xdr, rqstp, type, id); +} + +__be32 +nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, + kuid_t *uid) +{ + __be32 status; + u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); + *uid = make_kuid(nfsd_user_namespace(rqstp), id); + if (!uid_valid(*uid)) + status = nfserr_badowner; + return status; +} + +__be32 +nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, + kgid_t *gid) +{ + __be32 status; + u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); + *gid = make_kgid(nfsd_user_namespace(rqstp), id); + if (!gid_valid(*gid)) + status = nfserr_badowner; + return status; +} + +__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kuid_t uid) +{ + u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); +} + +__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kgid_t gid) +{ + u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); +} diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c new file mode 100644 index 000000000..2673019d3 --- /dev/null +++ b/fs/nfsd/nfs4layouts.c @@ -0,0 +1,786 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/blkdev.h> +#include <linux/kmod.h> +#include <linux/file.h> +#include <linux/jhash.h> +#include <linux/sched.h> +#include <linux/sunrpc/addr.h> + +#include "pnfs.h" +#include "netns.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct lock_manager_operations nfsd4_layouts_lm_ops; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + [LAYOUT_FLEX_FILES] = &ff_layout_ops, +#endif +#ifdef CONFIG_NFSD_BLOCKLAYOUT + [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT + [LAYOUT_SCSI] = &scsi_layout_ops, +#endif +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) + struct super_block *sb = exp->ex_path.mnt->mnt_sb; +#endif + + if (!(exp->ex_flags & NFSEXP_PNFS)) + return; + +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; +#endif +#ifdef CONFIG_NFSD_BLOCKLAYOUT + if (sb->s_export_op->get_uuid && + sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks) + exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT + if (sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks && + sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && + blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) + exp->ex_layout_types |= 1 << LAYOUT_SCSI; +#endif +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); + + spin_lock(&clp->cl_lock); + list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + nfsd_file_put(ls->ls_file); + + if (ls->ls_recalled) + atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static int +nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) +{ + struct file_lock *fl; + int status; + + if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + return 0; + + fl = locks_alloc_lock(); + if (!fl) + return -ENOMEM; + locks_init_lock(fl); + fl->fl_lmops = &nfsd4_layouts_lm_ops; + fl->fl_flags = FL_LAYOUT; + fl->fl_type = F_RDLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = ls; + fl->fl_pid = current->tgid; + fl->fl_file = ls->ls_file->nf_file; + + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + if (status) { + locks_free_lock(fl); + return status; + } + BUG_ON(fl != NULL); + return 0; +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache, + nfsd4_free_layout_stateid); + if (!stp) + return NULL; + + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + mutex_init(&ls->ls_mutex); + ls->ls_layout_type = layout_type; + nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, + NFSPROC4_CLNT_CB_LAYOUT); + + if (parent->sc_type == NFS4_DELEG_STID) + ls->ls_file = nfsd_file_get(fp->fi_deleg_file); + else + ls->ls_file = find_any_file(fp); + BUG_ON(!ls->ls_file); + + if (nfsd4_layout_setlease(ls)) { + nfsd_file_put(ls->ls_file); + put_nfs4_file(fp); + kmem_cache_free(nfs4_layout_stateid_cache, ls); + return NULL; + } + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + mutex_lock(&ls->ls_mutex); + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + mutex_lock(&ls->ls_mutex); + if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid)) + goto out_unlock_stid; + if (layout_type != ls->ls_layout_type) + goto out_unlock_stid; + } + + *lsp = ls; + return 0; + +out_unlock_stid: + mutex_unlock(&ls->ls_mutex); +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static void +nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) +{ + spin_lock(&ls->ls_lock); + if (ls->ls_recalled) + goto out_unlock; + + if (list_empty(&ls->ls_layouts)) + goto out_unlock; + + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); + trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); + + refcount_inc(&ls->ls_stid.sc_count); + nfsd4_run_cb(&ls->ls_recall); + +out_unlock: + spin_unlock(&ls->ls_lock); +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +static __be32 +nfsd4_recall_conflict(struct nfs4_layout_stateid *ls) +{ + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout_stateid *l, *n; + __be32 nfserr = nfs_ok; + + assert_spin_locked(&fp->fi_lock); + + list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { + if (l != ls) { + nfsd4_recall_file_layout(l); + nfserr = nfserr_recallconflict; + } + } + + return nfserr; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout *lp, *new = NULL; + __be32 nfserr; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + spin_unlock(&fp->fi_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + refcount_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid); + spin_unlock(&ls->ls_lock); +out: + spin_unlock(&fp->fi_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfserr; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head *reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + lo->offset = layout_end(seg); + } else { + /* retain the whole layout segment on a split. */ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + end = seg->offset; + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) { + trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid); + return nfserr; + } + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) + nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); + lrp->lrs_present = 1; + } else { + trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + mutex_unlock(&ls->ls_mutex); + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (ls->ls_layout_type != lrp->lr_layout_type) + continue; + + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +static void +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + char addr_str[INET6_ADDRSTRLEN]; + static char const nfsd_recall_failed[] = "/sbin/nfsd-recall-failed"; + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int error; + + rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); + + printk(KERN_WARNING + "nfsd: client %s failed to respond to layout recall. " + " Fencing..\n", addr_str); + + argv[0] = (char *)nfsd_recall_failed; + argv[1] = addr_str; + argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; + argv[3] = NULL; + + error = call_usermodehelper(nfsd_recall_failed, argv, envp, + UMH_WAIT_PROC); + if (error) { + printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", + addr_str, error); + } +} + +static void +nfsd4_cb_layout_prepare(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + + mutex_lock(&ls->ls_mutex); + nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid); + mutex_unlock(&ls->ls_mutex); +} + +static int +nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfsd_net *nn; + ktime_t now, cutoff; + const struct nfsd4_layout_ops *ops; + + + switch (task->tk_status) { + case 0: + case -NFS4ERR_DELAY: + /* + * Anything left? If not, then call it done. Note that we don't + * take the spinlock since this is an optimization and nothing + * should get added until the cb counter goes to zero. + */ + if (list_empty(&ls->ls_layouts)) + return 1; + + /* Poll the client until it's done with the layout */ + now = ktime_get(); + nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id); + + /* Client gets 2 lease periods to return it */ + cutoff = ktime_add_ns(task->tk_start, + (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2); + + if (ktime_before(now, cutoff)) { + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + } + fallthrough; + default: + /* + * Unknown error or non-responding client, we'll need to fence. + */ + trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); + + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls); + else + nfsd4_cb_layout_fail(ls); + return 1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; + } +} + +static void +nfsd4_cb_layout_release(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid); + + nfsd4_return_all_layouts(ls, &reaplist); + nfsd4_free_layouts(&reaplist); + nfs4_put_stid(&ls->ls_stid); +} + +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .prepare = nfsd4_cb_layout_prepare, + .done = nfsd4_cb_layout_done, + .release = nfsd4_cb_layout_release, +}; + +static bool +nfsd4_layout_lm_break(struct file_lock *fl) +{ + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a layout isn't returned + * in time: + */ + fl->fl_break_time = 0; + nfsd4_recall_file_layout(fl->fl_owner); + return false; +} + +static int +nfsd4_layout_lm_change(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + BUG_ON(!(arg & F_UNLCK)); + return lease_modify(onlist, arg, dispose); +} + +static const struct lock_manager_operations nfsd4_layouts_lm_ops = { + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, +}; + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c new file mode 100644 index 000000000..e84996c38 --- /dev/null +++ b/fs/nfsd/nfs4proc.c @@ -0,0 +1,3329 @@ +/* + * Server-side procedures for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <linux/fs_struct.h> +#include <linux/file.h> +#include <linux/falloc.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/sunrpc/addr.h> +#include <linux/nfs_ssc.h> + +#include "idmap.h" +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" +#include "current_stateid.h" +#include "netns.h" +#include "acl.h" +#include "pnfs.h" +#include "trace.h" + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include <linux/security.h> + +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ + struct inode *inode = d_inode(resfh->fh_dentry); + int status; + + inode_lock(inode); + status = security_inode_setsecctx(resfh->fh_dentry, + label->data, label->len); + inode_unlock(inode); + + if (status) + /* + * XXX: We should really fail the whole open, but we may + * already have created a new file, so it may be too + * late. For now this seems the least of evils: + */ + bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return; +} +#else +static inline void +nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) +{ } +#endif + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + +static __be32 +check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + u32 *bmval, u32 *writable) +{ + struct dentry *dentry = cstate->current_fh.fh_dentry; + struct svc_export *exp = cstate->current_fh.fh_export; + + if (!nfsd_attrs_supported(cstate->minorversion, bmval)) + return nfserr_attrnotsupp; + if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) + return nfserr_attrnotsupp; + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) && + !(exp->ex_flags & NFSEXP_SECURITY_LABEL)) + return nfserr_attrnotsupp; + if (writable && !bmval_is_subset(bmval, writable)) + return nfserr_inval; + if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && + (bmval[1] & FATTR4_WORD1_MODE)) + return nfserr_inval; + return nfs_ok; +} + +static __be32 +nfsd4_check_open_attributes(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + if (open->op_create == NFS4_OPEN_CREATE) { + if (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd_attrmask); + else if (open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1) + status = check_attr_support(rqstp, cstate, + open->op_bmval, nfsd41_ex_attrmask); + } + + return status; +} + +static int +is_create_with_attrs(struct nfsd4_open *open) +{ + return open->op_create == NFS4_OPEN_CREATE + && (open->op_createmode == NFS4_CREATE_UNCHECKED + || open->op_createmode == NFS4_CREATE_GUARDED + || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); +} + +/* + * if error occurs when setting the acl, just clear the acl bit + * in the returned attr bitmap. + */ +static void +do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl, u32 *bmval) +{ + __be32 status; + + status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); + if (status) + /* + * We should probably fail the whole open at this point, + * but we've already created the file, so it's too late; + * So this seems the least of evils: + */ + bmval[0] &= ~FATTR4_WORD0_ACL; +} + +static inline void +fh_dup2(struct svc_fh *dst, struct svc_fh *src) +{ + fh_put(dst); + dget(src->fh_dentry); + if (src->fh_export) + exp_get(src->fh_export); + *dst = *src; +} + +static __be32 +do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) +{ + __be32 status; + + if (open->op_truncate && + !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + + accmode |= NFSD_MAY_READ_IF_EXEC; + + if (open->op_share_access & NFS4_SHARE_ACCESS_READ) + accmode |= NFSD_MAY_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); + if (open->op_share_deny & NFS4_SHARE_DENY_READ) + accmode |= NFSD_MAY_WRITE; + + status = fh_verify(rqstp, current_fh, S_IFREG, accmode); + + return status; +} + +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +{ + umode_t mode = d_inode(fh->fh_dentry)->i_mode; + + if (S_ISREG(mode)) + return nfs_ok; + if (S_ISDIR(mode)) + return nfserr_isdir; + /* + * Using err_symlink as our catch-all case may look odd; but + * there's no other obvious error for this case in 4.0, and we + * happen to know that it will cause the linux v4 client to do + * the right thing on attempts to open something other than a + * regular file. + */ + return nfserr_symlink; +} + +static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh) +{ + if (nfsd4_has_session(cstate)) + return; + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, + &resfh->fh_handle); +} + +static __be32 +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) +{ + struct svc_fh *current_fh = &cstate->current_fh; + int accmode; + __be32 status; + + *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + if (!*resfh) + return nfserr_jukebox; + fh_init(*resfh, NFS4_FHSIZE); + open->op_truncate = false; + + if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + + /* + * Note: create modes (UNCHECKED,GUARDED...) are the same + * in NFSv4 as in v3 except EXCLUSIVE4_1. + */ + current->fs->umask = open->op_umask; + status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, + open->op_fname.len, &open->op_iattr, + *resfh, open->op_createmode, + (u32 *)open->op_verf.data, + &open->op_truncate, &open->op_created); + current->fs->umask = 0; + + if (!status && open->op_label.len) + nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); + + /* + * Following rfc 3530 14.2.16, and rfc 5661 18.16.4 + * use the returned bitmask to indicate which attributes + * we used to store the verifier: + */ + if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY); + } else + /* + * Note this may exit with the parent still locked. + * We will hold the lock until nfsd4_open's final + * lookup, to prevent renames or unlinks until we've had + * a chance to an acquire a delegation if appropriate. + */ + status = nfsd_lookup(rqstp, current_fh, + open->op_fname.data, open->op_fname.len, *resfh); + if (status) + goto out; + status = nfsd_check_obj_isreg(*resfh); + if (status) + goto out; + + if (is_create_with_attrs(open) && open->op_acl != NULL) + do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); + + nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); + accmode = NFSD_MAY_NOP; + if (open->op_created || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + accmode |= NFSD_MAY_OWNER_OVERRIDE; + status = do_open_permission(rqstp, *resfh, open, accmode); + set_change_info(&open->op_cinfo, current_fh); +out: + return status; +} + +static __be32 +do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 status; + int accmode = 0; + + /* We don't know the target directory, and therefore can not + * set the change info + */ + + memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); + + nfsd4_set_open_owner_reply_cache(cstate, open, current_fh); + + open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && + (open->op_iattr.ia_size == 0); + /* + * In the delegation case, the client is telling us about an + * open that it *already* performed locally, some time ago. We + * should let it succeed now if possible. + * + * In the case of a CLAIM_FH open, on the other hand, the client + * may be counting on us to enforce permissions (the Linux 4.1 + * client uses this for normal opens, for example). + */ + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) + accmode = NFSD_MAY_OWNER_OVERRIDE; + + status = do_open_permission(rqstp, current_fh, open, accmode); + + return status; +} + +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} + +static __be32 +nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_open *open = &u->open; + __be32 status; + struct svc_fh *resfh = NULL; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool reclaim = false; + + dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", + (int)open->op_fname.len, open->op_fname.data, + open->op_openowner); + + /* This check required by spec. */ + if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) + return nfserr_inval; + + open->op_created = false; + /* + * RFC5661 18.51.3 + * Before RECLAIM_COMPLETE done, server should deny new lock + */ + if (nfsd4_has_session(cstate) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags) && + open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + + /* check seqid for replay. set nfs4_owner */ + status = nfsd4_process_open1(cstate, open, nn); + if (status == nfserr_replay_me) { + struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; + fh_put(&cstate->current_fh); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + dprintk("nfsd4_open: replay failed" + " restoring previous filehandle\n"); + else + status = nfserr_replay_me; + } + if (status) + goto out; + if (open->op_xdr_error) { + status = open->op_xdr_error; + goto out; + } + + status = nfsd4_check_open_attributes(rqstp, cstate, open); + if (status) + goto out; + + /* Openowner is now set, so sequence id will get bumped. Now we need + * these checks before we do any creates: */ + status = nfserr_grace; + if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + status = nfserr_no_grace; + if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + goto out; + + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) + goto out; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(&open->op_clientid, + cstate, nn); + if (status) + goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; + fallthrough; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) + goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + dprintk("NFSD: unsupported OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_notsupp; + goto out; + default: + dprintk("NFSD: Invalid OPEN claim type %d\n", + open->op_claim_type); + status = nfserr_inval; + goto out; + } + /* + * nfsd4_process_open2() does the actual opening of the file. If + * successful, it (1) truncates the file if open->op_truncate was + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. + */ + status = nfsd4_process_open2(rqstp, resfh, open); + WARN(status && open->op_created, + "nfsd4_process_open2 failed to open newly-created file! status=%u\n", + be32_to_cpu(status)); + if (reclaim && !status) + nn->somebody_reclaimed = true; +out: + if (resfh && resfh != &cstate->current_fh) { + fh_dup2(&cstate->current_fh, resfh); + fh_put(resfh); + kfree(resfh); + } + nfsd4_cleanup_open_state(cstate, open); + nfsd4_bump_seqid(cstate, status); + return status; +} + +/* + * OPEN is the only seqid-mutating operation whose decoding can fail + * with a seqid-mutating error (specifically, decoding of user names in + * the attributes). Therefore we have to do some processing to look up + * the stateowner so that we can bump the seqid. + */ +static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op) +{ + struct nfsd4_open *open = &op->u.open; + + if (!seqid_mutating_err(ntohl(op->status))) + return op->status; + if (nfsd4_has_session(cstate)) + return op->status; + open->op_xdr_error = op->status; + return nfsd4_open(rqstp, cstate, &op->u); +} + +/* + * filehandle-manipulating ops. + */ +static __be32 +nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + u->getfh = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_putfh *putfh = &u->putfh; + __be32 ret; + + fh_put(&cstate->current_fh); + cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; + memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + putfh->pf_fhlen); + ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + if (ret == nfserr_stale && putfh->no_verify) { + SET_FH_FLAG(&cstate->current_fh, NFSD4_FH_FOREIGN); + ret = 0; + } +#endif + return ret; +} + +static __be32 +nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + __be32 status; + + fh_put(&cstate->current_fh); + status = exp_pseudoroot(rqstp, &cstate->current_fh); + return status; +} + +static __be32 +nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + if (!cstate->save_fh.fh_dentry) + return nfserr_restorefh; + + fh_dup2(&cstate->current_fh, &cstate->save_fh); + if (HAS_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG)) { + memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t)); + SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); + } + return nfs_ok; +} + +static __be32 +nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + fh_dup2(&cstate->save_fh, &cstate->current_fh); + if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG)) { + memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); + SET_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG); + } + return nfs_ok; +} + +/* + * misc nfsv4 ops + */ +static __be32 +nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_access *access = &u->access; + u32 access_full; + + access_full = NFS3_ACCESS_FULL; + if (cstate->minorversion >= 2) + access_full |= NFS4_ACCESS_XALIST | NFS4_ACCESS_XAREAD | + NFS4_ACCESS_XAWRITE; + + if (access->ac_req_access & ~access_full) + return nfserr_inval; + + access->ac_resp_access = access->ac_req_access; + return nfsd_access(rqstp, &cstate->current_fh, &access->ac_resp_access, + &access->ac_supported); +} + +static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) +{ + __be32 *verf = (__be32 *)verifier->data; + + BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); + + nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); +} + +static __be32 +nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_commit *commit = &u->commit; + + return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count, + (__be32 *)commit->co_verf.data); +} + +static __be32 +nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_create *create = &u->create; + struct svc_fh resfh; + __be32 status; + dev_t rdev; + + fh_init(&resfh, NFS4_FHSIZE); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, create->cr_bmval, + nfsd_attrmask); + if (status) + return status; + + current->fs->umask = create->cr_umask; + switch (create->cr_type) { + case NF4LNK: + status = nfsd_symlink(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + create->cr_data, &resfh); + break; + + case NF4BLK: + status = nfserr_inval; + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + goto out_umask; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFBLK, rdev, &resfh); + break; + + case NF4CHR: + status = nfserr_inval; + rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); + if (MAJOR(rdev) != create->cr_specdata1 || + MINOR(rdev) != create->cr_specdata2) + goto out_umask; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr,S_IFCHR, rdev, &resfh); + break; + + case NF4SOCK: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFSOCK, 0, &resfh); + break; + + case NF4FIFO: + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFIFO, 0, &resfh); + break; + + case NF4DIR: + create->cr_iattr.ia_valid &= ~ATTR_SIZE; + status = nfsd_create(rqstp, &cstate->current_fh, + create->cr_name, create->cr_namelen, + &create->cr_iattr, S_IFDIR, 0, &resfh); + break; + + default: + status = nfserr_badtype; + } + + if (status) + goto out; + + if (create->cr_label.len) + nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval); + + if (create->cr_acl != NULL) + do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, + create->cr_bmval); + + fh_unlock(&cstate->current_fh); + set_change_info(&create->cr_cinfo, &cstate->current_fh); + fh_dup2(&cstate->current_fh, &resfh); +out: + fh_put(&resfh); +out_umask: + current->fs->umask = 0; + return status; +} + +static __be32 +nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_getattr *getattr = &u->getattr; + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; + + getattr->ga_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_link *link = &u->link; + __be32 status; + + status = nfsd_link(rqstp, &cstate->current_fh, + link->li_name, link->li_namelen, &cstate->save_fh); + if (!status) + set_change_info(&link->li_cinfo, &cstate->current_fh); + return status; +} + +static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) +{ + struct svc_fh tmp_fh; + __be32 ret; + + fh_init(&tmp_fh, NFS4_FHSIZE); + ret = exp_pseudoroot(rqstp, &tmp_fh); + if (ret) + return ret; + if (tmp_fh.fh_dentry == fh->fh_dentry) { + fh_put(&tmp_fh); + return nfserr_noent; + } + fh_put(&tmp_fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh); +} + +static __be32 +nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + return nfsd4_do_lookupp(rqstp, &cstate->current_fh); +} + +static __be32 +nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + return nfsd_lookup(rqstp, &cstate->current_fh, + u->lookup.lo_name, u->lookup.lo_len, + &cstate->current_fh); +} + +static __be32 +nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_read *read = &u->read; + __be32 status; + + read->rd_nf = NULL; + if (read->rd_offset >= OFFSET_MAX) + return nfserr_inval; + + trace_nfsd_read_start(rqstp, &cstate->current_fh, + read->rd_offset, read->rd_length); + + /* + * If we do a zero copy read, then a client will see read data + * that reflects the state of the file *after* performing the + * following compound. + * + * To ensure proper ordering, we therefore turn off zero copy if + * the client wants us to do more in this compound: + */ + if (!nfsd4_last_compound_op(rqstp)) + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + + /* check stateid */ + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &read->rd_stateid, RD_STATE, + &read->rd_nf, NULL); + if (status) { + dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); + goto out; + } + status = nfs_ok; +out: + read->rd_rqstp = rqstp; + read->rd_fhp = &cstate->current_fh; + return status; +} + + +static void +nfsd4_read_release(union nfsd4_op_u *u) +{ + if (u->read.rd_nf) + nfsd_file_put(u->read.rd_nf); + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); +} + +static __be32 +nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_readdir *readdir = &u->readdir; + u64 cookie = readdir->rd_cookie; + static const nfs4_verifier zeroverf; + + /* no need to check permission - this will be done in nfsd_readdir() */ + + if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) + return nfserr_inval; + + readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; + + if ((cookie == 1) || (cookie == 2) || + (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) + return nfserr_bad_cookie; + + readdir->rd_rqstp = rqstp; + readdir->rd_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_readlink(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + u->readlink.rl_rqstp = rqstp; + u->readlink.rl_fhp = &cstate->current_fh; + return nfs_ok; +} + +static __be32 +nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_remove *remove = &u->remove; + __be32 status; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + status = nfsd_unlink(rqstp, &cstate->current_fh, 0, + remove->rm_name, remove->rm_namelen); + if (!status) { + fh_unlock(&cstate->current_fh); + set_change_info(&remove->rm_cinfo, &cstate->current_fh); + } + return status; +} + +static __be32 +nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_rename *rename = &u->rename; + __be32 status; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, + rename->rn_snamelen, &cstate->current_fh, + rename->rn_tname, rename->rn_tnamelen); + if (status) + return status; + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); + return nfs_ok; +} + +static __be32 +nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_secinfo *secinfo = &u->secinfo; + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry); + if (err) + return err; + fh_unlock(&cstate->current_fh); + if (d_really_is_negative(dentry)) { + exp_put(exp); + err = nfserr_noent; + } else + secinfo->si_exp = exp; + dput(dentry); + if (cstate->minorversion) + /* See rfc 5661 section 2.6.3.1.1.8 */ + fh_put(&cstate->current_fh); + return err; +} + +static __be32 +nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + __be32 err; + + switch (u->secinfo_no_name.sin_style) { + case NFS4_SECINFO_STYLE4_CURRENT_FH: + break; + case NFS4_SECINFO_STYLE4_PARENT: + err = nfsd4_do_lookupp(rqstp, &cstate->current_fh); + if (err) + return err; + break; + default: + return nfserr_inval; + } + + u->secinfo_no_name.sin_exp = exp_get(cstate->current_fh.fh_export); + fh_put(&cstate->current_fh); + return nfs_ok; +} + +static void +nfsd4_secinfo_release(union nfsd4_op_u *u) +{ + if (u->secinfo.si_exp) + exp_put(u->secinfo.si_exp); +} + +static void +nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) +{ + if (u->secinfo_no_name.sin_exp) + exp_put(u->secinfo_no_name.sin_exp); +} + +static __be32 +nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setattr *setattr = &u->setattr; + __be32 status = nfs_ok; + int err; + + if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + status = nfs4_preprocess_stateid_op(rqstp, cstate, + &cstate->current_fh, &setattr->sa_stateid, + WR_STATE, NULL, NULL); + if (status) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + return status; + } + } + err = fh_want_write(&cstate->current_fh); + if (err) + return nfserrno(err); + status = nfs_ok; + + status = check_attr_support(rqstp, cstate, setattr->sa_bmval, + nfsd_attrmask); + if (status) + goto out; + + if (setattr->sa_acl != NULL) + status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, + setattr->sa_acl); + if (status) + goto out; + if (setattr->sa_label.len) + status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh, + &setattr->sa_label); + if (status) + goto out; + status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, + 0, (time64_t)0); +out: + fh_drop_write(&cstate->current_fh); + return status; +} + +static __be32 +nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_write *write = &u->write; + stateid_t *stateid = &write->wr_stateid; + struct nfsd_file *nf = NULL; + __be32 status = nfs_ok; + unsigned long cnt; + int nvecs; + + if (write->wr_offset > (u64)OFFSET_MAX || + write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) + return nfserr_fbig; + + cnt = write->wr_buflen; + trace_nfsd_write_start(rqstp, &cstate->current_fh, + write->wr_offset, cnt); + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + stateid, WR_STATE, &nf, NULL); + if (status) { + dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + return status; + } + + write->wr_how_written = write->wr_stable_how; + + nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, + &write->wr_head, write->wr_buflen); + WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); + + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, + write->wr_offset, rqstp->rq_vec, nvecs, &cnt, + write->wr_how_written, + (__be32 *)write->wr_verifier.data); + nfsd_file_put(nf); + + write->wr_bytes_written = cnt; + trace_nfsd_write_done(rqstp, &cstate->current_fh, + write->wr_offset, cnt); + return status; +} + +static __be32 +nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) +{ + __be32 status; + + if (!cstate->save_fh.fh_dentry) + return nfserr_nofilehandle; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, + src_stateid, RD_STATE, src, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + goto out; + } + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + dst_stateid, WR_STATE, dst, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + goto out_put_src; + } + + /* fix up for NFS-specific error code */ + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { + status = nfserr_wrong_type; + goto out_put_dst; + } + +out: + return status; +out_put_dst: + nfsd_file_put(*dst); + *dst = NULL; +out_put_src: + nfsd_file_put(*src); + *src = NULL; + goto out; +} + +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_clone *clone = &u->clone; + struct nfsd_file *src, *dst; + __be32 status; + + status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, + &clone->cl_dst_stateid, &dst); + if (status) + goto out; + + status = nfsd4_clone_file_range(src, clone->cl_src_pos, + dst, clone->cl_dst_pos, clone->cl_count, + EX_ISSYNC(cstate->current_fh.fh_export)); + + nfsd_file_put(dst); + nfsd_file_put(src); +out: + return status; +} + +void nfs4_put_copy(struct nfsd4_copy *copy) +{ + if (!refcount_dec_and_test(©->refcount)) + return; + kfree(copy); +} + +static bool +check_and_set_stop_copy(struct nfsd4_copy *copy) +{ + bool value; + + spin_lock(©->cp_clp->async_lock); + value = copy->stopped; + if (!copy->stopped) + copy->stopped = true; + spin_unlock(©->cp_clp->async_lock); + return value; +} + +static void nfsd4_stop_copy(struct nfsd4_copy *copy) +{ + /* only 1 thread should stop the copy */ + if (!check_and_set_stop_copy(copy)) + kthread_stop(copy->copy_task); + nfs4_put_copy(copy); +} + +static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy = NULL; + + spin_lock(&clp->async_lock); + if (!list_empty(&clp->async_copies)) { + copy = list_first_entry(&clp->async_copies, struct nfsd4_copy, + copies); + refcount_inc(©->refcount); + } + spin_unlock(&clp->async_lock); + return copy; +} + +void nfsd4_shutdown_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy; + + while ((copy = nfsd4_get_copy(clp)) != NULL) + nfsd4_stop_copy(copy); +} +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + +extern struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, + nfs4_stateid *stateid); +extern void nfs42_ssc_close(struct file *filep); + +extern void nfs_sb_deactive(struct super_block *sb); + +#define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys" + +/* + * Support one copy source server for now. + */ +static __be32 +nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, + struct vfsmount **mount) +{ + struct file_system_type *type; + struct vfsmount *ss_mnt; + struct nfs42_netaddr *naddr; + struct sockaddr_storage tmp_addr; + size_t tmp_addrlen, match_netid_len = 3; + char *startsep = "", *endsep = "", *match_netid = "tcp"; + char *ipaddr, *dev_name, *raw_data; + int len, raw_len; + __be32 status = nfserr_inval; + + naddr = &nss->u.nl4_addr; + tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr, + naddr->addr_len, + (struct sockaddr *)&tmp_addr, + sizeof(tmp_addr)); + if (tmp_addrlen == 0) + goto out_err; + + if (tmp_addr.ss_family == AF_INET6) { + startsep = "["; + endsep = "]"; + match_netid = "tcp6"; + match_netid_len = 4; + } + + if (naddr->netid_len != match_netid_len || + strncmp(naddr->netid, match_netid, naddr->netid_len)) + goto out_err; + + /* Construct the raw data for the vfs_kern_mount call */ + len = RPC_MAX_ADDRBUFLEN + 1; + ipaddr = kzalloc(len, GFP_KERNEL); + if (!ipaddr) + goto out_err; + + rpc_ntop((struct sockaddr *)&tmp_addr, ipaddr, len); + + /* 2 for ipv6 endsep and startsep. 3 for ":/" and trailing '/0'*/ + + raw_len = strlen(NFSD42_INTERSSC_MOUNTOPS) + strlen(ipaddr); + raw_data = kzalloc(raw_len, GFP_KERNEL); + if (!raw_data) + goto out_free_ipaddr; + + snprintf(raw_data, raw_len, NFSD42_INTERSSC_MOUNTOPS, ipaddr); + + status = nfserr_nodev; + type = get_fs_type("nfs"); + if (!type) + goto out_free_rawdata; + + /* Set the server:<export> for the vfs_kern_mount call */ + dev_name = kzalloc(len + 5, GFP_KERNEL); + if (!dev_name) + goto out_free_rawdata; + snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); + + /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ + ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data); + module_put(type->owner); + if (IS_ERR(ss_mnt)) + goto out_free_devname; + + status = 0; + *mount = ss_mnt; + +out_free_devname: + kfree(dev_name); +out_free_rawdata: + kfree(raw_data); +out_free_ipaddr: + kfree(ipaddr); +out_err: + return status; +} + +/* + * Verify COPY destination stateid. + * + * Connect to the source server with NFSv4.1. + * Create the source struct file for nfsd_copy_range. + * Called with COPY cstate: + * SAVED_FH: source filehandle + * CURRENT_FH: destination filehandle + */ +static __be32 +nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy, struct vfsmount **mount) +{ + struct svc_fh *s_fh = NULL; + stateid_t *s_stid = ©->cp_src_stateid; + __be32 status = nfserr_inval; + + /* Verify the destination stateid and set dst struct file*/ + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + ©->cp_dst_stateid, + WR_STATE, ©->nf_dst, NULL); + if (status) + goto out; + + status = nfsd4_interssc_connect(©->cp_src, rqstp, mount); + if (status) + goto out; + + s_fh = &cstate->save_fh; + + copy->c_fh.size = s_fh->fh_handle.fh_size; + memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size); + copy->stateid.seqid = cpu_to_be32(s_stid->si_generation); + memcpy(copy->stateid.other, (void *)&s_stid->si_opaque, + sizeof(stateid_opaque_t)); + + status = 0; +out: + return status; +} + +static void +nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, + struct nfsd_file *dst) +{ + nfs42_ssc_close(src->nf_file); + fput(src->nf_file); + nfsd_file_put(dst); + mntput(ss_mnt); +} + +#else /* CONFIG_NFSD_V4_2_INTER_SSC */ + +static __be32 +nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy, + struct vfsmount **mount) +{ + *mount = NULL; + return nfserr_inval; +} + +static void +nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, + struct nfsd_file *dst) +{ +} + +static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, + nfs4_stateid *stateid) +{ + return NULL; +} +#endif /* CONFIG_NFSD_V4_2_INTER_SSC */ + +static __be32 +nfsd4_setup_intra_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + return nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); +} + +static void +nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst) +{ + nfsd_file_put(src); + nfsd_file_put(dst); +} + +static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) +{ + struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb); + + nfs4_put_copy(copy); +} + +static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + return 1; +} + +static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { + .release = nfsd4_cb_offload_release, + .done = nfsd4_cb_offload_done +}; + +static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) +{ + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_synchronous = sync; + gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); +} + +static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) +{ + struct file *dst = copy->nf_dst->nf_file; + struct file *src = copy->nf_src->nf_file; + ssize_t bytes_copied = 0; + size_t bytes_total = copy->cp_count; + u64 src_pos = copy->cp_src_pos; + u64 dst_pos = copy->cp_dst_pos; + + do { + if (kthread_should_stop()) + break; + bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, + bytes_total); + if (bytes_copied <= 0) + break; + bytes_total -= bytes_copied; + copy->cp_res.wr_bytes_written += bytes_copied; + src_pos += bytes_copied; + dst_pos += bytes_copied; + } while (bytes_total > 0 && !copy->cp_synchronous); + return bytes_copied; +} + +static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) +{ + __be32 status; + ssize_t bytes; + + bytes = _nfsd_copy_file_range(copy); + /* for async copy, we ignore the error, client can always retry + * to get the error + */ + if (bytes < 0 && !copy->cp_res.wr_bytes_written) + status = nfserrno(bytes); + else { + nfsd4_init_copy_res(copy, sync); + status = nfs_ok; + } + + if (!copy->cp_intra) /* Inter server SSC */ + nfsd4_cleanup_inter_ssc(copy->ss_mnt, copy->nf_src, + copy->nf_dst); + else + nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst); + + return status; +} + +static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) +{ + dst->cp_src_pos = src->cp_src_pos; + dst->cp_dst_pos = src->cp_dst_pos; + dst->cp_count = src->cp_count; + dst->cp_synchronous = src->cp_synchronous; + memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); + memcpy(&dst->fh, &src->fh, sizeof(src->fh)); + dst->cp_clp = src->cp_clp; + dst->nf_dst = nfsd_file_get(src->nf_dst); + dst->cp_intra = src->cp_intra; + if (src->cp_intra) /* for inter, file_src doesn't exist yet */ + dst->nf_src = nfsd_file_get(src->nf_src); + + memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); + memcpy(&dst->cp_src, &src->cp_src, sizeof(struct nl4_server)); + memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); + memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); + dst->ss_mnt = src->ss_mnt; +} + +static void cleanup_async_copy(struct nfsd4_copy *copy) +{ + nfs4_free_copy_state(copy); + nfsd_file_put(copy->nf_dst); + if (copy->cp_intra) + nfsd_file_put(copy->nf_src); + spin_lock(©->cp_clp->async_lock); + list_del(©->copies); + spin_unlock(©->cp_clp->async_lock); + nfs4_put_copy(copy); +} + +static int nfsd4_do_async_copy(void *data) +{ + struct nfsd4_copy *copy = (struct nfsd4_copy *)data; + struct nfsd4_copy *cb_copy; + + if (!copy->cp_intra) { /* Inter server SSC */ + copy->nf_src = kzalloc(sizeof(struct nfsd_file), GFP_KERNEL); + if (!copy->nf_src) { + copy->nfserr = nfserr_serverfault; + /* ss_mnt will be unmounted by the laundromat */ + goto do_callback; + } + copy->nf_src->nf_file = nfs42_ssc_open(copy->ss_mnt, ©->c_fh, + ©->stateid); + if (IS_ERR(copy->nf_src->nf_file)) { + copy->nfserr = nfserr_offload_denied; + /* ss_mnt will be unmounted by the laundromat */ + goto do_callback; + } + } + + copy->nfserr = nfsd4_do_copy(copy, 0); +do_callback: + cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!cb_copy) + goto out; + refcount_set(&cb_copy->refcount, 1); + memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); + cb_copy->cp_clp = copy->cp_clp; + cb_copy->nfserr = copy->nfserr; + memcpy(&cb_copy->fh, ©->fh, sizeof(copy->fh)); + nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, + &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd4_run_cb(&cb_copy->cp_cb); +out: + if (!copy->cp_intra) + kfree(copy->nf_src); + cleanup_async_copy(copy); + return 0; +} + +static __be32 +nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_copy *copy = &u->copy; + __be32 status; + struct nfsd4_copy *async_copy = NULL; + + if (!copy->cp_intra) { /* Inter server SSC */ + if (!inter_copy_offload_enable || copy->cp_synchronous) { + status = nfserr_notsupp; + goto out; + } + status = nfsd4_setup_inter_ssc(rqstp, cstate, copy, + ©->ss_mnt); + if (status) + return nfserr_offload_denied; + } else { + status = nfsd4_setup_intra_ssc(rqstp, cstate, copy); + if (status) + return status; + } + + copy->cp_clp = cstate->clp; + memcpy(©->fh, &cstate->current_fh.fh_handle, + sizeof(struct knfsd_fh)); + if (!copy->cp_synchronous) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + status = nfserrno(-ENOMEM); + async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!async_copy) + goto out_err; + if (!nfs4_init_copy_state(nn, copy)) + goto out_err; + refcount_set(&async_copy->refcount, 1); + memcpy(©->cp_res.cb_stateid, ©->cp_stateid.stid, + sizeof(copy->cp_res.cb_stateid)); + dup_copy_fields(copy, async_copy); + async_copy->copy_task = kthread_create(nfsd4_do_async_copy, + async_copy, "%s", "copy thread"); + if (IS_ERR(async_copy->copy_task)) + goto out_err; + spin_lock(&async_copy->cp_clp->async_lock); + list_add(&async_copy->copies, + &async_copy->cp_clp->async_copies); + spin_unlock(&async_copy->cp_clp->async_lock); + wake_up_process(async_copy->copy_task); + status = nfs_ok; + } else { + status = nfsd4_do_copy(copy, 1); + } +out: + return status; +out_err: + if (async_copy) + cleanup_async_copy(async_copy); + status = nfserrno(-ENOMEM); + /* + * source's vfsmount of inter-copy will be unmounted + * by the laundromat + */ + goto out; +} + +struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { + if (memcmp(©->cp_stateid.stid, stateid, NFS4_STATEID_SIZE)) + continue; + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; + } + spin_unlock(&clp->async_lock); + return NULL; +} + +static __be32 +nfsd4_offload_cancel(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (!copy) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + return manage_cpntf_state(nn, &os->stateid, clp, NULL); + } else + nfsd4_stop_copy(copy); + + return nfs_ok; +} + +static __be32 +nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_copy_notify *cn = &u->copy_notify; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_stid *stid; + struct nfs4_cpntf_state *cps; + struct nfs4_client *clp = cstate->clp; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &cn->cpn_src_stateid, RD_STATE, NULL, + &stid); + if (status) + return status; + + cn->cpn_sec = nn->nfsd4_lease; + cn->cpn_nsec = 0; + + status = nfserrno(-ENOMEM); + cps = nfs4_alloc_init_cpntf_state(nn, stid); + if (!cps) + goto out; + memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t)); + memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t)); + memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t)); + + /* For now, only return one server address in cpn_src, the + * address used by the client to connect to this server. + */ + cn->cpn_src.nl4_type = NL4_NETADDR; + status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr, + &cn->cpn_src.u.nl4_addr); + WARN_ON_ONCE(status); + if (status) { + nfs4_put_cpntf_state(nn, cps); + goto out; + } +out: + nfs4_put_stid(stid); + return status; +} + +static __be32 +nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate, int flags) +{ + __be32 status; + struct nfsd_file *nf; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &fallocate->falloc_stateid, + WR_STATE, &nf, NULL); + if (status != nfs_ok) { + dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + return status; + } + + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, + fallocate->falloc_offset, + fallocate->falloc_length, + flags); + nfsd_file_put(nf); + return status; +} +static __be32 +nfsd4_offload_status(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + __be32 status = 0; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (copy) { + os->count = copy->cp_res.wr_bytes_written; + nfs4_put_copy(copy); + } else + status = nfserr_bad_stateid; + + return status; +} + +static __be32 +nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + return nfsd4_fallocate(rqstp, cstate, &u->allocate, 0); +} + +static __be32 +nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + return nfsd4_fallocate(rqstp, cstate, &u->deallocate, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE); +} + +static __be32 +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_seek *seek = &u->seek; + int whence; + __be32 status; + struct nfsd_file *nf; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &seek->seek_stateid, + RD_STATE, &nf, NULL); + if (status) { + dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + return status; + } + + switch (seek->seek_whence) { + case NFS4_CONTENT_DATA: + whence = SEEK_DATA; + break; + case NFS4_CONTENT_HOLE: + whence = SEEK_HOLE; + break; + default: + status = nfserr_union_notsupp; + goto out; + } + + /* + * Note: This call does change file->f_pos, but nothing in NFSD + * should ever file->f_pos. + */ + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); + if (seek->seek_pos < 0) + status = nfserrno(seek->seek_pos); + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) + seek->seek_eof = true; + +out: + nfsd_file_put(nf); + return status; +} + +/* This routine never returns NFS_OK! If there are no other errors, it + * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the + * attributes matched. VERIFY is implemented by mapping NFSERR_SAME + * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. + */ +static __be32 +_nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_verify *verify) +{ + __be32 *buf, *p; + int count; + __be32 status; + + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); + if (status) + return status; + + status = check_attr_support(rqstp, cstate, verify->ve_bmval, NULL); + if (status) + return status; + + if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) + || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) + return nfserr_inval; + if (verify->ve_attrlen & 3) + return nfserr_inval; + + /* count in words: + * bitmap_len(1) + bitmap(2) + attr_len(1) = 4 + */ + count = 4 + (verify->ve_attrlen >> 2); + buf = kmalloc(count << 2, GFP_KERNEL); + if (!buf) + return nfserr_jukebox; + + p = buf; + status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh, + cstate->current_fh.fh_export, + cstate->current_fh.fh_dentry, + verify->ve_bmval, + rqstp, 0); + /* + * If nfsd4_encode_fattr() ran out of space, assume that's because + * the attributes are longer (hence different) than those given: + */ + if (status == nfserr_resource) + status = nfserr_not_same; + if (status) + goto out_kfree; + + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); + status = nfserr_not_same; + if (ntohl(*p++) != verify->ve_attrlen) + goto out_kfree; + if (!memcmp(p, verify->ve_attrval, verify->ve_attrlen)) + status = nfserr_same; + +out_kfree: + kfree(buf); + return status; +} + +static __be32 +nfsd4_nverify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, &u->verify); + return status == nfserr_not_same ? nfs_ok : status; +} + +static __be32 +nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + __be32 status; + + status = _nfsd4_verify(rqstp, cstate, &u->nverify); + return status == nfserr_same ? nfs_ok : status; +} + +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_types) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (layout_type >= LAYOUT_TYPE_MAX || + !(exp->ex_layout_types & (1 << layout_type))) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_getdeviceinfo *gdp = &u->getdeviceinfo; + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) { + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, + rqstp, cstate->session->se_client, gdp); + } + + gdp->gd_notify_types &= ops->notify_types; +out: + exp_put(exp); + return nfserr; +} + +static void +nfsd4_getdeviceinfo_release(union nfsd4_op_u *u) +{ + kfree(u->getdeviceinfo.gd_device); +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_layoutget *lgp = &u->layoutget; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode = NFSD_MAY_READ_IF_EXEC; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode |= NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode |= NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result. + */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) { + trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid); + goto out; + } + + nfserr = nfserr_recallconflict; + if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) + goto out_put_stid; + + nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + mutex_unlock(&ls->ls_mutex); + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static void +nfsd4_layoutget_release(union nfsd4_op_u *u) +{ + kfree(u->layoutget.lg_content); +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = d_inode(current_fh->fh_dentry); + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_getxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_getxattr *getxattr = &u->getxattr; + + return nfsd_getxattr(rqstp, &cstate->current_fh, + getxattr->getxa_name, &getxattr->getxa_buf, + &getxattr->getxa_len); +} + +static __be32 +nfsd4_setxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setxattr *setxattr = &u->setxattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_setxattr(rqstp, &cstate->current_fh, setxattr->setxa_name, + setxattr->setxa_buf, setxattr->setxa_len, + setxattr->setxa_flags); + + if (!ret) + set_change_info(&setxattr->setxa_cinfo, &cstate->current_fh); + + return ret; +} + +static __be32 +nfsd4_listxattrs(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + /* + * Get the entire list, then copy out only the user attributes + * in the encode function. + */ + return nfsd_listxattr(rqstp, &cstate->current_fh, + &u->listxattrs.lsxa_buf, &u->listxattrs.lsxa_len); +} + +static __be32 +nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_removexattr *removexattr = &u->removexattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_removexattr(rqstp, &cstate->current_fh, + removexattr->rmxa_name); + + if (!ret) + set_change_info(&removexattr->rmxa_cinfo, &cstate->current_fh); + + return ret; +} + +/* + * NULL call. + */ +static __be32 +nfsd4_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +static inline void nfsd4_increment_op_stats(u32 opnum) +{ + if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) + nfsdstats.nfs4_opcount[opnum]++; +} + +static const struct nfsd4_operation nfsd4_ops[]; + +static const char *nfsd4_op_name(unsigned opnum); + +/* + * Enforce NFSv4.1 COMPOUND ordering rules: + * + * Also note, enforced elsewhere: + * - SEQUENCE other than as first op results in + * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) + * - BIND_CONN_TO_SESSION must be the only op in its compound. + * (Enforced in nfsd4_bind_conn_to_session().) + * - DESTROY_SESSION must be the final operation in a compound, if + * sessionid's in SEQUENCE and DESTROY_SESSION are the same. + * (Enforced in nfsd4_destroy_session().) + */ +static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) +{ + struct nfsd4_op *first_op = &args->ops[0]; + + /* These ordering requirements don't apply to NFSv4.0: */ + if (args->minorversion == 0) + return nfs_ok; + /* This is weird, but OK, not our problem: */ + if (args->opcnt == 0) + return nfs_ok; + if (first_op->status == nfserr_op_illegal) + return nfs_ok; + if (!(nfsd4_ops[first_op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + return nfserr_op_not_in_session; + if (first_op->opnum == OP_SEQUENCE) + return nfs_ok; + /* + * So first_op is something allowed outside a session, like + * EXCHANGE_ID; but then it has to be the only op in the + * compound: + */ + if (args->opcnt != 1) + return nfserr_not_only_op; + return nfs_ok; +} + +const struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +{ + return &nfsd4_ops[op->opnum]; +} + +bool nfsd4_cache_this_op(struct nfsd4_op *op) +{ + if (op->opnum == OP_ILLEGAL) + return false; + return OPDESC(op)->op_flags & OP_CACHEME; +} + +static bool need_wrongsec_check(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *next = &argp->ops[resp->opcnt]; + const struct nfsd4_operation *thisd = OPDESC(this); + const struct nfsd4_operation *nextd; + + /* + * Most ops check wronsec on our own; only the putfh-like ops + * have special rules. + */ + if (!(thisd->op_flags & OP_IS_PUTFH_LIKE)) + return false; + /* + * rfc 5661 2.6.3.1.1.6: don't bother erroring out a + * put-filehandle operation if we're not going to use the + * result: + */ + if (argp->opcnt == resp->opcnt) + return false; + if (next->opnum == OP_ILLEGAL) + return false; + nextd = OPDESC(next); + /* + * Rest of 2.6.3.1.1: certain operations will return WRONGSEC + * errors themselves as necessary; others should check for them + * now: + */ + return !(nextd->op_flags & OP_HANDLES_WRONGSEC); +} + +static void svcxdr_init_encode(struct svc_rqst *rqstp, + struct nfsd4_compoundres *resp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + + xdr->buf = buf; + xdr->iov = head; + xdr->p = head->iov_base + head->iov_len; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; + /* Tail and page_len should be zero at this point: */ + buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages - 1; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - rqstp->rq_auth_slack; +} + +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static void +check_if_stalefh_allowed(struct nfsd4_compoundargs *args) +{ + struct nfsd4_op *op, *current_op = NULL, *saved_op = NULL; + struct nfsd4_copy *copy; + struct nfsd4_putfh *putfh; + int i; + + /* traverse all operation and if it's a COPY compound, mark the + * source filehandle to skip verification + */ + for (i = 0; i < args->opcnt; i++) { + op = &args->ops[i]; + if (op->opnum == OP_PUTFH) + current_op = op; + else if (op->opnum == OP_SAVEFH) + saved_op = current_op; + else if (op->opnum == OP_RESTOREFH) + current_op = saved_op; + else if (op->opnum == OP_COPY) { + copy = (struct nfsd4_copy *)&op->u; + if (!saved_op) { + op->status = nfserr_nofilehandle; + return; + } + putfh = (struct nfsd4_putfh *)&saved_op->u; + if (!copy->cp_intra) + putfh->no_verify = true; + } + } +} +#else +static void +check_if_stalefh_allowed(struct nfsd4_compoundargs *args) +{ +} +#endif + +/* + * COMPOUND call. + */ +static __be32 +nfsd4_proc_compound(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_op *op; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct svc_fh *current_fh = &cstate->current_fh; + struct svc_fh *save_fh = &cstate->save_fh; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + + svcxdr_init_encode(rqstp, resp); + resp->tagp = resp->xdr.p; + /* reserve space for: taglen, tag, and opcnt */ + xdr_reserve_space(&resp->xdr, 8 + args->taglen); + resp->taglen = args->taglen; + resp->tag = args->tag; + resp->rqstp = rqstp; + cstate->minorversion = args->minorversion; + fh_init(current_fh, NFS4_FHSIZE); + fh_init(save_fh, NFS4_FHSIZE); + /* + * Don't use the deferral mechanism for NFSv4; compounds make it + * too hard to avoid non-idempotency problems. + */ + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + + /* + * According to RFC3010, this takes precedence over all other errors. + */ + status = nfserr_minor_vers_mismatch; + if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0) + goto out; + status = nfserr_resource; + if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND) + goto out; + + status = nfs41_check_op_ordering(args); + if (status) { + op = &args->ops[0]; + op->status = status; + resp->opcnt = 1; + goto encode_op; + } + check_if_stalefh_allowed(args); + + rqstp->rq_lease_breaker = (void **)&cstate->clp; + + trace_nfsd_compound(rqstp, args->opcnt); + while (!status && resp->opcnt < args->opcnt) { + op = &args->ops[resp->opcnt++]; + + /* + * The XDR decode routines may have pre-set op->status; + * for example, if there is a miscellaneous XDR error + * it will be set to nfserr_bad_xdr. + */ + if (op->status) { + if (op->opnum == OP_OPEN) + op->status = nfsd4_open_omfg(rqstp, cstate, op); + goto encode_op; + } + if (!current_fh->fh_dentry && + !HAS_FH_FLAG(current_fh, NFSD4_FH_FOREIGN)) { + if (!(op->opdesc->op_flags & ALLOWED_WITHOUT_FH)) { + op->status = nfserr_nofilehandle; + goto encode_op; + } + } else if (current_fh->fh_export && + current_fh->fh_export->ex_fslocs.migrated && + !(op->opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { + op->status = nfserr_moved; + goto encode_op; + } + + fh_clear_wcc(current_fh); + + /* If op is non-idempotent */ + if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { + /* + * Don't execute this op if we couldn't encode a + * succesful reply: + */ + u32 plen = op->opdesc->op_rsize_bop(rqstp, op); + /* + * Plus if there's another operation, make sure + * we'll have space to at least encode an error: + */ + if (resp->opcnt < args->opcnt) + plen += COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, plen); + } + + if (op->status) + goto encode_op; + + if (op->opdesc->op_get_currentstateid) + op->opdesc->op_get_currentstateid(cstate, &op->u); + op->status = op->opdesc->op_func(rqstp, cstate, &op->u); + + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } + if (!op->status) { + if (op->opdesc->op_set_currentstateid) + op->opdesc->op_set_currentstateid(cstate, &op->u); + + if (op->opdesc->op_flags & OP_CLEAR_STATEID) + clear_current_stateid(cstate); + + if (current_fh->fh_export && + need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(current_fh->fh_export, rqstp); + } +encode_op: + if (op->status == nfserr_replay_me) { + op->replay = &cstate->replay_owner->so_replay; + nfsd4_encode_replay(&resp->xdr, op); + status = op->status = op->replay->rp_status; + } else { + nfsd4_encode_operation(resp, op); + status = op->status; + } + + trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, + nfsd4_op_name(op->opnum)); + + nfsd4_cstate_clear_replay(cstate); + nfsd4_increment_op_stats(op->opnum); + } + + fh_put(current_fh); + fh_put(save_fh); + BUG_ON(cstate->replay_owner); +out: + cstate->status = status; + /* Reset deferral mechanism for RPC deferrals */ + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + return rpc_success; +} + +#define op_encode_hdr_size (2) +#define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define op_encode_change_info_maxsz (5) +#define nfs4_fattr_bitmap_maxsz (4) + +/* We'll fall back on returning no lockowner if run out of space: */ +#define op_encode_lockowner_maxsz (0) +#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) + +#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) + +#define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) +#define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ + op_encode_ace_maxsz) + +#define op_encode_channel_attrs_maxsz (6 + 1 + 1) + +static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size) * sizeof(__be32); +} + +static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); +} + +static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + /* ac_supported, ac_resp_access */ + return (op_encode_hdr_size + 2)* sizeof(__be32); +} + +static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +/* + * Note since this is an idempotent operation we won't insist on failing + * the op prematurely if the estimate is too large. We may turn off splice + * reads unnecessarily. + */ +static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 *bmap = op->u.getattr.ga_bmval; + u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; + u32 ret = 0; + + if (bmap0 & FATTR4_WORD0_ACL) + return svc_max_payload(rqstp); + if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) + return svc_max_payload(rqstp); + + if (bmap1 & FATTR4_WORD1_OWNER) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER; + } + if (bmap1 & FATTR4_WORD1_OWNER_GROUP) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER_GROUP; + } + if (bmap0 & FATTR4_WORD0_FILEHANDLE) { + ret += NFS4_FHSIZE + 4; + bmap0 &= ~FATTR4_WORD0_FILEHANDLE; + } + if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { + ret += NFS4_MAXLABELLEN + 12; + bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; + } + /* + * Largest of remaining attributes are 16 bytes (e.g., + * supported_attributes) + */ + ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2)); + /* bitmask, length */ + ret += 20; + return ret; +} + +static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; +} + +static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_lock_denied_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_stateid_maxsz + + op_encode_change_info_maxsz + 1 + + nfs4_fattr_bitmap_maxsz + + op_encode_delegation_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.read.rd_length, maxcount); + + return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = svc_max_payload(rqstp); + u32 rlen = min(op->u.read.rd_length, maxcount); + /* + * If we detect that the file changed during hole encoding, then we + * recover by encoding the remaining reply as data. This means we need + * to set aside enough room to encode two data segments. + */ + u32 seg_len = 2 * (1 + 2 + 1); + + return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.readdir.rd_maxcount, maxcount); + + return (op_encode_hdr_size + op_encode_verifier_maxsz + + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; +} + +static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + +static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz + + op_encode_change_info_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); +} + +static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) + * sizeof(__be32); +} + +static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * + (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); +} + +static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * + sizeof(__be32); +} + +static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ + 1 + 1 + /* eir_flags, spr_how */\ + 4 + /* spo_must_enforce & _allow with bitmap */\ + 2 + /*eir_server_owner.so_minor_id */\ + /* eir_server_owner.so_major_id<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + /* eir_server_scope<> */\ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ + 1 + /* eir_server_impl_id array length */\ + 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); +} + +static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ + 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); +} + +static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ + 2 + /* csr_sequence, csr_flags */\ + op_encode_channel_attrs_maxsz + \ + op_encode_channel_attrs_maxsz) * sizeof(__be32); +} + +static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* wr_callback */ + + op_encode_stateid_maxsz /* wr_callback */ + + 2 /* wr_count */ + + 1 /* wr_committed */ + + op_encode_verifier_maxsz + + 1 /* cr_consecutive */ + + 1 /* cr_synchronous */) * sizeof(__be32); +} + +static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 2 /* osr_count */ + + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); +} + +static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 3 /* cnr_lease_time */ + + 1 /* We support one cnr_source_server */ + + 1 /* cnr_stateid seq */ + + op_encode_stateid_maxsz /* cnr_stateid */ + + 1 /* num cnr_source_server*/ + + 1 /* nl4_type */ + + 1 /* nl4 size */ + + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) /*nl4_loc + nl4_loc_sz */) + * sizeof(__be32); +} + +#ifdef CONFIG_NFSD_PNFS +static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + + return (op_encode_hdr_size + + 1 /* gd_layout_type*/ + + XDR_QUADLEN(rlen) + + 2 /* gd_notify_types */) * sizeof(__be32); +} + +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + + +static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 3) * sizeof(__be32); +} + +static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 maxcount, rlen; + + maxcount = svc_max_payload(rqstp); + rlen = min_t(u32, XATTR_SIZE_MAX, maxcount); + + return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} +static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 maxcount, rlen; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount); + + return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + + +static const struct nfsd4_operation nfsd4_ops[] = { + [OP_ACCESS] = { + .op_func = nfsd4_access, + .op_name = "OP_ACCESS", + .op_rsize_bop = nfsd4_access_rsize, + }, + [OP_CLOSE] = { + .op_func = nfsd4_close, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_CLOSE", + .op_rsize_bop = nfsd4_status_stateid_rsize, + .op_get_currentstateid = nfsd4_get_closestateid, + .op_set_currentstateid = nfsd4_set_closestateid, + }, + [OP_COMMIT] = { + .op_func = nfsd4_commit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_COMMIT", + .op_rsize_bop = nfsd4_commit_rsize, + }, + [OP_CREATE] = { + .op_func = nfsd4_create, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME | OP_CLEAR_STATEID, + .op_name = "OP_CREATE", + .op_rsize_bop = nfsd4_create_rsize, + }, + [OP_DELEGRETURN] = { + .op_func = nfsd4_delegreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_DELEGRETURN", + .op_rsize_bop = nfsd4_only_status_rsize, + .op_get_currentstateid = nfsd4_get_delegreturnstateid, + }, + [OP_GETATTR] = { + .op_func = nfsd4_getattr, + .op_flags = ALLOWED_ON_ABSENT_FS, + .op_rsize_bop = nfsd4_getattr_rsize, + .op_name = "OP_GETATTR", + }, + [OP_GETFH] = { + .op_func = nfsd4_getfh, + .op_name = "OP_GETFH", + .op_rsize_bop = nfsd4_getfh_rsize, + }, + [OP_LINK] = { + .op_func = nfsd4_link, + .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING + | OP_CACHEME, + .op_name = "OP_LINK", + .op_rsize_bop = nfsd4_link_rsize, + }, + [OP_LOCK] = { + .op_func = nfsd4_lock, + .op_flags = OP_MODIFIES_SOMETHING | + OP_NONTRIVIAL_ERROR_ENCODE, + .op_name = "OP_LOCK", + .op_rsize_bop = nfsd4_lock_rsize, + .op_set_currentstateid = nfsd4_set_lockstateid, + }, + [OP_LOCKT] = { + .op_func = nfsd4_lockt, + .op_flags = OP_NONTRIVIAL_ERROR_ENCODE, + .op_name = "OP_LOCKT", + .op_rsize_bop = nfsd4_lock_rsize, + }, + [OP_LOCKU] = { + .op_func = nfsd4_locku, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LOCKU", + .op_rsize_bop = nfsd4_status_stateid_rsize, + .op_get_currentstateid = nfsd4_get_lockustateid, + }, + [OP_LOOKUP] = { + .op_func = nfsd4_lookup, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, + .op_name = "OP_LOOKUP", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_LOOKUPP] = { + .op_func = nfsd4_lookupp, + .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, + .op_name = "OP_LOOKUPP", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_NVERIFY] = { + .op_func = nfsd4_nverify, + .op_name = "OP_NVERIFY", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_OPEN] = { + .op_func = nfsd4_open, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN", + .op_rsize_bop = nfsd4_open_rsize, + .op_set_currentstateid = nfsd4_set_openstateid, + }, + [OP_OPEN_CONFIRM] = { + .op_func = nfsd4_open_confirm, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN_CONFIRM", + .op_rsize_bop = nfsd4_status_stateid_rsize, + }, + [OP_OPEN_DOWNGRADE] = { + .op_func = nfsd4_open_downgrade, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OPEN_DOWNGRADE", + .op_rsize_bop = nfsd4_status_stateid_rsize, + .op_get_currentstateid = nfsd4_get_opendowngradestateid, + .op_set_currentstateid = nfsd4_set_opendowngradestateid, + }, + [OP_PUTFH] = { + .op_func = nfsd4_putfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTFH", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_PUTPUBFH] = { + .op_func = nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTPUBFH", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_PUTROOTFH] = { + .op_func = nfsd4_putrootfh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, + .op_name = "OP_PUTROOTFH", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_READ] = { + .op_func = nfsd4_read, + .op_release = nfsd4_read_release, + .op_name = "OP_READ", + .op_rsize_bop = nfsd4_read_rsize, + .op_get_currentstateid = nfsd4_get_readstateid, + }, + [OP_READDIR] = { + .op_func = nfsd4_readdir, + .op_name = "OP_READDIR", + .op_rsize_bop = nfsd4_readdir_rsize, + }, + [OP_READLINK] = { + .op_func = nfsd4_readlink, + .op_name = "OP_READLINK", + .op_rsize_bop = nfsd4_readlink_rsize, + }, + [OP_REMOVE] = { + .op_func = nfsd4_remove, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_REMOVE", + .op_rsize_bop = nfsd4_remove_rsize, + }, + [OP_RENAME] = { + .op_func = nfsd4_rename, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_RENAME", + .op_rsize_bop = nfsd4_rename_rsize, + }, + [OP_RENEW] = { + .op_func = nfsd4_renew, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, + .op_name = "OP_RENEW", + .op_rsize_bop = nfsd4_only_status_rsize, + + }, + [OP_RESTOREFH] = { + .op_func = nfsd4_restorefh, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, + .op_name = "OP_RESTOREFH", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_SAVEFH] = { + .op_func = nfsd4_savefh, + .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, + .op_name = "OP_SAVEFH", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_SECINFO] = { + .op_func = nfsd4_secinfo, + .op_release = nfsd4_secinfo_release, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO", + .op_rsize_bop = nfsd4_secinfo_rsize, + }, + [OP_SETATTR] = { + .op_func = nfsd4_setattr, + .op_name = "OP_SETATTR", + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME + | OP_NONTRIVIAL_ERROR_ENCODE, + .op_rsize_bop = nfsd4_setattr_rsize, + .op_get_currentstateid = nfsd4_get_setattrstateid, + }, + [OP_SETCLIENTID] = { + .op_func = nfsd4_setclientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME + | OP_NONTRIVIAL_ERROR_ENCODE, + .op_name = "OP_SETCLIENTID", + .op_rsize_bop = nfsd4_setclientid_rsize, + }, + [OP_SETCLIENTID_CONFIRM] = { + .op_func = nfsd4_setclientid_confirm, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETCLIENTID_CONFIRM", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_VERIFY] = { + .op_func = nfsd4_verify, + .op_name = "OP_VERIFY", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_WRITE] = { + .op_func = nfsd4_write, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_WRITE", + .op_rsize_bop = nfsd4_write_rsize, + .op_get_currentstateid = nfsd4_get_writestateid, + }, + [OP_RELEASE_LOCKOWNER] = { + .op_func = nfsd4_release_lockowner, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS + | OP_MODIFIES_SOMETHING, + .op_name = "OP_RELEASE_LOCKOWNER", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_EXCHANGE_ID", + .op_rsize_bop = nfsd4_exchange_id_rsize, + }, + [OP_BACKCHANNEL_CTL] = { + .op_func = nfsd4_backchannel_ctl, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_BACKCHANNEL_CTL", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_BIND_CONN_TO_SESSION] = { + .op_func = nfsd4_bind_conn_to_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_BIND_CONN_TO_SESSION", + .op_rsize_bop = nfsd4_bind_conn_to_session_rsize, + }, + [OP_CREATE_SESSION] = { + .op_func = nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_CREATE_SESSION", + .op_rsize_bop = nfsd4_create_session_rsize, + }, + [OP_DESTROY_SESSION] = { + .op_func = nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_DESTROY_SESSION", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_SEQUENCE] = { + .op_func = nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + .op_rsize_bop = nfsd4_sequence_rsize, + }, + [OP_DESTROY_CLIENTID] = { + .op_func = nfsd4_destroy_clientid, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP + | OP_MODIFIES_SOMETHING, + .op_name = "OP_DESTROY_CLIENTID", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_RECLAIM_COMPLETE] = { + .op_func = nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_RECLAIM_COMPLETE", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_SECINFO_NO_NAME] = { + .op_func = nfsd4_secinfo_no_name, + .op_release = nfsd4_secinfo_no_name_release, + .op_flags = OP_HANDLES_WRONGSEC, + .op_name = "OP_SECINFO_NO_NAME", + .op_rsize_bop = nfsd4_secinfo_rsize, + }, + [OP_TEST_STATEID] = { + .op_func = nfsd4_test_stateid, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_TEST_STATEID", + .op_rsize_bop = nfsd4_test_stateid_rsize, + }, + [OP_FREE_STATEID] = { + .op_func = nfsd4_free_stateid, + .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, + .op_name = "OP_FREE_STATEID", + .op_get_currentstateid = nfsd4_get_freestateid, + .op_rsize_bop = nfsd4_only_status_rsize, + }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = nfsd4_getdeviceinfo, + .op_release = nfsd4_getdeviceinfo_release, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + .op_rsize_bop = nfsd4_getdeviceinfo_rsize, + }, + [OP_LAYOUTGET] = { + .op_func = nfsd4_layoutget, + .op_release = nfsd4_layoutget_release, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = { + .op_func = nfsd4_allocate, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_ALLOCATE", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_DEALLOCATE] = { + .op_func = nfsd4_deallocate, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_DEALLOCATE", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_CLONE] = { + .op_func = nfsd4_clone, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_CLONE", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_COPY] = { + .op_func = nfsd4_copy, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_COPY", + .op_rsize_bop = nfsd4_copy_rsize, + }, + [OP_READ_PLUS] = { + .op_func = nfsd4_read, + .op_release = nfsd4_read_release, + .op_name = "OP_READ_PLUS", + .op_rsize_bop = nfsd4_read_plus_rsize, + .op_get_currentstateid = nfsd4_get_readstateid, + }, + [OP_SEEK] = { + .op_func = nfsd4_seek, + .op_name = "OP_SEEK", + .op_rsize_bop = nfsd4_seek_rsize, + }, + [OP_OFFLOAD_STATUS] = { + .op_func = nfsd4_offload_status, + .op_name = "OP_OFFLOAD_STATUS", + .op_rsize_bop = nfsd4_offload_status_rsize, + }, + [OP_OFFLOAD_CANCEL] = { + .op_func = nfsd4_offload_cancel, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OFFLOAD_CANCEL", + .op_rsize_bop = nfsd4_only_status_rsize, + }, + [OP_COPY_NOTIFY] = { + .op_func = nfsd4_copy_notify, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_COPY_NOTIFY", + .op_rsize_bop = nfsd4_copy_notify_rsize, + }, + [OP_GETXATTR] = { + .op_func = nfsd4_getxattr, + .op_name = "OP_GETXATTR", + .op_rsize_bop = nfsd4_getxattr_rsize, + }, + [OP_SETXATTR] = { + .op_func = nfsd4_setxattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETXATTR", + .op_rsize_bop = nfsd4_setxattr_rsize, + }, + [OP_LISTXATTRS] = { + .op_func = nfsd4_listxattrs, + .op_name = "OP_LISTXATTRS", + .op_rsize_bop = nfsd4_listxattrs_rsize, + }, + [OP_REMOVEXATTR] = { + .op_func = nfsd4_removexattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_REMOVEXATTR", + .op_rsize_bop = nfsd4_removexattr_rsize, + }, +}; + +/** + * nfsd4_spo_must_allow - Determine if the compound op contains an + * operation that is allowed to be sent with machine credentials + * + * @rqstp: a pointer to the struct svc_rqst + * + * Checks to see if the compound contains a spo_must_allow op + * and confirms that it was sent with the proper machine creds. + */ + +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; + u32 opiter; + + if (!cstate->minorversion) + return false; + + if (cstate->spo_must_allowed) + return true; + + opiter = resp->opcnt; + while (opiter < argp->opcnt) { + this = &argp->ops[opiter++]; + if (test_bit(this->opnum, allow->u.longs) && + cstate->clp->cl_mach_cred && + nfsd4_mach_creds_match(cstate->clp, rqstp)) { + cstate->spo_must_allowed = true; + return true; + } + } + cstate->spo_must_allowed = false; + return false; +} + +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp) + return op_encode_hdr_size * sizeof(__be32); + + BUG_ON(OPDESC(op)->op_rsize_bop == NULL); + return OPDESC(op)->op_rsize_bop(rqstp, op); +} + +void warn_on_nonidempotent_op(struct nfsd4_op *op) +{ + if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { + pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + op->opnum, nfsd4_op_name(op->opnum)); + WARN_ON_ONCE(1); + } +} + +static const char *nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + +#define nfsd4_voidres nfsd4_voidargs +struct nfsd4_voidargs { int dummy; }; + +static const struct svc_procedure nfsd_procedures4[2] = { + [NFSPROC4_NULL] = { + .pc_func = nfsd4_proc_null, + .pc_decode = nfs4svc_decode_voidarg, + .pc_encode = nfs4svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd4_voidargs), + .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 1, + }, + [NFSPROC4_COMPOUND] = { + .pc_func = nfsd4_proc_compound, + .pc_decode = nfs4svc_decode_compoundargs, + .pc_encode = nfs4svc_encode_compoundres, + .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_ressize = sizeof(struct nfsd4_compoundres), + .pc_release = nfsd4_release_compoundargs, + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = NFSD_BUFSIZE/4, + }, +}; + +static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures4)]; +const struct svc_version nfsd_version4 = { + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_count = nfsd_count3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = true, + .vs_need_cong_ctrl = true, +}; + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c new file mode 100644 index 000000000..83c4e6883 --- /dev/null +++ b/fs/nfsd/nfs4recover.c @@ -0,0 +1,2169 @@ +/* +* Copyright (c) 2004 The Regents of the University of Michigan. +* Copyright (c) 2012 Jeff Layton <jlayton@redhat.com> +* All rights reserved. +* +* Andy Adamson <andros@citi.umich.edu> +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include <crypto/hash.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <net/net_namespace.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfsd/cld.h> + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" +#include "netns.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Declarations */ +struct nfsd4_client_tracking_ops { + int (*init)(struct net *); + void (*exit)(struct net *); + void (*create)(struct nfs4_client *); + void (*remove)(struct nfs4_client *); + int (*check)(struct nfs4_client *); + void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; +}; + +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; + +/* Globals */ +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + +static int +nfs4_save_creds(const struct cred **original_creds) +{ + struct cred *new; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->fsuid = GLOBAL_ROOT_UID; + new->fsgid = GLOBAL_ROOT_GID; + *original_creds = override_creds(new); + put_cred(new); + return 0; +} + +static void +nfs4_reset_creds(const struct cred *original) +{ + revert_creds(original); +} + +static void +md5_to_hex(char *out, char *md5) +{ + int i; + + for (i=0; i<16; i++) { + unsigned char c = md5[i]; + + *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + } + *out = '\0'; +} + +static int +nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) +{ + struct xdr_netobj cksum; + struct crypto_shash *tfm; + int status; + + dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", + clname->len, clname->data); + tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(tfm)) { + status = PTR_ERR(tfm); + goto out_no_tfm; + } + + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + status = -ENOMEM; + goto out; + } + + status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, + cksum.data); + if (status) + goto out; + + md5_to_hex(dname, cksum.data); + + status = 0; +out: + kfree(cksum.data); + crypto_free_shash(tfm); +out_no_tfm: + return status; +} + +/* + * If we had an error generating the recdir name for the legacy tracker + * then warn the admin. If the error doesn't appear to be transient, + * then disable recovery tracking. + */ +static void +legacy_recdir_name_error(struct nfs4_client *clp, int error) +{ + printk(KERN_ERR "NFSD: unable to generate recoverydir " + "name (%d).\n", error); + + /* + * if the algorithm just doesn't exist, then disable the recovery + * tracker altogether. The crypto libs will generally return this if + * FIPS is enabled as well. + */ + if (error == -ENOENT) { + printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " + "Reboot recovery will not function correctly!\n"); + nfsd4_client_tracking_exit(clp->net); + } +} + +static void +__nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, + const char *dname, int len, struct nfsd_net *nn) +{ + struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; + struct nfs4_client_reclaim *crp; + + name.data = kmemdup(dname, len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return; + } + name.len = len; + crp = nfs4_client_to_reclaim(name, princhash, nn); + if (!crp) { + kfree(name.data); + return; + } + crp->cr_clp = clp; +} + +static void +nfsd4_create_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + char dname[HEXDIR_LEN]; + struct dentry *dir, *dentry; + int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + if (!nn->rec_file) + return; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(clp, status); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return; + + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out_creds; + + dir = nn->rec_file->f_path.dentry; + /* lock the parent */ + inode_lock(d_inode(dir)); + + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + if (d_really_is_positive(dentry)) + /* + * In the 4.1 case, where we're called from + * reclaim_complete(), records from the previous reboot + * may still be left, so this is OK. + * + * In the 4.0 case, we should never get here; but we may + * as well be forgiving and just succeed silently. + */ + goto out_put; + status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); +out_put: + dput(dentry); +out_unlock: + inode_unlock(d_inode(dir)); + if (status == 0) { + if (nn->in_grace) + __nfsd4_create_reclaim_record_grace(clp, dname, + HEXDIR_LEN, nn); + vfs_fsync(nn->rec_file, 0); + } else { + printk(KERN_ERR "NFSD: failed to write recovery record" + " (err %d); please check that %s exists" + " and is writeable", status, + user_recovery_dirname); + } + mnt_drop_write_file(nn->rec_file); +out_creds: + nfs4_reset_creds(original_cred); +} + +typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); + +struct name_list { + char name[HEXDIR_LEN]; + struct list_head list; +}; + +struct nfs4_dir_ctx { + struct dir_context ctx; + struct list_head names; +}; + +static int +nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct nfs4_dir_ctx *ctx = + container_of(__ctx, struct nfs4_dir_ctx, ctx); + struct name_list *entry; + + if (namlen != HEXDIR_LEN - 1) + return 0; + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) + return -ENOMEM; + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, &ctx->names); + return 0; +} + +static int +nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) +{ + const struct cred *original_cred; + struct dentry *dir = nn->rec_file->f_path.dentry; + struct nfs4_dir_ctx ctx = { + .ctx.actor = nfsd4_build_namelist, + .names = LIST_HEAD_INIT(ctx.names) + }; + struct name_list *entry, *tmp; + int status; + + status = nfs4_save_creds(&original_cred); + if (status < 0) + return status; + + status = vfs_llseek(nn->rec_file, 0, SEEK_SET); + if (status < 0) { + nfs4_reset_creds(original_cred); + return status; + } + + status = iterate_dir(nn->rec_file, &ctx.ctx); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { + if (!status) { + struct dentry *dentry; + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry, nn); + dput(dentry); + } + list_del(&entry->list); + kfree(entry); + } + inode_unlock(d_inode(dir)); + nfs4_reset_creds(original_cred); + + list_for_each_entry_safe(entry, tmp, &ctx.names, list) { + dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name); + list_del(&entry->list); + kfree(entry); + } + return status; +} + +static int +nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) +{ + struct dentry *dir, *dentry; + int status; + + dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); + + dir = nn->rec_file->f_path.dentry; + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out_unlock; + } + status = -ENOENT; + if (d_really_is_negative(dentry)) + goto out; + status = vfs_rmdir(d_inode(dir), dentry); +out: + dput(dentry); +out_unlock: + inode_unlock(d_inode(dir)); + return status; +} + +static void +__nfsd4_remove_reclaim_record_grace(const char *dname, int len, + struct nfsd_net *nn) +{ + struct xdr_netobj name; + struct nfs4_client_reclaim *crp; + + name.data = kmemdup(dname, len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return; + } + name.len = len; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + nfs4_remove_reclaim_record(crp, nn); +} + +static void +nfsd4_remove_clid_dir(struct nfs4_client *clp) +{ + const struct cred *original_cred; + char dname[HEXDIR_LEN]; + int status; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return legacy_recdir_name_error(clp, status); + + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + + status = nfs4_save_creds(&original_cred); + if (status < 0) + goto out_drop_write; + + status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); + nfs4_reset_creds(original_cred); + if (status == 0) { + vfs_fsync(nn->rec_file, 0); + if (nn->in_grace) + __nfsd4_remove_reclaim_record_grace(dname, + HEXDIR_LEN, nn); + } +out_drop_write: + mnt_drop_write_file(nn->rec_file); +out: + if (status) + printk("NFSD: Failed to remove expired client state directory" + " %.*s\n", HEXDIR_LEN, dname); +} + +static int +purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +{ + int status; + struct xdr_netobj name; + + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("%s: illegal name %pd in recovery directory\n", + __func__, child); + /* Keep trying; maybe the others are OK: */ + return 0; + } + name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + goto out; + } + name.len = HEXDIR_LEN; + if (nfs4_has_reclaimed_state(name, nn)) + goto out_free; + + status = vfs_rmdir(d_inode(parent), child); + if (status) + printk("failed to remove client recovery directory %pd\n", + child); +out_free: + kfree(name.data); +out: + /* Keep trying, success or failure: */ + return 0; +} + +static void +nfsd4_recdir_purge_old(struct nfsd_net *nn) +{ + int status; + + nn->in_grace = false; + if (!nn->rec_file) + return; + status = mnt_want_write_file(nn->rec_file); + if (status) + goto out; + status = nfsd4_list_rec_dir(purge_old, nn); + if (status == 0) + vfs_fsync(nn->rec_file, 0); + mnt_drop_write_file(nn->rec_file); +out: + nfs4_release_reclaim(nn); + if (status) + printk("nfsd4: failed to purge old clients from recovery" + " directory %pD\n", nn->rec_file); +} + +static int +load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +{ + struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; + + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("%s: illegal name %pd in recovery directory\n", + __func__, child); + /* Keep trying; maybe the others are OK: */ + return 0; + } + name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + goto out; + } + name.len = HEXDIR_LEN; + if (!nfs4_client_to_reclaim(name, princhash, nn)) + kfree(name.data); +out: + return 0; +} + +static int +nfsd4_recdir_load(struct net *net) { + int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return 0; + + status = nfsd4_list_rec_dir(load_recdir, nn); + if (status) + printk("nfsd4: failed loading clients from recovery" + " directory %pD\n", nn->rec_file); + return status; +} + +/* + * Hold reference to the recovery directory. + */ + +static int +nfsd4_init_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + const struct cred *original_cred; + int status; + + printk("NFSD: Using %s as the NFSv4 state recovery directory\n", + user_recovery_dirname); + + BUG_ON(nn->rec_file); + + status = nfs4_save_creds(&original_cred); + if (status < 0) { + printk("NFSD: Unable to change credentials to find recovery" + " directory: error %d\n", + status); + return status; + } + + nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(nn->rec_file)) { + printk("NFSD: unable to find recovery directory %s\n", + user_recovery_dirname); + status = PTR_ERR(nn->rec_file); + nn->rec_file = NULL; + } + + nfs4_reset_creds(original_cred); + if (!status) + nn->in_grace = true; + return status; +} + +static void +nfsd4_shutdown_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return; + fput(nn->rec_file); + nn->rec_file = NULL; +} + +static int +nfs4_legacy_state_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->reclaim_str_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); + nn->reclaim_str_hashtbl_size = 0; + + return 0; +} + +static void +nfs4_legacy_state_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + kfree(nn->reclaim_str_hashtbl); +} + +static int +nfsd4_load_reboot_recovery_data(struct net *net) +{ + int status; + + status = nfsd4_init_recdir(net); + if (status) + return status; + + status = nfsd4_recdir_load(net); + if (status) + nfsd4_shutdown_recdir(net); + + return status; +} + +static int +nfsd4_legacy_tracking_init(struct net *net) +{ + int status; + + /* XXX: The legacy code won't work in a container */ + if (net != &init_net) { + pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n"); + return -EINVAL; + } + + status = nfs4_legacy_state_init(net); + if (status) + return status; + + status = nfsd4_load_reboot_recovery_data(net); + if (status) + goto err; + printk("NFSD: Using legacy client tracking operations.\n"); + return 0; + +err: + nfs4_legacy_state_shutdown(net); + return status; +} + +static void +nfsd4_legacy_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_release_reclaim(nn); + nfsd4_shutdown_recdir(net); + nfs4_legacy_state_shutdown(net); +} + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct path path; + + status = kern_path(recdir, LOOKUP_FOLLOW, &path); + if (status) + return status; + status = -ENOTDIR; + if (d_is_dir(path.dentry)) { + strcpy(user_recovery_dirname, recdir); + status = 0; + } + path_put(&path); + return status; +} + +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} + +static int +nfsd4_check_legacy_client(struct nfs4_client *clp) +{ + int status; + char dname[HEXDIR_LEN]; + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct xdr_netobj name; + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) { + legacy_recdir_name_error(clp, status); + return status; + } + + /* look for it in the reclaim hashtable otherwise */ + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + goto out_enoent; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) { + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + crp->cr_clp = clp; + return 0; + } + +out_enoent: + return -ENOENT; +} + +static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { + .init = nfsd4_legacy_tracking_init, + .exit = nfsd4_legacy_tracking_exit, + .create = nfsd4_create_clid_dir, + .remove = nfsd4_remove_clid_dir, + .check = nfsd4_check_legacy_client, + .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, +}; + +/* Globals */ +#define NFSD_PIPE_DIR "nfsd" +#define NFSD_CLD_PIPE "cld" + +/* per-net-ns structure for holding cld upcall info */ +struct cld_net { + struct rpc_pipe *cn_pipe; + spinlock_t cn_lock; + struct list_head cn_list; + unsigned int cn_xid; + bool cn_has_legacy; + struct crypto_shash *cn_tfm; +}; + +struct cld_upcall { + struct list_head cu_list; + struct cld_net *cu_net; + struct completion cu_done; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; + } cu_u; +}; + +static int +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) +{ + int ret; + struct rpc_pipe_msg msg; + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); + + memset(&msg, 0, sizeof(msg)); + msg.data = cmsg; + msg.len = nn->client_tracking_ops->msglen; + + ret = rpc_queue_upcall(pipe, &msg); + if (ret < 0) { + goto out; + } + + wait_for_completion(&cup->cu_done); + + if (msg.errno < 0) + ret = msg.errno; +out: + return ret; +} + +static int +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) +{ + int ret; + + /* + * -EAGAIN occurs when pipe is closed and reopened while there are + * upcalls queued. + */ + do { + ret = __cld_pipe_upcall(pipe, cmsg, nn); + } while (ret == -EAGAIN); + + return ret; +} + +static ssize_t +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, + struct nfsd_net *nn) +{ + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; + uint16_t namelen; + struct cld_net *cn = nn->cld_net; + + if (get_user(cmd, &cmsg->cm_cmd)) { + dprintk("%s: error when copying cmd from userspace", __func__); + return -EFAULT; + } + if (cmd == Cld_GraceStart) { + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + name.data = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhash.data = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR_OR_NULL(princhash.data)) { + kfree(name.data); + return -EFAULT; + } + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + name.data = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + } + if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { + name.len = name.len - 5; + memmove(name.data, name.data + 5, name.len); + cn->cn_has_legacy = true; + } + if (!nfs4_client_to_reclaim(name, princhash, nn)) { + kfree(name.data); + kfree(princhash.data); + return -EFAULT; + } + return nn->client_tracking_ops->msglen; + } + return -EFAULT; +} + +static ssize_t +cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct cld_upcall *tmp, *cup; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; + uint32_t xid; + struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, + nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int16_t status; + + if (mlen != nn->client_tracking_ops->msglen) { + dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, + nn->client_tracking_ops->msglen); + return -EINVAL; + } + + /* copy just the xid so we can try to find that */ + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { + dprintk("%s: error when copying xid from userspace", __func__); + return -EFAULT; + } + + /* + * copy the status so we know whether to remove the upcall from the + * list (for -EINPROGRESS, we just want to make sure the xid is + * valid, not remove the upcall from the list) + */ + if (get_user(status, &hdr->cm_status)) { + dprintk("%s: error when copying status from userspace", __func__); + return -EFAULT; + } + + /* walk the list and find corresponding xid */ + cup = NULL; + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { + cup = tmp; + if (status != -EINPROGRESS) + list_del_init(&cup->cu_list); + break; + } + } + spin_unlock(&cn->cn_lock); + + /* couldn't find upcall? */ + if (!cup) { + dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid); + return -EINVAL; + } + + if (status == -EINPROGRESS) + return __cld_pipe_inprogress_downcall(cmsg, nn); + + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) + return -EFAULT; + + complete(&cup->cu_done); + return mlen; +} + +static void +cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct cld_msg *cmsg = msg->data; + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, + cu_u.cu_msg); + + /* errno >= 0 means we got a downcall */ + if (msg->errno >= 0) + return; + + complete(&cup->cu_done); +} + +static const struct rpc_pipe_ops cld_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = cld_pipe_downcall, + .destroy_msg = cld_pipe_destroy_msg, +}; + +static struct dentry * +nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); + dput(dir); + return dentry; +} + +static void +nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static struct dentry * +nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + struct dentry *dentry; + + sb = rpc_get_sb_net(net); + if (!sb) + return NULL; + dentry = nfsd4_cld_register_sb(sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void +nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) +{ + struct super_block *sb; + + sb = rpc_get_sb_net(net); + if (sb) { + nfsd4_cld_unregister_sb(pipe); + rpc_put_sb_net(net); + } +} + +/* Initialize rpc_pipefs pipe for communication with client tracking daemon */ +static int +__nfsd4_init_cld_pipe(struct net *net) +{ + int ret; + struct dentry *dentry; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn; + + if (nn->cld_net) + return 0; + + cn = kzalloc(sizeof(*cn), GFP_KERNEL); + if (!cn) { + ret = -ENOMEM; + goto err; + } + + cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(cn->cn_pipe)) { + ret = PTR_ERR(cn->cn_pipe); + goto err; + } + spin_lock_init(&cn->cn_lock); + INIT_LIST_HEAD(&cn->cn_list); + + dentry = nfsd4_cld_register_net(net, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto err_destroy_data; + } + + cn->cn_pipe->dentry = dentry; + cn->cn_has_legacy = false; + nn->cld_net = cn; + return 0; + +err_destroy_data: + rpc_destroy_pipe_data(cn->cn_pipe); +err: + kfree(cn); + printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n", + ret); + return ret; +} + +static int +nfsd4_init_cld_pipe(struct net *net) +{ + int status; + + status = __nfsd4_init_cld_pipe(net); + if (!status) + printk("NFSD: Using old nfsdcld client tracking operations.\n"); + return status; +} + +static void +nfsd4_remove_cld_pipe(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + nfsd4_cld_unregister_net(net, cn->cn_pipe); + rpc_destroy_pipe_data(cn->cn_pipe); + if (cn->cn_tfm) + crypto_free_shash(cn->cn_tfm); + kfree(nn->cld_net); + nn->cld_net = NULL; +} + +static struct cld_upcall * +alloc_cld_upcall(struct nfsd_net *nn) +{ + struct cld_upcall *new, *tmp; + struct cld_net *cn = nn->cld_net; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return new; + + /* FIXME: hard cap on number in flight? */ +restart_search: + spin_lock(&cn->cn_lock); + list_for_each_entry(tmp, &cn->cn_list, cu_list) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { + cn->cn_xid++; + spin_unlock(&cn->cn_lock); + goto restart_search; + } + } + init_completion(&new->cu_done); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); + new->cu_net = cn; + list_add(&new->cu_list, &cn->cn_list); + spin_unlock(&cn->cn_lock); + + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); + + return new; +} + +static void +free_cld_upcall(struct cld_upcall *victim) +{ + struct cld_net *cn = victim->cu_net; + + spin_lock(&cn->cn_lock); + list_del(&victim->cu_list); + spin_unlock(&cn->cn_lock); + kfree(victim); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to create client " + "record on stable storage: %d\n", ret); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + ret = -ENOMEM; + goto out; + } + ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), + cksum.data); + if (ret) { + kfree(cksum.data); + goto out; + } + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, + cksum.data, cksum.len); + kfree(cksum.data); + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + +out: + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + +/* Ask daemon to create a new record */ +static void +nfsd4_cld_remove(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if it's already removed */ + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to remove client " + "record from stable storage: %d\n", ret); +} + +/* + * For older nfsdcld's that do not allow us to "slurp" the clients + * from the tracking database during startup. + * + * Check for presence of a record, and update its timestamp + */ +static int +nfsd4_cld_check_v0(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + + /* Don't upcall if one was already stored during this grace pd */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + cup = alloc_cld_upcall(nn); + if (!cup) { + printk(KERN_ERR "NFSD: Unable to check client record on " + "stable storage: %d\n", -ENOMEM); + return -ENOMEM; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); + return ret; +} + +/* + * For newer nfsdcld's that allow us to "slurp" the clients + * from the tracking database during startup. + * + * Check for presence of a record in the reclaim_str_hashtbl + */ +static int +nfsd4_cld_check(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + crp->cr_clp = clp; + return 0; +} + +static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + if (crp->cr_princhash.len) { + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + return -ENOENT; + status = crypto_shash_tfm_digest(tfm, principal, + strlen(principal), cksum.data); + if (status) { + kfree(cksum.data); + return -ENOENT; + } + if (memcmp(crp->cr_princhash.data, cksum.data, + crp->cr_princhash.len)) { + kfree(cksum.data); + return -ENOENT; + } + kfree(cksum.data); + } + crp->cr_clp = clp; + return 0; +} + +static int +nfsd4_cld_grace_start(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) + ret = cup->cu_u.cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get clients from userspace: %d\n", + __func__, ret); + return ret; +} + +/* For older nfsdcld's that need cm_gracetime */ +static void +nfsd4_cld_grace_done_v0(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) + ret = cup->cu_u.cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); +} + +/* + * For newer nfsdcld's that do not need cm_gracetime. We also need to call + * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl. + */ +static void +nfsd4_cld_grace_done(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) + ret = cup->cu_u.cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + nfs4_release_reclaim(nn); + if (ret) + printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); +} + +static int +nfs4_cld_state_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->reclaim_str_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); + nn->reclaim_str_hashtbl_size = 0; + nn->track_reclaim_completes = true; + atomic_set(&nn->nr_reclaim_complete, 0); + + return 0; +} + +static void +nfs4_cld_state_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->track_reclaim_completes = false; + kfree(nn->reclaim_str_hashtbl); +} + +static bool +cld_running(struct nfsd_net *nn) +{ + struct cld_net *cn = nn->cld_net; + struct rpc_pipe *pipe = cn->cn_pipe; + + return pipe->nreaders || pipe->nwriters; +} + +static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + +static int +nfsd4_cld_tracking_init(struct net *net) +{ + int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool running; + int retries = 10; + struct crypto_shash *tfm; + + status = nfs4_cld_state_init(net); + if (status) + return status; + + status = __nfsd4_init_cld_pipe(net); + if (status) + goto err_shutdown; + + /* + * rpc pipe upcalls take 30 seconds to time out, so we don't want to + * queue an upcall unless we know that nfsdcld is running (because we + * want this to fail fast so that nfsd4_client_tracking_init() can try + * the next client tracking method). nfsdcld should already be running + * before nfsd is started, so the wait here is for nfsdcld to open the + * pipefs file we just created. + */ + while (!(running = cld_running(nn)) && retries--) + msleep(100); + + if (!running) { + status = -ETIMEDOUT; + goto err_remove; + } + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + status = PTR_ERR(tfm); + goto err_remove; + } + nn->cld_net->cn_tfm = tfm; + + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + + status = nfsd4_cld_grace_start(nn); + if (status) { + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); + nfs4_release_reclaim(nn); + goto err_remove; + } else + printk("NFSD: Using nfsdcld client tracking operations.\n"); + return 0; + +err_remove: + nfsd4_remove_cld_pipe(net); +err_shutdown: + nfs4_cld_state_shutdown(net); + return status; +} + +static void +nfsd4_cld_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_release_reclaim(nn); + nfsd4_remove_cld_pipe(net); + nfs4_cld_state_shutdown(net); +} + +/* For older nfsdcld's */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { + .init = nfsd4_init_cld_pipe, + .exit = nfsd4_remove_cld_pipe, + .create = nfsd4_cld_create, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v0, + .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), +}; + +/* For newer nfsdcld's */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check, + .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), +}; + +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), +}; + +/* upcall via usermodehelper */ +static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; +module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), + S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); + +static bool cltrack_legacy_disable; +module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cltrack_legacy_disable, + "Disable legacy recoverydir conversion. Default: false"); + +#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" +#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" +#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" +#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" + +static char * +nfsd4_cltrack_legacy_topdir(void) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", + nfs4_recoverydir()); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) +{ + int copied; + size_t len; + char *result; + + if (cltrack_legacy_disable) + return NULL; + + /* +1 is for '/' between "topdir" and "recdir" */ + len = strlen(LEGACY_RECDIR_ENV_PREFIX) + + strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", + nfs4_recoverydir()); + if (copied > (len - HEXDIR_LEN)) { + /* just return nothing if output will be truncated */ + kfree(result); + return NULL; + } + + copied = nfs4_make_rec_clidname(result + copied, name); + if (copied) { + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_client_has_session(struct nfs4_client *clp) +{ + int copied; + size_t len; + char *result; + + /* prefix + Y/N character + terminating NULL */ + len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", + clp->cl_minorversion ? 'Y' : 'N'); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_grace_start(time64_t grace_start) +{ + int copied; + size_t len; + char *result; + + /* prefix + max width of int64_t string + terminating NULL */ + len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%lld", + grace_start); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static int +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) +{ + char *envp[3]; + char *argv[4]; + int ret; + + if (unlikely(!cltrack_prog[0])) { + dprintk("%s: cltrack_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s: cmd: %s\n", __func__, cmd); + dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); + dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); + dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); + + envp[0] = env0; + envp[1] = env1; + envp[2] = NULL; + + argv[0] = (char *)cltrack_prog; + argv[1] = cmd; + argv[2] = arg; + argv[3] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or EACCES + * error. The admin can re-enable it on the fly by using sysfs + * once the problem has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) { + dprintk("NFSD: %s was not found or isn't executable (%d). " + "Setting cltrack_prog to blank string!", + cltrack_prog, ret); + cltrack_prog[0] = '\0'; + } + dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); + + return ret; +} + +static char * +bin_to_hex_dup(const unsigned char *src, int srclen) +{ + char *buf; + + /* +1 for terminating NULL */ + buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); + if (!buf) + return buf; + + bin2hex(buf, src, srclen); + return buf; +} + +static int +nfsd4_umh_cltrack_init(struct net *net) +{ + int ret; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + /* XXX: The usermode helper s not working in container yet. */ + if (net != &init_net) { + pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); + kfree(grace_start); + return -EINVAL; + } + + ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); + kfree(grace_start); + if (!ret) + printk("NFSD: Using UMH upcall client tracking operations.\n"); + return ret; +} + +static void +nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) +{ + wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, + TASK_UNINTERRUPTIBLE); +} + +static void +nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) +{ + smp_mb__before_atomic(); + clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); +} + +static void +nfsd4_umh_cltrack_create(struct nfs4_client *clp) +{ + char *hexid, *has_session, *grace_start; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* + * With v4.0 clients, there's little difference in outcome between a + * create and check operation, and we can end up calling into this + * function multiple times per client (once for each openowner). So, + * for v4.0 clients skip upcalling once the client has been recorded + * on stable storage. + * + * For v4.1+ clients, the outcome of the two operations is different, + * so we must ensure that we upcall for the create operation. v4.1+ + * clients call this on RECLAIM_COMPLETE though, so we should only end + * up doing a single create upcall per client. + */ + if (clp->cl_minorversion == 0 && + test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + + has_session = nfsd4_cltrack_client_has_session(clp); + grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + nfsd4_cltrack_upcall_lock(clp); + if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(has_session); + kfree(grace_start); + kfree(hexid); +} + +static void +nfsd4_umh_cltrack_remove(struct nfs4_client *clp) +{ + char *hexid; + + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return; + } + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(hexid); +} + +static int +nfsd4_umh_cltrack_check(struct nfs4_client *clp) +{ + int ret; + char *hexid, *has_session, *legacy; + + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); + if (!hexid) { + dprintk("%s: can't allocate memory for upcall!\n", __func__); + return -ENOMEM; + } + + has_session = nfsd4_cltrack_client_has_session(clp); + legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { + ret = 0; + } else { + ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); + if (ret == 0) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + nfsd4_cltrack_upcall_unlock(clp); + kfree(has_session); + kfree(legacy); + kfree(hexid); + + return ret; +} + +static void +nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) +{ + char *legacy; + char timestr[22]; /* FIXME: better way to determine max size? */ + + sprintf(timestr, "%lld", nn->boot_time); + legacy = nfsd4_cltrack_legacy_topdir(); + nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); + kfree(legacy); +} + +static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { + .init = nfsd4_umh_cltrack_init, + .exit = NULL, + .create = nfsd4_umh_cltrack_create, + .remove = nfsd4_umh_cltrack_remove, + .check = nfsd4_umh_cltrack_check, + .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, +}; + +int +nfsd4_client_tracking_init(struct net *net) +{ + int status; + struct path path; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + /* just run the init if it the method is already decided */ + if (nn->client_tracking_ops) + goto do_init; + + /* First, try to use nfsdcld */ + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + if (status != -ETIMEDOUT) { + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + } + + /* + * Next, try the UMH upcall. + */ + nn->client_tracking_ops = &nfsd4_umh_tracking_ops; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + + /* + * Finally, See if the recoverydir exists and is a directory. + * If it is, then use the legacy ops. + */ + nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; + status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); + if (!status) { + status = d_is_dir(path.dentry); + path_put(&path); + if (!status) { + status = -EINVAL; + goto out; + } + } + +do_init: + status = nn->client_tracking_ops->init(net); +out: + if (status) { + printk(KERN_WARNING "NFSD: Unable to initialize client " + "recovery tracking! (%d)\n", status); + nn->client_tracking_ops = NULL; + } + return status; +} + +void +nfsd4_client_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->client_tracking_ops) { + if (nn->client_tracking_ops->exit) + nn->client_tracking_ops->exit(net); + nn->client_tracking_ops = NULL; + } +} + +void +nfsd4_client_record_create(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->create(clp); +} + +void +nfsd4_client_record_remove(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + nn->client_tracking_ops->remove(clp); +} + +int +nfsd4_client_record_check(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (nn->client_tracking_ops) + return nn->client_tracking_ops->check(clp); + + return -EOPNOTSUPP; +} + +void +nfsd4_record_grace_done(struct nfsd_net *nn) +{ + if (nn->client_tracking_ops) + nn->client_tracking_ops->grace_done(nn); +} + +static int +rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (!cn) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + cn->cn_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (cn->cn_pipe->dentry) + nfsd4_cld_unregister_sb(cn->cn_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfsd4_cld_block = { + .notifier_call = rpc_pipefs_event, +}; + +int +register_cld_notifier(void) +{ + WARN_ON(!nfsd_net_id); + return rpc_pipefs_notifier_register(&nfsd4_cld_block); +} + +void +unregister_cld_notifier(void) +{ + rpc_pipefs_notifier_unregister(&nfsd4_cld_block); +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c new file mode 100644 index 000000000..d402ca0b5 --- /dev/null +++ b/fs/nfsd/nfs4state.c @@ -0,0 +1,7588 @@ +/* +* Copyright (c) 2001 The Regents of the University of Michigan. +* All rights reserved. +* +* Kendrick Smith <kmsmith@umich.edu> +* Andy Adamson <kandros@umich.edu> +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/ratelimit.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/addr.h> +#include <linux/jhash.h> +#include <linux/string_helpers.h> +#include "xdr4.h" +#include "xdr4cb.h" +#include "vfs.h" +#include "current_stateid.h" + +#include "netns.h" +#include "pnfs.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +#define all_ones {{~0,~0},~0} +static const stateid_t one_stateid = { + .si_generation = ~0, + .si_opaque = all_ones, +}; +static const stateid_t zero_stateid = { + /* all fields zero */ +}; +static const stateid_t currentstateid = { + .si_generation = 1, +}; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; + +static u64 current_sessionid = 1; + +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) +#define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) + +/* forward declarations */ +static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); +static void nfs4_free_ol_stateid(struct nfs4_stid *stid); +void nfsd4_end_grace(struct nfsd_net *nn); +static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); + +/* Locking: */ + +/* + * Currently used for the del_recall_lru and file hash table. In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(state_lock); + +enum nfsd4_st_mutex_lock_subclass { + OPEN_STATEID_MUTEX = 0, + LOCK_STATEID_MUTEX = 1, +}; + +/* + * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for + * the refcount on the open stateid to drop. + */ +static DECLARE_WAIT_QUEUE_HEAD(close_wq); + +/* + * A waitqueue where a writer to clients/#/ctl destroying a client can + * wait for cl_rpc_users to drop to 0 and then for the client to be + * unhashed. + */ +static DECLARE_WAIT_QUEUE_HEAD(expiry_wq); + +static struct kmem_cache *client_slab; +static struct kmem_cache *openowner_slab; +static struct kmem_cache *lockowner_slab; +static struct kmem_cache *file_slab; +static struct kmem_cache *stateid_slab; +static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; + +static void free_session(struct nfsd4_session *); + +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; + +static bool is_session_dead(struct nfsd4_session *ses) +{ + return ses->se_flags & NFS4_SESSION_DEAD; +} + +static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) +{ + if (atomic_read(&ses->se_ref) > ref_held_by_me) + return nfserr_jukebox; + ses->se_flags |= NFS4_SESSION_DEAD; + return nfs_ok; +} + +static bool is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +static __be32 get_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (is_client_expired(clp)) + return nfserr_expired; + atomic_inc(&clp->cl_rpc_users); + return nfs_ok; +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (is_client_expired(clp)) { + WARN_ON(1); + printk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + list_move_tail(&clp->cl_lru, &nn->client_lru); + clp->cl_time = ktime_get_boottime_seconds(); +} + +static void put_client_renew_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (!atomic_dec_and_test(&clp->cl_rpc_users)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); + else + wake_up_all(&expiry_wq); +} + +static void put_client_renew(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); + else + wake_up_all(&expiry_wq); + spin_unlock(&nn->client_lock); +} + +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) +{ + __be32 status; + + if (is_session_dead(ses)) + return nfserr_badsession; + status = get_client_locked(ses->se_client); + if (status) + return status; + atomic_inc(&ses->se_ref); + return nfs_ok; +} + +static void nfsd4_put_session_locked(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses)) + free_session(ses); + put_client_renew_locked(clp); +} + +static void nfsd4_put_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + nfsd4_put_session_locked(ses); + spin_unlock(&nn->client_lock); +} + +static struct nfsd4_blocked_lock * +find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *cur, *found = NULL; + + spin_lock(&nn->blocked_locks_lock); + list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { + if (fh_match(fh, &cur->nbl_fh)) { + list_del_init(&cur->nbl_list); + list_del_init(&cur->nbl_lru); + found = cur; + break; + } + } + spin_unlock(&nn->blocked_locks_lock); + if (found) + locks_delete_block(&found->nbl_lock); + return found; +} + +static struct nfsd4_blocked_lock * +find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = find_blocked_lock(lo, fh, nn); + if (!nbl) { + nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + if (nbl) { + INIT_LIST_HEAD(&nbl->nbl_list); + INIT_LIST_HEAD(&nbl->nbl_lru); + fh_copy_shallow(&nbl->nbl_fh, fh); + locks_init_lock(&nbl->nbl_lock); + nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, + &nfsd4_cb_notify_lock_ops, + NFSPROC4_CLNT_CB_NOTIFY_LOCK); + } + } + return nbl; +} + +static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_delete_block(&nbl->nbl_lock); + locks_release_private(&nbl->nbl_lock); + kfree(nbl); +} + +static void +remove_blocked_locks(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; + LIST_HEAD(reaplist); + + /* Dequeue all blocked locks */ + spin_lock(&nn->blocked_locks_lock); + while (!list_empty(&lo->lo_blocked)) { + nbl = list_first_entry(&lo->lo_blocked, + struct nfsd4_blocked_lock, + nbl_list); + list_del_init(&nbl->nbl_list); + list_move(&nbl->nbl_lru, &reaplist); + } + spin_unlock(&nn->blocked_locks_lock); + + /* Now free them */ + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, + nbl_lru); + list_del_init(&nbl->nbl_lru); + free_blocked_lock(nbl); + } +} + +static void +nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + locks_delete_block(&nbl->nbl_lock); +} + +static int +nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + /* + * Since this is just an optimization, we don't try very hard if it + * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and + * just quit trying on anything else. + */ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 1 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + + free_blocked_lock(nbl); +} + +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .prepare = nfsd4_cb_notify_lock_prepare, + .done = nfsd4_cb_notify_lock_done, + .release = nfsd4_cb_notify_lock_release, +}; + +static inline struct nfs4_stateowner * +nfs4_get_stateowner(struct nfs4_stateowner *sop) +{ + atomic_inc(&sop->so_count); + return sop; +} + +static int +same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) +{ + return (sop->so_owner.len == owner->len) && + 0 == memcmp(sop->so_owner.data, owner->data, owner->len); +} + +static struct nfs4_openowner * +find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + if (!so->so_is_open_owner) + continue; + if (same_owner_str(so, &open->op_owner)) + return openowner(nfs4_get_stateowner(so)); + } + return NULL; +} + +static struct nfs4_openowner * +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, + struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + spin_lock(&clp->cl_lock); + oo = find_openstateowner_str_locked(hashval, open, clp); + spin_unlock(&clp->cl_lock); + return oo; +} + +static inline u32 +opaque_hashval(const void *ptr, int nbytes) +{ + unsigned char *cptr = (unsigned char *) ptr; + + u32 x = 0; + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x; +} + +static void nfsd4_free_file_rcu(struct rcu_head *rcu) +{ + struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); + + kmem_cache_free(file_slab, fp); +} + +void +put_nfs4_file(struct nfs4_file *fi) +{ + might_lock(&state_lock); + + if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { + hlist_del_rcu(&fi->fi_hash); + spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); + WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); + call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); + } +} + +static struct nfsd_file * +__nfs4_get_fd(struct nfs4_file *f, int oflag) +{ + if (f->fi_fds[oflag]) + return nfsd_file_get(f->fi_fds[oflag]); + return NULL; +} + +static struct nfsd_file * +find_writeable_file_locked(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct nfsd_file * +find_writeable_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = find_writeable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + lockdep_assert_held(&f->fi_lock); + + ret = __nfs4_get_fd(f, O_RDONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDWR); + return ret; +} + +static struct nfsd_file * +find_readable_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = find_readable_file_locked(f); + spin_unlock(&f->fi_lock); + + return ret; +} + +struct nfsd_file * +find_any_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + if (!f) + return NULL; + spin_lock(&f->fi_lock); + ret = __nfs4_get_fd(f, O_RDWR); + if (!ret) { + ret = __nfs4_get_fd(f, O_WRONLY); + if (!ret) + ret = __nfs4_get_fd(f, O_RDONLY); + } + spin_unlock(&f->fi_lock); + return ret; +} + +static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); + + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return NULL; +} + +static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); + + if (f->fi_deleg_file) + return f->fi_deleg_file; + return NULL; +} + +static atomic_long_t num_delegations; +unsigned long max_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for lock and open owners */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) +{ + unsigned int ret; + + ret = opaque_hashval(ownername->data, ownername->len); + return ret & OWNER_HASH_MASK; +} + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) + +static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) +{ + return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); +} + +static unsigned int file_hashval(struct knfsd_fh *fh) +{ + return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); +} + +static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + +static void +__nfs4_file_get_access(struct nfs4_file *fp, u32 access) +{ + lockdep_assert_held(&fp->fi_lock); + + if (access & NFS4_SHARE_ACCESS_WRITE) + atomic_inc(&fp->fi_access[O_WRONLY]); + if (access & NFS4_SHARE_ACCESS_READ) + atomic_inc(&fp->fi_access[O_RDONLY]); +} + +static __be32 +nfs4_file_get_access(struct nfs4_file *fp, u32 access) +{ + lockdep_assert_held(&fp->fi_lock); + + /* Does this access mode make sense? */ + if (access & ~NFS4_SHARE_ACCESS_BOTH) + return nfserr_inval; + + /* Does it conflict with a deny mode already set? */ + if ((access & fp->fi_share_deny) != 0) + return nfserr_share_denied; + + __nfs4_file_get_access(fp, access); + return nfs_ok; +} + +static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny) +{ + /* Common case is that there is no deny mode. */ + if (deny) { + /* Does this deny mode make sense? */ + if (deny & ~NFS4_SHARE_DENY_BOTH) + return nfserr_inval; + + if ((deny & NFS4_SHARE_DENY_READ) && + atomic_read(&fp->fi_access[O_RDONLY])) + return nfserr_share_denied; + + if ((deny & NFS4_SHARE_DENY_WRITE) && + atomic_read(&fp->fi_access[O_WRONLY])) + return nfserr_share_denied; + } + return nfs_ok; +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + might_lock(&fp->fi_lock); + + if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; + + swap(f1, fp->fi_fds[oflag]); + if (atomic_read(&fp->fi_access[1 - oflag]) == 0) + swap(f2, fp->fi_fds[O_RDWR]); + spin_unlock(&fp->fi_lock); + if (f1) + nfsd_file_put(f1); + if (f2) + nfsd_file_put(f2); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) +{ + WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH); + + if (access & NFS4_SHARE_ACCESS_WRITE) + __nfs4_file_put_access(fp, O_WRONLY); + if (access & NFS4_SHARE_ACCESS_READ) + __nfs4_file_put_access(fp, O_RDONLY); +} + +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. + */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + refcount_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + refcount_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)) +{ + struct nfs4_stid *stid; + int new_id; + + stid = kmem_cache_zalloc(slab, GFP_KERNEL); + if (!stid) + return NULL; + + idr_preload(GFP_KERNEL); + spin_lock(&cl->cl_lock); + /* Reserving 0 for start of file in nfsdfs "states" file: */ + new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT); + spin_unlock(&cl->cl_lock); + idr_preload_end(); + if (new_id < 0) + goto out_free; + + stid->sc_free = sc_free; + stid->sc_client = cl; + stid->sc_stateid.si_opaque.so_id = new_id; + stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; + /* Will be incremented before return to client: */ + refcount_set(&stid->sc_count, 1); + spin_lock_init(&stid->sc_lock); + INIT_LIST_HEAD(&stid->sc_cp_list); + + /* + * It shouldn't be a problem to reuse an opaque stateid value. + * I don't think it is for 4.1. But with 4.0 I worry that, for + * example, a stray write retransmission could be accepted by + * the server when it should have been rejected. Therefore, + * adopt a trick from the sctp code to attempt to maximize the + * amount of time until an id is reused, by ensuring they always + * "increase" (mod INT_MAX): + */ + return stid; +out_free: + kmem_cache_free(slab, stid); + return NULL; +} + +/* + * Create a unique stateid_t to represent each COPY. + */ +static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, + unsigned char sc_type) +{ + int new_id; + + stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; + stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + stid->sc_type = sc_type; + + idr_preload(GFP_KERNEL); + spin_lock(&nn->s2s_cp_lock); + new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); + stid->stid.si_opaque.so_id = new_id; + stid->stid.si_generation = 1; + spin_unlock(&nn->s2s_cp_lock); + idr_preload_end(); + if (new_id < 0) + return 0; + return 1; +} + +int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy) +{ + return nfs4_init_cp_state(nn, ©->cp_stateid, NFS4_COPY_STID); +} + +struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, + struct nfs4_stid *p_stid) +{ + struct nfs4_cpntf_state *cps; + + cps = kzalloc(sizeof(struct nfs4_cpntf_state), GFP_KERNEL); + if (!cps) + return NULL; + cps->cpntf_time = ktime_get_boottime_seconds(); + refcount_set(&cps->cp_stateid.sc_count, 1); + if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) + goto out_free; + spin_lock(&nn->s2s_cp_lock); + list_add(&cps->cp_list, &p_stid->sc_cp_list); + spin_unlock(&nn->s2s_cp_lock); + return cps; +out_free: + kfree(cps); + return NULL; +} + +void nfs4_free_copy_state(struct nfsd4_copy *copy) +{ + struct nfsd_net *nn; + + WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID); + nn = net_generic(copy->cp_clp->net, nfsd_net_id); + spin_lock(&nn->s2s_cp_lock); + idr_remove(&nn->s2s_cp_stateids, + copy->cp_stateid.stid.si_opaque.so_id); + spin_unlock(&nn->s2s_cp_lock); +} + +static void nfs4_free_cpntf_statelist(struct net *net, struct nfs4_stid *stid) +{ + struct nfs4_cpntf_state *cps; + struct nfsd_net *nn; + + nn = net_generic(net, nfsd_net_id); + spin_lock(&nn->s2s_cp_lock); + while (!list_empty(&stid->sc_cp_list)) { + cps = list_first_entry(&stid->sc_cp_list, + struct nfs4_cpntf_state, cp_list); + _free_cpntf_state_locked(nn, cps); + } + spin_unlock(&nn->s2s_cp_lock); +} + +static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) +{ + struct nfs4_stid *stid; + + stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); + if (!stid) + return NULL; + + return openlockstateid(stid); +} + +static void nfs4_free_deleg(struct nfs4_stid *stid) +{ + WARN_ON(!list_empty(&stid->sc_cp_list)); + kmem_cache_free(deleg_slab, stid); + atomic_long_dec(&num_delegations); +} + +/* + * When we recall a delegation, we should be careful not to hand it + * out again straight away. + * To ensure this we keep a pair of bloom filters ('new' and 'old') + * in which the filehandles of recalled delegations are "stored". + * If a filehandle appear in either filter, a delegation is blocked. + * When a delegation is recalled, the filehandle is stored in the "new" + * filter. + * Every 30 seconds we swap the filters and clear the "new" one, + * unless both are empty of course. + * + * Each filter is 256 bits. We hash the filehandle to 32bit and use the + * low 3 bytes as hash-table indices. + * + * 'blocked_delegations_lock', which is always taken in block_delegations(), + * is used to manage concurrent access. Testing does not need the lock + * except when swapping the two filters. + */ +static DEFINE_SPINLOCK(blocked_delegations_lock); +static struct bloom_pair { + int entries, old_entries; + time64_t swap_time; + int new; /* index into 'set' */ + DECLARE_BITMAP(set[2], 256); +} blocked_delegations; + +static int delegation_blocked(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + if (bd->entries == 0) + return 0; + if (ktime_get_seconds() - bd->swap_time > 30) { + spin_lock(&blocked_delegations_lock); + if (ktime_get_seconds() - bd->swap_time > 30) { + bd->entries -= bd->old_entries; + bd->old_entries = bd->entries; + memset(bd->set[bd->new], 0, + sizeof(bd->set[0])); + bd->new = 1-bd->new; + bd->swap_time = ktime_get_seconds(); + } + spin_unlock(&blocked_delegations_lock); + } + hash = jhash(&fh->fh_base, fh->fh_size, 0); + if (test_bit(hash&255, bd->set[0]) && + test_bit((hash>>8)&255, bd->set[0]) && + test_bit((hash>>16)&255, bd->set[0])) + return 1; + + if (test_bit(hash&255, bd->set[1]) && + test_bit((hash>>8)&255, bd->set[1]) && + test_bit((hash>>16)&255, bd->set[1])) + return 1; + + return 0; +} + +static void block_delegations(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + hash = jhash(&fh->fh_base, fh->fh_size, 0); + + spin_lock(&blocked_delegations_lock); + __set_bit(hash&255, bd->set[bd->new]); + __set_bit((hash>>8)&255, bd->set[bd->new]); + __set_bit((hash>>16)&255, bd->set[bd->new]); + if (bd->entries == 0) + bd->swap_time = ktime_get_seconds(); + bd->entries += 1; + spin_unlock(&blocked_delegations_lock); +} + +static struct nfs4_delegation * +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, + struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) +{ + struct nfs4_delegation *dp; + long n; + + dprintk("NFSD alloc_init_deleg\n"); + n = atomic_long_inc_return(&num_delegations); + if (n < 0 || n > max_delegations) + goto out_dec; + if (delegation_blocked(¤t_fh->fh_handle)) + goto out_dec; + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); + if (dp == NULL) + goto out_dec; + + /* + * delegation seqid's are never incremented. The 4.1 special + * meaning of seqid 0 isn't meaningful, really, but let's avoid + * 0 anyway just for consistency and use 1: + */ + dp->dl_stid.sc_stateid.si_generation = 1; + INIT_LIST_HEAD(&dp->dl_perfile); + INIT_LIST_HEAD(&dp->dl_perclnt); + INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); + dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_retries = 1; + nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, + &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + get_nfs4_file(fp); + dp->dl_stid.sc_file = fp; + return dp; +out_dec: + atomic_long_dec(&num_delegations); + return NULL; +} + +void +nfs4_put_stid(struct nfs4_stid *s) +{ + struct nfs4_file *fp = s->sc_file; + struct nfs4_client *clp = s->sc_client; + + might_lock(&clp->cl_lock); + + if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + wake_up_all(&close_wq); + return; + } + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + nfs4_free_cpntf_statelist(clp->net, s); + spin_unlock(&clp->cl_lock); + s->sc_free(s); + if (fp) + put_nfs4_file(fp); +} + +void +nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) +{ + stateid_t *src = &stid->sc_stateid; + + spin_lock(&stid->sc_lock); + if (unlikely(++src->si_generation == 0)) + src->si_generation = 1; + memcpy(dst, src, sizeof(*dst)); + spin_unlock(&stid->sc_lock); +} + +static void put_deleg_file(struct nfs4_file *fp) +{ + struct nfsd_file *nf = NULL; + + spin_lock(&fp->fi_lock); + if (--fp->fi_delegees == 0) + swap(nf, fp->fi_deleg_file); + spin_unlock(&fp->fi_lock); + + if (nf) + nfsd_file_put(nf); +} + +static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfsd_file *nf = fp->fi_deleg_file; + + WARN_ON_ONCE(!fp->fi_delegees); + + vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); + put_deleg_file(fp); +} + +static void destroy_unhashed_deleg(struct nfs4_delegation *dp) +{ + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_unlock_deleg_lease(dp); + nfs4_put_stid(&dp->dl_stid); +} + +void nfs4_unhash_stid(struct nfs4_stid *s) +{ + s->sc_type = 0; +} + +/** + * nfs4_delegation_exists - Discover if this delegation already exists + * @clp: a pointer to the nfs4_client we're granting a delegation to + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: true iff an existing delegation is found + */ + +static bool +nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_delegation *searchdp = NULL; + struct nfs4_client *searchclp = NULL; + + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { + searchclp = searchdp->dl_stid.sc_client; + if (clp == searchclp) { + return true; + } + } + return false; +} + +/** + * hash_delegation_locked - Add a delegation to the appropriate lists + * @dp: a pointer to the nfs4_delegation we are adding. + * @fp: a pointer to the nfs4_file we're granting a delegation on + * + * Return: + * On success: NULL if the delegation was successfully hashed. + * + * On error: -EAGAIN if one was previously granted to this + * nfs4_client for this nfs4_file. Delegation is not hashed. + * + */ + +static int +hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) +{ + struct nfs4_client *clp = dp->dl_stid.sc_client; + + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + + if (nfs4_delegation_exists(clp, fp)) + return -EAGAIN; + refcount_inc(&dp->dl_stid.sc_count); + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); + return 0; +} + +static bool delegation_hashed(struct nfs4_delegation *dp) +{ + return !(list_empty(&dp->dl_perfile)); +} + +static bool +unhash_delegation_locked(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + + lockdep_assert_held(&state_lock); + + if (!delegation_hashed(dp)) + return false; + + dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + /* Ensure that deleg break won't try to requeue it */ + ++dp->dl_time; + spin_lock(&fp->fi_lock); + list_del_init(&dp->dl_perclnt); + list_del_init(&dp->dl_recall_lru); + list_del_init(&dp->dl_perfile); + spin_unlock(&fp->fi_lock); + return true; +} + +static void destroy_delegation(struct nfs4_delegation *dp) +{ + bool unhashed; + + spin_lock(&state_lock); + unhashed = unhash_delegation_locked(dp); + spin_unlock(&state_lock); + if (unhashed) + destroy_unhashed_deleg(dp); +} + +static void revoke_delegation(struct nfs4_delegation *dp) +{ + struct nfs4_client *clp = dp->dl_stid.sc_client; + + WARN_ON(!list_empty(&dp->dl_recall_lru)); + + if (clp->cl_minorversion) { + spin_lock(&clp->cl_lock); + dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + refcount_inc(&dp->dl_stid.sc_count); + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + spin_unlock(&clp->cl_lock); + } + destroy_unhashed_deleg(dp); +} + +/* + * SETCLIENTID state + */ + +static unsigned int clientid_hashval(u32 id) +{ + return id & CLIENT_HASH_MASK; +} + +static unsigned int clientstr_hashval(struct xdr_netobj name) +{ + return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK; +} + +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static unsigned int +bmap_to_share_mode(unsigned long bmap) { + int i; + unsigned int access = 0; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + access |= i; + } + return access; +} + +/* set share access for a given stateid */ +static inline void +set_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap |= mask; +} + +/* clear share access for a given stateid */ +static inline void +clear_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap &= ~mask; +} + +/* test whether a given stateid has access */ +static inline bool +test_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + return (bool)(stp->st_access_bmap & mask); +} + +/* set share deny for a given stateid */ +static inline void +set_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap |= mask; +} + +/* clear share deny for a given stateid */ +static inline void +clear_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap &= ~mask; +} + +/* test whether a given stateid is denying specific access */ +static inline bool +test_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + return (bool)(stp->st_deny_bmap & mask); +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + WARN_ON_ONCE(1); + return O_RDONLY; +} + +/* + * A stateid that had a deny mode associated with it is being released + * or downgraded. Recalculate the deny mode on the file. + */ +static void +recalculate_deny_mode(struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *stp; + + spin_lock(&fp->fi_lock); + fp->fi_share_deny = 0; + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) + fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); + spin_unlock(&fp->fi_lock); +} + +static void +reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + int i; + bool change = false; + + for (i = 1; i < 4; i++) { + if ((i & deny) != i) { + change = true; + clear_deny(i, stp); + } + } + + /* Recalculate per-file deny mode if there was a change */ + if (change) + recalculate_deny_mode(stp->st_stid.sc_file); +} + +/* release all access and file references for a given stateid */ +static void +release_all_access(struct nfs4_ol_stateid *stp) +{ + int i; + struct nfs4_file *fp = stp->st_stid.sc_file; + + if (fp && stp->st_deny_bmap != 0) + recalculate_deny_mode(fp); + + for (i = 1; i < 4; i++) { + if (test_access(i, stp)) + nfs4_file_put_access(stp->st_stid.sc_file, i); + clear_access(i, stp); + } +} + +static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop) +{ + kfree(sop->so_owner.data); + sop->so_ops->so_free(sop); +} + +static void nfs4_put_stateowner(struct nfs4_stateowner *sop) +{ + struct nfs4_client *clp = sop->so_client; + + might_lock(&clp->cl_lock); + + if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock)) + return; + sop->so_ops->so_unhash(sop); + spin_unlock(&clp->cl_lock); + nfs4_free_stateowner(sop); +} + +static bool +nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) +{ + return list_empty(&stp->st_perfile); +} + +static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_file *fp = stp->st_stid.sc_file; + + lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock); + + if (list_empty(&stp->st_perfile)) + return false; + + spin_lock(&fp->fi_lock); + list_del_init(&stp->st_perfile); + spin_unlock(&fp->fi_lock); + list_del(&stp->st_perstateowner); + return true; +} + +static void nfs4_free_ol_stateid(struct nfs4_stid *stid) +{ + struct nfs4_ol_stateid *stp = openlockstateid(stid); + + put_clnt_odstate(stp->st_clnt_odstate); + release_all_access(stp); + if (stp->st_stateowner) + nfs4_put_stateowner(stp->st_stateowner); + WARN_ON(!list_empty(&stid->sc_cp_list)); + kmem_cache_free(stateid_slab, stid); +} + +static void nfs4_free_lock_stateid(struct nfs4_stid *stid) +{ + struct nfs4_ol_stateid *stp = openlockstateid(stid); + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } + nfs4_free_ol_stateid(stid); +} + +/* + * Put the persistent reference to an already unhashed generic stateid, while + * holding the cl_lock. If it's the last reference, then put it onto the + * reaplist for later destruction. + */ +static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) +{ + struct nfs4_stid *s = &stp->st_stid; + struct nfs4_client *clp = s->sc_client; + + lockdep_assert_held(&clp->cl_lock); + + WARN_ON_ONCE(!list_empty(&stp->st_locks)); + + if (!refcount_dec_and_test(&s->sc_count)) { + wake_up_all(&close_wq); + return; + } + + idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + list_add(&stp->st_locks, reaplist); +} + +static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) +{ + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + + if (!unhash_ol_stateid(stp)) + return false; + list_del_init(&stp->st_locks); + nfs4_unhash_stid(&stp->st_stid); + return true; +} + +static void release_lock_stateid(struct nfs4_ol_stateid *stp) +{ + struct nfs4_client *clp = stp->st_stid.sc_client; + bool unhashed; + + spin_lock(&clp->cl_lock); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&clp->cl_lock); + if (unhashed) + nfs4_put_stid(&stp->st_stid); +} + +static void unhash_lockowner_locked(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&lo->lo_owner.so_strhash); +} + +/* + * Free a list of generic stateids that were collected earlier after being + * fully unhashed. + */ +static void +free_ol_stateid_reaplist(struct list_head *reaplist) +{ + struct nfs4_ol_stateid *stp; + struct nfs4_file *fp; + + might_sleep(); + + while (!list_empty(reaplist)) { + stp = list_first_entry(reaplist, struct nfs4_ol_stateid, + st_locks); + list_del(&stp->st_locks); + fp = stp->st_stid.sc_file; + stp->st_stid.sc_free(&stp->st_stid); + if (fp) + put_nfs4_file(fp); + } +} + +static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, + struct list_head *reaplist) +{ + struct nfs4_ol_stateid *stp; + + lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock); + + while (!list_empty(&open_stp->st_locks)) { + stp = list_entry(open_stp->st_locks.next, + struct nfs4_ol_stateid, st_locks); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, reaplist); + } +} + +static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, + struct list_head *reaplist) +{ + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + + if (!unhash_ol_stateid(stp)) + return false; + release_open_stateid_locks(stp, reaplist); + return true; +} + +static void release_open_stateid(struct nfs4_ol_stateid *stp) +{ + LIST_HEAD(reaplist); + + spin_lock(&stp->st_stid.sc_client->cl_lock); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&stp->st_stid.sc_client->cl_lock); + free_ol_stateid_reaplist(&reaplist); +} + +static void unhash_openowner_locked(struct nfs4_openowner *oo) +{ + struct nfs4_client *clp = oo->oo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_del_init(&oo->oo_owner.so_strhash); + list_del_init(&oo->oo_perclient); +} + +static void release_last_closed_stateid(struct nfs4_openowner *oo) +{ + struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net, + nfsd_net_id); + struct nfs4_ol_stateid *s; + + spin_lock(&nn->client_lock); + s = oo->oo_last_closed_stid; + if (s) { + list_del_init(&oo->oo_close_lru); + oo->oo_last_closed_stid = NULL; + } + spin_unlock(&nn->client_lock); + if (s) + nfs4_put_stid(&s->st_stid); +} + +static void release_openowner(struct nfs4_openowner *oo) +{ + struct nfs4_ol_stateid *stp; + struct nfs4_client *clp = oo->oo_owner.so_client; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + + spin_lock(&clp->cl_lock); + unhash_openowner_locked(oo); + while (!list_empty(&oo->oo_owner.so_stateids)) { + stp = list_first_entry(&oo->oo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + release_last_closed_stateid(oo); + nfs4_put_stateowner(&oo->oo_owner); +} + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +#ifdef CONFIG_SUNRPC_DEBUG +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} +#else +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ +} +#endif + +/* + * Bump the seqid on cstate->replay_owner, and clear replay_owner if it + * won't be used for replay. + */ +void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (nfserr == nfserr_replay_me) + return; + + if (!seqid_mutating_err(ntohl(nfserr))) { + nfsd4_cstate_clear_replay(cstate); + return; + } + if (!so) + return; + if (so->so_is_open_owner) + release_last_closed_stateid(openowner(so)); + so->so_seqid++; + return; +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * The protocol defines ca_maxresponssize_cached to include the size of + * the rpc header, but all we need to cache is the data starting after + * the end of the initial SEQUENCE operation--the rest we regenerate + * each time. Therefore we can advertise a ca_maxresponssize_cached + * value that is the number of bytes in our cache plus a few additional + * bytes. In order to stay on the safe side, and not promise more than + * we can cache, those additional bytes must be the minimum possible: 24 + * bytes of rpc header (xid through accept state, with AUTH_NULL + * verifier), 12 for the compound header (with zero-length tag), and 44 + * for the SEQUENCE op response: + */ +#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) + +static void +free_session_slots(struct nfsd4_session *ses) +{ + int i; + + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + free_svc_cred(&ses->se_slots[i]->sl_cred); + kfree(ses->se_slots[i]); + } +} + +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +{ + u32 size; + + if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) + size = 0; + else + size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + return size + sizeof(struct nfsd4_slot); +} + +/* + * XXX: If we run out of reserved DRC memory we could (up to a point) + * re-negotiate active sessions and reduce their slot usage to make + * room for new connections. For now we just fail the create session. + */ +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) +{ + u32 slotsize = slot_bytes(ca); + u32 num = ca->maxreqs; + unsigned long avail, total_avail; + unsigned int scale_factor; + + spin_lock(&nfsd_drc_lock); + if (nfsd_drc_max_mem > nfsd_drc_mem_used) + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + else + /* We have handed out more space than we chose in + * set_max_drc() to allow. That isn't really a + * problem as long as that doesn't make us think we + * have lots more due to integer overflow. + */ + total_avail = 0; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); + /* + * Never use more than a fraction of the remaining memory, + * unless it's the only way to give this client a slot. + * The chosen fraction is either 1/8 or 1/number of threads, + * whichever is smaller. This ensures there are adequate + * slots to support multiple clients per thread. + * Give the client one slot even if that would require + * over-allocation--it is better than failure. + */ + scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); + + avail = clamp_t(unsigned long, avail, slotsize, + total_avail/scale_factor); + num = min_t(int, num, avail / slotsize); + num = max_t(int, num, 1); + nfsd_drc_mem_used += num * slotsize; + spin_unlock(&nfsd_drc_lock); + + return num; +} + +static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) +{ + int slotsize = slot_bytes(ca); + + spin_lock(&nfsd_drc_lock); + nfsd_drc_mem_used -= slotsize * ca->maxreqs; + spin_unlock(&nfsd_drc_lock); +} + +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, + struct nfsd4_channel_attrs *battrs) +{ + int numslots = fattrs->maxreqs; + int slotsize = slot_bytes(fattrs); + struct nfsd4_session *new; + int mem, i; + + BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) + + sizeof(struct nfsd4_session) > PAGE_SIZE); + mem = numslots * sizeof(struct nfsd4_slot *); + + new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + if (!new) + return NULL; + /* allocate each struct nfsd4_slot and data cache in one piece */ + for (i = 0; i < numslots; i++) { + new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); + if (!new->se_slots[i]) + goto out_free; + } + + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + + return new; +out_free: + while (i--) + kfree(new->se_slots[i]); + kfree(new); + return NULL; +} + +static void free_conn(struct nfsd4_conn *c) +{ + svc_xprt_put(c->cn_xprt); + kfree(c); +} + +static void nfsd4_conn_lost(struct svc_xpt_user *u) +{ + struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); + struct nfs4_client *clp = c->cn_session->se_client; + + spin_lock(&clp->cl_lock); + if (!list_empty(&c->cn_persession)) { + list_del(&c->cn_persession); + free_conn(c); + } + nfsd4_probe_callback(clp); + spin_unlock(&clp->cl_lock); +} + +static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) +{ + struct nfsd4_conn *conn; + + conn = kmalloc(sizeof(struct nfsd4_conn), GFP_KERNEL); + if (!conn) + return NULL; + svc_xprt_get(rqstp->rq_xprt); + conn->cn_xprt = rqstp->rq_xprt; + conn->cn_flags = flags; + INIT_LIST_HEAD(&conn->cn_xpt_user.list); + return conn; +} + +static void __nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + conn->cn_session = ses; + list_add(&conn->cn_persession, &ses->se_conns); +} + +static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + + spin_lock(&clp->cl_lock); + __nfsd4_hash_conn(conn, ses); + spin_unlock(&clp->cl_lock); +} + +static int nfsd4_register_conn(struct nfsd4_conn *conn) +{ + conn->cn_xpt_user.callback = nfsd4_conn_lost; + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); +} + +static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, struct nfsd4_session *ses) +{ + int ret; + + nfsd4_hash_conn(conn, ses); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); + /* We may have gained or lost a callback channel: */ + nfsd4_probe_callback_sync(ses->se_client); +} + +static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) +{ + u32 dir = NFS4_CDFC4_FORE; + + if (cses->flags & SESSION4_BACK_CHAN) + dir |= NFS4_CDFC4_BACK; + return alloc_conn(rqstp, dir); +} + +/* must be called under client_lock */ +static void nfsd4_del_conns(struct nfsd4_session *s) +{ + struct nfs4_client *clp = s->se_client; + struct nfsd4_conn *c; + + spin_lock(&clp->cl_lock); + while (!list_empty(&s->se_conns)) { + c = list_first_entry(&s->se_conns, struct nfsd4_conn, cn_persession); + list_del_init(&c->cn_persession); + spin_unlock(&clp->cl_lock); + + unregister_xpt_user(c->cn_xprt, &c->cn_xpt_user); + free_conn(c); + + spin_lock(&clp->cl_lock); + } + spin_unlock(&clp->cl_lock); +} + +static void __free_session(struct nfsd4_session *ses) +{ + free_session_slots(ses); + kfree(ses); +} + +static void free_session(struct nfsd4_session *ses) +{ + nfsd4_del_conns(ses); + nfsd4_put_drc_mem(&ses->se_fchannel); + __free_session(ses); +} + +static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) +{ + int idx; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + new->se_client = clp; + gen_sessionid(new); + + INIT_LIST_HEAD(&new->se_conns); + + new->se_cb_seq_nr = 1; + new->se_flags = cses->flags; + new->se_cb_prog = cses->callback_prog; + new->se_cb_sec = cses->cb_sec; + atomic_set(&new->se_ref, 0); + idx = hash_sessionid(&new->se_sessionid); + list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); + spin_lock(&clp->cl_lock); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&clp->cl_lock); + + { + struct sockaddr *sa = svc_addr(rqstp); + /* + * This is a little silly; with sessions there's no real + * use for the callback address. Use the peer address + * as a reasonable default for now, but consider fixing + * the rpc client not to require an address in the + * future: + */ + rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); + clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); + } +} + +/* caller must hold client_lock */ +static struct nfsd4_session * +__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) +{ + struct nfsd4_session *elem; + int idx; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + /* Search in the appropriate list */ + list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) { + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net, + __be32 *ret) +{ + struct nfsd4_session *session; + __be32 status = nfserr_badsession; + + session = __find_in_sessionid_hashtbl(sessionid, net); + if (!session) + goto out; + status = nfsd4_get_session_locked(session); + if (status) + session = NULL; +out: + *ret = status; + return session; +} + +/* caller must hold client_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + list_del(&ses->se_hash); + spin_lock(&ses->se_client->cl_lock); + list_del(&ses->se_perclnt); + spin_unlock(&ses->se_client->cl_lock); +} + +/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ +static int +STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) +{ + /* + * We're assuming the clid was not given out from a boot + * precisely 2^32 (about 136 years) before this one. That seems + * a safe assumption: + */ + if (clid->cl_boot == (u32)nn->boot_time) + return 0; + trace_nfsd_clid_stale(clid); + return 1; +} + +/* + * XXX Should we use a slab cache ? + * This type of memory management is somewhat inefficient, but we use it + * anyway since SETCLIENTID is not a common operation. + */ +static struct nfs4_client *alloc_client(struct xdr_netobj name) +{ + struct nfs4_client *clp; + int i; + + clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); + if (clp == NULL) + return NULL; + xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL); + if (clp->cl_name.data == NULL) + goto err_no_name; + clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!clp->cl_ownerstr_hashtbl) + goto err_no_hashtbl; + for (i = 0; i < OWNER_HASH_SIZE; i++) + INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_rpc_users, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif + INIT_LIST_HEAD(&clp->async_copies); + spin_lock_init(&clp->async_lock); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); + return clp; +err_no_hashtbl: + kfree(clp->cl_name.data); +err_no_name: + kmem_cache_free(client_slab, clp); + return NULL; +} + +static void __free_client(struct kref *k) +{ + struct nfsdfs_client *c = container_of(k, struct nfsdfs_client, cl_ref); + struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs); + + free_svc_cred(&clp->cl_cred); + kfree(clp->cl_ownerstr_hashtbl); + kfree(clp->cl_name.data); + kfree(clp->cl_nii_domain.data); + kfree(clp->cl_nii_name.data); + idr_destroy(&clp->cl_stateids); + kmem_cache_free(client_slab, clp); +} + +static void drop_client(struct nfs4_client *clp) +{ + kref_put(&clp->cl_nfsdfs.cl_ref, __free_client); +} + +static void +free_client(struct nfs4_client *clp) +{ + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + list_del(&ses->se_perclnt); + WARN_ON_ONCE(atomic_read(&ses->se_ref)); + free_session(ses); + } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); + if (clp->cl_nfsd_dentry) { + nfsd_client_rmdir(clp->cl_nfsd_dentry); + clp->cl_nfsd_dentry = NULL; + wake_up_all(&expiry_wq); + } + drop_client(clp); +} + +/* must be called under the client_lock */ +static void +unhash_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_session *ses; + + lockdep_assert_held(&nn->client_lock); + + /* Mark the client as expired! */ + clp->cl_time = 0; + /* Make it invisible */ + if (!list_empty(&clp->cl_idhash)) { + list_del_init(&clp->cl_idhash); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + rb_erase(&clp->cl_namenode, &nn->conf_name_tree); + else + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); + } + list_del_init(&clp->cl_lru); + spin_lock(&clp->cl_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + list_del_init(&ses->se_hash); + spin_unlock(&clp->cl_lock); +} + +static void +unhash_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + unhash_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static __be32 mark_client_expired_locked(struct nfs4_client *clp) +{ + if (atomic_read(&clp->cl_rpc_users)) + return nfserr_jukebox; + unhash_client_locked(clp); + return nfs_ok; +} + +static void +__destroy_client(struct nfs4_client *clp) +{ + int i; + struct nfs4_openowner *oo; + struct nfs4_delegation *dp; + struct list_head reaplist; + + INIT_LIST_HEAD(&reaplist); + spin_lock(&state_lock); + while (!list_empty(&clp->cl_delegations)) { + dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); + WARN_ON(!unhash_delegation_locked(dp)); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + while (!list_empty(&reaplist)) { + dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + destroy_unhashed_deleg(dp); + } + while (!list_empty(&clp->cl_revoked)) { + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + nfs4_put_stid(&dp->dl_stid); + } + while (!list_empty(&clp->cl_openowners)) { + oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); + nfs4_get_stateowner(&oo->oo_owner); + release_openowner(oo); + } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + struct nfs4_stateowner *so, *tmp; + + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + /* Should be no openowners at this point */ + WARN_ON_ONCE(so->so_is_open_owner); + remove_blocked_locks(lockowner(so)); + } + } + nfsd4_return_all_client_layouts(clp); + nfsd4_shutdown_copy(clp); + nfsd4_shutdown_callback(clp); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + free_client(clp); + wake_up_all(&expiry_wq); +} + +static void +destroy_client(struct nfs4_client *clp) +{ + unhash_client(clp); + __destroy_client(clp); +} + +static void inc_reclaim_complete(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!nn->track_reclaim_completes) + return; + if (!nfsd4_find_reclaim_client(clp->cl_name, nn)) + return; + if (atomic_inc_return(&nn->nr_reclaim_complete) == + nn->reclaim_str_hashtbl_size) { + printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n", + clp->net->ns.inum); + nfsd4_end_grace(nn); + } +} + +static void expire_client(struct nfs4_client *clp) +{ + unhash_client(clp); + nfsd4_client_record_remove(clp); + __destroy_client(clp); +} + +static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +{ + memcpy(target->cl_verifier.data, source->data, + sizeof(target->cl_verifier.data)); +} + +static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) +{ + target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; + target->cl_clientid.cl_id = source->cl_clientid.cl_id; +} + +static int copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); + target->cr_raw_principal = kstrdup(source->cr_raw_principal, + GFP_KERNEL); + target->cr_targ_princ = kstrdup(source->cr_targ_princ, GFP_KERNEL); + if ((source->cr_principal && !target->cr_principal) || + (source->cr_raw_principal && !target->cr_raw_principal) || + (source->cr_targ_princ && !target->cr_targ_princ)) + return -ENOMEM; + + target->cr_flavor = source->cr_flavor; + target->cr_uid = source->cr_uid; + target->cr_gid = source->cr_gid; + target->cr_group_info = source->cr_group_info; + get_group_info(target->cr_group_info); + target->cr_gss_mech = source->cr_gss_mech; + if (source->cr_gss_mech) + gss_mech_get(source->cr_gss_mech); + return 0; +} + +static int +compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) +{ + if (o1->len < o2->len) + return -1; + if (o1->len > o2->len) + return 1; + return memcmp(o1->data, o2->data, o1->len); +} + +static int +same_verf(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return 0 == memcmp(v1->data, v2->data, sizeof(v1->data)); +} + +static int +same_clid(clientid_t *cl1, clientid_t *cl2) +{ + return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); +} + +static bool groups_equal(struct group_info *g1, struct group_info *g2) +{ + int i; + + if (g1->ngroups != g2->ngroups) + return false; + for (i=0; i<g1->ngroups; i++) + if (!gid_eq(g1->gid[i], g2->gid[i])) + return false; + return true; +} + +/* + * RFC 3530 language requires clid_inuse be returned when the + * "principal" associated with a requests differs from that previously + * used. We use uid, gid's, and gss principal string as our best + * approximation. We also don't want to allow non-gss use of a client + * established using gss: in theory cr_principal should catch that + * change, but in practice cr_principal can be null even in the gss case + * since gssd doesn't always pass down a principal string. + */ +static bool is_gss_cred(struct svc_cred *cr) +{ + /* Is cr_flavor one of the gss "pseudoflavors"?: */ + return (cr->cr_flavor > RPC_AUTH_MAXFLAVOR); +} + + +static bool +same_creds(struct svc_cred *cr1, struct svc_cred *cr2) +{ + if ((is_gss_cred(cr1) != is_gss_cred(cr2)) + || (!uid_eq(cr1->cr_uid, cr2->cr_uid)) + || (!gid_eq(cr1->cr_gid, cr2->cr_gid)) + || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) + return false; + /* XXX: check that cr_targ_princ fields match ? */ + if (cr1->cr_principal == cr2->cr_principal) + return true; + if (!cr1->cr_principal || !cr2->cr_principal) + return false; + return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); +} + +static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + u32 service; + + if (!cr->cr_gss_mech) + return false; + service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); + return service == RPC_GSS_SVC_INTEGRITY || + service == RPC_GSS_SVC_PRIVACY; +} + +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) +{ + struct svc_cred *cr = &rqstp->rq_cred; + + if (!cl->cl_mach_cred) + return true; + if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech) + return false; + if (!svc_rqst_integrity_protected(rqstp)) + return false; + if (cl->cl_cred.cr_raw_principal) + return 0 == strcmp(cl->cl_cred.cr_raw_principal, + cr->cr_raw_principal); + if (!cr->cr_principal) + return false; + return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); +} + +static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) +{ + __be32 verf[2]; + + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy + */ + verf[0] = (__force __be32)(u32)ktime_get_real_seconds(); + verf[1] = (__force __be32)nn->clverifier_counter++; + memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); +} + +static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) +{ + clp->cl_clientid.cl_boot = (u32)nn->boot_time; + clp->cl_clientid.cl_id = nn->clientid_counter++; + gen_confirm(clp, nn); +} + +static struct nfs4_stid * +find_stateid_locked(struct nfs4_client *cl, stateid_t *t) +{ + struct nfs4_stid *ret; + + ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id); + if (!ret || !ret->sc_type) + return NULL; + return ret; +} + +static struct nfs4_stid * +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +{ + struct nfs4_stid *s; + + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, t); + if (s != NULL) { + if (typemask & s->sc_type) + refcount_inc(&s->sc_count); + else + s = NULL; + } + spin_unlock(&cl->cl_lock); + return s; +} + +static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) +{ + struct nfsdfs_client *nc; + nc = get_nfsdfs_client(inode); + if (!nc) + return NULL; + return container_of(nc, struct nfs4_client, cl_nfsdfs); +} + +static void seq_quote_mem(struct seq_file *m, char *data, int len) +{ + seq_printf(m, "\""); + seq_escape_mem_ascii(m, data, len); + seq_printf(m, "\""); +} + +static int client_info_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct nfs4_client *clp; + u64 clid; + + clp = get_nfsdfs_clp(inode); + if (!clp) + return -ENXIO; + memcpy(&clid, &clp->cl_clientid, sizeof(clid)); + seq_printf(m, "clientid: 0x%llx\n", clid); + seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); + seq_printf(m, "name: "); + seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); + seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); + if (clp->cl_nii_domain.data) { + seq_printf(m, "Implementation domain: "); + seq_quote_mem(m, clp->cl_nii_domain.data, + clp->cl_nii_domain.len); + seq_printf(m, "\nImplementation name: "); + seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); + seq_printf(m, "\nImplementation time: [%lld, %ld]\n", + clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); + } + drop_client(clp); + + return 0; +} + +static int client_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, client_info_show, inode); +} + +static const struct file_operations client_info_fops = { + .open = client_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void *states_start(struct seq_file *s, loff_t *pos) + __acquires(&clp->cl_lock) +{ + struct nfs4_client *clp = s->private; + unsigned long id = *pos; + void *ret; + + spin_lock(&clp->cl_lock); + ret = idr_get_next_ul(&clp->cl_stateids, &id); + *pos = id; + return ret; +} + +static void *states_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct nfs4_client *clp = s->private; + unsigned long id = *pos; + void *ret; + + id = *pos; + id++; + ret = idr_get_next_ul(&clp->cl_stateids, &id); + *pos = id; + return ret; +} + +static void states_stop(struct seq_file *s, void *v) + __releases(&clp->cl_lock) +{ + struct nfs4_client *clp = s->private; + + spin_unlock(&clp->cl_lock); +} + +static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) +{ + seq_printf(s, "filename: \"%pD2\"", f->nf_file); +} + +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) +{ + struct inode *inode = f->nf_inode; + + seq_printf(s, "superblock: \"%02x:%02x:%ld\"", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); +} + +static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) +{ + seq_printf(s, "owner: "); + seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); +} + +static void nfs4_show_stateid(struct seq_file *s, stateid_t *stid) +{ + seq_printf(s, "0x%.8x", stid->si_generation); + seq_printf(s, "%12phN", &stid->si_opaque); +} + +static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_ol_stateid *ols; + struct nfs4_file *nf; + struct nfsd_file *file; + struct nfs4_stateowner *oo; + unsigned int access, deny; + + if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID) + return 0; /* XXX: or SEQ_SKIP? */ + ols = openlockstateid(st); + oo = ols->st_stateowner; + nf = st->sc_file; + + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (!file) + goto out; + + seq_printf(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_printf(s, ": { type: open, "); + + access = bmap_to_share_mode(ols->st_access_bmap); + deny = bmap_to_share_mode(ols->st_deny_bmap); + + seq_printf(s, "access: %s%s, ", + access & NFS4_SHARE_ACCESS_READ ? "r" : "-", + access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); + seq_printf(s, "deny: %s%s, ", + deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", + deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); + + nfs4_show_superblock(s, file); + seq_printf(s, ", "); + nfs4_show_fname(s, file); + seq_printf(s, ", "); + nfs4_show_owner(s, oo); + seq_printf(s, " }\n"); +out: + spin_unlock(&nf->fi_lock); + return 0; +} + +static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_ol_stateid *ols; + struct nfs4_file *nf; + struct nfsd_file *file; + struct nfs4_stateowner *oo; + + ols = openlockstateid(st); + oo = ols->st_stateowner; + nf = st->sc_file; + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (!file) + goto out; + + seq_printf(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_printf(s, ": { type: lock, "); + + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) + */ + + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_printf(s, ", "); + nfs4_show_fname(s, file); + seq_printf(s, ", "); + nfs4_show_owner(s, oo); + seq_printf(s, " }\n"); +out: + spin_unlock(&nf->fi_lock); + return 0; +} + +static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_delegation *ds; + struct nfs4_file *nf; + struct nfsd_file *file; + + ds = delegstateid(st); + nf = st->sc_file; + spin_lock(&nf->fi_lock); + file = find_deleg_file_locked(nf); + if (!file) + goto out; + + seq_printf(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_printf(s, ": { type: deleg, "); + + /* Kinda dead code as long as we only support read delegs: */ + seq_printf(s, "access: %s, ", + ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + + /* XXX: lease time, whether it's being recalled. */ + + nfs4_show_superblock(s, file); + seq_printf(s, ", "); + nfs4_show_fname(s, file); + seq_printf(s, " }\n"); +out: + spin_unlock(&nf->fi_lock); + return 0; +} + +static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_layout_stateid *ls; + struct nfsd_file *file; + + ls = container_of(st, struct nfs4_layout_stateid, ls_stid); + file = ls->ls_file; + + seq_printf(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_printf(s, ": { type: layout, "); + + /* XXX: What else would be useful? */ + + nfs4_show_superblock(s, file); + seq_printf(s, ", "); + nfs4_show_fname(s, file); + seq_printf(s, " }\n"); + + return 0; +} + +static int states_show(struct seq_file *s, void *v) +{ + struct nfs4_stid *st = v; + + switch (st->sc_type) { + case NFS4_OPEN_STID: + return nfs4_show_open(s, st); + case NFS4_LOCK_STID: + return nfs4_show_lock(s, st); + case NFS4_DELEG_STID: + return nfs4_show_deleg(s, st); + case NFS4_LAYOUT_STID: + return nfs4_show_layout(s, st); + default: + return 0; /* XXX: or SEQ_SKIP? */ + } + /* XXX: copy stateids? */ +} + +static struct seq_operations states_seq_ops = { + .start = states_start, + .next = states_next, + .stop = states_stop, + .show = states_show +}; + +static int client_states_open(struct inode *inode, struct file *file) +{ + struct seq_file *s; + struct nfs4_client *clp; + int ret; + + clp = get_nfsdfs_clp(inode); + if (!clp) + return -ENXIO; + + ret = seq_open(file, &states_seq_ops); + if (ret) + return ret; + s = file->private_data; + s->private = clp; + return 0; +} + +static int client_opens_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct nfs4_client *clp = m->private; + + /* XXX: alternatively, we could get/drop in seq start/stop */ + drop_client(clp); + return seq_release(inode, file); +} + +static const struct file_operations client_states_fops = { + .open = client_states_open, + .read = seq_read, + .llseek = seq_lseek, + .release = client_opens_release, +}; + +/* + * Normally we refuse to destroy clients that are in use, but here the + * administrator is telling us to just do it. We also want to wait + * so the caller has a guarantee that the client's locks are gone by + * the time the write returns: + */ +static void force_expire_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + bool already_expired; + + spin_lock(&nn->client_lock); + clp->cl_time = 0; + spin_unlock(&nn->client_lock); + + wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); + spin_lock(&nn->client_lock); + already_expired = list_empty(&clp->cl_lru); + if (!already_expired) + unhash_client_locked(clp); + spin_unlock(&nn->client_lock); + + if (!already_expired) + expire_client(clp); + else + wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL); +} + +static ssize_t client_ctl_write(struct file *file, const char __user *buf, + size_t size, loff_t *pos) +{ + char *data; + struct nfs4_client *clp; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + if (size != 7 || 0 != memcmp(data, "expire\n", 7)) + return -EINVAL; + clp = get_nfsdfs_clp(file_inode(file)); + if (!clp) + return -ENXIO; + force_expire_client(clp); + drop_client(clp); + return 7; +} + +static const struct file_operations client_ctl_fops = { + .write = client_ctl_write, + .release = simple_transaction_release, +}; + +static const struct tree_descr client_files[] = { + [0] = {"info", &client_info_fops, S_IRUSR}, + [1] = {"states", &client_states_fops, S_IRUSR}, + [2] = {"ctl", &client_ctl_fops, S_IWUSR}, + [3] = {""}, +}; + +static struct nfs4_client *create_client(struct xdr_netobj name, + struct svc_rqst *rqstp, nfs4_verifier *verf) +{ + struct nfs4_client *clp; + struct sockaddr *sa = svc_addr(rqstp); + int ret; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + clp = alloc_client(name); + if (clp == NULL) + return NULL; + + ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); + if (ret) { + free_client(clp); + return NULL; + } + gen_clid(clp, nn); + kref_init(&clp->cl_nfsdfs.cl_ref); + nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); + clp->cl_time = ktime_get_boottime_seconds(); + clear_bit(0, &clp->cl_cb_slot_busy); + copy_verf(clp, verf); + memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); + clp->cl_cb_session = NULL; + clp->net = net; + clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs, + clp->cl_clientid.cl_id - nn->clientid_base, + client_files); + if (!clp->cl_nfsd_dentry) { + free_client(clp); + return NULL; + } + return clp; +} + +static void +add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct nfs4_client *clp; + + while (*new) { + clp = rb_entry(*new, struct nfs4_client, cl_namenode); + parent = *new; + + if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_clp->cl_namenode, parent, new); + rb_insert_color(&new_clp->cl_namenode, root); +} + +static struct nfs4_client * +find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) +{ + int cmp; + struct rb_node *node = root->rb_node; + struct nfs4_client *clp; + + while (node) { + clp = rb_entry(node, struct nfs4_client, cl_namenode); + cmp = compare_blob(&clp->cl_name, name); + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + return clp; + } + return NULL; +} + +static void +add_to_unconfirmed(struct nfs4_client *clp) +{ + unsigned int idhashval; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + add_clp_to_name_tree(clp, &nn->unconf_name_tree); + idhashval = clientid_hashval(clp->cl_clientid.cl_id); + list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); + renew_client_locked(clp); +} + +static void +move_to_confirmed(struct nfs4_client *clp) +{ + unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + lockdep_assert_held(&nn->client_lock); + + dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); + list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); + rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); + add_clp_to_name_tree(clp, &nn->conf_name_tree); + set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + renew_client_locked(clp); +} + +static struct nfs4_client * +find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) +{ + struct nfs4_client *clp; + unsigned int idhashval = clientid_hashval(clid->cl_id); + + list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { + if (same_clid(&clp->cl_clientid, clid)) { + if ((bool)clp->cl_minorversion != sessions) + return NULL; + renew_client_locked(clp); + return clp; + } + } + return NULL; +} + +static struct nfs4_client * +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->conf_id_hashtbl; + + lockdep_assert_held(&nn->client_lock); + return find_client_in_id_table(tbl, clid, sessions); +} + +static struct nfs4_client * +find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->unconf_id_hashtbl; + + lockdep_assert_held(&nn->client_lock); + return find_client_in_id_table(tbl, clid, sessions); +} + +static bool clp_used_exchangeid(struct nfs4_client *clp) +{ + return clp->cl_exchange_flags != 0; +} + +static struct nfs4_client * +find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) +{ + lockdep_assert_held(&nn->client_lock); + return find_clp_in_name_tree(name, &nn->conf_name_tree); +} + +static struct nfs4_client * +find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) +{ + lockdep_assert_held(&nn->client_lock); + return find_clp_in_name_tree(name, &nn->unconf_name_tree); +} + +static void +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) +{ + struct nfs4_cb_conn *conn = &clp->cl_cb_conn; + struct sockaddr *sa = svc_addr(rqstp); + u32 scopeid = rpc_get_scope_id(sa); + unsigned short expected_family; + + /* Currently, we only support tcp and tcp6 for the callback channel */ + if (se->se_callback_netid_len == 3 && + !memcmp(se->se_callback_netid_val, "tcp", 3)) + expected_family = AF_INET; + else if (se->se_callback_netid_len == 4 && + !memcmp(se->se_callback_netid_val, "tcp6", 4)) + expected_family = AF_INET6; + else + goto out_err; + + conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *)&conn->cb_addr, + sizeof(conn->cb_addr)); + + if (!conn->cb_addrlen || conn->cb_addr.ss_family != expected_family) + goto out_err; + + if (conn->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *)&conn->cb_addr)->sin6_scope_id = scopeid; + + conn->cb_prog = se->se_callback_prog; + conn->cb_ident = se->se_callback_ident; + memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); + trace_nfsd_cb_args(clp, conn); + return; +out_err: + conn->cb_addr.ss_family = AF_UNSPEC; + conn->cb_addrlen = 0; + trace_nfsd_cb_nodelegs(clp); + return; +} + +/* + * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. + */ +static void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct xdr_buf *buf = resp->xdr.buf; + struct nfsd4_slot *slot = resp->cstate.slot; + unsigned int base; + + dprintk("--> %s slot %p\n", __func__, slot); + + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; + slot->sl_opcnt = resp->opcnt; + slot->sl_status = resp->cstate.status; + free_svc_cred(&slot->sl_cred); + copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); + + if (!nfsd4_cache_this(resp)) { + slot->sl_flags &= ~NFSD4_SLOT_CACHED; + return; + } + slot->sl_flags |= NFSD4_SLOT_CACHED; + + base = resp->cstate.data_offset; + slot->sl_datalen = buf->len - base; + if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) + WARN(1, "%s: sessions DRC could not cache compound\n", + __func__); + return; +} + +/* + * Encode the replay sequence operation from the slot values. + * If cachethis is FALSE encode the uncached rep error on the next + * operation which sets resp->p and increments resp->opcnt for + * nfs4svc_encode_compoundres. + * + */ +static __be32 +nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + struct nfsd4_slot *slot = resp->cstate.slot; + + /* Encode the replayed sequence operation */ + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + if (slot->sl_flags & NFSD4_SLOT_CACHED) + return op->status; + if (args->opcnt == 1) { + /* + * The original operation wasn't a solo sequence--we + * always cache those--so this retry must not match the + * original: + */ + op->status = nfserr_seq_false_retry; + } else { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * The sequence operation is not cached because we can use the slot and + * session values. + */ +static __be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_slot *slot = resp->cstate.slot; + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + __be32 status; + + dprintk("--> %s slot %p\n", __func__, slot); + + status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); + if (status) + return status; + + p = xdr_reserve_space(xdr, slot->sl_datalen); + if (!p) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); + xdr_commit_encode(xdr); + + resp->opcnt = slot->sl_opcnt; + return slot->sl_status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. */ + clid->flags = new->cl_exchange_flags; +} + +static bool client_has_openowners(struct nfs4_client *clp) +{ + struct nfs4_openowner *oo; + + list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) { + if (!list_empty(&oo->oo_owner.so_stateids)) + return true; + } + return false; +} + +static bool client_has_state(struct nfs4_client *clp) +{ + return client_has_openowners(clp) +#ifdef CONFIG_NFSD_PNFS + || !list_empty(&clp->cl_lo_states) +#endif + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions) + || !list_empty(&clp->async_copies); +} + +static __be32 copy_impl_id(struct nfs4_client *clp, + struct nfsd4_exchange_id *exid) +{ + if (!exid->nii_domain.data) + return 0; + xdr_netobj_dup(&clp->cl_nii_domain, &exid->nii_domain, GFP_KERNEL); + if (!clp->cl_nii_domain.data) + return nfserr_jukebox; + xdr_netobj_dup(&clp->cl_nii_name, &exid->nii_name, GFP_KERNEL); + if (!clp->cl_nii_name.data) + return nfserr_jukebox; + clp->cl_nii_time = exid->nii_time; + return 0; +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_exchange_id *exid = &u->exchange_id; + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; + __be32 status; + char addr_str[INET6_ADDRSTRLEN]; + nfs4_verifier verf = exid->verifier; + struct sockaddr *sa = svc_addr(rqstp); + bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + rpc_ntop(sa, addr_str, sizeof(addr_str)); + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + "ip_addr=%s flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + addr_str, exid->flags, exid->spa_how); + + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) + return nfserr_inval; + + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + status = copy_impl_id(new, exid); + if (status) + goto out_nolock; + + switch (exid->spa_how) { + case SP4_MACH_CRED: + exid->spo_must_enforce[0] = 0; + exid->spo_must_enforce[1] = ( + 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32)); + + exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | + 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN)); + + exid->spo_must_allow[1] &= ( + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32)); + if (!svc_rqst_integrity_protected(rqstp)) { + status = nfserr_inval; + goto out_nolock; + } + /* + * Sometimes userspace doesn't give us a principal. + * Which is a bug, really. Anyway, we can't enforce + * MACH_CRED in that case, better to give up now: + */ + if (!new->cl_cred.cr_principal && + !new->cl_cred.cr_raw_principal) { + status = nfserr_serverfault; + goto out_nolock; + } + new->cl_mach_cred = true; + case SP4_NONE: + break; + default: /* checked by xdr code */ + WARN_ON_ONCE(1); + fallthrough; + case SP4_SSV: + status = nfserr_encr_alg_unsupp; + goto out_nolock; + } + + /* Cases below refer to rfc 5661 section 18.35.4: */ + spin_lock(&nn->client_lock); + conf = find_confirmed_client_by_name(&exid->clname, nn); + if (conf) { + bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); + bool verfs_match = same_verf(&verf, &conf->cl_verifier); + + if (update) { + if (!clp_used_exchangeid(conf)) { /* buggy client */ + status = nfserr_inval; + goto out; + } + if (!nfsd4_mach_creds_match(conf, rqstp)) { + status = nfserr_wrong_cred; + goto out; + } + if (!creds_match) { /* case 9 */ + status = nfserr_perm; + goto out; + } + if (!verfs_match) { /* case 8 */ + status = nfserr_not_same; + goto out; + } + /* case 6 */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + goto out_copy; + } + if (!creds_match) { /* case 3 */ + if (client_has_state(conf)) { + status = nfserr_clid_inuse; + goto out; + } + goto out_new; + } + if (verfs_match) { /* case 2 */ + conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + goto out_copy; + } + /* case 5, client reboot */ + conf = NULL; + goto out_new; + } + + if (update) { /* case 7 */ + status = nfserr_noent; + goto out; + } + + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); + if (unconf) /* case 4, possible retry or client restart */ + unhash_client_locked(unconf); + + /* case 1 (normal case) */ +out_new: + if (conf) { + status = mark_client_expired_locked(conf); + if (status) + goto out; + } + new->cl_minorversion = cstate->minorversion; + new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; + new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; + + add_to_unconfirmed(new); + swap(new, conf); +out_copy: + exid->clientid.cl_boot = conf->cl_clientid.cl_boot; + exid->clientid.cl_id = conf->cl_clientid.cl_id; + + exid->seqid = conf->cl_cs_slot.sl_seqid + 1; + nfsd4_set_ex_flags(conf, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); + status = nfs_ok; + +out: + spin_unlock(&nn->client_lock); +out_nolock: + if (new) + expire_client(new); + if (unconf) + expire_client(unconf); + return status; +} + +static __be32 +check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +{ + dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, + slot_seqid); + + /* The slot is in use, and no response has been sent. */ + if (slot_inuse) { + if (seqid == slot_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Note unsigned 32-bit arithmetic handles wraparound: */ + if (likely(seqid == slot_seqid + 1)) + return nfs_ok; + if (seqid == slot_seqid) + return nfserr_replay_cache; + return nfserr_seq_misordered; +} + +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. + */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, __be32 nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + +#define NFSD_MIN_REQ_HDR_SEQ_SZ ((\ + 2 * 2 + /* credential,verifier: AUTH_NULL, length 0 */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* version, opcount, opcode */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, cache */ \ + 4 ) * sizeof(__be32)) + +#define NFSD_MIN_RESP_HDR_SEQ_SZ ((\ + 2 + /* verifier: AUTH_NULL, length 0 */\ + 1 + /* status */ \ + 1 + /* MIN tag is length with zero, only length */ \ + 3 + /* opcount, opcode, opstatus*/ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + /* seqid, slotID, slotID, slotID, status */ \ + 5 ) * sizeof(__be32)) + +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) +{ + u32 maxrpc = nn->nfsd_serv->sv_max_mesg; + + if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) + return nfserr_toosmall; + ca->headerpadsz = 0; + ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); + ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); + ca->maxresp_cached = min_t(u32, ca->maxresp_cached, + NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); + ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); + /* + * Note decreasing slot size below client's request may make it + * difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. Clients that + * request larger slots than they need will get poor results: + * Note that we always allow at least one slot, because our + * accounting is soft and provides no guarantees either way. + */ + ca->maxreqs = nfsd4_get_drc_mem(ca, nn); + + return nfs_ok; +} + +/* + * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now. + * These are based on similar macros in linux/sunrpc/msg_prot.h . + */ +#define RPC_MAX_HEADER_WITH_AUTH_SYS \ + (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK)) + +#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \ + (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK)) + +#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \ + RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32)) +#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \ + RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \ + sizeof(__be32)) + +static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) +{ + ca->headerpadsz = 0; + + if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ) + return nfserr_toosmall; + ca->maxresp_cached = 0; + if (ca->maxops < 2) + return nfserr_toosmall; + + return nfs_ok; +} + +static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs) +{ + switch (cbs->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + return nfs_ok; + default: + /* + * GSS case: the spec doesn't allow us to return this + * error. But it also doesn't allow us not to support + * GSS. + * I'd rather this fail hard than return some error the + * client might think it can already handle: + */ + return nfserr_encr_alg_unsupp; + } +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_create_session *cr_ses = &u->create_session; + struct sockaddr *sa = svc_addr(rqstp); + struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; + struct nfsd4_session *new; + struct nfsd4_conn *conn; + struct nfsd4_clid_slot *cs_slot = NULL; + __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) + return nfserr_inval; + status = nfsd4_check_cb_sec(&cr_ses->cb_sec); + if (status) + return status; + status = check_forechannel_attrs(&cr_ses->fore_channel, nn); + if (status) + return status; + status = check_backchannel_attrs(&cr_ses->back_channel); + if (status) + goto out_release_drc_mem; + status = nfserr_jukebox; + new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); + if (!new) + goto out_release_drc_mem; + conn = alloc_conn_from_crses(rqstp, cr_ses); + if (!conn) + goto out_free_session; + + spin_lock(&nn->client_lock); + unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); + conf = find_confirmed_client(&cr_ses->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); + + if (conf) { + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(conf, rqstp)) + goto out_free_conn; + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + if (status == nfserr_replay_cache) + status = nfsd4_replay_create_session(cr_ses, cs_slot); + goto out_free_conn; + } + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out_free_conn; + } + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(unconf, rqstp)) + goto out_free_conn; + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out_free_conn; + } + old = find_confirmed_client_by_name(&unconf->cl_name, nn); + if (old) { + status = mark_client_expired_locked(old); + if (status) { + old = NULL; + goto out_free_conn; + } + } + move_to_confirmed(unconf); + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out_free_conn; + } + status = nfs_ok; + /* Persistent sessions are not supported */ + cr_ses->flags &= ~SESSION4_PERSIST; + /* Upshifting from TCP to RDMA is not supported */ + cr_ses->flags &= ~SESSION4_RDMA; + + init_session(rqstp, new, conf, cr_ses); + nfsd4_get_session_locked(new); + + memcpy(cr_ses->sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* cache solo and embedded create sessions under the client_lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); + spin_unlock(&nn->client_lock); + /* init connection and backchannel */ + nfsd4_init_conn(rqstp, conn, new); + nfsd4_put_session(new); + if (old) + expire_client(old); + return status; +out_free_conn: + spin_unlock(&nn->client_lock); + free_conn(conn); + if (old) + expire_client(old); +out_free_session: + __free_session(new); +out_release_drc_mem: + nfsd4_put_drc_mem(&cr_ses->fore_channel); + return status; +} + +static __be32 nfsd4_map_bcts_dir(u32 *dir) +{ + switch (*dir) { + case NFS4_CDFC4_FORE: + case NFS4_CDFC4_BACK: + return nfs_ok; + case NFS4_CDFC4_FORE_OR_BOTH: + case NFS4_CDFC4_BACK_OR_BOTH: + *dir = NFS4_CDFC4_BOTH; + return nfs_ok; + } + return nfserr_inval; +} + +__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; + struct nfsd4_session *session = cstate->session; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + + status = nfsd4_check_cb_sec(&bc->bc_cb_sec); + if (status) + return status; + spin_lock(&nn->client_lock); + session->se_cb_prog = bc->bc_cb_program; + session->se_cb_sec = bc->bc_cb_sec; + spin_unlock(&nn->client_lock); + + nfsd4_probe_callback(session->se_client); + + return nfs_ok; +} + +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, + struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn) +{ + struct nfs4_client *clp = session->se_client; + struct svc_xprt *xpt = rqst->rq_xprt; + struct nfsd4_conn *c; + __be32 status; + + /* Following the last paragraph of RFC 5661 Section 18.34.3: */ + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(xpt, session); + if (!c) + status = nfserr_noent; + else if (req == c->cn_flags) + status = nfs_ok; + else if (req == NFS4_CDFC4_FORE_OR_BOTH && + c->cn_flags != NFS4_CDFC4_BACK) + status = nfs_ok; + else if (req == NFS4_CDFC4_BACK_OR_BOTH && + c->cn_flags != NFS4_CDFC4_FORE) + status = nfs_ok; + else + status = nfserr_inval; + spin_unlock(&clp->cl_lock); + if (status == nfs_ok && conn) + *conn = c; + return status; +} + +__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; + __be32 status; + struct nfsd4_conn *conn; + struct nfsd4_session *session; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nfsd4_last_compound_op(rqstp)) + return nfserr_not_only_op; + spin_lock(&nn->client_lock); + session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status); + spin_unlock(&nn->client_lock); + if (!session) + goto out_no_session; + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(session->se_client, rqstp)) + goto out; + status = nfsd4_match_existing_connection(rqstp, session, + bcts->dir, &conn); + if (status == nfs_ok) { + if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH || + bcts->dir == NFS4_CDFC4_BACK) + conn->cn_flags |= NFS4_CDFC4_BACK; + nfsd4_probe_callback(session->se_client); + goto out; + } + if (status == nfserr_inval) + goto out; + status = nfsd4_map_bcts_dir(&bcts->dir); + if (status) + goto out; + conn = alloc_conn(rqstp, bcts->dir); + status = nfserr_jukebox; + if (!conn) + goto out; + nfsd4_init_conn(rqstp, conn, session); + status = nfs_ok; +out: + nfsd4_put_session(session); +out_no_session: + return status; +} + +static bool nfsd4_compound_in_session(struct nfsd4_compound_state *cstate, struct nfs4_sessionid *sid) +{ + if (!cstate->session) + return false; + return !memcmp(sid, &cstate->session->se_sessionid, sizeof(*sid)); +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid; + struct nfsd4_session *ses; + __be32 status; + int ref_held_by_me = 0; + struct net *net = SVC_NET(r); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + status = nfserr_not_only_op; + if (nfsd4_compound_in_session(cstate, sessionid)) { + if (!nfsd4_last_compound_op(r)) + goto out; + ref_held_by_me++; + } + dump_sessionid(__func__, sessionid); + spin_lock(&nn->client_lock); + ses = find_in_sessionid_hashtbl(sessionid, net, &status); + if (!ses) + goto out_client_lock; + status = nfserr_wrong_cred; + if (!nfsd4_mach_creds_match(ses->se_client, r)) + goto out_put_session; + status = mark_session_dead_locked(ses, 1 + ref_held_by_me); + if (status) + goto out_put_session; + unhash_session(ses); + spin_unlock(&nn->client_lock); + + nfsd4_probe_callback_sync(ses->se_client); + + spin_lock(&nn->client_lock); + status = nfs_ok; +out_put_session: + nfsd4_put_session_locked(ses); +out_client_lock: + spin_unlock(&nn->client_lock); +out: + return status; +} + +static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_conn *c; + __be32 status = nfs_ok; + int ret; + + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(new->cn_xprt, ses); + if (c) + goto out_free; + status = nfserr_conn_not_bound_to_session; + if (clp->cl_mach_cred) + goto out_free; + __nfsd4_hash_conn(new, ses); + spin_unlock(&clp->cl_lock); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); + return nfs_ok; +out_free: + spin_unlock(&clp->cl_lock); + free_conn(new); + return status; +} + +static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + return args->opcnt > session->se_fchannel.maxops; +} + +static bool nfsd4_request_too_big(struct svc_rqst *rqstp, + struct nfsd4_session *session) +{ + struct xdr_buf *xb = &rqstp->rq_arg; + + return xb->len > session->se_fchannel.maxreq_sz; +} + +static bool replay_matches_cache(struct svc_rqst *rqstp, + struct nfsd4_sequence *seq, struct nfsd4_slot *slot) +{ + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != + (bool)seq->cachethis) + return false; + /* + * If there's an error then the reply can have fewer ops than + * the call. + */ + if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) + return false; + /* + * But if we cached a reply with *more* ops than the call you're + * sending us now, then this new call is clearly not really a + * replay of the old one: + */ + if (slot->sl_opcnt > argp->opcnt) + return false; + /* This is the only check explicitly called by spec: */ + if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) + return false; + /* + * There may be more comparisons we could actually do, but the + * spec doesn't require us to catch every case where the calls + * don't match (that would require caching the call as well as + * the reply), so we don't bother. + */ + return true; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_sequence *seq = &u->sequence; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_session *session; + struct nfs4_client *clp; + struct nfsd4_slot *slot; + struct nfsd4_conn *conn; + __be32 status; + int buflen; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + /* + * Will be either used or freed by nfsd4_sequence_check_conn + * below. + */ + conn = alloc_conn(rqstp, NFS4_CDFC4_FORE); + if (!conn) + return nfserr_jukebox; + + spin_lock(&nn->client_lock); + session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status); + if (!session) + goto out_no_session; + clp = session->se_client; + + status = nfserr_too_many_ops; + if (nfsd4_session_too_many_ops(rqstp, session)) + goto out_put_session; + + status = nfserr_req_too_big; + if (nfsd4_request_too_big(rqstp, session)) + goto out_put_session; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fchannel.maxreqs) + goto out_put_session; + + slot = session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + /* We do not negotiate the number of slots yet, so set the + * maxslots to the session maxreqs which is used to encode + * sr_highest_slotid and the sr_target_slot id to maxslots */ + seq->maxslots = session->se_fchannel.maxreqs; + + status = check_slot_seqid(seq->seqid, slot->sl_seqid, + slot->sl_flags & NFSD4_SLOT_INUSE); + if (status == nfserr_replay_cache) { + status = nfserr_seq_misordered; + if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) + goto out_put_session; + status = nfserr_seq_false_retry; + if (!replay_matches_cache(rqstp, seq, slot)) + goto out_put_session; + cstate->slot = slot; + cstate->session = session; + cstate->clp = clp; + /* Return the cached reply status and set cstate->status + * for nfsd4_proc_compound processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto out; + } + if (status) + goto out_put_session; + + status = nfsd4_sequence_check_conn(conn, session); + conn = NULL; + if (status) + goto out_put_session; + + buflen = (seq->cachethis) ? + session->se_fchannel.maxresp_cached : + session->se_fchannel.maxresp_sz; + status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : + nfserr_rep_too_big; + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) + goto out_put_session; + svc_reserve(rqstp, buflen); + + status = nfs_ok; + /* Success! bump slot seqid */ + slot->sl_seqid = seq->seqid; + slot->sl_flags |= NFSD4_SLOT_INUSE; + if (seq->cachethis) + slot->sl_flags |= NFSD4_SLOT_CACHETHIS; + else + slot->sl_flags &= ~NFSD4_SLOT_CACHETHIS; + + cstate->slot = slot; + cstate->session = session; + cstate->clp = clp; + +out: + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; +out_no_session: + if (conn) + free_conn(conn); + spin_unlock(&nn->client_lock); + return status; +out_put_session: + nfsd4_put_session_locked(session); + goto out_no_session; +} + +void +nfsd4_sequence_done(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compound_state *cs = &resp->cstate; + + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; + } + /* Drop session reference that was taken in nfsd4_sequence() */ + nfsd4_put_session(cs->session); + } else if (cs->clp) + put_client_renew(cs->clp); +} + +__be32 +nfsd4_destroy_clientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; + struct nfs4_client *conf, *unconf; + struct nfs4_client *clp = NULL; + __be32 status = 0; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + spin_lock(&nn->client_lock); + unconf = find_unconfirmed_client(&dc->clientid, true, nn); + conf = find_confirmed_client(&dc->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); + + if (conf) { + if (client_has_state(conf)) { + status = nfserr_clientid_busy; + goto out; + } + status = mark_client_expired_locked(conf); + if (status) + goto out; + clp = conf; + } else if (unconf) + clp = unconf; + else { + status = nfserr_stale_clientid; + goto out; + } + if (!nfsd4_mach_creds_match(clp, rqstp)) { + clp = NULL; + status = nfserr_wrong_cred; + goto out; + } + unhash_client_locked(clp); +out: + spin_unlock(&nn->client_lock); + if (clp) + expire_client(clp); + return status; +} + +__be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + __be32 status = 0; + + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + + status = nfserr_complete_already; + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, + &cstate->session->se_client->cl_flags)) + goto out; + + status = nfserr_stale_clientid; + if (is_client_expired(cstate->session->se_client)) + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + goto out; + + status = nfs_ok; + nfsd4_client_record_create(cstate->session->se_client); + inc_reclaim_complete(cstate->session->se_client); +out: + return status; +} + +__be32 +nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setclientid *setclid = &u->setclientid; + struct xdr_netobj clname = setclid->se_name; + nfs4_verifier clverifier = setclid->se_verf; + struct nfs4_client *conf, *new; + struct nfs4_client *unconf = NULL; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + new = create_client(clname, rqstp, &clverifier); + if (new == NULL) + return nfserr_jukebox; + /* Cases below refer to rfc 3530 section 14.2.33: */ + spin_lock(&nn->client_lock); + conf = find_confirmed_client_by_name(&clname, nn); + if (conf && client_has_state(conf)) { + /* case 0: */ + status = nfserr_clid_inuse; + if (clp_used_exchangeid(conf)) + goto out; + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_inuse_err(conf); + goto out; + } + } + unconf = find_unconfirmed_client_by_name(&clname, nn); + if (unconf) + unhash_client_locked(unconf); + /* We need to handle only case 1: probable callback update */ + if (conf && same_verf(&conf->cl_verifier, &clverifier)) { + copy_clid(new, conf); + gen_confirm(new, nn); + } + new->cl_minorversion = 0; + gen_callback(new, setclid, rqstp); + add_to_unconfirmed(new); + setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; + setclid->se_clientid.cl_id = new->cl_clientid.cl_id; + memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); + new = NULL; + status = nfs_ok; +out: + spin_unlock(&nn->client_lock); + if (new) + free_client(new); + if (unconf) + expire_client(unconf); + return status; +} + + +__be32 +nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setclientid_confirm *setclientid_confirm = + &u->setclientid_confirm; + struct nfs4_client *conf, *unconf; + struct nfs4_client *old = NULL; + nfs4_verifier confirm = setclientid_confirm->sc_confirm; + clientid_t * clid = &setclientid_confirm->sc_clientid; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + spin_lock(&nn->client_lock); + conf = find_confirmed_client(clid, false, nn); + unconf = find_unconfirmed_client(clid, false, nn); + /* + * We try hard to give out unique clientid's, so if we get an + * attempt to confirm the same clientid with a different cred, + * the client may be buggy; this should never happen. + * + * Nevertheless, RFC 7530 recommends INUSE for this case: + */ + status = nfserr_clid_inuse; + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + goto out; + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + goto out; + /* cases below refer to rfc 3530 section 14.2.34: */ + if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { + if (conf && same_verf(&confirm, &conf->cl_confirm)) { + /* case 2: probable retransmit */ + status = nfs_ok; + } else /* case 4: client hasn't noticed we rebooted yet? */ + status = nfserr_stale_clientid; + goto out; + } + status = nfs_ok; + if (conf) { /* case 1: callback update */ + old = unconf; + unhash_client_locked(old); + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + } else { /* case 3: normal case; new or rebooted client */ + old = find_confirmed_client_by_name(&unconf->cl_name, nn); + if (old) { + status = nfserr_clid_inuse; + if (client_has_state(old) + && !same_creds(&unconf->cl_cred, + &old->cl_cred)) { + old = NULL; + goto out; + } + status = mark_client_expired_locked(old); + if (status) { + old = NULL; + goto out; + } + } + move_to_confirmed(unconf); + conf = unconf; + } + get_client_locked(conf); + spin_unlock(&nn->client_lock); + nfsd4_probe_callback(conf); + spin_lock(&nn->client_lock); + put_client_renew_locked(conf); +out: + spin_unlock(&nn->client_lock); + if (old) + expire_client(old); + return status; +} + +static struct nfs4_file *nfsd4_alloc_file(void) +{ + return kmem_cache_alloc(file_slab, GFP_KERNEL); +} + +/* OPEN Share state helper functions */ +static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, + struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + + refcount_set(&fp->fi_ref, 1); + spin_lock_init(&fp->fi_lock); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); + fh_copy_shallow(&fp->fi_fhandle, fh); + fp->fi_deleg_file = NULL; + fp->fi_had_conflict = false; + fp->fi_share_deny = 0; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); + atomic_set(&fp->fi_lo_recalls, 0); +#endif + hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); +} + +void +nfsd4_free_slabs(void) +{ + kmem_cache_destroy(client_slab); + kmem_cache_destroy(openowner_slab); + kmem_cache_destroy(lockowner_slab); + kmem_cache_destroy(file_slab); + kmem_cache_destroy(stateid_slab); + kmem_cache_destroy(deleg_slab); + kmem_cache_destroy(odstate_slab); +} + +int +nfsd4_init_slabs(void) +{ + client_slab = kmem_cache_create("nfsd4_clients", + sizeof(struct nfs4_client), 0, 0, NULL); + if (client_slab == NULL) + goto out; + openowner_slab = kmem_cache_create("nfsd4_openowners", + sizeof(struct nfs4_openowner), 0, 0, NULL); + if (openowner_slab == NULL) + goto out_free_client_slab; + lockowner_slab = kmem_cache_create("nfsd4_lockowners", + sizeof(struct nfs4_lockowner), 0, 0, NULL); + if (lockowner_slab == NULL) + goto out_free_openowner_slab; + file_slab = kmem_cache_create("nfsd4_files", + sizeof(struct nfs4_file), 0, 0, NULL); + if (file_slab == NULL) + goto out_free_lockowner_slab; + stateid_slab = kmem_cache_create("nfsd4_stateids", + sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + if (stateid_slab == NULL) + goto out_free_file_slab; + deleg_slab = kmem_cache_create("nfsd4_delegations", + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; + return 0; + +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); +out_free_stateid_slab: + kmem_cache_destroy(stateid_slab); +out_free_file_slab: + kmem_cache_destroy(file_slab); +out_free_lockowner_slab: + kmem_cache_destroy(lockowner_slab); +out_free_openowner_slab: + kmem_cache_destroy(openowner_slab); +out_free_client_slab: + kmem_cache_destroy(client_slab); +out: + return -ENOMEM; +} + +static void init_nfs4_replay(struct nfs4_replay *rp) +{ + rp->rp_status = nfserr_serverfault; + rp->rp_buflen = 0; + rp->rp_buf = rp->rp_ibuf; + mutex_init(&rp->rp_mutex); +} + +static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, + struct nfs4_stateowner *so) +{ + if (!nfsd4_has_session(cstate)) { + mutex_lock(&so->so_replay.rp_mutex); + cstate->replay_owner = nfs4_get_stateowner(so); + } +} + +void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (so != NULL) { + cstate->replay_owner = NULL; + mutex_unlock(&so->so_replay.rp_mutex); + nfs4_put_stateowner(so); + } +} + +static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) +{ + struct nfs4_stateowner *sop; + + sop = kmem_cache_alloc(slab, GFP_KERNEL); + if (!sop) + return NULL; + + xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL); + if (!sop->so_owner.data) { + kmem_cache_free(slab, sop); + return NULL; + } + + INIT_LIST_HEAD(&sop->so_stateids); + sop->so_client = clp; + init_nfs4_replay(&sop->so_replay); + atomic_set(&sop->so_count, 1); + return sop; +} + +static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) +{ + lockdep_assert_held(&clp->cl_lock); + + list_add(&oo->oo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_perclient, &clp->cl_openowners); +} + +static void nfs4_unhash_openowner(struct nfs4_stateowner *so) +{ + unhash_openowner_locked(openowner(so)); +} + +static void nfs4_free_openowner(struct nfs4_stateowner *so) +{ + struct nfs4_openowner *oo = openowner(so); + + kmem_cache_free(openowner_slab, oo); +} + +static const struct nfs4_stateowner_operations openowner_ops = { + .so_unhash = nfs4_unhash_openowner, + .so_free = nfs4_free_openowner, +}; + +static struct nfs4_ol_stateid * +nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *local, *ret = NULL; + struct nfs4_openowner *oo = open->op_openowner; + + lockdep_assert_held(&fp->fi_lock); + + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { + /* ignore lock owners */ + if (local->st_stateowner->so_is_open_owner == 0) + continue; + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { + ret = local; + refcount_inc(&ret->st_stid.sc_count); + break; + } + } + return ret; +} + +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case 0: + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + +static struct nfs4_openowner * +alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, + struct nfsd4_compound_state *cstate) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_openowner *oo, *ret; + + oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!oo) + return NULL; + oo->oo_owner.so_ops = &openowner_ops; + oo->oo_owner.so_is_open_owner = 1; + oo->oo_owner.so_seqid = open->op_seqid; + oo->oo_flags = 0; + if (nfsd4_has_session(cstate)) + oo->oo_flags |= NFS4_OO_CONFIRMED; + oo->oo_time = 0; + oo->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&oo->oo_close_lru); + spin_lock(&clp->cl_lock); + ret = find_openstateowner_str_locked(strhashval, open, clp); + if (ret == NULL) { + hash_openowner(oo, clp, strhashval); + ret = oo; + } else + nfs4_free_stateowner(&oo->oo_owner); + + spin_unlock(&clp->cl_lock); + return ret; +} + +static struct nfs4_ol_stateid * +init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) +{ + + struct nfs4_openowner *oo = open->op_openowner; + struct nfs4_ol_stateid *retstp = NULL; + struct nfs4_ol_stateid *stp; + + stp = open->op_stp; + /* We are moving these outside of the spinlocks to avoid the warnings */ + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); + +retry: + spin_lock(&oo->oo_owner.so_client->cl_lock); + spin_lock(&fp->fi_lock); + + retstp = nfsd4_find_existing_open(fp, open); + if (retstp) + goto out_unlock; + + open->op_stp = NULL; + refcount_inc(&stp->st_stid.sc_count); + stp->st_stid.sc_type = NFS4_OPEN_STID; + INIT_LIST_HEAD(&stp->st_locks); + stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); + get_nfs4_file(fp); + stp->st_stid.sc_file = fp; + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + stp->st_openstp = NULL; + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); + +out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&oo->oo_owner.so_client->cl_lock); + if (retstp) { + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; + } + return stp; +} + +/* + * In the 4.0 case we need to keep the owners around a little while to handle + * CLOSE replay. We still do need to release any file access that is held by + * them before returning however. + */ +static void +move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) +{ + struct nfs4_ol_stateid *last; + struct nfs4_openowner *oo = openowner(s->st_stateowner); + struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net, + nfsd_net_id); + + dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); + + /* + * We know that we hold one reference via nfsd4_close, and another + * "persistent" reference for the client. If the refcount is higher + * than 2, then there are still calls in progress that are using this + * stateid. We can't put the sc_file reference until they are finished. + * Wait for the refcount to drop to 2. Since it has been unhashed, + * there should be no danger of the refcount going back up again at + * this point. + */ + wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); + + release_all_access(s); + if (s->st_stid.sc_file) { + put_nfs4_file(s->st_stid.sc_file); + s->st_stid.sc_file = NULL; + } + + spin_lock(&nn->client_lock); + last = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = s; + list_move_tail(&oo->oo_close_lru, &nn->close_lru); + oo->oo_time = ktime_get_boottime_seconds(); + spin_unlock(&nn->client_lock); + if (last) + nfs4_put_stid(&last->st_stid); +} + +/* search file_hashtbl[] for file */ +static struct nfs4_file * +find_file_locked(struct knfsd_fh *fh, unsigned int hashval) +{ + struct nfs4_file *fp; + + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fp->fi_fhandle, fh)) { + if (refcount_inc_not_zero(&fp->fi_ref)) + return fp; + } + } + return NULL; +} + +struct nfs4_file * +find_file(struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + return fp; +} + +static struct nfs4_file * +find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) +{ + struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + if (fp) + return fp; + + spin_lock(&state_lock); + fp = find_file_locked(fh, hashval); + if (likely(fp == NULL)) { + nfsd4_init_file(fh, hashval, new); + fp = new; + } + spin_unlock(&state_lock); + + return fp; +} + +/* + * Called to check deny when READ with all zero stateid or + * WRITE with all zero or all one stateid + */ +static __be32 +nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) +{ + struct nfs4_file *fp; + __be32 ret = nfs_ok; + + fp = find_file(¤t_fh->fh_handle); + if (!fp) + return ret; + /* Check for conflicting share reservations */ + spin_lock(&fp->fi_lock); + if (fp->fi_share_deny & deny_type) + ret = nfserr_locked; + spin_unlock(&fp->fi_lock); + put_nfs4_file(fp); + return ret; +} + +static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, + nfsd_net_id); + + block_delegations(&dp->dl_stid.sc_file->fi_fhandle); + + /* + * We can't do this in nfsd_break_deleg_cb because it is + * already holding inode->i_lock. + * + * If the dl_time != 0, then we know that it has already been + * queued for a lease break. Don't queue it again. + */ + spin_lock(&state_lock); + if (delegation_hashed(dp) && dp->dl_time == 0) { + dp->dl_time = ktime_get_boottime_seconds(); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); + } + spin_unlock(&state_lock); +} + +static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || + dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) + return 1; + + switch (task->tk_status) { + case 0: + return 1; + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* + * Race: client probably got cb_recall before open reply + * granting delegation. + */ + if (dp->dl_retries--) { + rpc_delay(task, 2 * HZ); + return 0; + } + fallthrough; + default: + return 1; + } +} + +static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + nfs4_put_stid(&dp->dl_stid); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { + .prepare = nfsd4_cb_recall_prepare, + .done = nfsd4_cb_recall_done, + .release = nfsd4_cb_recall_release, +}; + +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) +{ + /* + * We're assuming the state code never drops its reference + * without first removing the lease. Since we're in this lease + * callback (and since the lease code is serialized by the + * i_lock) we know the server hasn't removed the lease yet, and + * we know it's safe to take a reference. + */ + refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&dp->dl_recall); +} + +/* Called from break_lease() with i_lock held. */ +static bool +nfsd_break_deleg_cb(struct file_lock *fl) +{ + bool ret = false; + struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_file *fp = dp->dl_stid.sc_file; + + trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid); + + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a delegation isn't returned + * in time: + */ + fl->fl_break_time = 0; + + spin_lock(&fp->fi_lock); + fp->fi_had_conflict = true; + nfsd_break_one_deleg(dp); + spin_unlock(&fp->fi_lock); + return ret; +} + +/** + * nfsd_breaker_owns_lease - Check if lease conflict was resolved + * @fl: Lock state to check + * + * Return values: + * %true: Lease conflict was resolved + * %false: Lease conflict was not resolved. + */ +static bool nfsd_breaker_owns_lease(struct file_lock *fl) +{ + struct nfs4_delegation *dl = fl->fl_owner; + struct svc_rqst *rqst; + struct nfs4_client *clp; + + if (!i_am_nfsd()) + return false; + rqst = kthread_data(current); + /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */ + if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4) + return false; + clp = *(rqst->rq_lease_breaker); + return dl->dl_stid.sc_client == clp; +} + +static int +nfsd_change_deleg_cb(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + if (arg & F_UNLCK) + return lease_modify(onlist, arg, dispose); + else + return -EAGAIN; +} + +static const struct lock_manager_operations nfsd_lease_mng_ops = { + .lm_breaker_owns_lease = nfsd_breaker_owns_lease, + .lm_break = nfsd_break_deleg_cb, + .lm_change = nfsd_change_deleg_cb, +}; + +static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) +{ + if (nfsd4_has_session(cstate)) + return nfs_ok; + if (seqid == so->so_seqid - 1) + return nfserr_replay_me; + if (seqid == so->so_seqid) + return nfs_ok; + return nfserr_bad_seqid; +} + +static __be32 lookup_clientid(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn, + bool sessions) +{ + struct nfs4_client *found; + + if (cstate->clp) { + found = cstate->clp; + if (!same_clid(&found->cl_clientid, clid)) + return nfserr_stale_clientid; + return nfs_ok; + } + + if (STALE_CLIENTID(clid, nn)) + return nfserr_stale_clientid; + + /* + * For v4.1+ we get the client in the SEQUENCE op. If we don't have one + * cached already then we know this is for is for v4.0 and "sessions" + * will be false. + */ + WARN_ON_ONCE(cstate->session); + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, sessions, nn); + if (!found) { + spin_unlock(&nn->client_lock); + return nfserr_expired; + } + atomic_inc(&found->cl_rpc_users); + spin_unlock(&nn->client_lock); + + /* Cache the nfs4_client in cstate! */ + cstate->clp = found; + return nfs_ok; +} + +__be32 +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, struct nfsd_net *nn) +{ + clientid_t *clientid = &open->op_clientid; + struct nfs4_client *clp = NULL; + unsigned int strhashval; + struct nfs4_openowner *oo = NULL; + __be32 status; + + if (STALE_CLIENTID(&open->op_clientid, nn)) + return nfserr_stale_clientid; + /* + * In case we need it later, after we've already created the + * file and don't want to risk a further failure: + */ + open->op_file = nfsd4_alloc_file(); + if (open->op_file == NULL) + return nfserr_jukebox; + + status = lookup_clientid(clientid, cstate, nn, false); + if (status) + return status; + clp = cstate->clp; + + strhashval = ownerstr_hashval(&open->op_owner); + oo = find_openstateowner_str(strhashval, open, clp); + open->op_openowner = oo; + if (!oo) { + goto new_owner; + } + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + /* Replace unconfirmed owners without checking for replay. */ + release_openowner(oo); + open->op_openowner = NULL; + goto new_owner; + } + status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); + if (status) + return status; + goto alloc_stateid; +new_owner: + oo = alloc_init_open_stateowner(strhashval, open, cstate); + if (oo == NULL) + return nfserr_jukebox; + open->op_openowner = oo; +alloc_stateid: + open->op_stp = nfs4_alloc_open_stateid(clp); + if (!open->op_stp) + return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + + return nfs_ok; +} + +static inline __be32 +nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) +{ + if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + return nfserr_openmode; + else + return nfs_ok; +} + +static int share_access_to_flags(u32 share_access) +{ + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; +} + +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +{ + struct nfs4_stid *ret; + + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + if (!ret) + return NULL; + return delegstateid(ret); +} + +static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) +{ + return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || + open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; +} + +static __be32 +nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, + struct nfs4_delegation **dp) +{ + int flags; + __be32 status = nfserr_bad_stateid; + struct nfs4_delegation *deleg; + + deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); + if (deleg == NULL) + goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } + flags = share_access_to_flags(open->op_share_access); + status = nfs4_check_delegmode(deleg, flags); + if (status) { + nfs4_put_stid(&deleg->dl_stid); + goto out; + } + *dp = deleg; +out: + if (!nfsd4_is_deleg_cur(open)) + return nfs_ok; + if (status) + return status; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + return nfs_ok; +} + +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static inline __be32 +nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, + struct nfsd4_open *open) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = 0, + }; + if (!open->op_truncate) + return 0; + if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) + return nfserr_inval; + return nfsd_setattr(rqstp, fh, &iattr, 0, (time64_t)0); +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) +{ + struct nfsd_file *nf = NULL; + __be32 status; + int oflag = nfs4_access_to_omode(open->op_share_access); + int access = nfs4_access_to_access(open->op_share_access); + unsigned char old_access_bmap, old_deny_bmap; + + spin_lock(&fp->fi_lock); + + /* + * Are we trying to set a deny mode that would conflict with + * current access? + */ + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; + } + + /* set access to the file */ + status = nfs4_file_get_access(fp, open->op_share_access); + if (status != nfs_ok) { + spin_unlock(&fp->fi_lock); + goto out; + } + + /* Set access bits in stateid */ + old_access_bmap = stp->st_access_bmap; + set_access(open->op_share_access, stp); + + /* Set new deny mask */ + old_deny_bmap = stp->st_deny_bmap; + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + + if (!fp->fi_fds[oflag]) { + spin_unlock(&fp->fi_lock); + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); + if (status) + goto out_put_access; + spin_lock(&fp->fi_lock); + if (!fp->fi_fds[oflag]) { + fp->fi_fds[oflag] = nf; + nf = NULL; + } + } + spin_unlock(&fp->fi_lock); + if (nf) + nfsd_file_put(nf); + + status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode, + access)); + if (status) + goto out_put_access; + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status) + goto out_put_access; +out: + return status; +out_put_access: + stp->st_access_bmap = old_access_bmap; + nfs4_file_put_access(fp, open->op_share_access); + reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp); + goto out; +} + +static __be32 +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +{ + __be32 status; + unsigned char old_deny_bmap = stp->st_deny_bmap; + + if (!test_access(open->op_share_access, stp)) + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + + /* test and set deny mode */ + spin_lock(&fp->fi_lock); + status = nfs4_file_check_deny(fp, open->op_share_deny); + if (status == nfs_ok) { + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } + spin_unlock(&fp->fi_lock); + + if (status != nfs_ok) + return status; + + status = nfsd4_truncate(rqstp, cur_fh, open); + if (status != nfs_ok) + reset_union_bmap_deny(old_deny_bmap, stp); + return status; +} + +/* Should we give out recallable state?: */ +static bool nfsd4_cb_channel_good(struct nfs4_client *clp) +{ + if (clp->cl_cb_state == NFSD4_CB_UP) + return true; + /* + * In the sessions case, since we don't have to establish a + * separate connection for callbacks, we assume it's OK + * until we hear otherwise: + */ + return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; +} + +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, + int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_DELEG; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)dp; + fl->fl_pid = current->tgid; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + return fl; +} + +static struct nfs4_delegation * +nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) +{ + int status = 0; + struct nfs4_delegation *dp; + struct nfsd_file *nf; + struct file_lock *fl; + + /* + * The fi_had_conflict and nfs_get_existing_delegation checks + * here are just optimizations; we'll need to recheck them at + * the end: + */ + if (fp->fi_had_conflict) + return ERR_PTR(-EAGAIN); + + nf = find_readable_file(fp); + if (!nf) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return ERR_PTR(-EBADF); + } + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + if (nfs4_delegation_exists(clp, fp)) + status = -EAGAIN; + else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = nf; + /* increment early to prevent fi_deleg_file from being + * cleared */ + fp->fi_delegees = 1; + nf = NULL; + } else + fp->fi_delegees++; + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + if (nf) + nfsd_file_put(nf); + if (status) + return ERR_PTR(status); + + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, fh, odstate); + if (!dp) + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + if (!fl) + goto out_clnt_odstate; + + status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); + if (fl) + locks_free_lock(fl); + if (status) + goto out_clnt_odstate; + + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + if (fp->fi_had_conflict) + status = -EAGAIN; + else + status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) + goto out_unlock; + + return dp; +out_unlock: + vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); +out_clnt_odstate: + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); +} + +static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) +{ + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + if (status == -EAGAIN) + open->op_why_no_deleg = WND4_CONTENTION; + else { + open->op_why_no_deleg = WND4_RESOURCE; + switch (open->op_deleg_want) { + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + break; + case NFS4_SHARE_WANT_CANCEL: + open->op_why_no_deleg = WND4_CANCELLED; + break; + case NFS4_SHARE_WANT_NO_DELEG: + WARN_ON_ONCE(1); + } + } +} + +/* + * Attempt to hand out a delegation. + * + * Note we don't support write delegations, and won't until the vfs has + * proper support for them. + */ +static void +nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp) +{ + struct nfs4_delegation *dp; + struct nfs4_openowner *oo = openowner(stp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; + int cb_up; + int status = 0; + + cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!cb_up) + open->op_recall = 1; + if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) + goto out_no_deleg; + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + /* + * Let's not give out any delegations till everyone's + * had the chance to reclaim theirs, *and* until + * NLM locks have all been reclaimed: + */ + if (locks_in_grace(clp->net)) + goto out_no_deleg; + if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) + goto out_no_deleg; + /* + * Also, if the file was opened for write or + * create, there's a good chance the client's + * about to write to it, resulting in an + * immediate recall (since we don't support + * write delegations): + */ + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + goto out_no_deleg; + if (open->op_create == NFS4_OPEN_CREATE) + goto out_no_deleg; + break; + default: + goto out_no_deleg; + } + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); + if (IS_ERR(dp)) + goto out_no_deleg; + + memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); + + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + nfs4_put_stid(&dp->dl_stid); + return; +out_no_deleg: + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && + open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { + dprintk("NFSD: WARNING: refusing delegation reclaim\n"); + open->op_recall = 1; + } + + /* 4.1 client asking for a delegation? */ + if (open->op_deleg_want) + nfsd4_open_deleg_none_ext(open, status); + return; +} + +static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, + struct nfs4_delegation *dp) +{ + if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; + } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && + dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + } + /* Otherwise the client must be confused wanting a delegation + * it already has, therefore we don't return + * NFS4_OPEN_DELEGATE_NONE_EXT and reason. + */ +} + +__be32 +nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; + struct nfs4_file *fp = NULL; + struct nfs4_ol_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; + __be32 status; + bool new_stp = false; + + /* + * Lookup file; if found, lookup stateid and check open request, + * and check for delegations in the process of being recalled. + * If not found, create the nfs4_file struct + */ + fp = find_or_add_file(open->op_file, ¤t_fh->fh_handle); + if (fp != open->op_file) { + status = nfs4_check_deleg(cl, open, &dp); + if (status) + goto out; + stp = nfsd4_find_and_lock_existing_open(fp, open); + } else { + open->op_file = NULL; + status = nfserr_bad_stateid; + if (nfsd4_is_deleg_cur(open)) + goto out; + } + + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + + /* + * OPEN the file, or upgrade an existing OPEN. + * If truncate fails, the OPEN fails. + * + * stp is already locked. + */ + if (!new_stp) { + /* Stateid was found, this is an OPEN upgrade */ + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); + if (status) { + mutex_unlock(&stp->st_mutex); + goto out; + } + } else { + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { + stp->st_stid.sc_type = NFS4_CLOSED_STID; + release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + goto out; + } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; + } + + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); + mutex_unlock(&stp->st_mutex); + + if (nfsd4_has_session(&resp->cstate)) { + if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_WANTED; + goto nodeleg; + } + } + + /* + * Attempt to hand out a delegation. No error return, because the + * OPEN succeeds even if we fail. + */ + nfs4_open_delegation(current_fh, open, stp); +nodeleg: + status = nfs_ok; + trace_nfsd_open(&stp->st_stid.sc_stateid); +out: + /* 4.1 client trying to upgrade/downgrade delegation? */ + if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && + open->op_deleg_want) + nfsd4_deleg_xgrade_none_ext(open, dp); + + if (fp) + put_nfs4_file(fp); + if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + /* + * To finish the open response, we just need to set the rflags. + */ + open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + if (nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; + else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) + open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + + if (dp) + nfs4_put_stid(&dp->dl_stid); + if (stp) + nfs4_put_stid(&stp->st_stid); + + return status; +} + +void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) +{ + if (open->op_openowner) { + struct nfs4_stateowner *so = &open->op_openowner->oo_owner; + + nfsd4_cstate_assign_replay(cstate, so); + nfs4_put_stateowner(so); + } + if (open->op_file) + kmem_cache_free(file_slab, open->op_file); + if (open->op_stp) + nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); +} + +__be32 +nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + clientid_t *clid = &u->renew; + struct nfs4_client *clp; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + trace_nfsd_clid_renew(clid); + status = lookup_clientid(clid, cstate, nn, false); + if (status) + goto out; + clp = cstate->clp; + status = nfserr_cb_path_down; + if (!list_empty(&clp->cl_delegations) + && clp->cl_cb_state != NFSD4_CB_UP) + goto out; + status = nfs_ok; +out: + return status; +} + +void +nfsd4_end_grace(struct nfsd_net *nn) +{ + /* do nothing if grace period already ended */ + if (nn->grace_ended) + return; + + trace_nfsd_grace_complete(nn); + nn->grace_ended = true; + /* + * If the server goes down again right now, an NFSv4 + * client will still be allowed to reclaim after it comes back up, + * even if it hasn't yet had a chance to reclaim state this time. + * + */ + nfsd4_record_grace_done(nn); + /* + * At this point, NFSv4 clients can still reclaim. But if the + * server crashes, any that have not yet reclaimed will be out + * of luck on the next boot. + * + * (NFSv4.1+ clients are considered to have reclaimed once they + * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to + * have reclaimed after their first OPEN.) + */ + locks_end_grace(&nn->nfsd4_manager); + /* + * At this point, and once lockd and/or any other containers + * exit their grace period, further reclaims will fail and + * regular locking can resume. + */ +} + +/* + * If we've waited a lease period but there are still clients trying to + * reclaim, wait a little longer to give them a chance to finish. + */ +static bool clients_still_reclaiming(struct nfsd_net *nn) +{ + time64_t double_grace_period_end = nn->boot_time + + 2 * nn->nfsd4_lease; + + if (nn->track_reclaim_completes && + atomic_read(&nn->nr_reclaim_complete) == + nn->reclaim_str_hashtbl_size) + return false; + if (!nn->somebody_reclaimed) + return false; + nn->somebody_reclaimed = false; + /* + * If we've given them *two* lease times to reclaim, and they're + * still not done, give up: + */ + if (ktime_get_boottime_seconds() > double_grace_period_end) + return false; + return true; +} + +static time64_t +nfs4_laundromat(struct nfsd_net *nn) +{ + struct nfs4_client *clp; + struct nfs4_openowner *oo; + struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; + struct list_head *pos, *next, reaplist; + time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease; + time64_t t, new_timeo = nn->nfsd4_lease; + struct nfs4_cpntf_state *cps; + copy_stateid_t *cps_t; + int i; + + if (clients_still_reclaiming(nn)) { + new_timeo = 0; + goto out; + } + nfsd4_end_grace(nn); + INIT_LIST_HEAD(&reaplist); + + spin_lock(&nn->s2s_cp_lock); + idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { + cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); + if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && + cps->cpntf_time < cutoff) + _free_cpntf_state_locked(nn, cps); + } + spin_unlock(&nn->s2s_cp_lock); + + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_time > cutoff) { + t = clp->cl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + if (mark_client_expired_locked(clp)) { + trace_nfsd_clid_expired(&clp->cl_clientid); + continue; + } + list_add(&clp->cl_lru, &reaplist); + } + spin_unlock(&nn->client_lock); + list_for_each_safe(pos, next, &reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + trace_nfsd_clid_purged(&clp->cl_clientid); + list_del_init(&clp->cl_lru); + expire_client(clp); + } + spin_lock(&state_lock); + list_for_each_safe(pos, next, &nn->del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + if (dp->dl_time > cutoff) { + t = dp->dl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + WARN_ON(!unhash_delegation_locked(dp)); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + while (!list_empty(&reaplist)) { + dp = list_first_entry(&reaplist, struct nfs4_delegation, + dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + revoke_delegation(dp); + } + + spin_lock(&nn->client_lock); + while (!list_empty(&nn->close_lru)) { + oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, + oo_close_lru); + if (oo->oo_time > cutoff) { + t = oo->oo_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_del_init(&oo->oo_close_lru); + stp = oo->oo_last_closed_stid; + oo->oo_last_closed_stid = NULL; + spin_unlock(&nn->client_lock); + nfs4_put_stid(&stp->st_stid); + spin_lock(&nn->client_lock); + } + spin_unlock(&nn->client_lock); + + /* + * It's possible for a client to try and acquire an already held lock + * that is being held for a long time, and then lose interest in it. + * So, we clean out any un-revisited request after a lease period + * under the assumption that the client is no longer interested. + * + * RFC5661, sec. 9.6 states that the client must not rely on getting + * notifications and must continue to poll for locks, even when the + * server supports them. Thus this shouldn't lead to clients blocking + * indefinitely once the lock does become free. + */ + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->blocked_locks_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + if (nbl->nbl_time > cutoff) { + t = nbl->nbl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->blocked_locks_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + free_blocked_lock(nbl); + } +out: + new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); + return new_timeo; +} + +static struct workqueue_struct *laundry_wq; +static void laundromat_main(struct work_struct *); + +static void +laundromat_main(struct work_struct *laundry) +{ + time64_t t; + struct delayed_work *dwork = to_delayed_work(laundry); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + laundromat_work); + + t = nfs4_laundromat(nn); + queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); +} + +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) +{ + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) + return nfserr_bad_stateid; + return nfs_ok; +} + +static inline int +access_permit_read(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_READ, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp) || + test_access(NFS4_SHARE_ACCESS_WRITE, stp); +} + +static inline int +access_permit_write(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp); +} + +static +__be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) +{ + __be32 status = nfserr_openmode; + + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; + if ((flags & WR_STATE) && !access_permit_write(stp)) + goto out; + if ((flags & RD_STATE) && !access_permit_read(stp)) + goto out; + status = nfs_ok; +out: + return status; +} + +static inline __be32 +check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, int flags) +{ + if (ONE_STATEID(stateid) && (flags & RD_STATE)) + return nfs_ok; + else if (opens_in_grace(net)) { + /* Answer in remaining cases depends on existence of + * conflicting state; so we must wait out the grace period. */ + return nfserr_grace; + } else if (flags & WR_STATE) + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_WRITE); + else /* (flags & RD_STATE) && ZERO_STATEID(stateid) */ + return nfs4_share_conflict(current_fh, + NFS4_SHARE_DENY_READ); +} + +/* + * Allow READ/WRITE during grace period on recovered state only for files + * that are not able to provide mandatory locking. + */ +static inline int +grace_disallows_io(struct net *net, struct inode *inode) +{ + return opens_in_grace(net) && mandatory_lock(inode); +} + +static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) +{ + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. + */ + if (has_session && in->si_generation == 0) + return nfs_ok; + + if (in->si_generation == ref->si_generation) + return nfs_ok; + + /* If the client sends us a stateid from the future, it's buggy: */ + if (nfsd4_stateid_generation_after(in, ref)) + return nfserr_bad_stateid; + /* + * However, we could see a stateid from the past, even from a + * non-buggy client. For example, if the client sends a lock + * while some IO is outstanding, the lock may bump si_generation + * while the IO is still in flight. The client could avoid that + * situation by waiting for responses on all the IO requests, + * but better performance may result in retrying IO that + * receives an old_stateid error if requests are rarely + * reordered in flight: + */ + return nfserr_old_stateid; +} + +static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) +{ + __be32 ret; + + spin_lock(&s->sc_lock); + ret = nfsd4_verify_open_stid(s); + if (ret == nfs_ok) + ret = check_stateid_generation(in, &s->sc_stateid, has_session); + spin_unlock(&s->sc_lock); + return ret; +} + +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + +static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) +{ + struct nfs4_stid *s; + __be32 status = nfserr_bad_stateid; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) + return status; + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); + if (!s) + goto out_unlock; + status = nfsd4_stid_check_stateid_generation(stateid, s, 1); + if (status) + goto out_unlock; + switch (s->sc_type) { + case NFS4_DELEG_STID: + status = nfs_ok; + break; + case NFS4_REVOKED_DELEG_STID: + status = nfserr_deleg_revoked; + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); + break; + default: + printk("unknown stateid type %x\n", s->sc_type); + fallthrough; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + status = nfserr_bad_stateid; + } +out_unlock: + spin_unlock(&cl->cl_lock); + return status; +} + +__be32 +nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn) +{ + __be32 status; + bool return_revoked = false; + + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. + */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) + return nfserr_bad_stateid; + status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn, + false); + if (status == nfserr_stale_clientid) { + if (cstate->session) + return nfserr_bad_stateid; + return nfserr_stale_stateid; + } + if (status) + return status; + *s = find_stateid_by_type(cstate->clp, stateid, typemask); + if (!*s) + return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } + return nfs_ok; +} + +static struct nfsd_file * +nfs4_find_file(struct nfs4_stid *s, int flags) +{ + if (!s) + return NULL; + + switch (s->sc_type) { + case NFS4_DELEG_STID: + if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) + return NULL; + return nfsd_file_get(s->sc_file->fi_deleg_file); + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + if (flags & RD_STATE) + return find_readable_file(s->sc_file); + else + return find_writeable_file(s->sc_file); + } + + return NULL; +} + +static __be32 +nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) +{ + __be32 status; + + status = nfsd4_check_openowner_confirmed(ols); + if (status) + return status; + return nfs4_check_openmode(ols, flags); +} + +static __be32 +nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, + struct nfsd_file **nfp, int flags) +{ + int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; + struct nfsd_file *nf; + __be32 status; + + nf = nfs4_find_file(s, flags); + if (nf) { + status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + acc | NFSD_MAY_OWNER_OVERRIDE); + if (status) { + nfsd_file_put(nf); + goto out; + } + } else { + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); + if (status) + return status; + } + *nfp = nf; +out: + return status; +} +static void +_free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) +{ + WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID); + if (!refcount_dec_and_test(&cps->cp_stateid.sc_count)) + return; + list_del(&cps->cp_list); + idr_remove(&nn->s2s_cp_stateids, + cps->cp_stateid.stid.si_opaque.so_id); + kfree(cps); +} +/* + * A READ from an inter server to server COPY will have a + * copy stateid. Look up the copy notify stateid from the + * idr structure and take a reference on it. + */ +__be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_client *clp, + struct nfs4_cpntf_state **cps) +{ + copy_stateid_t *cps_t; + struct nfs4_cpntf_state *state = NULL; + + if (st->si_opaque.so_clid.cl_id != nn->s2s_cp_cl_id) + return nfserr_bad_stateid; + spin_lock(&nn->s2s_cp_lock); + cps_t = idr_find(&nn->s2s_cp_stateids, st->si_opaque.so_id); + if (cps_t) { + state = container_of(cps_t, struct nfs4_cpntf_state, + cp_stateid); + if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) { + state = NULL; + goto unlock; + } + if (!clp) + refcount_inc(&state->cp_stateid.sc_count); + else + _free_cpntf_state_locked(nn, state); + } +unlock: + spin_unlock(&nn->s2s_cp_lock); + if (!state) + return nfserr_bad_stateid; + if (!clp && state) + *cps = state; + return 0; +} + +static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_stid **stid) +{ + __be32 status; + struct nfs4_cpntf_state *cps = NULL; + struct nfsd4_compound_state cstate; + + status = manage_cpntf_state(nn, st, NULL, &cps); + if (status) + return status; + + cps->cpntf_time = ktime_get_boottime_seconds(); + memset(&cstate, 0, sizeof(cstate)); + status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true); + if (status) + goto out; + status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + stid, nn); + put_client_renew(cstate.clp); +out: + nfs4_put_cpntf_state(nn, cps); + return status; +} + +void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) +{ + spin_lock(&nn->s2s_cp_lock); + _free_cpntf_state_locked(nn, cps); + spin_unlock(&nn->s2s_cp_lock); +} + +/* + * Checks for stateid operations + */ +__be32 +nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct nfsd_file **nfp, + struct nfs4_stid **cstid) +{ + struct inode *ino = d_inode(fhp->fh_dentry); + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfs4_stid *s = NULL; + __be32 status; + + if (nfp) + *nfp = NULL; + + if (grace_disallows_io(net, ino)) + return nfserr_grace; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { + status = check_special_stateids(net, fhp, stateid, flags); + goto done; + } + + status = nfsd4_lookup_stateid(cstate, stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, + &s, nn); + if (status == nfserr_bad_stateid) + status = find_cpntf_state(nn, stateid, &s); + if (status) + return status; + status = nfsd4_stid_check_stateid_generation(stateid, s, + nfsd4_has_session(cstate)); + if (status) + goto out; + + switch (s->sc_type) { + case NFS4_DELEG_STID: + status = nfs4_check_delegmode(delegstateid(s), flags); + break; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + status = nfs4_check_olstateid(openlockstateid(s), flags); + break; + default: + status = nfserr_bad_stateid; + break; + } + if (status) + goto out; + status = nfs4_check_fh(fhp, s); + +done: + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); +out: + if (s) { + if (!status && cstid) + *cstid = s; + else + nfs4_put_stid(s); + } + return status; +} + +/* + * Test if the stateid is valid + */ +__be32 +nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; + struct nfsd4_test_stateid_id *stateid; + struct nfs4_client *cl = cstate->session->se_client; + + list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) + stateid->ts_id_status = + nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); + + return nfs_ok; +} + +static __be32 +nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) +{ + struct nfs4_ol_stateid *stp = openlockstateid(s); + __be32 ret; + + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid; + + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + goto out; + + release_lock_stateid(stp); + ret = nfs_ok; + +out: + mutex_unlock(&stp->st_mutex); +out_put_stid: + nfs4_put_stid(s); + return ret; +} + +__be32 +nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; + stateid_t *stateid = &free_stateid->fr_stateid; + struct nfs4_stid *s; + struct nfs4_delegation *dp; + struct nfs4_client *cl = cstate->session->se_client; + __be32 ret = nfserr_bad_stateid; + + spin_lock(&cl->cl_lock); + s = find_stateid_locked(cl, stateid); + if (!s) + goto out_unlock; + spin_lock(&s->sc_lock); + switch (s->sc_type) { + case NFS4_DELEG_STID: + ret = nfserr_locks_held; + break; + case NFS4_OPEN_STID: + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + break; + ret = nfserr_locks_held; + break; + case NFS4_LOCK_STID: + spin_unlock(&s->sc_lock); + refcount_inc(&s->sc_count); + spin_unlock(&cl->cl_lock); + ret = nfsd4_free_lock_stateid(stateid, s); + goto out; + case NFS4_REVOKED_DELEG_STID: + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + /* Default falls through and returns nfserr_bad_stateid */ + } + spin_unlock(&s->sc_lock); +out_unlock: + spin_unlock(&cl->cl_lock); +out: + return ret; +} + +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} + +static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + struct nfs4_stateowner *sop = stp->st_stateowner; + __be32 status; + + status = nfsd4_check_seqid(cstate, sop, seqid); + if (status) + return status; + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); + if (status == nfs_ok) + status = nfs4_check_fh(current_fh, &stp->st_stid); + if (status != nfs_ok) + mutex_unlock(&stp->st_mutex); + return status; +} + +/* + * Checks for sequence id mutating operations. + */ +static __be32 +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, char typemask, + struct nfs4_ol_stateid **stpp, + struct nfsd_net *nn) +{ + __be32 status; + struct nfs4_stid *s; + struct nfs4_ol_stateid *stp = NULL; + + trace_nfsd_preprocess(seqid, stateid); + + *stpp = NULL; + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + if (status) + return status; + stp = openlockstateid(s); + nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); + + status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); + if (!status) + *stpp = stp; + else + nfs4_put_stid(&stp->st_stid); + return status; +} + +static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) +{ + __be32 status; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; + + status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, + NFS4_OPEN_STID, &stp, nn); + if (status) + return status; + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(&stp->st_stid); + return nfserr_bad_stateid; + } + *stpp = stp; + return nfs_ok; +} + +__be32 +nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_open_confirm *oc = &u->open_confirm; + __be32 status; + struct nfs4_openowner *oo; + struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_open_confirm on file %pd\n", + cstate->current_fh.fh_dentry); + + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status) + return status; + + status = nfs4_preprocess_seqid_op(cstate, + oc->oc_seqid, &oc->oc_req_stateid, + NFS4_OPEN_STID, &stp, nn); + if (status) + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; + if (oo->oo_flags & NFS4_OO_CONFIRMED) { + mutex_unlock(&stp->st_mutex); + goto put_stateid; + } + oo->oo_flags |= NFS4_OO_CONFIRMED; + nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); + mutex_unlock(&stp->st_mutex); + trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid); + nfsd4_client_record_create(oo->oo_owner.so_client); + status = nfs_ok; +put_stateid: + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + return status; +} + +static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) +{ + if (!test_access(access, stp)) + return; + nfs4_file_put_access(stp->st_stid.sc_file, access); + clear_access(access, stp); +} + +static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) +{ + switch (to_access) { + case NFS4_SHARE_ACCESS_READ: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_WRITE: + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); + nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); + break; + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + WARN_ON_ONCE(1); + } +} + +__be32 +nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) +{ + struct nfsd4_open_downgrade *od = &u->open_downgrade; + __be32 status; + struct nfs4_ol_stateid *stp; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_open_downgrade on file %pd\n", + cstate->current_fh.fh_dentry); + + /* We don't yet support WANT bits: */ + if (od->od_deleg_want) + dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, + od->od_deleg_want); + + status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, + &od->od_stateid, &stp, nn); + if (status) + goto out; + status = nfserr_inval; + if (!test_access(od->od_share_access, stp)) { + dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n", + stp->st_access_bmap, od->od_share_access); + goto put_stateid; + } + if (!test_deny(od->od_share_deny, stp)) { + dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n", + stp->st_deny_bmap, od->od_share_deny); + goto put_stateid; + } + nfs4_stateid_downgrade(stp, od->od_share_access); + reset_union_bmap_deny(od->od_share_deny, stp); + nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); + status = nfs_ok; +put_stateid: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + return status; +} + +static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +{ + struct nfs4_client *clp = s->st_stid.sc_client; + bool unhashed; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + + spin_lock(&clp->cl_lock); + unhashed = unhash_open_stateid(s, &reaplist); + + if (clp->cl_minorversion) { + if (unhashed) + put_ol_stateid_locked(s, &reaplist); + spin_unlock(&clp->cl_lock); + list_for_each_entry(stp, &reaplist, st_locks) + nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); + free_ol_stateid_reaplist(&reaplist); + } else { + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + if (unhashed) + move_to_close_lru(s, clp->net); + } +} + +/* + * nfs4_unlock_state() called after encode + */ +__be32 +nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_close *close = &u->close; + __be32 status; + struct nfs4_ol_stateid *stp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("NFSD: nfsd4_close on file %pd\n", + cstate->current_fh.fh_dentry); + + status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, + &close->cl_stateid, + NFS4_OPEN_STID|NFS4_CLOSED_STID, + &stp, nn); + nfsd4_bump_seqid(cstate, status); + if (status) + goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; + + /* + * Technically we don't _really_ have to increment or copy it, since + * it should just be gone after this operation and we clobber the + * copied value below, but we continue to do so here just to ensure + * that racing ops see that there was a state change. + */ + nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); + + nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + + /* v4.1+ suggests that we send a special stateid in here, since the + * clients should just ignore this anyway. Since this is not useful + * for v4.0 clients either, we set it to the special close_stateid + * universally. + * + * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5 + */ + memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid)); + + /* put reference from nfs4_preprocess_seqid_op */ + nfs4_put_stid(&stp->st_stid); +out: + return status; +} + +__be32 +nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_delegreturn *dr = &u->delegreturn; + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct nfs4_stid *s; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + return status; + + status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + if (status) + goto out; + dp = delegstateid(s); + status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); + if (status) + goto put_stateid; + + destroy_delegation(dp); +put_stateid: + nfs4_put_stid(&dp->dl_stid); +out: + return status; +} + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + WARN_ON_ONCE(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + +/* + * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that + * we can't properly handle lock requests that go beyond the (2^63 - 1)-th + * byte, because of sign extension problems. Since NFSv4 calls for 64-bit + * locking, this prevents us from being completely protocol-compliant. The + * real solution to this problem is to start using unsigned file offsets in + * the VFS, but this is a very deep change! + */ +static inline void +nfs4_transform_lock_offset(struct file_lock *lock) +{ + if (lock->fl_start < 0) + lock->fl_start = OFFSET_MAX; + if (lock->fl_end < 0) + lock->fl_end = OFFSET_MAX; +} + +static fl_owner_t +nfsd4_fl_get_owner(fl_owner_t owner) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; + + nfs4_get_stateowner(&lo->lo_owner); + return owner; +} + +static void +nfsd4_fl_put_owner(fl_owner_t owner) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; + + if (lo) + nfs4_put_stateowner(&lo->lo_owner); +} + +static void +nfsd4_lm_notify(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct net *net = lo->lo_owner.so_client->net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl = container_of(fl, + struct nfsd4_blocked_lock, nbl_lock); + bool queue = false; + + /* An empty list means that something else is going to be using it */ + spin_lock(&nn->blocked_locks_lock); + if (!list_empty(&nbl->nbl_list)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + queue = true; + } + spin_unlock(&nn->blocked_locks_lock); + + if (queue) + nfsd4_run_cb(&nbl->nbl_cb); +} + +static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_notify = nfsd4_lm_notify, + .lm_get_owner = nfsd4_fl_get_owner, + .lm_put_owner = nfsd4_fl_put_owner, +}; + +static inline void +nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) +{ + struct nfs4_lockowner *lo; + + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + lo = (struct nfs4_lockowner *) fl->fl_owner; + xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, + GFP_KERNEL); + if (!deny->ld_owner.data) + /* We just don't care that much */ + goto nevermind; + deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; + } else { +nevermind: + deny->ld_owner.len = 0; + deny->ld_owner.data = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; + } + deny->ld_start = fl->fl_start; + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) + deny->ld_length = fl->fl_end - fl->fl_start + 1; + deny->ld_type = NFS4_READ_LT; + if (fl->fl_type != F_RDLCK) + deny->ld_type = NFS4_WRITE_LT; +} + +static struct nfs4_lockowner * +find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner) +{ + unsigned int strhashval = ownerstr_hashval(owner); + struct nfs4_stateowner *so; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval], + so_strhash) { + if (so->so_is_open_owner) + continue; + if (same_owner_str(so, owner)) + return lockowner(nfs4_get_stateowner(so)); + } + return NULL; +} + +static struct nfs4_lockowner * +find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner) +{ + struct nfs4_lockowner *lo; + + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clp, owner); + spin_unlock(&clp->cl_lock); + return lo; +} + +static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner_locked(lockowner(sop)); +} + +static void nfs4_free_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_lockowner *lo = lockowner(sop); + + kmem_cache_free(lockowner_slab, lo); +} + +static const struct nfs4_stateowner_operations lockowner_ops = { + .so_unhash = nfs4_unhash_lockowner, + .so_free = nfs4_free_lockowner, +}; + +/* + * Alloc a lock owner structure. + * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has + * occurred. + * + * strhashval = ownerstr_hashval + */ +static struct nfs4_lockowner * +alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, + struct nfs4_ol_stateid *open_stp, + struct nfsd4_lock *lock) +{ + struct nfs4_lockowner *lo, *ret; + + lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); + if (!lo) + return NULL; + INIT_LIST_HEAD(&lo->lo_blocked); + INIT_LIST_HEAD(&lo->lo_owner.so_stateids); + lo->lo_owner.so_is_open_owner = 0; + lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; + lo->lo_owner.so_ops = &lockowner_ops; + spin_lock(&clp->cl_lock); + ret = find_lockowner_str_locked(clp, &lock->lk_new_owner); + if (ret == NULL) { + list_add(&lo->lo_owner.so_strhash, + &clp->cl_ownerstr_hashtbl[strhashval]); + ret = lo; + } else + nfs4_free_stateowner(&lo->lo_owner); + + spin_unlock(&clp->cl_lock); + return ret; +} + +static struct nfs4_ol_stateid * +find_lock_stateid(const struct nfs4_lockowner *lo, + const struct nfs4_ol_stateid *ost) +{ + struct nfs4_ol_stateid *lst; + + lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); + + /* If ost is not hashed, ost->st_locks will not be valid */ + if (!nfs4_ol_stateid_unhashed(ost)) + list_for_each_entry(lst, &ost->st_locks, st_locks) { + if (lst->st_stateowner == &lo->lo_owner) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * +init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, + struct nfs4_file *fp, struct inode *inode, + struct nfs4_ol_stateid *open_stp) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *retstp; + + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: + spin_lock(&clp->cl_lock); + if (nfs4_ol_stateid_unhashed(open_stp)) + goto out_close; + retstp = find_lock_stateid(lo, open_stp); + if (retstp) + goto out_found; + refcount_inc(&stp->st_stid.sc_count); + stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); + get_nfs4_file(fp); + stp->st_stid.sc_file = fp; + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; + spin_lock(&fp->fi_lock); + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); + spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); + return stp; +out_found: + spin_unlock(&clp->cl_lock); + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + return retstp; +out_close: + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + return NULL; +} + +static struct nfs4_ol_stateid * +find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, + struct inode *inode, struct nfs4_ol_stateid *ost, + bool *new) +{ + struct nfs4_stid *ns = NULL; + struct nfs4_ol_stateid *lst; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *clp = oo->oo_owner.so_client; + + *new = false; + spin_lock(&clp->cl_lock); + lst = find_lock_stateid(lo, ost); + spin_unlock(&clp->cl_lock); + if (lst != NULL) { + if (nfsd4_lock_ol_stateid(lst) == nfs_ok) + goto out; + nfs4_put_stid(&lst->st_stid); + } + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); + if (ns == NULL) + return NULL; + + lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); + if (lst == openlockstateid(ns)) + *new = true; + else + nfs4_put_stid(ns); +out: + return lst; +} + +static int +check_lock_length(u64 offset, u64 length) +{ + return ((length == 0) || ((length != NFS4_MAX_UINT64) && + (length > ~offset))); +} + +static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) +{ + struct nfs4_file *fp = lock_stp->st_stid.sc_file; + + lockdep_assert_held(&fp->fi_lock); + + if (test_access(access, lock_stp)) + return; + __nfs4_file_get_access(fp, access); + set_access(access, lock_stp); +} + +static __be32 +lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, + struct nfs4_ol_stateid *ost, + struct nfsd4_lock *lock, + struct nfs4_ol_stateid **plst, bool *new) +{ + __be32 status; + struct nfs4_file *fi = ost->st_stid.sc_file; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *cl = oo->oo_owner.so_client; + struct inode *inode = d_inode(cstate->current_fh.fh_dentry); + struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *lst; + unsigned int strhashval; + + lo = find_lockowner_str(cl, &lock->lk_new_owner); + if (!lo) { + strhashval = ownerstr_hashval(&lock->lk_new_owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + } else { + /* with an existing lockowner, seqids must be the same */ + status = nfserr_bad_seqid; + if (!cstate->minorversion && + lock->lk_new_lock_seqid != lo->lo_owner.so_seqid) + goto out; + } + + lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (lst == NULL) { + status = nfserr_jukebox; + goto out; + } + + status = nfs_ok; + *plst = lst; +out: + nfs4_put_stateowner(&lo->lo_owner); + return status; +} + +/* + * LOCK operation + */ +__be32 +nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_lock *lock = &u->lock; + struct nfs4_openowner *open_sop = NULL; + struct nfs4_lockowner *lock_sop = NULL; + struct nfs4_ol_stateid *lock_stp = NULL; + struct nfs4_ol_stateid *open_stp = NULL; + struct nfs4_file *fp; + struct nfsd_file *nf = NULL; + struct nfsd4_blocked_lock *nbl = NULL; + struct file_lock *file_lock = NULL; + struct file_lock *conflock = NULL; + __be32 status = 0; + int lkflg; + int err; + bool new = false; + unsigned char fl_type; + unsigned int fl_flags = FL_POSIX; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", + (long long) lock->lk_offset, + (long long) lock->lk_length); + + if (check_lock_length(lock->lk_offset, lock->lk_length)) + return nfserr_inval; + + if ((status = fh_verify(rqstp, &cstate->current_fh, + S_IFREG, NFSD_MAY_LOCK))) { + dprintk("NFSD: nfsd4_lock: permission denied!\n"); + return status; + } + + if (lock->lk_is_new) { + if (nfsd4_has_session(cstate)) + /* See rfc 5661 18.10.3: given clientid is ignored: */ + memcpy(&lock->lk_new_clientid, + &cstate->session->se_client->cl_clientid, + sizeof(clientid_t)); + + status = nfserr_stale_clientid; + if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) + goto out; + + /* validate and update open stateid and open seqid */ + status = nfs4_preprocess_confirmed_seqid_op(cstate, + lock->lk_new_open_seqid, + &lock->lk_new_open_stateid, + &open_stp, nn); + if (status) + goto out; + mutex_unlock(&open_stp->st_mutex); + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, + &lock->lk_new_clientid)) + goto out; + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new); + } else { + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + NFS4_LOCK_STID, &lock_stp, nn); + } + if (status) + goto out; + lock_sop = lockowner(lock_stp->st_stateowner); + + lkflg = setlkflg(lock->lk_type); + status = nfs4_check_openmode(lock_stp, lkflg); + if (status) + goto out; + + status = nfserr_grace; + if (locks_in_grace(net) && !lock->lk_reclaim) + goto out; + status = nfserr_no_grace; + if (!locks_in_grace(net) && lock->lk_reclaim) + goto out; + + fp = lock_stp->st_stid.sc_file; + switch (lock->lk_type) { + case NFS4_READW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + fallthrough; + case NFS4_READ_LT: + spin_lock(&fp->fi_lock); + nf = find_readable_file_locked(fp); + if (nf) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); + spin_unlock(&fp->fi_lock); + fl_type = F_RDLCK; + break; + case NFS4_WRITEW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + fallthrough; + case NFS4_WRITE_LT: + spin_lock(&fp->fi_lock); + nf = find_writeable_file_locked(fp); + if (nf) + get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); + spin_unlock(&fp->fi_lock); + fl_type = F_WRLCK; + break; + default: + status = nfserr_inval; + goto out; + } + + if (!nf) { + status = nfserr_openmode; + goto out; + } + + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); + if (!nbl) { + dprintk("NFSD: %s: unable to allocate block!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + file_lock = &nbl->nbl_lock; + file_lock->fl_type = fl_type; + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); + file_lock->fl_pid = current->tgid; + file_lock->fl_file = nf->nf_file; + file_lock->fl_flags = fl_flags; + file_lock->fl_lmops = &nfsd_posix_mng_ops; + file_lock->fl_start = lock->lk_offset; + file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); + nfs4_transform_lock_offset(file_lock); + + conflock = locks_alloc_lock(); + if (!conflock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + if (fl_flags & FL_SLEEP) { + nbl->nbl_time = ktime_get_boottime_seconds(); + spin_lock(&nn->blocked_locks_lock); + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + spin_unlock(&nn->blocked_locks_lock); + } + + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); + switch (err) { + case 0: /* success! */ + nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); + status = 0; + if (lock->lk_reclaim) + nn->somebody_reclaimed = true; + break; + case FILE_LOCK_DEFERRED: + nbl = NULL; + fallthrough; + case -EAGAIN: /* conflock holds conflicting lock */ + status = nfserr_denied; + dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); + nfs4_set_lock_denied(conflock, &lock->lk_denied); + break; + case -EDEADLK: + status = nfserr_deadlock; + break; + default: + dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); + status = nfserrno(err); + break; + } +out: + if (nbl) { + /* dequeue it if we queued it before */ + if (fl_flags & FL_SLEEP) { + spin_lock(&nn->blocked_locks_lock); + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + spin_unlock(&nn->blocked_locks_lock); + } + free_blocked_lock(nbl); + } + if (nf) + nfsd_file_put(nf); + if (lock_stp) { + /* Bump seqid manually if the 4.0 replay owner is openowner */ + if (cstate->replay_owner && + cstate->replay_owner != &lock_sop->lo_owner && + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. + */ + if (status && new) + release_lock_stateid(lock_stp); + + mutex_unlock(&lock_stp->st_mutex); + + nfs4_put_stid(&lock_stp->st_stid); + } + if (open_stp) + nfs4_put_stid(&open_stp->st_stid); + nfsd4_bump_seqid(cstate, status); + if (conflock) + locks_free_lock(conflock); + return status; +} + +/* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. + */ +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct nfsd_file *nf; + __be32 err; + + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + if (err) + return err; + fh_lock(fhp); /* to block new leases till after test_lock: */ + err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode, + NFSD_MAY_READ)); + if (err) + goto out; + lock->fl_file = nf->nf_file; + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->fl_file = NULL; +out: + fh_unlock(fhp); + nfsd_file_put(nf); + return err; +} + +/* + * LOCKT operation + */ +__be32 +nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + struct file_lock *file_lock = NULL; + struct nfs4_lockowner *lo = NULL; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + if (check_lock_length(lockt->lt_offset, lockt->lt_length)) + return nfserr_inval; + + if (!nfsd4_has_session(cstate)) { + status = lookup_clientid(&lockt->lt_clientid, cstate, nn, + false); + if (status) + goto out; + } + + if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + goto out; + + file_lock = locks_alloc_lock(); + if (!file_lock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + switch (lockt->lt_type) { + case NFS4_READ_LT: + case NFS4_READW_LT: + file_lock->fl_type = F_RDLCK; + break; + case NFS4_WRITE_LT: + case NFS4_WRITEW_LT: + file_lock->fl_type = F_WRLCK; + break; + default: + dprintk("NFSD: nfs4_lockt: bad lock type!\n"); + status = nfserr_inval; + goto out; + } + + lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); + if (lo) + file_lock->fl_owner = (fl_owner_t)lo; + file_lock->fl_pid = current->tgid; + file_lock->fl_flags = FL_POSIX; + + file_lock->fl_start = lockt->lt_offset; + file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); + + nfs4_transform_lock_offset(file_lock); + + status = nfsd_test_lock(rqstp, &cstate->current_fh, file_lock); + if (status) + goto out; + + if (file_lock->fl_type != F_UNLCK) { + status = nfserr_denied; + nfs4_set_lock_denied(file_lock, &lockt->lt_denied); + } +out: + if (lo) + nfs4_put_stateowner(&lo->lo_owner); + if (file_lock) + locks_free_lock(file_lock); + return status; +} + +__be32 +nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_locku *locku = &u->locku; + struct nfs4_ol_stateid *stp; + struct nfsd_file *nf = NULL; + struct file_lock *file_lock = NULL; + __be32 status; + int err; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", + (long long) locku->lu_offset, + (long long) locku->lu_length); + + if (check_lock_length(locku->lu_offset, locku->lu_length)) + return nfserr_inval; + + status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, + &locku->lu_stateid, NFS4_LOCK_STID, + &stp, nn); + if (status) + goto out; + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { + status = nfserr_lock_range; + goto put_stateid; + } + file_lock = locks_alloc_lock(); + if (!file_lock) { + dprintk("NFSD: %s: unable to allocate lock!\n", __func__); + status = nfserr_jukebox; + goto put_file; + } + + file_lock->fl_type = F_UNLCK; + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); + file_lock->fl_pid = current->tgid; + file_lock->fl_file = nf->nf_file; + file_lock->fl_flags = FL_POSIX; + file_lock->fl_lmops = &nfsd_posix_mng_ops; + file_lock->fl_start = locku->lu_offset; + + file_lock->fl_end = last_byte_offset(locku->lu_offset, + locku->lu_length); + nfs4_transform_lock_offset(file_lock); + + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); + if (err) { + dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); + goto out_nfserr; + } + nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); +put_file: + nfsd_file_put(nf); +put_stateid: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(&stp->st_stid); +out: + nfsd4_bump_seqid(cstate, status); + if (file_lock) + locks_free_lock(file_lock); + return status; + +out_nfserr: + status = nfserrno(err); + goto put_file; +} + +/* + * returns + * true: locks held by lockowner + * false: no locks held by lockowner + */ +static bool +check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) +{ + struct file_lock *fl; + int status = false; + struct nfsd_file *nf = find_any_file(fp); + struct inode *inode; + struct file_lock_context *flctx; + + if (!nf) { + /* Any valid lock stateid should have some sort of access */ + WARN_ON_ONCE(1); + return status; + } + + inode = locks_inode(nf->nf_file); + flctx = inode->i_flctx; + + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner == (fl_owner_t)lowner) { + status = true; + break; + } + } + spin_unlock(&flctx->flc_lock); + } + nfsd_file_put(nf); + return status; +} + +__be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_stateowner *sop; + struct nfs4_lockowner *lo = NULL; + struct nfs4_ol_stateid *stp; + struct xdr_netobj *owner = &rlockowner->rl_owner; + unsigned int hashval = ownerstr_hashval(owner); + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_client *clp; + LIST_HEAD (reaplist); + + dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", + clid->cl_boot, clid->cl_id); + + status = lookup_clientid(clid, cstate, nn, false); + if (status) + return status; + + clp = cstate->clp; + /* Find the matching lock stateowner */ + spin_lock(&clp->cl_lock); + list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], + so_strhash) { + + if (sop->so_is_open_owner || !same_owner_str(sop, owner)) + continue; + + if (atomic_read(&sop->so_count) != 1) { + spin_unlock(&clp->cl_lock); + return nfserr_locks_held; + } + + lo = lockowner(sop); + nfs4_get_stateowner(sop); + break; + } + if (!lo) { + spin_unlock(&clp->cl_lock); + return status; + } + + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, + st_perstateowner); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, &reaplist); + } + spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); + remove_blocked_locks(lo); + nfs4_put_stateowner(&lo->lo_owner); + + return status; +} + +static inline struct nfs4_client_reclaim * +alloc_reclaim(void) +{ + return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); +} + +bool +nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) +{ + struct nfs4_client_reclaim *crp; + + crp = nfsd4_find_reclaim_client(name, nn); + return (crp && crp->cr_clp); +} + +/* + * failure => all reset bets are off, nfserr_no_grace... + * + * The caller is responsible for freeing name.data if NULL is returned (it + * will be freed in nfs4_remove_reclaim_record in the normal case). + */ +struct nfs4_client_reclaim * +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp; + + crp = alloc_reclaim(); + if (crp) { + strhashval = clientstr_hashval(name); + INIT_LIST_HEAD(&crp->cr_strhash); + list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); + crp->cr_name.data = name.data; + crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; + crp->cr_clp = NULL; + nn->reclaim_str_hashtbl_size++; + } + return crp; +} + +void +nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) +{ + list_del(&crp->cr_strhash); + kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); + kfree(crp); + nn->reclaim_str_hashtbl_size--; +} + +void +nfs4_release_reclaim(struct nfsd_net *nn) +{ + struct nfs4_client_reclaim *crp = NULL; + int i; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->reclaim_str_hashtbl[i])) { + crp = list_entry(nn->reclaim_str_hashtbl[i].next, + struct nfs4_client_reclaim, cr_strhash); + nfs4_remove_reclaim_record(crp, nn); + } + } + WARN_ON_ONCE(nn->reclaim_str_hashtbl_size); +} + +/* + * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ +struct nfs4_client_reclaim * +nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) +{ + unsigned int strhashval; + struct nfs4_client_reclaim *crp = NULL; + + strhashval = clientstr_hashval(name); + list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { + if (compare_blob(&crp->cr_name, &name) == 0) { + return crp; + } + } + return NULL; +} + +/* +* Called from OPEN. Look for clientid in reclaim list. +*/ +__be32 +nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ + __be32 status; + + /* find clientid in conf_id_hashtbl */ + status = lookup_clientid(clid, cstate, nn, false); + if (status) + return nfserr_reclaim_bad; + + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + return nfserr_no_grace; + + if (nfsd4_client_record_check(cstate->clp)) + return nfserr_reclaim_bad; + + return nfs_ok; +} + +/* + * Since the lifetime of a delegation isn't limited to that of an open, a + * client may quite reasonably hang on to a delegation as long as it has + * the inode cached. This becomes an obvious problem the first time a + * client's inode cache approaches the size of the server's total memory. + * + * For now we avoid this problem by imposing a hard limit on the number + * of delegations, which varies according to the server's memory size. + */ +static void +set_max_delegations(void) +{ + /* + * Allow at most 4 delegations per megabyte of RAM. Quick + * estimates suggest that in the worst case (where every delegation + * is for a different inode), a delegation could take about 1.5K, + * giving a worst case usage of about 6% of memory. + */ + max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); +} + +static int nfs4_state_create_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->conf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->conf_id_hashtbl) + goto err; + nn->unconf_id_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->unconf_id_hashtbl) + goto err_unconf_id; + nn->sessionid_hashtbl = kmalloc_array(SESSION_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->sessionid_hashtbl) + goto err_sessionid; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); + INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); + } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); + nn->conf_name_tree = RB_ROOT; + nn->unconf_name_tree = RB_ROOT; + nn->boot_time = ktime_get_real_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); + INIT_LIST_HEAD(&nn->client_lru); + INIT_LIST_HEAD(&nn->close_lru); + INIT_LIST_HEAD(&nn->del_recall_lru); + spin_lock_init(&nn->client_lock); + spin_lock_init(&nn->s2s_cp_lock); + idr_init(&nn->s2s_cp_stateids); + + spin_lock_init(&nn->blocked_locks_lock); + INIT_LIST_HEAD(&nn->blocked_locks_lru); + + INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + get_net(net); + + return 0; + +err_sessionid: + kfree(nn->unconf_id_hashtbl); +err_unconf_id: + kfree(nn->conf_id_hashtbl); +err: + return -ENOMEM; +} + +static void +nfs4_state_destroy_net(struct net *net) +{ + int i; + struct nfs4_client *clp = NULL; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->conf_id_hashtbl[i])) { + clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } + } + + WARN_ON(!list_empty(&nn->blocked_locks_lru)); + + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->unconf_id_hashtbl[i])) { + clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } + } + + kfree(nn->sessionid_hashtbl); + kfree(nn->unconf_id_hashtbl); + kfree(nn->conf_id_hashtbl); + put_net(net); +} + +int +nfs4_state_start_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int ret; + + ret = nfs4_state_create_net(net); + if (ret) + return ret; + locks_start_grace(net, &nn->nfsd4_manager); + nfsd4_client_tracking_init(net); + if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) + goto skip_grace; + printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n", + nn->nfsd4_grace, net->ns.inum); + trace_nfsd_grace_start(nn); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); + return 0; + +skip_grace: + printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n", + net->ns.inum); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ); + nfsd4_end_grace(nn); + return 0; +} + +/* initialization to perform when the nfsd service is started: */ + +int +nfs4_state_start(void) +{ + int ret; + + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); + if (laundry_wq == NULL) { + ret = -ENOMEM; + goto out; + } + ret = nfsd4_create_callback_queue(); + if (ret) + goto out_free_laundry; + + set_max_delegations(); + return 0; + +out_free_laundry: + destroy_workqueue(laundry_wq); +out: + return ret; +} + +void +nfs4_state_shutdown_net(struct net *net) +{ + struct nfs4_delegation *dp = NULL; + struct list_head *pos, *next, reaplist; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + cancel_delayed_work_sync(&nn->laundromat_work); + locks_end_grace(&nn->nfsd4_manager); + + INIT_LIST_HEAD(&reaplist); + spin_lock(&state_lock); + list_for_each_safe(pos, next, &nn->del_recall_lru) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + WARN_ON(!unhash_delegation_locked(dp)); + list_add(&dp->dl_recall_lru, &reaplist); + } + spin_unlock(&state_lock); + list_for_each_safe(pos, next, &reaplist) { + dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); + list_del_init(&dp->dl_recall_lru); + destroy_unhashed_deleg(dp); + } + + nfsd4_client_tracking_exit(net); + nfs4_state_destroy_net(net); +} + +void +nfs4_state_shutdown(void) +{ + destroy_workqueue(laundry_wq); + nfsd4_destroy_callback_queue(); +} + +static void +get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG) && + CURRENT_STATEID(stateid)) + memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); +} + +static void +put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) +{ + if (cstate->minorversion) { + memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); + SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); + } +} + +void +clear_current_stateid(struct nfsd4_compound_state *cstate) +{ + CLEAR_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); +} + +/* + * functions to set current state id + */ +void +nfsd4_set_opendowngradestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + put_stateid(cstate, &u->open_downgrade.od_stateid); +} + +void +nfsd4_set_openstateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + put_stateid(cstate, &u->open.op_stateid); +} + +void +nfsd4_set_closestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + put_stateid(cstate, &u->close.cl_stateid); +} + +void +nfsd4_set_lockstateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + put_stateid(cstate, &u->lock.lk_resp_stateid); +} + +/* + * functions to consume current state id + */ + +void +nfsd4_get_opendowngradestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->open_downgrade.od_stateid); +} + +void +nfsd4_get_delegreturnstateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->delegreturn.dr_stateid); +} + +void +nfsd4_get_freestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->free_stateid.fr_stateid); +} + +void +nfsd4_get_setattrstateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->setattr.sa_stateid); +} + +void +nfsd4_get_closestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->close.cl_stateid); +} + +void +nfsd4_get_lockustateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->locku.lu_stateid); +} + +void +nfsd4_get_readstateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->read.rd_stateid); +} + +void +nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + get_stateid(cstate, &u->write.wr_stateid); +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c new file mode 100644 index 000000000..dbfa24cf3 --- /dev/null +++ b/fs/nfsd/nfs4xdr.c @@ -0,0 +1,5354 @@ +/* + * Server-side XDR for NFSv4 + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/utsname.h> +#include <linux/pagemap.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/addr.h> +#include <linux/xattr.h> +#include <uapi/linux/xattr.h> + +#include "idmap.h" +#include "acl.h" +#include "xdr4.h" +#include "vfs.h" +#include "state.h" +#include "cache.h" +#include "netns.h" +#include "pnfs.h" +#include "filecache.h" + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#include <linux/security.h> +#endif + + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +const u32 nfsd_suppattrs[3][3] = { + {NFSD4_SUPPORTED_ATTRS_WORD0, + NFSD4_SUPPORTED_ATTRS_WORD1, + NFSD4_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_1_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_2_SUPPORTED_ATTRS_WORD2}, +}; + +/* + * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing + * directory in order to indicate to the client that a filesystem boundary is present + * We use a fixed fsid for a referral + */ +#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL +#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL + +static __be32 +check_filename(char *str, int len) +{ + int i; + + if (len == 0) + return nfserr_inval; + if (isdotent(str, len)) + return nfserr_badname; + for (i = 0; i < len; i++) + if (str[i] == '/') + return nfserr_badname; + return 0; +} + +#define DECODE_HEAD \ + __be32 *p; \ + __be32 status +#define DECODE_TAIL \ + status = 0; \ +out: \ + return status; \ +xdr_error: \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + status = nfserr_bad_xdr; \ + goto out + +#define READMEM(x,nbytes) do { \ + x = (char *)p; \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define SAVEMEM(x,nbytes) do { \ + if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ + savemem(argp, p, nbytes) : \ + (char *)p)) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ + p += XDR_QUADLEN(nbytes); \ +} while (0) +#define COPYMEM(x,nbytes) do { \ + memcpy((x), p, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +} while (0) + +/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ +#define READ_BUF(nbytes) do { \ + if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ + p = argp->p; \ + argp->p += XDR_QUADLEN(nbytes); \ + } else if (!(p = read_buf(argp, nbytes))) { \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ + goto xdr_error; \ + } \ +} while (0) + +static void next_decode_page(struct nfsd4_compoundargs *argp) +{ + argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; + if (argp->pagelen < PAGE_SIZE) { + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); + argp->pagelen = 0; + } else { + argp->end = argp->p + (PAGE_SIZE>>2); + argp->pagelen -= PAGE_SIZE; + } +} + +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) +{ + /* We want more bytes than seem to be available. + * Maybe we need a new page, maybe we have just run out + */ + unsigned int avail = (char *)argp->end - (char *)argp->p; + __be32 *p; + + if (argp->pagelen == 0) { + struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; + + if (!argp->tail) { + argp->tail = true; + avail = vec->iov_len; + argp->p = vec->iov_base; + argp->end = vec->iov_base + avail; + } + + if (avail < nbytes) + return NULL; + + p = argp->p; + argp->p += XDR_QUADLEN(nbytes); + return p; + } + + if (avail + argp->pagelen < nbytes) + return NULL; + if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ + return NULL; + /* ok, we can do it with the current plus the next page */ + if (nbytes <= sizeof(argp->tmp)) + p = argp->tmp; + else { + kfree(argp->tmpp); + p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); + if (!p) + return NULL; + + } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. + */ + memcpy(p, argp->p, avail); + next_decode_page(argp); + memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); + argp->p += XDR_QUADLEN(nbytes - avail); + return p; +} + +static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) +{ + unsigned int this = (char *)argp->end - (char *)argp->p; + + return this + argp->pagelen; +} + +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + +/** + * svcxdr_tmpalloc - allocate memory to be freed after compound processing + * @argp: NFSv4 compound argument structure + * @len: length of buffer to allocate + * + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. + */ +static void * +svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) +{ + struct svcxdr_tmpbuf *tb; + + tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL); + if (!tb) + return NULL; + tb->next = argp->to_free; + argp->to_free = tb; + return tb->buf; +} + +/* + * For xdr strings that need to be passed to other kernel api's + * as null-terminated strings. + * + * Note null-terminating in place usually isn't safe since the + * buffer might end on a page boundary. + */ +static char * +svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) +{ + char *p = svcxdr_tmpalloc(argp, len + 1); + + if (!p) + return NULL; + memcpy(p, buf, len); + p[len] = '\0'; + return p; +} + +static __be32 +svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head, + struct page ***pagelist, u32 buflen) +{ + int avail; + int len; + int pages; + + /* Sorry .. no magic macros for this.. * + * READ_BUF(write->wr_buflen); + * SAVEMEM(write->wr_buf, write->wr_buflen); + */ + avail = (char *)argp->end - (char *)argp->p; + if (avail + argp->pagelen < buflen) { + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); + return nfserr_bad_xdr; + } + head->iov_base = argp->p; + head->iov_len = avail; + *pagelist = argp->pagelist; + + len = XDR_QUADLEN(buflen) << 2; + if (len >= avail) { + len -= avail; + + pages = len >> PAGE_SHIFT; + argp->pagelist += pages; + argp->pagelen -= pages * PAGE_SIZE; + len -= pages * PAGE_SIZE; + + next_decode_page(argp); + } + argp->p += XDR_QUADLEN(len); + + return 0; +} + +/** + * savemem - duplicate a chunk of memory for later processing + * @argp: NFSv4 compound argument structure to be freed with + * @p: pointer to be duplicated + * @nbytes: length to be duplicated + * + * Returns a pointer to a copy of @nbytes bytes of memory at @p + * that are preserved until processing of the NFSv4 compound + * operation described by @argp finishes. + */ +static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +{ + void *ret; + + ret = svcxdr_tmpalloc(argp, nbytes); + if (!ret) + return NULL; + memcpy(ret, p, nbytes); + return ret; +} + +static __be32 +nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv) +{ + DECODE_HEAD; + + READ_BUF(12); + p = xdr_decode_hyper(p, &tv->tv_sec); + tv->tv_nsec = be32_to_cpup(p++); + if (tv->tv_nsec >= (u32)1000000000) + return nfserr_inval; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +{ + u32 bmlen; + DECODE_HEAD; + + bmval[0] = 0; + bmval[1] = 0; + bmval[2] = 0; + + READ_BUF(4); + bmlen = be32_to_cpup(p++); + if (bmlen > 1000) + goto xdr_error; + + READ_BUF(bmlen << 2); + if (bmlen > 0) + bmval[0] = be32_to_cpup(p++); + if (bmlen > 1) + bmval[1] = be32_to_cpup(p++); + if (bmlen > 2) + bmval[2] = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label, int *umask) +{ + int expected_len, len = 0; + u32 dummy32; + char *buf; + + DECODE_HEAD; + iattr->ia_valid = 0; + if ((status = nfsd4_decode_bitmap(argp, bmval))) + return status; + + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) { + if (nfsd_attrs_supported(argp->minorversion, bmval)) + return nfserr_inval; + return nfserr_attrnotsupp; + } + + READ_BUF(4); + expected_len = be32_to_cpup(p++); + + if (bmval[0] & FATTR4_WORD0_SIZE) { + READ_BUF(8); + len += 8; + p = xdr_decode_hyper(p, &iattr->ia_size); + iattr->ia_valid |= ATTR_SIZE; + } + if (bmval[0] & FATTR4_WORD0_ACL) { + u32 nace; + struct nfs4_ace *ace; + + READ_BUF(4); len += 4; + nace = be32_to_cpup(p++); + + if (nace > compoundargs_bytes_left(argp)/20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ + return nfserr_fbig; + + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); + if (*acl == NULL) + return nfserr_jukebox; + + (*acl)->naces = nace; + for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { + READ_BUF(16); len += 16; + ace->type = be32_to_cpup(p++); + ace->flag = be32_to_cpup(p++); + ace->access_mask = be32_to_cpup(p++); + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += XDR_QUADLEN(dummy32) << 2; + READMEM(buf, dummy32); + ace->whotype = nfs4_acl_get_whotype(buf, dummy32); + status = nfs_ok; + if (ace->whotype != NFS4_ACL_WHO_NAMED) + ; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + buf, dummy32, &ace->who_gid); + else + status = nfsd_map_name_to_uid(argp->rqstp, + buf, dummy32, &ace->who_uid); + if (status) + return status; + } + } else + *acl = NULL; + if (bmval[1] & FATTR4_WORD1_MODE) { + READ_BUF(4); + len += 4; + iattr->ia_mode = be32_to_cpup(p++); + iattr->ia_mode &= (S_IFMT | S_IALLUGO); + iattr->ia_valid |= ATTR_MODE; + } + if (bmval[1] & FATTR4_WORD1_OWNER) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; + iattr->ia_valid |= ATTR_UID; + } + if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; + iattr->ia_valid |= ATTR_GID; + } + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + len += 12; + status = nfsd4_decode_time(argp, &iattr->ia_atime); + if (status) + return status; + iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_ATIME; + break; + default: + goto xdr_error; + } + } + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + switch (dummy32) { + case NFS4_SET_TO_CLIENT_TIME: + len += 12; + status = nfsd4_decode_time(argp, &iattr->ia_mtime); + if (status) + return status; + iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + break; + case NFS4_SET_TO_SERVER_TIME: + iattr->ia_valid |= ATTR_MTIME; + break; + default: + goto xdr_error; + } + } + + label->len = 0; + if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) && + bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ + READ_BUF(4); + len += 4; + dummy32 = be32_to_cpup(p++); + READ_BUF(dummy32); + if (dummy32 > NFS4_MAXLABELLEN) + return nfserr_badlabel; + len += (XDR_QUADLEN(dummy32) << 2); + READMEM(buf, dummy32); + label->len = dummy32; + label->data = svcxdr_dupstr(argp, buf, dummy32); + if (!label->data) + return nfserr_jukebox; + } + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + if (!umask) + goto xdr_error; + READ_BUF(8); + len += 8; + dummy32 = be32_to_cpup(p++); + iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); + dummy32 = be32_to_cpup(p++); + *umask = dummy32 & S_IRWXUGO; + iattr->ia_valid |= ATTR_MODE; + } + if (len != expected_len) + goto xdr_error; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + sid->si_generation = be32_to_cpup(p++); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +{ + DECODE_HEAD; + + READ_BUF(4); + access->ac_req_access = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + DECODE_HEAD; + struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); + u32 dummy, uid, gid; + char *machine_name; + int i; + int nr_secflavs; + + /* callback_sec_params4 */ + READ_BUF(4); + nr_secflavs = be32_to_cpup(p++); + if (nr_secflavs) + cbs->flavor = (u32)(-1); + else + /* Is this legal? Be generous, take it to mean AUTH_NONE: */ + cbs->flavor = 0; + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + if (cbs->flavor == (u32)(-1)) + cbs->flavor = RPC_AUTH_NULL; + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + dummy = be32_to_cpup(p++); + + /* machine name */ + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + uid = be32_to_cpup(p++); + gid = be32_to_cpup(p++); + + /* more gids */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + if (cbs->flavor == (u32)(-1)) { + kuid_t kuid = make_kuid(userns, uid); + kgid_t kgid = make_kgid(userns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid" + "uid or gid ignoring!\n"); + } + } + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + dummy = be32_to_cpup(p++); + /* gcbp_handle_from_server */ + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +{ + DECODE_HEAD; + + READ_BUF(4); + bc->bc_cb_program = be32_to_cpup(p++); + nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); + COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); + bcts->dir = be32_to_cpup(p++); + /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker + * could help us figure out we should be using it. */ + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +{ + DECODE_HEAD; + + READ_BUF(4); + close->cl_seqid = be32_to_cpup(p++); + return nfsd4_decode_stateid(argp, &close->cl_stateid); + + DECODE_TAIL; +} + + +static __be32 +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +{ + DECODE_HEAD; + + READ_BUF(12); + p = xdr_decode_hyper(p, &commit->co_offset); + commit->co_count = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +{ + DECODE_HEAD; + + READ_BUF(4); + create->cr_type = be32_to_cpup(p++); + switch (create->cr_type) { + case NF4LNK: + READ_BUF(4); + create->cr_datalen = be32_to_cpup(p++); + READ_BUF(create->cr_datalen); + create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); + if (!create->cr_data) + return nfserr_jukebox; + break; + case NF4BLK: + case NF4CHR: + READ_BUF(8); + create->cr_specdata1 = be32_to_cpup(p++); + create->cr_specdata2 = be32_to_cpup(p++); + break; + case NF4SOCK: + case NF4FIFO: + case NF4DIR: + default: + break; + } + + READ_BUF(4); + create->cr_namelen = be32_to_cpup(p++); + READ_BUF(create->cr_namelen); + SAVEMEM(create->cr_name, create->cr_namelen); + if ((status = check_filename(create->cr_name, create->cr_namelen))) + return status; + + status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, + &create->cr_acl, &create->cr_label, + &create->cr_umask); + if (status) + goto out; + + DECODE_TAIL; +} + +static inline __be32 +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +{ + return nfsd4_decode_stateid(argp, &dr->dr_stateid); +} + +static inline __be32 +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +{ + return nfsd4_decode_bitmap(argp, getattr->ga_bmval); +} + +static __be32 +nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +{ + DECODE_HEAD; + + READ_BUF(4); + link->li_namelen = be32_to_cpup(p++); + READ_BUF(link->li_namelen); + SAVEMEM(link->li_name, link->li_namelen); + if ((status = check_filename(link->li_name, link->li_namelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + DECODE_HEAD; + + /* + * type, reclaim(boolean), offset, length, new_lock_owner(boolean) + */ + READ_BUF(28); + lock->lk_type = be32_to_cpup(p++); + if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) + goto xdr_error; + lock->lk_reclaim = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lock->lk_offset); + p = xdr_decode_hyper(p, &lock->lk_length); + lock->lk_is_new = be32_to_cpup(p++); + + if (lock->lk_is_new) { + READ_BUF(4); + lock->lk_new_open_seqid = be32_to_cpup(p++); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); + lock->lk_new_lock_seqid = be32_to_cpup(p++); + COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); + lock->lk_new_owner.len = be32_to_cpup(p++); + READ_BUF(lock->lk_new_owner.len); + READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); + } else { + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); + lock->lk_old_lock_seqid = be32_to_cpup(p++); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +{ + DECODE_HEAD; + + READ_BUF(32); + lockt->lt_type = be32_to_cpup(p++); + if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + goto xdr_error; + p = xdr_decode_hyper(p, &lockt->lt_offset); + p = xdr_decode_hyper(p, &lockt->lt_length); + COPYMEM(&lockt->lt_clientid, 8); + lockt->lt_owner.len = be32_to_cpup(p++); + READ_BUF(lockt->lt_owner.len); + READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +{ + DECODE_HEAD; + + READ_BUF(8); + locku->lu_type = be32_to_cpup(p++); + if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) + goto xdr_error; + locku->lu_seqid = be32_to_cpup(p++); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); + p = xdr_decode_hyper(p, &locku->lu_offset); + p = xdr_decode_hyper(p, &locku->lu_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +{ + DECODE_HEAD; + + READ_BUF(4); + lookup->lo_len = be32_to_cpup(p++); + READ_BUF(lookup->lo_len); + SAVEMEM(lookup->lo_name, lookup->lo_len); + if ((status = check_filename(lookup->lo_name, lookup->lo_len))) + return status; + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) +{ + __be32 *p; + u32 w; + + READ_BUF(4); + w = be32_to_cpup(p++); + *share_access = w & NFS4_SHARE_ACCESS_MASK; + *deleg_want = w & NFS4_SHARE_WANT_MASK; + if (deleg_when) + *deleg_when = w & NFS4_SHARE_WHEN_MASK; + + switch (w & NFS4_SHARE_ACCESS_MASK) { + case NFS4_SHARE_ACCESS_READ: + case NFS4_SHARE_ACCESS_WRITE: + case NFS4_SHARE_ACCESS_BOTH: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_ACCESS_MASK; + if (!w) + return nfs_ok; + if (!argp->minorversion) + return nfserr_bad_xdr; + switch (w & NFS4_SHARE_WANT_MASK) { + case NFS4_SHARE_WANT_NO_PREFERENCE: + case NFS4_SHARE_WANT_READ_DELEG: + case NFS4_SHARE_WANT_WRITE_DELEG: + case NFS4_SHARE_WANT_ANY_DELEG: + case NFS4_SHARE_WANT_NO_DELEG: + case NFS4_SHARE_WANT_CANCEL: + break; + default: + return nfserr_bad_xdr; + } + w &= ~NFS4_SHARE_WANT_MASK; + if (!w) + return nfs_ok; + + if (!deleg_when) /* open_downgrade */ + return nfserr_inval; + switch (w) { + case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: + case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: + case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | + NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): + return nfs_ok; + } +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) +{ + __be32 *p; + + READ_BUF(4); + *x = be32_to_cpup(p++); + /* Note: unlinke access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) + return nfserr_bad_xdr; + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + + READ_BUF(4); + o->len = be32_to_cpup(p++); + + if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + + READ_BUF(o->len); + SAVEMEM(o->data, o->len); + return nfs_ok; +xdr_error: + return nfserr_bad_xdr; +} + +static __be32 +nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + DECODE_HEAD; + u32 dummy; + + memset(open->op_bmval, 0, sizeof(open->op_bmval)); + open->op_iattr.ia_valid = 0; + open->op_openowner = NULL; + + open->op_xdr_error = 0; + /* seqid, share_access, share_deny, clientid, ownerlen */ + READ_BUF(4); + open->op_seqid = be32_to_cpup(p++); + /* decode, yet ignore deleg_when until supported */ + status = nfsd4_decode_share_access(argp, &open->op_share_access, + &open->op_deleg_want, &dummy); + if (status) + goto xdr_error; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + goto xdr_error; + READ_BUF(sizeof(clientid_t)); + COPYMEM(&open->op_clientid, sizeof(clientid_t)); + status = nfsd4_decode_opaque(argp, &open->op_owner); + if (status) + goto xdr_error; + READ_BUF(4); + open->op_create = be32_to_cpup(p++); + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + READ_BUF(4); + open->op_createmode = be32_to_cpup(p++); + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl, &open->op_label, + &open->op_umask); + if (status) + goto out; + break; + case NFS4_CREATE_EXCLUSIVE: + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); + status = nfsd4_decode_fattr(argp, open->op_bmval, + &open->op_iattr, &open->op_acl, &open->op_label, + &open->op_umask); + if (status) + goto out; + break; + default: + goto xdr_error; + } + break; + default: + goto xdr_error; + } + + /* open_claim */ + READ_BUF(4); + open->op_claim_type = be32_to_cpup(p++); + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + READ_BUF(4); + open->op_fname.len = be32_to_cpup(p++); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + return status; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + READ_BUF(4); + open->op_delegate_type = be32_to_cpup(p++); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); + open->op_fname.len = be32_to_cpup(p++); + READ_BUF(open->op_fname.len); + SAVEMEM(open->op_fname.data, open->op_fname.len); + if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + return status; + break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + goto xdr_error; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + goto xdr_error; + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + break; + default: + goto xdr_error; + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); + open_conf->oc_seqid = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(4); + open_down->od_seqid = be32_to_cpup(p++); + status = nfsd4_decode_share_access(argp, &open_down->od_share_access, + &open_down->od_deleg_want, NULL); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +{ + DECODE_HEAD; + + READ_BUF(4); + putfh->pf_fhlen = be32_to_cpup(p++); + if (putfh->pf_fhlen > NFS4_FHSIZE) + goto xdr_error; + READ_BUF(putfh->pf_fhlen); + SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) +{ + if (argp->minorversion == 0) + return nfs_ok; + return nfserr_notsupp; +} + +static __be32 +nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); + p = xdr_decode_hyper(p, &read->rd_offset); + read->rd_length = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +{ + DECODE_HEAD; + + READ_BUF(24); + p = xdr_decode_hyper(p, &readdir->rd_cookie); + COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); + readdir->rd_dircount = be32_to_cpup(p++); + readdir->rd_maxcount = be32_to_cpup(p++); + if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) + goto out; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +{ + DECODE_HEAD; + + READ_BUF(4); + remove->rm_namelen = be32_to_cpup(p++); + READ_BUF(remove->rm_namelen); + SAVEMEM(remove->rm_name, remove->rm_namelen); + if ((status = check_filename(remove->rm_name, remove->rm_namelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +{ + DECODE_HEAD; + + READ_BUF(4); + rename->rn_snamelen = be32_to_cpup(p++); + READ_BUF(rename->rn_snamelen); + SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ_BUF(4); + rename->rn_tnamelen = be32_to_cpup(p++); + READ_BUF(rename->rn_tnamelen); + SAVEMEM(rename->rn_tname, rename->rn_tnamelen); + if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) + return status; + if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen))) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(sizeof(clientid_t)); + COPYMEM(clientid, sizeof(clientid_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo *secinfo) +{ + DECODE_HEAD; + + READ_BUF(4); + secinfo->si_namelen = be32_to_cpup(p++); + READ_BUF(secinfo->si_namelen); + SAVEMEM(secinfo->si_name, secinfo->si_namelen); + status = check_filename(secinfo->si_name, secinfo->si_namelen); + if (status) + return status; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + DECODE_HEAD; + + READ_BUF(4); + sin->sin_style = be32_to_cpup(p++); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +{ + __be32 status; + + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, + &setattr->sa_acl, &setattr->sa_label, NULL); +} + +static __be32 +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); + + status = nfsd4_decode_opaque(argp, &setclientid->se_name); + if (status) + return nfserr_bad_xdr; + READ_BUF(8); + setclientid->se_callback_prog = be32_to_cpup(p++); + setclientid->se_callback_netid_len = be32_to_cpup(p++); + READ_BUF(setclientid->se_callback_netid_len); + SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ_BUF(4); + setclientid->se_callback_addr_len = be32_to_cpup(p++); + + READ_BUF(setclientid->se_callback_addr_len); + SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ_BUF(4); + setclientid->se_callback_ident = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(8 + NFS4_VERIFIER_SIZE); + COPYMEM(&scd_c->sc_clientid, 8); + COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); + + DECODE_TAIL; +} + +/* Also used for NVERIFY */ +static __be32 +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +{ + DECODE_HEAD; + + if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) + goto out; + + /* For convenience's sake, we compare raw xdr'd attributes in + * nfsd4_proc_verify */ + + READ_BUF(4); + verify->ve_attrlen = be32_to_cpup(p++); + READ_BUF(verify->ve_attrlen); + SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); + p = xdr_decode_hyper(p, &write->wr_offset); + write->wr_stable_how = be32_to_cpup(p++); + if (write->wr_stable_how > NFS_FILE_SYNC) + goto xdr_error; + write->wr_buflen = be32_to_cpup(p++); + + status = svcxdr_construct_vector(argp, &write->wr_head, + &write->wr_pagelist, write->wr_buflen); + if (status) + return status; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +{ + DECODE_HEAD; + + if (argp->minorversion >= 1) + return nfserr_notsupp; + + READ_BUF(12); + COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); + rlockowner->rl_owner.len = be32_to_cpup(p++); + READ_BUF(rlockowner->rl_owner.len); + READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy, tmp; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return nfserr_bad_xdr; + + READ_BUF(4); + exid->flags = be32_to_cpup(p++); + + /* Ignore state_protect4_a */ + READ_BUF(4); + exid->spa_how = be32_to_cpup(p++); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + status = nfsd4_decode_bitmap(argp, + exid->spo_must_enforce); + if (status) + goto out; + /* spo_must_allow */ + status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); + if (status) + goto out; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + tmp = be32_to_cpup(p++); + while (tmp--) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ssp_encr_algs<> */ + READ_BUF(4); + tmp = be32_to_cpup(p++); + while (tmp--) { + READ_BUF(4); + dummy = be32_to_cpup(p++); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + } + + /* ignore ssp_window and ssp_num_gss_handles: */ + READ_BUF(8); + break; + default: + goto xdr_error; + } + + READ_BUF(4); /* nfs_impl_id4 array length */ + dummy = be32_to_cpup(p++); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + status = nfsd4_decode_opaque(argp, &exid->nii_domain); + if (status) + goto xdr_error; + + /* nii_name */ + status = nfsd4_decode_opaque(argp, &exid->nii_name); + if (status) + goto xdr_error; + + /* nii_date */ + status = nfsd4_decode_time(argp, &exid->nii_time); + if (status) + goto xdr_error; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + sess->seqid = be32_to_cpup(p++); + sess->flags = be32_to_cpup(p++); + + /* Fore channel attrs */ + READ_BUF(28); + p++; /* headerpadsz is always 0 */ + sess->fore_channel.maxreq_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_cached = be32_to_cpup(p++); + sess->fore_channel.maxops = be32_to_cpup(p++); + sess->fore_channel.maxreqs = be32_to_cpup(p++); + sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + sess->fore_channel.rdma_attrs = be32_to_cpup(p++); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + p++; /* headerpadsz is always 0 */ + sess->back_channel.maxreq_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_cached = be32_to_cpup(p++); + sess->back_channel.maxops = be32_to_cpup(p++); + sess->back_channel.maxreqs = be32_to_cpup(p++); + sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + sess->back_channel.rdma_attrs = be32_to_cpup(p++); + } else if (sess->back_channel.nr_rdma_attrs > 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(4); + sess->callback_prog = be32_to_cpup(p++); + nfsd4_decode_cb_sec(argp, &sess->cb_sec); + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, + struct nfsd4_free_stateid *free_stateid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); + COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +{ + int i; + __be32 *p, status; + struct nfsd4_test_stateid_id *stateid; + + READ_BUF(4); + test_stateid->ts_num_ids = ntohl(*p++); + + INIT_LIST_HEAD(&test_stateid->ts_stateid_list); + + for (i = 0; i < test_stateid->ts_num_ids; i++) { + stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); + if (!stateid) { + status = nfserrno(-ENOMEM); + goto out; + } + + INIT_LIST_HEAD(&stateid->ts_id_list); + list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); + + status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); + if (status) + goto out; + } + + status = 0; +out: + return status; +xdr_error: + dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); + status = nfserr_bad_xdr; + goto out; +} + +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +{ + DECODE_HEAD; + + READ_BUF(8); + COPYMEM(&dc->clientid, 8); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + rc->rca_one_fs = be32_to_cpup(p++); + + DECODE_TAIL; +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + if (num > 1000) + goto xdr_error; + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + + status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (status) + return status; + + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + + status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (status) + return status; + + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. + */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + + status = nfsd4_decode_stateid(argp, &lrp->lr_sid); + if (status) + return status; + + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + struct nfsd4_fallocate *fallocate) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + if (status) + return status; + + READ_BUF(16); + p = xdr_decode_hyper(p, &fallocate->falloc_offset); + xdr_decode_hyper(p, &fallocate->falloc_length); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8); + p = xdr_decode_hyper(p, &clone->cl_src_pos); + p = xdr_decode_hyper(p, &clone->cl_dst_pos); + p = xdr_decode_hyper(p, &clone->cl_count); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, + struct nl4_server *ns) +{ + DECODE_HEAD; + struct nfs42_netaddr *naddr; + + READ_BUF(4); + ns->nl4_type = be32_to_cpup(p++); + + /* currently support for 1 inter-server source server */ + switch (ns->nl4_type) { + case NL4_NETADDR: + naddr = &ns->u.nl4_addr; + + READ_BUF(4); + naddr->netid_len = be32_to_cpup(p++); + if (naddr->netid_len > RPCBIND_MAXNETIDLEN) + goto xdr_error; + + READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */ + COPYMEM(naddr->netid, naddr->netid_len); + + naddr->addr_len = be32_to_cpup(p++); + if (naddr->addr_len > RPCBIND_MAXUADDRLEN) + goto xdr_error; + + READ_BUF(naddr->addr_len); + COPYMEM(naddr->addr, naddr->addr_len); + break; + default: + goto xdr_error; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +{ + DECODE_HEAD; + struct nl4_server *ns_dummy; + int i, count; + + status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, ©->cp_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8 + 4 + 4 + 4); + p = xdr_decode_hyper(p, ©->cp_src_pos); + p = xdr_decode_hyper(p, ©->cp_dst_pos); + p = xdr_decode_hyper(p, ©->cp_count); + p++; /* ca_consecutive: we always do consecutive copies */ + copy->cp_synchronous = be32_to_cpup(p++); + + count = be32_to_cpup(p++); + + copy->cp_intra = false; + if (count == 0) { /* intra-server copy */ + copy->cp_intra = true; + goto intra; + } + + /* decode all the supplied server addresses but use first */ + status = nfsd4_decode_nl4_server(argp, ©->cp_src); + if (status) + return status; + + ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL); + if (ns_dummy == NULL) + return nfserrno(-ENOMEM); + for (i = 0; i < count - 1; i++) { + status = nfsd4_decode_nl4_server(argp, ns_dummy); + if (status) { + kfree(ns_dummy); + return status; + } + } + kfree(ns_dummy); +intra: + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, + struct nfsd4_offload_status *os) +{ + return nfsd4_decode_stateid(argp, &os->stateid); +} + +static __be32 +nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, + struct nfsd4_copy_notify *cn) +{ + __be32 status; + + status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid); + if (status) + return status; + return nfsd4_decode_nl4_server(argp, &cn->cpn_dst); +} + +static __be32 +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &seek->seek_stateid); + if (status) + return status; + + READ_BUF(8 + 4); + p = xdr_decode_hyper(p, &seek->seek_offset); + seek->seek_whence = be32_to_cpup(p); + + DECODE_TAIL; +} + +/* + * XDR data that is more than PAGE_SIZE in size is normally part of a + * read or write. However, the size of extended attributes is limited + * by the maximum request size, and then further limited by the underlying + * filesystem limits. This can exceed PAGE_SIZE (currently, XATTR_SIZE_MAX + * is 64k). Since there is no kvec- or page-based interface to xattrs, + * and we're not dealing with contiguous pages, we need to do some copying. + */ + +/* + * Decode data into buffer. Uses head and pages constructed by + * svcxdr_construct_vector. + */ +static __be32 +nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head, + struct page **pages, char **bufp, u32 buflen) +{ + char *tmp, *dp; + u32 len; + + if (buflen <= head->iov_len) { + /* + * We're in luck, the head has enough space. Just return + * the head, no need for copying. + */ + *bufp = head->iov_base; + return 0; + } + + tmp = svcxdr_tmpalloc(argp, buflen); + if (tmp == NULL) + return nfserr_jukebox; + + dp = tmp; + memcpy(dp, head->iov_base, head->iov_len); + buflen -= head->iov_len; + dp += head->iov_len; + + while (buflen > 0) { + len = min_t(u32, buflen, PAGE_SIZE); + memcpy(dp, page_address(*pages), len); + + buflen -= len; + dp += len; + pages++; + } + + *bufp = tmp; + return 0; +} + +/* + * Get a user extended attribute name from the XDR buffer. + * It will not have the "user." prefix, so prepend it. + * Lastly, check for nul characters in the name. + */ +static __be32 +nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) +{ + DECODE_HEAD; + char *name, *sp, *dp; + u32 namelen, cnt; + + READ_BUF(4); + namelen = be32_to_cpup(p++); + + if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) + return nfserr_nametoolong; + + if (namelen == 0) + goto xdr_error; + + READ_BUF(namelen); + + name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1); + if (!name) + return nfserr_jukebox; + + memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + + /* + * Copy the extended attribute name over while checking for 0 + * characters. + */ + sp = (char *)p; + dp = name + XATTR_USER_PREFIX_LEN; + cnt = namelen; + + while (cnt-- > 0) { + if (*sp == '\0') + goto xdr_error; + *dp++ = *sp++; + } + *dp = '\0'; + + *namep = name; + + DECODE_TAIL; +} + +/* + * A GETXATTR op request comes without a length specifier. We just set the + * maximum length for the reply based on XATTR_SIZE_MAX and the maximum + * channel reply size. nfsd_getxattr will probe the length of the xattr, + * check it against getxa_len, and allocate + return the value. + */ +static __be32 +nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, + struct nfsd4_getxattr *getxattr) +{ + __be32 status; + u32 maxcount; + + status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name); + if (status) + return status; + + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); + + getxattr->getxa_len = maxcount; + + return status; +} + +static __be32 +nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, + struct nfsd4_setxattr *setxattr) +{ + DECODE_HEAD; + u32 flags, maxcount, size; + struct kvec head; + struct page **pagelist; + + READ_BUF(4); + flags = be32_to_cpup(p++); + + if (flags > SETXATTR4_REPLACE) + return nfserr_inval; + setxattr->setxa_flags = flags; + + status = nfsd4_decode_xattr_name(argp, &setxattr->setxa_name); + if (status) + return status; + + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); + + READ_BUF(4); + size = be32_to_cpup(p++); + if (size > maxcount) + return nfserr_xattr2big; + + setxattr->setxa_len = size; + if (size > 0) { + status = svcxdr_construct_vector(argp, &head, &pagelist, size); + if (status) + return status; + + status = nfsd4_vbuf_from_vector(argp, &head, pagelist, + &setxattr->setxa_buf, size); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, + struct nfsd4_listxattrs *listxattrs) +{ + DECODE_HEAD; + u32 maxcount; + + READ_BUF(12); + p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie); + + /* + * If the cookie is too large to have even one user.x attribute + * plus trailing '\0' left in a maximum size buffer, it's invalid. + */ + if (listxattrs->lsxa_cookie >= + (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2))) + return nfserr_badcookie; + + maxcount = be32_to_cpup(p++); + if (maxcount < 8) + /* Always need at least 2 words (length and one character) */ + return nfserr_inval; + + maxcount = min(maxcount, svc_max_payload(argp->rqstp)); + listxattrs->lsxa_maxcount = maxcount; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, + struct nfsd4_removexattr *removexattr) +{ + return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); +} + +static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_notsupp; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static const nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + + /* new operations for NFSv4.2 */ + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, + [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, + [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, + [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read, + [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, + [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, + [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, + [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, +}; + +static inline bool +nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) +{ + if (op->opnum < FIRST_NFS4_OP) + return false; + else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP) + return false; + else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP) + return false; + else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP) + return false; + return true; +} + +static __be32 +nfsd4_decode_compound(struct nfsd4_compoundargs *argp) +{ + DECODE_HEAD; + struct nfsd4_op *op; + bool cachethis = false; + int auth_slack= argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ + int readcount = 0; + int readbytes = 0; + int i; + + READ_BUF(4); + argp->taglen = be32_to_cpup(p++); + READ_BUF(argp->taglen); + SAVEMEM(argp->tag, argp->taglen); + READ_BUF(8); + argp->minorversion = be32_to_cpup(p++); + argp->opcnt = be32_to_cpup(p++); + max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); + + if (argp->taglen > NFSD4_MAX_TAGLEN) + goto xdr_error; + /* + * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS + * here, so we return success at the xdr level so that + * nfsd4_proc can handle this is an NFS-level error. + */ + if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND) + return 0; + + if (argp->opcnt > ARRAY_SIZE(argp->iops)) { + argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + if (!argp->ops) { + argp->ops = argp->iops; + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); + goto xdr_error; + } + } + + if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + argp->opcnt = 0; + + for (i = 0; i < argp->opcnt; i++) { + op = &argp->ops[i]; + op->replay = NULL; + + READ_BUF(4); + op->opnum = be32_to_cpup(p++); + + if (nfsd4_opnum_in_range(argp, op)) + op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); + else { + op->opnum = OP_ILLEGAL; + op->status = nfserr_op_illegal; + } + op->opdesc = OPDESC(op); + /* + * We'll try to cache the result in the DRC if any one + * op in the compound wants to be cached: + */ + cachethis |= nfsd4_cache_this_op(op); + + if (op->opnum == OP_READ || op->opnum == OP_READ_PLUS) { + readcount++; + readbytes += nfsd4_max_reply(argp->rqstp, op); + } else + max_reply += nfsd4_max_reply(argp->rqstp, op); + /* + * OP_LOCK and OP_LOCKT may return a conflicting lock. + * (Special case because it will just skip encoding this + * if it runs out of xdr buffer space, and it is the only + * operation that behaves this way.) + */ + if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT) + max_reply += NFS4_OPAQUE_LIMIT; + + if (op->status) { + argp->opcnt = i+1; + break; + } + } + /* Sessions make the DRC unnecessary: */ + if (argp->minorversion) + cachethis = false; + svc_reserve(argp->rqstp, max_reply + readbytes); + argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + + DECODE_TAIL; +} + +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, + struct svc_export *exp) +{ + if (exp->ex_flags & NFSEXP_V4ROOT) { + *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); + *p++ = 0; + } else if (IS_I_VERSION(inode)) { + p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode)); + } else { + *p++ = cpu_to_be32(stat->ctime.tv_sec); + *p++ = cpu_to_be32(stat->ctime.tv_nsec); + } + return p; +} + +/* + * ctime (in NFSv4, time_metadata) is not writeable, and the client + * doesn't really care what resolution could theoretically be stored by + * the filesystem. + * + * The client cares how close together changes can be while still + * guaranteeing ctime changes. For most filesystems (which have + * timestamps with nanosecond fields) that is limited by the resolution + * of the time returned from current_time() (which I'm assuming to be + * 1/HZ). + */ +static __be32 *encode_time_delta(__be32 *p, struct inode *inode) +{ + struct timespec64 ts; + u32 ns; + + ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); + ts = ns_to_timespec64(ns); + + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); + + return p; +} + +static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +{ + *p++ = cpu_to_be32(c->atomic); + if (c->change_supported) { + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); + } else { + *p++ = cpu_to_be32(c->before_ctime_sec); + *p++ = cpu_to_be32(c->before_ctime_nsec); + *p++ = cpu_to_be32(c->after_ctime_sec); + *p++ = cpu_to_be32(c->after_ctime_nsec); + } + return p; +} + +/* Encode as an array of strings the string given with components + * separated @sep, escaped with esc_enter and esc_exit. + */ +static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, + char *components, char esc_enter, + char esc_exit) +{ + __be32 *p; + __be32 pathlen; + int pathlen_offset; + int strlen, count=0; + char *str, *end, *next; + + dprintk("nfsd4_encode_components(%s)\n", components); + + pathlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + p++; /* We will fill this in with @count later */ + + end = str = components; + while (*end) { + bool found_esc = false; + + /* try to parse as esc_start, ..., esc_end, sep */ + if (*str == esc_enter) { + for (; *end && (*end != esc_exit); end++) + /* find esc_exit or end of string */; + next = end + 1; + if (*end && (!*next || *next == sep)) { + str++; + found_esc = true; + } + } + + if (!found_esc) + for (; *end && (*end != sep); end++) + /* find sep or end of string */; + + strlen = end - str; + if (strlen) { + p = xdr_reserve_space(xdr, strlen + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, str, strlen); + count++; + } + else + end++; + if (found_esc) + end = next; + + str = end; + } + pathlen = htonl(count); + write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); + return 0; +} + +/* Encode as an array of strings the string given with components + * separated @sep. + */ +static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, + char *components) +{ + return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); +} + +/* + * encode a location element of a fs_locations structure + */ +static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, + struct nfsd4_fs_location *location) +{ + __be32 status; + + status = nfsd4_encode_components_esc(xdr, ':', location->hosts, + '[', ']'); + if (status) + return status; + status = nfsd4_encode_components(xdr, '/', location->path); + if (status) + return status; + return 0; +} + +/* + * Encode a path in RFC3530 'pathname4' format + */ +static __be32 nfsd4_encode_path(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) +{ + struct path cur = *path; + __be32 *p; + struct dentry **components = NULL; + unsigned int ncomponents = 0; + __be32 err = nfserr_jukebox; + + dprintk("nfsd4_encode_components("); + + path_get(&cur); + /* First walk the path up to the nfsd root, and store the + * dentries/path components in an array. + */ + for (;;) { + if (path_equal(&cur, root)) + break; + if (cur.dentry == cur.mnt->mnt_root) { + if (follow_up(&cur)) + continue; + goto out_free; + } + if ((ncomponents & 15) == 0) { + struct dentry **new; + new = krealloc(components, + sizeof(*new) * (ncomponents + 16), + GFP_KERNEL); + if (!new) + goto out_free; + components = new; + } + components[ncomponents++] = cur.dentry; + cur.dentry = dget_parent(cur.dentry); + } + err = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_free; + *p++ = cpu_to_be32(ncomponents); + + while (ncomponents) { + struct dentry *dentry = components[ncomponents - 1]; + unsigned int len; + + spin_lock(&dentry->d_lock); + len = dentry->d_name.len; + p = xdr_reserve_space(xdr, len + 4); + if (!p) { + spin_unlock(&dentry->d_lock); + goto out_free; + } + p = xdr_encode_opaque(p, dentry->d_name.name, len); + dprintk("/%pd", dentry); + spin_unlock(&dentry->d_lock); + dput(dentry); + ncomponents--; + } + + err = 0; +out_free: + dprintk(")\n"); + while (ncomponents) + dput(components[--ncomponents]); + kfree(components); + path_put(&cur); + return err; +} + +static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, + struct svc_rqst *rqstp, const struct path *path) +{ + struct svc_export *exp_ps; + __be32 res; + + exp_ps = rqst_find_fsidzero_export(rqstp); + if (IS_ERR(exp_ps)) + return nfserrno(PTR_ERR(exp_ps)); + res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); + exp_put(exp_ps); + return res; +} + +/* + * encode a fs_locations structure + */ +static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, + struct svc_rqst *rqstp, struct svc_export *exp) +{ + __be32 status; + int i; + __be32 *p; + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; + + status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); + if (status) + return status; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(fslocs->locations_count); + for (i=0; i<fslocs->locations_count; i++) { + status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); + if (status) + return status; + } + return 0; +} + +static u32 nfs4_file_type(umode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFIFO: return NF4FIFO; + case S_IFCHR: return NF4CHR; + case S_IFDIR: return NF4DIR; + case S_IFBLK: return NF4BLK; + case S_IFLNK: return NF4LNK; + case S_IFREG: return NF4REG; + case S_IFSOCK: return NF4SOCK; + default: return NF4BAD; + }; +} + +static inline __be32 +nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) +{ + if (ace->whotype != NFS4_ACL_WHO_NAMED) + return nfs4_acl_write_who(xdr, ace->whotype); + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + return nfsd4_encode_group(xdr, rqstp, ace->who_gid); + else + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); +} + +static inline __be32 +nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) +{ + __be32 *p; + unsigned long i = hweight_long(layout_types); + + p = xdr_reserve_space(xdr, 4 + 4 * i); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(i); + + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (layout_types & (1 << i)) + *p++ = cpu_to_be32(i); + + return 0; +} + +#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ + FATTR4_WORD0_RDATTR_ERROR) +#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#define WORD2_ABSENT_FS_ATTRS 0 + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static inline __be32 +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + if (!p) + return nfserr_resource; + + /* + * For now we use a 0 here to indicate the null translation; in + * the future we may place a call to translation code here. + */ + *p++ = cpu_to_be32(0); /* lfs */ + *p++ = cpu_to_be32(0); /* pi */ + p = xdr_encode_opaque(p, context, len); + return 0; +} +#else +static inline __be32 +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) +{ return 0; } +#endif + +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err) +{ + /* As per referral draft: */ + if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || + *bmval1 & ~WORD1_ABSENT_FS_ATTRS) { + if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR || + *bmval0 & FATTR4_WORD0_FS_LOCATIONS) + *rdattr_err = NFSERR_MOVED; + else + return nfserr_moved; + } + *bmval0 &= WORD0_ABSENT_FS_ATTRS; + *bmval1 &= WORD1_ABSENT_FS_ATTRS; + *bmval2 &= WORD2_ABSENT_FS_ATTRS; + return 0; +} + + +static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +{ + struct path path = exp->ex_path; + int err; + + path_get(&path); + while (follow_up(&path)) { + if (path.dentry != path.mnt->mnt_root) + break; + } + err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + path_put(&path); + return err; +} + +static __be32 +nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +{ + __be32 *p; + + if (bmval2) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); + } else if (bmval1) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + } else { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); + } + + return 0; +out_resource: + return nfserr_resource; +} + +/* + * Note: @fhp can be NULL; in this case, we might have to compose the filehandle + * ourselves. + */ +static __be32 +nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, + struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + u32 bmval0 = bmval[0]; + u32 bmval1 = bmval[1]; + u32 bmval2 = bmval[2]; + struct kstat stat; + struct svc_fh *tempfh = NULL; + struct kstatfs statfs; + __be32 *p; + int starting_len = xdr->buf->len; + int attrlen_offset; + __be32 attrlen; + u32 dummy; + u64 dummy64; + u32 rdattr_err = 0; + __be32 status; + int err; + struct nfs4_acl *acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + void *context = NULL; + int contextlen; +#endif + bool contextsupport = false; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + struct path path = { + .mnt = exp->ex_path.mnt, + .dentry = dentry, + }; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); + BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); + + if (exp->ex_fslocs.migrated) { + status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); + if (status) + goto out; + } + + err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + if (err) + goto out_nfserr; + if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || + (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + FATTR4_WORD1_SPACE_TOTAL))) { + err = vfs_statfs(&path, &statfs); + if (err) + goto out_nfserr; + } + if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); + status = nfserr_jukebox; + if (!tempfh) + goto out; + fh_init(tempfh, NFS4_FHSIZE); + status = fh_compose(tempfh, exp, dentry, NULL); + if (status) + goto out; + fhp = tempfh; + } + if (bmval0 & FATTR4_WORD0_ACL) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; + } + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || + bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + if (exp->ex_flags & NFSEXP_SECURITY_LABEL) + err = security_inode_getsecctx(d_inode(dentry), + &context, &contextlen); + else + err = -EOPNOTSUPP; + contextsupport = (err == 0); + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + if (err == -EOPNOTSUPP) + bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + else if (err) + goto out_nfserr; + } + } +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ + + status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + if (status) + goto out; + + attrlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + p++; /* to be backfilled later */ + + if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + + if (!IS_POSIXACL(dentry->d_inode)) + supp[0] &= ~FATTR4_WORD0_ACL; + if (!contextsupport) + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (!supp[2]) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); + } else { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); + *p++ = cpu_to_be32(supp[2]); + } + } + if (bmval0 & FATTR4_WORD0_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + dummy = nfs4_file_type(stat.mode); + if (dummy == NF4BAD) { + status = nfserr_serverfault; + goto out; + } + *p++ = cpu_to_be32(dummy); + } + if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); + else + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| + NFS4_FH_VOL_RENAME); + } + if (bmval0 & FATTR4_WORD0_CHANGE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = encode_change(p, &stat, d_inode(dentry), exp); + } + if (bmval0 & FATTR4_WORD0_SIZE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, stat.size); + } + if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_FSID) { + p = xdr_reserve_space(xdr, 16); + if (!p) + goto out_resource; + if (exp->ex_fslocs.migrated) { + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); + } else switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64)exp->ex_fsid); + p = xdr_encode_hyper(p, (u64)0); + break; + case FSIDSOURCE_DEV: + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MAJOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MINOR(stat.dev)); + break; + case FSIDSOURCE_UUID: + p = xdr_encode_opaque_fixed(p, exp->ex_uuid, + EX_UUID_LEN); + break; + } + } + if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_LEASE_TIME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(nn->nfsd4_lease); + } + if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(rdattr_err); + } + if (bmval0 & FATTR4_WORD0_ACL) { + struct nfs4_ace *ace; + + if (acl == NULL) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + + *p++ = cpu_to_be32(0); + goto out_acl; + } + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(acl->naces); + + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + p = xdr_reserve_space(xdr, 4*3); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(ace->type); + *p++ = cpu_to_be32(ace->flag); + *p++ = cpu_to_be32(ace->access_mask & + NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(xdr, rqstp, ace); + if (status) + goto out; + } + } +out_acl: + if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? + ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); + } + if (bmval0 & FATTR4_WORD0_CANSETTIME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_FILEHANDLE) { + p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); + if (!p) + goto out_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, + fhp->fh_handle.fh_size); + } + if (bmval0 & FATTR4_WORD0_FILEID) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, stat.ino); + } + if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_FREE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); + } + if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) statfs.f_files); + } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + status = nfsd4_encode_fs_locations(xdr, rqstp, exp); + if (status) + goto out; + } + if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); + } + if (bmval0 & FATTR4_WORD0_MAXLINK) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(255); + } + if (bmval0 & FATTR4_WORD0_MAXNAME) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(statfs.f_namelen); + } + if (bmval0 & FATTR4_WORD0_MAXREAD) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); + } + if (bmval0 & FATTR4_WORD0_MAXWRITE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); + } + if (bmval1 & FATTR4_WORD1_MODE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.mode & S_IALLUGO); + } + if (bmval1 & FATTR4_WORD1_NO_TRUNC) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + } + if (bmval1 & FATTR4_WORD1_NUMLINKS) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { + status = nfsd4_encode_user(xdr, rqstp, stat.uid); + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { + status = nfsd4_encode_group(xdr, rqstp, stat.gid); + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_RAWDEV) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); + } + if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_FREE) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_SPACE_USED) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + dummy64 = (u64)stat.blocks << 9; + p = xdr_encode_hyper(p, dummy64); + } + if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); + *p++ = cpu_to_be32(stat.atime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_DELTA) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = encode_time_delta(p, d_inode(dentry)); + } + if (bmval1 & FATTR4_WORD1_TIME_METADATA) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); + *p++ = cpu_to_be32(stat.ctime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { + p = xdr_reserve_space(xdr, 12); + if (!p) + goto out_resource; + p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); + *p++ = cpu_to_be32(stat.mtime.tv_nsec); + } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + struct kstat parent_stat; + u64 ino = stat.ino; + + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + /* + * Get parent's attributes if not ignoring crossmount + * and this is the root of a cross-mounted filesystem. + */ + if (ignore_crossmnt == 0 && + dentry == exp->ex_path.mnt->mnt_root) { + err = get_parent_attributes(exp, &parent_stat); + if (err) + goto out_nfserr; + ino = parent_stat.ino; + } + p = xdr_encode_hyper(p, ino); + } +#ifdef CONFIG_NFSD_PNFS + if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); + if (status) + goto out; + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); + if (status) + goto out; + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); + if (status) + goto out; + } + + if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + if (IS_I_VERSION(d_inode(dentry))) + *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); + else + *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); + } + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + status = nfsd4_encode_security_label(xdr, rqstp, context, + contextlen); + if (status) + goto out; + } +#endif + + if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + err = xattr_supported_namespace(d_inode(dentry), + XATTR_USER_PREFIX); + *p++ = cpu_to_be32(err == 0); + } + + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); + status = nfs_ok; + +out: +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + if (context) + security_release_secctx(context, contextlen); +#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ + kfree(acl); + if (tempfh) { + fh_put(tempfh); + kfree(tempfh); + } + if (status) + xdr_truncate_encode(xdr, starting_len); + return status; +out_nfserr: + status = nfserrno(err); + goto out; +out_resource: + status = nfserr_resource; + goto out; +} + +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + struct xdr_buf dummy; + struct xdr_stream xdr; + __be32 ret; + + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); + ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, + ignore_crossmnt); + *p = xdr.p; + return ret; +} + +static inline int attributes_need_mount(u32 *bmval) +{ + if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) + return 1; + if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) + return 1; + return 0; +} + +static __be32 +nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, + const char *name, int namlen) +{ + struct svc_export *exp = cd->rd_fhp->fh_export; + struct dentry *dentry; + __be32 nfserr; + int ignore_crossmnt = 0; + + dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + if (IS_ERR(dentry)) + return nfserrno(PTR_ERR(dentry)); + + exp_get(exp); + /* + * In the case of a mountpoint, the client may be asking for + * attributes that are only properties of the underlying filesystem + * as opposed to the cross-mounted file system. In such a case, + * we will not follow the cross mount and will fill the attribtutes + * directly from the mountpoint dentry. + */ + if (nfsd_mountpoint(dentry, exp)) { + int err; + + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } + /* + * Why the heck aren't we just using nfsd_lookup?? + * Different "."/".." handling? Something else? + * At least, add a comment here to explain.... + */ + err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp); + if (err) { + nfserr = nfserrno(err); + goto out_put; + } + nfserr = check_nfsd_access(exp, cd->rd_rqstp); + if (nfserr) + goto out_put; + + } +out_encode: + nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, + cd->rd_rqstp, ignore_crossmnt); +out_put: + dput(dentry); + exp_put(exp); + return nfserr; +} + +static __be32 * +nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return NULL; + *p++ = htonl(2); + *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ + *p++ = htonl(0); /* bmval1 */ + + *p++ = htonl(4); /* attribute length */ + *p++ = nfserr; /* no htonl */ + return p; +} + +static int +nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); + struct xdr_stream *xdr = cd->xdr; + int start_offset = xdr->buf->len; + int cookie_offset; + u32 name_and_cookie; + int entry_bytes; + __be32 nfserr = nfserr_toosmall; + __be64 wire_offset; + __be32 *p; + + /* In nfsv4, "." and ".." never make it onto the wire.. */ + if (name && isdotent(name, namlen)) { + cd->common.err = nfs_ok; + return 0; + } + + if (cd->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, + &wire_offset, 8); + } + + p = xdr_reserve_space(xdr, 4); + if (!p) + goto fail; + *p++ = xdr_one; /* mark entry present */ + cookie_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 3*4 + namlen); + if (!p) + goto fail; + p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ + p = xdr_encode_array(p, name, namlen); /* name length & name */ + + nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); + switch (nfserr) { + case nfs_ok: + break; + case nfserr_resource: + nfserr = nfserr_toosmall; + goto fail; + case nfserr_noent: + xdr_truncate_encode(xdr, start_offset); + goto skip_entry; + case nfserr_jukebox: + /* + * The pseudoroot should only display dentries that lead to + * exports. If we get EJUKEBOX here, then we can't tell whether + * this entry should be included. Just fail the whole READDIR + * with NFS4ERR_DELAY in that case, and hope that the situation + * will resolve itself by the client's next attempt. + */ + if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT) + goto fail; + fallthrough; + default: + /* + * If the client requested the RDATTR_ERROR attribute, + * we stuff the error code into this attribute + * and continue. If this attribute was not requested, + * then in accordance with the spec, we fail the + * entire READDIR operation(!) + */ + if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) + goto fail; + p = nfsd4_encode_rdattr_error(xdr, nfserr); + if (p == NULL) { + nfserr = nfserr_toosmall; + goto fail; + } + } + nfserr = nfserr_toosmall; + entry_bytes = xdr->buf->len - start_offset; + if (entry_bytes > cd->rd_maxcount) + goto fail; + cd->rd_maxcount -= entry_bytes; + /* + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. + */ + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } + + cd->cookie_offset = cookie_offset; +skip_entry: + cd->common.err = nfs_ok; + return 0; +fail: + xdr_truncate_encode(xdr, start_offset); + cd->common.err = nfserr; + return -EINVAL; +} + +static __be32 +nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(stateid_t)); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sid->si_generation); + p = xdr_encode_opaque_fixed(p, &sid->si_opaque, + sizeof(stateid_opaque_t)); + return 0; +} + +static __be32 +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); + return 0; +} + +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); + /* Upshifting from TCP to RDMA is not supported */ + *p++ = cpu_to_be32(0); + return 0; +} + +static __be32 +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_encode_stateid(xdr, &close->cl_stateid); +} + + +static __be32 +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + NFS4_VERIFIER_SIZE); + return 0; +} + +static __be32 +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + encode_cinfo(p, &create->cr_cinfo); + return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); +} + +static __be32 +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +{ + struct svc_fh *fhp = getattr->ga_fhp; + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, resp->rqstp, 0); +} + +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct svc_fh *fhp = *fhpp; + unsigned int len; + __be32 *p; + + len = fhp->fh_handle.fh_size; + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); + return 0; +} + +/* +* Including all fields other than the name, a LOCK4denied structure requires +* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. +*/ +static __be32 +nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) +{ + struct xdr_netobj *conf = &ld->ld_owner; + __be32 *p; + +again: + p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); + if (!p) { + /* + * Don't fail to return the result just because we can't + * return the conflicting open: + */ + if (conf->len) { + kfree(conf->data); + conf->len = 0; + conf->data = NULL; + goto again; + } + return nfserr_resource; + } + p = xdr_encode_hyper(p, ld->ld_start); + p = xdr_encode_hyper(p, ld->ld_length); + *p++ = cpu_to_be32(ld->ld_type); + if (conf->len) { + p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); + p = xdr_encode_opaque(p, conf->data, conf->len); + kfree(conf->data); + } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ + p = xdr_encode_hyper(p, (u64)0); /* clientid */ + *p++ = cpu_to_be32(0); /* length of owner name */ + } + return nfserr_denied; +} + +static __be32 +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (!nfserr) + nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) + nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); + + return nfserr; +} + +static __be32 +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +{ + struct xdr_stream *xdr = &resp->xdr; + + if (nfserr == nfserr_denied) + nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); + return nfserr; +} + +static __be32 +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_encode_stateid(xdr, &locku->lu_stateid); +} + + +static __be32 +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); + return 0; +} + + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &open->op_cinfo); + *p++ = cpu_to_be32(open->op_rflags); + + nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], + open->op_bmval[2]); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(open->op_delegate_type); + switch (open->op_delegate_type) { + case NFS4_OPEN_DELEGATE_NONE: + break; + case NFS4_OPEN_DELEGATE_READ: + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); + + /* + * TODO: ACE's in delegations + */ + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + break; + case NFS4_OPEN_DELEGATE_WRITE: + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); + + /* + * TODO: space_limit's in delegations + */ + *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); + *p++ = cpu_to_be32(~(u32)0); + *p++ = cpu_to_be32(~(u32)0); + + /* + * TODO: ACE's in delegations + */ + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + break; + case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + case WND4_RESOURCE: + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + /* deleg signaling not supported yet: */ + *p++ = cpu_to_be32(0); + break; + default: + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + } + break; + default: + BUG(); + } + /* XXX save filehandle here */ + return 0; +} + +static __be32 +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); +} + +static __be32 +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_encode_stateid(xdr, &od->od_stateid); +} + +static __be32 nfsd4_encode_splice_read( + struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = xdr->buf; + u32 eof; + int space_left; + __be32 nfserr; + __be32 *p = xdr->p - 2; + + /* Make sure there will be room for padding if needed */ + if (xdr->end - xdr->p < 1) + return nfserr_resource; + + nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, + file, read->rd_offset, &maxcount, &eof); + read->rd_length = maxcount; + if (nfserr) { + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode: + */ + buf->page_len = 0; + return nfserr; + } + + *(p++) = htonl(eof); + *(p++) = htonl(maxcount); + + buf->page_len = maxcount; + buf->len += maxcount; + xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) + / PAGE_SIZE; + + /* Use rest of head for padding and remaining ops: */ + buf->tail[0].iov_base = xdr->p; + buf->tail[0].iov_len = 0; + xdr->iov = buf->tail; + if (maxcount&3) { + int pad = 4 - (maxcount&3); + + *(xdr->p++) = 0; + + buf->tail[0].iov_base += maxcount&3; + buf->tail[0].iov_len = pad; + buf->len += pad; + } + + space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, + buf->buflen - buf->len); + buf->buflen = buf->len + space_left; + xdr->end = (__be32 *)((void *)xdr->end + space_left); + + return 0; +} + +static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 eof; + int starting_len = xdr->buf->len - 8; + __be32 nfserr; + __be32 tmp; + int pad; + + read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount); + if (read->rd_vlen < 0) + return nfserr_resource; + + nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); + read->rd_length = maxcount; + if (nfserr) + return nfserr; + if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount)) + return nfserr_io; + xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount)); + + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); + tmp = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + + tmp = xdr_zero; + pad = (maxcount&3) ? 4 - (maxcount&3) : 0; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, + &tmp, pad); + return 0; + +} + +static __be32 +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + unsigned long maxcount; + struct xdr_stream *xdr = &resp->xdr; + struct file *file; + int starting_len = xdr->buf->len; + __be32 *p; + + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ + if (!p) { + WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); + return nfserr_resource; + } + if (resp->xdr.buf->page_len && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(unsigned long, maxcount, + (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); + + if (file->f_op->splice_read && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); + + if (nfserr) + xdr_truncate_encode(xdr, starting_len); + + return nfserr; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + __be32 wire_count; + int zero = 0; + struct xdr_stream *xdr = &resp->xdr; + int length_offset = xdr->buf->len; + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + maxcount = PAGE_SIZE; + + p = xdr_reserve_space(xdr, maxcount); + if (!p) + return nfserr_resource; + /* + * XXX: By default, vfs_readlink() will truncate symlinks if they + * would overflow the buffer. Is this kosher in NFSv4? If not, one + * easy fix is: if vfs_readlink() precisely fills the buffer, assume + * that truncation occurred, and return NFS4ERR_RESOURCE. + */ + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, + (char *)p, &maxcount); + if (nfserr == nfserr_isdir) + nfserr = nfserr_inval; + if (nfserr) { + xdr_truncate_encode(xdr, length_offset); + return nfserr; + } + + wire_count = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); + if (maxcount & 3) + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, + &zero, 4 - (maxcount&3)); + return 0; +} + +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +{ + int maxcount; + int bytes_left; + loff_t offset; + __be64 wire_offset; + struct xdr_stream *xdr = &resp->xdr; + int starting_len = xdr->buf->len; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) + - (char *)resp->xdr.buf->head[0].iov_base; + + /* + * Number of bytes left for directory entries allowing for the + * final 8 bytes of the readdir and a following failed op: + */ + bytes_left = xdr->buf->buflen - xdr->buf->len + - COMPOUND_ERR_SLACK_SPACE - 8; + if (bytes_left < 0) { + nfserr = nfserr_resource; + goto err_no_verf; + } + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(u32, readdir->rd_maxcount, maxcount); + /* + * Note the rfc defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier above + * and the 8 bytes encoded at the end of this function: + */ + if (maxcount < 16) { + nfserr = nfserr_toosmall; + goto err_no_verf; + } + maxcount = min_t(int, maxcount-16, bytes_left); + + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + if (!readdir->rd_dircount) + readdir->rd_dircount = svc_max_payload(resp->rqstp); + + readdir->xdr = xdr; + readdir->rd_maxcount = maxcount; + readdir->common.err = 0; + readdir->cookie_offset = 0; + + offset = readdir->rd_cookie; + nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, + &offset, + &readdir->common, nfsd4_encode_dirent); + if (nfserr == nfs_ok && + readdir->common.err == nfserr_toosmall && + xdr->buf->len == starting_len + 8) { + /* nothing encoded; which limit did we hit?: */ + if (maxcount - 16 < bytes_left) + /* It was the fault of rd_maxcount: */ + nfserr = nfserr_toosmall; + else + /* We ran out of buffer space: */ + nfserr = nfserr_resource; + } + if (nfserr) + goto err_no_verf; + + if (readdir->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, + &wire_offset, 8); + } + + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + goto err_no_verf; + } + *p++ = 0; /* no more entries */ + *p++ = htonl(readdir->common.err == nfserr_eof); + + return 0; +err_no_verf: + xdr_truncate_encode(xdr, starting_len); + return nfserr; +} + +static __be32 +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); + return 0; +} + +static __be32 +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); + return 0; +} + +static __be32 +nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) +{ + u32 i, nflavs, supported; + struct exp_flavor_info *flavs; + struct exp_flavor_info def_flavs[2]; + __be32 *p, *flavorsp; + static bool report = true; + + if (exp->ex_nflavors) { + flavs = exp->ex_flavors; + nflavs = exp->ex_nflavors; + } else { /* Handling of some defaults in absence of real secinfo: */ + flavs = def_flavs; + if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) { + nflavs = 2; + flavs[0].pseudoflavor = RPC_AUTH_UNIX; + flavs[1].pseudoflavor = RPC_AUTH_NULL; + } else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) { + nflavs = 1; + flavs[0].pseudoflavor + = svcauth_gss_flavor(exp->ex_client); + } else { + nflavs = 1; + flavs[0].pseudoflavor + = exp->ex_client->flavour->flavour; + } + } + + supported = 0; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + flavorsp = p++; /* to be backfilled later */ + + for (i = 0; i < nflavs; i++) { + rpc_authflavor_t pf = flavs[i].pseudoflavor; + struct rpcsec_gss_info info; + + if (rpcauth_get_gssinfo(pf, &info) == 0) { + supported++; + p = xdr_reserve_space(xdr, 4 + 4 + + XDR_LEN(info.oid.len) + 4 + 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(RPC_AUTH_GSS); + p = xdr_encode_opaque(p, info.oid.data, info.oid.len); + *p++ = cpu_to_be32(info.qop); + *p++ = cpu_to_be32(info.service); + } else if (pf < RPC_AUTH_MAXFLAVOR) { + supported++; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(pf); + } else { + if (report) + pr_warn("NFS: SECINFO: security flavor %u " + "is not supported\n", pf); + } + } + + if (nflavs != supported) + report = false; + *flavorsp = htonl(supported); + return 0; +} + +static __be32 +nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo *secinfo) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); +} + +static __be32 +nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_secinfo_no_name *secinfo) +{ + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); +} + +/* + * The SETATTR encode routine is special -- it always encodes a bitmap, + * regardless of the error status. + */ +static __be32 +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + if (nfserr) { + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + } + else { + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(setattr->sa_bmval[0]); + *p++ = cpu_to_be32(setattr->sa_bmval[1]); + *p++ = cpu_to_be32(setattr->sa_bmval[2]); + } + return nfserr; +} + +static __be32 +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (!nfserr) { + p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); + p = xdr_encode_opaque_fixed(p, &scd->se_confirm, + NFS4_VERIFIER_SIZE); + } + else if (nfserr == nfserr_clid_inuse) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + } + return nfserr; +} + +static __be32 +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return 0; +} + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_exchange_id *exid) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + + major_id = nn->nfsd_name; + major_id_sz = strlen(nn->nfsd_name); + server_scope = nn->nfsd_name; + server_scope_sz = strlen(nn->nfsd_name); + + p = xdr_reserve_space(xdr, + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how */); + if (!p) + return nfserr_resource; + + p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); + *p++ = cpu_to_be32(exid->seqid); + *p++ = cpu_to_be32(exid->flags); + + *p++ = cpu_to_be32(exid->spa_how); + + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce bitmap: */ + nfserr = nfsd4_encode_bitmap(xdr, + exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (nfserr) + return nfserr; + /* spo_must_allow bitmap: */ + nfserr = nfsd4_encode_bitmap(xdr, + exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); + if (nfserr) + return nfserr; + break; + default: + WARN_ON_ONCE(1); + } + + p = xdr_reserve_space(xdr, + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + if (!p) + return nfserr_resource; + + /* The server_owner struct */ + p = xdr_encode_hyper(p, minor_id); /* Minor id */ + /* major id */ + p = xdr_encode_opaque(p, major_id, major_id_sz); + + /* Server scope */ + p = xdr_encode_opaque(p, server_scope, server_scope_sz); + + /* Implementation id */ + *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_create_session *sess) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, sess->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(sess->seqid); + *p++ = cpu_to_be32(sess->flags); + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->fore_channel.maxops); + *p++ = cpu_to_be32(sess->fore_channel.maxreqs); + *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); + + if (sess->fore_channel.nr_rdma_attrs) { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); + } + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->back_channel.maxops); + *p++ = cpu_to_be32(sess->back_channel.maxreqs); + *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); + + if (sess->back_channel.nr_rdma_attrs) { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); + } + return 0; +} + +static __be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_sequence *seq) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, seq->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(seq->seqid); + *p++ = cpu_to_be32(seq->slotid); + /* Note slotid's are numbered from zero: */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ + *p++ = cpu_to_be32(seq->status_flags); + + resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ + return 0; +} + +static __be32 +nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_test_stateid *test_stateid) +{ + struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_test_stateid_id *stateid, *next; + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); + if (!p) + return nfserr_resource; + *p++ = htonl(test_stateid->ts_num_ids); + + list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { + *p++ = stateid->ts_id_status; + } + + return 0; +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + return nfserr; + } + + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = 0; + } + + return 0; +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(needed_len); + return nfserr_toosmall; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops; + __be32 *p; + + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + ops = nfsd4_layout_ops[lgp->lg_layout_type]; + return ops->encode_layoutget(xdr, lgp); +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return 0; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + return nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, + struct nfsd42_write_res *write, bool sync) +{ + __be32 *p; + p = xdr_reserve_space(&resp->xdr, 4); + if (!p) + return nfserr_resource; + + if (sync) + *p++ = cpu_to_be32(0); + else { + __be32 nfserr; + *p++ = cpu_to_be32(1); + nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); + if (nfserr) + return nfserr; + } + p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + p = xdr_encode_hyper(p, write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_stable_how); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return nfs_ok; +} + +static __be32 +nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) +{ + struct xdr_stream *xdr = &resp->xdr; + struct nfs42_netaddr *addr; + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = cpu_to_be32(ns->nl4_type); + + switch (ns->nl4_type) { + case NL4_NETADDR: + addr = &ns->u.nl4_addr; + + /* netid_len, netid, uaddr_len, uaddr (port included + * in RPCBIND_MAXUADDRLEN) + */ + p = xdr_reserve_space(xdr, + 4 /* netid len */ + + (XDR_QUADLEN(addr->netid_len) * 4) + + 4 /* uaddr len */ + + (XDR_QUADLEN(addr->addr_len) * 4)); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(addr->netid_len); + p = xdr_encode_opaque_fixed(p, addr->netid, + addr->netid_len); + *p++ = cpu_to_be32(addr->addr_len); + p = xdr_encode_opaque_fixed(p, addr->addr, + addr->addr_len); + break; + default: + WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR); + return nfserr_inval; + } + + return 0; +} + +static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + nfserr = nfsd42_encode_write_res(resp, ©->cp_res, + copy->cp_synchronous); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = xdr_one; /* cr_consecutive */ + *p++ = cpu_to_be32(copy->cp_synchronous); + return 0; +} + +static __be32 +nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_offload_status *os) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, os->count); + *p++ = cpu_to_be32(0); + return nfserr; +} + +static __be32 +nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + unsigned long *maxcount, u32 *eof, + loff_t *pos) +{ + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_nf->nf_file; + int starting_len = xdr->buf->len; + loff_t hole_pos; + __be32 nfserr; + __be32 *p, tmp; + __be64 tmp64; + + hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE); + if (hole_pos > read->rd_offset) + *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset); + *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len)); + + /* Content type, offset, byte count */ + p = xdr_reserve_space(xdr, 4 + 8 + 4); + if (!p) + return nfserr_resource; + + read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount); + if (read->rd_vlen < 0) + return nfserr_resource; + + nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, + resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof); + if (nfserr) + return nfserr; + xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount)); + + tmp = htonl(NFS4_CONTENT_DATA); + write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); + tmp64 = cpu_to_be64(read->rd_offset); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8); + tmp = htonl(*maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4); + + tmp = xdr_zero; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp, + xdr_pad_size(*maxcount)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + unsigned long *maxcount, u32 *eof) +{ + struct file *file = read->rd_nf->nf_file; + loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA); + loff_t f_size = i_size_read(file_inode(file)); + unsigned long count; + __be32 *p; + + if (data_pos == -ENXIO) + data_pos = f_size; + else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE)) + return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size); + count = data_pos - read->rd_offset; + + /* Content type, offset, byte count */ + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8); + if (!p) + return nfserr_resource; + + *p++ = htonl(NFS4_CONTENT_HOLE); + p = xdr_encode_hyper(p, read->rd_offset); + p = xdr_encode_hyper(p, count); + + *eof = (read->rd_offset + count) >= f_size; + *maxcount = min_t(unsigned long, count, *maxcount); + return nfs_ok; +} + +static __be32 +nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) +{ + unsigned long maxcount, count; + struct xdr_stream *xdr = &resp->xdr; + struct file *file; + int starting_len = xdr->buf->len; + int last_segment = xdr->buf->len; + int segments = 0; + __be32 *p, tmp; + bool is_data; + loff_t pos; + u32 eof; + + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + + /* eof flag, segment count */ + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + return nfserr_resource; + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + maxcount = min_t(unsigned long, maxcount, + (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, read->rd_length); + count = maxcount; + + eof = read->rd_offset >= i_size_read(file_inode(file)); + if (eof) + goto out; + + pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE); + is_data = pos > read->rd_offset; + + while (count > 0 && !eof) { + maxcount = count; + if (is_data) + nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof, + segments == 0 ? &pos : NULL); + else + nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof); + if (nfserr) + goto out; + count -= maxcount; + read->rd_offset += maxcount; + is_data = !is_data; + last_segment = xdr->buf->len; + segments++; + } + +out: + if (nfserr && segments == 0) + xdr_truncate_encode(xdr, starting_len); + else { + if (nfserr) { + xdr_truncate_encode(xdr, last_segment); + nfserr = nfs_ok; + eof = 0; + } + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4); + tmp = htonl(segments); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + } + + return nfserr; +} + +static __be32 +nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy_notify *cn) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + /* 8 sec, 4 nsec */ + p = xdr_reserve_space(xdr, 12); + if (!p) + return nfserr_resource; + + /* cnr_lease_time */ + p = xdr_encode_hyper(p, cn->cpn_sec); + *p++ = cpu_to_be32(cn->cpn_nsec); + + /* cnr_stateid */ + nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid); + if (nfserr) + return nfserr; + + /* cnr_src.nl_nsvr */ + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(1); + + return nfsd42_encode_nl4_server(resp, &cn->cpn_src); +} + +static __be32 +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_seek *seek) +{ + __be32 *p; + + p = xdr_reserve_space(&resp->xdr, 4 + 8); + *p++ = cpu_to_be32(seek->seek_eof); + p = xdr_encode_hyper(p, seek->seek_pos); + + return 0; +} + +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +/* + * Encode kmalloc-ed buffer in to XDR stream. + */ +static __be32 +nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) +{ + u32 cplen; + __be32 *p; + + cplen = min_t(unsigned long, buflen, + ((void *)xdr->end - (void *)xdr->p)); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + buf += cplen; + buflen -= cplen; + + while (buflen) { + cplen = min_t(u32, buflen, PAGE_SIZE); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + + if (cplen < PAGE_SIZE) { + /* + * We're done, with a length that wasn't page + * aligned, so possibly not word aligned. Pad + * any trailing bytes with 0. + */ + xdr_encode_opaque_fixed(p, NULL, cplen); + break; + } + + buflen -= PAGE_SIZE; + buf += PAGE_SIZE; + } + + return 0; +} + +static __be32 +nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getxattr *getxattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p, err; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p = cpu_to_be32(getxattr->getxa_len); + + if (getxattr->getxa_len == 0) + return 0; + + err = nfsd4_vbuf_to_stream(xdr, getxattr->getxa_buf, + getxattr->getxa_len); + + kvfree(getxattr->getxa_buf); + + return err; +} + +static __be32 +nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_setxattr *setxattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + + encode_cinfo(p, &setxattr->setxa_cinfo); + + return 0; +} + +/* + * See if there are cookie values that can be rejected outright. + */ +static __be32 +nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, + u32 *offsetp) +{ + u64 cookie = listxattrs->lsxa_cookie; + + /* + * If the cookie is larger than the maximum number we can fit + * in either the buffer we just got back from vfs_listxattr, or, + * XDR-encoded, in the return buffer, it's invalid. + */ + if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) + return nfserr_badcookie; + + if (cookie > (listxattrs->lsxa_maxcount / + (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4))) + return nfserr_badcookie; + + *offsetp = (u32)cookie; + return 0; +} + +static __be32 +nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_listxattrs *listxattrs) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 cookie_offset, count_offset, eof; + u32 left, xdrleft, slen, count; + u32 xdrlen, offset; + u64 cookie; + char *sp; + __be32 status, tmp; + __be32 *p; + u32 nuser; + + eof = 1; + + status = nfsd4_listxattr_validate_cookie(listxattrs, &offset); + if (status) + goto out; + + /* + * Reserve space for the cookie and the name array count. Record + * the offsets to save them later. + */ + cookie_offset = xdr->buf->len; + count_offset = cookie_offset + 8; + p = xdr_reserve_space(xdr, 12); + if (!p) { + status = nfserr_resource; + goto out; + } + + count = 0; + left = listxattrs->lsxa_len; + sp = listxattrs->lsxa_buf; + nuser = 0; + + xdrleft = listxattrs->lsxa_maxcount; + + while (left > 0 && xdrleft > 0) { + slen = strlen(sp); + + /* + * Check if this is a "user." attribute, skip it if not. + */ + if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + goto contloop; + + slen -= XATTR_USER_PREFIX_LEN; + xdrlen = 4 + ((slen + 3) & ~3); + if (xdrlen > xdrleft) { + if (count == 0) { + /* + * Can't even fit the first attribute name. + */ + status = nfserr_toosmall; + goto out; + } + eof = 0; + goto wreof; + } + + left -= XATTR_USER_PREFIX_LEN; + sp += XATTR_USER_PREFIX_LEN; + if (nuser++ < offset) + goto contloop; + + + p = xdr_reserve_space(xdr, xdrlen); + if (!p) { + status = nfserr_resource; + goto out; + } + + xdr_encode_opaque(p, sp, slen); + + xdrleft -= xdrlen; + count++; +contloop: + sp += slen + 1; + left -= slen + 1; + } + + /* + * If there were user attributes to copy, but we didn't copy + * any, the offset was too large (e.g. the cookie was invalid). + */ + if (nuser > 0 && count == 0) { + status = nfserr_badcookie; + goto out; + } + +wreof: + p = xdr_reserve_space(xdr, 4); + if (!p) { + status = nfserr_resource; + goto out; + } + *p = cpu_to_be32(eof); + + cookie = offset + count; + + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); + tmp = cpu_to_be32(count); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); +out: + if (listxattrs->lsxa_len) + kvfree(listxattrs->lsxa_buf); + return status; +} + +static __be32 +nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_removexattr *removexattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + + p = encode_cinfo(p, &removexattr->rmxa_cinfo); + return 0; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. + */ +static const nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, + [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, + [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus, + [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, + [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, + [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr, + [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, +}; + +/* + * Calculate whether we still have space to encode repsize bytes. + * There are two considerations: + * - For NFS versions >=4.1, the size of the reply must stay within + * session limits + * - For all NFS versions, we must stay within limited preallocated + * buffer space. + * + * This is called before the operation is processed, so can only provide + * an upper estimate. For some nonidempotent operations (such as + * getattr), it's not necessarily a problem if that estimate is wrong, + * as we can fail it after processing without significant side effects. + */ +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) +{ + struct xdr_buf *buf = &resp->rqstp->rq_res; + struct nfsd4_slot *slot = resp->cstate.slot; + + if (buf->len + respsize <= buf->buflen) + return nfs_ok; + if (!nfsd4_has_session(&resp->cstate)) + return nfserr_resource; + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) { + WARN_ON_ONCE(1); + return nfserr_rep_too_big_to_cache; + } + return nfserr_rep_too_big; +} + +void +nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +{ + struct xdr_stream *xdr = &resp->xdr; + struct nfs4_stateowner *so = resp->cstate.replay_owner; + struct svc_rqst *rqstp = resp->rqstp; + const struct nfsd4_operation *opdesc = op->opdesc; + int post_err_offset; + nfsd4_enc encoder; + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + post_err_offset = xdr->buf->len; + + if (op->opnum == OP_ILLEGAL) + goto status; + if (op->status && opdesc && + !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE)) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + encoder = nfsd4_enc_ops[op->opnum]; + op->status = encoder(resp, op->status, &op->u); + if (opdesc && opdesc->op_release) + opdesc->op_release(&op->u); + xdr_commit_encode(xdr); + + /* nfsd4_check_resp_size guarantees enough room for error status */ + if (!op->status) { + int space_needed = 0; + if (!nfsd4_last_compound_op(rqstp)) + space_needed = COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, space_needed); + } + if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { + struct nfsd4_slot *slot = resp->cstate.slot; + + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) + op->status = nfserr_rep_too_big_to_cache; + else + op->status = nfserr_rep_too_big; + } + if (op->status == nfserr_resource || + op->status == nfserr_rep_too_big || + op->status == nfserr_rep_too_big_to_cache) { + /* + * The operation may have already been encoded or + * partially encoded. No op returns anything additional + * in the case of one of these three errors, so we can + * just truncate back to after the status. But it's a + * bug if we had to do this on a non-idempotent op: + */ + warn_on_nonidempotent_op(op); + xdr_truncate_encode(xdr, post_err_offset); + } + if (so) { + int len = xdr->buf->len - post_err_offset; + + so->so_replay.rp_status = op->status; + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + so->so_replay.rp_buf, len); + } +status: + /* Note that op->status is already in network byte order: */ + write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); +} + +/* + * Encode the reply stored in the stateowner reply cache + * + * XDR note: do not encode rp->rp_buflen: the buffer contains the + * previously sent already encoded operation. + */ +void +nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) +{ + __be32 *p; + struct nfs4_replay *rp = op->replay; + + p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + *p++ = rp->rp_status; /* already xdr'ed */ + + p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); +} + +int +nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + +void nfsd4_release_compoundargs(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + if (args->ops != args->iops) { + kfree(args->ops); + args->ops = args->iops; + } + kfree(args->tmpp); + args->tmpp = NULL; + while (args->to_free) { + struct svcxdr_tmpbuf *tb = args->to_free; + args->to_free = tb->next; + kfree(tb); + } +} + +int +nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +int +nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd4_compoundargs *args = rqstp->rq_argp; + + if (rqstp->rq_arg.head[0].iov_len % 4) { + /* client is nuts */ + dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", + __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); + return 0; + } + args->p = p; + args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; + args->pagelist = rqstp->rq_arg.pages; + args->pagelen = rqstp->rq_arg.page_len; + args->tail = false; + args->tmpp = NULL; + args->to_free = NULL; + args->ops = args->iops; + args->rqstp = rqstp; + + return !nfsd4_decode_compound(args); +} + +int +nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_buf *buf = resp->xdr.buf; + + WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + + buf->tail[0].iov_len); + + *p = resp->cstate.status; + + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + + p = resp->tagp; + *p++ = htonl(resp->taglen); + memcpy(p, resp->tag, resp->taglen); + p += XDR_QUADLEN(resp->taglen); + *p++ = htonl(resp->opcnt); + + nfsd4_sequence_done(resp); + return 1; +} + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c new file mode 100644 index 000000000..80c90fc23 --- /dev/null +++ b/fs/nfsd/nfscache.c @@ -0,0 +1,609 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Request reply cache. This is currently a global cache, but this may + * change in the future and be a per-client cache. + * + * This code is heavily inspired by the 44BSD implementation, although + * it does things a bit differently. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/sunrpc/svc_xprt.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sunrpc/addr.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/hash.h> +#include <net/checksum.h> + +#include "nfsd.h" +#include "cache.h" +#include "trace.h" + +/* + * We use this value to determine the number of hash buckets from the max + * cache size, the idea being that when the cache is at its maximum number + * of entries, then this should be the average number of entries per bucket. + */ +#define TARGET_BUCKET_SIZE 64 + +struct nfsd_drc_bucket { + struct rb_root rb_head; + struct list_head lru_head; + spinlock_t cache_lock; +}; + +static struct kmem_cache *drc_slab; + +static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); +static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); + +/* + * Put a cap on the size of the DRC based on the amount of available + * low memory in the machine. + * + * 64MB: 8192 + * 128MB: 11585 + * 256MB: 16384 + * 512MB: 23170 + * 1GB: 32768 + * 2GB: 46340 + * 4GB: 65536 + * 8GB: 92681 + * 16GB: 131072 + * + * ...with a hard cap of 256k entries. In the worst case, each entry will be + * ~1k, so the above numbers should give a rough max of the amount of memory + * used in k. + * + * XXX: these limits are per-container, so memory used will increase + * linearly with number of containers. Maybe that's OK. + */ +static unsigned int +nfsd_cache_size_limit(void) +{ + unsigned int limit; + unsigned long low_pages = totalram_pages() - totalhigh_pages(); + + limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); + return min_t(unsigned int, limit, 256*1024); +} + +/* + * Compute the number of hash buckets we need. Divide the max cachesize by + * the "target" max bucket size, and round up to next power of two. + */ +static unsigned int +nfsd_hashsize(unsigned int limit) +{ + return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); +} + +static u32 +nfsd_cache_hash(__be32 xid, struct nfsd_net *nn) +{ + return hash_32(be32_to_cpu(xid), nn->maskbits); +} + +static struct svc_cacherep * +nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) +{ + struct svc_cacherep *rp; + + rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); + if (rp) { + rp->c_state = RC_UNUSED; + rp->c_type = RC_NOCACHE; + RB_CLEAR_NODE(&rp->c_node); + INIT_LIST_HEAD(&rp->c_lru); + + memset(&rp->c_key, 0, sizeof(rp->c_key)); + rp->c_key.k_xid = rqstp->rq_xid; + rp->c_key.k_proc = rqstp->rq_proc; + rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_key.k_prot = rqstp->rq_prot; + rp->c_key.k_vers = rqstp->rq_vers; + rp->c_key.k_len = rqstp->rq_arg.len; + rp->c_key.k_csum = csum; + } + return rp; +} + +static void +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, + struct nfsd_net *nn) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { + nn->drc_mem_usage -= rp->c_replvec.iov_len; + kfree(rp->c_replvec.iov_base); + } + if (rp->c_state != RC_UNUSED) { + rb_erase(&rp->c_node, &b->rb_head); + list_del(&rp->c_lru); + atomic_dec(&nn->num_drc_entries); + nn->drc_mem_usage -= sizeof(*rp); + } + kmem_cache_free(drc_slab, rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, + struct nfsd_net *nn) +{ + spin_lock(&b->cache_lock); + nfsd_reply_cache_free_locked(b, rp, nn); + spin_unlock(&b->cache_lock); +} + +int nfsd_drc_slab_create(void) +{ + drc_slab = kmem_cache_create("nfsd_drc", + sizeof(struct svc_cacherep), 0, 0, NULL); + return drc_slab ? 0: -ENOMEM; +} + +void nfsd_drc_slab_free(void) +{ + kmem_cache_destroy(drc_slab); +} + +int nfsd_reply_cache_init(struct nfsd_net *nn) +{ + unsigned int hashsize; + unsigned int i; + int status = 0; + + nn->max_drc_entries = nfsd_cache_size_limit(); + atomic_set(&nn->num_drc_entries, 0); + hashsize = nfsd_hashsize(nn->max_drc_entries); + nn->maskbits = ilog2(hashsize); + + nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker.seeks = 1; + status = register_shrinker(&nn->nfsd_reply_cache_shrinker); + if (status) + goto out_nomem; + + nn->drc_hashtbl = kvzalloc(array_size(hashsize, + sizeof(*nn->drc_hashtbl)), GFP_KERNEL); + if (!nn->drc_hashtbl) + goto out_shrinker; + + for (i = 0; i < hashsize; i++) { + INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); + spin_lock_init(&nn->drc_hashtbl[i].cache_lock); + } + nn->drc_hashsize = hashsize; + + return 0; +out_shrinker: + unregister_shrinker(&nn->nfsd_reply_cache_shrinker); +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + return -ENOMEM; +} + +void nfsd_reply_cache_shutdown(struct nfsd_net *nn) +{ + struct svc_cacherep *rp; + unsigned int i; + + unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + + for (i = 0; i < nn->drc_hashsize; i++) { + struct list_head *head = &nn->drc_hashtbl[i].lru_head; + while (!list_empty(head)) { + rp = list_first_entry(head, struct svc_cacherep, c_lru); + nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], + rp, nn); + } + } + + kvfree(nn->drc_hashtbl); + nn->drc_hashtbl = NULL; + nn->drc_hashsize = 0; + +} + +/* + * Move cache entry to end of LRU list, and queue the cleaner to run if it's + * not already scheduled. + */ +static void +lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +{ + rp->c_timestamp = jiffies; + list_move_tail(&rp->c_lru, &b->lru_head); +} + +static long +prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn) +{ + struct svc_cacherep *rp, *tmp; + long freed = 0; + + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && + time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) + break; + nfsd_reply_cache_free_locked(b, rp, nn); + freed++; + } + return freed; +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static long +prune_cache_entries(struct nfsd_net *nn) +{ + unsigned int i; + long freed = 0; + + for (i = 0; i < nn->drc_hashsize; i++) { + struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; + + if (list_empty(&b->lru_head)) + continue; + spin_lock(&b->cache_lock); + freed += prune_bucket(b, nn); + spin_unlock(&b->cache_lock); + } + return freed; +} + +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + + return atomic_read(&nn->num_drc_entries); +} + +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + + return prune_cache_entries(nn); +} +/* + * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + */ +static __wsum +nfsd_cache_csum(struct svc_rqst *rqstp) +{ + int idx; + unsigned int base; + __wsum csum; + struct xdr_buf *buf = &rqstp->rq_arg; + const unsigned char *p = buf->head[0].iov_base; + size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, + RC_CSUMLEN); + size_t len = min(buf->head[0].iov_len, csum_len); + + /* rq_arg.head first */ + csum = csum_partial(p, len, 0); + csum_len -= len; + + /* Continue into page array */ + idx = buf->page_base / PAGE_SIZE; + base = buf->page_base & ~PAGE_MASK; + while (csum_len) { + p = page_address(buf->pages[idx]) + base; + len = min_t(size_t, PAGE_SIZE - base, csum_len); + csum = csum_partial(p, len, csum); + csum_len -= len; + base = 0; + ++idx; + } + return csum; +} + +static int +nfsd_cache_key_cmp(const struct svc_cacherep *key, + const struct svc_cacherep *rp, struct nfsd_net *nn) +{ + if (key->c_key.k_xid == rp->c_key.k_xid && + key->c_key.k_csum != rp->c_key.k_csum) { + ++nn->payload_misses; + trace_nfsd_drc_mismatch(nn, key, rp); + } + + return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key)); +} + +/* + * Search the request hash for an entry that matches the given rqstp. + * Must be called with cache_lock held. Returns the found entry or + * inserts an empty key on failure. + */ +static struct svc_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, + struct nfsd_net *nn) +{ + struct svc_cacherep *rp, *ret = key; + struct rb_node **p = &b->rb_head.rb_node, + *parent = NULL; + unsigned int entries = 0; + int cmp; + + while (*p != NULL) { + ++entries; + parent = *p; + rp = rb_entry(parent, struct svc_cacherep, c_node); + + cmp = nfsd_cache_key_cmp(key, rp, nn); + if (cmp < 0) + p = &parent->rb_left; + else if (cmp > 0) + p = &parent->rb_right; + else { + ret = rp; + goto out; + } + } + rb_link_node(&key->c_node, parent, p); + rb_insert_color(&key->c_node, &b->rb_head); +out: + /* tally hash chain length stats */ + if (entries > nn->longest_chain) { + nn->longest_chain = entries; + nn->longest_chain_cachesize = atomic_read(&nn->num_drc_entries); + } else if (entries == nn->longest_chain) { + /* prefer to keep the smallest cachesize possible here */ + nn->longest_chain_cachesize = min_t(unsigned int, + nn->longest_chain_cachesize, + atomic_read(&nn->num_drc_entries)); + } + + lru_put_end(b, ret); + return ret; +} + +/** + * nfsd_cache_lookup - Find an entry in the duplicate reply cache + * @rqstp: Incoming Call to find + * + * Try to find an entry matching the current call in the cache. When none + * is found, we try to grab the oldest expired entry off the LRU list. If + * a suitable one isn't there, then drop the cache_lock and allocate a + * new one, then search again in case one got inserted while this thread + * didn't hold the lock. + * + * Return values: + * %RC_DOIT: Process the request normally + * %RC_REPLY: Reply from cache + * %RC_DROPIT: Do not process the request further + */ +int nfsd_cache_lookup(struct svc_rqst *rqstp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct svc_cacherep *rp, *found; + __be32 xid = rqstp->rq_xid; + __wsum csum; + u32 hash = nfsd_cache_hash(xid, nn); + struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash]; + int type = rqstp->rq_cachetype; + int rtn = RC_DOIT; + + rqstp->rq_cacherep = NULL; + if (type == RC_NOCACHE) { + nfsdstats.rcnocache++; + goto out; + } + + csum = nfsd_cache_csum(rqstp); + + /* + * Since the common case is a cache miss followed by an insert, + * preallocate an entry. + */ + rp = nfsd_reply_cache_alloc(rqstp, csum, nn); + if (!rp) + goto out; + + spin_lock(&b->cache_lock); + found = nfsd_cache_insert(b, rp, nn); + if (found != rp) { + nfsd_reply_cache_free_locked(NULL, rp, nn); + rp = found; + goto found_entry; + } + + nfsdstats.rcmisses++; + rqstp->rq_cacherep = rp; + rp->c_state = RC_INPROG; + + atomic_inc(&nn->num_drc_entries); + nn->drc_mem_usage += sizeof(*rp); + + /* go ahead and prune the cache */ + prune_bucket(b, nn); + +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; + +found_entry: + /* We found a matching entry which is either in progress or done. */ + nfsdstats.rchits++; + rtn = RC_DROPIT; + + /* Request being processed */ + if (rp->c_state == RC_INPROG) + goto out_trace; + + /* From the hall of fame of impractical attacks: + * Is this a user who tries to snoop on the cache? */ + rtn = RC_DOIT; + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) + goto out_trace; + + /* Compose RPC reply header */ + switch (rp->c_type) { + case RC_NOCACHE: + break; + case RC_REPLSTAT: + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + rtn = RC_REPLY; + break; + case RC_REPLBUFF: + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) + goto out_unlock; /* should not happen */ + rtn = RC_REPLY; + break; + default: + WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type); + } + +out_trace: + trace_nfsd_drc_found(nn, rqstp, rtn); + goto out_unlock; +} + +/** + * nfsd_cache_update - Update an entry in the duplicate reply cache. + * @rqstp: svc_rqst with a finished Reply + * @cachetype: which cache to update + * @statp: Reply's status code + * + * This is called from nfsd_dispatch when the procedure has been + * executed and the complete reply is in rqstp->rq_res. + * + * We're copying around data here rather than swapping buffers because + * the toplevel loop requires max-sized buffers, which would be a waste + * of memory for a cache with a max reply size of 100 bytes (diropokres). + * + * If we should start to use different types of cache entries tailored + * specifically for attrstat and fh's, we may save even more space. + * + * Also note that a cachetype of RC_NOCACHE can legally be passed when + * nfsd failed to encode a reply that otherwise would have been cached. + * In this case, nfsd_cache_update is called with statp == NULL. + */ +void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct svc_cacherep *rp = rqstp->rq_cacherep; + struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + u32 hash; + struct nfsd_drc_bucket *b; + int len; + size_t bufsize = 0; + + if (!rp) + return; + + hash = nfsd_cache_hash(rp->c_key.k_xid, nn); + b = &nn->drc_hashtbl[hash]; + + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); + len >>= 2; + + /* Don't cache excessive amounts of data and XDR failures */ + if (!statp || len > (256 >> 2)) { + nfsd_reply_cache_free(b, rp, nn); + return; + } + + switch (cachetype) { + case RC_REPLSTAT: + if (len != 1) + printk("nfsd: RC_REPLSTAT/reply len %d!\n",len); + rp->c_replstat = *statp; + break; + case RC_REPLBUFF: + cachv = &rp->c_replvec; + bufsize = len << 2; + cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); + if (!cachv->iov_base) { + nfsd_reply_cache_free(b, rp, nn); + return; + } + cachv->iov_len = bufsize; + memcpy(cachv->iov_base, statp, bufsize); + break; + case RC_NOCACHE: + nfsd_reply_cache_free(b, rp, nn); + return; + } + spin_lock(&b->cache_lock); + nn->drc_mem_usage += bufsize; + lru_put_end(b, rp); + rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); + rp->c_type = cachetype; + rp->c_state = RC_DONE; + spin_unlock(&b->cache_lock); + return; +} + +/* + * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... + */ +static int +nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) +{ + struct kvec *vec = &rqstp->rq_res.head[0]; + + if (vec->iov_len + data->iov_len > PAGE_SIZE) { + printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", + data->iov_len); + return 0; + } + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; + return 1; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +{ + struct nfsd_net *nn = m->private; + + seq_printf(m, "max entries: %u\n", nn->max_drc_entries); + seq_printf(m, "num entries: %u\n", + atomic_read(&nn->num_drc_entries)); + seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); + seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage); + seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); + seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); + seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); + seq_printf(m, "payload misses: %u\n", nn->payload_misses); + seq_printf(m, "longest chain len: %u\n", nn->longest_chain); + seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); + return 0; +} + +int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) +{ + struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info, + nfsd_net_id); + + return single_open(file, nfsd_reply_cache_stats_show, nn); +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c new file mode 100644 index 000000000..c4b11560a --- /dev/null +++ b/fs/nfsd/nfsctl.c @@ -0,0 +1,1578 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Syscall interface to knfsd. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/fs_context.h> + +#include <linux/sunrpc/svcsock.h> +#include <linux/lockd/lockd.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/gss_krb5_enctypes.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/module.h> +#include <linux/fsnotify.h> + +#include "idmap.h" +#include "nfsd.h" +#include "cache.h" +#include "state.h" +#include "netns.h" +#include "pnfs.h" + +/* + * We have a single directory with several nodes in it. + */ +enum { + NFSD_Root = 1, + NFSD_List, + NFSD_Export_features, + NFSD_Fh, + NFSD_FO_UnlockIP, + NFSD_FO_UnlockFS, + NFSD_Threads, + NFSD_Pool_Threads, + NFSD_Pool_Stats, + NFSD_Reply_Cache_Stats, + NFSD_Versions, + NFSD_Ports, + NFSD_MaxBlkSize, + NFSD_MaxConnections, + NFSD_SupportedEnctypes, + /* + * The below MUST come last. Otherwise we leave a hole in nfsd_files[] + * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops + */ +#ifdef CONFIG_NFSD_V4 + NFSD_Leasetime, + NFSD_Gracetime, + NFSD_RecoveryDir, + NFSD_V4EndGrace, +#endif + NFSD_MaxReserved +}; + +/* + * write() for these nodes. + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); +static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); +static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); +static ssize_t write_maxconn(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_V4 +static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); +#endif + +static ssize_t (*const write_op[])(struct file *, char *, size_t) = { + [NFSD_Fh] = write_filehandle, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, + [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, + [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, + [NFSD_MaxBlkSize] = write_maxblksize, + [NFSD_MaxConnections] = write_maxconn, +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = write_leasetime, + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + [NFSD_V4EndGrace] = write_v4_end_grace, +#endif +}; + +static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +{ + ino_t ino = file_inode(file)->i_ino; + char *data; + ssize_t rv; + + if (ino >= ARRAY_SIZE(write_op) || !write_op[ino]) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + rv = write_op[ino](file, data, size); + if (rv >= 0) { + simple_transaction_set(file, rv); + rv = size; + } + return rv; +} + +static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) +{ + if (! file->private_data) { + /* An attempt to read a transaction file without writing + * causes a 0-byte write so that the file can return + * state information + */ + ssize_t rv = nfsctl_transaction_write(file, buf, 0, pos); + if (rv < 0) + return rv; + } + return simple_transaction_read(file, buf, size, pos); +} + +static const struct file_operations transaction_ops = { + .write = nfsctl_transaction_write, + .read = nfsctl_transaction_read, + .release = simple_transaction_release, + .llseek = default_llseek, +}; + +static int exports_net_open(struct net *net, struct file *file) +{ + int err; + struct seq_file *seq; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + err = seq_open(file, &nfs_exports_op); + if (err) + return err; + + seq = file->private_data; + seq->private = nn->svc_export_cache; + return 0; +} + +static int exports_proc_open(struct inode *inode, struct file *file) +{ + return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct proc_ops exports_proc_ops = { + .proc_open = exports_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int exports_nfsd_open(struct inode *inode, struct file *file) +{ + return exports_net_open(inode->i_sb->s_fs_info, file); +} + +static const struct file_operations exports_nfsd_operations = { + .open = exports_nfsd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static const struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) +static int supported_enctypes_show(struct seq_file *m, void *v) +{ + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); + return 0; +} + +static int supported_enctypes_open(struct inode *inode, struct file *file) +{ + return single_open(file, supported_enctypes_show, NULL); +} + +static const struct file_operations supported_enctypes_ops = { + .open = supported_enctypes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ + +static const struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = nfsd_pool_stats_release, +}; + +static const struct file_operations reply_cache_stats_operations = { + .open = nfsd_reply_cache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/*----------------------------------------------------------------------------*/ +/* + * payload - write methods + */ + +static inline struct net *netns(struct file *file) +{ + return file_inode(file)->i_sb->s_fs_info; +} + +/* + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IP address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) +{ + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); + char *fo_path; + struct net *net = netns(file); + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + if (rpc_pton(net, fo_path, size, sap, salen) == 0) + return -EINVAL; + + return nlmsvc_unlock_all_by_ip(sap); +} + +/* + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) +{ + struct path path; + char *fo_path; + int error; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + error = kern_path(fo_path, 0, &path); + if (error) + return error; + + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ + error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + + path_put(&path); + return error; +} + +/* + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. + * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ +static ssize_t write_filehandle(struct file *file, char *buf, size_t size) +{ + char *dname, *path; + int maxsize; + char *mesg = buf; + int len; + struct auth_domain *dom; + struct knfsd_fh fh; + + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + dname = mesg; + len = qword_get(&mesg, dname, size); + if (len <= 0) + return -EINVAL; + + path = dname+len+1; + len = qword_get(&mesg, path, size); + if (len <= 0) + return -EINVAL; + + len = get_int(&mesg, &maxsize); + if (len) + return len; + + if (maxsize < NFS_FHSIZE) + return -EINVAL; + maxsize = min(maxsize, NFS3_FHSIZE); + + if (qword_get(&mesg, mesg, size)>0) + return -EINVAL; + + /* we have all the words, they are in buf.. */ + dom = unix_domain_find(dname); + if (!dom) + return -ENOMEM; + + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + auth_domain_put(dom); + if (len) + return len; + + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; + qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); + mesg[-1] = '\n'; + return mesg - buf; +} + +/* + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_threads(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + int rv; + struct net *net = netns(file); + + if (size > 0) { + int newthreads; + rv = get_int(&mesg, &newthreads); + if (rv) + return rv; + if (newthreads < 0) + return -EINVAL; + rv = nfsd_svc(newthreads, net, file->f_cred); + if (rv < 0) + return rv; + } else + rv = nfsd_nrthreads(net); + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); +} + +/* + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools; + int *nthreads; + struct net *net = netns(file); + + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(net); + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + mutex_unlock(&nfsd_mutex); + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; + if (nthreads == NULL) + goto out_free; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads, net); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads, net); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + rv = mesg - buf; +out_free: + kfree(nthreads); + mutex_unlock(&nfsd_mutex); + return rv; +} + +static ssize_t +nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining, + const char *sep, unsigned vers, int minor) +{ + const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u"; + bool supported = !!nfsd_vers(nn, vers, NFSD_TEST); + + if (vers == 4 && minor >= 0 && + !nfsd_minorversion(nn, minor, NFSD_TEST)) + supported = false; + if (minor == 0 && supported) + /* + * special case for backward compatability. + * +4.0 is never reported, it is implied by + * +4, unless -4.0 is present. + */ + return 0; + return snprintf(buf, remaining, format, sep, + supported ? '+' : '-', vers, minor); +} + +static ssize_t __write_versions(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *vers, *minorp, sign; + int len, num, remaining; + ssize_t tlen = 0; + char *sep; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size>0) { + if (nn->nfsd_serv) + /* Cannot change versions without updating + * nn->nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ + return -EBUSY; + if (buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + vers = mesg; + len = qword_get(&mesg, vers, size); + if (len <= 0) return -EINVAL; + do { + enum vers_op cmd; + unsigned minor; + sign = *vers; + if (sign == '+' || sign == '-') + num = simple_strtol((vers+1), &minorp, 0); + else + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num != 4) + return -EINVAL; + if (kstrtouint(minorp+1, 0, &minor) < 0) + return -EINVAL; + } + + cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; + switch(num) { + case 2: + case 3: + nfsd_vers(nn, num, cmd); + break; + case 4: + if (*minorp == '.') { + if (nfsd_minorversion(nn, minor, cmd) < 0) + return -EINVAL; + } else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) { + /* + * Either we have +4 and no minors are enabled, + * or we have -4 and at least one minor is enabled. + * In either case, propagate 'cmd' to all minors. + */ + minor = 0; + while (nfsd_minorversion(nn, minor, cmd) >= 0) + minor++; + } + break; + default: + return -EINVAL; + } + vers += len + 1; + } while ((len = qword_get(&mesg, vers, size)) > 0); + /* If all get turned off, turn them back on, as + * having no versions is BAD + */ + nfsd_reset_versions(nn); + } + + /* Now write current state into reply buffer */ + len = 0; + sep = ""; + remaining = SIMPLE_TRANSACTION_LIMIT; + for (num=2 ; num <= 4 ; num++) { + int minor; + if (!nfsd_vers(nn, num, NFSD_AVAIL)) + continue; + + minor = -1; + do { + len = nfsd_print_version_support(nn, buf, remaining, + sep, num, minor); + if (len >= remaining) + goto out; + remaining -= len; + buf += len; + tlen += len; + minor++; + if (len) + sep = " "; + } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); + } +out: + len = snprintf(buf, remaining, "\n"); + if (len >= remaining) + return -EINVAL; + return tlen + len; +} + +/* + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * Zero-length write. Return a list of NFSD's current listener + * transports. + */ +static ssize_t __write_ports_names(char *buf, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) + return 0; + return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); +} + +/* + * A single 'fd' number was written, in which case it must be for + * a socket of a supported family/protocol, and we use it as an + * nfsd listener. + */ +static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred) +{ + char *mesg = buf; + int fd, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + err = get_int(&mesg, &fd); + if (err != 0 || fd < 0) + return -EINVAL; + + if (svc_alien_sock(net, fd)) { + printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); + return -EINVAL; + } + + err = nfsd_create_serv(net); + if (err != 0) + return err; + + err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); + if (err < 0) { + nfsd_destroy(net); + return err; + } + + /* Decrease the count, but don't shut down the service */ + nn->nfsd_serv->sv_nrthreads--; + return err; +} + +/* + * A transport listener is added by writing it's transport name and + * a port number. + */ +static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) +{ + char transport[16]; + struct svc_xprt *xprt; + int port, err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (sscanf(buf, "%15s %5u", transport, &port) != 2) + return -EINVAL; + + if (port < 1 || port > USHRT_MAX) + return -EINVAL; + + err = nfsd_create_serv(net); + if (err != 0) + return err; + + err = svc_create_xprt(nn->nfsd_serv, transport, net, + PF_INET, port, SVC_SOCK_ANONYMOUS, cred); + if (err < 0) + goto out_err; + + err = svc_create_xprt(nn->nfsd_serv, transport, net, + PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; + + /* Decrease the count, but don't shut down the service */ + nn->nfsd_serv->sv_nrthreads--; + return 0; +out_close: + xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); + return err; +} + +static ssize_t __write_ports(struct file *file, char *buf, size_t size, + struct net *net) +{ + if (size == 0) + return __write_ports_names(buf, net); + + if (isdigit(buf[0])) + return __write_ports_addfd(buf, net, file->f_cred); + + if (isalpha(buf[0])) + return __write_ports_addxprt(buf, net, file->f_cred); + + return -EINVAL; +} + +/* + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener; listen(3) + * must be called for a SOCK_STREAM + * socket, otherwise it is ignored + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + */ +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size, netns(file)); + mutex_unlock(&nfsd_mutex); + return rv; +} + + +int nfsd_max_blksize; + +/* + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size > 0) { + int bsize; + int rv = get_int(&mesg, &bsize); + if (rv) + return rv; + /* force bsize into allowed range and + * required alignment. + */ + bsize = max_t(int, bsize, 1024); + bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE); + bsize &= ~(1024-1); + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + nfsd_max_blksize = bsize; + mutex_unlock(&nfsd_mutex); + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", + nfsd_max_blksize); +} + +/* + * write_maxconn - Set or report the current max number of connections + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of max_connections setting + * for this net namespace; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_maxconn(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + unsigned int maxconn = nn->max_connections; + + if (size > 0) { + int rv = get_uint(&mesg, &maxconn); + + if (rv) + return rv; + nn->max_connections = maxconn; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); +} + +#ifdef CONFIG_NFSD_V4 +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, + time64_t *time, struct nfsd_net *nn) +{ + char *mesg = buf; + int rv, i; + + if (size > 0) { + if (nn->nfsd_serv) + return -EBUSY; + rv = get_int(&mesg, &i); + if (rv) + return rv; + /* + * Some sanity checking. We don't have a reason for + * these particular numbers, but problems with the + * extremes are: + * - Too short: the briefest network outage may + * cause clients to lose all their locks. Also, + * the frequent polling may be wasteful. + * - Too long: do you really want reboot recovery + * to take more than an hour? Or to make other + * clients wait an hour before being able to + * revoke a dead client's locks? + */ + if (i < 10 || i > 3600) + return -EINVAL; + *time = i; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%lld\n", *time); +} + +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, + time64_t *time, struct nfsd_net *nn) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __nfsd4_write_time(file, buf, size, time, nn); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); +} + +/* + * write_gracetime - Set or report current NFSv4 grace period time + * + * As above, but sets the time of the NFSv4 grace period. + * + * Note this should never be set to less than the *previous* + * lease-period time, but we don't try to enforce this. (In the common + * case (a new boot), we don't know what the previous lease time was + * anyway.) + */ +static ssize_t write_gracetime(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); +} + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, + struct nfsd_net *nn) +{ + char *mesg = buf; + char *recdir; + int len, status; + + if (size > 0) { + if (nn->nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; + + status = nfs4_reset_recoverydir(recdir); + if (status) + return status; + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", + nfs4_recoverydir()); +} + +/* + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size, nn); + mutex_unlock(&nfsd_mutex); + return rv; +} + +/* + * write_v4_end_grace - release grace period for nfsd's v4.x lock manager + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: any value + * size: non-zero length of C string in @buf + * Output: + * passed-in buffer filled with "Y" or "N" with a newline + * and NULL-terminated C string. This indicates whether + * the grace period has ended in the current net + * namespace. Return code is the size in bytes of the + * string. Writing a string that starts with 'Y', 'y', or + * '1' to the file will end the grace period for nfsd's v4 + * lock manager. + */ +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) +{ + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); + + if (size > 0) { + switch(buf[0]) { + case 'Y': + case 'y': + case '1': + if (!nn->nfsd_serv) + return -EBUSY; + nfsd4_end_grace(nn); + break; + default: + return -EINVAL; + } + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", + nn->grace_ended ? 'Y' : 'N'); +} + +#endif + +/*----------------------------------------------------------------------------*/ +/* + * populating the filesystem. + */ + +/* Basically copying rpc_get_inode. */ +static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + /* Following advice from simple_fill_super documentation: */ + inode->i_ino = iunique(sb, NFSD_MaxReserved); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + default: + break; + } + return inode; +} + +static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) +{ + struct inode *inode; + + inode = nfsd_get_inode(dir->i_sb, mode); + if (!inode) + return -ENOMEM; + if (ncl) { + inode->i_private = ncl; + kref_get(&ncl->cl_ref); + } + d_add(dentry, inode); + inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + return 0; +} + +static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + int ret = -ENOMEM; + + inode_lock(dir); + dentry = d_alloc_name(parent, name); + if (!dentry) + goto out_err; + ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); + if (ret) + goto out_err; +out: + inode_unlock(dir); + return dentry; +out_err: + dput(dentry); + dentry = ERR_PTR(ret); + goto out; +} + +static void clear_ncl(struct inode *inode) +{ + struct nfsdfs_client *ncl = inode->i_private; + + inode->i_private = NULL; + kref_put(&ncl->cl_ref, ncl->cl_release); +} + +static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) +{ + struct nfsdfs_client *nc = inode->i_private; + + if (nc) + kref_get(&nc->cl_ref); + return nc; +} + +struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) +{ + struct nfsdfs_client *nc; + + inode_lock_shared(inode); + nc = __get_nfsdfs_client(inode); + inode_unlock_shared(inode); + return nc; +} +/* from __rpc_unlink */ +static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) +{ + int ret; + + clear_ncl(d_inode(dentry)); + dget(dentry); + ret = simple_unlink(dir, dentry); + d_drop(dentry); + fsnotify_unlink(dir, dentry); + dput(dentry); + WARN_ON_ONCE(ret); +} + +static void nfsdfs_remove_files(struct dentry *root) +{ + struct dentry *dentry, *tmp; + + list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) { + if (!simple_positive(dentry)) { + WARN_ON_ONCE(1); /* I think this can't happen? */ + continue; + } + nfsdfs_remove_file(d_inode(root), dentry); + } +} + +/* XXX: cut'n'paste from simple_fill_super; figure out if we could share + * code instead. */ +static int nfsdfs_create_files(struct dentry *root, + const struct tree_descr *files) +{ + struct inode *dir = d_inode(root); + struct inode *inode; + struct dentry *dentry; + int i; + + inode_lock(dir); + for (i = 0; files->name && files->name[0]; i++, files++) { + if (!files->name) + continue; + dentry = d_alloc_name(root, files->name); + if (!dentry) + goto out; + inode = nfsd_get_inode(d_inode(root)->i_sb, + S_IFREG | files->mode); + if (!inode) { + dput(dentry); + goto out; + } + inode->i_fop = files->ops; + inode->i_private = __get_nfsdfs_client(dir); + d_add(dentry, inode); + fsnotify_create(dir, dentry); + } + inode_unlock(dir); + return 0; +out: + nfsdfs_remove_files(root); + inode_unlock(dir); + return -ENOMEM; +} + +/* on success, returns positive number unique to that client. */ +struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *files) +{ + struct dentry *dentry; + char name[11]; + int ret; + + sprintf(name, "%u", id); + + dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); + if (IS_ERR(dentry)) /* XXX: tossing errors? */ + return NULL; + ret = nfsdfs_create_files(dentry, files); + if (ret) { + nfsd_client_rmdir(dentry); + return NULL; + } + return dentry; +} + +/* Taken from __rpc_rmdir: */ +void nfsd_client_rmdir(struct dentry *dentry) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = d_inode(dentry); + int ret; + + inode_lock(dir); + nfsdfs_remove_files(dentry); + clear_ncl(inode); + dget(dentry); + ret = simple_rmdir(dir, dentry); + WARN_ON_ONCE(ret); + d_drop(dentry); + fsnotify_rmdir(dir, dentry); + dput(dentry); + inode_unlock(dir); +} + +static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + struct dentry *dentry; + int ret; + + static const struct tree_descr nfsd_files[] = { + [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, + [NFSD_FO_UnlockIP] = {"unlock_ip", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_FO_UnlockFS] = {"unlock_filesystem", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, + [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ +#ifdef CONFIG_NFSD_V4 + [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, +#endif + /* last one */ {""} + }; + + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); + if (ret) + return ret; + dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + nn->nfsd_client_dir = dentry; + return 0; +} + +static int nfsd_fs_get_tree(struct fs_context *fc) +{ + return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); +} + +static void nfsd_fs_free_fc(struct fs_context *fc) +{ + if (fc->s_fs_info) + put_net(fc->s_fs_info); +} + +static const struct fs_context_operations nfsd_fs_context_ops = { + .free = nfsd_fs_free_fc, + .get_tree = nfsd_fs_get_tree, +}; + +static int nfsd_init_fs_context(struct fs_context *fc) +{ + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(fc->net_ns->user_ns); + fc->ops = &nfsd_fs_context_ops; + return 0; +} + +static void nfsd_umount(struct super_block *sb) +{ + struct net *net = sb->s_fs_info; + + nfsd_shutdown_threads(net); + + kill_litter_super(sb); + put_net(net); +} + +static struct file_system_type nfsd_fs_type = { + .owner = THIS_MODULE, + .name = "nfsd", + .init_fs_context = nfsd_init_fs_context, + .kill_sb = nfsd_umount, +}; +MODULE_ALIAS_FS("nfsd"); + +#ifdef CONFIG_PROC_FS +static int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("exports", 0, entry, &exports_proc_ops); + if (!entry) { + remove_proc_entry("fs/nfs", NULL); + return -ENOMEM; + } + return 0; +} +#else /* CONFIG_PROC_FS */ +static int create_proc_exports_entry(void) +{ + return 0; +} +#endif + +unsigned int nfsd_net_id; + +static __net_init int nfsd_init_net(struct net *net) +{ + int retval; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + retval = nfsd_export_init(net); + if (retval) + goto out_export_error; + retval = nfsd_idmap_init(net); + if (retval) + goto out_idmap_error; + nn->nfsd_versions = NULL; + nn->nfsd4_minorversions = NULL; + retval = nfsd_reply_cache_init(nn); + if (retval) + goto out_drc_error; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; + nn->somebody_reclaimed = false; + nn->track_reclaim_completes = false; + nn->clverifier_counter = prandom_u32(); + nn->clientid_base = prandom_u32(); + nn->clientid_counter = nn->clientid_base + 1; + nn->s2s_cp_cl_id = nn->clientid_counter++; + + atomic_set(&nn->ntf_refcnt, 0); + init_waitqueue_head(&nn->ntf_wq); + seqlock_init(&nn->boot_lock); + + return 0; + +out_drc_error: + nfsd_idmap_shutdown(net); +out_idmap_error: + nfsd_export_shutdown(net); +out_export_error: + return retval; +} + +static __net_exit void nfsd_exit_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfsd_reply_cache_shutdown(nn); + nfsd_idmap_shutdown(net); + nfsd_export_shutdown(net); + nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); +} + +static struct pernet_operations nfsd_net_ops = { + .init = nfsd_init_net, + .exit = nfsd_exit_net, + .id = &nfsd_net_id, + .size = sizeof(struct nfsd_net), +}; + +static int __init init_nfsd(void) +{ + int retval; + printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); + + retval = nfsd4_init_slabs(); + if (retval) + return retval; + retval = nfsd4_init_pnfs(); + if (retval) + goto out_free_slabs; + nfsd_stat_init(); /* Statistics */ + retval = nfsd_drc_slab_create(); + if (retval) + goto out_free_stat; + nfsd_lockd_init(); /* lockd->nfsd callbacks */ + retval = create_proc_exports_entry(); + if (retval) + goto out_free_lockd; + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_exports; + retval = register_pernet_subsys(&nfsd_net_ops); + if (retval < 0) + goto out_free_filesystem; + retval = register_cld_notifier(); + if (retval) + goto out_free_all; + return 0; +out_free_all: + unregister_pernet_subsys(&nfsd_net_ops); +out_free_filesystem: + unregister_filesystem(&nfsd_fs_type); +out_free_exports: + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +out_free_lockd: + nfsd_lockd_shutdown(); + nfsd_drc_slab_free(); +out_free_stat: + nfsd_stat_shutdown(); + nfsd4_exit_pnfs(); +out_free_slabs: + nfsd4_free_slabs(); + return retval; +} + +static void __exit exit_nfsd(void) +{ + unregister_cld_notifier(); + unregister_pernet_subsys(&nfsd_net_ops); + nfsd_drc_slab_free(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); + nfsd_stat_shutdown(); + nfsd_lockd_shutdown(); + nfsd4_free_slabs(); + nfsd4_exit_pnfs(); + unregister_filesystem(&nfsd_fs_type); +} + +MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_LICENSE("GPL"); +module_init(init_nfsd) +module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 000000000..4362d295e --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,489 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include <linux/types.h> +#include <linux/mount.h> + +#include <linux/nfs.h> +#include <linux/nfs2.h> +#include <linux/nfs3.h> +#include <linux/nfs4.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/addr.h> + +#include <uapi/linux/nfsd/debug.h> + +#include "netns.h" +#include "stats.h" +#include "export.h" + +#undef ifdebug +#ifdef CONFIG_SUNRPC_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif + +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 2 +/* + * Maximum blocksizes supported by daemon under various circumstances. + */ +#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD +/* NFSv2 is limited by the protocol specification, see RFC 1094 */ +#define NFSSVC_MAXBLKSIZE_V2 (8*1024) + + +/* + * Largest number of bytes we need to allocate for an NFS + * call or reply. Used to control buffer sizes. We use + * the length of v3 WRITE, READDIR and READDIR replies + * which are an RPC header, up to 26 XDR units of reply + * data, and some page data. + * + * Note that accuracy here doesn't matter too much as the + * size is rounded up to a page size when allocating space. + */ +#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern const struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern struct mutex nfsd_mutex; +extern spinlock_t nfsd_drc_lock; +extern unsigned long nfsd_drc_max_mem; +extern unsigned long nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(int nrservs, struct net *net, const struct cred *cred); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(struct net *); +int nfsd_nrpools(struct net *); +int nfsd_get_nrthreads(int n, int *, struct net *); +int nfsd_set_nrthreads(int n, int *, struct net *); +int nfsd_pool_stats_open(struct inode *, struct file *); +int nfsd_pool_stats_release(struct inode *, struct file *); +void nfsd_shutdown_threads(struct net *net); + +void nfsd_destroy(struct net *net); + +bool i_am_nfsd(void); + +struct nfsdfs_client { + struct kref cl_ref; + void (*cl_release)(struct kref *kref); +}; + +struct nfsdfs_client *get_nfsdfs_client(struct inode *); +struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, + struct nfsdfs_client *ncl, u32 id, const struct tree_descr *); +void nfsd_client_rmdir(struct dentry *dentry); + + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern const struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern const struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +struct nfsd_net; + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change); +int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change); +void nfsd_reset_versions(struct nfsd_net *nn); +int nfsd_create_serv(struct net *net); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} +static inline struct user_namespace * +nfsd_user_namespace(const struct svc_rqst *rqstp) +{ + const struct cred *cred = rqstp->rq_xprt->xpt_cred; + return cred ? cred->user_ns : &init_user_ns; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned long max_delegations; +int nfsd4_init_slabs(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +int nfs4_state_start_net(struct net *net); +void nfs4_state_shutdown(void); +void nfs4_state_shutdown_net(struct net *net); +int nfs4_reset_recoverydir(char *recdir); +char * nfs4_recoverydir(void); +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); +#else +static inline int nfsd4_init_slabs(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline int nfs4_state_start_net(struct net *net) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline void nfs4_state_shutdown_net(struct net *net) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +static inline char * nfs4_recoverydir(void) {return NULL; } +static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + return false; +} +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) +#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) +#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) +#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) +#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) +#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN) +#define nfserr_xattr2big cpu_to_be32(NFS4ERR_XATTR2BIG) +#define nfserr_noxattr cpu_to_be32(NFS4ERR_NOXATTR) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ + +#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +/* 4.2 */ +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL +#else +#define NFSD4_2_SECURITY_ATTRS 0 +#endif + +#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_CHANGE_ATTR_TYPE | \ + FATTR4_WORD2_MODE_UMASK | \ + NFSD4_2_SECURITY_ATTRS | \ + FATTR4_WORD2_XATTR_SUPPORT) + +extern const u32 nfsd_suppattrs[3][3]; + +static inline __be32 nfsd4_set_netaddr(struct sockaddr *addr, + struct nfs42_netaddr *netaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + unsigned int port; + size_t ret_addr, ret_port; + + switch (addr->sa_family) { + case AF_INET: + port = ntohs(sin->sin_port); + sprintf(netaddr->netid, "tcp"); + netaddr->netid_len = 3; + break; + case AF_INET6: + port = ntohs(sin6->sin6_port); + sprintf(netaddr->netid, "tcp6"); + netaddr->netid_len = 4; + break; + default: + return nfserr_inval; + } + ret_addr = rpc_ntop(addr, netaddr->addr, sizeof(netaddr->addr)); + ret_port = snprintf(netaddr->addr + ret_addr, + RPCBIND_MAXUADDRLEN + 1 - ret_addr, + ".%u.%u", port >> 8, port & 0xff); + WARN_ON(ret_port >= RPCBIND_MAXUADDRLEN + 1 - ret_addr); + netaddr->addr_len = ret_addr + ret_port; + return 0; +} + +static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2) +{ + return !((bm1[0] & ~bm2[0]) || + (bm1[1] & ~bm2[1]) || + (bm1[2] & ~bm2[2])); +} + +static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) +{ + return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* + * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add + * a writeable attribute here without also adding code to parse it to + * nfsd4_decode_fattr(). + */ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ + (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + FATTR4_WORD2_SECURITY_LABEL +#else +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0 +#endif +#define NFSD_WRITEABLE_ATTRS_WORD2 \ + (FATTR4_WORD2_MODE_UMASK \ + | MAYBE_FATTR4_WORD2_SECURITY_LABEL) + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +extern int nfsd4_is_junction(struct dentry *dentry); +extern int register_cld_notifier(void); +extern void unregister_cld_notifier(void); +#else /* CONFIG_NFSD_V4 */ +static inline int nfsd4_is_junction(struct dentry *dentry) +{ + return 0; +} + +#define register_cld_notifier() 0 +#define unregister_cld_notifier() do { } while(0) + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c new file mode 100644 index 000000000..c81dbbad8 --- /dev/null +++ b/fs/nfsd/nfsfh.c @@ -0,0 +1,713 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NFS server file handle treatment. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + * Portions Copyright (C) 1999 G. Allen Morris III <gam3@acm.org> + * Extensive rewrite by Neil Brown <neilb@cse.unsw.edu.au> Southern-Spring 1999 + * ... and again Southern-Winter 2001 to support export_operations + */ + +#include <linux/exportfs.h> + +#include <linux/sunrpc/svcauth_gss.h> +#include "nfsd.h" +#include "vfs.h" +#include "auth.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + + +/* + * our acceptability function. + * if NOSUBTREECHECK, accept anything + * if not, require that we can walk up to exp->ex_dentry + * doing some checks on the 'x' bits + */ +static int nfsd_acceptable(void *expv, struct dentry *dentry) +{ + struct svc_export *exp = expv; + int rv; + struct dentry *tdentry; + struct dentry *parent; + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + return 1; + + tdentry = dget(dentry); + while (tdentry != exp->ex_path.dentry && !IS_ROOT(tdentry)) { + /* make sure parents give x permission to user */ + int err; + parent = dget_parent(tdentry); + err = inode_permission(d_inode(parent), MAY_EXEC); + if (err < 0) { + dput(parent); + break; + } + dput(tdentry); + tdentry = parent; + } + if (tdentry != exp->ex_path.dentry) + dprintk("nfsd_acceptable failed at %p %pd\n", tdentry, tdentry); + rv = (tdentry == exp->ex_path.dentry); + dput(tdentry); + return rv; +} + +/* Type check. The correct error return for type mismatches does not seem to be + * generally agreed upon. SunOS seems to use EISDIR if file isn't S_IFREG; a + * comment in the NFSv3 spec says this is incorrect (implementation notes for + * the write call). + */ +static inline __be32 +nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, + umode_t requested) +{ + umode_t mode = d_inode(dentry)->i_mode & S_IFMT; + + if (requested == 0) /* the caller doesn't care */ + return nfs_ok; + if (mode == requested) { + if (mode == S_IFDIR && !d_can_lookup(dentry)) { + WARN_ON_ONCE(1); + return nfserr_notdir; + } + return nfs_ok; + } + /* + * v4 has an error more specific than err_notdir which we should + * return in preference to err_notdir: + */ + if (rqstp->rq_vers == 4 && mode == S_IFLNK) + return nfserr_symlink; + if (requested == S_IFDIR) + return nfserr_notdir; + if (mode == S_IFDIR) + return nfserr_isdir; + return nfserr_inval; +} + +static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags) +{ + if (flags & NFSEXP_INSECURE_PORT) + return true; + /* We don't require gss requests to use low ports: */ + if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS) + return true; + return test_bit(RQ_SECURE, &rqstp->rq_flags); +} + +static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, + struct svc_export *exp) +{ + int flags = nfsexp_flags(rqstp, exp); + + /* Check if the request originated from a secure port. */ + if (!nfsd_originating_port_ok(rqstp, flags)) { + RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); + return nfserr_perm; + } + + /* Set user creds for this exportpoint */ + return nfserrno(nfsd_setuser(rqstp, exp)); +} + +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocl instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!d_is_dir(dentry) && + !d_is_symlink(dentry))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + +/* + * Use the given filehandle to look up the corresponding export and + * dentry. On success, the results are used to set fh_export and + * fh_dentry. + */ +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; + int fileid_type; + int data_left = fh->fh_size/4; + __be32 error; + + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + if (rqstp->rq_vers == 4 && fh->fh_size == 0) + return nfserr_nofilehandle; + + if (fh->fh_version == 1) { + int len; + + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + /* + * struct knfsd_fh uses host-endian fields, which are + * sometimes used to hold net-endian values. This + * confuses sparse, so we must use __force here to + * keep it from complaining. + */ + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), + ntohl((__force __be32)fh->fh_fsid[1]))); + fh->fh_fsid[1] = fh->fh_fsid[2]; + } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); + fid = (struct fid *)(fh->fh_fsid + len); + } else { + __u32 tfh[2]; + dev_t xdev; + ino_t xino; + + if (fh->fh_size != NFS_FHSIZE) + return error; + /* assume old filehandle format */ + xdev = old_decode_dev(fh->ofh_xdev); + xino = u32_to_ino_t(fh->ofh_xino); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + } + + error = nfserr_stale; + if (IS_ERR(exp)) { + trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp)); + + if (PTR_ERR(exp) == -ENOENT) + return error; + + return nfserrno(PTR_ERR(exp)); + } + + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. + */ + struct cred *new = prepare_creds(); + if (!new) { + error = nfserrno(-ENOMEM); + goto out; + } + new->cap_effective = + cap_raise_nfsd_set(new->cap_effective, + new->cap_permitted); + put_cred(override_creds(new)); + put_cred(new); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } + + /* + * Look up the dentry using the NFS file handle. + */ + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + + if (fh->fh_version != 1) { + sfid.i32.ino = fh->ofh_ino; + sfid.i32.gen = fh->ofh_generation; + sfid.i32.parent_ino = fh->ofh_dirino; + fid = &sfid; + data_left = 3; + if (fh->ofh_dirino == 0) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + } else + fileid_type = fh->fh_fileid_type; + + if (fileid_type == FILEID_ROOT) + dentry = dget(exp->ex_path.dentry); + else { + dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) + trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, + dentry ? PTR_ERR(dentry) : -ESTALE); + } + if (dentry == NULL) + goto out; + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) != -EINVAL) + error = nfserrno(PTR_ERR(dentry)); + goto out; + } + + if (d_is_dir(dentry) && + (dentry->d_flags & DCACHE_DISCONNECTED)) { + printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n", + dentry); + } + + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; +out: + exp_put(exp); + return error; +} + +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. + * + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. + * + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * fs/nfsd/vfs.h. + */ +__be32 +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 error; + + dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); + + if (!fhp->fh_dentry) { + error = nfsd_set_fh_dentry(rqstp, fhp); + if (error) + goto out; + } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). + * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + + error = nfsd_mode_check(rqstp, dentry, type); + if (error) + goto out; + + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; + +skip_pseudoflavor_check: + /* Finally, check access permissions. */ + error = nfsd_permission(rqstp, exp, dentry, access); + + if (error) { + dprintk("fh_verify: %pd2 permission failure, " + "acc=%x, error=%d\n", + dentry, + access, ntohl(error)); + } +out: + if (error == nfserr_stale) + nfsdstats.fh_stale++; + return error; +} + + +/* + * Compose a file handle for an NFS reply. + * + * Note that when first composed, the dentry may not yet have + * an inode. In this case a call to fh_update should be made + * before the fh goes out on the wire ... + */ +static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry) +{ + if (dentry != exp->ex_path.dentry) { + struct fid *fid = (struct fid *) + (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); + int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; + int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + + fhp->fh_handle.fh_fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fhp->fh_handle.fh_size += maxsize * 4; + } else { + fhp->fh_handle.fh_fileid_type = FILEID_ROOT; + } +} + +/* + * for composing old style file handles + */ +static inline void _fh_update_old(struct dentry *dentry, + struct svc_export *exp, + struct knfsd_fh *fh) +{ + fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); + fh->ofh_generation = d_inode(dentry)->i_generation; + if (d_is_dir(dentry) || + (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + fh->ofh_dirino = 0; +} + +static bool is_root_export(struct svc_export *exp) +{ + return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; +} + +static struct super_block *exp_sb(struct svc_export *exp) +{ + return exp->ex_path.dentry->d_sb; +} + +static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) +{ + switch (fsid_type) { + case FSID_DEV: + if (!old_valid_dev(exp_sb(exp)->s_dev)) + return false; + fallthrough; + case FSID_MAJOR_MINOR: + case FSID_ENCODE_DEV: + return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; + case FSID_NUM: + return exp->ex_flags & NFSEXP_FSID; + case FSID_UUID8: + case FSID_UUID16: + if (!is_root_export(exp)) + return false; + fallthrough; + case FSID_UUID4_INUM: + case FSID_UUID16_INUM: + return exp->ex_uuid != NULL; + } + return true; +} + + +static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh) +{ + u8 version; + u8 fsid_type; +retry: + version = 1; + if (ref_fh && ref_fh->fh_export == exp) { + version = ref_fh->fh_handle.fh_version; + fsid_type = ref_fh->fh_handle.fh_fsid_type; + + ref_fh = NULL; + + switch (version) { + case 0xca: + fsid_type = FSID_DEV; + break; + case 1: + break; + default: + goto retry; + } + + /* + * As the fsid -> filesystem mapping was guided by + * user-space, there is no guarantee that the filesystem + * actually supports that fsid type. If it doesn't we + * loop around again without ref_fh set. + */ + if (!fsid_type_ok_for_exp(fsid_type, exp)) + goto retry; + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; + } else if (exp->ex_uuid) { + if (fhp->fh_maxsize >= 64) { + if (is_root_export(exp)) + fsid_type = FSID_UUID16; + else + fsid_type = FSID_UUID16_INUM; + } else { + if (is_root_export(exp)) + fsid_type = FSID_UUID8; + else + fsid_type = FSID_UUID4_INUM; + } + } else if (!old_valid_dev(exp_sb(exp)->s_dev)) + /* for newer device numbers, we must use a newer fsid format */ + fsid_type = FSID_ENCODE_DEV; + else + fsid_type = FSID_DEV; + fhp->fh_handle.fh_version = version; + if (version) + fhp->fh_handle.fh_fsid_type = fsid_type; +} + +__be32 +fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, + struct svc_fh *ref_fh) +{ + /* ref_fh is a reference file handle. + * if it is non-null and for the same filesystem, then we should compose + * a filehandle which is of the same version, where possible. + * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca + * Then create a 32byte filehandle using nfs_fhbase_old + * + */ + + struct inode * inode = d_inode(dentry); + dev_t ex_dev = exp_sb(exp)->s_dev; + + dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", + MAJOR(ex_dev), MINOR(ex_dev), + (long) d_inode(exp->ex_path.dentry)->i_ino, + dentry, + (inode ? inode->i_ino : 0)); + + /* Choose filehandle version and fsid type based on + * the reference filehandle (if it is in the same export) + * or the export options. + */ + set_version_and_fsid_type(fhp, exp, ref_fh); + + if (ref_fh == fhp) + fh_put(ref_fh); + + if (fhp->fh_locked || fhp->fh_dentry) { + printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n", + dentry); + } + if (fhp->fh_maxsize < NFS_FHSIZE) + printk(KERN_ERR "fh_compose: called with maxsize %d! %pd2\n", + fhp->fh_maxsize, + dentry); + + fhp->fh_dentry = dget(dentry); /* our internal copy */ + fhp->fh_export = exp_get(exp); + + if (fhp->fh_handle.fh_version == 0xca) { + /* old style filehandle please */ + memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + fhp->fh_handle.ofh_dcookie = 0xfeebbaca; + fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); + fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; + fhp->fh_handle.ofh_xino = + ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); + fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); + if (inode) + _fh_update_old(dentry, exp, &fhp->fh_handle); + } else { + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; + fhp->fh_handle.fh_auth_type = 0; + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fhp->fh_handle.fh_fsid, + ex_dev, + d_inode(exp->ex_path.dentry)->i_ino, + exp->ex_fsid, exp->ex_uuid); + + if (inode) + _fh_update(fhp, exp, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { + fh_put(fhp); + return nfserr_opnotsupp; + } + } + + return 0; +} + +/* + * Update file handle information after changing a dentry. + * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create + */ +__be32 +fh_update(struct svc_fh *fhp) +{ + struct dentry *dentry; + + if (!fhp->fh_dentry) + goto out_bad; + + dentry = fhp->fh_dentry; + if (d_really_is_negative(dentry)) + goto out_negative; + if (fhp->fh_handle.fh_version != 1) { + _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); + } else { + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) + return 0; + + _fh_update(fhp, fhp->fh_export, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) + return nfserr_opnotsupp; + } + return 0; +out_bad: + printk(KERN_ERR "fh_update: fh not verified!\n"); + return nfserr_serverfault; +out_negative: + printk(KERN_ERR "fh_update: %pd2 still negative!\n", + dentry); + return nfserr_serverfault; +} + +/* + * Release a file handle. + */ +void +fh_put(struct svc_fh *fhp) +{ + struct dentry * dentry = fhp->fh_dentry; + struct svc_export * exp = fhp->fh_export; + if (dentry) { + fh_unlock(fhp); + fhp->fh_dentry = NULL; + dput(dentry); + fh_clear_wcc(fhp); + } + fh_drop_write(fhp); + if (exp) { + exp_put(exp); + fhp->fh_export = NULL; + } + return; +} + +/* + * Shorthand for dprintk()'s + */ +char * SVCFH_fmt(struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + + static char buf[80]; + sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x", + fh->fh_size, + fh->fh_base.fh_pad[0], + fh->fh_base.fh_pad[1], + fh->fh_base.fh_pad[2], + fh->fh_base.fh_pad[3], + fh->fh_base.fh_pad[4], + fh->fh_base.fh_pad[5]); + return buf; +} + +enum fsid_source fsid_source(struct svc_fh *fhp) +{ + if (fhp->fh_handle.fh_version != 1) + return FSIDSOURCE_DEV; + switch(fhp->fh_handle.fh_fsid_type) { + case FSID_DEV: + case FSID_ENCODE_DEV: + case FSID_MAJOR_MINOR: + if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV) + return FSIDSOURCE_DEV; + break; + case FSID_NUM: + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + break; + default: + break; + } + /* either a UUID type filehandle, or the filehandle doesn't + * match the export. + */ + if (fhp->fh_export->ex_flags & NFSEXP_FSID) + return FSIDSOURCE_FSID; + if (fhp->fh_export->ex_uuid) + return FSIDSOURCE_UUID; + return FSIDSOURCE_DEV; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 000000000..56cfbc361 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include <linux/crc32.h> +#include <linux/sunrpc/svc.h> +#include <uapi/linux/nfsd/nfsfh.h> +#include <linux/iversion.h> + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} + +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} + +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. + */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + int fh_maxsize; /* max size for fh_handle */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + + bool fh_locked; /* inode locked by us */ + bool fh_want_write; /* remount protection taken */ + int fh_flags; /* FH flags */ +#ifdef CONFIG_NFSD_V3 + bool fh_post_saved; /* post-op attrs saved */ + bool fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec64 fh_pre_mtime; /* mtime before oper */ + struct timespec64 fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. + */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ + +} svc_fh; +#define NFSD4_FH_FOREIGN (1<<0) +#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) +#define HAS_FH_FLAG(c, f) ((c)->fh_flags & (f)) + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* + * This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so moste of the function disappears. + * + * In some cases the values are considered to be host endian and in + * others, net endian. fsidv is always considered to be u32 as the + * callers don't know which it will be. So we must use __force to keep + * sparse from complaining. Since these values are opaque to the + * client, that shouldn't be a problem. + */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = (__force __u32)htonl(MAJOR(dev)); + fsidv[1] = (__force __u32)htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_size != fh2->fh_size) + return false; + if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) + return false; + return true; +} + +static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_fsid_type != fh2->fh_fsid_type) + return false; + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + return false; + return true; +} + +#ifdef CONFIG_CRC32 +/** + * knfsd_fh_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ + +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); +} +#else +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return 0; +} +#endif + +#ifdef CONFIG_NFSD_V3 +/* + * The wcc data stored in current_fh should be cleared + * between compound ops. + */ +static inline void +fh_clear_wcc(struct svc_fh *fhp) +{ + fhp->fh_post_saved = false; + fhp->fh_pre_saved = false; +} + +/* + * We could use i_version alone as the change attribute. However, + * i_version can go backwards after a reboot. On its own that doesn't + * necessarily cause a problem, but if i_version goes backwards and then + * is incremented again it could reuse a value that was previously used + * before boot, and a client who queried the two values might + * incorrectly assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as + * long as time doesn't go backwards we never reuse an old value. + */ +static inline u64 nfsd4_change_attribute(struct kstat *stat, + struct inode *inode) +{ + u64 chattr; + + chattr = stat->ctime.tv_sec; + chattr <<= 30; + chattr += stat->ctime.tv_nsec; + chattr += inode_query_iversion(inode); + return chattr; +} + +extern void fill_pre_wcc(struct svc_fh *fhp); +extern void fill_post_wcc(struct svc_fh *fhp); +#else +#define fh_clear_wcc(ignored) +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. + */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %pd2 already locked!\n", + dentry); + return; + } + + inode = d_inode(dentry); + inode_lock_nested(inode, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = true; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + if (fhp->fh_locked) { + fill_post_wcc(fhp); + inode_unlock(d_inode(fhp->fh_dentry)); + fhp->fh_locked = false; + } +} + +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c new file mode 100644 index 000000000..bbd01e839 --- /dev/null +++ b/fs/nfsd/nfsproc.c @@ -0,0 +1,856 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Process version 2 NFS requests. + * + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/namei.h> + +#include "cache.h" +#include "xdr.h" +#include "vfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static __be32 +nfsd_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Get a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_getattr(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * Set a file's attributes + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_setattr(struct svc_rqst *rqstp) +{ + struct nfsd_sattrargs *argp = rqstp->rq_argp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + struct iattr *iap = &argp->attrs; + struct svc_fh *fhp; + + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", + SVCFH_fmt(&argp->fh), + argp->attrs.ia_valid, (long) argp->attrs.ia_size); + + fhp = fh_copy(&resp->fh, &argp->fh); + + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. + * So if it looks like it might be "set both to the same time which + * is close to now", and if setattr_prepare fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call setattr_prepare as the last test as technically + * it is not an interface that we should be using. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. + */ + time64_t delta = iap->ia_atime.tv_sec - ktime_get_real_seconds(); + + resp->status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + setattr_prepare(fhp->fh_dentry, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* Obsolete, replaced by MNTPROC_MNT. */ +static __be32 +nfsd_proc_root(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Look up a path name component + * Note: the dentry in the resp->fh may be negative if the file + * doesn't exist yet. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_lookup(struct svc_rqst *rqstp) +{ + struct nfsd_diropargs *argp = rqstp->rq_argp; + struct nfsd_diropres *resp = rqstp->rq_resp; + + dprintk("nfsd: LOOKUP %s %.*s\n", + SVCFH_fmt(&argp->fh), argp->len, argp->name); + + fh_init(&resp->fh, NFS_FHSIZE); + resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); + fh_put(&argp->fh); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * Read a symlink. + */ +static __be32 +nfsd_proc_readlink(struct svc_rqst *rqstp) +{ + struct nfsd_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_readlinkres *resp = rqstp->rq_resp; + + dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); + + /* Read the symlink. */ + resp->len = NFS_MAXPATHLEN; + resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Read a portion of a file. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_read(struct svc_rqst *rqstp) +{ + struct nfsd_readargs *argp = rqstp->rq_argp; + struct nfsd_readres *resp = rqstp->rq_resp; + u32 eof; + + dprintk("nfsd: READ %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->offset); + + /* Obtain buffer pointer for payload. 19 is 1 word for + * status, 17 words for fattr, and 1 word for the byte count. + */ + + if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { + char buf[RPC_MAX_ADDRBUFLEN]; + printk(KERN_NOTICE + "oversized read request from %s (%d bytes)\n", + svc_print_addr(rqstp, buf, sizeof(buf)), + argp->count); + argp->count = NFSSVC_MAXBLKSIZE_V2; + } + svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); + + resp->count = argp->count; + resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, + rqstp->rq_vec, argp->vlen, + &resp->count, + &eof); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + return rpc_drop_reply; + return rpc_success; +} + +/* Reserved */ +static __be32 +nfsd_proc_writecache(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +/* + * Write data to a file + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_write(struct svc_rqst *rqstp) +{ + struct nfsd_writeargs *argp = rqstp->rq_argp; + struct nfsd_attrstat *resp = rqstp->rq_resp; + unsigned long cnt = argp->len; + unsigned int nvecs; + + dprintk("nfsd: WRITE %s %u bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->len, argp->offset); + + nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, + &argp->first, cnt); + if (!nvecs) { + resp->status = nfserr_io; + goto out; + } + + resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, rqstp->rq_vec, nvecs, + &cnt, NFS_DATA_SYNC, NULL); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + return rpc_drop_reply; +out: + return rpc_success; +} + +/* + * CREATE processing is complicated. The keyword here is `overloaded.' + * The parent directory is kept locked between the check for existence + * and the actual create() call in compliance with VFS protocols. + * N.B. After this call _both_ argp->fh and resp->fh need an fh_put + */ +static __be32 +nfsd_proc_create(struct svc_rqst *rqstp) +{ + struct nfsd_createargs *argp = rqstp->rq_argp; + struct nfsd_diropres *resp = rqstp->rq_resp; + svc_fh *dirfhp = &argp->fh; + svc_fh *newfhp = &resp->fh; + struct iattr *attr = &argp->attrs; + struct inode *inode; + struct dentry *dchild; + int type, mode; + int hosterr; + dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); + + dprintk("nfsd: CREATE %s %.*s\n", + SVCFH_fmt(dirfhp), argp->len, argp->name); + + /* First verify the parent file handle */ + resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (resp->status != nfs_ok) + goto done; /* must fh_put dirfhp even on error */ + + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ + + resp->status = nfserr_exist; + if (isdotent(argp->name, argp->len)) + goto done; + hosterr = fh_want_write(dirfhp); + if (hosterr) { + resp->status = nfserrno(hosterr); + goto done; + } + + fh_lock_nested(dirfhp, I_MUTEX_PARENT); + dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + if (IS_ERR(dchild)) { + resp->status = nfserrno(PTR_ERR(dchild)); + goto out_unlock; + } + fh_init(newfhp, NFS_FHSIZE); + resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!resp->status && d_really_is_negative(dchild)) + resp->status = nfserr_noent; + dput(dchild); + if (resp->status) { + if (resp->status != nfserr_noent) + goto out_unlock; + /* + * If the new file handle wasn't verified, we can't tell + * whether the file exists or not. Time to bail ... + */ + resp->status = nfserr_acces; + if (!newfhp->fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_create: file handle not verified\n"); + goto out_unlock; + } + } + + inode = d_inode(newfhp->fh_dentry); + + /* Unfudge the mode bits */ + if (attr->ia_valid & ATTR_MODE) { + type = attr->ia_mode & S_IFMT; + mode = attr->ia_mode & ~S_IFMT; + if (!type) { + /* no type, so if target exists, assume same as that, + * else assume a file */ + if (inode) { + type = inode->i_mode & S_IFMT; + switch(type) { + case S_IFCHR: + case S_IFBLK: + /* reserve rdev for later checking */ + rdev = inode->i_rdev; + attr->ia_valid |= ATTR_SIZE; + + fallthrough; + case S_IFIFO: + /* this is probably a permission check.. + * at least IRIX implements perm checking on + * echo thing > device-special-file-or-pipe + * by doing a CREATE with type==0 + */ + resp->status = nfsd_permission(rqstp, + newfhp->fh_export, + newfhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); + if (resp->status && resp->status != nfserr_rofs) + goto out_unlock; + } + } else + type = S_IFREG; + } + } else if (inode) { + type = inode->i_mode & S_IFMT; + mode = inode->i_mode & ~S_IFMT; + } else { + type = S_IFREG; + mode = 0; /* ??? */ + } + + attr->ia_valid |= ATTR_MODE; + attr->ia_mode = mode; + + /* Special treatment for non-regular files according to the + * gospel of sun micro + */ + if (type != S_IFREG) { + if (type != S_IFBLK && type != S_IFCHR) { + rdev = 0; + } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { + /* If you think you've seen the worst, grok this. */ + type = S_IFIFO; + } else { + /* Okay, char or block special */ + if (!rdev) + rdev = wanted; + } + + /* we've used the SIZE information, so discard it */ + attr->ia_valid &= ~ATTR_SIZE; + + /* Make sure the type and device matches */ + resp->status = nfserr_exist; + if (inode && inode_wrong_type(inode, type)) + goto out_unlock; + } + + resp->status = nfs_ok; + if (!inode) { + /* File doesn't exist. Create it and set attrs */ + resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name, + argp->len, attr, type, rdev, + newfhp); + } else if (type == S_IFREG) { + dprintk("nfsd: existing %s, valid=%x, size=%ld\n", + argp->name, attr->ia_valid, (long) attr->ia_size); + /* File already exists. We ignore all attributes except + * size, so that creat() behaves exactly like + * open(..., O_CREAT|O_TRUNC|O_WRONLY). + */ + attr->ia_valid &= ATTR_SIZE; + if (attr->ia_valid) + resp->status = nfsd_setattr(rqstp, newfhp, attr, 0, + (time64_t)0); + } + +out_unlock: + /* We don't really need to unlock, as fh_put does it. */ + fh_unlock(dirfhp); + fh_drop_write(dirfhp); +done: + fh_put(dirfhp); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +static __be32 +nfsd_proc_remove(struct svc_rqst *rqstp) +{ + struct nfsd_diropargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + + dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), + argp->len, argp->name); + + /* Unlink. -SIFDIR means file must not be a directory */ + resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, + argp->name, argp->len); + fh_put(&argp->fh); + return rpc_success; +} + +static __be32 +nfsd_proc_rename(struct svc_rqst *rqstp) +{ + struct nfsd_renameargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + + dprintk("nfsd: RENAME %s %.*s -> \n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); + dprintk("nfsd: -> %s %.*s\n", + SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); + + resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return rpc_success; +} + +static __be32 +nfsd_proc_link(struct svc_rqst *rqstp) +{ + struct nfsd_linkargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + + dprintk("nfsd: LINK %s ->\n", + SVCFH_fmt(&argp->ffh)); + dprintk("nfsd: %s %.*s\n", + SVCFH_fmt(&argp->tfh), + argp->tlen, + argp->tname); + + resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); + fh_put(&argp->ffh); + fh_put(&argp->tfh); + return rpc_success; +} + +static __be32 +nfsd_proc_symlink(struct svc_rqst *rqstp) +{ + struct nfsd_symlinkargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + struct svc_fh newfh; + + if (argp->tlen > NFS_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + page_address(rqstp->rq_arg.pages[0]), + argp->tlen); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } + + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", + SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, + argp->tlen, argp->tname); + + fh_init(&newfh, NFS_FHSIZE); + resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, &newfh); + + kfree(argp->tname); + fh_put(&argp->ffh); + fh_put(&newfh); +out: + return rpc_success; +} + +/* + * Make directory. This operation is not idempotent. + * N.B. After this call resp->fh needs an fh_put + */ +static __be32 +nfsd_proc_mkdir(struct svc_rqst *rqstp) +{ + struct nfsd_createargs *argp = rqstp->rq_argp; + struct nfsd_diropres *resp = rqstp->rq_resp; + + dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + if (resp->fh.fh_dentry) { + printk(KERN_WARNING + "nfsd_proc_mkdir: response already verified??\n"); + } + + argp->attrs.ia_valid &= ~ATTR_SIZE; + fh_init(&resp->fh, NFS_FHSIZE); + resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &argp->attrs, S_IFDIR, 0, &resp->fh); + fh_put(&argp->fh); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; +} + +/* + * Remove a directory + */ +static __be32 +nfsd_proc_rmdir(struct svc_rqst *rqstp) +{ + struct nfsd_diropargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + + dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + + resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, + argp->name, argp->len); + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Read a portion of a directory. + */ +static __be32 +nfsd_proc_readdir(struct svc_rqst *rqstp) +{ + struct nfsd_readdirargs *argp = rqstp->rq_argp; + struct nfsd_readdirres *resp = rqstp->rq_resp; + int count; + loff_t offset; + + dprintk("nfsd: READDIR %s %d bytes at %d\n", + SVCFH_fmt(&argp->fh), + argp->count, argp->cookie); + + /* Shrink to the client read size */ + count = (argp->count >> 2) - 2; + + /* Make sure we've room for the NULL ptr & eof flag */ + count -= 2; + if (count < 0) + count = 0; + + resp->buffer = argp->buffer; + resp->offset = NULL; + resp->buflen = count; + resp->common.err = nfs_ok; + /* Read directory and encode entries on the fly */ + offset = argp->cookie; + resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + + resp->count = resp->buffer - argp->buffer; + if (resp->offset) + *resp->offset = htonl(offset); + + fh_put(&argp->fh); + return rpc_success; +} + +/* + * Get file system info + */ +static __be32 +nfsd_proc_statfs(struct svc_rqst *rqstp) +{ + struct nfsd_fhandle *argp = rqstp->rq_argp; + struct nfsd_statfsres *resp = rqstp->rq_resp; + + dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); + + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); + fh_put(&argp->fh); + return rpc_success; +} + +/* + * NFSv2 Server procedures. + * Only the results of non-idempotent operations are cached. + */ +struct nfsd_void { int dummy; }; + +#define ST 1 /* status */ +#define FH 8 /* filehandle */ +#define AT 18 /* attributes */ + +static const struct svc_procedure nfsd_procedures2[18] = { + [NFSPROC_NULL] = { + .pc_func = nfsd_proc_null, + .pc_decode = nfssvc_decode_void, + .pc_encode = nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 0, + }, + [NFSPROC_GETATTR] = { + .pc_func = nfsd_proc_getattr, + .pc_decode = nfssvc_decode_fhandle, + .pc_encode = nfssvc_encode_attrstat, + .pc_release = nfssvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_SETATTR] = { + .pc_func = nfsd_proc_setattr, + .pc_decode = nfssvc_decode_sattrargs, + .pc_encode = nfssvc_encode_attrstat, + .pc_release = nfssvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_ROOT] = { + .pc_func = nfsd_proc_root, + .pc_decode = nfssvc_decode_void, + .pc_encode = nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 0, + }, + [NFSPROC_LOOKUP] = { + .pc_func = nfsd_proc_lookup, + .pc_decode = nfssvc_decode_diropargs, + .pc_encode = nfssvc_encode_diropres, + .pc_release = nfssvc_release_diropres, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_READLINK] = { + .pc_func = nfsd_proc_readlink, + .pc_decode = nfssvc_decode_readlinkargs, + .pc_encode = nfssvc_encode_readlinkres, + .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_ressize = sizeof(struct nfsd_readlinkres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + }, + [NFSPROC_READ] = { + .pc_func = nfsd_proc_read, + .pc_decode = nfssvc_decode_readargs, + .pc_encode = nfssvc_encode_readres, + .pc_release = nfssvc_release_readres, + .pc_argsize = sizeof(struct nfsd_readargs), + .pc_ressize = sizeof(struct nfsd_readres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + }, + [NFSPROC_WRITECACHE] = { + .pc_func = nfsd_proc_writecache, + .pc_decode = nfssvc_decode_void, + .pc_encode = nfssvc_encode_void, + .pc_argsize = sizeof(struct nfsd_void), + .pc_ressize = sizeof(struct nfsd_void), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 0, + }, + [NFSPROC_WRITE] = { + .pc_func = nfsd_proc_write, + .pc_decode = nfssvc_decode_writeargs, + .pc_encode = nfssvc_encode_attrstat, + .pc_release = nfssvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+AT, + }, + [NFSPROC_CREATE] = { + .pc_func = nfsd_proc_create, + .pc_decode = nfssvc_decode_createargs, + .pc_encode = nfssvc_encode_diropres, + .pc_release = nfssvc_release_diropres, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_REMOVE] = { + .pc_func = nfsd_proc_remove, + .pc_decode = nfssvc_decode_diropargs, + .pc_encode = nfssvc_encode_stat, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_stat), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_RENAME] = { + .pc_func = nfsd_proc_rename, + .pc_decode = nfssvc_decode_renameargs, + .pc_encode = nfssvc_encode_stat, + .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_ressize = sizeof(struct nfsd_stat), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_LINK] = { + .pc_func = nfsd_proc_link, + .pc_decode = nfssvc_decode_linkargs, + .pc_encode = nfssvc_encode_stat, + .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_ressize = sizeof(struct nfsd_stat), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_SYMLINK] = { + .pc_func = nfsd_proc_symlink, + .pc_decode = nfssvc_decode_symlinkargs, + .pc_encode = nfssvc_encode_stat, + .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_ressize = sizeof(struct nfsd_stat), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_MKDIR] = { + .pc_func = nfsd_proc_mkdir, + .pc_decode = nfssvc_decode_createargs, + .pc_encode = nfssvc_encode_diropres, + .pc_release = nfssvc_release_diropres, + .pc_argsize = sizeof(struct nfsd_createargs), + .pc_ressize = sizeof(struct nfsd_diropres), + .pc_cachetype = RC_REPLBUFF, + .pc_xdrressize = ST+FH+AT, + }, + [NFSPROC_RMDIR] = { + .pc_func = nfsd_proc_rmdir, + .pc_decode = nfssvc_decode_diropargs, + .pc_encode = nfssvc_encode_stat, + .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_stat), + .pc_cachetype = RC_REPLSTAT, + .pc_xdrressize = ST, + }, + [NFSPROC_READDIR] = { + .pc_func = nfsd_proc_readdir, + .pc_decode = nfssvc_decode_readdirargs, + .pc_encode = nfssvc_encode_readdirres, + .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_ressize = sizeof(struct nfsd_readdirres), + .pc_cachetype = RC_NOCACHE, + }, + [NFSPROC_STATFS] = { + .pc_func = nfsd_proc_statfs, + .pc_decode = nfssvc_decode_fhandle, + .pc_encode = nfssvc_encode_statfsres, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_statfsres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+5, + }, +}; + + +static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]; +const struct svc_version nfsd_version2 = { + .vs_vers = 2, + .vs_nproc = 18, + .vs_proc = nfsd_procedures2, + .vs_count = nfsd_count2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS2_SVC_XDRSIZE, +}; + +/* + * Map errnos to NFS errnos. + */ +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_mlink, -EMLINK }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, +#ifdef EDQUOT + { nfserr_dquot, -EDQUOT }, +#endif + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} + diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c new file mode 100644 index 000000000..2e61a565c --- /dev/null +++ b/fs/nfsd/nfssvc.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Central processing for nfsd. + * + * Authors: Olaf Kirch (okir@monad.swb.de) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/sched/signal.h> +#include <linux/freezer.h> +#include <linux/module.h> +#include <linux/fs_struct.h> +#include <linux/swap.h> + +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/lockd/bind.h> +#include <linux/nfsacl.h> +#include <linux/seq_file.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/net_namespace.h> +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" +#include "netns.h" +#include "filecache.h" + +#define NFSDDBG_FACILITY NFSDDBG_SVC + +bool inter_copy_offload_enable; +EXPORT_SYMBOL_GPL(inter_copy_offload_enable); +module_param(inter_copy_offload_enable, bool, 0644); +MODULE_PARM_DESC(inter_copy_offload_enable, + "Enable inter server to server copy offload. Default: false"); + +extern struct svc_program nfsd_program; +static int nfsd(void *vrqstp); +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static int nfsd_acl_rpcbind_set(struct net *, + const struct svc_program *, + u32, int, + unsigned short, + unsigned short); +static __be32 nfsd_acl_init_request(struct svc_rqst *, + const struct svc_program *, + struct svc_process_info *); +#endif +static int nfsd_rpcbind_set(struct net *, + const struct svc_program *, + u32, int, + unsigned short, + unsigned short); +static __be32 nfsd_init_request(struct svc_rqst *, + const struct svc_program *, + struct svc_process_info *); + +/* + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions + */ +DEFINE_MUTEX(nfsd_mutex); + +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned long nfsd_drc_max_mem; +unsigned long nfsd_drc_mem_used; + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static const struct svc_version *nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_version, + .pg_name = "nfsacl", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, + .pg_init_request = nfsd_acl_init_request, + .pg_rpcbind_set = nfsd_acl_rpcbind_set, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + +static const struct svc_version *nfsd_version[] = { + [2] = &nfsd_version2, +#if defined(CONFIG_NFSD_V3) + [3] = &nfsd_version3, +#endif +#if defined(CONFIG_NFSD_V4) + [4] = &nfsd_version4, +#endif +}; + +#define NFSD_MINVERS 2 +#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) + +struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif + .pg_prog = NFS_PROGRAM, /* program number */ + .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ + .pg_vers = nfsd_version, /* version table */ + .pg_name = "nfsd", /* program name */ + .pg_class = "nfsd", /* authentication class */ + .pg_stats = &nfsd_svcstats, /* version table */ + .pg_authenticate = &svc_set_client, /* export authentication */ + .pg_init_request = nfsd_init_request, + .pg_rpcbind_set = nfsd_rpcbind_set, +}; + +static bool +nfsd_support_version(int vers) +{ + if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS) + return nfsd_version[vers] != NULL; + return false; +} + +static bool * +nfsd_alloc_versions(void) +{ + bool *vers = kmalloc_array(NFSD_NRVERS, sizeof(bool), GFP_KERNEL); + unsigned i; + + if (vers) { + /* All compiled versions are enabled by default */ + for (i = 0; i < NFSD_NRVERS; i++) + vers[i] = nfsd_support_version(i); + } + return vers; +} + +static bool * +nfsd_alloc_minorversions(void) +{ + bool *vers = kmalloc_array(NFSD_SUPPORTED_MINOR_VERSION + 1, + sizeof(bool), GFP_KERNEL); + unsigned i; + + if (vers) { + /* All minor versions are enabled by default */ + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) + vers[i] = nfsd_support_version(4); + } + return vers; +} + +void +nfsd_netns_free_versions(struct nfsd_net *nn) +{ + kfree(nn->nfsd_versions); + kfree(nn->nfsd4_minorversions); + nn->nfsd_versions = NULL; + nn->nfsd4_minorversions = NULL; +} + +static void +nfsd_netns_init_versions(struct nfsd_net *nn) +{ + if (!nn->nfsd_versions) { + nn->nfsd_versions = nfsd_alloc_versions(); + nn->nfsd4_minorversions = nfsd_alloc_minorversions(); + if (!nn->nfsd_versions || !nn->nfsd4_minorversions) + nfsd_netns_free_versions(nn); + } +} + +int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return 0; + switch(change) { + case NFSD_SET: + if (nn->nfsd_versions) + nn->nfsd_versions[vers] = nfsd_support_version(vers); + break; + case NFSD_CLEAR: + nfsd_netns_init_versions(nn); + if (nn->nfsd_versions) + nn->nfsd_versions[vers] = false; + break; + case NFSD_TEST: + if (nn->nfsd_versions) + return nn->nfsd_versions[vers]; + fallthrough; + case NFSD_AVAIL: + return nfsd_support_version(vers); + } + return 0; +} + +static void +nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) +{ + unsigned i; + + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { + if (nn->nfsd4_minorversions[i]) + return; + } + nfsd_vers(nn, 4, NFSD_CLEAR); +} + +int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && + change != NFSD_AVAIL) + return -1; + + switch(change) { + case NFSD_SET: + if (nn->nfsd4_minorversions) { + nfsd_vers(nn, 4, NFSD_SET); + nn->nfsd4_minorversions[minorversion] = + nfsd_vers(nn, 4, NFSD_TEST); + } + break; + case NFSD_CLEAR: + nfsd_netns_init_versions(nn); + if (nn->nfsd4_minorversions) { + nn->nfsd4_minorversions[minorversion] = false; + nfsd_adjust_nfsd_versions4(nn); + } + break; + case NFSD_TEST: + if (nn->nfsd4_minorversions) + return nn->nfsd4_minorversions[minorversion]; + return nfsd_vers(nn, 4, NFSD_TEST); + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && + nfsd_vers(nn, 4, NFSD_AVAIL); + } + return 0; +} + +/* + * Maximum number of nfsd processes + */ +#define NFSD_MAXSERVS 8192 + +int nfsd_nrthreads(struct net *net) +{ + int rv = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) + rv = nn->nfsd_serv->sv_nrthreads; + mutex_unlock(&nfsd_mutex); + return rv; +} + +static int nfsd_init_socks(struct net *net, const struct cred *cred) +{ + int error; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS, cred); + if (error < 0) + return error; + + error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, + SVC_SOCK_DEFAULTS, cred); + if (error < 0) + return error; + + return 0; +} + +static int nfsd_users = 0; + +static int nfsd_startup_generic(int nrservs) +{ + int ret; + + if (nfsd_users++) + return 0; + + ret = nfsd_file_cache_init(); + if (ret) + goto dec_users; + + ret = nfs4_state_start(); + if (ret) + goto out_file_cache; + return 0; + +out_file_cache: + nfsd_file_cache_shutdown(); +dec_users: + nfsd_users--; + return ret; +} + +static void nfsd_shutdown_generic(void) +{ + if (--nfsd_users) + return; + + nfs4_state_shutdown(); + nfsd_file_cache_shutdown(); +} + +static bool nfsd_needs_lockd(struct nfsd_net *nn) +{ + return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); +} + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + int seq = 0; + + do { + read_seqbegin_or_lock(&nn->boot_lock, &seq); + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; + } while (need_seqretry(&nn->boot_lock, seq)); + done_seqretry(&nn->boot_lock, seq); +} + +static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +{ + ktime_get_real_ts64(&nn->nfssvc_boot); +} + +void nfsd_reset_boot_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->boot_lock); + nfsd_reset_boot_verifier_locked(nn); + write_sequnlock(&nn->boot_lock); +} + +static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int ret; + + if (nn->nfsd_net_up) + return 0; + + ret = nfsd_startup_generic(nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(net, cred); + if (ret) + goto out_socks; + + if (nfsd_needs_lockd(nn) && !nn->lockd_up) { + ret = lockd_up(net, cred); + if (ret) + goto out_socks; + nn->lockd_up = true; + } + + ret = nfsd_file_cache_start_net(net); + if (ret) + goto out_lockd; + ret = nfs4_state_start_net(net); + if (ret) + goto out_filecache; + + nn->nfsd_net_up = true; + return 0; + +out_filecache: + nfsd_file_cache_shutdown_net(net); +out_lockd: + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = false; + } +out_socks: + nfsd_shutdown_generic(); + return ret; +} + +static void nfsd_shutdown_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_state_shutdown_net(net); + nfsd_file_cache_shutdown_net(net); + if (nn->lockd_up) { + lockd_down(net); + nn->lockd_up = false; + } + nn->nfsd_net_up = false; + nfsd_shutdown_generic(); +} + +static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in sin; + + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); + } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inetaddr_notifier = { + .notifier_call = nfsd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int nfsd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct net_device *dev = ifa->idev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in6 sin6; + + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); + } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inet6addr_notifier = { + .notifier_call = nfsd_inet6addr_event, +}; +#endif + +/* Only used under nfsd_mutex, so this atomic may be overkill: */ +static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); + +static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + atomic_dec(&nn->ntf_refcnt); + /* check if the notifier still has clients */ + if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif + } + wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); + + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done except the rpcb information. + */ + svc_rpcb_cleanup(serv, net); + if (!nn->nfsd_net_up) + return; + + nfsd_shutdown_net(net); + pr_info("nfsd: last server has exited, flushing export cache\n"); + nfsd_export_flush(net); +} + +void nfsd_reset_versions(struct nfsd_net *nn) +{ + int i; + + for (i = 0; i < NFSD_NRVERS; i++) + if (nfsd_vers(nn, i, NFSD_TEST)) + return; + + for (i = 0; i < NFSD_NRVERS; i++) + if (i != 4) + nfsd_vers(nn, i, NFSD_SET); + else { + int minor = 0; + while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) + minor++; + } +} + +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ +static void set_max_drc(void) +{ + #define NFSD_DRC_SIZE_SHIFT 7 + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); +} + +static int nfsd_get_default_max_blksize(void) +{ + struct sysinfo i; + unsigned long long target; + unsigned long ret; + + si_meminfo(&i); + target = (i.totalram - i.totalhigh) << PAGE_SHIFT; + /* + * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig + * machines, but only uses 32K on 128M machines. Bottom out at + * 8K on 32M and smaller. Of course, this is only a default. + */ + target >>= 12; + + ret = NFSSVC_MAXBLKSIZE; + while (ret > target && ret >= 8*1024*2) + ret /= 2; + return ret; +} + +static const struct svc_serv_ops nfsd_thread_sv_ops = { + .svo_shutdown = nfsd_last_thread, + .svo_function = nfsd, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + +static void nfsd_complete_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + nn->nfsd_serv = NULL; + complete(&nn->nfsd_shutdown_complete); +} + +void nfsd_shutdown_threads(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; + + mutex_lock(&nfsd_mutex); + serv = nn->nfsd_serv; + if (serv == NULL) { + mutex_unlock(&nfsd_mutex); + return; + } + + svc_get(serv); + /* Kill outstanding nfsd threads */ + serv->sv_ops->svo_setup(serv, NULL, 0); + nfsd_destroy(net); + mutex_unlock(&nfsd_mutex); + /* Wait for shutdown of nfsd_serv to complete */ + wait_for_completion(&nn->nfsd_shutdown_complete); +} + +bool i_am_nfsd(void) +{ + return kthread_func(current) == nfsd; +} + +int nfsd_create_serv(struct net *net) +{ + int error; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nn->nfsd_serv) { + svc_get(nn->nfsd_serv); + return 0; + } + if (nfsd_max_blksize == 0) + nfsd_max_blksize = nfsd_get_default_max_blksize(); + nfsd_reset_versions(nn); + nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + &nfsd_thread_sv_ops); + if (nn->nfsd_serv == NULL) + return -ENOMEM; + init_completion(&nn->nfsd_shutdown_complete); + + nn->nfsd_serv->sv_maxconn = nn->max_connections; + error = svc_bind(nn->nfsd_serv, net); + if (error < 0) { + svc_destroy(nn->nfsd_serv); + nfsd_complete_shutdown(net); + return error; + } + + set_max_drc(); + /* check if the notifier is already set */ + if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { + register_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif + } + atomic_inc(&nn->ntf_refcnt); + nfsd_reset_boot_verifier(nn); + return 0; +} + +int nfsd_nrpools(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv == NULL) + return 0; + else + return nn->nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) +{ + int i = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (nn->nfsd_serv != NULL) { + for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +void nfsd_destroy(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int destroy = (nn->nfsd_serv->sv_nrthreads == 1); + + if (destroy) + svc_shutdown_net(nn->nfsd_serv, net); + svc_destroy(nn->nfsd_serv); + if (destroy) + nfsd_complete_shutdown(net); +} + +int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) +{ + int i = 0; + int tot = 0; + int err = 0; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + if (nn->nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nn->nfsd_serv->sv_nrpools) + n = nn->nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + svc_get(nn->nfsd_serv); + for (i = 0; i < n; i++) { + err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], nthreads[i]); + if (err) + break; + } + nfsd_destroy(net); + return err; +} + +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ +int +nfsd_svc(int nrservs, struct net *net, const struct cred *cred) +{ + int error; + bool nfsd_up_before; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + dprintk("nfsd: creating service\n"); + + nrservs = max(nrservs, 0); + nrservs = min(nrservs, NFSD_MAXSERVS); + error = 0; + + if (nrservs == 0 && nn->nfsd_serv == NULL) + goto out; + + strlcpy(nn->nfsd_name, utsname()->nodename, + sizeof(nn->nfsd_name)); + + error = nfsd_create_serv(net); + if (error) + goto out; + + nfsd_up_before = nn->nfsd_net_up; + + error = nfsd_startup_net(nrservs, net, cred); + if (error) + goto out_destroy; + error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, + NULL, nrservs); + if (error) + goto out_shutdown; + /* We are holding a reference to nn->nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nn->nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown_net(net); +out_destroy: + nfsd_destroy(net); /* Release server */ +out: + mutex_unlock(&nfsd_mutex); + return error; +} + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static bool +nfsd_support_acl_version(int vers) +{ + if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) + return nfsd_acl_version[vers] != NULL; + return false; +} + +static int +nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, + u32 version, int family, unsigned short proto, + unsigned short port) +{ + if (!nfsd_support_acl_version(version) || + !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) + return 0; + return svc_generic_rpcbind_set(net, progp, version, family, + proto, port); +} + +static __be32 +nfsd_acl_init_request(struct svc_rqst *rqstp, + const struct svc_program *progp, + struct svc_process_info *ret) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + int i; + + if (likely(nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) + return svc_generic_init_request(rqstp, progp, ret); + + ret->mismatch.lovers = NFSD_ACL_NRVERS; + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { + if (nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.lovers = i; + break; + } + } + if (ret->mismatch.lovers == NFSD_ACL_NRVERS) + return rpc_prog_unavail; + ret->mismatch.hivers = NFSD_ACL_MINVERS; + for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { + if (nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.hivers = i; + break; + } + } + return rpc_prog_mismatch; +} +#endif + +static int +nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, + u32 version, int family, unsigned short proto, + unsigned short port) +{ + if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) + return 0; + return svc_generic_rpcbind_set(net, progp, version, family, + proto, port); +} + +static __be32 +nfsd_init_request(struct svc_rqst *rqstp, + const struct svc_program *progp, + struct svc_process_info *ret) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + int i; + + if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) + return svc_generic_init_request(rqstp, progp, ret); + + ret->mismatch.lovers = NFSD_NRVERS; + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.lovers = i; + break; + } + } + if (ret->mismatch.lovers == NFSD_NRVERS) + return rpc_prog_unavail; + ret->mismatch.hivers = NFSD_MINVERS; + for (i = NFSD_NRVERS - 1; i >= NFSD_MINVERS; i--) { + if (nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.hivers = i; + break; + } + } + return rpc_prog_mismatch; +} + +/* + * This is the NFS server kernel thread + */ +static int +nfsd(void *vrqstp) +{ + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; + struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); + struct net *net = perm_sock->xpt_net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int err; + + /* Lock module and set up kernel thread */ + mutex_lock(&nfsd_mutex); + + /* At this point, the thread shares current->fs + * with the init process. We need to create files with the + * umask as defined by the client instead of init's umask. */ + if (unshare_fs_struct() < 0) { + printk("Unable to start nfsd thread: out of memory\n"); + goto out; + } + + current->fs->umask = 0; + + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that will bring down the thread + */ + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); + + nfsdstats.th_cnt++; + mutex_unlock(&nfsd_mutex); + + set_freezable(); + + /* + * The main request loop + */ + for (;;) { + /* Update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nn->max_connections; + + /* + * Find a socket with data available and call its + * recvfrom routine. + */ + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) + ; + if (err == -EINTR) + break; + validate_process_creds(); + svc_process(rqstp); + validate_process_creds(); + } + + /* Clear signals before calling svc_exit_thread() */ + flush_signals(current); + + mutex_lock(&nfsd_mutex); + nfsdstats.th_cnt --; + +out: + rqstp->rq_server = NULL; + + /* Release the thread */ + svc_exit_thread(rqstp); + + nfsd_destroy(net); + + /* Release module */ + mutex_unlock(&nfsd_mutex); + module_put_and_exit(0); + return 0; +} + +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be a sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + const struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + +/** + * nfsd_dispatch - Process an NFS or NFSACL Request + * @rqstp: incoming request + * @statp: pointer to location of accept_stat field in RPC Reply buffer + * + * This RPC dispatcher integrates the NFS server's duplicate reply cache. + * + * Return values: + * %0: Processing complete; do not send a Reply + * %1: Processing complete; send Reply in rqstp->rq_res + */ +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + const struct svc_procedure *proc = rqstp->rq_procinfo; + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + __be32 *p; + + dprintk("nfsd_dispatch: vers %d proc %d\n", + rqstp->rq_vers, rqstp->rq_proc); + + if (nfs_request_too_big(rqstp, proc)) + goto out_too_large; + + /* + * Give the xdr decoder a chance to change this if it wants + * (necessary in the NFSv4.0 compound case) + */ + rqstp->rq_cachetype = proc->pc_cachetype; + if (!proc->pc_decode(rqstp, argv->iov_base)) + goto out_decode_err; + + switch (nfsd_cache_lookup(rqstp)) { + case RC_DOIT: + break; + case RC_REPLY: + goto out_cached_reply; + case RC_DROPIT: + goto out_dropit; + } + + /* + * Need to grab the location to store the status, as + * NFSv4 does some encoding while processing + */ + p = resv->iov_base + resv->iov_len; + resv->iov_len += sizeof(__be32); + + *statp = proc->pc_func(rqstp); + if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) + goto out_update_drop; + + if (!proc->pc_encode(rqstp, p)) + goto out_encode_err; + + nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); +out_cached_reply: + return 1; + +out_too_large: + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + +out_decode_err: + dprintk("nfsd: failed to decode arguments!\n"); + *statp = rpc_garbage_args; + return 1; + +out_update_drop: + dprintk("nfsd: Dropping request; may be revisited later\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); +out_dropit: + return 0; + +out_encode_err: + dprintk("nfsd: failed to encode result!\n"); + nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + *statp = rpc_system_err; + return 1; +} + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + int ret; + struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv == NULL) { + mutex_unlock(&nfsd_mutex); + return -ENODEV; + } + /* bump up the psudo refcount while traversing */ + svc_get(nn->nfsd_serv); + ret = svc_pool_stats_open(nn->nfsd_serv, file); + mutex_unlock(&nfsd_mutex); + return ret; +} + +int nfsd_pool_stats_release(struct inode *inode, struct file *file) +{ + int ret = seq_release(inode, file); + struct net *net = inode->i_sb->s_fs_info; + + mutex_lock(&nfsd_mutex); + /* this function really, really should have been called svc_put() */ + nfsd_destroy(net); + mutex_unlock(&nfsd_mutex); + return ret; +} diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c new file mode 100644 index 000000000..8a288c8fc --- /dev/null +++ b/fs/nfsd/nfsxdr.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XDR support for nfsd + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#include "vfs.h" +#include "xdr.h" +#include "auth.h" + +#define NFSDDBG_FACILITY NFSDDBG_XDR + +/* + * Mapping of S_IF* types to NFS file types + */ +static u32 nfs_ftypes[] = { + NFNON, NFCHR, NFCHR, NFBAD, + NFDIR, NFBAD, NFBLK, NFBAD, + NFREG, NFBAD, NFLNK, NFBAD, + NFSOCK, NFBAD, NFLNK, NFBAD, +}; + + +/* + * XDR functions for basic NFS types + */ +static __be32 * +decode_fh(__be32 *p, struct svc_fh *fhp) +{ + fh_init(fhp, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + fhp->fh_handle.fh_size = NFS_FHSIZE; + + /* FIXME: Look up export pointer here and verify + * Sun Secure RPC if requested */ + return p + (NFS_FHSIZE >> 2); +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + +static __be32 * +encode_fh(__be32 *p, struct svc_fh *fhp) +{ + memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); + return p + (NFS_FHSIZE>> 2); +} + +/* + * Decode a file name and make sure that the path contains + * no slashes or null bytes. + */ +static __be32 * +decode_filename(__be32 *p, char **namp, unsigned int *lenp) +{ + char *name; + unsigned int i; + + if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { + for (i = 0, name = *namp; i < *lenp; i++, name++) { + if (*name == '\0' || *name == '/') + return NULL; + } + } + + return p; +} + +static __be32 * +decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) +{ + u32 tmp, tmp1; + + iap->ia_valid = 0; + + /* Sun client bug compatibility check: some sun clients seem to + * put 0xffff in the mode field when they mean 0xffffffff. + * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. + */ + if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_uid = make_kuid(userns, tmp); + if (uid_valid(iap->ia_uid)) + iap->ia_valid |= ATTR_UID; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_gid = make_kgid(userns, tmp); + if (gid_valid(iap->ia_gid)) + iap->ia_valid |= ATTR_GID; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + iap->ia_atime.tv_sec = tmp; + iap->ia_atime.tv_nsec = tmp1 * 1000; + } + tmp = ntohl(*p++); tmp1 = ntohl(*p++); + if (tmp != (u32)-1 && tmp1 != (u32)-1) { + iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + iap->ia_mtime.tv_sec = tmp; + iap->ia_mtime.tv_nsec = tmp1 * 1000; + /* + * Passing the invalid value useconds=1000000 for mtime + * is a Sun convention for "set both mtime and atime to + * current server time". It's needed to make permissions + * checks for the "touch" program across v2 mounts to + * Solaris and Irix boxes work correctly. See description of + * sattr in section 6.1 of "NFS Illustrated" by + * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 + */ + if (tmp1 == 1000000) + iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); + } + return p; +} + +static __be32 * +encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, + struct kstat *stat) +{ + struct user_namespace *userns = nfsd_user_namespace(rqstp); + struct dentry *dentry = fhp->fh_dentry; + int type; + struct timespec64 time; + u32 f; + + type = (stat->mode & S_IFMT); + + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); + *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); + } else { + *p++ = htonl((u32) stat->size); + } + *p++ = htonl((u32) stat->blksize); + if (S_ISCHR(type) || S_ISBLK(type)) + *p++ = htonl(new_encode_dev(stat->rdev)); + else + *p++ = htonl(0xffffffff); + *p++ = htonl((u32) stat->blocks); + switch (fsid_source(fhp)) { + default: + case FSIDSOURCE_DEV: + *p++ = htonl(new_encode_dev(stat->dev)); + break; + case FSIDSOURCE_FSID: + *p++ = htonl((u32) fhp->fh_export->ex_fsid); + break; + case FSIDSOURCE_UUID: + f = ((u32*)fhp->fh_export->ex_uuid)[0]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; + f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; + *p++ = htonl(f); + break; + } + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); + time = stat->mtime; + lease_get_mtime(d_inode(dentry), &time); + *p++ = htonl((u32) time.tv_sec); + *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0); + + return p; +} + +/* Helper function for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) +{ + return encode_fattr(rqstp, p, fhp, stat); +} + +/* + * XDR decode functions + */ +int +nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_fhandle *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_sattrargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_diropargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readargs *args = rqstp->rq_argp; + unsigned int len; + int v; + p = decode_fh(p, &args->fh); + if (!p) + return 0; + + args->offset = ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); + + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(p); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + args->vlen = v; + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_writeargs *args = rqstp->rq_argp; + unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + + p++; /* beginoffset */ + args->offset = ntohl(*p++); /* offset */ + p++; /* totalcount */ + len = args->len = ntohl(*p++); + /* + * The protocol specifies a maximum of 8192 bytes. + */ + if (len > NFSSVC_MAXBLKSIZE_V2) + return 0; + + /* + * Check to make sure that we got the right number of + * bytes. + */ + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; + + /* + * Round the length of the data which was specified up to + * the next multiple of XDR units and then compare that + * against the length which was actually received. + * Note that when RPCSEC/GSS (for example) is used, the + * data buffer can be padded so dlen might be larger + * than required. It must never be smaller. + */ + if (dlen < XDR_QUADLEN(len)*4) + return 0; + + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; + return 1; +} + +int +nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_createargs *args = rqstp->rq_argp; + + if ( !(p = decode_fh(p, &args->fh)) + || !(p = decode_filename(p, &args->name, &args->len))) + return 0; + p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_renameargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readlinkargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_linkargs *args = rqstp->rq_argp; + + if (!(p = decode_fh(p, &args->ffh)) + || !(p = decode_fh(p, &args->tfh)) + || !(p = decode_filename(p, &args->tname, &args->tlen))) + return 0; + + return xdr_argsize_check(rqstp, p); +} + +int +nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_symlinkargs *args = rqstp->rq_argp; + char *base = (char *)p; + size_t xdrlen; + + if ( !(p = decode_fh(p, &args->ffh)) + || !(p = decode_filename(p, &args->fname, &args->flen))) + return 0; + + args->tlen = ntohl(*p++); + if (args->tlen == 0) + return 0; + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + /* This request is never larger than a page. Therefore, + * transport will deliver either: + * 1. pathname in the pagelist -> sattr is in the tail. + * 2. everything in the head buffer -> sattr is in the head. + */ + if (rqstp->rq_arg.page_len) { + if (args->tlen != rqstp->rq_arg.page_len) + return 0; + p = rqstp->rq_arg.tail[0].iov_base; + } else { + xdrlen = XDR_QUADLEN(args->tlen); + if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) + return 0; + p += xdrlen; + } + decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); + + return 1; +} + +int +nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readdirargs *args = rqstp->rq_argp; + + p = decode_fh(p, &args->fh); + if (!p) + return 0; + args->cookie = ntohl(*p++); + args->count = ntohl(*p++); + args->count = min_t(u32, args->count, PAGE_SIZE); + args->buffer = page_address(*(rqstp->rq_next_page++)); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ +int +nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_stat *resp = rqstp->rq_resp; + + *p++ = resp->status; + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_attrstat *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_diropres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + goto out; + p = encode_fh(p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); +out: + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readlinkres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + + *p++ = htonl(resp->len); + xdr_ressize_check(rqstp, p); + rqstp->rq_res.page_len = resp->len; + if (resp->len & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + } + return 1; +} + +int +nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); + *p++ = htonl(resp->count); + xdr_ressize_check(rqstp, p); + + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad the tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + return 1; +} + +int +nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_readdirres *resp = rqstp->rq_resp; + + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + + xdr_ressize_check(rqstp, p); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl((resp->common.err == nfserr_eof)); + rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + + return 1; +} + +int +nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p) +{ + struct nfsd_statfsres *resp = rqstp->rq_resp; + struct kstatfs *stat = &resp->stats; + + *p++ = resp->status; + if (resp->status != nfs_ok) + return xdr_ressize_check(rqstp, p); + + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ + *p++ = htonl(stat->f_bsize); + *p++ = htonl(stat->f_blocks); + *p++ = htonl(stat->f_bfree); + *p++ = htonl(stat->f_bavail); + return xdr_ressize_check(rqstp, p); +} + +int +nfssvc_encode_entry(void *ccdv, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = ccdv; + struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); + __be32 *p = cd->buffer; + int buflen, slen; + + /* + dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", + namlen, name, offset, ino); + */ + + if (offset > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + if (cd->offset) + *cd->offset = htonl(offset); + + /* truncate filename */ + namlen = min(namlen, NFS2_MAXNAMLEN); + slen = XDR_QUADLEN(namlen); + + if ((buflen = cd->buflen - slen - 4) < 0) { + cd->common.err = nfserr_toosmall; + return -EINVAL; + } + if (ino > ~((u32) 0)) { + cd->common.err = nfserr_fbig; + return -EINVAL; + } + *p++ = xdr_one; /* mark entry present */ + *p++ = htonl((u32) ino); /* file id */ + p = xdr_encode_array(p, name, namlen);/* name length & name */ + cd->offset = p; /* remember pointer */ + *p++ = htonl(~0U); /* offset of next entry */ + + cd->buflen = buflen; + cd->buffer = p; + cd->common.err = nfs_ok; + return 0; +} + +/* + * XDR release functions + */ +void nfssvc_release_attrstat(struct svc_rqst *rqstp) +{ + struct nfsd_attrstat *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_diropres(struct svc_rqst *rqstp) +{ + struct nfsd_diropres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_readres(struct svc_rqst *rqstp) +{ + struct nfsd_readres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h new file mode 100644 index 000000000..4f4282d4e --- /dev/null +++ b/fs/nfsd/pnfs.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#ifdef CONFIG_NFSD_V4 +#include <linux/exportfs.h> +#include <linux/nfsd/export.h> + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + bool disable_recalls; + + __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct svc_rqst *rqstp, + struct nfs4_client *clp, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); + + void (*fence_client)(struct nfs4_layout_stateid *ls); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; +#ifdef CONFIG_NFSD_BLOCKLAYOUT +extern const struct nfsd4_layout_ops bl_layout_ops; +#endif +#ifdef CONFIG_NFSD_SCSILAYOUT +extern const struct nfsd4_layout_ops scsi_layout_ops; +#endif +#ifdef CONFIG_NFSD_FLEXFILELAYOUT +extern const struct nfsd4_layout_ops ff_layout_ops; +#endif + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); +#endif /* CONFIG_NFSD_V4 */ + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +struct nfs4_client; +struct nfs4_file; + +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 000000000..9eae11a9d --- /dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include <linux/idr.h> +#include <linux/refcount.h> +#include <linux/sunrpc/svc_xprt.h> +#include "nfsfh.h" +#include "nfsd.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + clientid_t so_clid; + u32 so_id; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; + +typedef struct { + stateid_t stid; +#define NFS4_COPY_STID 1 +#define NFS4_COPYNOTIFY_STID 2 + unsigned char sc_type; + refcount_t sc_count; +} copy_stateid_t; + +struct nfsd4_callback { + struct nfs4_client *cb_clp; + struct rpc_message cb_msg; + const struct nfsd4_callback_ops *cb_ops; + struct work_struct cb_work; + int cb_seq_status; + int cb_status; + bool cb_need_restart; + bool cb_holds_slot; +}; + +struct nfsd4_callback_ops { + void (*prepare)(struct nfsd4_callback *); + int (*done)(struct nfsd4_callback *, struct rpc_task *); + void (*release)(struct nfsd4_callback *); +}; + +/* + * A core object that represents a "common" stateid. These are generally + * embedded within the different (more specific) stateid objects and contain + * fields that are of general use to any stateid. + */ +struct nfs4_stid { + refcount_t sc_count; +#define NFS4_OPEN_STID 1 +#define NFS4_LOCK_STID 2 +#define NFS4_DELEG_STID 4 +/* For an open stateid kept around *only* to process close replays: */ +#define NFS4_CLOSED_STID 8 +/* For a deleg stateid kept around only to process free_stateid's: */ +#define NFS4_REVOKED_DELEG_STID 16 +#define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 + struct list_head sc_cp_list; + unsigned char sc_type; + stateid_t sc_stateid; + spinlock_t sc_lock; + struct nfs4_client *sc_client; + struct nfs4_file *sc_file; + void (*sc_free)(struct nfs4_stid *); +}; + +/* Keep a list of stateids issued by the COPY_NOTIFY, associate it with the + * parent OPEN/LOCK/DELEG stateid. + */ +struct nfs4_cpntf_state { + copy_stateid_t cp_stateid; + struct list_head cp_list; /* per parent nfs4_stid */ + stateid_t cp_p_stateid; /* copy of parent's stateid */ + clientid_t cp_p_clid; /* copy of parent's clid */ + time64_t cpntf_time; /* last time stateid used */ +}; + +/* + * Represents a delegation stateid. The nfs4_client holds references to these + * and they are put when it is being destroyed or when the delegation is + * returned by the client: + * + * o 1 reference as long as a delegation is still in force (taken when it's + * alloc'd, put when it's returned or revoked) + * + * o 1 reference as long as a recall rpc is in progress (taken when the lease + * is broken, put when the rpc exits) + * + * o 1 more ephemeral reference for each nfsd thread currently doing something + * with that delegation without holding the cl_lock + * + * If the server attempts to recall a delegation and the client doesn't do so + * before a timeout, the server may also revoke the delegation. In that case, + * the object will either be destroyed (v4.0) or moved to a per-client list of + * revoked delegations (v4.1+). + * + * This object is a superset of the nfs4_stid. + */ +struct nfs4_delegation { + struct nfs4_stid dl_stid; /* must be first field */ + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; + u32 dl_type; + time64_t dl_time; +/* For recall: */ + int dl_retries; + struct nfsd4_callback dl_recall; +}; + +#define cb_to_delegation(cb) \ + container_of(cb, struct nfs4_delegation, dl_recall) + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + struct sockaddr_storage cb_saddr; + size_t cb_addrlen; + u32 cb_prog; /* used only in 4.0 case; + per-session otherwise */ + u32 cb_ident; /* minorversion 0 only */ + struct svc_xprt *cb_xprt; /* minorversion 1 only */ +}; + +static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_delegation, dl_stid); +} + +/* Maximum number of slots per session. 160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 2048 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + u32 sl_seqid; + __be32 sl_status; + struct svc_cred sl_cred; + u32 sl_datalen; + u16 sl_opcnt; +#define NFSD4_SLOT_INUSE (1 << 0) +#define NFSD4_SLOT_CACHETHIS (1 << 1) +#define NFSD4_SLOT_INITIALIZED (1 << 2) +#define NFSD4_SLOT_CACHED (1 << 3) + u8 sl_flags; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_cb_sec { + u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */ + kuid_t uid; + kgid_t gid; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + struct nfsd4_cb_sec cb_sec; +}; + +struct nfsd4_backchannel_ctl { + u32 bc_cb_program; + struct nfsd4_cb_sec bc_cb_sec; +}; + +struct nfsd4_bind_conn_to_session { + struct nfs4_sessionid sessionid; + u32 dir; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_conn { + struct list_head cn_persession; + struct svc_xprt *cn_xprt; + struct svc_xpt_user cn_xpt_user; + struct nfsd4_session *cn_session; +/* CDFC4_FORE, CDFC4_BACK: */ + unsigned char cn_flags; +}; + +/* + * Representation of a v4.1+ session. These are refcounted in a similar fashion + * to the nfs4_client. References are only taken when the server is actively + * working on the object (primarily during the processing of compounds). + */ +struct nfsd4_session { + atomic_t se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; +/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ +#define NFS4_SESSION_DEAD 0x010 + u32 se_flags; + struct nfs4_client *se_client; + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_cb_sec se_cb_sec; + struct list_head se_conns; + u32 se_cb_prog; + u32 se_cb_seq_nr; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * + * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0) + * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped. + * Each nfsd_net_ns object contains a set of these and they are tracked via + * short and long form clientid. They are hashed and searched for under the + * per-nfsd_net client_lock spinlock. + * + * References to it are only held during the processing of compounds, and in + * certain other operations. In their "resting state" they have a refcount of + * 0. If they are not renewed within a lease period, they become eligible for + * destruction by the laundromat. + * + * These objects can also be destroyed prematurely by the fault injection code, + * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * Care is taken *not* to do this however when the objects have an elevated + * refcount. + * + * o Each nfs4_client is hashed by clientid + * + * o Each nfs4_clients is also hashed by name (the opaque quantity initially + * sent by the client to identify itself). + * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct rb_node cl_namenode; /* link into by-name trees */ + struct list_head *cl_ownerstr_hashtbl; + struct list_head cl_openowners; + struct idr cl_stateids; /* stateid lookup */ + struct list_head cl_delegations; + struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ + struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif + struct xdr_netobj cl_name; /* id generated by client */ + nfs4_verifier cl_verifier; /* generated by client */ + time64_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + bool cl_mach_cred; /* SP4_MACH_CRED in force */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + u32 cl_minorversion; + /* NFSv4.1 client implementation id: */ + struct xdr_netobj cl_nii_domain; + struct xdr_netobj cl_nii_name; + struct timespec64 cl_nii_time; + + /* for v4.0 and v4.1 callbacks: */ + struct nfs4_cb_conn cl_cb_conn; +#define NFSD4_CLIENT_CB_UPDATE (0) +#define NFSD4_CLIENT_CB_KILL (1) +#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ +#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ +#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ +#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ +#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ + 1 << NFSD4_CLIENT_CB_KILL) + unsigned long cl_flags; + const struct cred *cl_cb_cred; + struct rpc_clnt *cl_cb_client; + u32 cl_cb_ident; +#define NFSD4_CB_UP 0 +#define NFSD4_CB_UNKNOWN 1 +#define NFSD4_CB_DOWN 2 +#define NFSD4_CB_FAULT 3 + int cl_cb_state; + struct nfsd4_callback cl_cb_null; + struct nfsd4_session *cl_cb_session; + + /* for all client information that callback code might need: */ + spinlock_t cl_lock; + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + /* number of rpc's in progress over an associated session: */ + atomic_t cl_rpc_users; + struct nfsdfs_client cl_nfsdfs; + struct nfs4_op_map cl_spo_must_allow; + + /* debugging info directory under nfsd/clients/ : */ + struct dentry *cl_nfsd_dentry; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ + struct net *net; + struct list_head async_copies; /* list of async copies */ + spinlock_t async_lock; /* lock for async copies */ + atomic_t cl_cb_inflight; /* Outstanding callbacks */ +}; + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + struct nfs4_client *cr_clp; /* pointer to associated clp */ + struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; +}; + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. + */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + struct knfsd_fh rp_openfh; + struct mutex rp_mutex; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +struct nfs4_stateowner; + +struct nfs4_stateowner_operations { + void (*so_unhash)(struct nfs4_stateowner *); + void (*so_free)(struct nfs4_stateowner *); +}; + +/* + * A core object that represents either an open or lock owner. The object and + * lock owner objects have one of these embedded within them. Refcounts and + * other fields common to both owner types are contained within these + * structures. + */ +struct nfs4_stateowner { + struct list_head so_strhash; + struct list_head so_stateids; + struct nfs4_client *so_client; + const struct nfs4_stateowner_operations *so_ops; + /* after increment in nfsd4_bump_seqid, represents the next + * sequence id expected from the client: */ + atomic_t so_count; + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + struct nfs4_replay so_replay; + bool so_is_open_owner; +}; + +/* + * When a file is opened, the client provides an open state owner opaque string + * that indicates the "owner" of that open. These objects are refcounted. + * References to it are held by each open state associated with it. This object + * is a superset of the nfs4_stateowner struct. + */ +struct nfs4_openowner { + struct nfs4_stateowner oo_owner; /* must be first field */ + struct list_head oo_perclient; + /* + * We keep around openowners a little while after last close, + * which saves clients from having to confirm, and allows us to + * handle close replays if they come soon enough. The close_lru + * is a list of such openowners, to be reaped by the laundromat + * thread eventually if they remain unused: + */ + struct list_head oo_close_lru; + struct nfs4_ol_stateid *oo_last_closed_stid; + time64_t oo_time; /* time of placement on so_close_lru */ +#define NFS4_OO_CONFIRMED 1 + unsigned char oo_flags; +}; + +/* + * Represents a generic "lockowner". Similar to an openowner. References to it + * are held by the lock stateids that are created on its behalf. This object is + * a superset of the nfs4_stateowner struct. + */ +struct nfs4_lockowner { + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_blocked; /* blocked file_locks */ +}; + +static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_openowner, oo_owner); +} + +static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) +{ + return container_of(so, struct nfs4_lockowner, lo_owner); +} + +/* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + refcount_t co_odcount; +}; + +/* + * nfs4_file: a file opened by some number of (open) nfs4_stateowners. + * + * These objects are global. nfsd keeps one instance of a nfs4_file per + * filehandle (though it may keep multiple file descriptors for each). Each + * inode can have multiple filehandles associated with it, so there is + * (potentially) a many to one relationship between this struct and struct + * inode. + * + * These are hashed by filehandle in the file_hashtbl, which is protected by + * the global state_lock spinlock. + */ +struct nfs4_file { + refcount_t fi_ref; + spinlock_t fi_lock; + struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct list_head fi_stateids; + union { + struct list_head fi_delegations; + struct rcu_head fi_rcu; + }; + struct list_head fi_clnt_odstate; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct nfsd_file *fi_fds[3]; + /* + * Each open or lock stateid contributes 0-4 to the counts + * below depending on which bits are set in st_access_bitmap: + * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCES_READ is set + * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set + * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. + */ + atomic_t fi_access[2]; + u32 fi_share_deny; + struct nfsd_file *fi_deleg_file; + int fi_delegees; + struct knfsd_fh fi_fhandle; + bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; + atomic_t fi_lo_recalls; +#endif +}; + +/* + * A generic struct representing either a open or lock stateid. The nfs4_client + * holds a reference to each of these objects, and they in turn hold a + * reference to their respective stateowners. The client's reference is + * released in response to a close or unlock (depending on whether it's an open + * or lock stateid) or when the client is being destroyed. + * + * In the case of v4.0 open stateids, these objects are preserved for a little + * while after close in order to handle CLOSE replays. Those are eventually + * reclaimed via a LRU scheme by the laundromat. + * + * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock". + * Better suggestions welcome. + */ +struct nfs4_ol_stateid { + struct nfs4_stid st_stid; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_locks; + struct nfs4_stateowner *st_stateowner; + struct nfs4_clnt_odstate *st_clnt_odstate; + unsigned char st_access_bmap; + unsigned char st_deny_bmap; + struct nfs4_ol_stateid *st_openstp; + struct mutex st_mutex; +}; + +static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_ol_stateid, st_stid); +} + +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; + struct nfsd_file *ls_file; + struct nfsd4_callback ls_recall; + stateid_t ls_recall_sid; + bool ls_recalled; + struct mutex ls_mutex; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + +/* flags for preprocess_seqid_op() */ +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 + +enum nfsd4_cb_op { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_LAYOUT, + NFSPROC4_CLNT_CB_OFFLOAD, + NFSPROC4_CLNT_CB_SEQUENCE, + NFSPROC4_CLNT_CB_NOTIFY_LOCK, +}; + +/* Returns true iff a is later than b: */ +static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)(a->si_generation - b->si_generation) > 0; +} + +/* + * When a client tries to get a lock on a file, we set one of these objects + * on the blocking lock. When the lock becomes free, we can then issue a + * CB_NOTIFY_LOCK to the server. + */ +struct nfsd4_blocked_lock { + struct list_head nbl_list; + struct list_head nbl_lru; + time64_t nbl_time; + struct file_lock nbl_lock; + struct knfsd_fh nbl_fh; + struct nfsd4_callback nbl_cb; +}; + +struct nfsd4_compound_state; +struct nfsd_net; +struct nfsd4_copy; + +extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct nfsd_file **filp, + struct nfs4_stid **cstid); +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)); +int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); +void nfs4_free_copy_state(struct nfsd4_copy *copy); +struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, + struct nfs4_stid *p_stid); +void nfs4_unhash_stid(struct nfs4_stid *s); +void nfs4_put_stid(struct nfs4_stid *s); +void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); +void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); +extern void nfs4_release_reclaim(struct nfsd_net *); +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name, + struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid, + struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); +extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); +extern void nfsd4_run_cb(struct nfsd4_callback *cb); +extern int nfsd4_create_callback_queue(void); +extern void nfsd4_destroy_callback_queue(void); +extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfsd4_shutdown_copy(struct nfs4_client *clp); +extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, + struct xdr_netobj princhash, struct nfsd_net *nn); +extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); + +struct nfs4_file *find_file(struct knfsd_fh *fh); +void put_nfs4_file(struct nfs4_file *fi); +extern void nfs4_put_copy(struct nfsd4_copy *copy); +extern struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *staetid); +extern void nfs4_put_cpntf_state(struct nfsd_net *nn, + struct nfs4_cpntf_state *cps); +extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_client *clp, + struct nfs4_cpntf_state **cps); +static inline void get_nfs4_file(struct nfs4_file *fi) +{ + refcount_inc(&fi->fi_ref); +} +struct nfsd_file *find_any_file(struct nfs4_file *f); + +/* grace period management */ +void nfsd4_end_grace(struct nfsd_net *nn); + +/* nfs4recover operations */ +extern int nfsd4_client_tracking_init(struct net *net); +extern void nfsd4_client_tracking_exit(struct net *net); +extern void nfsd4_client_record_create(struct nfs4_client *clp); +extern void nfsd4_client_record_remove(struct nfs4_client *clp); +extern int nfsd4_client_record_check(struct nfs4_client *clp); +extern void nfsd4_record_grace_done(struct nfsd_net *nn); + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c new file mode 100644 index 000000000..b1bc582b0 --- /dev/null +++ b/fs/nfsd/stats.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * procfs-based user access to knfsd statistics + * + * /proc/net/rpc/nfsd + * + * Format: + * rc <hits> <misses> <nocache> + * Statistsics for the reply cache + * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache> + * statistics for filehandle lookup + * io <bytes-read> <bytes-written> + * statistics for IO throughput + * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> + * time (seconds) when nfsd thread usage above thresholds + * and number of times that all threads were in use + * ra cache-size <10% <20% <30% ... <100% not-found + * number of times that read-ahead entry was found that deep in + * the cache. + * plus generic RPC stats (see net/sunrpc/stats.c) + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + */ + +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/sunrpc/stats.h> +#include <net/net_namespace.h> + +#include "nfsd.h" + +struct nfsd_stats nfsdstats; +struct svc_stat nfsd_svcstats = { + .program = &nfsd_program, +}; + +static int nfsd_proc_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", + nfsdstats.rchits, + nfsdstats.rcmisses, + nfsdstats.rcnocache, + nfsdstats.fh_stale, + nfsdstats.fh_lookup, + nfsdstats.fh_anon, + nfsdstats.fh_nocache_dir, + nfsdstats.fh_nocache_nondir, + nfsdstats.io_read, + nfsdstats.io_write); + /* thread usage: */ + seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); + for (i=0; i<10; i++) { + unsigned int jifs = nfsdstats.th_usage[i]; + unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; + seq_printf(seq, " %u.%03u", sec, msec); + } + + /* newline and ra-cache */ + seq_printf(seq, "\nra %u", nfsdstats.ra_size); + for (i=0; i<11; i++) + seq_printf(seq, " %u", nfsdstats.ra_depth[i]); + seq_putc(seq, '\n'); + + /* show my rpc info */ + svc_seq_show(seq, &nfsd_svcstats); + +#ifdef CONFIG_NFSD_V4 + /* Show count for individual nfsv4 operations */ + /* Writing operation numbers 0 1 2 also for maintaining uniformity */ + seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) + seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + + seq_putc(seq, '\n'); +#endif + + return 0; +} + +static int nfsd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_proc_show, NULL); +} + +static const struct proc_ops nfsd_proc_ops = { + .proc_open = nfsd_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +void +nfsd_stat_init(void) +{ + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); +} + +void +nfsd_stat_shutdown(void) +{ + svc_proc_unregister(&init_net, "nfsd"); +} diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h new file mode 100644 index 000000000..b23fdac69 --- /dev/null +++ b/fs/nfsd/stats.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Statistics for NFS server. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H + +#include <uapi/linux/nfsd/stats.h> + + +struct nfsd_stats { + unsigned int rchits; /* repcache hits */ + unsigned int rcmisses; /* repcache hits */ + unsigned int rcnocache; /* uncached reqs */ + unsigned int fh_stale; /* FH stale error */ + unsigned int fh_lookup; /* dentry cached */ + unsigned int fh_anon; /* anon file dentry returned */ + unsigned int fh_nocache_dir; /* filehandle not found in dcache */ + unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ + unsigned int io_read; /* bytes returned to read requests */ + unsigned int io_write; /* bytes passed in write requests */ + unsigned int th_cnt; /* number of available threads */ + unsigned int th_usage[10]; /* number of ticks during which n perdeciles + * of available threads were in use */ + unsigned int th_fullcnt; /* number of times last free thread was used */ + unsigned int ra_size; /* size of ra cache */ + unsigned int ra_depth[11]; /* number of times ra entry was found that deep + * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + +}; + + +extern struct nfsd_stats nfsdstats; +extern struct svc_stat nfsd_svcstats; + +void nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +#endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c new file mode 100644 index 000000000..90967466a --- /dev/null +++ b/fs/nfsd/trace.c @@ -0,0 +1,3 @@ + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h new file mode 100644 index 000000000..a952f4a9b --- /dev/null +++ b/fs/nfsd/trace.h @@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfsd + +#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _NFSD_TRACE_H + +#include <linux/tracepoint.h> +#include "export.h" +#include "nfsfh.h" + +TRACE_EVENT(nfsd_compound, + TP_PROTO(const struct svc_rqst *rqst, + u32 args_opcnt), + TP_ARGS(rqst, args_opcnt), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, args_opcnt) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->args_opcnt = args_opcnt; + ), + TP_printk("xid=0x%08x opcnt=%u", + __entry->xid, __entry->args_opcnt) +) + +TRACE_EVENT(nfsd_compound_status, + TP_PROTO(u32 args_opcnt, + u32 resp_opcnt, + __be32 status, + const char *name), + TP_ARGS(args_opcnt, resp_opcnt, status, name), + TP_STRUCT__entry( + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(int, status) + __string(name, name) + ), + TP_fast_assign( + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->status = be32_to_cpu(status); + __assign_str(name, name); + ), + TP_printk("op=%u/%u %s status=%d", + __entry->resp_opcnt, __entry->args_opcnt, + __get_str(name), __entry->status) +) + +DECLARE_EVENT_CLASS(nfsd_fh_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + int status), + TP_ARGS(rqstp, fhp, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x status=%d", + __entry->xid, __entry->fh_hash, + __entry->status) +) + +#define DEFINE_NFSD_FH_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + int status), \ + TP_ARGS(rqstp, fhp, status)) + +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); + +TRACE_EVENT(nfsd_exp_find_key, + TP_PROTO(const struct svc_expkey *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __field(int, status) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __entry->status = status; + ), + TP_printk("fsid=%x::%s domain=%s status=%d", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_expkey_update, + TP_PROTO(const struct svc_expkey *key, const char *exp_path), + TP_ARGS(key, exp_path), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __string(path, exp_path) + __field(bool, cache) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __assign_str(path, exp_path); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __get_str(path), + __entry->cache ? "pos" : "neg" + ) +); + +TRACE_EVENT(nfsd_exp_get_by_name, + TP_PROTO(const struct svc_export *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(int, status) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->status = status; + ), + TP_printk("path=%s domain=%s status=%d", + __get_str(path), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_export_update, + TP_PROTO(const struct svc_export *key), + TP_ARGS(key), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(bool, cache) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("path=%s domain=%s cache=%s", + __get_str(path), + __get_str(auth_domain), + __entry->cache ? "pos" : "neg" + ) +); + +DECLARE_EVENT_CLASS(nfsd_io_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + u64 offset, + u32 len), + TP_ARGS(rqstp, fhp, offset, len), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(u64, offset) + __field(u32, len) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->len) +) + +#define DEFINE_NFSD_IO_EVENT(name) \ +DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + u64 offset, \ + u32 len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_IO_EVENT(read_start); +DEFINE_NFSD_IO_EVENT(read_splice); +DEFINE_NFSD_IO_EVENT(read_vector); +DEFINE_NFSD_IO_EVENT(read_io_done); +DEFINE_NFSD_IO_EVENT(read_done); +DEFINE_NFSD_IO_EVENT(write_start); +DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_io_done); +DEFINE_NFSD_IO_EVENT(write_done); + +DECLARE_EVENT_CLASS(nfsd_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int status), + TP_ARGS(rqstp, fhp, offset, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(loff_t, offset) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->offset = offset; + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->status) +) + +#define DEFINE_NFSD_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_ERR_EVENT(read_err); +DEFINE_NFSD_ERR_EVENT(write_err); + +#include "state.h" +#include "filecache.h" +#include "vfs.h" + +DECLARE_EVENT_CLASS(nfsd_stateid_class, + TP_PROTO(stateid_t *stp), + TP_ARGS(stp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x", + __entry->cl_boot, + __entry->cl_id, + __entry->si_id, + __entry->si_generation) +) + +#define DEFINE_STATEID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \ + TP_PROTO(stateid_t *stp), \ + TP_ARGS(stp)) + +DEFINE_STATEID_EVENT(layoutstate_alloc); +DEFINE_STATEID_EVENT(layoutstate_unhash); +DEFINE_STATEID_EVENT(layoutstate_free); +DEFINE_STATEID_EVENT(layout_get_lookup_fail); +DEFINE_STATEID_EVENT(layout_commit_lookup_fail); +DEFINE_STATEID_EVENT(layout_return_lookup_fail); +DEFINE_STATEID_EVENT(layout_recall); +DEFINE_STATEID_EVENT(layout_recall_done); +DEFINE_STATEID_EVENT(layout_recall_fail); +DEFINE_STATEID_EVENT(layout_recall_release); + +DEFINE_STATEID_EVENT(open); +DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_break); +DEFINE_STATEID_EVENT(deleg_recall); + +DECLARE_EVENT_CLASS(nfsd_stateseqid_class, + TP_PROTO(u32 seqid, const stateid_t *stp), + TP_ARGS(seqid, stp), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->seqid = seqid; + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("seqid=%u client %08x:%08x stateid %08x:%08x", + __entry->seqid, __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation) +) + +#define DEFINE_STATESEQID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ + TP_PROTO(u32 seqid, const stateid_t *stp), \ + TP_ARGS(seqid, stp)) + +DEFINE_STATESEQID_EVENT(preprocess); +DEFINE_STATESEQID_EVENT(open_confirm); + +DECLARE_EVENT_CLASS(nfsd_clientid_class, + TP_PROTO(const clientid_t *clid), + TP_ARGS(clid), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + ), + TP_fast_assign( + __entry->cl_boot = clid->cl_boot; + __entry->cl_id = clid->cl_id; + ), + TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) +) + +#define DEFINE_CLIENTID_EVENT(name) \ +DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \ + TP_PROTO(const clientid_t *clid), \ + TP_ARGS(clid)) + +DEFINE_CLIENTID_EVENT(expired); +DEFINE_CLIENTID_EVENT(purged); +DEFINE_CLIENTID_EVENT(renew); +DEFINE_CLIENTID_EVENT(stale); + +DECLARE_EVENT_CLASS(nfsd_net_class, + TP_PROTO(const struct nfsd_net *nn), + TP_ARGS(nn), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + ), + TP_printk("boot_time=%16llx", __entry->boot_time) +) + +#define DEFINE_NET_EVENT(name) \ +DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ + TP_PROTO(const struct nfsd_net *nn), \ + TP_ARGS(nn)) + +DEFINE_NET_EVENT(grace_start); +DEFINE_NET_EVENT(grace_complete); + +TRACE_EVENT(nfsd_clid_inuse_err, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __field(unsigned int, namelen) + __dynamic_array(unsigned char, name, clp->cl_name.len) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->addr, &clp->cl_addr, + sizeof(struct sockaddr_in6)); + __entry->namelen = clp->cl_name.len; + memcpy(__get_dynamic_array(name), clp->cl_name.data, + clp->cl_name.len); + ), + TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x", + __entry->namelen, __get_str(name), __entry->addr, + __entry->cl_boot, __entry->cl_id) +) + +TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); +TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); +TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); +TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE); +TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); + +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \ + { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + +/* FIXME: This should probably be fleshed out in the future. */ +#define show_nf_may(val) \ + __print_flags(val, "|", \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, + struct inode *inode, unsigned int may_flags, + struct nfsd_file *nf, __be32 status), + + TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(u32, xid) + __field(unsigned int, hash) + __field(void *, inode) + __field(unsigned int, may_flags) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + __field(u32, status) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->hash = hash; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? nf->nf_file : NULL; + __entry->status = be32_to_cpu(status); + ), + + TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + __entry->xid, __entry->hash, __entry->inode, + show_nf_may(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), __entry->nf_file, + __entry->status) +); + +DECLARE_EVENT_CLASS(nfsd_file_search_class, + TP_PROTO(struct inode *inode, unsigned int hash, int found), + TP_ARGS(inode, hash, found), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, hash) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->hash = hash; + __entry->found = found; + ), + TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + __entry->inode, __entry->found) +); + +#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ +DEFINE_EVENT(nfsd_file_search_class, name, \ + TP_PROTO(struct inode *inode, unsigned int hash, int found), \ + TP_ARGS(inode, hash, found)) + +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + +#include "cache.h" + +TRACE_DEFINE_ENUM(RC_DROPIT); +TRACE_DEFINE_ENUM(RC_REPLY); +TRACE_DEFINE_ENUM(RC_DOIT); + +#define show_drc_retval(x) \ + __print_symbolic(x, \ + { RC_DROPIT, "DROPIT" }, \ + { RC_REPLY, "REPLY" }, \ + { RC_DOIT, "DOIT" }) + +TRACE_EVENT(nfsd_drc_found, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int result + ), + TP_ARGS(nn, rqstp, result), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, result) + __field(u32, xid) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->result = result; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + ), + TP_printk("boot_time=%16llx xid=0x%08x result=%s", + __entry->boot_time, __entry->xid, + show_drc_retval(__entry->result)) + +); + +TRACE_EVENT(nfsd_drc_mismatch, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_cacherep *key, + const struct svc_cacherep *rp + ), + TP_ARGS(nn, key, rp), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(u32, cached) + __field(u32, ingress) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(key->c_key.k_xid); + __entry->cached = (__force u32)key->c_key.k_csum; + __entry->ingress = (__force u32)rp->c_key.k_csum; + ), + TP_printk("boot_time=%16llx xid=0x%08x cached-csum=0x%08x ingress-csum=0x%08x", + __entry->boot_time, __entry->xid, __entry->cached, + __entry->ingress) +); + +TRACE_EVENT(nfsd_cb_args, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfs4_cb_conn *conn + ), + TP_ARGS(clp, conn), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, prog) + __field(u32, ident) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->prog = conn->cb_prog; + __entry->ident = conn->cb_ident; + memcpy(__entry->addr, &conn->cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u", + __entry->cl_boot, __entry->cl_id, + __entry->addr, __entry->prog, __entry->ident) +); + +TRACE_EVENT(nfsd_cb_nodelegs, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) +) + +TRACE_DEFINE_ENUM(NFSD4_CB_UP); +TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN); +TRACE_DEFINE_ENUM(NFSD4_CB_DOWN); +TRACE_DEFINE_ENUM(NFSD4_CB_FAULT); + +#define show_cb_state(val) \ + __print_symbolic(val, \ + { NFSD4_CB_UP, "UP" }, \ + { NFSD4_CB_UNKNOWN, "UNKNOWN" }, \ + { NFSD4_CB_DOWN, "DOWN" }, \ + { NFSD4_CB_FAULT, "FAULT"}) + +DECLARE_EVENT_CLASS(nfsd_cb_class, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(unsigned long, state) + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->state = clp->cl_cb_state; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x state=%s", + __entry->addr, __entry->cl_boot, __entry->cl_id, + show_cb_state(__entry->state)) +); + +#define DEFINE_NFSD_CB_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ + TP_PROTO(const struct nfs4_client *clp), \ + TP_ARGS(clp)) + +DEFINE_NFSD_CB_EVENT(setup); +DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(shutdown); + +TRACE_EVENT(nfsd_cb_setup_err, + TP_PROTO( + const struct nfs4_client *clp, + long error + ), + TP_ARGS(clp, error), + TP_STRUCT__entry( + __field(long, error) + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->error = error; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x error=%ld", + __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error) +); + +TRACE_EVENT(nfsd_cb_work, + TP_PROTO( + const struct nfs4_client *clp, + const char *procedure + ), + TP_ARGS(clp, procedure), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __string(procedure, procedure) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_str(procedure, procedure) + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x procedure=%s", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __get_str(procedure)) +); + +TRACE_EVENT(nfsd_cb_done, + TP_PROTO( + const struct nfs4_client *clp, + int status + ), + TP_ARGS(clp, status), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->status = status; + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x status=%d", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->status) +); + +#endif /* _NFSD_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c new file mode 100644 index 000000000..31edb883a --- /dev/null +++ b/fs/nfsd/vfs.c @@ -0,0 +1,2407 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * File operations used by nfsd. Some of these have been ripped from + * other parts of the kernel because they weren't exported, others + * are partial duplicates with added or changed functionality. + * + * Note that several functions dget() the dentry upon which they want + * to act, most notably those that create directory entries. Response + * dentry's are dput()'d if necessary in the release callback. + * So if you notice code paths that apparently fail to dput() the + * dentry, don't worry--they have been taken care of. + * + * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> + * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> + */ + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/splice.h> +#include <linux/falloc.h> +#include <linux/fcntl.h> +#include <linux/namei.h> +#include <linux/delay.h> +#include <linux/fsnotify.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> +#include <linux/jhash.h> +#include <linux/ima.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/exportfs.h> +#include <linux/writeback.h> +#include <linux/security.h> + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + +#ifdef CONFIG_NFSD_V4 +#include "../internal.h" +#include "acl.h" +#include "idmap.h" +#endif /* CONFIG_NFSD_V4 */ + +#include "nfsd.h" +#include "vfs.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FILEOP + +/* + * Called from nfsd_lookup and encode_dirent. Check if we have crossed + * a mount point. + * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, + * or nfs_ok having possibly changed *dpp and *expp + */ +int +nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp) +{ + struct svc_export *exp = *expp, *exp2 = NULL; + struct dentry *dentry = *dpp; + struct path path = {.mnt = mntget(exp->ex_path.mnt), + .dentry = dget(dentry)}; + int err = 0; + + err = follow_down(&path); + if (err < 0) + goto out; + if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && + nfsd_mountpoint(dentry, exp) == 2) { + /* This is only a mountpoint in some other namespace */ + path_put(&path); + goto out; + } + + exp2 = rqst_exp_get_by_name(rqstp, &path); + if (IS_ERR(exp2)) { + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; + path_put(&path); + goto out; + } + if (nfsd_v4client(rqstp) || + (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { + /* successfully crossed mount point */ + /* + * This is subtle: path.dentry is *not* on path.mnt + * at this point. The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp + */ + *dpp = path.dentry; + path.dentry = dentry; + *expp = exp2; + exp2 = exp; + } + path_put(&path); + exp_put(exp2); +out: + return err; +} + +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + * We return: + * '1' if this dentry *must* be an export point, + * '2' if it might be, if there is really a mount here, and + * '0' if there is no chance of an export point here. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (!d_inode(dentry)) + return 0; + if (exp->ex_flags & NFSEXP_V4ROOT) + return 1; + if (nfsd4_is_junction(dentry)) + return 1; + if (d_mountpoint(dentry)) + /* + * Might only be a mountpoint in a different namespace, + * but we need to check. + */ + return 2; + return 0; +} + +__be32 +nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, + const char *name, unsigned int len, + struct svc_export **exp_ret, struct dentry **dentry_ret) +{ + struct svc_export *exp; + struct dentry *dparent; + struct dentry *dentry; + int host_err; + + dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + + dparent = fhp->fh_dentry; + exp = exp_get(fhp->fh_export); + + /* Lookup the name, but don't follow links */ + if (isdotent(name, len)) { + if (len==1) + dentry = dget(dparent); + else if (dparent != exp->ex_path.dentry) + dentry = dget_parent(dparent); + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) + dentry = dget(dparent); /* .. == . just like at / */ + else { + /* checking mountpoint crossing is very different when stepping up */ + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) + goto out_nfserr; + } + } else { + /* + * In the nfsd4_open() case, this may be held across + * subsequent open and delegation acquisition which may + * need to take the child's i_mutex: + */ + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dparent, len); + host_err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_nfserr; + if (nfsd_mountpoint(dentry, exp)) { + /* + * We don't need the i_mutex after all. It's + * still possible we could open this (regular + * files can be mountpoints too), but the + * i_mutex is just there to prevent renames of + * something that we might be about to delegate, + * and a mountpoint won't be renamed: + */ + fh_unlock(fhp); + if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + dput(dentry); + goto out_nfserr; + } + } + } + *dentry_ret = dentry; + *exp_ret = exp; + return 0; + +out_nfserr: + exp_put(exp); + return nfserrno(host_err); +} + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_NOHIDE, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown <neilb@cse.unsw.edu.au> + */ +__be32 +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + unsigned int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dentry; + __be32 err; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + return err; + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (err) + return err; + err = check_nfsd_access(exp, rqstp); + if (err) + goto out; + /* + * Note: we compose the file handle now, but as the + * dentry may be negative, it may need to be updated. + */ + err = fh_compose(resfh, exp, dentry, fhp); + if (!err && d_really_is_negative(dentry)) + err = nfserr_noent; +out: + dput(dentry); + exp_put(exp); + return err; +} + +/* + * Commit metadata changes to stable storage. + */ +static int +commit_inode_metadata(struct inode *inode) +{ + const struct export_operations *export_ops = inode->i_sb->s_export_op; + + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); +} + +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + return commit_inode_metadata(inode); +} + +/* + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. + */ +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) +{ + /* sanitize the mode change */ + if (iap->ia_valid & ATTR_MODE) { + iap->ia_mode &= S_IALLUGO; + iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid on chown */ + if (!S_ISDIR(inode->i_mode) && + ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) { + iap->ia_valid |= ATTR_KILL_PRIV; + if (iap->ia_valid & ATTR_MODE) { + /* we're setting mode too, just clear the s*id bits */ + iap->ia_mode &= ~S_ISUID; + if (iap->ia_mode & S_IXGRP) + iap->ia_mode &= ~S_ISGID; + } else { + /* set ATTR_KILL_* bits and let VFS handle it */ + iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + } + } +} + +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + int host_err; + + if (iap->ia_size < inode->i_size) { + __be32 err; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; + + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; + +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); +} + +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time64_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + bool get_write_count; + bool size_change = (iap->ia_valid & ATTR_SIZE); + + if (iap->ia_valid & ATTR_SIZE) { + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + ftype = S_IFREG; + } + + /* + * If utimes(2) and friends are called with times not NULL, we should + * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission + * will return EACCES, when the caller's effective UID does not match + * the owner of the file, and the caller is not privileged. In this + * situation, we should return EPERM(notify_change will return this). + */ + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) { + accmode |= NFSD_MAY_OWNER_OVERRIDE; + if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET))) + accmode |= NFSD_MAY_WRITE; + } + + /* Callers that do fh_verify should do the fh_want_write: */ + get_write_count = !fhp->fh_dentry; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + return err; + if (get_write_count) { + host_err = fh_want_write(fhp); + if (host_err) + goto out; + } + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + return 0; + + nfsd_sanitize_attrs(inode, iap); + + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + + /* + * The size case is special, it changes the file in addition to the + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. + */ + if (size_change) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + return err; + } + + fh_lock(fhp); + if (size_change) { + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. + * + * (and similar for the older RFCs) + */ + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; + + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; + + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; + } + + iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, iap, NULL); + +out_unlock: + fh_unlock(fhp); + if (size_change) + put_write_access(inode); +out: + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); +} + +#if defined(CONFIG_NFSD_V4) +/* + * NFS junction information is stored in an extended attribute. + */ +#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" + +/** + * nfsd4_is_junction - Test if an object could be an NFS junction + * + * @dentry: object to test + * + * Returns 1 if "dentry" appears to contain NFS junction information. + * Otherwise 0 is returned. + */ +int nfsd4_is_junction(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode == NULL) + return 0; + if (inode->i_mode & S_IXUGO) + return 0; + if (!(inode->i_mode & S_ISVTX)) + return 0; + if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) + return 0; + return 1; +} +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + __be32 error; + int host_error; + struct dentry *dentry; + + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + + inode_lock(d_inode(dentry)); + host_error = security_inode_setsecctx(dentry, label->data, label->len); + inode_unlock(d_inode(dentry)); + return nfserrno(host_error); +} +#else +__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct xdr_netobj *label) +{ + return nfserr_notsupp; +} +#endif + +__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync) +{ + struct file *src = nf_src->nf_file; + struct file *dst = nf_dst->nf_file; + errseq_t since; + loff_t cloned; + __be32 ret = 0; + + since = READ_ONCE(dst->f_wb_err); + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); + if (cloned < 0) { + ret = nfserrno(cloned); + goto out_err; + } + if (count && cloned != count) { + ret = nfserrno(-EINVAL); + goto out_err; + } + if (sync) { + loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX; + int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); + + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); + if (!status) + status = commit_inode_metadata(file_inode(src)); + if (status < 0) { + nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net, + nfsd_net_id)); + ret = nfserrno(status); + } + } +out_err: + return ret; +} + +ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + ssize_t ret; + + /* + * Limit copy to 4MB to prevent indefinitely blocking an nfsd + * thread and client rpc slot. The choice of 4MB is somewhat + * arbitrary. We might instead base this on r/wsize, or make it + * tunable, or use a time instead of a byte limit, or implement + * asynchronous copy. In theory a client could also recognize a + * limit like this and pipeline multiple COPY requests. + */ + count = min_t(u64, count, 1 << 22); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, + COPY_FILE_SPLICE); + return ret; +} + +__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, loff_t len, + int flags) +{ + int error; + + if (!S_ISREG(file_inode(file)->i_mode)) + return nfserr_inval; + + error = vfs_fallocate(file, flags, offset, len); + if (!error) + error = commit_metadata(fhp); + + return nfserrno(error); +} +#endif /* defined(CONFIG_NFSD_V4) */ + +#ifdef CONFIG_NFSD_V3 +/* + * Check server access rights to a file system object + */ +struct accessmap { + u32 access; + int how; +}; +static struct accessmap nfs3_regaccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, + +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + + { 0, 0 } +}; + +static struct accessmap nfs3_diraccess[] = { + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, + +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + + { 0, 0 } +}; + +static struct accessmap nfs3_anyaccess[] = { + /* Some clients - Solaris 2.6 at least, make an access call + * to the server to check for access for things like /dev/null + * (which really, the server doesn't care about). So + * We provide simple access checking for them, looking + * mainly at mode bits, and we make sure to ignore read-only + * filesystem checks + */ + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + + { 0, 0 } +}; + +__be32 +nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) +{ + struct accessmap *map; + struct svc_export *export; + struct dentry *dentry; + u32 query, result = 0, sresult = 0; + __be32 error; + + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (error) + goto out; + + export = fhp->fh_export; + dentry = fhp->fh_dentry; + + if (d_is_reg(dentry)) + map = nfs3_regaccess; + else if (d_is_dir(dentry)) + map = nfs3_diraccess; + else + map = nfs3_anyaccess; + + + query = *access; + for (; map->access; map++) { + if (map->access & query) { + __be32 err2; + + sresult |= map->access; + + err2 = nfsd_permission(rqstp, export, dentry, map->how); + switch (err2) { + case nfs_ok: + result |= map->access; + break; + + /* the following error codes just mean the access was not allowed, + * rather than an error occurred */ + case nfserr_rofs: + case nfserr_acces: + case nfserr_perm: + /* simply don't "or" in the access bit. */ + break; + default: + error = err2; + goto out; + } + } + } + *access = result; + if (supported) + *supported = sresult; + + out: + return error; +} +#endif /* CONFIG_NFSD_V3 */ + +int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} + +/* + * Open an existing file or directory. + * The may_flags argument indicates the type of open (read/write/lock) + * and additional flags. + * N.B. After this call fhp needs an fh_put + */ +static __be32 +__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + struct path path; + struct inode *inode; + struct file *file; + int flags = O_RDONLY|O_LARGEFILE; + __be32 err; + int host_err = 0; + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + inode = d_inode(path.dentry); + + /* Disallow write access to files with the append-only bit set + * or any access when mandatory locking enabled + */ + err = nfserr_perm; + if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) + goto out; + /* + * We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ + if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) + goto out; + + if (!inode->i_fop) + goto out; + + host_err = nfsd_open_break_lease(inode, may_flags); + if (host_err) /* NOMEM or WOULDBLOCK */ + goto out_nfserr; + + if (may_flags & NFSD_MAY_WRITE) { + if (may_flags & NFSD_MAY_READ) + flags = O_RDWR|O_LARGEFILE; + else + flags = O_WRONLY|O_LARGEFILE; + } + + file = dentry_open(&path, flags, current_cred()); + if (IS_ERR(file)) { + host_err = PTR_ERR(file); + goto out_nfserr; + } + + host_err = ima_file_check(file, may_flags); + if (host_err) { + fput(file); + goto out_nfserr; + } + + if (may_flags & NFSD_MAY_64BIT_COOKIE) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + + *filp = file; +out_nfserr: + err = nfserrno(host_err); +out: + return err; +} + +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; +} + +__be32 +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + + validate_process_creds(); + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; +} + +/* + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. + */ +static int +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct svc_rqst *rqstp = sd->u.data; + struct page **pp = rqstp->rq_next_page; + struct page *page = buf->page; + size_t size; + + size = sd->len; + + if (rqstp->rq_res.page_len == 0) { + get_page(page); + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; + rqstp->rq_res.page_base = buf->offset; + rqstp->rq_res.page_len = size; + } else if (page != pp[-1]) { + get_page(page); + if (*rqstp->rq_next_page) + put_page(*rqstp->rq_next_page); + *(rqstp->rq_next_page++) = page; + rqstp->rq_res.page_len += size; + } else + rqstp->rq_res.page_len += size; + + return size; +} + +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + +static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + unsigned long *count, u32 *eof, ssize_t host_err) +{ + if (host_err >= 0) { + nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); + *count = host_err; + fsnotify_access(file); + trace_nfsd_read_io_done(rqstp, fhp, offset, *count); + return 0; + } else { + trace_nfsd_read_err(rqstp, fhp, offset, host_err); + return nfserrno(host_err); + } +} + +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) +{ + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + ssize_t host_err; + + trace_nfsd_read_splice(rqstp, fhp, offset, *count); + rqstp->rq_next_page = rqstp->rq_respages + 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); +} + +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) +{ + struct iov_iter iter; + loff_t ppos = offset; + ssize_t host_err; + + trace_nfsd_read_vector(rqstp, fhp, offset, *count); + iov_iter_kvec(&iter, READ, vec, vlen, *count); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); +} + +/* + * Gathered writes: If another process is currently writing to the file, + * there's a high chance this is another nfsd (triggered by a bulk write + * from a client's biod). Rather than syncing the file with each write + * request, we sleep for 10 msec. + * + * I don't know if this roughly approximates C. Juszak's idea of + * gathered writes, but it's a nice and simple solution (IMHO), and it + * seems to work:-) + * + * Note: we do this only in the NFSv2 case, since v3 and higher have a + * better tool (separate unstable writes and commits) for solving this + * problem. + */ +static int wait_for_concurrent_writes(struct file *file) +{ + struct inode *inode = file_inode(file); + static ino_t last_ino; + static dev_t last_dev; + int err = 0; + + if (atomic_read(&inode->i_writecount) > 1 + || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { + dprintk("nfsd: write defer %d\n", task_pid_nr(current)); + msleep(10); + dprintk("nfsd: write resume %d\n", task_pid_nr(current)); + } + + if (inode->i_state & I_DIRTY) { + dprintk("nfsd: write sync %d\n", task_pid_nr(current)); + err = vfs_fsync(file, 0); + } + last_ino = inode->i_ino; + last_dev = inode->i_sb->s_dev; + return err; +} + +__be32 +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, + loff_t offset, struct kvec *vec, int vlen, + unsigned long *cnt, int stable, + __be32 *verf) +{ + struct file *file = nf->nf_file; + struct svc_export *exp; + struct iov_iter iter; + errseq_t since; + __be32 nfserr; + int host_err; + int use_wgather; + loff_t pos = offset; + unsigned int pflags = current->flags; + rwf_t flags = 0; + + trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + /* + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to + * localhost doesn't cause nfsd to lock up due to all + * the client's dirty pages or its congested queue. + */ + current->flags |= PF_LOCAL_THROTTLE; + + exp = fhp->fh_export; + use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); + + if (!EX_ISSYNC(exp)) + stable = NFS_UNSTABLE; + + if (stable && !use_wgather) + flags |= RWF_SYNC; + + iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + since = READ_ONCE(file->f_wb_err); + if (flags & RWF_SYNC) { + if (verf) + nfsd_copy_boot_verifier(verf, + net_generic(SVC_NET(rqstp), + nfsd_net_id)); + host_err = vfs_iter_write(file, &iter, &pos, flags); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } else { + if (verf) + nfsd_copy_boot_verifier(verf, + net_generic(SVC_NET(rqstp), + nfsd_net_id)); + host_err = vfs_iter_write(file, &iter, &pos, flags); + } + if (host_err < 0) { + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + goto out_nfserr; + } + *cnt = host_err; + nfsdstats.io_write += *cnt; + fsnotify_modify(file); + host_err = filemap_check_wb_err(file->f_mapping, since); + if (host_err < 0) + goto out_nfserr; + + if (stable && use_wgather) { + host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } + +out_nfserr: + if (host_err >= 0) { + trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt); + nfserr = nfs_ok; + } else { + trace_nfsd_write_err(rqstp, fhp, offset, host_err); + nfserr = nfserrno(host_err); + } + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + current_restore_flags(pflags, PF_LOCAL_THROTTLE); + return nfserr; +} + +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) +{ + struct nfsd_file *nf; + struct file *file; + __be32 err; + + trace_nfsd_read_start(rqstp, fhp, offset, *count); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + if (err) + return err; + + file = nf->nf_file; + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); + else + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); + + nfsd_file_put(nf); + + trace_nfsd_read_done(rqstp, fhp, offset, *count); + + return err; +} + +/* + * Write data to a file. + * The stable flag requests synchronous writes. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, int stable, + __be32 *verf) +{ + struct nfsd_file *nf; + __be32 err; + + trace_nfsd_write_start(rqstp, fhp, offset, *cnt); + + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); + if (err) + goto out; + + err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec, + vlen, cnt, stable, verf); + nfsd_file_put(nf); +out: + trace_nfsd_write_done(rqstp, fhp, offset, *cnt); + return err; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Commit all pending writes to stable storage. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. + * + * Unfortunately we cannot lock the file to make sure we return full WCC + * data to the client, as locking happens lower down in the filesystem. + */ +__be32 +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long count, __be32 *verf) +{ + struct nfsd_file *nf; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; + + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } + + err = nfsd_file_acquire(rqstp, fhp, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (err) + goto out; + if (EX_ISSYNC(fhp->fh_export)) { + errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); + int err2; + + err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + switch (err2) { + case 0: + nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, + nfsd_net_id)); + err2 = filemap_check_wb_err(nf->nf_file->f_mapping, + since); + err = nfserrno(err2); + break; + case -EINVAL: + err = nfserr_notsupp; + break; + default: + nfsd_reset_boot_verifier(net_generic(nf->nf_net, + nfsd_net_id)); + err = nfserrno(err2); + } + } else + nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, + nfsd_net_id)); + + nfsd_file_put(nf); +out: + return err; +} +#endif /* CONFIG_NFSD_V3 */ + +static __be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, + struct iattr *iap) +{ + /* + * Mode has already been set earlier in create: + */ + iap->ia_valid &= ~ATTR_MODE; + /* + * Setting uid/gid works only for root. Irix appears to + * send along the gid on create when it tries to implement + * setgid directories via NFS: + */ + if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + if (iap->ia_valid) + return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); + /* Callers expect file metadata to be committed here */ + return nfserrno(commit_metadata(resfhp)); +} + +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + +/* The parent directory should already be locked: */ +__be32 +nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild; + struct inode *dirp; + __be32 err; + __be32 err2; + int host_err; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n", + dentry); + err = nfserr_io; + goto out; + } + + err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE); + if (err) + goto out; + + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + + if (!IS_POSIXACL(dirp)) + iap->ia_mode &= ~current_umask(); + + err = 0; + host_err = 0; + switch (type) { + case S_IFREG: + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + if (!host_err) + nfsd_check_ignore_resizing(iap); + break; + case S_IFDIR: + host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + if (!host_err && unlikely(d_unhashed(dchild))) { + struct dentry *d; + d = lookup_one_len(dchild->d_name.name, + dchild->d_parent, + dchild->d_name.len); + if (IS_ERR(d)) { + host_err = PTR_ERR(d); + break; + } + if (unlikely(d_is_negative(d))) { + dput(d); + err = nfserr_serverfault; + goto out; + } + dput(resfhp->fh_dentry); + resfhp->fh_dentry = dget(d); + err = fh_update(resfhp); + dput(dchild); + dchild = d; + if (err) + goto out; + } + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + default: + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + host_err = -EINVAL; + } + if (host_err < 0) + goto out_nfserr; + + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_create_setattr already committed the child. Transactional + * filesystems had a chance to commit changes for both parent and + * child simultaneously making the following commit_metadata a + * noop. + */ + err2 = nfserrno(commit_metadata(fhp)); + if (err2) + err = err2; + /* + * Update the file handle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); +out: + dput(dchild); + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a filesystem object (regular, directory, special). + * Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + __be32 err; + int host_err; + + if (isdotent(fname, flen)) + return nfserr_exist; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP); + if (err) + return err; + + dentry = fhp->fh_dentry; + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + /* + * We unconditionally drop our ref to dchild as fh_compose will have + * already grabbed its own ref for it. + */ + dput(dchild); + if (err) + return err; + return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, + rdev, resfhp); +} + +#ifdef CONFIG_NFSD_V3 + +/* + * NFSv3 and NFSv4 version of nfsd_create + */ +__be32 +do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + struct svc_fh *resfhp, int createmode, u32 *verifier, + bool *truncp, bool *created) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + __u32 v_mtime=0, v_atime=0; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + /* + * Compose the response file handle. + */ + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + + /* If file doesn't exist, check for permissions to create one */ + if (d_really_is_negative(dchild)) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + + if (nfsd_create_is_exclusive(createmode)) { + /* solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so just clear the high bits. If this is + * ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be fixed + * accordingly. + */ + v_mtime = verifier[0]&0x7fffffff; + v_atime = verifier[1]&0x7fffffff; + } + + if (d_really_is_positive(dchild)) { + err = 0; + + switch (createmode) { + case NFS3_CREATE_UNCHECKED: + if (! d_is_reg(dchild)) + goto out; + else if (truncp) { + /* in nfsv4, we need to treat this case a little + * differently. we don't want to truncate the + * file now; this would be wrong if the OPEN + * fails for some other reason. furthermore, + * if the size is nonzero, we should ignore it + * according to spec! + */ + *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; + } + else { + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + } + break; + case NFS3_CREATE_EXCLUSIVE: + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { + if (created) + *created = true; + break; + } + fallthrough; + case NFS4_CREATE_EXCLUSIVE4_1: + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { + if (created) + *created = true; + goto set_attr; + } + fallthrough; + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } + fh_drop_write(fhp); + goto out; + } + + if (!IS_POSIXACL(dirp)) + iap->ia_mode &= ~current_umask(); + + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + if (host_err < 0) { + fh_drop_write(fhp); + goto out_nfserr; + } + if (created) + *created = true; + + nfsd_check_ignore_resizing(iap); + + if (nfsd_create_is_exclusive(createmode)) { + /* Cram the verifier into atime/mtime */ + iap->ia_valid = ATTR_MTIME|ATTR_ATIME + | ATTR_MTIME_SET|ATTR_ATIME_SET; + /* XXX someone who knows this better please fix it for nsec */ + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + + set_attr: + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_create_setattr already committed the child + * (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); + + /* + * Update the filehandle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); + + out: + fh_unlock(fhp); + if (dchild && !IS_ERR(dchild)) + dput(dchild); + fh_drop_write(fhp); + return err; + + out_nfserr: + err = nfserrno(host_err); + goto out; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Read a symlink. On entry, *lenp must contain the maximum path length that + * fits into the buffer. On return, it contains the true length. + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) +{ + __be32 err; + const char *link; + struct path path; + DEFINE_DELAYED_CALL(done); + int len; + + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); + if (unlikely(err)) + return err; + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + + if (unlikely(!d_is_symlink(path.dentry))) + return nfserr_inval; + + touch_atime(&path); + + link = vfs_get_link(path.dentry, &done); + if (IS_ERR(link)) + return nfserrno(PTR_ERR(link)); + + len = strlen(link); + if (len < *lenp) + *lenp = len; + memcpy(buf, link, *lenp); + do_delayed_call(&done); + return 0; +} + +/* + * Create a symlink and look up its inode + * N.B. After this call _both_ fhp and resfhp need an fh_put + */ +__be32 +nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, + char *path, + struct svc_fh *resfhp) +{ + struct dentry *dentry, *dnew; + __be32 err, cerr; + int host_err; + + err = nfserr_noent; + if (!flen || path[0] == '\0') + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock(fhp); + dentry = fhp->fh_dentry; + dnew = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + host_err = vfs_symlink(d_inode(dentry), dnew, path); + err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); + fh_unlock(fhp); + + fh_drop_write(fhp); + + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + dput(dnew); + if (err==0) err = cerr; +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out; +} + +/* + * Create a hardlink + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, + char *name, int len, struct svc_fh *tfhp) +{ + struct dentry *ddir, *dnew, *dold; + struct inode *dirp; + __be32 err; + int host_err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); + if (err) + goto out; + err = nfserr_isdir; + if (d_is_dir(tfhp->fh_dentry)) + goto out; + err = nfserr_perm; + if (!len) + goto out; + err = nfserr_exist; + if (isdotent(name, len)) + goto out; + + host_err = fh_want_write(tfhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + + fh_lock_nested(ffhp, I_MUTEX_PARENT); + ddir = ffhp->fh_dentry; + dirp = d_inode(ddir); + + dnew = lookup_one_len(name, ddir, len); + host_err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + dold = tfhp->fh_dentry; + + err = nfserr_noent; + if (d_really_is_negative(dold)) + goto out_dput; + host_err = vfs_link(dold, dirp, dnew, NULL); + if (!host_err) { + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); + } else { + if (host_err == -EXDEV && rqstp->rq_vers == 2) + err = nfserr_acces; + else + err = nfserrno(host_err); + } +out_dput: + dput(dnew); +out_unlock: + fh_unlock(ffhp); + fh_drop_write(tfhp); +out: + return err; + +out_nfserr: + err = nfserrno(host_err); + goto out_unlock; +} + +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); +} + +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + +/* + * Rename a file + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +__be32 +nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, + struct svc_fh *tfhp, char *tname, int tlen) +{ + struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct inode *fdir, *tdir; + __be32 err; + int host_err; + bool has_cached = false; + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + + fdentry = ffhp->fh_dentry; + fdir = d_inode(fdentry); + + tdentry = tfhp->fh_dentry; + tdir = d_inode(tdentry); + + err = nfserr_perm; + if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) + goto out; + + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out; + +retry: + host_err = fh_want_write(ffhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + + /* cannot use fh_lock as we need deadlock protective ordering + * so do it by hand */ + trap = lock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = true; + fill_pre_wcc(ffhp); + fill_pre_wcc(tfhp); + + odentry = lookup_one_len(fname, fdentry, flen); + host_err = PTR_ERR(odentry); + if (IS_ERR(odentry)) + goto out_nfserr; + + host_err = -ENOENT; + if (d_really_is_negative(odentry)) + goto out_dput_old; + host_err = -EINVAL; + if (odentry == trap) + goto out_dput_old; + + ndentry = lookup_one_len(tname, tdentry, tlen); + host_err = PTR_ERR(ndentry); + if (IS_ERR(ndentry)) + goto out_dput_old; + host_err = -ENOTEMPTY; + if (ndentry == trap) + goto out_dput_new; + + if (nfsd_has_cached_files(ndentry)) { + has_cached = true; + goto out_dput_old; + } else { + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } + } + out_dput_new: + dput(ndentry); + out_dput_old: + dput(odentry); + out_nfserr: + err = nfserrno(host_err); + /* + * We cannot rely on fh_unlock on the two filehandles, + * as that would do the wrong thing if the two directories + * were the same, so again we do it by hand. + */ + if (!has_cached) { + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + } + unlock_rename(tdentry, fdentry); + ffhp->fh_locked = tfhp->fh_locked = false; + fh_drop_write(ffhp); + + /* + * If the target dentry has cached open files, then we need to try to + * close them prior to doing the rename. Flushing delayed fput + * shouldn't be done with locks held however, so we delay it until this + * point and then reattempt the whole shebang. + */ + if (has_cached) { + has_cached = false; + nfsd_close_cached_files(ndentry); + dput(ndentry); + goto retry; + } +out: + return err; +} + +/* + * Unlink a file or directory + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + char *fname, int flen) +{ + struct dentry *dentry, *rdentry; + struct inode *dirp; + __be32 err; + int host_err; + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) + goto out; + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) + goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + rdentry = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(rdentry); + if (IS_ERR(rdentry)) + goto out_drop_write; + + if (d_really_is_negative(rdentry)) { + dput(rdentry); + host_err = -ENOENT; + goto out_drop_write; + } + + if (!type) + type = d_inode(rdentry)->i_mode & S_IFMT; + + if (type != S_IFDIR) { + nfsd_close_cached_files(rdentry); + host_err = vfs_unlink(dirp, rdentry, NULL); + } else { + host_err = vfs_rmdir(dirp, rdentry); + } + + if (!host_err) + host_err = commit_metadata(fhp); + dput(rdentry); + +out_drop_write: + fh_drop_write(fhp); +out_nfserr: + if (host_err == -EBUSY) { + /* name is mounted-on. There is no perfect + * error status. + */ + if (nfsd_v4client(rqstp)) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } +out: + return err; +} + +/* + * We do this buffering because we must not call back into the file + * system's ->lookup() method from the filldir callback. That may well + * deadlock a number of file systems. + * + * This is based heavily on the implementation of same in XFS. + */ +struct buffered_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct readdir_data { + struct dir_context ctx; + char *dirent; + size_t used; + int full; +}; + +static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct readdir_data *buf = + container_of(ctx, struct readdir_data, ctx); + struct buffered_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) { + buf->full = 1; + return -EINVAL; + } + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) +{ + struct buffered_dirent *de; + int host_err; + int size; + loff_t offset; + struct readdir_data buf = { + .ctx.actor = nfsd_buffered_filldir, + .dirent = (void *)__get_free_page(GFP_KERNEL) + }; + + if (!buf.dirent) + return nfserrno(-ENOMEM); + + offset = *offsetp; + + while (1) { + unsigned int reclen; + + cdp->err = nfserr_eof; /* will be cleared on successful read */ + buf.used = 0; + buf.full = 0; + + host_err = iterate_dir(file, &buf.ctx); + if (buf.full) + host_err = 0; + + if (host_err < 0) + break; + + size = buf.used; + + if (!size) + break; + + de = (struct buffered_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (func(cdp, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + break; + + if (cdp->err != nfs_ok) + break; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct buffered_dirent *)((char *)de + reclen); + } + if (size > 0) /* We bailed out early */ + break; + + offset = vfs_llseek(file, 0, SEEK_CUR); + } + + free_page((unsigned long)(buf.dirent)); + + if (host_err) + return nfserrno(host_err); + + *offsetp = offset; + return cdp->err; +} + +/* + * Read entries from a directory. + * The NFSv3/4 verifier we ignore for now. + */ +__be32 +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, + struct readdir_cd *cdp, nfsd_filldir_t func) +{ + __be32 err; + struct file *file; + loff_t offset = *offsetp; + int may_flags = NFSD_MAY_READ; + + /* NFSv2 only supports 32 bit cookies */ + if (rqstp->rq_vers > 2) + may_flags |= NFSD_MAY_64BIT_COOKIE; + + err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); + if (err) + goto out; + + offset = vfs_llseek(file, offset, SEEK_SET); + if (offset < 0) { + err = nfserrno((int)offset); + goto out_close; + } + + err = nfsd_buffered_readdir(file, func, cdp, offsetp); + + if (err == nfserr_eof || err == nfserr_toosmall) + err = nfs_ok; /* can still be found in ->err */ +out_close: + fput(file); +out: + return err; +} + +/* + * Get file system stats + * N.B. After this call fhp needs an fh_put + */ +__be32 +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) +{ + __be32 err; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } + return err; +} + +static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) +{ + return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; +} + +#ifdef CONFIG_NFSD_V4 +/* + * Helper function to translate error numbers. In the case of xattr operations, + * some error codes need to be translated outside of the standard translations. + * + * ENODATA needs to be translated to nfserr_noxattr. + * E2BIG to nfserr_xattr2big. + * + * Additionally, vfs_listxattr can return -ERANGE. This means that the + * file has too many extended attributes to retrieve inside an + * XATTR_LIST_MAX sized buffer. This is a bug in the xattr implementation: + * filesystems will allow the adding of extended attributes until they hit + * their own internal limit. This limit may be larger than XATTR_LIST_MAX. + * So, at that point, the attributes are present and valid, but can't + * be retrieved using listxattr, since the upper level xattr code enforces + * the XATTR_LIST_MAX limit. + * + * This bug means that we need to deal with listxattr returning -ERANGE. The + * best mapping is to return TOOSMALL. + */ +static __be32 +nfsd_xattr_errno(int err) +{ + switch (err) { + case -ENODATA: + return nfserr_noxattr; + case -E2BIG: + return nfserr_xattr2big; + case -ERANGE: + return nfserr_toosmall; + } + return nfserrno(err); +} + +/* + * Retrieve the specified user extended attribute. To avoid always + * having to allocate the maximum size (since we are not getting + * a maximum size from the RPC), do a probe + alloc. Hold a reader + * lock on i_rwsem to prevent the extended attribute from changing + * size while we're doing this. + */ +__be32 +nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void **bufp, int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + err = nfs_ok; + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + inode_lock_shared(inode); + + len = vfs_getxattr(dentry, name, NULL, 0); + + /* + * Zero-length attribute, just return. + */ + if (len == 0) { + *bufp = NULL; + *lenp = 0; + goto out; + } + + if (len < 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > *lenp) { + err = nfserr_toosmall; + goto out; + } + + buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_getxattr(dentry, name, buf, len); + if (len <= 0) { + kvfree(buf); + buf = NULL; + err = nfsd_xattr_errno(len); + } + + *lenp = len; + *bufp = buf; + +out: + inode_unlock_shared(inode); + + return err; +} + +/* + * Retrieve the xattr names. Since we can't know how many are + * user extended attributes, we must get all attributes here, + * and have the XDR encode filter out the "user." ones. + * + * While this could always just allocate an XATTR_LIST_MAX + * buffer, that's a waste, so do a probe + allocate. To + * avoid any changes between the probe and allocate, wrap + * this in inode_lock. + */ +__be32 +nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp, + int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + *lenp = 0; + + inode_lock_shared(inode); + + len = vfs_listxattr(dentry, NULL, 0); + if (len <= 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > XATTR_LIST_MAX) { + err = nfserr_xattr2big; + goto out; + } + + /* + * We're holding i_rwsem - use GFP_NOFS. + */ + buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_listxattr(dentry, buf, len); + if (len <= 0) { + kvfree(buf); + err = nfsd_xattr_errno(len); + goto out; + } + + *lenp = len; + *bufp = buf; + + err = nfs_ok; +out: + inode_unlock_shared(inode); + + return err; +} + +/* + * Removexattr and setxattr need to call fh_lock to both lock the inode + * and set the change attribute. Since the top-level vfs_removexattr + * and vfs_setxattr calls already do their own inode_lock calls, call + * the _locked variant. Pass in a NULL pointer for delegated_inode, + * and let the client deal with NFS4ERR_DELAY (same as with e.g. + * setattr and remove). + */ +__be32 +nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) +{ + __be32 err; + int ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + + fh_lock(fhp); + + ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL); + + fh_unlock(fhp); + fh_drop_write(fhp); + + return nfsd_xattr_errno(ret); +} + +__be32 +nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void *buf, u32 len, u32 flags) +{ + __be32 err; + int ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + fh_lock(fhp); + + ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags, + NULL); + + fh_unlock(fhp); + fh_drop_write(fhp); + + return nfsd_xattr_errno(ret); +} +#endif + +/* + * Check for a user's access permissions to this inode. + */ +__be32 +nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, + struct dentry *dentry, int acc) +{ + struct inode *inode = d_inode(dentry); + int err; + + if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) + return 0; +#if 0 + dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", + acc, + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", + inode->i_mode, + IS_IMMUTABLE(inode)? " immut" : "", + IS_APPEND(inode)? " append" : "", + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); + dprintk(" owner %d/%d user %d/%d\n", + inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid()); +#endif + + /* Normally we reject any write/sattr etc access on a read-only file + * system. But if it is IRIX doing check on write-access for a + * device special file, we ignore rofs. + */ + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) + return nfserr_rofs; + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) + return nfserr_perm; + } + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) + return nfserr_perm; + + if (acc & NFSD_MAY_LOCK) { + /* If we cannot rely on authentication in NLM requests, + * just allow locks, otherwise require read permission, or + * ownership + */ + if (exp->ex_flags & NFSEXP_NOAUTHNLM) + return 0; + else + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; + } + /* + * The file owner always gets access permission for accesses that + * would normally be checked at open time. This is to make + * file access work even when the client has done a fchmod(fd, 0). + * + * However, `cp foo bar' should fail nevertheless when bar is + * readonly. A sensible way to do this might be to reject all + * attempts to truncate a read-only file, because a creat() call + * always implies file truncation. + * ... but this isn't really fair. A process may reasonably call + * ftruncate on an open file descriptor on a file with perm 000. + * We must trust the client to do permission checking - using "ACCESS" + * with NFSv3. + */ + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && + uid_eq(inode->i_uid, current_fsuid())) + return 0; + + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && + (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || + acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) + err = inode_permission(inode, MAY_EXEC); + + return err? nfserrno(err) : 0; +} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 000000000..a2442ebe5 --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" +#include "nfsd.h" + +/* + * Flags for nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ +#define NFSD_MAY_READ 0x004 /* == MAY_READ */ +#define NFSD_MAY_SATTR 0x008 +#define NFSD_MAY_TRUNC 0x010 +#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_MASK 0x03f + +/* extra hints to permission and open routines: */ +#define NFSD_MAY_OWNER_OVERRIDE 0x040 +#define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 +#define NFSD_MAY_NOT_BREAK_LEASE 0x200 +#define NFSD_MAY_BYPASS_GSS 0x400 +#define NFSD_MAY_READ_IF_EXEC 0x800 + +#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +struct nfsd_file; + +/* + * Callback function for readdir + */ +typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); + +/* nfsd/vfs.c */ +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time64_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, + struct xdr_netobj *); +__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, + struct file *, loff_t, loff_t, int); +__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool sync); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, bool *truncp, bool *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long, __be32 *verf); +#endif /* CONFIG_NFSD_V3 */ +#ifdef CONFIG_NFSD_V4 +__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void **bufp, int *lenp); +__be32 nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char **bufp, int *lenp); +__be32 nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name); +__be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void *buf, u32 len, u32 flags); +#endif +int nfsd_open_break_lease(struct inode *, int); +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + unsigned long *count, + u32 *eof); +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, + unsigned long *count, + u32 *eof); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, + struct kvec *, int, unsigned long *, + int stable, __be32 *verf); +__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, + int stable, __be32 *verf); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, + struct svc_fh *res); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +ssize_t nfsd_copy_file_range(struct file *, u64, + struct file *, u64, u64); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, nfsd_filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); + +static inline int fh_want_write(struct svc_fh *fh) +{ + int ret; + + if (fh->fh_want_write) + return 0; + ret = mnt_want_write(fh->fh_export->ex_path.mnt); + if (!ret) + fh->fh_want_write = true; + return ret; +} + +static inline void fh_drop_write(struct svc_fh *fh) +{ + if (fh->fh_want_write) { + fh->fh_want_write = false; + mnt_drop_write(fh->fh_export->ex_path.mnt); + } +} + +static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +{ + struct path p = {.mnt = fh->fh_export->ex_path.mnt, + .dentry = fh->fh_dentry}; + return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT)); +} + +static inline int nfsd_create_is_exclusive(int createmode) +{ + return createmode == NFS3_CREATE_EXCLUSIVE + || createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 000000000..b8cc6a4b2 --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include <linux/vfs.h> +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + __u32 len; + struct kvec first; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; + struct kvec first; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_stat { + __be32 status; +}; + +struct nfsd_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + __be32 status; + int len; +}; + +struct nfsd_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + __be32 status; + + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + __be32 status; + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *); +int nfssvc_encode_stat(struct svc_rqst *, __be32 *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +void nfssvc_release_attrstat(struct svc_rqst *rqstp); +void nfssvc_release_diropres(struct svc_rqst *rqstp); +void nfssvc_release_readres(struct svc_rqst *rqstp); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 000000000..ae6fa6c9c --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * XDR types for NFSv3 in nfsd. + * + * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time64_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + struct kvec first; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; + struct kvec first; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; + struct kstat stat; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + __u32 eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; + __be32 verf[2]; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + /* Just to save kmalloc on every readdirplus entry (svc_fh is a + * little large for the stack): */ + struct svc_fh scratch; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; + __be32 verf[2]; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; + struct kstat stat; +}; + +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *); +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *); + +void nfs3svc_release_fhandle(struct svc_rqst *); +void nfs3svc_release_fhandle2(struct svc_rqst *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 000000000..679d40af1 --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,902 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +#define CURRENT_STATE_ID_FLAG (1<<0) +#define SAVED_STATE_ID_FLAG (1<<1) + +#define SET_CSTATE_FLAG(c, f) ((c)->sid_flags |= (f)) +#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f)) +#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f)) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + struct nfs4_client *clp; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + int data_offset; + bool spo_must_allowed; + size_t iovlen; + u32 minorversion; + __be32 status; + stateid_t current_stateid; + stateid_t save_stateid; + /* to indicate current and saved state id presents */ + u32 sid_flags; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 datalen; + char *data; + struct kvec first; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + int cr_umask; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; + struct xdr_netobj cr_label; +}; +#define cr_datalen u.link.datalen +#define cr_data u.link.data +#define cr_first u.link.first +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct xdr_netobj ld_owner; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + u64 lu_offset; + u64 lu_length; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ + bool no_verify; /* represents foreigh fh */ +}; + +struct nfsd4_getxattr { + char *getxa_name; /* request */ + u32 getxa_len; /* request */ + void *getxa_buf; +}; + +struct nfsd4_setxattr { + u32 setxa_flags; /* request */ + char *setxa_name; /* request */ + char *setxa_buf; /* request */ + u32 setxa_len; /* request */ + struct nfsd4_change_info setxa_cinfo; /* response */ +}; + +struct nfsd4_removexattr { + char *rmxa_name; /* request */ + struct nfsd4_change_info rmxa_cinfo; /* response */ +}; + +struct nfsd4_listxattrs { + u64 lsxa_cookie; /* request */ + u32 lsxa_maxcount; /* request */ + char *lsxa_buf; /* unfiltered buffer (reply) */ + u32 lsxa_len; /* unfiltered len (reply) */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + int op_umask; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier op_verf __attribute__((aligned(32))); + /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + u32 op_deleg_want; /* request */ + stateid_t op_stateid; /* response */ + __be32 op_xdr_error; /* see nfsd4_open_omfg() */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + bool op_truncate; /* used during processing */ + bool op_created; /* used during processing */ + struct nfs4_openowner *op_openowner; /* used during processing */ + struct nfs4_file *op_file; /* used during processing */ + struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ + struct nfs4_acl *op_acl; + struct xdr_netobj op_label; +}; + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; /* request */ + u32 od_deleg_want; /* request */ + u32 od_share_deny; /* request */ +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + struct xdr_stream *xdr; + int cookie_offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_secinfo_no_name { + u32 sin_style; /* request */ + struct svc_export *sin_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; + struct xdr_netobj sa_label; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + struct xdr_netobj se_name; + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +struct nfsd4_saved_compoundargs { + __be32 *p; + __be32 *end; + int pagelen; + struct page **pagelist; +}; + +struct nfsd4_test_stateid_id { + __be32 ts_id_status; + stateid_t ts_id_stateid; + struct list_head ts_id_list; +}; + +struct nfsd4_test_stateid { + u32 ts_num_ids; + struct list_head ts_stateid_list; +}; + +struct nfsd4_free_stateid { + stateid_t fr_stateid; /* request */ +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + struct kvec wr_head; + struct page ** wr_pagelist; /* request */ + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; + u32 spo_must_enforce[3]; + u32 spo_must_allow[3]; + struct xdr_netobj nii_domain; + struct xdr_netobj nii_name; + struct timespec64 nii_time; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ +#endif /* not yet */ + u32 status_flags; /* response */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_destroy_clientid { + clientid_t clientid; +}; + +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + +struct nfsd4_deviceid { + u64 fsid_idx; + u32 generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec64 lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + +struct nfsd4_fallocate { + /* request */ + stateid_t falloc_stateid; + loff_t falloc_offset; + u64 falloc_length; +}; + +struct nfsd4_clone { + /* request */ + stateid_t cl_src_stateid; + stateid_t cl_dst_stateid; + u64 cl_src_pos; + u64 cl_dst_pos; + u64 cl_count; +}; + +struct nfsd42_write_res { + u64 wr_bytes_written; + u32 wr_stable_how; + nfs4_verifier wr_verifier; + stateid_t cb_stateid; +}; + +struct nfsd4_copy { + /* request */ + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + struct nl4_server cp_src; + bool cp_intra; + + /* both */ + bool cp_synchronous; + + /* response */ + struct nfsd42_write_res cp_res; + + /* for cb_offload */ + struct nfsd4_callback cp_cb; + __be32 nfserr; + struct knfsd_fh fh; + + struct nfs4_client *cp_clp; + + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; + + copy_stateid_t cp_stateid; + + struct list_head copies; + struct task_struct *copy_task; + refcount_t refcount; + bool stopped; + + struct vfsmount *ss_mnt; + struct nfs_fh c_fh; + nfs4_stateid stateid; +}; +extern bool inter_copy_offload_enable; + +struct nfsd4_seek { + /* request */ + stateid_t seek_stateid; + loff_t seek_offset; + u32 seek_whence; + + /* response */ + u32 seek_eof; + loff_t seek_pos; +}; + +struct nfsd4_offload_status { + /* request */ + stateid_t stateid; + + /* response */ + u64 count; + u32 status; +}; + +struct nfsd4_copy_notify { + /* request */ + stateid_t cpn_src_stateid; + struct nl4_server cpn_dst; + + /* response */ + stateid_t cpn_cnr_stateid; + u64 cpn_sec; + u32 cpn_nsec; + struct nl4_server cpn_src; +}; + +struct nfsd4_op { + int opnum; + const struct nfsd4_operation * opdesc; + __be32 status; + union nfsd4_op_u { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_backchannel_ctl backchannel_ctl; + struct nfsd4_bind_conn_to_session bind_conn_to_session; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_destroy_clientid destroy_clientid; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; + struct nfsd4_test_stateid test_stateid; + struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; + struct nfsd4_secinfo_no_name secinfo_no_name; + + /* NFSv4.2 */ + struct nfsd4_fallocate allocate; + struct nfsd4_fallocate deallocate; + struct nfsd4_clone clone; + struct nfsd4_copy copy; + struct nfsd4_offload_status offload_status; + struct nfsd4_copy_notify copy_notify; + struct nfsd4_seek seek; + + struct nfsd4_getxattr getxattr; + struct nfsd4_setxattr setxattr; + struct nfsd4_listxattrs listxattrs; + struct nfsd4_removexattr removexattr; + } u; + struct nfs4_replay * replay; +}; + +bool nfsd4_cache_this_op(struct nfsd4_op *); + +/* + * Memory needed just for the duration of processing one compound: + */ +struct svcxdr_tmpbuf { + struct svcxdr_tmpbuf *next; + char buf[]; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + bool tail; + __be32 tmp[8]; + __be32 * tmpp; + struct svcxdr_tmpbuf *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; + int cachetype; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + struct xdr_stream xdr; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +/* + * The session reply cache only needs to cache replies that the client + * actually asked us to. But it's almost free for us to cache compounds + * consisting of only a SEQUENCE op, so we may as well cache those too. + * Also, the protocol doesn't give us a convenient response in the case + * of a replay of a solo SEQUENCE op that wasn't cached + * (RETRY_UNCACHED_REP can only be returned in the second op of a + * compound). + */ +static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) +{ + return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + || nfsd4_is_solo_sequence(resp); +} + +static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + +const struct nfsd4_operation *OPDESC(struct nfsd4_op *op); +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op); +void warn_on_nonidempotent_op(struct nfsd4_op *op); + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = (u32)fhp->fh_post_saved; + cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + +} + + +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); +int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *); +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *); +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open, struct nfsd_net *nn); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); +extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_release_compoundargs(struct svc_rqst *rqstp); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + union nfsd4_op_u *u); +extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *); +extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, union nfsd4_op_u *); +extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); + +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, + /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: + * We use the DRC for compounds containing non-idempotent + * operations, *except* those that are 4.1-specific (since + * sessions provide their own EOS), and except for stateful + * operations other than setclientid and setclientid_confirm + * (since sequence numbers provide EOS for open, lock, etc in + * the v4.0 case). + */ + OP_CACHEME = 1 << 6, + /* + * These are ops which clear current state id. + */ + OP_CLEAR_STATEID = 1 << 7, + /* Most ops return only an error on failure; some may do more: */ + OP_NONTRIVIAL_ERROR_ENCODE = 1 << 8, +}; + +struct nfsd4_operation { + __be32 (*op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + union nfsd4_op_u *); + void (*op_release)(union nfsd4_op_u *); + u32 op_flags; + char *op_name; + /* Try to get response size before operation */ + u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); + void (*op_get_currentstateid)(struct nfsd4_compound_state *, + union nfsd4_op_u *); + void (*op_set_currentstateid)(struct nfsd4_compound_state *, + union nfsd4_op_u *); +}; + + +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h new file mode 100644 index 000000000..547cf07cf --- /dev/null +++ b/fs/nfsd/xdr4cb.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz (sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) +#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 3 + \ + enc_nfs4_fh_sz + 4) +#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) + +#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 2 + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + enc_nfs4_fh_sz) +#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) +#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + enc_nfs4_fh_sz + \ + enc_stateid_sz + \ + enc_cb_offload_info_sz) +#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) |