diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /net/9p | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/9p')
-rw-r--r-- | net/9p/Kconfig | 54 | ||||
-rw-r--r-- | net/9p/Makefile | 25 | ||||
-rw-r--r-- | net/9p/client.c | 2263 | ||||
-rw-r--r-- | net/9p/error.c | 230 | ||||
-rw-r--r-- | net/9p/mod.c | 212 | ||||
-rw-r--r-- | net/9p/protocol.c | 801 | ||||
-rw-r--r-- | net/9p/protocol.h | 19 | ||||
-rw-r--r-- | net/9p/trans_common.c | 24 | ||||
-rw-r--r-- | net/9p/trans_common.h | 7 | ||||
-rw-r--r-- | net/9p/trans_fd.c | 1205 | ||||
-rw-r--r-- | net/9p/trans_rdma.c | 781 | ||||
-rw-r--r-- | net/9p/trans_virtio.c | 838 | ||||
-rw-r--r-- | net/9p/trans_xen.c | 571 |
13 files changed, 7030 insertions, 0 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig new file mode 100644 index 0000000000..00ebce9e5a --- /dev/null +++ b/net/9p/Kconfig @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# 9P protocol configuration +# + +menuconfig NET_9P + tristate "Plan 9 Resource Sharing Support (9P2000)" + help + If you say Y here, you will get experimental support for + Plan 9 resource sharing via the 9P2000 protocol. + + See <http://v9fs.sf.net> for more information. + + If unsure, say N. + +if NET_9P + +config NET_9P_FD + default NET_9P + imply INET + imply UNIX + tristate "9P FD Transport" + help + This builds support for transports over TCP, Unix sockets and + filedescriptors. + +config NET_9P_VIRTIO + depends on VIRTIO + tristate "9P Virtio Transport" + help + This builds support for a transports between + guest partitions and a host partition. + +config NET_9P_XEN + depends on XEN + select XEN_XENBUS_FRONTEND + tristate "9P Xen Transport" + help + This builds support for a transport for 9pfs between + two Xen domains. + + +config NET_9P_RDMA + depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS + tristate "9P RDMA Transport (Experimental)" + help + This builds support for an RDMA transport. + +config NET_9P_DEBUG + bool "Debug information" + help + Say Y if you want the 9P subsystem to log debug information. + +endif diff --git a/net/9p/Makefile b/net/9p/Makefile new file mode 100644 index 0000000000..1df9b344c3 --- /dev/null +++ b/net/9p/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_NET_9P) := 9pnet.o +obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o +obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o +obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o +obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o + +9pnet-objs := \ + mod.o \ + client.o \ + error.o \ + protocol.o \ + trans_common.o \ + +9pnet_fd-objs := \ + trans_fd.o \ + +9pnet_virtio-objs := \ + trans_virtio.o \ + +9pnet_xen-objs := \ + trans_xen.o \ + +9pnet_rdma-objs := \ + trans_rdma.o \ diff --git a/net/9p/client.c b/net/9p/client.c new file mode 100644 index 0000000000..e265a0ca6b --- /dev/null +++ b/net/9p/client.c @@ -0,0 +1,2263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 9P Client + * + * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/sched/signal.h> +#include <linux/uaccess.h> +#include <linux/uio.h> +#include <net/9p/9p.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> +#include "protocol.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/9p.h> + +/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + + * room for write (16 extra) or read (11 extra) operands. + */ + +#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) + +/* Client Option Parsing (code inspired by NFS code) + * - a little lazy - parse all client options + */ + +enum { + Opt_msize, + Opt_trans, + Opt_legacy, + Opt_version, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_msize, "msize=%u"}, + {Opt_legacy, "noextend"}, + {Opt_trans, "trans=%s"}, + {Opt_version, "version=%s"}, + {Opt_err, NULL}, +}; + +inline int p9_is_proto_dotl(struct p9_client *clnt) +{ + return clnt->proto_version == p9_proto_2000L; +} +EXPORT_SYMBOL(p9_is_proto_dotl); + +inline int p9_is_proto_dotu(struct p9_client *clnt) +{ + return clnt->proto_version == p9_proto_2000u; +} +EXPORT_SYMBOL(p9_is_proto_dotu); + +int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) +{ + if (clnt->msize != DEFAULT_MSIZE) + seq_printf(m, ",msize=%u", clnt->msize); + seq_printf(m, ",trans=%s", clnt->trans_mod->name); + + switch (clnt->proto_version) { + case p9_proto_legacy: + seq_puts(m, ",noextend"); + break; + case p9_proto_2000u: + seq_puts(m, ",version=9p2000.u"); + break; + case p9_proto_2000L: + /* Default */ + break; + } + + if (clnt->trans_mod->show_options) + return clnt->trans_mod->show_options(m, clnt); + return 0; +} +EXPORT_SYMBOL(p9_show_client_options); + +/* Some error codes are taken directly from the server replies, + * make sure they are valid. + */ +static int safe_errno(int err) +{ + if (err > 0 || err < -MAX_ERRNO) { + p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err); + return -EPROTO; + } + return err; +} + +/* Interpret mount option for protocol version */ +static int get_protocol_version(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "9p2000")) { + version = p9_proto_legacy; + p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); + } else if (!strcmp(s, "9p2000.u")) { + version = p9_proto_2000u; + p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); + } else if (!strcmp(s, "9p2000.L")) { + version = p9_proto_2000L; + p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); + } else { + pr_info("Unknown protocol version %s\n", s); + } + + return version; +} + +/** + * parse_opts - parse mount options into client structure + * @opts: options string passed from mount + * @clnt: existing v9fs client information + * + * Return 0 upon success, -ERRNO upon failure + */ + +static int parse_opts(char *opts, struct p9_client *clnt) +{ + char *options, *tmp_options; + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *s; + int ret = 0; + + clnt->proto_version = p9_proto_2000L; + clnt->msize = DEFAULT_MSIZE; + + if (!opts) + return 0; + + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) + return -ENOMEM; + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token, r; + + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case Opt_msize: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 4096) { + p9_debug(P9_DEBUG_ERROR, + "msize should be at least 4k\n"); + ret = -EINVAL; + continue; + } + clnt->msize = option; + break; + case Opt_trans: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of trans arg\n"); + goto free_and_return; + } + + v9fs_put_trans(clnt->trans_mod); + clnt->trans_mod = v9fs_get_trans_by_name(s); + if (!clnt->trans_mod) { + pr_info("Could not find request transport: %s\n", + s); + ret = -EINVAL; + } + kfree(s); + break; + case Opt_legacy: + clnt->proto_version = p9_proto_legacy; + break; + case Opt_version: + s = match_strdup(&args[0]); + if (!s) { + ret = -ENOMEM; + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of version arg\n"); + goto free_and_return; + } + r = get_protocol_version(s); + if (r < 0) + ret = r; + else + clnt->proto_version = r; + kfree(s); + break; + default: + continue; + } + } + +free_and_return: + if (ret) + v9fs_put_trans(clnt->trans_mod); + kfree(tmp_options); + return ret; +} + +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, + int alloc_msize) +{ + if (likely(c->fcall_cache) && alloc_msize == c->msize) { + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); + fc->cache = c->fcall_cache; + } else { + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } + if (!fc->sdata) + return -ENOMEM; + fc->capacity = alloc_msize; + return 0; +} + +void p9_fcall_fini(struct p9_fcall *fc) +{ + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us + */ + if (unlikely(!fc->sdata)) + return; + + if (fc->cache) + kmem_cache_free(fc->cache, fc->sdata); + else + kfree(fc->sdata); +} +EXPORT_SYMBOL(p9_fcall_fini); + +static struct kmem_cache *p9_req_cache; + +/** + * p9_tag_alloc - Allocate a new request. + * @c: Client session. + * @type: Transaction type. + * @t_size: Buffer size for holding this request + * (automatic calculation by format template if 0). + * @r_size: Buffer size for holding server's reply on this request + * (automatic calculation by format template if 0). + * @fmt: Format template for assembling 9p request message + * (see p9pdu_vwritef). + * @ap: Variable arguments to be fed to passed format template + * (see p9pdu_vwritef). + * + * Context: Process context. + * Return: Pointer to new request. + */ +static struct p9_req_t * +p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size, + const char *fmt, va_list ap) +{ + struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); + int alloc_tsize; + int alloc_rsize; + int tag; + va_list apc; + + va_copy(apc, ap); + alloc_tsize = min_t(size_t, c->msize, + t_size ?: p9_msg_buf_size(c, type, fmt, apc)); + va_end(apc); + + alloc_rsize = min_t(size_t, c->msize, + r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap)); + + if (!req) + return ERR_PTR(-ENOMEM); + + if (p9_fcall_init(c, &req->tc, alloc_tsize)) + goto free_req; + if (p9_fcall_init(c, &req->rc, alloc_rsize)) + goto free; + + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); + req->t_err = 0; + req->status = REQ_STATUS_ALLOC; + /* refcount needs to be set to 0 before inserting into the idr + * so p9_tag_lookup does not accept a request that is not fully + * initialized. refcount_set to 2 below will mark request ready. + */ + refcount_set(&req->refcount, 0); + init_waitqueue_head(&req->wq); + INIT_LIST_HEAD(&req->req_list); + + idr_preload(GFP_NOFS); + spin_lock_irq(&c->lock); + if (type == P9_TVERSION) + tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, + GFP_NOWAIT); + else + tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); + req->tc.tag = tag; + spin_unlock_irq(&c->lock); + idr_preload_end(); + if (tag < 0) + goto free; + + /* Init ref to two because in the general case there is one ref + * that is put asynchronously by a writer thread, one ref + * temporarily given by p9_tag_lookup and put by p9_client_cb + * in the recv thread, and one ref put by p9_req_put in the + * main thread. The only exception is virtio that does not use + * p9_tag_lookup but does not have a writer thread either + * (the write happens synchronously in the request/zc_request + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ + refcount_set(&req->refcount, 2); + + return req; + +free: + p9_fcall_fini(&req->tc); + p9_fcall_fini(&req->rc); +free_req: + kmem_cache_free(p9_req_cache, req); + return ERR_PTR(-ENOMEM); +} + +/** + * p9_tag_lookup - Look up a request by tag. + * @c: Client session. + * @tag: Transaction ID. + * + * Context: Any context. + * Return: A request, or %NULL if there is no request with that tag. + */ +struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) +{ + struct p9_req_t *req; + + rcu_read_lock(); +again: + req = idr_find(&c->reqs, tag); + if (req) { + /* We have to be careful with the req found under rcu_read_lock + * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the + * ref again without corrupting other data, then check again + * that the tag matches once we have the ref + */ + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { + p9_req_put(c, req); + goto again; + } + } + rcu_read_unlock(); + + return req; +} +EXPORT_SYMBOL(p9_tag_lookup); + +/** + * p9_tag_remove - Remove a tag. + * @c: Client session. + * @r: Request of reference. + * + * Context: Any context. + */ +static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r) +{ + unsigned long flags; + u16 tag = r->tc.tag; + + p9_debug(P9_DEBUG_MUX, "freeing clnt %p req %p tag: %d\n", c, r, tag); + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); +} + +int p9_req_put(struct p9_client *c, struct p9_req_t *r) +{ + if (refcount_dec_and_test(&r->refcount)) { + p9_tag_remove(c, r); + + p9_fcall_fini(&r->tc); + p9_fcall_fini(&r->rc); + kmem_cache_free(p9_req_cache, r); + return 1; + } + return 0; +} +EXPORT_SYMBOL(p9_req_put); + +/** + * p9_tag_cleanup - cleans up tags structure and reclaims resources + * @c: v9fs client struct + * + * This frees resources associated with the tags structure + * + */ +static void p9_tag_cleanup(struct p9_client *c) +{ + struct p9_req_t *req; + int id; + + rcu_read_lock(); + idr_for_each_entry(&c->reqs, req, id) { + pr_info("Tag %d still in use\n", id); + if (p9_req_put(c, req) == 0) + pr_warn("Packet with tag %d has still references", + req->tc.tag); + } + rcu_read_unlock(); +} + +/** + * p9_client_cb - call back from transport to client + * @c: client state + * @req: request received + * @status: request status, one of REQ_STATUS_* + * + */ +void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) +{ + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); + + /* This barrier is needed to make sure any change made to req before + * the status change is visible to another thread + */ + smp_wmb(); + WRITE_ONCE(req->status, status); + + wake_up(&req->wq); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); + p9_req_put(c, req); +} +EXPORT_SYMBOL(p9_client_cb); + +/** + * p9_parse_header - parse header arguments out of a packet + * @pdu: packet to parse + * @size: size of packet + * @type: type of request + * @tag: tag of packet + * @rewind: set if we need to rewind offset afterwards + */ + +int +p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, + int16_t *tag, int rewind) +{ + s8 r_type; + s16 r_tag; + s32 r_size; + int offset = pdu->offset; + int err; + + pdu->offset = 0; + + err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag); + if (err) + goto rewind_and_exit; + + if (type) + *type = r_type; + if (tag) + *tag = r_tag; + if (size) + *size = r_size; + + if (pdu->size != r_size || r_size < 7) { + err = -EINVAL; + goto rewind_and_exit; + } + + pdu->id = r_type; + pdu->tag = r_tag; + + p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", + pdu->size, pdu->id, pdu->tag); + +rewind_and_exit: + if (rewind) + pdu->offset = offset; + return err; +} +EXPORT_SYMBOL(p9_parse_header); + +/** + * p9_check_errors - check 9p packet for error return and process it + * @c: current client instance + * @req: request to parse and check for error conditions + * + * returns error code if one is discovered, otherwise returns 0 + * + * this will have to be more complicated if we have multiple + * error packet types + */ + +static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) +{ + s8 type; + int err; + int ecode; + + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); + if (req->rc.size > req->rc.capacity && !req->rc.zc) { + pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n", + req->rc.size, req->rc.capacity, req->rc.id); + return -EIO; + } + /* dump the response from server + * This should be after check errors which poplulate pdu_fcall. + */ + trace_9p_protocol_dump(c, &req->rc); + if (err) { + p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); + return err; + } + if (type != P9_RERROR && type != P9_RLERROR) + return 0; + + if (!p9_is_proto_dotl(c)) { + char *ename = NULL; + + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", + &ename, &ecode); + if (err) { + kfree(ename); + goto out_err; + } + + if (p9_is_proto_dotu(c) && ecode < 512) + err = -ecode; + + if (!err) { + err = p9_errstr2errno(ename, strlen(ename)); + + p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", + -ecode, ename); + } + kfree(ename); + } else { + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); + if (err) + goto out_err; + err = -ecode; + + p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); + } + + return err; + +out_err: + p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); + + return err; +} + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); + +/** + * p9_client_flush - flush (cancel) a request + * @c: client state + * @oldreq: request to cancel + * + * This sents a flush for a particular request and links + * the flush request to the original request. The current + * code only supports a single flush request although the protocol + * allows for multiple flush requests to be sent for a single request. + * + */ + +static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) +{ + struct p9_req_t *req; + s16 oldtag; + int err; + + err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); + if (err) + return err; + + p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); + + req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* if we haven't received a response for oldreq, + * remove it from the list + */ + if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) { + if (c->trans_mod->cancelled) + c->trans_mod->cancelled(c, oldreq); + } + + p9_req_put(c, req); + return 0; +} + +static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, + int8_t type, uint t_size, uint r_size, + const char *fmt, va_list ap) +{ + int err; + struct p9_req_t *req; + va_list apc; + + p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); + + /* we allow for any status other than disconnected */ + if (c->status == Disconnected) + return ERR_PTR(-EIO); + + /* if status is begin_disconnected we allow only clunk request */ + if (c->status == BeginDisconnect && type != P9_TCLUNK) + return ERR_PTR(-EIO); + + va_copy(apc, ap); + req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc); + va_end(apc); + if (IS_ERR(req)) + return req; + + /* marshall the data */ + p9pdu_prepare(&req->tc, req->tc.tag, type); + err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); + if (err) + goto reterr; + p9pdu_finalize(c, &req->tc); + trace_9p_client_req(c, type, req->tc.tag); + return req; +reterr: + p9_req_put(c, req); + /* We have to put also the 2nd reference as it won't be used */ + p9_req_put(c, req); + return ERR_PTR(err); +} + +/** + * p9_client_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_req_put) + */ + +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) +{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to + * auto determine an appropriate (small) request/response size + * according to actual message data being sent. Currently RDMA + * transport is excluded from this response message size optimization, + * as it would not cope with it, due to its pooled response buffers + * (using an optimized request size for RDMA as well though). + */ + const uint tsize = 0; + const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0; + + va_start(ap, fmt); + req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + req->tc.zc = false; + req->rc.zc = false; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else { + sigpending = 0; + } + + err = c->trans_mod->request(c, req); + if (err < 0) { + /* write won't happen */ + p9_req_put(c, req); + if (err != -ERESTARTSYS && err != -EFAULT) + c->status = Disconnected; + goto recalc_sigpending; + } +again: + /* Wait for the response */ + err = wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); + + /* Make sure our req is coherent with regard to updates in other + * threads - echoes to wmb() in the callback + */ + smp_rmb(); + + if (err == -ERESTARTSYS && c->status == Connected && + type == P9_TFLUSH) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + goto again; + } + + if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + err = req->t_err; + } + if (err == -ERESTARTSYS && c->status == Connected) { + p9_debug(P9_DEBUG_MUX, "flushing\n"); + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + + if (c->trans_mod->cancel(c, req)) + p9_client_flush(c, req); + + /* if we received the response anyway, don't signal error */ + if (READ_ONCE(req->status) == REQ_STATUS_RCVD) + err = 0; + } +recalc_sigpending: + if (sigpending) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + if (err < 0) + goto reterr; + + err = p9_check_errors(c, req); + trace_9p_client_res(c, type, req->rc.tag, err); + if (!err) + return req; +reterr: + p9_req_put(c, req); + return ERR_PTR(safe_errno(err)); +} + +/** + * p9_client_zc_rpc - issue a request and wait for a response + * @c: client session + * @type: type of request + * @uidata: destination for zero copy read + * @uodata: source for zero copy write + * @inlen: read buffer size + * @olen: write buffer size + * @in_hdrlen: reader header size, This is the size of response protocol data + * @fmt: protocol format string (see protocol.c) + * + * Returns request structure (which client must free using p9_req_put) + */ +static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, + struct iov_iter *uidata, + struct iov_iter *uodata, + int inlen, int olen, int in_hdrlen, + const char *fmt, ...) +{ + va_list ap; + int sigpending, err; + unsigned long flags; + struct p9_req_t *req; + + va_start(ap, fmt); + /* We allocate a inline protocol data of only 4k bytes. + * The actual content is passed in zero-copy fashion. + */ + req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap); + va_end(ap); + if (IS_ERR(req)) + return req; + + req->tc.zc = true; + req->rc.zc = true; + + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } else { + sigpending = 0; + } + + err = c->trans_mod->zc_request(c, req, uidata, uodata, + inlen, olen, in_hdrlen); + if (err < 0) { + if (err == -EIO) + c->status = Disconnected; + if (err != -ERESTARTSYS) + goto recalc_sigpending; + } + if (READ_ONCE(req->status) == REQ_STATUS_ERROR) { + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); + err = req->t_err; + } + if (err == -ERESTARTSYS && c->status == Connected) { + p9_debug(P9_DEBUG_MUX, "flushing\n"); + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + + if (c->trans_mod->cancel(c, req)) + p9_client_flush(c, req); + + /* if we received the response anyway, don't signal error */ + if (READ_ONCE(req->status) == REQ_STATUS_RCVD) + err = 0; + } +recalc_sigpending: + if (sigpending) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + if (err < 0) + goto reterr; + + err = p9_check_errors(c, req); + trace_9p_client_res(c, type, req->rc.tag, err); + if (!err) + return req; +reterr: + p9_req_put(c, req); + return ERR_PTR(safe_errno(err)); +} + +static struct p9_fid *p9_fid_create(struct p9_client *clnt) +{ + int ret; + struct p9_fid *fid; + + p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); + fid = kzalloc(sizeof(*fid), GFP_KERNEL); + if (!fid) + return NULL; + + fid->mode = -1; + fid->uid = current_fsuid(); + fid->clnt = clnt; + refcount_set(&fid->count, 1); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&clnt->lock); + ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1, + GFP_NOWAIT); + spin_unlock_irq(&clnt->lock); + idr_preload_end(); + if (!ret) { + trace_9p_fid_ref(fid, P9_FID_REF_CREATE); + return fid; + } + + kfree(fid); + return NULL; +} + +static void p9_fid_destroy(struct p9_fid *fid) +{ + struct p9_client *clnt; + unsigned long flags; + + p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); + trace_9p_fid_ref(fid, P9_FID_REF_DESTROY); + clnt = fid->clnt; + spin_lock_irqsave(&clnt->lock, flags); + idr_remove(&clnt->fids, fid->fid); + spin_unlock_irqrestore(&clnt->lock, flags); + kfree(fid->rdir); + kfree(fid); +} + +/* We also need to export tracepoint symbols for tracepoint_enabled() */ +EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref); + +void do_trace_9p_fid_get(struct p9_fid *fid) +{ + trace_9p_fid_ref(fid, P9_FID_REF_GET); +} +EXPORT_SYMBOL(do_trace_9p_fid_get); + +void do_trace_9p_fid_put(struct p9_fid *fid) +{ + trace_9p_fid_ref(fid, P9_FID_REF_PUT); +} +EXPORT_SYMBOL(do_trace_9p_fid_put); + +static int p9_client_version(struct p9_client *c) +{ + int err; + struct p9_req_t *req; + char *version = NULL; + int msize; + + p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", + c->msize, c->proto_version); + + switch (c->proto_version) { + case p9_proto_2000L: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000.L"); + break; + case p9_proto_2000u: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000.u"); + break; + case p9_proto_legacy: + req = p9_client_rpc(c, P9_TVERSION, "ds", + c->msize, "9P2000"); + break; + default: + return -EINVAL; + } + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); + if (err) { + p9_debug(P9_DEBUG_9P, "version error %d\n", err); + trace_9p_protocol_dump(c, &req->rc); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); + if (!strncmp(version, "9P2000.L", 8)) { + c->proto_version = p9_proto_2000L; + } else if (!strncmp(version, "9P2000.u", 8)) { + c->proto_version = p9_proto_2000u; + } else if (!strncmp(version, "9P2000", 6)) { + c->proto_version = p9_proto_legacy; + } else { + p9_debug(P9_DEBUG_ERROR, + "server returned an unknown version: %s\n", version); + err = -EREMOTEIO; + goto error; + } + + if (msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "server returned a msize < 4096: %d\n", msize); + err = -EREMOTEIO; + goto error; + } + if (msize < c->msize) + c->msize = msize; + +error: + kfree(version); + p9_req_put(c, req); + + return err; +} + +struct p9_client *p9_client_create(const char *dev_name, char *options) +{ + int err; + struct p9_client *clnt; + char *client_id; + + clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + return ERR_PTR(-ENOMEM); + + clnt->trans_mod = NULL; + clnt->trans = NULL; + clnt->fcall_cache = NULL; + + client_id = utsname()->nodename; + memcpy(clnt->name, client_id, strlen(client_id) + 1); + + spin_lock_init(&clnt->lock); + idr_init(&clnt->fids); + idr_init(&clnt->reqs); + + err = parse_opts(options, clnt); + if (err < 0) + goto free_client; + + if (!clnt->trans_mod) + clnt->trans_mod = v9fs_get_default_trans(); + + if (!clnt->trans_mod) { + err = -EPROTONOSUPPORT; + p9_debug(P9_DEBUG_ERROR, + "No transport defined or default transport\n"); + goto free_client; + } + + p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", + clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); + + err = clnt->trans_mod->create(clnt, dev_name, options); + if (err) + goto put_trans; + + if (clnt->msize > clnt->trans_mod->maxsize) { + clnt->msize = clnt->trans_mod->maxsize; + pr_info("Limiting 'msize' to %d as this is the maximum " + "supported by transport %s\n", + clnt->msize, clnt->trans_mod->name + ); + } + + if (clnt->msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "Please specify a msize of at least 4k\n"); + err = -EINVAL; + goto close_trans; + } + + err = p9_client_version(clnt); + if (err) + goto close_trans; + + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ + clnt->fcall_cache = + kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, + 0, 0, P9_HDRSZ + 4, + clnt->msize - (P9_HDRSZ + 4), + NULL); + + return clnt; + +close_trans: + clnt->trans_mod->close(clnt); +put_trans: + v9fs_put_trans(clnt->trans_mod); +free_client: + kfree(clnt); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_create); + +void p9_client_destroy(struct p9_client *clnt) +{ + struct p9_fid *fid; + int id; + + p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt); + + if (clnt->trans_mod) + clnt->trans_mod->close(clnt); + + v9fs_put_trans(clnt->trans_mod); + + idr_for_each_entry(&clnt->fids, fid, id) { + pr_info("Found fid %d not clunked\n", fid->fid); + p9_fid_destroy(fid); + } + + p9_tag_cleanup(clnt); + + kmem_cache_destroy(clnt->fcall_cache); + kfree(clnt); +} +EXPORT_SYMBOL(p9_client_destroy); + +void p9_client_disconnect(struct p9_client *clnt) +{ + p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); + clnt->status = Disconnected; +} +EXPORT_SYMBOL(p9_client_disconnect); + +void p9_client_begin_disconnect(struct p9_client *clnt) +{ + p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); + clnt->status = BeginDisconnect; +} +EXPORT_SYMBOL(p9_client_begin_disconnect); + +struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, + const char *uname, kuid_t n_uname, + const char *aname) +{ + int err; + struct p9_req_t *req; + struct p9_fid *fid; + struct p9_qid qid; + + p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", + afid ? afid->fid : -1, uname, aname); + fid = p9_fid_create(clnt); + if (!fid) { + err = -ENOMEM; + goto error; + } + fid->uid = n_uname; + + req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid, + afid ? afid->fid : P9_NOFID, uname, aname, n_uname); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", + qid.type, qid.path, qid.version); + + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); + + p9_req_put(clnt, req); + return fid; + +error: + if (fid) + p9_fid_destroy(fid); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_attach); + +struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, + const unsigned char * const *wnames, int clone) +{ + int err; + struct p9_client *clnt; + struct p9_fid *fid; + struct p9_qid *wqids; + struct p9_req_t *req; + u16 nwqids, count; + + wqids = NULL; + clnt = oldfid->clnt; + if (clone) { + fid = p9_fid_create(clnt); + if (!fid) { + err = -ENOMEM; + goto error; + } + + fid->uid = oldfid->uid; + } else { + fid = oldfid; + } + + p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", + oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); + req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, + nwname, wnames); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto clunk_fid; + } + p9_req_put(clnt, req); + + p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); + + if (nwqids != nwname) { + err = -ENOENT; + goto clunk_fid; + } + + for (count = 0; count < nwqids; count++) + p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", + count, wqids[count].type, + wqids[count].path, + wqids[count].version); + + if (nwname) + memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); + else + memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid)); + + kfree(wqids); + return fid; + +clunk_fid: + kfree(wqids); + p9_fid_put(fid); + fid = NULL; + +error: + if (fid && fid != oldfid) + p9_fid_destroy(fid); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_walk); + +int p9_client_open(struct p9_fid *fid, int mode) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + struct p9_qid qid; + int iounit; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", + p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode); + + if (fid->mode != -1) + return -EINVAL; + + if (p9_is_proto_dotl(clnt)) + req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK); + else + req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto free_and_error; + } + + p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", + p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, + qid.path, qid.version, iounit); + + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); + fid->mode = mode; + fid->iounit = iounit; + +free_and_error: + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_open); + +int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, + u32 mode, kgid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + int iounit; + + p9_debug(P9_DEBUG_9P, + ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", + ofid->fid, name, flags, mode, + from_kgid(&init_user_ns, gid)); + clnt = ofid->clnt; + + if (ofid->mode != -1) + return -EINVAL; + + req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags, + mode & P9L_MODE_MASK, gid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto free_and_error; + } + + p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", + qid->type, qid->path, qid->version, iounit); + + memmove(&ofid->qid, qid, sizeof(struct p9_qid)); + ofid->mode = flags; + ofid->iounit = iounit; + +free_and_error: + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_create_dotl); + +int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, + char *extension) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + struct p9_qid qid; + int iounit; + + p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", + fid->fid, name, perm, mode); + clnt = fid->clnt; + + if (fid->mode != -1) + return -EINVAL; + + req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm, + mode & P9L_MODE_MASK, extension); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto free_and_error; + } + + p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", + qid.type, qid.path, qid.version, iounit); + + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); + fid->mode = mode; + fid->iounit = iounit; + +free_and_error: + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_fcreate); + +int p9_client_symlink(struct p9_fid *dfid, const char *name, + const char *symtgt, kgid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", + dfid->fid, name, symtgt); + clnt = dfid->clnt; + + req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt, + gid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto free_and_error; + } + + p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", + qid->type, qid->path, qid->version); + +free_and_error: + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_symlink); + +int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname) +{ + struct p9_client *clnt; + struct p9_req_t *req; + + p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", + dfid->fid, oldfid->fid, newname); + clnt = dfid->clnt; + req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid, + newname); + if (IS_ERR(req)) + return PTR_ERR(req); + + p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); + p9_req_put(clnt, req); + return 0; +} +EXPORT_SYMBOL(p9_client_link); + +int p9_client_fsync(struct p9_fid *fid, int datasync) +{ + int err = 0; + struct p9_client *clnt; + struct p9_req_t *req; + + p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", + fid->fid, datasync); + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); + + p9_req_put(clnt, req); + +error: + return err; +} +EXPORT_SYMBOL(p9_client_fsync); + +int p9_client_clunk(struct p9_fid *fid) +{ + int err = 0; + struct p9_client *clnt; + struct p9_req_t *req; + int retries = 0; + +again: + p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", + fid->fid, retries); + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); + + p9_req_put(clnt, req); +error: + /* Fid is not valid even after a failed clunk + * If interrupted, retry once then give up and + * leak fid until umount. + */ + if (err == -ERESTARTSYS) { + if (retries++ == 0) + goto again; + } else { + p9_fid_destroy(fid); + } + return err; +} +EXPORT_SYMBOL(p9_client_clunk); + +int p9_client_remove(struct p9_fid *fid) +{ + int err = 0; + struct p9_client *clnt; + struct p9_req_t *req; + + p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); + + p9_req_put(clnt, req); +error: + if (err == -ERESTARTSYS) + p9_fid_put(fid); + else + p9_fid_destroy(fid); + return err; +} +EXPORT_SYMBOL(p9_client_remove); + +int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n", + dfid->fid, name, flags); + + clnt = dfid->clnt; + req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); + + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_unlinkat); + +int +p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) +{ + int total = 0; + *err = 0; + + while (iov_iter_count(to)) { + int count; + + count = p9_client_read_once(fid, offset, to, err); + if (!count || *err) + break; + offset += count; + total += count; + } + return total; +} +EXPORT_SYMBOL(p9_client_read); + +int +p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, + int *err) +{ + struct p9_client *clnt = fid->clnt; + struct p9_req_t *req; + int count = iov_iter_count(to); + int rsize, received, non_zc = 0; + char *dataptr; + + *err = 0; + p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %zu\n", + fid->fid, offset, iov_iter_count(to)); + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, + 0, 11, "dqd", fid->fid, + offset, rsize); + } else { + non_zc = 1; + req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, + rsize); + } + if (IS_ERR(req)) { + *err = PTR_ERR(req); + if (!non_zc) + iov_iter_revert(to, count - iov_iter_count(to)); + return 0; + } + + *err = p9pdu_readf(&req->rc, clnt->proto_version, + "D", &received, &dataptr); + if (*err) { + if (!non_zc) + iov_iter_revert(to, count - iov_iter_count(to)); + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + return 0; + } + if (rsize < received) { + pr_err("bogus RREAD count (%d > %d)\n", received, rsize); + received = rsize; + } + + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + + if (non_zc) { + int n = copy_to_iter(dataptr, received, to); + + if (n != received) { + *err = -EFAULT; + p9_req_put(clnt, req); + return n; + } + } else { + iov_iter_revert(to, count - received - iov_iter_count(to)); + } + p9_req_put(clnt, req); + return received; +} +EXPORT_SYMBOL(p9_client_read_once); + +int +p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) +{ + struct p9_client *clnt = fid->clnt; + struct p9_req_t *req; + int total = 0; + *err = 0; + + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", + fid->fid, offset, iov_iter_count(from)); + + while (iov_iter_count(from)) { + int count = iov_iter_count(from); + int rsize = fid->iounit; + int written; + + if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, + rsize, P9_ZC_HDR_SZ, "dqd", + fid->fid, offset, rsize); + } else { + req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, + offset, rsize, from); + } + if (IS_ERR(req)) { + iov_iter_revert(from, count - iov_iter_count(from)); + *err = PTR_ERR(req); + break; + } + + *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written); + if (*err) { + iov_iter_revert(from, count - iov_iter_count(from)); + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + break; + } + if (rsize < written) { + pr_err("bogus RWRITE count (%d > %d)\n", written, rsize); + written = rsize; + } + + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + + p9_req_put(clnt, req); + iov_iter_revert(from, count - written - iov_iter_count(from)); + total += written; + offset += written; + } + return total; +} +EXPORT_SYMBOL(p9_client_write); + +struct p9_wstat *p9_client_stat(struct p9_fid *fid) +{ + int err; + struct p9_client *clnt; + struct p9_wstat *ret; + struct p9_req_t *req; + u16 ignored; + + p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto error; + } + + p9_debug(P9_DEBUG_9P, + "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n" + "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" + "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" + "<<< uid=%d gid=%d n_muid=%d\n", + ret->size, ret->type, ret->dev, ret->qid.type, ret->qid.path, + ret->qid.version, ret->mode, + ret->atime, ret->mtime, ret->length, + ret->name, ret->uid, ret->gid, ret->muid, ret->extension, + from_kuid(&init_user_ns, ret->n_uid), + from_kgid(&init_user_ns, ret->n_gid), + from_kuid(&init_user_ns, ret->n_muid)); + + p9_req_put(clnt, req); + return ret; + +error: + kfree(ret); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_stat); + +struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, + u64 request_mask) +{ + int err; + struct p9_client *clnt; + struct p9_stat_dotl *ret; + struct p9_req_t *req; + + p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", + fid->fid, request_mask); + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + clnt = fid->clnt; + + req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n" + "<<< qid=%x.%llx.%x\n" + "<<< st_mode=%8.8x st_nlink=%llu\n" + "<<< st_uid=%d st_gid=%d\n" + "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n" + "<<< st_atime_sec=%lld st_atime_nsec=%lld\n" + "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" + "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" + "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" + "<<< st_gen=%lld st_data_version=%lld\n", + ret->st_result_mask, + ret->qid.type, ret->qid.path, ret->qid.version, + ret->st_mode, ret->st_nlink, + from_kuid(&init_user_ns, ret->st_uid), + from_kgid(&init_user_ns, ret->st_gid), + ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks, + ret->st_atime_sec, ret->st_atime_nsec, + ret->st_mtime_sec, ret->st_mtime_nsec, + ret->st_ctime_sec, ret->st_ctime_nsec, + ret->st_btime_sec, ret->st_btime_nsec, + ret->st_gen, ret->st_data_version); + + p9_req_put(clnt, req); + return ret; + +error: + kfree(ret); + return ERR_PTR(err); +} +EXPORT_SYMBOL(p9_client_getattr_dotl); + +static int p9_client_statsize(struct p9_wstat *wst, int proto_version) +{ + int ret; + + /* NOTE: size shouldn't include its own length */ + /* size[2] type[2] dev[4] qid[13] */ + /* mode[4] atime[4] mtime[4] length[8]*/ + /* name[s] uid[s] gid[s] muid[s] */ + ret = 2 + 4 + 13 + 4 + 4 + 4 + 8 + 2 + 2 + 2 + 2; + + if (wst->name) + ret += strlen(wst->name); + if (wst->uid) + ret += strlen(wst->uid); + if (wst->gid) + ret += strlen(wst->gid); + if (wst->muid) + ret += strlen(wst->muid); + + if (proto_version == p9_proto_2000u || + proto_version == p9_proto_2000L) { + /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ + ret += 2 + 4 + 4 + 4; + if (wst->extension) + ret += strlen(wst->extension); + } + + return ret; +} + +int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + clnt = fid->clnt; + wst->size = p9_client_statsize(wst, clnt->proto_version); + p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", + fid->fid); + p9_debug(P9_DEBUG_9P, + " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" + " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" + " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" + " uid=%d gid=%d n_muid=%d\n", + wst->size, wst->type, wst->dev, wst->qid.type, + wst->qid.path, wst->qid.version, + wst->mode, wst->atime, wst->mtime, wst->length, + wst->name, wst->uid, wst->gid, wst->muid, wst->extension, + from_kuid(&init_user_ns, wst->n_uid), + from_kgid(&init_user_ns, wst->n_gid), + from_kuid(&init_user_ns, wst->n_muid)); + + req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", + fid->fid, wst->size + 2, wst); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); + + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_wstat); + +int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); + p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n", + p9attr->valid, p9attr->mode, + from_kuid(&init_user_ns, p9attr->uid), + from_kgid(&init_user_ns, p9attr->gid), + p9attr->size); + p9_debug(P9_DEBUG_9P, " atime_sec=%lld atime_nsec=%lld\n", + p9attr->atime_sec, p9attr->atime_nsec); + p9_debug(P9_DEBUG_9P, " mtime_sec=%lld mtime_nsec=%lld\n", + p9attr->mtime_sec, p9attr->mtime_nsec); + + req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr); + + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_setattr); + +int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + + clnt = fid->clnt; + + p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); + + req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, + &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, + &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto error; + } + + p9_debug(P9_DEBUG_9P, + "<<< RSTATFS fid %d type 0x%x bsize %u blocks %llu bfree %llu bavail %llu files %llu ffree %llu fsid %llu namelen %u\n", + fid->fid, sb->type, sb->bsize, sb->blocks, sb->bfree, + sb->bavail, sb->files, sb->ffree, sb->fsid, sb->namelen); + + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_statfs); + +int p9_client_rename(struct p9_fid *fid, + struct p9_fid *newdirfid, const char *name) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + clnt = fid->clnt; + + p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", + fid->fid, newdirfid->fid, name); + + req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid, + newdirfid->fid, name); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); + + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_rename); + +int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, + struct p9_fid *newdirfid, const char *new_name) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + clnt = olddirfid->clnt; + + p9_debug(P9_DEBUG_9P, + ">>> TRENAMEAT olddirfid %d old name %s newdirfid %d new name %s\n", + olddirfid->fid, old_name, newdirfid->fid, new_name); + + req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid, + old_name, newdirfid->fid, new_name); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", + newdirfid->fid, new_name); + + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_renameat); + +/* An xattrwalk without @attr_name gives the fid for the lisxattr namespace + */ +struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, + const char *attr_name, u64 *attr_size) +{ + int err; + struct p9_req_t *req; + struct p9_client *clnt; + struct p9_fid *attr_fid; + + clnt = file_fid->clnt; + attr_fid = p9_fid_create(clnt); + if (!attr_fid) { + err = -ENOMEM; + goto error; + } + p9_debug(P9_DEBUG_9P, + ">>> TXATTRWALK file_fid %d, attr_fid %d name '%s'\n", + file_fid->fid, attr_fid->fid, attr_name); + + req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds", + file_fid->fid, attr_fid->fid, attr_name); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + goto clunk_fid; + } + p9_req_put(clnt, req); + p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", + attr_fid->fid, *attr_size); + return attr_fid; +clunk_fid: + p9_fid_put(attr_fid); + attr_fid = NULL; +error: + if (attr_fid && attr_fid != file_fid) + p9_fid_destroy(attr_fid); + + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(p9_client_xattrwalk); + +int p9_client_xattrcreate(struct p9_fid *fid, const char *name, + u64 attr_size, int flags) +{ + int err = 0; + struct p9_req_t *req; + struct p9_client *clnt; + + p9_debug(P9_DEBUG_9P, + ">>> TXATTRCREATE fid %d name %s size %llu flag %d\n", + fid->fid, name, attr_size, flags); + clnt = fid->clnt; + req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd", + fid->fid, name, attr_size, flags); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL_GPL(p9_client_xattrcreate); + +int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) +{ + int err, rsize, non_zc = 0; + struct p9_client *clnt; + struct p9_req_t *req; + char *dataptr; + struct kvec kv = {.iov_base = data, .iov_len = count}; + struct iov_iter to; + + iov_iter_kvec(&to, ITER_DEST, &kv, 1, count); + + p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", + fid->fid, offset, count); + + clnt = fid->clnt; + + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize - P9_READDIRHDRSZ) + rsize = clnt->msize - P9_READDIRHDRSZ; + + if (count < rsize) + rsize = count; + + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0, + 11, "dqd", fid->fid, offset, rsize); + } else { + non_zc = 1; + req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, + offset, rsize); + } + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto error; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto free_and_error; + } + if (rsize < count) { + pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize); + count = rsize; + } + + p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); + + if (non_zc) + memmove(data, dataptr, count); + + p9_req_put(clnt, req); + return count; + +free_and_error: + p9_req_put(clnt, req); +error: + return err; +} +EXPORT_SYMBOL(p9_client_readdir); + +int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, + dev_t rdev, kgid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, + ">>> TMKNOD fid %d name %s mode %d major %d minor %d\n", + fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); + req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode, + MAJOR(rdev), MINOR(rdev), gid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", + qid->type, qid->path, qid->version); + +error: + p9_req_put(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_mknod_dotl); + +int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, + kgid_t gid, struct p9_qid *qid) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", + fid->fid, name, mode, from_kgid(&init_user_ns, gid)); + req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", + fid->fid, name, mode, gid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, + qid->path, qid->version); + +error: + p9_req_put(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_mkdir_dotl); + +int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, + ">>> TLOCK fid %d type %i flags %d start %lld length %lld proc_id %d client_id %s\n", + fid->fid, flock->type, flock->flags, flock->start, + flock->length, flock->proc_id, flock->client_id); + + req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type, + flock->flags, flock->start, flock->length, + flock->proc_id, flock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); +error: + p9_req_put(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_lock_dotl); + +int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, + ">>> TGETLOCK fid %d, type %i start %lld length %lld proc_id %d client_id %s\n", + fid->fid, glock->type, glock->start, glock->length, + glock->proc_id, glock->client_id); + + req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, + glock->type, glock->start, glock->length, + glock->proc_id, glock->client_id); + + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto error; + } + p9_debug(P9_DEBUG_9P, + "<<< RGETLOCK type %i start %lld length %lld proc_id %d client_id %s\n", + glock->type, glock->start, glock->length, + glock->proc_id, glock->client_id); +error: + p9_req_put(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_getlock_dotl); + +int p9_client_readlink(struct p9_fid *fid, char **target) +{ + int err; + struct p9_client *clnt; + struct p9_req_t *req; + + clnt = fid->clnt; + p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); + + req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); + if (IS_ERR(req)) + return PTR_ERR(req); + + err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + goto error; + } + p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); +error: + p9_req_put(clnt, req); + return err; +} +EXPORT_SYMBOL(p9_client_readlink); + +int __init p9_client_init(void) +{ + p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); + return p9_req_cache ? 0 : -ENOMEM; +} + +void __exit p9_client_exit(void) +{ + kmem_cache_destroy(p9_req_cache); +} diff --git a/net/9p/error.c b/net/9p/error.c new file mode 100644 index 0000000000..8da744494b --- /dev/null +++ b/net/9p/error.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Error string handling + * + * Plan 9 uses error strings, Unix uses error numbers. These functions + * try to help manage that and provide for dynamically adding error + * mappings. + * + * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/errno.h> +#include <net/9p/9p.h> + +/** + * struct errormap - map string errors from Plan 9 to Linux numeric ids + * @name: string sent over 9P + * @val: numeric id most closely representing @name + * @namelen: length of string + * @list: hash-table list for string lookup + */ +struct errormap { + char *name; + int val; + + int namelen; + struct hlist_node list; +}; + +#define ERRHASHSZ 32 +static struct hlist_head hash_errmap[ERRHASHSZ]; + +/* FixMe - reduce to a reasonable size */ +static struct errormap errmap[] = { + {"Operation not permitted", EPERM}, + {"wstat prohibited", EPERM}, + {"No such file or directory", ENOENT}, + {"directory entry not found", ENOENT}, + {"file not found", ENOENT}, + {"Interrupted system call", EINTR}, + {"Input/output error", EIO}, + {"No such device or address", ENXIO}, + {"Argument list too long", E2BIG}, + {"Bad file descriptor", EBADF}, + {"Resource temporarily unavailable", EAGAIN}, + {"Cannot allocate memory", ENOMEM}, + {"Permission denied", EACCES}, + {"Bad address", EFAULT}, + {"Block device required", ENOTBLK}, + {"Device or resource busy", EBUSY}, + {"File exists", EEXIST}, + {"Invalid cross-device link", EXDEV}, + {"No such device", ENODEV}, + {"Not a directory", ENOTDIR}, + {"Is a directory", EISDIR}, + {"Invalid argument", EINVAL}, + {"Too many open files in system", ENFILE}, + {"Too many open files", EMFILE}, + {"Text file busy", ETXTBSY}, + {"File too large", EFBIG}, + {"No space left on device", ENOSPC}, + {"Illegal seek", ESPIPE}, + {"Read-only file system", EROFS}, + {"Too many links", EMLINK}, + {"Broken pipe", EPIPE}, + {"Numerical argument out of domain", EDOM}, + {"Numerical result out of range", ERANGE}, + {"Resource deadlock avoided", EDEADLK}, + {"File name too long", ENAMETOOLONG}, + {"No locks available", ENOLCK}, + {"Function not implemented", ENOSYS}, + {"Directory not empty", ENOTEMPTY}, + {"Too many levels of symbolic links", ELOOP}, + {"No message of desired type", ENOMSG}, + {"Identifier removed", EIDRM}, + {"No data available", ENODATA}, + {"Machine is not on the network", ENONET}, + {"Package not installed", ENOPKG}, + {"Object is remote", EREMOTE}, + {"Link has been severed", ENOLINK}, + {"Communication error on send", ECOMM}, + {"Protocol error", EPROTO}, + {"Bad message", EBADMSG}, + {"File descriptor in bad state", EBADFD}, + {"Streams pipe error", ESTRPIPE}, + {"Too many users", EUSERS}, + {"Socket operation on non-socket", ENOTSOCK}, + {"Message too long", EMSGSIZE}, + {"Protocol not available", ENOPROTOOPT}, + {"Protocol not supported", EPROTONOSUPPORT}, + {"Socket type not supported", ESOCKTNOSUPPORT}, + {"Operation not supported", EOPNOTSUPP}, + {"Protocol family not supported", EPFNOSUPPORT}, + {"Network is down", ENETDOWN}, + {"Network is unreachable", ENETUNREACH}, + {"Network dropped connection on reset", ENETRESET}, + {"Software caused connection abort", ECONNABORTED}, + {"Connection reset by peer", ECONNRESET}, + {"No buffer space available", ENOBUFS}, + {"Transport endpoint is already connected", EISCONN}, + {"Transport endpoint is not connected", ENOTCONN}, + {"Cannot send after transport endpoint shutdown", ESHUTDOWN}, + {"Connection timed out", ETIMEDOUT}, + {"Connection refused", ECONNREFUSED}, + {"Host is down", EHOSTDOWN}, + {"No route to host", EHOSTUNREACH}, + {"Operation already in progress", EALREADY}, + {"Operation now in progress", EINPROGRESS}, + {"Is a named type file", EISNAM}, + {"Remote I/O error", EREMOTEIO}, + {"Disk quota exceeded", EDQUOT}, +/* errors from fossil, vacfs, and u9fs */ + {"fid unknown or out of range", EBADF}, + {"permission denied", EACCES}, + {"file does not exist", ENOENT}, + {"authentication failed", ECONNREFUSED}, + {"bad offset in directory read", ESPIPE}, + {"bad use of fid", EBADF}, + {"wstat can't convert between files and directories", EPERM}, + {"directory is not empty", ENOTEMPTY}, + {"file exists", EEXIST}, + {"file already exists", EEXIST}, + {"file or directory already exists", EEXIST}, + {"fid already in use", EBADF}, + {"file in use", ETXTBSY}, + {"i/o error", EIO}, + {"file already open for I/O", ETXTBSY}, + {"illegal mode", EINVAL}, + {"illegal name", ENAMETOOLONG}, + {"not a directory", ENOTDIR}, + {"not a member of proposed group", EPERM}, + {"not owner", EACCES}, + {"only owner can change group in wstat", EACCES}, + {"read only file system", EROFS}, + {"no access to special file", EPERM}, + {"i/o count too large", EIO}, + {"unknown group", EINVAL}, + {"unknown user", EINVAL}, + {"bogus wstat buffer", EPROTO}, + {"exclusive use file already open", EAGAIN}, + {"corrupted directory entry", EIO}, + {"corrupted file entry", EIO}, + {"corrupted block label", EIO}, + {"corrupted meta data", EIO}, + {"illegal offset", EINVAL}, + {"illegal path element", ENOENT}, + {"root of file system is corrupted", EIO}, + {"corrupted super block", EIO}, + {"protocol botch", EPROTO}, + {"file system is full", ENOSPC}, + {"file is in use", EAGAIN}, + {"directory entry is not allocated", ENOENT}, + {"file is read only", EROFS}, + {"file has been removed", EIDRM}, + {"only support truncation to zero length", EPERM}, + {"cannot remove root", EPERM}, + {"file too big", EFBIG}, + {"venti i/o error", EIO}, + /* these are not errors */ + {"u9fs rhostsauth: no authentication required", 0}, + {"u9fs authnone: no authentication required", 0}, + {NULL, -1} +}; + +/** + * p9_error_init - preload mappings into hash list + * + */ + +int p9_error_init(void) +{ + struct errormap *c; + int bucket; + + /* initialize hash table */ + for (bucket = 0; bucket < ERRHASHSZ; bucket++) + INIT_HLIST_HEAD(&hash_errmap[bucket]); + + /* load initial error map into hash table */ + for (c = errmap; c->name; c++) { + c->namelen = strlen(c->name); + bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; + INIT_HLIST_NODE(&c->list); + hlist_add_head(&c->list, &hash_errmap[bucket]); + } + + return 1; +} +EXPORT_SYMBOL(p9_error_init); + +/** + * p9_errstr2errno - convert error string to error number + * @errstr: error string + * @len: length of error string + * + */ + +int p9_errstr2errno(char *errstr, int len) +{ + int errno; + struct errormap *c; + int bucket; + + errno = 0; + c = NULL; + bucket = jhash(errstr, len, 0) % ERRHASHSZ; + hlist_for_each_entry(c, &hash_errmap[bucket], list) { + if (c->namelen == len && !memcmp(c->name, errstr, len)) { + errno = c->val; + break; + } + } + + if (errno == 0) { + /* TODO: if error isn't found, add it dynamically */ + errstr[len] = 0; + pr_err("%s: server reported unknown error %s\n", + __func__, errstr); + errno = ESERVERFAULT; + } + + return -errno; +} +EXPORT_SYMBOL(p9_errstr2errno); diff --git a/net/9p/mod.c b/net/9p/mod.c new file mode 100644 index 0000000000..55576c1866 --- /dev/null +++ b/net/9p/mod.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 9P entry point + * + * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> + * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <net/9p/9p.h> +#include <linux/fs.h> +#include <linux/parser.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +#ifdef CONFIG_NET_9P_DEBUG +unsigned int p9_debug_level; /* feature-rific global debug level */ +EXPORT_SYMBOL(p9_debug_level); +module_param_named(debug, p9_debug_level, uint, 0); +MODULE_PARM_DESC(debug, "9P debugging level"); + +void _p9_debug(enum p9_debug_flags level, const char *func, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if ((p9_debug_level & level) != level) + return; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + if (level == P9_DEBUG_9P) + pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf); + else + pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf); + + va_end(args); +} +EXPORT_SYMBOL(_p9_debug); +#endif + +/* Dynamic Transport Registration Routines */ + +static DEFINE_SPINLOCK(v9fs_trans_lock); +static LIST_HEAD(v9fs_trans_list); + +/** + * v9fs_register_trans - register a new transport with 9p + * @m: structure describing the transport module and entry points + * + */ +void v9fs_register_trans(struct p9_trans_module *m) +{ + spin_lock(&v9fs_trans_lock); + list_add_tail(&m->list, &v9fs_trans_list); + spin_unlock(&v9fs_trans_lock); +} +EXPORT_SYMBOL(v9fs_register_trans); + +/** + * v9fs_unregister_trans - unregister a 9p transport + * @m: the transport to remove + * + */ +void v9fs_unregister_trans(struct p9_trans_module *m) +{ + spin_lock(&v9fs_trans_lock); + list_del_init(&m->list); + spin_unlock(&v9fs_trans_lock); +} +EXPORT_SYMBOL(v9fs_unregister_trans); + +static struct p9_trans_module *_p9_get_trans_by_name(const char *s) +{ + struct p9_trans_module *t, *found = NULL; + + spin_lock(&v9fs_trans_lock); + + list_for_each_entry(t, &v9fs_trans_list, list) + if (strcmp(t->name, s) == 0 && + try_module_get(t->owner)) { + found = t; + break; + } + + spin_unlock(&v9fs_trans_lock); + + return found; +} + +/** + * v9fs_get_trans_by_name - get transport with the matching name + * @s: string identifying transport + * + */ +struct p9_trans_module *v9fs_get_trans_by_name(const char *s) +{ + struct p9_trans_module *found = NULL; + + found = _p9_get_trans_by_name(s); + +#ifdef CONFIG_MODULES + if (!found) { + request_module("9p-%s", s); + found = _p9_get_trans_by_name(s); + } +#endif + + return found; +} +EXPORT_SYMBOL(v9fs_get_trans_by_name); + +static const char * const v9fs_default_transports[] = { + "virtio", "tcp", "fd", "unix", "xen", "rdma", +}; + +/** + * v9fs_get_default_trans - get the default transport + * + */ + +struct p9_trans_module *v9fs_get_default_trans(void) +{ + struct p9_trans_module *t, *found = NULL; + int i; + + spin_lock(&v9fs_trans_lock); + + list_for_each_entry(t, &v9fs_trans_list, list) + if (t->def && try_module_get(t->owner)) { + found = t; + break; + } + + if (!found) + list_for_each_entry(t, &v9fs_trans_list, list) + if (try_module_get(t->owner)) { + found = t; + break; + } + + spin_unlock(&v9fs_trans_lock); + + for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++) + found = v9fs_get_trans_by_name(v9fs_default_transports[i]); + + return found; +} +EXPORT_SYMBOL(v9fs_get_default_trans); + +/** + * v9fs_put_trans - put trans + * @m: transport to put + * + */ +void v9fs_put_trans(struct p9_trans_module *m) +{ + if (m) + module_put(m->owner); +} + +/** + * init_p9 - Initialize module + * + */ +static int __init init_p9(void) +{ + int ret; + + ret = p9_client_init(); + if (ret) + return ret; + + p9_error_init(); + pr_info("Installing 9P2000 support\n"); + + return ret; +} + +/** + * exit_p9 - shutdown module + * + */ + +static void __exit exit_p9(void) +{ + pr_info("Unloading 9P2000 support\n"); + + p9_client_exit(); +} + +module_init(init_p9) +module_exit(exit_p9) + +MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); +MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); +MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)"); diff --git a/net/9p/protocol.c b/net/9p/protocol.c new file mode 100644 index 0000000000..0e6603b1ec --- /dev/null +++ b/net/9p/protocol.c @@ -0,0 +1,801 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 9P Protocol Support Code + * + * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> + * + * Base on code from Anthony Liguori <aliguori@us.ibm.com> + * Copyright (C) 2008 by IBM, Corp. + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/uio.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include "protocol.h" + +#include <trace/events/9p.h> + +/* len[2] text[len] */ +#define P9_STRLEN(s) \ + (2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX)) + +/** + * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the + * intended 9p message. + * @c: client + * @type: message type + * @fmt: format template for assembling request message + * (see p9pdu_vwritef) + * @ap: variable arguments to be fed to passed format template + * (see p9pdu_vwritef) + * + * Note: Even for response types (P9_R*) the format template and variable + * arguments must always be for the originating request type (P9_T*). + */ +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap) +{ + /* size[4] type[1] tag[2] */ + const int hdr = 4 + 1 + 2; + /* ename[s] errno[4] */ + const int rerror_size = hdr + P9_ERRMAX + 4; + /* ecode[4] */ + const int rlerror_size = hdr + 4; + const int err_size = + c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size; + + static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes " + "a max. allowed directory entry name length of 4k"); + + switch (type) { + + /* message types not used at all */ + case P9_TERROR: + case P9_TLERROR: + case P9_TAUTH: + case P9_RAUTH: + BUG(); + + /* variable length & potentially large message types */ + case P9_TATTACH: + BUG_ON(strcmp("ddss?u", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + const char *uname = va_arg(ap, const char *); + const char *aname = va_arg(ap, const char *); + /* fid[4] afid[4] uname[s] aname[s] n_uname[4] */ + return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4; + } + case P9_TWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint i, nwname = va_arg(ap, int); + size_t wname_all; + const char **wnames = va_arg(ap, const char **); + for (i = 0, wname_all = 0; i < nwname; ++i) { + wname_all += P9_STRLEN(wnames[i]); + } + /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ + return hdr + 4 + 4 + 2 + wname_all; + } + case P9_RWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint nwname = va_arg(ap, int); + /* nwqid[2] nwqid*(wqid[13]) */ + return max_t(size_t, hdr + 2 + nwname * 13, err_size); + } + case P9_TCREATE: + BUG_ON(strcmp("dsdb?s", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + if (c->proto_version == p9_proto_legacy) { + /* fid[4] name[s] perm[4] mode[1] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1; + } else { + va_arg(ap, int32_t); + va_arg(ap, int); + { + const char *ext = va_arg(ap, const char *); + /* fid[4] name[s] perm[4] mode[1] extension[s] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext); + } + } + } + case P9_TLCREATE: + BUG_ON(strcmp("dsddg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + /* fid[4] name[s] flags[4] mode[4] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4; + } + case P9_RREAD: + case P9_RREADDIR: + BUG_ON(strcmp("dqd", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* count[4] data[count] */ + return max_t(size_t, hdr + 4 + count, err_size); + } + case P9_TWRITE: + BUG_ON(strcmp("dqV", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* fid[4] offset[8] count[4] data[count] */ + return hdr + 4 + 8 + 4 + count; + } + case P9_TRENAMEAT: + BUG_ON(strcmp("dsds", fmt)); + va_arg(ap, int32_t); + { + const char *oldname, *newname; + oldname = va_arg(ap, const char *); + va_arg(ap, int32_t); + newname = va_arg(ap, const char *); + /* olddirfid[4] oldname[s] newdirfid[4] newname[s] */ + return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname); + } + case P9_TSYMLINK: + BUG_ON(strcmp("dssg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + const char *symtgt = va_arg(ap, const char *); + /* fid[4] name[s] symtgt[s] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4; + } + + case P9_RERROR: + return rerror_size; + case P9_RLERROR: + return rlerror_size; + + /* small message types */ + case P9_TWSTAT: + case P9_RSTAT: + case P9_RREADLINK: + case P9_TXATTRWALK: + case P9_TXATTRCREATE: + case P9_TLINK: + case P9_TMKDIR: + case P9_TMKNOD: + case P9_TRENAME: + case P9_TUNLINKAT: + case P9_TLOCK: + return 8 * 1024; + + /* tiny message types */ + default: + return 4 * 1024; + + } +} + +static int +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); + +void p9stat_free(struct p9_wstat *stbuf) +{ + kfree(stbuf->name); + stbuf->name = NULL; + kfree(stbuf->uid); + stbuf->uid = NULL; + kfree(stbuf->gid); + stbuf->gid = NULL; + kfree(stbuf->muid); + stbuf->muid = NULL; + kfree(stbuf->extension); + stbuf->extension = NULL; +} +EXPORT_SYMBOL(p9stat_free); + +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size) +{ + size_t len = min(pdu->size - pdu->offset, size); + + memcpy(data, &pdu->sdata[pdu->offset], len); + pdu->offset += len; + return size - len; +} + +static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size) +{ + size_t len = min(pdu->capacity - pdu->size, size); + + memcpy(&pdu->sdata[pdu->size], data, len); + pdu->size += len; + return size - len; +} + +static size_t +pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size) +{ + size_t len = min(pdu->capacity - pdu->size, size); + + if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, from)) + len = 0; + + pdu->size += len; + return size - len; +} + +/* b - int8_t + * w - int16_t + * d - int32_t + * q - int64_t + * s - string + * u - numeric uid + * g - numeric gid + * S - stat + * Q - qid + * D - data blob (int32_t size followed by void *, results are not freed) + * T - array of strings (int16_t count, followed by strings) + * R - array of qids (int16_t count, followed by qids) + * A - stat for 9p2000.L (p9_stat_dotl) + * ? - if optional = 1, continue parsing + */ + +static int +p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int errcode = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b':{ + int8_t *val = va_arg(ap, int8_t *); + if (pdu_read(pdu, val, sizeof(*val))) { + errcode = -EFAULT; + break; + } + } + break; + case 'w':{ + int16_t *val = va_arg(ap, int16_t *); + __le16 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le16_to_cpu(le_val); + } + break; + case 'd':{ + int32_t *val = va_arg(ap, int32_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le32_to_cpu(le_val); + } + break; + case 'q':{ + int64_t *val = va_arg(ap, int64_t *); + __le64 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *val = le64_to_cpu(le_val); + } + break; + case 's':{ + char **sptr = va_arg(ap, char **); + uint16_t len; + + errcode = p9pdu_readf(pdu, proto_version, + "w", &len); + if (errcode) + break; + + *sptr = kmalloc(len + 1, GFP_NOFS); + if (*sptr == NULL) { + errcode = -ENOMEM; + break; + } + if (pdu_read(pdu, *sptr, len)) { + errcode = -EFAULT; + kfree(*sptr); + *sptr = NULL; + } else + (*sptr)[len] = 0; + } + break; + case 'u': { + kuid_t *uid = va_arg(ap, kuid_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *uid = make_kuid(&init_user_ns, + le32_to_cpu(le_val)); + } break; + case 'g': { + kgid_t *gid = va_arg(ap, kgid_t *); + __le32 le_val; + if (pdu_read(pdu, &le_val, sizeof(le_val))) { + errcode = -EFAULT; + break; + } + *gid = make_kgid(&init_user_ns, + le32_to_cpu(le_val)); + } break; + case 'Q':{ + struct p9_qid *qid = + va_arg(ap, struct p9_qid *); + + errcode = p9pdu_readf(pdu, proto_version, "bdq", + &qid->type, &qid->version, + &qid->path); + } + break; + case 'S':{ + struct p9_wstat *stbuf = + va_arg(ap, struct p9_wstat *); + + memset(stbuf, 0, sizeof(struct p9_wstat)); + stbuf->n_uid = stbuf->n_muid = INVALID_UID; + stbuf->n_gid = INVALID_GID; + + errcode = + p9pdu_readf(pdu, proto_version, + "wwdQdddqssss?sugu", + &stbuf->size, &stbuf->type, + &stbuf->dev, &stbuf->qid, + &stbuf->mode, &stbuf->atime, + &stbuf->mtime, &stbuf->length, + &stbuf->name, &stbuf->uid, + &stbuf->gid, &stbuf->muid, + &stbuf->extension, + &stbuf->n_uid, &stbuf->n_gid, + &stbuf->n_muid); + if (errcode) + p9stat_free(stbuf); + } + break; + case 'D':{ + uint32_t *count = va_arg(ap, uint32_t *); + void **data = va_arg(ap, void **); + + errcode = + p9pdu_readf(pdu, proto_version, "d", count); + if (!errcode) { + *count = + min_t(uint32_t, *count, + pdu->size - pdu->offset); + *data = &pdu->sdata[pdu->offset]; + } + } + break; + case 'T':{ + uint16_t *nwname = va_arg(ap, uint16_t *); + char ***wnames = va_arg(ap, char ***); + + *wnames = NULL; + + errcode = p9pdu_readf(pdu, proto_version, + "w", nwname); + if (!errcode) { + *wnames = + kmalloc_array(*nwname, + sizeof(char *), + GFP_NOFS); + if (!*wnames) + errcode = -ENOMEM; + else + (*wnames)[0] = NULL; + } + + if (!errcode) { + int i; + + for (i = 0; i < *nwname; i++) { + errcode = + p9pdu_readf(pdu, + proto_version, + "s", + &(*wnames)[i]); + if (errcode) { + (*wnames)[i] = NULL; + break; + } + } + } + + if (errcode) { + if (*wnames) { + int i; + + for (i = 0; i < *nwname; i++) { + if (!(*wnames)[i]) + break; + kfree((*wnames)[i]); + } + kfree(*wnames); + *wnames = NULL; + } + } + } + break; + case 'R':{ + uint16_t *nwqid = va_arg(ap, uint16_t *); + struct p9_qid **wqids = + va_arg(ap, struct p9_qid **); + + *wqids = NULL; + + errcode = + p9pdu_readf(pdu, proto_version, "w", nwqid); + if (!errcode) { + *wqids = + kmalloc_array(*nwqid, + sizeof(struct p9_qid), + GFP_NOFS); + if (*wqids == NULL) + errcode = -ENOMEM; + } + + if (!errcode) { + int i; + + for (i = 0; i < *nwqid; i++) { + errcode = + p9pdu_readf(pdu, + proto_version, + "Q", + &(*wqids)[i]); + if (errcode) + break; + } + } + + if (errcode) { + kfree(*wqids); + *wqids = NULL; + } + } + break; + case 'A': { + struct p9_stat_dotl *stbuf = + va_arg(ap, struct p9_stat_dotl *); + + memset(stbuf, 0, sizeof(struct p9_stat_dotl)); + errcode = + p9pdu_readf(pdu, proto_version, + "qQdugqqqqqqqqqqqqqqq", + &stbuf->st_result_mask, + &stbuf->qid, + &stbuf->st_mode, + &stbuf->st_uid, &stbuf->st_gid, + &stbuf->st_nlink, + &stbuf->st_rdev, &stbuf->st_size, + &stbuf->st_blksize, &stbuf->st_blocks, + &stbuf->st_atime_sec, + &stbuf->st_atime_nsec, + &stbuf->st_mtime_sec, + &stbuf->st_mtime_nsec, + &stbuf->st_ctime_sec, + &stbuf->st_ctime_nsec, + &stbuf->st_btime_sec, + &stbuf->st_btime_nsec, + &stbuf->st_gen, + &stbuf->st_data_version); + } + break; + case '?': + if ((proto_version != p9_proto_2000u) && + (proto_version != p9_proto_2000L)) + return 0; + break; + default: + BUG(); + break; + } + + if (errcode) + break; + } + + return errcode; +} + +int +p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap) +{ + const char *ptr; + int errcode = 0; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b':{ + int8_t val = va_arg(ap, int); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'w':{ + __le16 val = cpu_to_le16(va_arg(ap, int)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'd':{ + __le32 val = cpu_to_le32(va_arg(ap, int32_t)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 'q':{ + __le64 val = cpu_to_le64(va_arg(ap, int64_t)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } + break; + case 's':{ + const char *sptr = va_arg(ap, const char *); + uint16_t len = 0; + if (sptr) + len = min_t(size_t, strlen(sptr), + USHRT_MAX); + + errcode = p9pdu_writef(pdu, proto_version, + "w", len); + if (!errcode && pdu_write(pdu, sptr, len)) + errcode = -EFAULT; + } + break; + case 'u': { + kuid_t uid = va_arg(ap, kuid_t); + __le32 val = cpu_to_le32( + from_kuid(&init_user_ns, uid)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } break; + case 'g': { + kgid_t gid = va_arg(ap, kgid_t); + __le32 val = cpu_to_le32( + from_kgid(&init_user_ns, gid)); + if (pdu_write(pdu, &val, sizeof(val))) + errcode = -EFAULT; + } break; + case 'Q':{ + const struct p9_qid *qid = + va_arg(ap, const struct p9_qid *); + errcode = + p9pdu_writef(pdu, proto_version, "bdq", + qid->type, qid->version, + qid->path); + } break; + case 'S':{ + const struct p9_wstat *stbuf = + va_arg(ap, const struct p9_wstat *); + errcode = + p9pdu_writef(pdu, proto_version, + "wwdQdddqssss?sugu", + stbuf->size, stbuf->type, + stbuf->dev, &stbuf->qid, + stbuf->mode, stbuf->atime, + stbuf->mtime, stbuf->length, + stbuf->name, stbuf->uid, + stbuf->gid, stbuf->muid, + stbuf->extension, stbuf->n_uid, + stbuf->n_gid, stbuf->n_muid); + } break; + case 'V':{ + uint32_t count = va_arg(ap, uint32_t); + struct iov_iter *from = + va_arg(ap, struct iov_iter *); + errcode = p9pdu_writef(pdu, proto_version, "d", + count); + if (!errcode && pdu_write_u(pdu, from, count)) + errcode = -EFAULT; + } + break; + case 'T':{ + uint16_t nwname = va_arg(ap, int); + const char **wnames = va_arg(ap, const char **); + + errcode = p9pdu_writef(pdu, proto_version, "w", + nwname); + if (!errcode) { + int i; + + for (i = 0; i < nwname; i++) { + errcode = + p9pdu_writef(pdu, + proto_version, + "s", + wnames[i]); + if (errcode) + break; + } + } + } + break; + case 'R':{ + uint16_t nwqid = va_arg(ap, int); + struct p9_qid *wqids = + va_arg(ap, struct p9_qid *); + + errcode = p9pdu_writef(pdu, proto_version, "w", + nwqid); + if (!errcode) { + int i; + + for (i = 0; i < nwqid; i++) { + errcode = + p9pdu_writef(pdu, + proto_version, + "Q", + &wqids[i]); + if (errcode) + break; + } + } + } + break; + case 'I':{ + struct p9_iattr_dotl *p9attr = va_arg(ap, + struct p9_iattr_dotl *); + + errcode = p9pdu_writef(pdu, proto_version, + "ddugqqqqq", + p9attr->valid, + p9attr->mode, + p9attr->uid, + p9attr->gid, + p9attr->size, + p9attr->atime_sec, + p9attr->atime_nsec, + p9attr->mtime_sec, + p9attr->mtime_nsec); + } + break; + case '?': + if ((proto_version != p9_proto_2000u) && + (proto_version != p9_proto_2000L)) + return 0; + break; + default: + BUG(); + break; + } + + if (errcode) + break; + } + + return errcode; +} + +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9pdu_vreadf(pdu, proto_version, fmt, ap); + va_end(ap); + + return ret; +} + +static int +p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = p9pdu_vwritef(pdu, proto_version, fmt, ap); + va_end(ap); + + return ret; +} + +int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) +{ + struct p9_fcall fake_pdu; + int ret; + + fake_pdu.size = len; + fake_pdu.capacity = len; + fake_pdu.sdata = buf; + fake_pdu.offset = 0; + + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st); + if (ret) { + p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); + trace_9p_protocol_dump(clnt, &fake_pdu); + return ret; + } + + return fake_pdu.offset; +} +EXPORT_SYMBOL(p9stat_read); + +int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type) +{ + pdu->id = type; + return p9pdu_writef(pdu, 0, "dbw", 0, type, tag); +} + +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu) +{ + int size = pdu->size; + int err; + + pdu->size = 0; + err = p9pdu_writef(pdu, 0, "d", size); + pdu->size = size; + + trace_9p_protocol_dump(clnt, pdu); + p9_debug(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", + pdu->size, pdu->id, pdu->tag); + + return err; +} + +void p9pdu_reset(struct p9_fcall *pdu) +{ + pdu->offset = 0; + pdu->size = 0; +} + +int p9dirent_read(struct p9_client *clnt, char *buf, int len, + struct p9_dirent *dirent) +{ + struct p9_fcall fake_pdu; + int ret; + char *nameptr; + + fake_pdu.size = len; + fake_pdu.capacity = len; + fake_pdu.sdata = buf; + fake_pdu.offset = 0; + + ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid, + &dirent->d_off, &dirent->d_type, &nameptr); + if (ret) { + p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); + trace_9p_protocol_dump(clnt, &fake_pdu); + return ret; + } + + ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name)); + if (ret < 0) { + p9_debug(P9_DEBUG_ERROR, + "On the wire dirent name too long: %s\n", + nameptr); + kfree(nameptr); + return ret; + } + kfree(nameptr); + + return fake_pdu.offset; +} +EXPORT_SYMBOL(p9dirent_read); diff --git a/net/9p/protocol.h b/net/9p/protocol.h new file mode 100644 index 0000000000..ad2283d1f9 --- /dev/null +++ b/net/9p/protocol.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * 9P Protocol Support Code + * + * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> + * + * Base on code from Anthony Liguori <aliguori@us.ibm.com> + * Copyright (C) 2008 by IBM, Corp. + */ + +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap); +int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, + va_list ap); +int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); +int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type); +int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu); +void p9pdu_reset(struct p9_fcall *pdu); +size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size); diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c new file mode 100644 index 0000000000..c827f69455 --- /dev/null +++ b/net/9p/trans_common.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright IBM Corporation, 2010 + * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include "trans_common.h" + +/** + * p9_release_pages - Release pages after the transaction. + * @pages: array of pages to be put + * @nr_pages: size of array + */ +void p9_release_pages(struct page **pages, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; i++) + if (pages[i]) + put_page(pages[i]); +} +EXPORT_SYMBOL(p9_release_pages); diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h new file mode 100644 index 0000000000..32134db6ab --- /dev/null +++ b/net/9p/trans_common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +/* + * Copyright IBM Corporation, 2010 + * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com> + */ + +void p9_release_pages(struct page **pages, int nr_pages); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c new file mode 100644 index 0000000000..d0eb03ada7 --- /dev/null +++ b/net/9p/trans_fd.c @@ -0,0 +1,1205 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Fd transport layer. Includes deprecated socket layer. + * + * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> + * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> + * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/kthread.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/un.h> +#include <linux/uaccess.h> +#include <linux/inet.h> +#include <linux/file.h> +#include <linux/parser.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> + +#include <linux/syscalls.h> /* killme */ + +#define P9_PORT 564 +#define MAX_SOCK_BUF (1024*1024) +#define MAXPOLLWADDR 2 + +static struct p9_trans_module p9_tcp_trans; +static struct p9_trans_module p9_fd_trans; + +/** + * struct p9_fd_opts - per-transport options + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * @privport: port is privileged + */ + +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; + bool privport; +}; + +/* + * Option Parsing (code inspired by NFS code) + * - a little lazy - parse all fd-transport options + */ + +enum { + /* Options that take integer arguments */ + Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, + /* Options that take no arguments */ + Opt_privport, +}; + +static const match_table_t tokens = { + {Opt_port, "port=%u"}, + {Opt_rfdno, "rfdno=%u"}, + {Opt_wfdno, "wfdno=%u"}, + {Opt_privport, "privport"}, + {Opt_err, NULL}, +}; + +enum { + Rworksched = 1, /* read work scheduled or running */ + Rpending = 2, /* can read */ + Wworksched = 4, /* write work scheduled or running */ + Wpending = 8, /* can write */ +}; + +struct p9_poll_wait { + struct p9_conn *conn; + wait_queue_entry_t wait; + wait_queue_head_t *wait_addr; +}; + +/** + * struct p9_conn - fd mux connection state information + * @mux_list: list link for mux to manage multiple connections (?) + * @client: reference to client instance for this connection + * @err: error state + * @req_lock: lock protecting req_list and requests statuses + * @req_list: accounting for requests which have been sent + * @unsent_req_list: accounting for requests that haven't been sent + * @rreq: read request + * @wreq: write request + * @req: current request being processed (if any) + * @tmp_buf: temporary buffer to read in header + * @rc: temporary fcall for reading current frame + * @wpos: write position for current frame + * @wsize: amount of data to write for current frame + * @wbuf: current write buffer + * @poll_pending_link: pending links to be polled per conn + * @poll_wait: array of wait_q's for various worker threads + * @pt: poll state + * @rq: current read work + * @wq: current write work + * @wsched: ???? + * + */ + +struct p9_conn { + struct list_head mux_list; + struct p9_client *client; + int err; + spinlock_t req_lock; + struct list_head req_list; + struct list_head unsent_req_list; + struct p9_req_t *rreq; + struct p9_req_t *wreq; + char tmp_buf[P9_HDRSZ]; + struct p9_fcall rc; + int wpos; + int wsize; + char *wbuf; + struct list_head poll_pending_link; + struct p9_poll_wait poll_wait[MAXPOLLWADDR]; + poll_table pt; + struct work_struct rq; + struct work_struct wq; + unsigned long wsched; +}; + +/** + * struct p9_trans_fd - transport state + * @rd: reference to file to read from + * @wr: reference of file to write to + * @conn: connection state reference + * + */ + +struct p9_trans_fd { + struct file *rd; + struct file *wr; + struct p9_conn conn; +}; + +static void p9_poll_workfn(struct work_struct *work); + +static DEFINE_SPINLOCK(p9_poll_lock); +static LIST_HEAD(p9_poll_pending_list); +static DECLARE_WORK(p9_poll_work, p9_poll_workfn); + +static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT; +static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT; + +static void p9_mux_poll_stop(struct p9_conn *m) +{ + unsigned long flags; + int i; + + for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) { + struct p9_poll_wait *pwait = &m->poll_wait[i]; + + if (pwait->wait_addr) { + remove_wait_queue(pwait->wait_addr, &pwait->wait); + pwait->wait_addr = NULL; + } + } + + spin_lock_irqsave(&p9_poll_lock, flags); + list_del_init(&m->poll_pending_link); + spin_unlock_irqrestore(&p9_poll_lock, flags); + + flush_work(&p9_poll_work); +} + +/** + * p9_conn_cancel - cancel all pending requests with error + * @m: mux data + * @err: error code + * + */ + +static void p9_conn_cancel(struct p9_conn *m, int err) +{ + struct p9_req_t *req, *rtmp; + LIST_HEAD(cancel_list); + + p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); + + spin_lock(&m->req_lock); + + if (m->err) { + spin_unlock(&m->req_lock); + return; + } + + m->err = err; + + list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { + list_move(&req->req_list, &cancel_list); + WRITE_ONCE(req->status, REQ_STATUS_ERROR); + } + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { + list_move(&req->req_list, &cancel_list); + WRITE_ONCE(req->status, REQ_STATUS_ERROR); + } + + spin_unlock(&m->req_lock); + + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { + p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); + list_del(&req->req_list); + if (!req->t_err) + req->t_err = err; + p9_client_cb(m->client, req, REQ_STATUS_ERROR); + } +} + +static __poll_t +p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err) +{ + __poll_t ret; + struct p9_trans_fd *ts = NULL; + + if (client && client->status == Connected) + ts = client->trans; + + if (!ts) { + if (err) + *err = -EREMOTEIO; + return EPOLLERR; + } + + ret = vfs_poll(ts->rd, pt); + if (ts->rd != ts->wr) + ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN); + return ret; +} + +/** + * p9_fd_read- read from a fd + * @client: client instance + * @v: buffer to receive data into + * @len: size of receive buffer + * + */ + +static int p9_fd_read(struct p9_client *client, void *v, int len) +{ + int ret; + struct p9_trans_fd *ts = NULL; + loff_t pos; + + if (client && client->status != Disconnected) + ts = client->trans; + + if (!ts) + return -EREMOTEIO; + + if (!(ts->rd->f_flags & O_NONBLOCK)) + p9_debug(P9_DEBUG_ERROR, "blocking read ...\n"); + + pos = ts->rd->f_pos; + ret = kernel_read(ts->rd, v, len, &pos); + if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) + client->status = Disconnected; + return ret; +} + +/** + * p9_read_work - called when there is some data to be read from a transport + * @work: container of work to be done + * + */ + +static void p9_read_work(struct work_struct *work) +{ + __poll_t n; + int err; + struct p9_conn *m; + + m = container_of(work, struct p9_conn, rq); + + if (m->err < 0) + return; + + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); + + if (!m->rc.sdata) { + m->rc.sdata = m->tmp_buf; + m->rc.offset = 0; + m->rc.capacity = P9_HDRSZ; /* start by reading header */ + } + + clear_bit(Rpending, &m->wsched); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n", + m, m->rc.offset, m->rc.capacity, + m->rc.capacity - m->rc.offset); + err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset, + m->rc.capacity - m->rc.offset); + p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); + if (err == -EAGAIN) + goto end_clear; + + if (err <= 0) + goto error; + + m->rc.offset += err; + + /* header read in */ + if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) { + p9_debug(P9_DEBUG_TRANS, "got new header\n"); + + /* Header size */ + m->rc.size = P9_HDRSZ; + err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0); + if (err) { + p9_debug(P9_DEBUG_ERROR, + "error parsing header: %d\n", err); + goto error; + } + + p9_debug(P9_DEBUG_TRANS, + "mux %p pkt: size: %d bytes tag: %d\n", + m, m->rc.size, m->rc.tag); + + m->rreq = p9_tag_lookup(m->client, m->rc.tag); + if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) { + p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", + m->rc.tag); + err = -EIO; + goto error; + } + + if (m->rc.size > m->rreq->rc.capacity) { + p9_debug(P9_DEBUG_ERROR, + "requested packet size too big: %d for tag %d with capacity %zd\n", + m->rc.size, m->rc.tag, m->rreq->rc.capacity); + err = -EIO; + goto error; + } + + if (!m->rreq->rc.sdata) { + p9_debug(P9_DEBUG_ERROR, + "No recv fcall for tag %d (req %p), disconnecting!\n", + m->rc.tag, m->rreq); + p9_req_put(m->client, m->rreq); + m->rreq = NULL; + err = -EIO; + goto error; + } + m->rc.sdata = m->rreq->rc.sdata; + memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); + m->rc.capacity = m->rc.size; + } + + /* packet is read in + * not an else because some packets (like clunk) have no payload + */ + if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { + p9_debug(P9_DEBUG_TRANS, "got new packet\n"); + m->rreq->rc.size = m->rc.offset; + spin_lock(&m->req_lock); + if (m->rreq->status == REQ_STATUS_SENT) { + list_del(&m->rreq->req_list); + p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); + } else if (m->rreq->status == REQ_STATUS_FLSHD) { + /* Ignore replies associated with a cancelled request. */ + p9_debug(P9_DEBUG_TRANS, + "Ignore replies associated with a cancelled request\n"); + } else { + spin_unlock(&m->req_lock); + p9_debug(P9_DEBUG_ERROR, + "Request tag %d errored out while we were reading the reply\n", + m->rc.tag); + err = -EIO; + goto error; + } + spin_unlock(&m->req_lock); + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; + p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + +end_clear: + clear_bit(Rworksched, &m->wsched); + + if (!list_empty(&m->req_list)) { + if (test_and_clear_bit(Rpending, &m->wsched)) + n = EPOLLIN; + else + n = p9_fd_poll(m->client, NULL, NULL); + + if ((n & EPOLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) { + p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); + schedule_work(&m->rq); + } + } + + return; +error: + p9_conn_cancel(m, err); + clear_bit(Rworksched, &m->wsched); +} + +/** + * p9_fd_write - write to a socket + * @client: client instance + * @v: buffer to send data from + * @len: size of send buffer + * + */ + +static int p9_fd_write(struct p9_client *client, void *v, int len) +{ + ssize_t ret; + struct p9_trans_fd *ts = NULL; + + if (client && client->status != Disconnected) + ts = client->trans; + + if (!ts) + return -EREMOTEIO; + + if (!(ts->wr->f_flags & O_NONBLOCK)) + p9_debug(P9_DEBUG_ERROR, "blocking write ...\n"); + + ret = kernel_write(ts->wr, v, len, &ts->wr->f_pos); + if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) + client->status = Disconnected; + return ret; +} + +/** + * p9_write_work - called when a transport can send some data + * @work: container for work to be done + * + */ + +static void p9_write_work(struct work_struct *work) +{ + __poll_t n; + int err; + struct p9_conn *m; + struct p9_req_t *req; + + m = container_of(work, struct p9_conn, wq); + + if (m->err < 0) { + clear_bit(Wworksched, &m->wsched); + return; + } + + if (!m->wsize) { + spin_lock(&m->req_lock); + if (list_empty(&m->unsent_req_list)) { + clear_bit(Wworksched, &m->wsched); + spin_unlock(&m->req_lock); + return; + } + + req = list_entry(m->unsent_req_list.next, struct p9_req_t, + req_list); + WRITE_ONCE(req->status, REQ_STATUS_SENT); + p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); + list_move_tail(&req->req_list, &m->req_list); + + m->wbuf = req->tc.sdata; + m->wsize = req->tc.size; + m->wpos = 0; + p9_req_get(req); + m->wreq = req; + spin_unlock(&m->req_lock); + } + + p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", + m, m->wpos, m->wsize); + clear_bit(Wpending, &m->wsched); + err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos); + p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); + if (err == -EAGAIN) + goto end_clear; + + + if (err < 0) + goto error; + else if (err == 0) { + err = -EREMOTEIO; + goto error; + } + + m->wpos += err; + if (m->wpos == m->wsize) { + m->wpos = m->wsize = 0; + p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + +end_clear: + clear_bit(Wworksched, &m->wsched); + + if (m->wsize || !list_empty(&m->unsent_req_list)) { + if (test_and_clear_bit(Wpending, &m->wsched)) + n = EPOLLOUT; + else + n = p9_fd_poll(m->client, NULL, NULL); + + if ((n & EPOLLOUT) && + !test_and_set_bit(Wworksched, &m->wsched)) { + p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); + schedule_work(&m->wq); + } + } + + return; + +error: + p9_conn_cancel(m, err); + clear_bit(Wworksched, &m->wsched); +} + +static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct p9_poll_wait *pwait = + container_of(wait, struct p9_poll_wait, wait); + struct p9_conn *m = pwait->conn; + unsigned long flags; + + spin_lock_irqsave(&p9_poll_lock, flags); + if (list_empty(&m->poll_pending_link)) + list_add_tail(&m->poll_pending_link, &p9_poll_pending_list); + spin_unlock_irqrestore(&p9_poll_lock, flags); + + schedule_work(&p9_poll_work); + return 1; +} + +/** + * p9_pollwait - add poll task to the wait queue + * @filp: file pointer being polled + * @wait_address: wait_q to block on + * @p: poll state + * + * called by files poll operation to add v9fs-poll task to files wait queue + */ + +static void +p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) +{ + struct p9_conn *m = container_of(p, struct p9_conn, pt); + struct p9_poll_wait *pwait = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) { + if (m->poll_wait[i].wait_addr == NULL) { + pwait = &m->poll_wait[i]; + break; + } + } + + if (!pwait) { + p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n"); + return; + } + + pwait->conn = m; + pwait->wait_addr = wait_address; + init_waitqueue_func_entry(&pwait->wait, p9_pollwake); + add_wait_queue(wait_address, &pwait->wait); +} + +/** + * p9_conn_create - initialize the per-session mux data + * @client: client instance + * + * Note: Creates the polling task if this is the first session. + */ + +static void p9_conn_create(struct p9_client *client) +{ + __poll_t n; + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + + p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize); + + INIT_LIST_HEAD(&m->mux_list); + m->client = client; + + spin_lock_init(&m->req_lock); + INIT_LIST_HEAD(&m->req_list); + INIT_LIST_HEAD(&m->unsent_req_list); + INIT_WORK(&m->rq, p9_read_work); + INIT_WORK(&m->wq, p9_write_work); + INIT_LIST_HEAD(&m->poll_pending_link); + init_poll_funcptr(&m->pt, p9_pollwait); + + n = p9_fd_poll(client, &m->pt, NULL); + if (n & EPOLLIN) { + p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m); + set_bit(Rpending, &m->wsched); + } + + if (n & EPOLLOUT) { + p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); + set_bit(Wpending, &m->wsched); + } +} + +/** + * p9_poll_mux - polls a mux and schedules read or write works if necessary + * @m: connection to poll + * + */ + +static void p9_poll_mux(struct p9_conn *m) +{ + __poll_t n; + int err = -ECONNRESET; + + if (m->err < 0) + return; + + n = p9_fd_poll(m->client, NULL, &err); + if (n & (EPOLLERR | EPOLLHUP | EPOLLNVAL)) { + p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n); + p9_conn_cancel(m, err); + } + + if (n & EPOLLIN) { + set_bit(Rpending, &m->wsched); + p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m); + if (!test_and_set_bit(Rworksched, &m->wsched)) { + p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); + schedule_work(&m->rq); + } + } + + if (n & EPOLLOUT) { + set_bit(Wpending, &m->wsched); + p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m); + if ((m->wsize || !list_empty(&m->unsent_req_list)) && + !test_and_set_bit(Wworksched, &m->wsched)) { + p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); + schedule_work(&m->wq); + } + } +} + +/** + * p9_fd_request - send 9P request + * The function can sleep until the request is scheduled for sending. + * The function can be interrupted. Return from the function is not + * a guarantee that the request is sent successfully. + * + * @client: client instance + * @req: request to be sent + * + */ + +static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) +{ + __poll_t n; + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + + p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", + m, current, &req->tc, req->tc.id); + if (m->err < 0) + return m->err; + + spin_lock(&m->req_lock); + WRITE_ONCE(req->status, REQ_STATUS_UNSENT); + list_add_tail(&req->req_list, &m->unsent_req_list); + spin_unlock(&m->req_lock); + + if (test_and_clear_bit(Wpending, &m->wsched)) + n = EPOLLOUT; + else + n = p9_fd_poll(m->client, NULL, NULL); + + if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) + schedule_work(&m->wq); + + return 0; +} + +static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + int ret = 1; + + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + + spin_lock(&m->req_lock); + + if (req->status == REQ_STATUS_UNSENT) { + list_del(&req->req_list); + WRITE_ONCE(req->status, REQ_STATUS_FLSHD); + p9_req_put(client, req); + ret = 0; + } + spin_unlock(&m->req_lock); + + return ret; +} + +static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); + + spin_lock(&m->req_lock); + /* Ignore cancelled request if message has been received + * before lock. + */ + if (req->status == REQ_STATUS_RCVD) { + spin_unlock(&m->req_lock); + return 0; + } + + /* we haven't received a response for oldreq, + * remove it from the list. + */ + list_del(&req->req_list); + WRITE_ONCE(req->status, REQ_STATUS_FLSHD); + spin_unlock(&m->req_lock); + + p9_req_put(client, req); + + return 0; +} + +static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) +{ + if (clnt->trans_mod == &p9_tcp_trans) { + if (clnt->trans_opts.tcp.port != P9_PORT) + seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port); + } else if (clnt->trans_mod == &p9_fd_trans) { + if (clnt->trans_opts.fd.rfd != ~0) + seq_printf(m, ",rfd=%u", clnt->trans_opts.fd.rfd); + if (clnt->trans_opts.fd.wfd != ~0) + seq_printf(m, ",wfd=%u", clnt->trans_opts.fd.wfd); + } + return 0; +} + +/** + * parse_opts - parse mount options into p9_fd_opts structure + * @params: options string passed from mount + * @opts: fd transport-specific structure to parse options into + * + * Returns 0 upon success, -ERRNO upon failure + */ + +static int parse_opts(char *params, struct p9_fd_opts *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *options, *tmp_options; + + opts->port = P9_PORT; + opts->rfd = ~0; + opts->wfd = ~0; + opts->privport = false; + + if (!params) + return 0; + + tmp_options = kstrdup(params, GFP_KERNEL); + if (!tmp_options) { + p9_debug(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + int r; + if (!*p) + continue; + token = match_token(p, tokens, args); + if ((token != Opt_err) && (token != Opt_privport)) { + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } + } + switch (token) { + case Opt_port: + opts->port = option; + break; + case Opt_rfdno: + opts->rfd = option; + break; + case Opt_wfdno: + opts->wfd = option; + break; + case Opt_privport: + opts->privport = true; + break; + default: + continue; + } + } + + kfree(tmp_options); + return 0; +} + +static int p9_fd_open(struct p9_client *client, int rfd, int wfd) +{ + struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd), + GFP_KERNEL); + if (!ts) + return -ENOMEM; + + ts->rd = fget(rfd); + if (!ts->rd) + goto out_free_ts; + if (!(ts->rd->f_mode & FMODE_READ)) + goto out_put_rd; + /* Prevent workers from hanging on IO when fd is a pipe. + * It's technically possible for userspace or concurrent mounts to + * modify this flag concurrently, which will likely result in a + * broken filesystem. However, just having bad flags here should + * not crash the kernel or cause any other sort of bug, so mark this + * particular data race as intentional so that tooling (like KCSAN) + * can allow it and detect further problems. + */ + data_race(ts->rd->f_flags |= O_NONBLOCK); + ts->wr = fget(wfd); + if (!ts->wr) + goto out_put_rd; + if (!(ts->wr->f_mode & FMODE_WRITE)) + goto out_put_wr; + data_race(ts->wr->f_flags |= O_NONBLOCK); + + client->trans = ts; + client->status = Connected; + + return 0; + +out_put_wr: + fput(ts->wr); +out_put_rd: + fput(ts->rd); +out_free_ts: + kfree(ts); + return -EIO; +} + +static int p9_socket_open(struct p9_client *client, struct socket *csocket) +{ + struct p9_trans_fd *p; + struct file *file; + + p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); + if (!p) { + sock_release(csocket); + return -ENOMEM; + } + + csocket->sk->sk_allocation = GFP_NOIO; + csocket->sk->sk_use_task_frag = false; + file = sock_alloc_file(csocket, 0, NULL); + if (IS_ERR(file)) { + pr_err("%s (%d): failed to map fd\n", + __func__, task_pid_nr(current)); + kfree(p); + return PTR_ERR(file); + } + + get_file(file); + p->wr = p->rd = file; + client->trans = p; + client->status = Connected; + + p->rd->f_flags |= O_NONBLOCK; + + p9_conn_create(client); + return 0; +} + +/** + * p9_conn_destroy - cancels all pending requests of mux + * @m: mux to destroy + * + */ + +static void p9_conn_destroy(struct p9_conn *m) +{ + p9_debug(P9_DEBUG_TRANS, "mux %p prev %p next %p\n", + m, m->mux_list.prev, m->mux_list.next); + + p9_mux_poll_stop(m); + cancel_work_sync(&m->rq); + if (m->rreq) { + p9_req_put(m->client, m->rreq); + m->rreq = NULL; + } + cancel_work_sync(&m->wq); + if (m->wreq) { + p9_req_put(m->client, m->wreq); + m->wreq = NULL; + } + + p9_conn_cancel(m, -ECONNRESET); + + m->client = NULL; +} + +/** + * p9_fd_close - shutdown file descriptor transport + * @client: client instance + * + */ + +static void p9_fd_close(struct p9_client *client) +{ + struct p9_trans_fd *ts; + + if (!client) + return; + + ts = client->trans; + if (!ts) + return; + + client->status = Disconnected; + + p9_conn_destroy(&ts->conn); + + if (ts->rd) + fput(ts->rd); + if (ts->wr) + fput(ts->wr); + + kfree(ts); +} + +/* + * stolen from NFS - maybe should be made a generic function? + */ +static inline int valid_ipaddr4(const char *buf) +{ + int rc, count, in[4]; + + rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); + if (rc != 4) + return -EINVAL; + for (count = 0; count < 4; count++) { + if (in[count] > 255) + return -EINVAL; + } + return 0; +} + +static int p9_bind_privport(struct socket *sock) +{ + struct sockaddr_in cl; + int port, err = -EINVAL; + + memset(&cl, 0, sizeof(cl)); + cl.sin_family = AF_INET; + cl.sin_addr.s_addr = htonl(INADDR_ANY); + for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) { + cl.sin_port = htons((ushort)port); + err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl)); + if (err != -EADDRINUSE) + break; + } + return err; +} + + +static int +p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct socket *csocket; + struct sockaddr_in sin_server; + struct p9_fd_opts opts; + + err = parse_opts(args, &opts); + if (err < 0) + return err; + + if (addr == NULL || valid_ipaddr4(addr) < 0) + return -EINVAL; + + csocket = NULL; + + client->trans_opts.tcp.port = opts.port; + client->trans_opts.tcp.privport = opts.privport; + sin_server.sin_family = AF_INET; + sin_server.sin_addr.s_addr = in_aton(addr); + sin_server.sin_port = htons(opts.port); + err = __sock_create(current->nsproxy->net_ns, PF_INET, + SOCK_STREAM, IPPROTO_TCP, &csocket, 1); + if (err) { + pr_err("%s (%d): problem creating socket\n", + __func__, task_pid_nr(current)); + return err; + } + + if (opts.privport) { + err = p9_bind_privport(csocket); + if (err < 0) { + pr_err("%s (%d): problem binding to privport\n", + __func__, task_pid_nr(current)); + sock_release(csocket); + return err; + } + } + + err = READ_ONCE(csocket->ops)->connect(csocket, + (struct sockaddr *)&sin_server, + sizeof(struct sockaddr_in), 0); + if (err < 0) { + pr_err("%s (%d): problem connecting socket to %s\n", + __func__, task_pid_nr(current), addr); + sock_release(csocket); + return err; + } + + return p9_socket_open(client, csocket); +} + +static int +p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct socket *csocket; + struct sockaddr_un sun_server; + + csocket = NULL; + + if (!addr || !strlen(addr)) + return -EINVAL; + + if (strlen(addr) >= UNIX_PATH_MAX) { + pr_err("%s (%d): address too long: %s\n", + __func__, task_pid_nr(current), addr); + return -ENAMETOOLONG; + } + + sun_server.sun_family = PF_UNIX; + strcpy(sun_server.sun_path, addr); + err = __sock_create(current->nsproxy->net_ns, PF_UNIX, + SOCK_STREAM, 0, &csocket, 1); + if (err < 0) { + pr_err("%s (%d): problem creating socket\n", + __func__, task_pid_nr(current)); + + return err; + } + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, + sizeof(struct sockaddr_un) - 1, 0); + if (err < 0) { + pr_err("%s (%d): problem connecting socket: %s: %d\n", + __func__, task_pid_nr(current), addr, err); + sock_release(csocket); + return err; + } + + return p9_socket_open(client, csocket); +} + +static int +p9_fd_create(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct p9_fd_opts opts; + + err = parse_opts(args, &opts); + if (err < 0) + return err; + client->trans_opts.fd.rfd = opts.rfd; + client->trans_opts.fd.wfd = opts.wfd; + + if (opts.rfd == ~0 || opts.wfd == ~0) { + pr_err("Insufficient options for proto=fd\n"); + return -ENOPROTOOPT; + } + + err = p9_fd_open(client, opts.rfd, opts.wfd); + if (err < 0) + return err; + + p9_conn_create(client); + + return 0; +} + +static struct p9_trans_module p9_tcp_trans = { + .name = "tcp", + .maxsize = MAX_SOCK_BUF, + .pooled_rbuffers = false, + .def = 0, + .create = p9_fd_create_tcp, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_9P("tcp"); + +static struct p9_trans_module p9_unix_trans = { + .name = "unix", + .maxsize = MAX_SOCK_BUF, + .def = 0, + .create = p9_fd_create_unix, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_9P("unix"); + +static struct p9_trans_module p9_fd_trans = { + .name = "fd", + .maxsize = MAX_SOCK_BUF, + .def = 0, + .create = p9_fd_create, + .close = p9_fd_close, + .request = p9_fd_request, + .cancel = p9_fd_cancel, + .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_9P("fd"); + +/** + * p9_poll_workfn - poll worker thread + * @work: work queue + * + * polls all v9fs transports for new events and queues the appropriate + * work to the work queue + * + */ + +static void p9_poll_workfn(struct work_struct *work) +{ + unsigned long flags; + + p9_debug(P9_DEBUG_TRANS, "start %p\n", current); + + spin_lock_irqsave(&p9_poll_lock, flags); + while (!list_empty(&p9_poll_pending_list)) { + struct p9_conn *conn = list_first_entry(&p9_poll_pending_list, + struct p9_conn, + poll_pending_link); + list_del_init(&conn->poll_pending_link); + spin_unlock_irqrestore(&p9_poll_lock, flags); + + p9_poll_mux(conn); + + spin_lock_irqsave(&p9_poll_lock, flags); + } + spin_unlock_irqrestore(&p9_poll_lock, flags); + + p9_debug(P9_DEBUG_TRANS, "finish\n"); +} + +static int __init p9_trans_fd_init(void) +{ + v9fs_register_trans(&p9_tcp_trans); + v9fs_register_trans(&p9_unix_trans); + v9fs_register_trans(&p9_fd_trans); + + return 0; +} + +static void __exit p9_trans_fd_exit(void) +{ + flush_work(&p9_poll_work); + v9fs_unregister_trans(&p9_tcp_trans); + v9fs_unregister_trans(&p9_unix_trans); + v9fs_unregister_trans(&p9_fd_trans); +} + +module_init(p9_trans_fd_init); +module_exit(p9_trans_fd_exit); + +MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); +MODULE_DESCRIPTION("Filedescriptor Transport for 9P"); +MODULE_LICENSE("GPL"); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c new file mode 100644 index 0000000000..b84748baf9 --- /dev/null +++ b/net/9p/trans_rdma.c @@ -0,0 +1,781 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RDMA transport layer based on the trans_fd.c implementation. + * + * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> + * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> + * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> + * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/kthread.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/un.h> +#include <linux/uaccess.h> +#include <linux/inet.h> +#include <linux/file.h> +#include <linux/parser.h> +#include <linux/semaphore.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#define P9_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_SEND_SGE 4 +#define P9_RDMA_RECV_SGE 4 +#define P9_RDMA_IRD 0 +#define P9_RDMA_ORD 0 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ +#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ + +/** + * struct p9_trans_rdma - RDMA transport instance + * + * @state: tracks the transport state machine for connection setup and tear down + * @cm_id: The RDMA CM ID + * @pd: Protection Domain pointer + * @qp: Queue Pair pointer + * @cq: Completion Queue pointer + * @timeout: Number of uSecs to wait for connection management events + * @privport: Whether a privileged port may be used + * @port: The port to use + * @sq_depth: The depth of the Send Queue + * @sq_sem: Semaphore for the SQ + * @rq_depth: The depth of the Receive Queue. + * @rq_sem: Semaphore for the RQ + * @excess_rc : Amount of posted Receive Contexts without a pending request. + * See rdma_request() + * @addr: The remote peer's address + * @req_lock: Protects the active request list + * @cm_done: Completion event for connection management tracking + */ +struct p9_trans_rdma { + enum { + P9_RDMA_INIT, + P9_RDMA_ADDR_RESOLVED, + P9_RDMA_ROUTE_RESOLVED, + P9_RDMA_CONNECTED, + P9_RDMA_FLUSHING, + P9_RDMA_CLOSING, + P9_RDMA_CLOSED, + } state; + struct rdma_cm_id *cm_id; + struct ib_pd *pd; + struct ib_qp *qp; + struct ib_cq *cq; + long timeout; + bool privport; + u16 port; + int sq_depth; + struct semaphore sq_sem; + int rq_depth; + struct semaphore rq_sem; + atomic_t excess_rc; + struct sockaddr_in addr; + spinlock_t req_lock; + + struct completion cm_done; +}; + +struct p9_rdma_req; + +/** + * struct p9_rdma_context - Keeps track of in-process WR + * + * @cqe: completion queue entry + * @busa: Bus address to unmap when the WR completes + * @req: Keeps track of requests (send) + * @rc: Keepts track of replies (receive) + */ +struct p9_rdma_context { + struct ib_cqe cqe; + dma_addr_t busa; + union { + struct p9_req_t *req; + struct p9_fcall rc; + }; +}; + +/** + * struct p9_rdma_opts - Collection of mount options + * @port: port of connection + * @privport: Whether a privileged port may be used + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + bool privport; + int sq_depth; + int rq_depth; + long timeout; +}; + +/* + * Option Parsing (code inspired by NFS code) + */ +enum { + /* Options that take integer arguments */ + Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, + /* Options that take no argument */ + Opt_privport, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_port, "port=%u"}, + {Opt_sq_depth, "sq=%u"}, + {Opt_rq_depth, "rq=%u"}, + {Opt_timeout, "timeout=%u"}, + {Opt_privport, "privport"}, + {Opt_err, NULL}, +}; + +static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) +{ + struct p9_trans_rdma *rdma = clnt->trans; + + if (rdma->port != P9_PORT) + seq_printf(m, ",port=%u", rdma->port); + if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) + seq_printf(m, ",sq=%u", rdma->sq_depth); + if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) + seq_printf(m, ",rq=%u", rdma->rq_depth); + if (rdma->timeout != P9_RDMA_TIMEOUT) + seq_printf(m, ",timeout=%lu", rdma->timeout); + if (rdma->privport) + seq_puts(m, ",privport"); + return 0; +} + +/** + * parse_opts - parse mount options into rdma options structure + * @params: options string passed from mount + * @opts: rdma transport-specific structure to parse options into + * + * Returns 0 upon success, -ERRNO upon failure + */ +static int parse_opts(char *params, struct p9_rdma_opts *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *options, *tmp_options; + + opts->port = P9_PORT; + opts->sq_depth = P9_RDMA_SQ_DEPTH; + opts->rq_depth = P9_RDMA_RQ_DEPTH; + opts->timeout = P9_RDMA_TIMEOUT; + opts->privport = false; + + if (!params) + return 0; + + tmp_options = kstrdup(params, GFP_KERNEL); + if (!tmp_options) { + p9_debug(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + options = tmp_options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + int r; + if (!*p) + continue; + token = match_token(p, tokens, args); + if ((token != Opt_err) && (token != Opt_privport)) { + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + continue; + } + } + switch (token) { + case Opt_port: + opts->port = option; + break; + case Opt_sq_depth: + opts->sq_depth = option; + break; + case Opt_rq_depth: + opts->rq_depth = option; + break; + case Opt_timeout: + opts->timeout = option; + break; + case Opt_privport: + opts->privport = true; + break; + default: + continue; + } + } + /* RQ must be at least as large as the SQ */ + opts->rq_depth = max(opts->rq_depth, opts->sq_depth); + kfree(tmp_options); + return 0; +} + +static int +p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct p9_client *c = id->context; + struct p9_trans_rdma *rdma = c->trans; + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_INIT); + rdma->state = P9_RDMA_ADDR_RESOLVED; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); + rdma->state = P9_RDMA_ROUTE_RESOLVED; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); + rdma->state = P9_RDMA_CONNECTED; + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (rdma) + rdma->state = P9_RDMA_CLOSED; + c->status = Disconnected; + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_CONNECT_REQUEST: + case RDMA_CM_EVENT_CONNECT_RESPONSE: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + c->status = Disconnected; + rdma_disconnect(rdma->cm_id); + break; + default: + BUG(); + } + complete(&rdma->cm_done); + return 0; +} + +static void +recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + struct p9_req_t *req; + int err = 0; + int16_t tag; + + req = NULL; + ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, + DMA_FROM_DEVICE); + + if (wc->status != IB_WC_SUCCESS) + goto err_out; + + c->rc.size = wc->byte_len; + err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); + if (err) + goto err_out; + + req = p9_tag_lookup(client, tag); + if (!req) + goto err_out; + + /* Check that we have not yet received a reply for this request. + */ + if (unlikely(req->rc.sdata)) { + pr_err("Duplicate reply for request %d", tag); + goto err_out; + } + + req->rc.size = c->rc.size; + req->rc.sdata = c->rc.sdata; + p9_client_cb(client, req, REQ_STATUS_RCVD); + + out: + up(&rdma->rq_sem); + kfree(c); + return; + + err_out: + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, wc->status); + rdma->state = P9_RDMA_FLUSHING; + client->status = Disconnected; + goto out; +} + +static void +send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + + ib_dma_unmap_single(rdma->cm_id->device, + c->busa, c->req->tc.size, + DMA_TO_DEVICE); + up(&rdma->sq_sem); + p9_req_put(client, c->req); + kfree(c); +} + +static void qp_event_handler(struct ib_event *event, void *context) +{ + p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n", + event->event, context); +} + +static void rdma_destroy_trans(struct p9_trans_rdma *rdma) +{ + if (!rdma) + return; + + if (rdma->qp && !IS_ERR(rdma->qp)) + ib_destroy_qp(rdma->qp); + + if (rdma->pd && !IS_ERR(rdma->pd)) + ib_dealloc_pd(rdma->pd); + + if (rdma->cq && !IS_ERR(rdma->cq)) + ib_free_cq(rdma->cq); + + if (rdma->cm_id && !IS_ERR(rdma->cm_id)) + rdma_destroy_id(rdma->cm_id); + + kfree(rdma); +} + +static int +post_recv(struct p9_client *client, struct p9_rdma_context *c) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_recv_wr wr; + struct ib_sge sge; + int ret; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->rc.sdata, client->msize, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) + goto error; + + c->cqe.done = recv_done; + + sge.addr = c->busa; + sge.length = client->msize; + sge.lkey = rdma->pd->local_dma_lkey; + + wr.next = NULL; + wr.wr_cqe = &c->cqe; + wr.sg_list = &sge; + wr.num_sge = 1; + + ret = ib_post_recv(rdma->qp, &wr, NULL); + if (ret) + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + client->msize, DMA_FROM_DEVICE); + return ret; + + error: + p9_debug(P9_DEBUG_ERROR, "EIO\n"); + return -EIO; +} + +static int rdma_request(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_send_wr wr; + struct ib_sge sge; + int err = 0; + unsigned long flags; + struct p9_rdma_context *c = NULL; + struct p9_rdma_context *rpl_context = NULL; + + /* When an error occurs between posting the recv and the send, + * there will be a receive context posted without a pending request. + * Since there is no way to "un-post" it, we remember it and skip + * post_recv() for the next request. + * So here, + * see if we are this `next request' and need to absorb an excess rc. + * If yes, then drop and free our own, and do not recv_post(). + **/ + if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { + if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { + /* Got one! */ + p9_fcall_fini(&req->rc); + req->rc.sdata = NULL; + goto dont_need_post_recv; + } else { + /* We raced and lost. */ + atomic_inc(&rdma->excess_rc); + } + } + + /* Allocate an fcall for the reply */ + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); + if (!rpl_context) { + err = -ENOMEM; + goto recv_error; + } + rpl_context->rc.sdata = req->rc.sdata; + + /* + * Post a receive buffer for this request. We need to ensure + * there is a reply buffer available for every outstanding + * request. A flushed request can result in no reply for an + * outstanding request, so we must keep a count to avoid + * overflowing the RQ. + */ + if (down_interruptible(&rdma->rq_sem)) { + err = -EINTR; + goto recv_error; + } + + err = post_recv(client, rpl_context); + if (err) { + p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); + goto recv_error; + } + /* remove posted receive buffer from request structure */ + req->rc.sdata = NULL; + +dont_need_post_recv: + /* Post the request */ + c = kmalloc(sizeof *c, GFP_NOFS); + if (!c) { + err = -ENOMEM; + goto send_error; + } + c->req = req; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->req->tc.sdata, c->req->tc.size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { + err = -EIO; + goto send_error; + } + + c->cqe.done = send_done; + + sge.addr = c->busa; + sge.length = c->req->tc.size; + sge.lkey = rdma->pd->local_dma_lkey; + + wr.next = NULL; + wr.wr_cqe = &c->cqe; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + + if (down_interruptible(&rdma->sq_sem)) { + err = -EINTR; + goto dma_unmap; + } + + /* Mark request as `sent' *before* we actually send it, + * because doing if after could erase the REQ_STATUS_RCVD + * status in case of a very fast reply. + */ + WRITE_ONCE(req->status, REQ_STATUS_SENT); + err = ib_post_send(rdma->qp, &wr, NULL); + if (err) + goto dma_unmap; + + /* Success */ + return 0; + +dma_unmap: + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + c->req->tc.size, DMA_TO_DEVICE); + /* Handle errors that happened during or while preparing the send: */ + send_error: + WRITE_ONCE(req->status, REQ_STATUS_ERROR); + kfree(c); + p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); + + /* Ach. + * We did recv_post(), but not send. We have one recv_post in excess. + */ + atomic_inc(&rdma->excess_rc); + return err; + + /* Handle errors that happened during or while preparing post_recv(): */ + recv_error: + kfree(rpl_context); + spin_lock_irqsave(&rdma->req_lock, flags); + if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { + rdma->state = P9_RDMA_CLOSING; + spin_unlock_irqrestore(&rdma->req_lock, flags); + rdma_disconnect(rdma->cm_id); + } else + spin_unlock_irqrestore(&rdma->req_lock, flags); + return err; +} + +static void rdma_close(struct p9_client *client) +{ + struct p9_trans_rdma *rdma; + + if (!client) + return; + + rdma = client->trans; + if (!rdma) + return; + + client->status = Disconnected; + rdma_disconnect(rdma->cm_id); + rdma_destroy_trans(rdma); +} + +/** + * alloc_rdma - Allocate and initialize the rdma transport structure + * @opts: Mount options structure + */ +static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) +{ + struct p9_trans_rdma *rdma; + + rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); + if (!rdma) + return NULL; + + rdma->port = opts->port; + rdma->privport = opts->privport; + rdma->sq_depth = opts->sq_depth; + rdma->rq_depth = opts->rq_depth; + rdma->timeout = opts->timeout; + spin_lock_init(&rdma->req_lock); + init_completion(&rdma->cm_done); + sema_init(&rdma->sq_sem, rdma->sq_depth); + sema_init(&rdma->rq_sem, rdma->rq_depth); + atomic_set(&rdma->excess_rc, 0); + + return rdma; +} + +static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) +{ + /* Nothing to do here. + * We will take care of it (if we have to) in rdma_cancelled() + */ + return 1; +} + +/* A request has been fully flushed without a reply. + * That means we have posted one buffer in excess. + */ +static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + atomic_inc(&rdma->excess_rc); + return 0; +} + +static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) +{ + struct sockaddr_in cl = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + int port, err = -EINVAL; + + for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { + cl.sin_port = htons((ushort)port); + err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); + if (err != -EADDRINUSE) + break; + } + return err; +} + +/** + * rdma_create_trans - Transport method for creating a transport instance + * @client: client instance + * @addr: IP address string + * @args: Mount options string + */ +static int +rdma_create_trans(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct p9_rdma_opts opts; + struct p9_trans_rdma *rdma; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + + if (addr == NULL) + return -EINVAL; + + /* Parse the transport specific mount options */ + err = parse_opts(args, &opts); + if (err < 0) + return err; + + /* Create and initialize the RDMA transport structure */ + rdma = alloc_rdma(&opts); + if (!rdma) + return -ENOMEM; + + /* Create the RDMA CM ID */ + rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(rdma->cm_id)) + goto error; + + /* Associate the client with the transport */ + client->trans = rdma; + + /* Bind to a privileged port if we need to */ + if (opts.privport) { + err = p9_rdma_bind_privport(rdma); + if (err < 0) { + pr_err("%s (%d): problem binding to privport: %d\n", + __func__, task_pid_nr(current), -err); + goto error; + } + } + + /* Resolve the server's address */ + rdma->addr.sin_family = AF_INET; + rdma->addr.sin_addr.s_addr = in_aton(addr); + rdma->addr.sin_port = htons(opts.port); + err = rdma_resolve_addr(rdma->cm_id, NULL, + (struct sockaddr *)&rdma->addr, + rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) + goto error; + + /* Resolve the route to the server */ + err = rdma_resolve_route(rdma->cm_id, rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) + goto error; + + /* Create the Completion Queue */ + rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + IB_POLL_SOFTIRQ); + if (IS_ERR(rdma->cq)) + goto error; + + /* Create the Protection Domain */ + rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); + if (IS_ERR(rdma->pd)) + goto error; + + /* Create the Queue Pair */ + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = client; + qp_attr.cap.max_send_wr = opts.sq_depth; + qp_attr.cap.max_recv_wr = opts.rq_depth; + qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; + qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = rdma->cq; + qp_attr.recv_cq = rdma->cq; + err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); + if (err) + goto error; + rdma->qp = rdma->cm_id->qp; + + /* Request a connection */ + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + conn_param.responder_resources = P9_RDMA_IRD; + conn_param.initiator_depth = P9_RDMA_ORD; + err = rdma_connect(rdma->cm_id, &conn_param); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_CONNECTED)) + goto error; + + client->status = Connected; + + return 0; + +error: + rdma_destroy_trans(rdma); + return -ENOTCONN; +} + +static struct p9_trans_module p9_rdma_trans = { + .name = "rdma", + .maxsize = P9_RDMA_MAXSIZE, + .pooled_rbuffers = true, + .def = 0, + .owner = THIS_MODULE, + .create = rdma_create_trans, + .close = rdma_close, + .request = rdma_request, + .cancel = rdma_cancel, + .cancelled = rdma_cancelled, + .show_options = p9_rdma_show_options, +}; + +/** + * p9_trans_rdma_init - Register the 9P RDMA transport driver + */ +static int __init p9_trans_rdma_init(void) +{ + v9fs_register_trans(&p9_rdma_trans); + return 0; +} + +static void __exit p9_trans_rdma_exit(void) +{ + v9fs_unregister_trans(&p9_rdma_trans); +} + +module_init(p9_trans_rdma_init); +module_exit(p9_trans_rdma_exit); +MODULE_ALIAS_9P("rdma"); + +MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); +MODULE_DESCRIPTION("RDMA Transport for 9P"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c new file mode 100644 index 0000000000..e305071eb7 --- /dev/null +++ b/net/9p/trans_virtio.c @@ -0,0 +1,838 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The Virtio 9p transport driver + * + * This is a block based transport driver based on the lguest block driver + * code. + * + * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation + * + * Based on virtio console driver + * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/un.h> +#include <linux/uaccess.h> +#include <linux/inet.h> +#include <linux/file.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <net/9p/9p.h> +#include <linux/parser.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/virtio.h> +#include <linux/virtio_9p.h> +#include "trans_common.h" + +#define VIRTQUEUE_NUM 128 + +/* a single mutex to manage channel initialization and attachment */ +static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); + +/** + * struct virtio_chan - per-instance transport information + * @inuse: whether the channel is in use + * @lock: protects multiple elements within this structure + * @client: client instance + * @vdev: virtio dev associated with this channel + * @vq: virtio queue associated with this channel + * @ring_bufs_avail: flag to indicate there is some available in the ring buf + * @vc_wq: wait queue for waiting for thing to be added to ring buf + * @p9_max_pages: maximum number of pinned pages + * @sg: scatter gather list which is used to pack a request (protected?) + * @chan_list: linked list of channels + * + * We keep all per-channel information in a structure. + * This structure is allocated within the devices dev->mem space. + * A pointer to the structure will get put in the transport private. + * + */ + +struct virtio_chan { + bool inuse; + + spinlock_t lock; + + struct p9_client *client; + struct virtio_device *vdev; + struct virtqueue *vq; + int ring_bufs_avail; + wait_queue_head_t *vc_wq; + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + unsigned long p9_max_pages; + /* Scatterlist: can be too big for stack. */ + struct scatterlist sg[VIRTQUEUE_NUM]; + /** + * @tag: name to identify a mount null terminated + */ + char *tag; + + struct list_head chan_list; +}; + +static struct list_head virtio_chan_list; + +/* How many bytes left in this page. */ +static unsigned int rest_of_page(void *data) +{ + return PAGE_SIZE - offset_in_page(data); +} + +/** + * p9_virtio_close - reclaim resources of a channel + * @client: client instance + * + * This reclaims a channel by freeing its resources and + * resetting its inuse flag. + * + */ + +static void p9_virtio_close(struct p9_client *client) +{ + struct virtio_chan *chan = client->trans; + + mutex_lock(&virtio_9p_lock); + if (chan) + chan->inuse = false; + mutex_unlock(&virtio_9p_lock); +} + +/** + * req_done - callback which signals activity from the server + * @vq: virtio queue activity was received on + * + * This notifies us that the server has triggered some activity + * on the virtio channel - most likely a response to request we + * sent. Figure out which requests now have responses and wake up + * those threads. + * + * Bugs: could do with some additional sanity checking, but appears to work. + * + */ + +static void req_done(struct virtqueue *vq) +{ + struct virtio_chan *chan = vq->vdev->priv; + unsigned int len; + struct p9_req_t *req; + bool need_wakeup = false; + unsigned long flags; + + p9_debug(P9_DEBUG_TRANS, ": request done\n"); + + spin_lock_irqsave(&chan->lock, flags); + while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) { + if (!chan->ring_bufs_avail) { + chan->ring_bufs_avail = 1; + need_wakeup = true; + } + + if (len) { + req->rc.size = len; + p9_client_cb(chan->client, req, REQ_STATUS_RCVD); + } + } + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + if (need_wakeup) + wake_up(chan->vc_wq); +} + +/** + * pack_sg_list - pack a scatter gather list from a linear buffer + * @sg: scatter/gather list to pack into + * @start: which segment of the sg_list to start at + * @limit: maximum segment to pack data to + * @data: data to pack into scatter/gather list + * @count: amount of data to pack into the scatter/gather list + * + * sg_lists have multiple segments of various sizes. This will pack + * arbitrary data into an existing scatter gather list, segmenting the + * data as necessary within constraints. + * + */ + +static int pack_sg_list(struct scatterlist *sg, int start, + int limit, char *data, int count) +{ + int s; + int index = start; + + while (count) { + s = rest_of_page(data); + if (s > count) + s = count; + BUG_ON(index >= limit); + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); + sg_set_buf(&sg[index++], data, s); + count -= s; + data += s; + } + if (index-start) + sg_mark_end(&sg[index - 1]); + return index-start; +} + +/* We don't currently allow canceling of virtio requests */ +static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +/* Reply won't come, so drop req ref */ +static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_req_put(client, req); + return 0; +} + +/** + * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, + * this takes a list of pages. + * @sg: scatter/gather list to pack into + * @start: which segment of the sg_list to start at + * @limit: maximum number of pages in sg list. + * @pdata: a list of pages to add into sg. + * @nr_pages: number of pages to pack into the scatter/gather list + * @offs: amount of data in the beginning of first page _not_ to pack + * @count: amount of data to pack into the scatter/gather list + */ +static int +pack_sg_list_p(struct scatterlist *sg, int start, int limit, + struct page **pdata, int nr_pages, size_t offs, int count) +{ + int i = 0, s; + int data_off = offs; + int index = start; + + BUG_ON(nr_pages > (limit - start)); + /* + * if the first page doesn't start at + * page boundary find the offset + */ + while (nr_pages) { + s = PAGE_SIZE - data_off; + if (s > count) + s = count; + BUG_ON(index >= limit); + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); + sg_set_page(&sg[index++], pdata[i++], s, data_off); + data_off = 0; + count -= s; + nr_pages--; + } + + if (index-start) + sg_mark_end(&sg[index - 1]); + return index - start; +} + +/** + * p9_virtio_request - issue a request + * @client: client instance issuing the request + * @req: request to be issued + * + */ + +static int +p9_virtio_request(struct p9_client *client, struct p9_req_t *req) +{ + int err; + int in, out, out_sgs, in_sgs; + unsigned long flags; + struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[2]; + + p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); + + WRITE_ONCE(req->status, REQ_STATUS_SENT); +req_retry: + spin_lock_irqsave(&chan->lock, flags); + + out_sgs = in_sgs = 0; + /* Handle out VirtIO ring buffers */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); + if (out) + sgs[out_sgs++] = chan->sg; + + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; + + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, + GFP_ATOMIC); + if (err < 0) { + if (err == -ENOSPC) { + chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); + if (err == -ERESTARTSYS) + return err; + + p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); + goto req_retry; + } else { + spin_unlock_irqrestore(&chan->lock, flags); + p9_debug(P9_DEBUG_TRANS, + "virtio rpc add_sgs returned failure\n"); + return -EIO; + } + } + virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); + + p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); + return 0; +} + +static int p9_get_mapped_pages(struct virtio_chan *chan, + struct page ***pages, + struct iov_iter *data, + int count, + size_t *offs, + int *need_drop) +{ + int nr_pages; + int err; + + if (!iov_iter_count(data)) + return 0; + + if (!iov_iter_is_kvec(data)) { + int n; + /* + * We allow only p9_max_pages pinned. We wait for the + * Other zc request to finish here + */ + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_killable(vp_wq, + (atomic_read(&vp_pinned) < chan->p9_max_pages)); + if (err == -ERESTARTSYS) + return err; + } + n = iov_iter_get_pages_alloc2(data, pages, count, offs); + if (n < 0) + return n; + *need_drop = 1; + nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE); + atomic_add(nr_pages, &vp_pinned); + return n; + } else { + /* kernel buffer, no need to pin pages */ + int index; + size_t len; + void *p; + + /* we'd already checked that it's non-empty */ + while (1) { + len = iov_iter_single_seg_count(data); + if (likely(len)) { + p = data->kvec->iov_base + data->iov_offset; + break; + } + iov_iter_advance(data, 0); + } + if (len > count) + len = count; + + nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - + (unsigned long)p / PAGE_SIZE; + + *pages = kmalloc_array(nr_pages, sizeof(struct page *), + GFP_NOFS); + if (!*pages) + return -ENOMEM; + + *need_drop = 0; + p -= (*offs = offset_in_page(p)); + for (index = 0; index < nr_pages; index++) { + if (is_vmalloc_addr(p)) + (*pages)[index] = vmalloc_to_page(p); + else + (*pages)[index] = kmap_to_page(p); + p += PAGE_SIZE; + } + iov_iter_advance(data, len); + return len; + } +} + +static void handle_rerror(struct p9_req_t *req, int in_hdr_len, + size_t offs, struct page **pages) +{ + unsigned size, n; + void *to = req->rc.sdata + in_hdr_len; + + // Fits entirely into the static data? Nothing to do. + if (req->rc.size < in_hdr_len || !pages) + return; + + // Really long error message? Tough, truncate the reply. Might get + // rejected (we can't be arsed to adjust the size encoded in header, + // or string size for that matter), but it wouldn't be anything valid + // anyway. + if (unlikely(req->rc.size > P9_ZC_HDR_SZ)) + req->rc.size = P9_ZC_HDR_SZ; + + // data won't span more than two pages + size = req->rc.size - in_hdr_len; + n = PAGE_SIZE - offs; + if (size > n) { + memcpy_from_page(to, *pages++, offs, n); + offs = 0; + to += n; + size -= n; + } + memcpy_from_page(to, *pages, offs, size); +} + +/** + * p9_virtio_zc_request - issue a zero copy request + * @client: client instance issuing the request + * @req: request to be issued + * @uidata: user buffer that should be used for zero copy read + * @uodata: user buffer that should be used for zero copy write + * @inlen: read buffer size + * @outlen: write buffer size + * @in_hdr_len: reader header size, This is the size of response protocol data + * + */ +static int +p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, + struct iov_iter *uidata, struct iov_iter *uodata, + int inlen, int outlen, int in_hdr_len) +{ + int in, out, err, out_sgs, in_sgs; + unsigned long flags; + int in_nr_pages = 0, out_nr_pages = 0; + struct page **in_pages = NULL, **out_pages = NULL; + struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[4]; + size_t offs = 0; + int need_drop = 0; + int kicked = 0; + + p9_debug(P9_DEBUG_TRANS, "virtio request\n"); + + if (uodata) { + __le32 sz; + int n = p9_get_mapped_pages(chan, &out_pages, uodata, + outlen, &offs, &need_drop); + if (n < 0) { + err = n; + goto err_out; + } + out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); + if (n != outlen) { + __le32 v = cpu_to_le32(n); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); + outlen = n; + } + /* The size field of the message must include the length of the + * header and the length of the data. We didn't actually know + * the length of the data until this point so add it in now. + */ + sz = cpu_to_le32(req->tc.size + outlen); + memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); + } else if (uidata) { + int n = p9_get_mapped_pages(chan, &in_pages, uidata, + inlen, &offs, &need_drop); + if (n < 0) { + err = n; + goto err_out; + } + in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); + if (n != inlen) { + __le32 v = cpu_to_le32(n); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); + inlen = n; + } + } + WRITE_ONCE(req->status, REQ_STATUS_SENT); +req_retry_pinned: + spin_lock_irqsave(&chan->lock, flags); + + out_sgs = in_sgs = 0; + + /* out data */ + out = pack_sg_list(chan->sg, 0, + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); + + if (out) + sgs[out_sgs++] = chan->sg; + + if (out_pages) { + sgs[out_sgs++] = chan->sg + out; + out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, + out_pages, out_nr_pages, offs, outlen); + } + + /* + * Take care of in data + * For example TREAD have 11. + * 11 is the read/write header = PDU Header(7) + IO Size (4). + * Arrange in such a way that server places header in the + * allocated memory and payload onto the user buffer. + */ + in = pack_sg_list(chan->sg, out, + VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; + + if (in_pages) { + sgs[out_sgs + in_sgs++] = chan->sg + out + in; + pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, + in_pages, in_nr_pages, offs, inlen); + } + + BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, + GFP_ATOMIC); + if (err < 0) { + if (err == -ENOSPC) { + chan->ring_bufs_avail = 0; + spin_unlock_irqrestore(&chan->lock, flags); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); + if (err == -ERESTARTSYS) + goto err_out; + + p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n"); + goto req_retry_pinned; + } else { + spin_unlock_irqrestore(&chan->lock, flags); + p9_debug(P9_DEBUG_TRANS, + "virtio rpc add_sgs returned failure\n"); + err = -EIO; + goto err_out; + } + } + virtqueue_kick(chan->vq); + spin_unlock_irqrestore(&chan->lock, flags); + kicked = 1; + p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); + err = wait_event_killable(req->wq, + READ_ONCE(req->status) >= REQ_STATUS_RCVD); + // RERROR needs reply (== error string) in static data + if (READ_ONCE(req->status) == REQ_STATUS_RCVD && + unlikely(req->rc.sdata[4] == P9_RERROR)) + handle_rerror(req, in_hdr_len, offs, in_pages); + + /* + * Non kernel buffers are pinned, unpin them + */ +err_out: + if (need_drop) { + if (in_pages) { + p9_release_pages(in_pages, in_nr_pages); + atomic_sub(in_nr_pages, &vp_pinned); + } + if (out_pages) { + p9_release_pages(out_pages, out_nr_pages); + atomic_sub(out_nr_pages, &vp_pinned); + } + /* wakeup anybody waiting for slots to pin pages */ + wake_up(&vp_wq); + } + kvfree(in_pages); + kvfree(out_pages); + if (!kicked) { + /* reply won't come */ + p9_req_put(client, req); + } + return err; +} + +static ssize_t p9_mount_tag_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct virtio_chan *chan; + struct virtio_device *vdev; + int tag_len; + + vdev = dev_to_virtio(dev); + chan = vdev->priv; + tag_len = strlen(chan->tag); + + memcpy(buf, chan->tag, tag_len + 1); + + return tag_len + 1; +} + +static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); + +/** + * p9_virtio_probe - probe for existence of 9P virtio channels + * @vdev: virtio device to probe + * + * This probes for existing virtio channels. + * + */ + +static int p9_virtio_probe(struct virtio_device *vdev) +{ + __u16 tag_len; + char *tag; + int err; + struct virtio_chan *chan; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL); + if (!chan) { + pr_err("Failed to allocate virtio 9P channel\n"); + err = -ENOMEM; + goto fail; + } + + chan->vdev = vdev; + + /* We expect one virtqueue, for requests. */ + chan->vq = virtio_find_single_vq(vdev, req_done, "requests"); + if (IS_ERR(chan->vq)) { + err = PTR_ERR(chan->vq); + goto out_free_chan; + } + chan->vq->vdev->priv = chan; + spin_lock_init(&chan->lock); + + sg_init_table(chan->sg, VIRTQUEUE_NUM); + + chan->inuse = false; + if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) { + virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len); + } else { + err = -EINVAL; + goto out_free_vq; + } + tag = kzalloc(tag_len + 1, GFP_KERNEL); + if (!tag) { + err = -ENOMEM; + goto out_free_vq; + } + + virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), + tag, tag_len); + chan->tag = tag; + err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); + if (err) { + goto out_free_tag; + } + chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + if (!chan->vc_wq) { + err = -ENOMEM; + goto out_remove_file; + } + init_waitqueue_head(chan->vc_wq); + chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = nr_free_buffer_pages()/4; + + virtio_device_ready(vdev); + + mutex_lock(&virtio_9p_lock); + list_add_tail(&chan->chan_list, &virtio_chan_list); + mutex_unlock(&virtio_9p_lock); + + /* Let udev rules use the new mount_tag attribute. */ + kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); + + return 0; + +out_remove_file: + sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); +out_free_tag: + kfree(tag); +out_free_vq: + vdev->config->del_vqs(vdev); +out_free_chan: + kfree(chan); +fail: + return err; +} + + +/** + * p9_virtio_create - allocate a new virtio channel + * @client: client instance invoking this transport + * @devname: string identifying the channel to connect to (unused) + * @args: args passed from sys_mount() for per-transport options (unused) + * + * This sets up a transport channel for 9p communication. Right now + * we only match the first available channel, but eventually we could look up + * alternate channels by matching devname versus a virtio_config entry. + * We use a simple reference count mechanism to ensure that only a single + * mount has a channel open at a time. + * + */ + +static int +p9_virtio_create(struct p9_client *client, const char *devname, char *args) +{ + struct virtio_chan *chan; + int ret = -ENOENT; + int found = 0; + + if (devname == NULL) + return -EINVAL; + + mutex_lock(&virtio_9p_lock); + list_for_each_entry(chan, &virtio_chan_list, chan_list) { + if (!strcmp(devname, chan->tag)) { + if (!chan->inuse) { + chan->inuse = true; + found = 1; + break; + } + ret = -EBUSY; + } + } + mutex_unlock(&virtio_9p_lock); + + if (!found) { + pr_err("no channels available for device %s\n", devname); + return ret; + } + + client->trans = (void *)chan; + client->status = Connected; + chan->client = client; + + return 0; +} + +/** + * p9_virtio_remove - clean up resources associated with a virtio device + * @vdev: virtio device to remove + * + */ + +static void p9_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_chan *chan = vdev->priv; + unsigned long warning_time; + + mutex_lock(&virtio_9p_lock); + + /* Remove self from list so we don't get new users. */ + list_del(&chan->chan_list); + warning_time = jiffies; + + /* Wait for existing users to close. */ + while (chan->inuse) { + mutex_unlock(&virtio_9p_lock); + msleep(250); + if (time_after(jiffies, warning_time + 10 * HZ)) { + dev_emerg(&vdev->dev, + "p9_virtio_remove: waiting for device in use.\n"); + warning_time = jiffies; + } + mutex_lock(&virtio_9p_lock); + } + + mutex_unlock(&virtio_9p_lock); + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); + kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); + kfree(chan->tag); + kfree(chan->vc_wq); + kfree(chan); + +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { + VIRTIO_9P_MOUNT_TAG, +}; + +/* The standard "struct lguest_driver": */ +static struct virtio_driver p9_virtio_drv = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = p9_virtio_probe, + .remove = p9_virtio_remove, +}; + +static struct p9_trans_module p9_virtio_trans = { + .name = "virtio", + .create = p9_virtio_create, + .close = p9_virtio_close, + .request = p9_virtio_request, + .zc_request = p9_virtio_zc_request, + .cancel = p9_virtio_cancel, + .cancelled = p9_virtio_cancelled, + /* + * We leave one entry for input and one entry for response + * headers. We also skip one more entry to accommodate, address + * that are not at page boundary, that can result in an extra + * page in zero copy. + */ + .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), + .pooled_rbuffers = false, + .def = 1, + .owner = THIS_MODULE, +}; + +/* The standard init function */ +static int __init p9_virtio_init(void) +{ + int rc; + + INIT_LIST_HEAD(&virtio_chan_list); + + v9fs_register_trans(&p9_virtio_trans); + rc = register_virtio_driver(&p9_virtio_drv); + if (rc) + v9fs_unregister_trans(&p9_virtio_trans); + + return rc; +} + +static void __exit p9_virtio_cleanup(void) +{ + unregister_virtio_driver(&p9_virtio_drv); + v9fs_unregister_trans(&p9_virtio_trans); +} + +module_init(p9_virtio_init); +module_exit(p9_virtio_cleanup); +MODULE_ALIAS_9P("virtio"); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); +MODULE_DESCRIPTION("Virtio 9p Transport"); +MODULE_LICENSE("GPL"); diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c new file mode 100644 index 0000000000..1fffe2bed5 --- /dev/null +++ b/net/9p/trans_xen.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/9p/trans_xen + * + * Xen transport layer. + * + * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com> + */ + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/9pfs.h> + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> + +#define XEN_9PFS_NUM_RINGS 2 +#define XEN_9PFS_RING_ORDER 9 +#define XEN_9PFS_RING_SIZE(ring) XEN_FLEX_RING_SIZE(ring->intf->ring_order) + +struct xen_9pfs_header { + uint32_t size; + uint8_t id; + uint16_t tag; + + /* uint8_t sdata[]; */ +} __attribute__((packed)); + +/* One per ring, more than one per 9pfs share */ +struct xen_9pfs_dataring { + struct xen_9pfs_front_priv *priv; + + struct xen_9pfs_data_intf *intf; + grant_ref_t ref; + int evtchn; + int irq; + /* protect a ring from concurrent accesses */ + spinlock_t lock; + + struct xen_9pfs_data data; + wait_queue_head_t wq; + struct work_struct work; +}; + +/* One per 9pfs share */ +struct xen_9pfs_front_priv { + struct list_head list; + struct xenbus_device *dev; + char *tag; + struct p9_client *client; + + int num_rings; + struct xen_9pfs_dataring *rings; +}; + +static LIST_HEAD(xen_9pfs_devs); +static DEFINE_RWLOCK(xen_9pfs_lock); + +/* We don't currently allow canceling of requests */ +static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +static int p9_xen_create(struct p9_client *client, const char *addr, char *args) +{ + struct xen_9pfs_front_priv *priv; + + if (addr == NULL) + return -EINVAL; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (!strcmp(priv->tag, addr)) { + priv->client = client; + read_unlock(&xen_9pfs_lock); + return 0; + } + } + read_unlock(&xen_9pfs_lock); + return -EINVAL; +} + +static void p9_xen_close(struct p9_client *client) +{ + struct xen_9pfs_front_priv *priv; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) { + priv->client = NULL; + read_unlock(&xen_9pfs_lock); + return; + } + } + read_unlock(&xen_9pfs_lock); +} + +static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) +{ + RING_IDX cons, prod; + + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + return XEN_9PFS_RING_SIZE(ring) - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) >= size; +} + +static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) +{ + struct xen_9pfs_front_priv *priv; + RING_IDX cons, prod, masked_cons, masked_prod; + unsigned long flags; + u32 size = p9_req->tc.size; + struct xen_9pfs_dataring *ring; + int num; + + read_lock(&xen_9pfs_lock); + list_for_each_entry(priv, &xen_9pfs_devs, list) { + if (priv->client == client) + break; + } + read_unlock(&xen_9pfs_lock); + if (list_entry_is_head(priv, &xen_9pfs_devs, list)) + return -EINVAL; + + num = p9_req->tc.tag % priv->num_rings; + ring = &priv->rings[num]; + +again: + while (wait_event_killable(ring->wq, + p9_xen_write_todo(ring, size)) != 0) + ; + + spin_lock_irqsave(&ring->lock, flags); + cons = ring->intf->out_cons; + prod = ring->intf->out_prod; + virt_mb(); + + if (XEN_9PFS_RING_SIZE(ring) - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < size) { + spin_unlock_irqrestore(&ring->lock, flags); + goto again; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring)); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); + + xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size, + &masked_prod, masked_cons, + XEN_9PFS_RING_SIZE(ring)); + + WRITE_ONCE(p9_req->status, REQ_STATUS_SENT); + virt_wmb(); /* write ring before updating pointer */ + prod += size; + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); + p9_req_put(client, p9_req); + + return 0; +} + +static void p9_xen_response(struct work_struct *work) +{ + struct xen_9pfs_front_priv *priv; + struct xen_9pfs_dataring *ring; + RING_IDX cons, prod, masked_cons, masked_prod; + struct xen_9pfs_header h; + struct p9_req_t *req; + int status; + + ring = container_of(work, struct xen_9pfs_dataring, work); + priv = ring->priv; + + while (1) { + cons = ring->intf->in_cons; + prod = ring->intf->in_prod; + virt_rmb(); + + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < + sizeof(h)) { + notify_remote_via_irq(ring->irq); + return; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring)); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); + + /* First, read just the header */ + xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE(ring)); + + req = p9_tag_lookup(priv->client, h.tag); + if (!req || req->status != REQ_STATUS_SENT) { + dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag); + cons += h.size; + virt_mb(); + ring->intf->in_cons = cons; + continue; + } + + if (h.size > req->rc.capacity) { + dev_warn(&priv->dev->dev, + "requested packet size too big: %d for tag %d with capacity %zd\n", + h.size, h.tag, req->rc.capacity); + WRITE_ONCE(req->status, REQ_STATUS_ERROR); + goto recv_error; + } + + req->rc.size = h.size; + req->rc.id = h.id; + req->rc.tag = h.tag; + req->rc.offset = 0; + + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); + /* Then, read the whole packet (including the header) */ + xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size, + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE(ring)); + +recv_error: + virt_mb(); + cons += h.size; + ring->intf->in_cons = cons; + + status = (req->status != REQ_STATUS_ERROR) ? + REQ_STATUS_RCVD : REQ_STATUS_ERROR; + + p9_client_cb(priv->client, req, status); + } +} + +static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) +{ + struct xen_9pfs_dataring *ring = r; + + if (!ring || !ring->priv->client) { + /* ignore spurious interrupt */ + return IRQ_HANDLED; + } + + wake_up_interruptible(&ring->wq); + schedule_work(&ring->work); + + return IRQ_HANDLED; +} + +static struct p9_trans_module p9_xen_trans = { + .name = "xen", + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), + .pooled_rbuffers = false, + .def = 1, + .create = p9_xen_create, + .close = p9_xen_close, + .request = p9_xen_request, + .cancel = p9_xen_cancel, + .owner = THIS_MODULE, +}; + +static const struct xenbus_device_id xen_9pfs_front_ids[] = { + { "9pfs" }, + { "" } +}; + +static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) +{ + int i, j; + + write_lock(&xen_9pfs_lock); + list_del(&priv->list); + write_unlock(&xen_9pfs_lock); + + for (i = 0; i < priv->num_rings; i++) { + struct xen_9pfs_dataring *ring = &priv->rings[i]; + + cancel_work_sync(&ring->work); + + if (!priv->rings[i].intf) + break; + if (priv->rings[i].irq > 0) + unbind_from_irqhandler(priv->rings[i].irq, priv->dev); + if (priv->rings[i].data.in) { + for (j = 0; + j < (1 << priv->rings[i].intf->ring_order); + j++) { + grant_ref_t ref; + + ref = priv->rings[i].intf->ref[j]; + gnttab_end_foreign_access(ref, NULL); + } + free_pages_exact(priv->rings[i].data.in, + 1UL << (priv->rings[i].intf->ring_order + + XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(priv->rings[i].ref, NULL); + free_page((unsigned long)priv->rings[i].intf); + } + kfree(priv->rings); + kfree(priv->tag); + kfree(priv); +} + +static void xen_9pfs_front_remove(struct xenbus_device *dev) +{ + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); +} + +static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, + struct xen_9pfs_dataring *ring, + unsigned int order) +{ + int i = 0; + int ret = -ENOMEM; + void *bytes = NULL; + + init_waitqueue_head(&ring->wq); + spin_lock_init(&ring->lock); + INIT_WORK(&ring->work, p9_xen_response); + + ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL); + if (!ring->intf) + return ret; + ret = gnttab_grant_foreign_access(dev->otherend_id, + virt_to_gfn(ring->intf), 0); + if (ret < 0) + goto out; + ring->ref = ret; + bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT), + GFP_KERNEL | __GFP_ZERO); + if (!bytes) { + ret = -ENOMEM; + goto out; + } + for (; i < (1 << order); i++) { + ret = gnttab_grant_foreign_access( + dev->otherend_id, virt_to_gfn(bytes) + i, 0); + if (ret < 0) + goto out; + ring->intf->ref[i] = ret; + } + ring->intf->ring_order = order; + ring->data.in = bytes; + ring->data.out = bytes + XEN_FLEX_RING_SIZE(order); + + ret = xenbus_alloc_evtchn(dev, &ring->evtchn); + if (ret) + goto out; + ring->irq = bind_evtchn_to_irqhandler(ring->evtchn, + xen_9pfs_front_event_handler, + 0, "xen_9pfs-frontend", ring); + if (ring->irq >= 0) + return 0; + + xenbus_free_evtchn(dev, ring->evtchn); + ret = ring->irq; +out: + if (bytes) { + for (i--; i >= 0; i--) + gnttab_end_foreign_access(ring->intf->ref[i], NULL); + free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(ring->ref, NULL); + free_page((unsigned long)ring->intf); + return ret; +} + +static int xen_9pfs_front_init(struct xenbus_device *dev) +{ + int ret, i; + struct xenbus_transaction xbt; + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + char *versions, *v; + unsigned int max_rings, max_ring_order, len = 0; + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (IS_ERR(versions)) + return PTR_ERR(versions); + for (v = versions; *v; v++) { + if (simple_strtoul(v, &v, 10) == 1) { + v = NULL; + break; + } + } + if (v) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0); + if (max_rings < XEN_9PFS_NUM_RINGS) + return -EINVAL; + max_ring_order = xenbus_read_unsigned(dev->otherend, + "max-ring-page-order", 0); + if (max_ring_order > XEN_9PFS_RING_ORDER) + max_ring_order = XEN_9PFS_RING_ORDER; + if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order)) + p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2; + + priv->num_rings = XEN_9PFS_NUM_RINGS; + priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), + GFP_KERNEL); + if (!priv->rings) { + kfree(priv); + return -ENOMEM; + } + + for (i = 0; i < priv->num_rings; i++) { + priv->rings[i].priv = priv; + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i], + max_ring_order); + if (ret < 0) + goto error; + } + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u", + priv->num_rings); + if (ret) + goto error_xenbus; + for (i = 0; i < priv->num_rings; i++) { + char str[16]; + + BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9); + sprintf(str, "ring-ref%d", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%d", + priv->rings[i].ref); + if (ret) + goto error_xenbus; + + sprintf(str, "event-channel-%d", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%u", + priv->rings[i].evtchn); + if (ret) + goto error_xenbus; + } + priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); + if (IS_ERR(priv->tag)) { + ret = PTR_ERR(priv->tag); + goto error_xenbus; + } + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + xen_9pfs_front_free(priv); + return ret; +} + +static int xen_9pfs_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct xen_9pfs_front_priv *priv = NULL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + dev_set_drvdata(&dev->dev, priv); + + write_lock(&xen_9pfs_lock); + list_add_tail(&priv->list, &xen_9pfs_devs); + write_unlock(&xen_9pfs_lock); + + return 0; +} + +static int xen_9pfs_front_resume(struct xenbus_device *dev) +{ + dev_warn(&dev->dev, "suspend/resume unsupported\n"); + return 0; +} + +static void xen_9pfs_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + if (!xen_9pfs_front_init(dev)) + xenbus_switch_state(dev, XenbusStateInitialised); + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + fallthrough; /* Missed the backend's CLOSING state */ + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static struct xenbus_driver xen_9pfs_front_driver = { + .ids = xen_9pfs_front_ids, + .probe = xen_9pfs_front_probe, + .remove = xen_9pfs_front_remove, + .resume = xen_9pfs_front_resume, + .otherend_changed = xen_9pfs_front_changed, +}; + +static int __init p9_trans_xen_init(void) +{ + int rc; + + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen transport for 9pfs\n"); + + v9fs_register_trans(&p9_xen_trans); + rc = xenbus_register_frontend(&xen_9pfs_front_driver); + if (rc) + v9fs_unregister_trans(&p9_xen_trans); + + return rc; +} +module_init(p9_trans_xen_init); +MODULE_ALIAS_9P("xen"); + +static void __exit p9_trans_xen_exit(void) +{ + v9fs_unregister_trans(&p9_xen_trans); + return xenbus_unregister_driver(&xen_9pfs_front_driver); +} +module_exit(p9_trans_xen_exit); + +MODULE_ALIAS("xen:9pfs"); +MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>"); +MODULE_DESCRIPTION("Xen Transport for 9P"); +MODULE_LICENSE("GPL"); |