13 files changed, 7030 insertions, 0 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
new file mode 100644
index 0000000000..00ebce9e5a
--- /dev/null
+++ b/net/9p/Kconfig
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# 9P protocol configuration
+#
+
+menuconfig NET_9P
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
+
+if NET_9P
+
+config NET_9P_FD
+	default NET_9P
+	imply INET
+	imply UNIX
+	tristate "9P FD Transport"
+	help
+	  This builds support for transports over TCP, Unix sockets and
+	  filedescriptors.
+
+config NET_9P_VIRTIO
+	depends on VIRTIO
+	tristate "9P Virtio Transport"
+	help
+	  This builds support for a transports between
+	  guest partitions and a host partition.
+
+config NET_9P_XEN
+	depends on XEN
+	select XEN_XENBUS_FRONTEND
+	tristate "9P Xen Transport"
+	help
+	  This builds support for a transport for 9pfs between
+	  two Xen domains.
+
+
+config NET_9P_RDMA
+	depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	tristate "9P RDMA Transport (Experimental)"
+	help
+	  This builds support for an RDMA transport.
+
+config NET_9P_DEBUG
+	bool "Debug information"
+	help
+	  Say Y if you want the 9P subsystem to log debug information.
+
+endif
diff --git a/net/9p/Makefile b/net/9p/Makefile
new file mode 100644
index 0000000000..1df9b344c3
--- /dev/null
+++ b/net/9p/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o
+obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o
+obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
+obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
+
+9pnet-objs := \
+	mod.o \
+	client.o \
+	error.o \
+	protocol.o \
+	trans_common.o \
+
+9pnet_fd-objs := \
+	trans_fd.o \
+
+9pnet_virtio-objs := \
+	trans_virtio.o \
+
+9pnet_xen-objs := \
+	trans_xen.o \
+
+9pnet_rdma-objs := \
+	trans_rdma.o \
diff --git a/net/9p/client.c b/net/9p/client.c
new file mode 100644
index 0000000000..e265a0ca6b
--- /dev/null
+++ b/net/9p/client.c
@@ -0,0 +1,2263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 9P Client
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include "protocol.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/9p.h>
+
+/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ +
+ * room for write (16 extra) or read (11 extra) operands.
+ */
+
+#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ)
+
+/* Client Option Parsing (code inspired by NFS code)
+ *  - a little lazy - parse all client options
+ */
+
+enum {
+	Opt_msize,
+	Opt_trans,
+	Opt_legacy,
+	Opt_version,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_msize, "msize=%u"},
+	{Opt_legacy, "noextend"},
+	{Opt_trans, "trans=%s"},
+	{Opt_version, "version=%s"},
+	{Opt_err, NULL},
+};
+
+inline int p9_is_proto_dotl(struct p9_client *clnt)
+{
+	return clnt->proto_version == p9_proto_2000L;
+}
+EXPORT_SYMBOL(p9_is_proto_dotl);
+
+inline int p9_is_proto_dotu(struct p9_client *clnt)
+{
+	return clnt->proto_version == p9_proto_2000u;
+}
+EXPORT_SYMBOL(p9_is_proto_dotu);
+
+int p9_show_client_options(struct seq_file *m, struct p9_client *clnt)
+{
+	if (clnt->msize != DEFAULT_MSIZE)
+		seq_printf(m, ",msize=%u", clnt->msize);
+	seq_printf(m, ",trans=%s", clnt->trans_mod->name);
+
+	switch (clnt->proto_version) {
+	case p9_proto_legacy:
+		seq_puts(m, ",noextend");
+		break;
+	case p9_proto_2000u:
+		seq_puts(m, ",version=9p2000.u");
+		break;
+	case p9_proto_2000L:
+		/* Default */
+		break;
+	}
+
+	if (clnt->trans_mod->show_options)
+		return clnt->trans_mod->show_options(m, clnt);
+	return 0;
+}
+EXPORT_SYMBOL(p9_show_client_options);
+
+/* Some error codes are taken directly from the server replies,
+ * make sure they are valid.
+ */
+static int safe_errno(int err)
+{
+	if (err > 0 || err < -MAX_ERRNO) {
+		p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err);
+		return -EPROTO;
+	}
+	return err;
+}
+
+/* Interpret mount option for protocol version */
+static int get_protocol_version(char *s)
+{
+	int version = -EINVAL;
+
+	if (!strcmp(s, "9p2000")) {
+		version = p9_proto_legacy;
+		p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n");
+	} else if (!strcmp(s, "9p2000.u")) {
+		version = p9_proto_2000u;
+		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
+	} else if (!strcmp(s, "9p2000.L")) {
+		version = p9_proto_2000L;
+		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
+	} else {
+		pr_info("Unknown protocol version %s\n", s);
+	}
+
+	return version;
+}
+
+/**
+ * parse_opts - parse mount options into client structure
+ * @opts: options string passed from mount
+ * @clnt: existing v9fs client information
+ *
+ * Return 0 upon success, -ERRNO upon failure
+ */
+
+static int parse_opts(char *opts, struct p9_client *clnt)
+{
+	char *options, *tmp_options;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *s;
+	int ret = 0;
+
+	clnt->proto_version = p9_proto_2000L;
+	clnt->msize = DEFAULT_MSIZE;
+
+	if (!opts)
+		return 0;
+
+	tmp_options = kstrdup(opts, GFP_KERNEL);
+	if (!tmp_options)
+		return -ENOMEM;
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, r;
+
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_msize:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			if (option < 4096) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "msize should be at least 4k\n");
+				ret = -EINVAL;
+				continue;
+			}
+			clnt->msize = option;
+			break;
+		case Opt_trans:
+			s = match_strdup(&args[0]);
+			if (!s) {
+				ret = -ENOMEM;
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of trans arg\n");
+				goto free_and_return;
+			}
+
+			v9fs_put_trans(clnt->trans_mod);
+			clnt->trans_mod = v9fs_get_trans_by_name(s);
+			if (!clnt->trans_mod) {
+				pr_info("Could not find request transport: %s\n",
+					s);
+				ret = -EINVAL;
+			}
+			kfree(s);
+			break;
+		case Opt_legacy:
+			clnt->proto_version = p9_proto_legacy;
+			break;
+		case Opt_version:
+			s = match_strdup(&args[0]);
+			if (!s) {
+				ret = -ENOMEM;
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of version arg\n");
+				goto free_and_return;
+			}
+			r = get_protocol_version(s);
+			if (r < 0)
+				ret = r;
+			else
+				clnt->proto_version = r;
+			kfree(s);
+			break;
+		default:
+			continue;
+		}
+	}
+
+free_and_return:
+	if (ret)
+		v9fs_put_trans(clnt->trans_mod);
+	kfree(tmp_options);
+	return ret;
+}
+
+static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
+			 int alloc_msize)
+{
+	if (likely(c->fcall_cache) && alloc_msize == c->msize) {
+		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
+		fc->cache = c->fcall_cache;
+	} else {
+		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
+		fc->cache = NULL;
+	}
+	if (!fc->sdata)
+		return -ENOMEM;
+	fc->capacity = alloc_msize;
+	return 0;
+}
+
+void p9_fcall_fini(struct p9_fcall *fc)
+{
+	/* sdata can be NULL for interrupted requests in trans_rdma,
+	 * and kmem_cache_free does not do NULL-check for us
+	 */
+	if (unlikely(!fc->sdata))
+		return;
+
+	if (fc->cache)
+		kmem_cache_free(fc->cache, fc->sdata);
+	else
+		kfree(fc->sdata);
+}
+EXPORT_SYMBOL(p9_fcall_fini);
+
+static struct kmem_cache *p9_req_cache;
+
+/**
+ * p9_tag_alloc - Allocate a new request.
+ * @c: Client session.
+ * @type: Transaction type.
+ * @t_size: Buffer size for holding this request
+ * (automatic calculation by format template if 0).
+ * @r_size: Buffer size for holding server's reply on this request
+ * (automatic calculation by format template if 0).
+ * @fmt: Format template for assembling 9p request message
+ * (see p9pdu_vwritef).
+ * @ap: Variable arguments to be fed to passed format template
+ * (see p9pdu_vwritef).
+ *
+ * Context: Process context.
+ * Return: Pointer to new request.
+ */
+static struct p9_req_t *
+p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
+	      const char *fmt, va_list ap)
+{
+	struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
+	int alloc_tsize;
+	int alloc_rsize;
+	int tag;
+	va_list apc;
+
+	va_copy(apc, ap);
+	alloc_tsize = min_t(size_t, c->msize,
+			    t_size ?: p9_msg_buf_size(c, type, fmt, apc));
+	va_end(apc);
+
+	alloc_rsize = min_t(size_t, c->msize,
+			    r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
+
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	if (p9_fcall_init(c, &req->tc, alloc_tsize))
+		goto free_req;
+	if (p9_fcall_init(c, &req->rc, alloc_rsize))
+		goto free;
+
+	p9pdu_reset(&req->tc);
+	p9pdu_reset(&req->rc);
+	req->t_err = 0;
+	req->status = REQ_STATUS_ALLOC;
+	/* refcount needs to be set to 0 before inserting into the idr
+	 * so p9_tag_lookup does not accept a request that is not fully
+	 * initialized. refcount_set to 2 below will mark request ready.
+	 */
+	refcount_set(&req->refcount, 0);
+	init_waitqueue_head(&req->wq);
+	INIT_LIST_HEAD(&req->req_list);
+
+	idr_preload(GFP_NOFS);
+	spin_lock_irq(&c->lock);
+	if (type == P9_TVERSION)
+		tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
+				GFP_NOWAIT);
+	else
+		tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
+	req->tc.tag = tag;
+	spin_unlock_irq(&c->lock);
+	idr_preload_end();
+	if (tag < 0)
+		goto free;
+
+	/* Init ref to two because in the general case there is one ref
+	 * that is put asynchronously by a writer thread, one ref
+	 * temporarily given by p9_tag_lookup and put by p9_client_cb
+	 * in the recv thread, and one ref put by p9_req_put in the
+	 * main thread. The only exception is virtio that does not use
+	 * p9_tag_lookup but does not have a writer thread either
+	 * (the write happens synchronously in the request/zc_request
+	 * callback), so p9_client_cb eats the second ref there
+	 * as the pointer is duplicated directly by virtqueue_add_sgs()
+	 */
+	refcount_set(&req->refcount, 2);
+
+	return req;
+
+free:
+	p9_fcall_fini(&req->tc);
+	p9_fcall_fini(&req->rc);
+free_req:
+	kmem_cache_free(p9_req_cache, req);
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * p9_tag_lookup - Look up a request by tag.
+ * @c: Client session.
+ * @tag: Transaction ID.
+ *
+ * Context: Any context.
+ * Return: A request, or %NULL if there is no request with that tag.
+ */
+struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
+{
+	struct p9_req_t *req;
+
+	rcu_read_lock();
+again:
+	req = idr_find(&c->reqs, tag);
+	if (req) {
+		/* We have to be careful with the req found under rcu_read_lock
+		 * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the
+		 * ref again without corrupting other data, then check again
+		 * that the tag matches once we have the ref
+		 */
+		if (!p9_req_try_get(req))
+			goto again;
+		if (req->tc.tag != tag) {
+			p9_req_put(c, req);
+			goto again;
+		}
+	}
+	rcu_read_unlock();
+
+	return req;
+}
+EXPORT_SYMBOL(p9_tag_lookup);
+
+/**
+ * p9_tag_remove - Remove a tag.
+ * @c: Client session.
+ * @r: Request of reference.
+ *
+ * Context: Any context.
+ */
+static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
+{
+	unsigned long flags;
+	u16 tag = r->tc.tag;
+
+	p9_debug(P9_DEBUG_MUX, "freeing clnt %p req %p tag: %d\n", c, r, tag);
+	spin_lock_irqsave(&c->lock, flags);
+	idr_remove(&c->reqs, tag);
+	spin_unlock_irqrestore(&c->lock, flags);
+}
+
+int p9_req_put(struct p9_client *c, struct p9_req_t *r)
+{
+	if (refcount_dec_and_test(&r->refcount)) {
+		p9_tag_remove(c, r);
+
+		p9_fcall_fini(&r->tc);
+		p9_fcall_fini(&r->rc);
+		kmem_cache_free(p9_req_cache, r);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(p9_req_put);
+
+/**
+ * p9_tag_cleanup - cleans up tags structure and reclaims resources
+ * @c:  v9fs client struct
+ *
+ * This frees resources associated with the tags structure
+ *
+ */
+static void p9_tag_cleanup(struct p9_client *c)
+{
+	struct p9_req_t *req;
+	int id;
+
+	rcu_read_lock();
+	idr_for_each_entry(&c->reqs, req, id) {
+		pr_info("Tag %d still in use\n", id);
+		if (p9_req_put(c, req) == 0)
+			pr_warn("Packet with tag %d has still references",
+				req->tc.tag);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * p9_client_cb - call back from transport to client
+ * @c: client state
+ * @req: request received
+ * @status: request status, one of REQ_STATUS_*
+ *
+ */
+void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
+{
+	p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag);
+
+	/* This barrier is needed to make sure any change made to req before
+	 * the status change is visible to another thread
+	 */
+	smp_wmb();
+	WRITE_ONCE(req->status, status);
+
+	wake_up(&req->wq);
+	p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
+	p9_req_put(c, req);
+}
+EXPORT_SYMBOL(p9_client_cb);
+
+/**
+ * p9_parse_header - parse header arguments out of a packet
+ * @pdu: packet to parse
+ * @size: size of packet
+ * @type: type of request
+ * @tag: tag of packet
+ * @rewind: set if we need to rewind offset afterwards
+ */
+
+int
+p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type,
+		int16_t *tag, int rewind)
+{
+	s8 r_type;
+	s16 r_tag;
+	s32 r_size;
+	int offset = pdu->offset;
+	int err;
+
+	pdu->offset = 0;
+
+	err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
+	if (err)
+		goto rewind_and_exit;
+
+	if (type)
+		*type = r_type;
+	if (tag)
+		*tag = r_tag;
+	if (size)
+		*size = r_size;
+
+	if (pdu->size != r_size || r_size < 7) {
+		err = -EINVAL;
+		goto rewind_and_exit;
+	}
+
+	pdu->id = r_type;
+	pdu->tag = r_tag;
+
+	p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
+		 pdu->size, pdu->id, pdu->tag);
+
+rewind_and_exit:
+	if (rewind)
+		pdu->offset = offset;
+	return err;
+}
+EXPORT_SYMBOL(p9_parse_header);
+
+/**
+ * p9_check_errors - check 9p packet for error return and process it
+ * @c: current client instance
+ * @req: request to parse and check for error conditions
+ *
+ * returns error code if one is discovered, otherwise returns 0
+ *
+ * this will have to be more complicated if we have multiple
+ * error packet types
+ */
+
+static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
+{
+	s8 type;
+	int err;
+	int ecode;
+
+	err = p9_parse_header(&req->rc, NULL, &type, NULL, 0);
+	if (req->rc.size > req->rc.capacity && !req->rc.zc) {
+		pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n",
+		       req->rc.size, req->rc.capacity, req->rc.id);
+		return -EIO;
+	}
+	/* dump the response from server
+	 * This should be after check errors which poplulate pdu_fcall.
+	 */
+	trace_9p_protocol_dump(c, &req->rc);
+	if (err) {
+		p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
+		return err;
+	}
+	if (type != P9_RERROR && type != P9_RLERROR)
+		return 0;
+
+	if (!p9_is_proto_dotl(c)) {
+		char *ename = NULL;
+
+		err = p9pdu_readf(&req->rc, c->proto_version, "s?d",
+				  &ename, &ecode);
+		if (err) {
+			kfree(ename);
+			goto out_err;
+		}
+
+		if (p9_is_proto_dotu(c) && ecode < 512)
+			err = -ecode;
+
+		if (!err) {
+			err = p9_errstr2errno(ename, strlen(ename));
+
+			p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
+				 -ecode, ename);
+		}
+		kfree(ename);
+	} else {
+		err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode);
+		if (err)
+			goto out_err;
+		err = -ecode;
+
+		p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+	}
+
+	return err;
+
+out_err:
+	p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
+	return err;
+}
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
+
+/**
+ * p9_client_flush - flush (cancel) a request
+ * @c: client state
+ * @oldreq: request to cancel
+ *
+ * This sents a flush for a particular request and links
+ * the flush request to the original request.  The current
+ * code only supports a single flush request although the protocol
+ * allows for multiple flush requests to be sent for a single request.
+ *
+ */
+
+static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
+{
+	struct p9_req_t *req;
+	s16 oldtag;
+	int err;
+
+	err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1);
+	if (err)
+		return err;
+
+	p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag);
+
+	req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	/* if we haven't received a response for oldreq,
+	 * remove it from the list
+	 */
+	if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) {
+		if (c->trans_mod->cancelled)
+			c->trans_mod->cancelled(c, oldreq);
+	}
+
+	p9_req_put(c, req);
+	return 0;
+}
+
+static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
+					      int8_t type, uint t_size, uint r_size,
+					      const char *fmt, va_list ap)
+{
+	int err;
+	struct p9_req_t *req;
+	va_list apc;
+
+	p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+	/* we allow for any status other than disconnected */
+	if (c->status == Disconnected)
+		return ERR_PTR(-EIO);
+
+	/* if status is begin_disconnected we allow only clunk request */
+	if (c->status == BeginDisconnect && type != P9_TCLUNK)
+		return ERR_PTR(-EIO);
+
+	va_copy(apc, ap);
+	req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
+	va_end(apc);
+	if (IS_ERR(req))
+		return req;
+
+	/* marshall the data */
+	p9pdu_prepare(&req->tc, req->tc.tag, type);
+	err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
+	if (err)
+		goto reterr;
+	p9pdu_finalize(c, &req->tc);
+	trace_9p_client_req(c, type, req->tc.tag);
+	return req;
+reterr:
+	p9_req_put(c, req);
+	/* We have to put also the 2nd reference as it won't be used */
+	p9_req_put(c, req);
+	return ERR_PTR(err);
+}
+
+/**
+ * p9_client_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_req_put)
+ */
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+	va_list ap;
+	int sigpending, err;
+	unsigned long flags;
+	struct p9_req_t *req;
+	/* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
+	 * auto determine an appropriate (small) request/response size
+	 * according to actual message data being sent. Currently RDMA
+	 * transport is excluded from this response message size optimization,
+	 * as it would not cope with it, due to its pooled response buffers
+	 * (using an optimized request size for RDMA as well though).
+	 */
+	const uint tsize = 0;
+	const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
+
+	va_start(ap, fmt);
+	req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
+	va_end(ap);
+	if (IS_ERR(req))
+		return req;
+
+	req->tc.zc = false;
+	req->rc.zc = false;
+
+	if (signal_pending(current)) {
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+	} else {
+		sigpending = 0;
+	}
+
+	err = c->trans_mod->request(c, req);
+	if (err < 0) {
+		/* write won't happen */
+		p9_req_put(c, req);
+		if (err != -ERESTARTSYS && err != -EFAULT)
+			c->status = Disconnected;
+		goto recalc_sigpending;
+	}
+again:
+	/* Wait for the response */
+	err = wait_event_killable(req->wq,
+				  READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+
+	/* Make sure our req is coherent with regard to updates in other
+	 * threads - echoes to wmb() in the callback
+	 */
+	smp_rmb();
+
+	if (err == -ERESTARTSYS && c->status == Connected &&
+	    type == P9_TFLUSH) {
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+		goto again;
+	}
+
+	if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
+		p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+		err = req->t_err;
+	}
+	if (err == -ERESTARTSYS && c->status == Connected) {
+		p9_debug(P9_DEBUG_MUX, "flushing\n");
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+
+		if (c->trans_mod->cancel(c, req))
+			p9_client_flush(c, req);
+
+		/* if we received the response anyway, don't signal error */
+		if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
+			err = 0;
+	}
+recalc_sigpending:
+	if (sigpending) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+	if (err < 0)
+		goto reterr;
+
+	err = p9_check_errors(c, req);
+	trace_9p_client_res(c, type, req->rc.tag, err);
+	if (!err)
+		return req;
+reterr:
+	p9_req_put(c, req);
+	return ERR_PTR(safe_errno(err));
+}
+
+/**
+ * p9_client_zc_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @uidata: destination for zero copy read
+ * @uodata: source for zero copy write
+ * @inlen: read buffer size
+ * @olen: write buffer size
+ * @in_hdrlen: reader header size, This is the size of response protocol data
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_req_put)
+ */
+static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
+					 struct iov_iter *uidata,
+					 struct iov_iter *uodata,
+					 int inlen, int olen, int in_hdrlen,
+					 const char *fmt, ...)
+{
+	va_list ap;
+	int sigpending, err;
+	unsigned long flags;
+	struct p9_req_t *req;
+
+	va_start(ap, fmt);
+	/* We allocate a inline protocol data of only 4k bytes.
+	 * The actual content is passed in zero-copy fashion.
+	 */
+	req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap);
+	va_end(ap);
+	if (IS_ERR(req))
+		return req;
+
+	req->tc.zc = true;
+	req->rc.zc = true;
+
+	if (signal_pending(current)) {
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+	} else {
+		sigpending = 0;
+	}
+
+	err = c->trans_mod->zc_request(c, req, uidata, uodata,
+				       inlen, olen, in_hdrlen);
+	if (err < 0) {
+		if (err == -EIO)
+			c->status = Disconnected;
+		if (err != -ERESTARTSYS)
+			goto recalc_sigpending;
+	}
+	if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
+		p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+		err = req->t_err;
+	}
+	if (err == -ERESTARTSYS && c->status == Connected) {
+		p9_debug(P9_DEBUG_MUX, "flushing\n");
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+
+		if (c->trans_mod->cancel(c, req))
+			p9_client_flush(c, req);
+
+		/* if we received the response anyway, don't signal error */
+		if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
+			err = 0;
+	}
+recalc_sigpending:
+	if (sigpending) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+	if (err < 0)
+		goto reterr;
+
+	err = p9_check_errors(c, req);
+	trace_9p_client_res(c, type, req->rc.tag, err);
+	if (!err)
+		return req;
+reterr:
+	p9_req_put(c, req);
+	return ERR_PTR(safe_errno(err));
+}
+
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+	int ret;
+	struct p9_fid *fid;
+
+	p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
+	fid = kzalloc(sizeof(*fid), GFP_KERNEL);
+	if (!fid)
+		return NULL;
+
+	fid->mode = -1;
+	fid->uid = current_fsuid();
+	fid->clnt = clnt;
+	refcount_set(&fid->count, 1);
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&clnt->lock);
+	ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
+			    GFP_NOWAIT);
+	spin_unlock_irq(&clnt->lock);
+	idr_preload_end();
+	if (!ret) {
+		trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
+		return fid;
+	}
+
+	kfree(fid);
+	return NULL;
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+	struct p9_client *clnt;
+	unsigned long flags;
+
+	p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+	trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
+	clnt = fid->clnt;
+	spin_lock_irqsave(&clnt->lock, flags);
+	idr_remove(&clnt->fids, fid->fid);
+	spin_unlock_irqrestore(&clnt->lock, flags);
+	kfree(fid->rdir);
+	kfree(fid);
+}
+
+/* We also need to export tracepoint symbols for tracepoint_enabled() */
+EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
+
+void do_trace_9p_fid_get(struct p9_fid *fid)
+{
+	trace_9p_fid_ref(fid, P9_FID_REF_GET);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_get);
+
+void do_trace_9p_fid_put(struct p9_fid *fid)
+{
+	trace_9p_fid_ref(fid, P9_FID_REF_PUT);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_put);
+
+static int p9_client_version(struct p9_client *c)
+{
+	int err;
+	struct p9_req_t *req;
+	char *version = NULL;
+	int msize;
+
+	p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+		 c->msize, c->proto_version);
+
+	switch (c->proto_version) {
+	case p9_proto_2000L:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+				    c->msize, "9P2000.L");
+		break;
+	case p9_proto_2000u:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+				    c->msize, "9P2000.u");
+		break;
+	case p9_proto_legacy:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+				    c->msize, "9P2000");
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version);
+	if (err) {
+		p9_debug(P9_DEBUG_9P, "version error %d\n", err);
+		trace_9p_protocol_dump(c, &req->rc);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
+	if (!strncmp(version, "9P2000.L", 8)) {
+		c->proto_version = p9_proto_2000L;
+	} else if (!strncmp(version, "9P2000.u", 8)) {
+		c->proto_version = p9_proto_2000u;
+	} else if (!strncmp(version, "9P2000", 6)) {
+		c->proto_version = p9_proto_legacy;
+	} else {
+		p9_debug(P9_DEBUG_ERROR,
+			 "server returned an unknown version: %s\n", version);
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	if (msize < 4096) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "server returned a msize < 4096: %d\n", msize);
+		err = -EREMOTEIO;
+		goto error;
+	}
+	if (msize < c->msize)
+		c->msize = msize;
+
+error:
+	kfree(version);
+	p9_req_put(c, req);
+
+	return err;
+}
+
+struct p9_client *p9_client_create(const char *dev_name, char *options)
+{
+	int err;
+	struct p9_client *clnt;
+	char *client_id;
+
+	clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
+	if (!clnt)
+		return ERR_PTR(-ENOMEM);
+
+	clnt->trans_mod = NULL;
+	clnt->trans = NULL;
+	clnt->fcall_cache = NULL;
+
+	client_id = utsname()->nodename;
+	memcpy(clnt->name, client_id, strlen(client_id) + 1);
+
+	spin_lock_init(&clnt->lock);
+	idr_init(&clnt->fids);
+	idr_init(&clnt->reqs);
+
+	err = parse_opts(options, clnt);
+	if (err < 0)
+		goto free_client;
+
+	if (!clnt->trans_mod)
+		clnt->trans_mod = v9fs_get_default_trans();
+
+	if (!clnt->trans_mod) {
+		err = -EPROTONOSUPPORT;
+		p9_debug(P9_DEBUG_ERROR,
+			 "No transport defined or default transport\n");
+		goto free_client;
+	}
+
+	p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
+		 clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
+
+	err = clnt->trans_mod->create(clnt, dev_name, options);
+	if (err)
+		goto put_trans;
+
+	if (clnt->msize > clnt->trans_mod->maxsize) {
+		clnt->msize = clnt->trans_mod->maxsize;
+		pr_info("Limiting 'msize' to %d as this is the maximum "
+			"supported by transport %s\n",
+			clnt->msize, clnt->trans_mod->name
+		);
+	}
+
+	if (clnt->msize < 4096) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "Please specify a msize of at least 4k\n");
+		err = -EINVAL;
+		goto close_trans;
+	}
+
+	err = p9_client_version(clnt);
+	if (err)
+		goto close_trans;
+
+	/* P9_HDRSZ + 4 is the smallest packet header we can have that is
+	 * followed by data accessed from userspace by read
+	 */
+	clnt->fcall_cache =
+		kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize,
+					   0, 0, P9_HDRSZ + 4,
+					   clnt->msize - (P9_HDRSZ + 4),
+					   NULL);
+
+	return clnt;
+
+close_trans:
+	clnt->trans_mod->close(clnt);
+put_trans:
+	v9fs_put_trans(clnt->trans_mod);
+free_client:
+	kfree(clnt);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_create);
+
+void p9_client_destroy(struct p9_client *clnt)
+{
+	struct p9_fid *fid;
+	int id;
+
+	p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt);
+
+	if (clnt->trans_mod)
+		clnt->trans_mod->close(clnt);
+
+	v9fs_put_trans(clnt->trans_mod);
+
+	idr_for_each_entry(&clnt->fids, fid, id) {
+		pr_info("Found fid %d not clunked\n", fid->fid);
+		p9_fid_destroy(fid);
+	}
+
+	p9_tag_cleanup(clnt);
+
+	kmem_cache_destroy(clnt->fcall_cache);
+	kfree(clnt);
+}
+EXPORT_SYMBOL(p9_client_destroy);
+
+void p9_client_disconnect(struct p9_client *clnt)
+{
+	p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+	clnt->status = Disconnected;
+}
+EXPORT_SYMBOL(p9_client_disconnect);
+
+void p9_client_begin_disconnect(struct p9_client *clnt)
+{
+	p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+	clnt->status = BeginDisconnect;
+}
+EXPORT_SYMBOL(p9_client_begin_disconnect);
+
+struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
+				const char *uname, kuid_t n_uname,
+				const char *aname)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_fid *fid;
+	struct p9_qid qid;
+
+	p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
+		 afid ? afid->fid : -1, uname, aname);
+	fid = p9_fid_create(clnt);
+	if (!fid) {
+		err = -ENOMEM;
+		goto error;
+	}
+	fid->uid = n_uname;
+
+	req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid,
+			    afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
+		 qid.type, qid.path, qid.version);
+
+	memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+
+	p9_req_put(clnt, req);
+	return fid;
+
+error:
+	if (fid)
+		p9_fid_destroy(fid);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_attach);
+
+struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
+			      const unsigned char * const *wnames, int clone)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_fid *fid;
+	struct p9_qid *wqids;
+	struct p9_req_t *req;
+	u16 nwqids, count;
+
+	wqids = NULL;
+	clnt = oldfid->clnt;
+	if (clone) {
+		fid = p9_fid_create(clnt);
+		if (!fid) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		fid->uid = oldfid->uid;
+	} else {
+		fid = oldfid;
+	}
+
+	p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n",
+		 oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
+	req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
+			    nwname, wnames);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto clunk_fid;
+	}
+	p9_req_put(clnt, req);
+
+	p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids);
+
+	if (nwqids != nwname) {
+		err = -ENOENT;
+		goto clunk_fid;
+	}
+
+	for (count = 0; count < nwqids; count++)
+		p9_debug(P9_DEBUG_9P, "<<<     [%d] %x.%llx.%x\n",
+			 count, wqids[count].type,
+			 wqids[count].path,
+			 wqids[count].version);
+
+	if (nwname)
+		memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
+	else
+		memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid));
+
+	kfree(wqids);
+	return fid;
+
+clunk_fid:
+	kfree(wqids);
+	p9_fid_put(fid);
+	fid = NULL;
+
+error:
+	if (fid && fid != oldfid)
+		p9_fid_destroy(fid);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_walk);
+
+int p9_client_open(struct p9_fid *fid, int mode)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	struct p9_qid qid;
+	int iounit;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
+		 p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
+
+	if (fid->mode != -1)
+		return -EINVAL;
+
+	if (p9_is_proto_dotl(clnt))
+		req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK);
+	else
+		req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto free_and_error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
+		 p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN",  qid.type,
+		 qid.path, qid.version, iounit);
+
+	memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+	fid->mode = mode;
+	fid->iounit = iounit;
+
+free_and_error:
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_open);
+
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags,
+			  u32 mode, kgid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	int iounit;
+
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+		 ofid->fid, name, flags, mode,
+		 from_kgid(&init_user_ns, gid));
+	clnt = ofid->clnt;
+
+	if (ofid->mode != -1)
+		return -EINVAL;
+
+	req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags,
+			    mode & P9L_MODE_MASK, gid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto free_and_error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
+		 qid->type, qid->path, qid->version, iounit);
+
+	memmove(&ofid->qid, qid, sizeof(struct p9_qid));
+	ofid->mode = flags;
+	ofid->iounit = iounit;
+
+free_and_error:
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_create_dotl);
+
+int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
+		     char *extension)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	struct p9_qid qid;
+	int iounit;
+
+	p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n",
+		 fid->fid, name, perm, mode);
+	clnt = fid->clnt;
+
+	if (fid->mode != -1)
+		return -EINVAL;
+
+	req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
+			    mode & P9L_MODE_MASK, extension);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto free_and_error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n",
+		 qid.type, qid.path, qid.version, iounit);
+
+	memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+	fid->mode = mode;
+	fid->iounit = iounit;
+
+free_and_error:
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_fcreate);
+
+int p9_client_symlink(struct p9_fid *dfid, const char *name,
+		      const char *symtgt, kgid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s  symtgt %s\n",
+		 dfid->fid, name, symtgt);
+	clnt = dfid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt,
+			    gid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto free_and_error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
+		 qid->type, qid->path, qid->version);
+
+free_and_error:
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_symlink);
+
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname)
+{
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
+		 dfid->fid, oldfid->fid, newname);
+	clnt = dfid->clnt;
+	req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
+			    newname);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	p9_debug(P9_DEBUG_9P, "<<< RLINK\n");
+	p9_req_put(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_link);
+
+int p9_client_fsync(struct p9_fid *fid, int datasync)
+{
+	int err = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+		 fid->fid, datasync);
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+	p9_req_put(clnt, req);
+
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
+int p9_client_clunk(struct p9_fid *fid)
+{
+	int err = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	int retries = 0;
+
+again:
+	p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n",
+		 fid->fid, retries);
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid);
+
+	p9_req_put(clnt, req);
+error:
+	/* Fid is not valid even after a failed clunk
+	 * If interrupted, retry once then give up and
+	 * leak fid until umount.
+	 */
+	if (err == -ERESTARTSYS) {
+		if (retries++ == 0)
+			goto again;
+	} else {
+		p9_fid_destroy(fid);
+	}
+	return err;
+}
+EXPORT_SYMBOL(p9_client_clunk);
+
+int p9_client_remove(struct p9_fid *fid)
+{
+	int err = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid);
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
+
+	p9_req_put(clnt, req);
+error:
+	if (err == -ERESTARTSYS)
+		p9_fid_put(fid);
+	else
+		p9_fid_destroy(fid);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_remove);
+
+int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n",
+		 dfid->fid, name, flags);
+
+	clnt = dfid->clnt;
+	req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name);
+
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_unlinkat);
+
+int
+p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+	int total = 0;
+	*err = 0;
+
+	while (iov_iter_count(to)) {
+		int count;
+
+		count = p9_client_read_once(fid, offset, to, err);
+		if (!count || *err)
+			break;
+		offset += count;
+		total += count;
+	}
+	return total;
+}
+EXPORT_SYMBOL(p9_client_read);
+
+int
+p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+		    int *err)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int count = iov_iter_count(to);
+	int rsize, received, non_zc = 0;
+	char *dataptr;
+
+	*err = 0;
+	p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %zu\n",
+		 fid->fid, offset, iov_iter_count(to));
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (clnt->trans_mod->zc_request && rsize > 1024) {
+		/* response header len is 11
+		 * PDU Header(7) + IO Size (4)
+		 */
+		req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
+				       0, 11, "dqd", fid->fid,
+				       offset, rsize);
+	} else {
+		non_zc = 1;
+		req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+				    rsize);
+	}
+	if (IS_ERR(req)) {
+		*err = PTR_ERR(req);
+		if (!non_zc)
+			iov_iter_revert(to, count - iov_iter_count(to));
+		return 0;
+	}
+
+	*err = p9pdu_readf(&req->rc, clnt->proto_version,
+			   "D", &received, &dataptr);
+	if (*err) {
+		if (!non_zc)
+			iov_iter_revert(to, count - iov_iter_count(to));
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		return 0;
+	}
+	if (rsize < received) {
+		pr_err("bogus RREAD count (%d > %d)\n", received, rsize);
+		received = rsize;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+
+	if (non_zc) {
+		int n = copy_to_iter(dataptr, received, to);
+
+		if (n != received) {
+			*err = -EFAULT;
+			p9_req_put(clnt, req);
+			return n;
+		}
+	} else {
+		iov_iter_revert(to, count - received - iov_iter_count(to));
+	}
+	p9_req_put(clnt, req);
+	return received;
+}
+EXPORT_SYMBOL(p9_client_read_once);
+
+int
+p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int total = 0;
+	*err = 0;
+
+	p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n",
+		 fid->fid, offset, iov_iter_count(from));
+
+	while (iov_iter_count(from)) {
+		int count = iov_iter_count(from);
+		int rsize = fid->iounit;
+		int written;
+
+		if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+			rsize = clnt->msize - P9_IOHDRSZ;
+
+		if (count < rsize)
+			rsize = count;
+
+		/* Don't bother zerocopy for small IO (< 1024) */
+		if (clnt->trans_mod->zc_request && rsize > 1024) {
+			req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
+					       rsize, P9_ZC_HDR_SZ, "dqd",
+					       fid->fid, offset, rsize);
+		} else {
+			req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+					    offset, rsize, from);
+		}
+		if (IS_ERR(req)) {
+			iov_iter_revert(from, count - iov_iter_count(from));
+			*err = PTR_ERR(req);
+			break;
+		}
+
+		*err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+		if (*err) {
+			iov_iter_revert(from, count - iov_iter_count(from));
+			trace_9p_protocol_dump(clnt, &req->rc);
+			p9_req_put(clnt, req);
+			break;
+		}
+		if (rsize < written) {
+			pr_err("bogus RWRITE count (%d > %d)\n", written, rsize);
+			written = rsize;
+		}
+
+		p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+
+		p9_req_put(clnt, req);
+		iov_iter_revert(from, count - written - iov_iter_count(from));
+		total += written;
+		offset += written;
+	}
+	return total;
+}
+EXPORT_SYMBOL(p9_client_write);
+
+struct p9_wstat *p9_client_stat(struct p9_fid *fid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_wstat *ret;
+	struct p9_req_t *req;
+	u16 ignored;
+
+	p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid);
+
+	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P,
+		 "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+		 "<<<    mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+		 "<<<    name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+		 "<<<    uid=%d gid=%d n_muid=%d\n",
+		 ret->size, ret->type, ret->dev, ret->qid.type, ret->qid.path,
+		 ret->qid.version, ret->mode,
+		 ret->atime, ret->mtime, ret->length,
+		 ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
+		 from_kuid(&init_user_ns, ret->n_uid),
+		 from_kgid(&init_user_ns, ret->n_gid),
+		 from_kuid(&init_user_ns, ret->n_muid));
+
+	p9_req_put(clnt, req);
+	return ret;
+
+error:
+	kfree(ret);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_stat);
+
+struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
+					    u64 request_mask)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_stat_dotl *ret;
+	struct p9_req_t *req;
+
+	p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
+		 fid->fid, request_mask);
+
+	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n"
+		 "<<< qid=%x.%llx.%x\n"
+		 "<<< st_mode=%8.8x st_nlink=%llu\n"
+		 "<<< st_uid=%d st_gid=%d\n"
+		 "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+		 "<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+		 "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+		 "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+		 "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+		 "<<< st_gen=%lld st_data_version=%lld\n",
+		 ret->st_result_mask,
+		 ret->qid.type, ret->qid.path, ret->qid.version,
+		 ret->st_mode, ret->st_nlink,
+		 from_kuid(&init_user_ns, ret->st_uid),
+		 from_kgid(&init_user_ns, ret->st_gid),
+		 ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks,
+		 ret->st_atime_sec, ret->st_atime_nsec,
+		 ret->st_mtime_sec, ret->st_mtime_nsec,
+		 ret->st_ctime_sec, ret->st_ctime_nsec,
+		 ret->st_btime_sec, ret->st_btime_nsec,
+		 ret->st_gen, ret->st_data_version);
+
+	p9_req_put(clnt, req);
+	return ret;
+
+error:
+	kfree(ret);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_getattr_dotl);
+
+static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
+{
+	int ret;
+
+	/* NOTE: size shouldn't include its own length */
+	/* size[2] type[2] dev[4] qid[13] */
+	/* mode[4] atime[4] mtime[4] length[8]*/
+	/* name[s] uid[s] gid[s] muid[s] */
+	ret = 2 + 4 + 13 + 4 + 4 + 4 + 8 + 2 + 2 + 2 + 2;
+
+	if (wst->name)
+		ret += strlen(wst->name);
+	if (wst->uid)
+		ret += strlen(wst->uid);
+	if (wst->gid)
+		ret += strlen(wst->gid);
+	if (wst->muid)
+		ret += strlen(wst->muid);
+
+	if (proto_version == p9_proto_2000u ||
+	    proto_version == p9_proto_2000L) {
+		/* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+		ret += 2 + 4 + 4 + 4;
+		if (wst->extension)
+			ret += strlen(wst->extension);
+	}
+
+	return ret;
+}
+
+int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	clnt = fid->clnt;
+	wst->size = p9_client_statsize(wst, clnt->proto_version);
+	p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n",
+		 fid->fid);
+	p9_debug(P9_DEBUG_9P,
+		 "     sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+		 "     mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+		 "     name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+		 "     uid=%d gid=%d n_muid=%d\n",
+		 wst->size, wst->type, wst->dev, wst->qid.type,
+		 wst->qid.path, wst->qid.version,
+		 wst->mode, wst->atime, wst->mtime, wst->length,
+		 wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
+		 from_kuid(&init_user_ns, wst->n_uid),
+		 from_kgid(&init_user_ns, wst->n_gid),
+		 from_kuid(&init_user_ns, wst->n_muid));
+
+	req = p9_client_rpc(clnt, P9_TWSTAT, "dwS",
+			    fid->fid, wst->size + 2, wst);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid);
+
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_wstat);
+
+int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
+	p9_debug(P9_DEBUG_9P, "    valid=%x mode=%x uid=%d gid=%d size=%lld\n",
+		 p9attr->valid, p9attr->mode,
+		 from_kuid(&init_user_ns, p9attr->uid),
+		 from_kgid(&init_user_ns, p9attr->gid),
+		 p9attr->size);
+	p9_debug(P9_DEBUG_9P, "    atime_sec=%lld atime_nsec=%lld\n",
+		 p9attr->atime_sec, p9attr->atime_nsec);
+	p9_debug(P9_DEBUG_9P, "    mtime_sec=%lld mtime_nsec=%lld\n",
+		 p9attr->mtime_sec, p9attr->mtime_nsec);
+
+	req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
+
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_setattr);
+
+int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	clnt = fid->clnt;
+
+	p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
+			  &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
+			  &sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P,
+		 "<<< RSTATFS fid %d type 0x%x bsize %u blocks %llu bfree %llu bavail %llu files %llu ffree %llu fsid %llu namelen %u\n",
+		 fid->fid, sb->type, sb->bsize, sb->blocks, sb->bfree,
+		 sb->bavail, sb->files, sb->ffree, sb->fsid, sb->namelen);
+
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_statfs);
+
+int p9_client_rename(struct p9_fid *fid,
+		     struct p9_fid *newdirfid, const char *name)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	clnt = fid->clnt;
+
+	p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n",
+		 fid->fid, newdirfid->fid, name);
+
+	req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid,
+			    newdirfid->fid, name);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid);
+
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_rename);
+
+int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
+		       struct p9_fid *newdirfid, const char *new_name)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	clnt = olddirfid->clnt;
+
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TRENAMEAT olddirfid %d old name %s newdirfid %d new name %s\n",
+		 olddirfid->fid, old_name, newdirfid->fid, new_name);
+
+	req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid,
+			    old_name, newdirfid->fid, new_name);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n",
+		 newdirfid->fid, new_name);
+
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_renameat);
+
+/* An xattrwalk without @attr_name gives the fid for the lisxattr namespace
+ */
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+				   const char *attr_name, u64 *attr_size)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+	struct p9_fid *attr_fid;
+
+	clnt = file_fid->clnt;
+	attr_fid = p9_fid_create(clnt);
+	if (!attr_fid) {
+		err = -ENOMEM;
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TXATTRWALK file_fid %d, attr_fid %d name '%s'\n",
+		 file_fid->fid, attr_fid->fid, attr_name);
+
+	req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
+			    file_fid->fid, attr_fid->fid, attr_name);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		goto clunk_fid;
+	}
+	p9_req_put(clnt, req);
+	p9_debug(P9_DEBUG_9P, "<<<  RXATTRWALK fid %d size %llu\n",
+		 attr_fid->fid, *attr_size);
+	return attr_fid;
+clunk_fid:
+	p9_fid_put(attr_fid);
+	attr_fid = NULL;
+error:
+	if (attr_fid && attr_fid != file_fid)
+		p9_fid_destroy(attr_fid);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
+
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+			  u64 attr_size, int flags)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TXATTRCREATE fid %d name  %s size %llu flag %d\n",
+		 fid->fid, name, attr_size, flags);
+	clnt = fid->clnt;
+	req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
+			    fid->fid, name, attr_size, flags);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
+
+int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
+{
+	int err, rsize, non_zc = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	char *dataptr;
+	struct kvec kv = {.iov_base = data, .iov_len = count};
+	struct iov_iter to;
+
+	iov_iter_kvec(&to, ITER_DEST, &kv, 1, count);
+
+	p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
+		 fid->fid, offset, count);
+
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize - P9_READDIRHDRSZ)
+		rsize = clnt->msize - P9_READDIRHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (clnt->trans_mod->zc_request && rsize > 1024) {
+		/* response header len is 11
+		 * PDU Header(7) + IO Size (4)
+		 */
+		req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0,
+				       11, "dqd", fid->fid, offset, rsize);
+	} else {
+		non_zc = 1;
+		req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid,
+				    offset, rsize);
+	}
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto free_and_error;
+	}
+	if (rsize < count) {
+		pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize);
+		count = rsize;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+
+	if (non_zc)
+		memmove(data, dataptr, count);
+
+	p9_req_put(clnt, req);
+	return count;
+
+free_and_error:
+	p9_req_put(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readdir);
+
+int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
+			 dev_t rdev, kgid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TMKNOD fid %d name %s mode %d major %d minor %d\n",
+		 fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+	req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode,
+			    MAJOR(rdev), MINOR(rdev), gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n",
+		 qid->type, qid->path, qid->version);
+
+error:
+	p9_req_put(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_mknod_dotl);
+
+int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
+			 kgid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
+		 fid->fid, name, mode, from_kgid(&init_user_ns, gid));
+	req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg",
+			    fid->fid, name, mode, gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
+		 qid->path, qid->version);
+
+error:
+	p9_req_put(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TLOCK fid %d type %i flags %d start %lld length %lld proc_id %d client_id %s\n",
+		 fid->fid, flock->type, flock->flags, flock->start,
+		 flock->length, flock->proc_id, flock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+			    flock->flags, flock->start, flock->length,
+			    flock->proc_id, flock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+error:
+	p9_req_put(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
+
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P,
+		 ">>> TGETLOCK fid %d, type %i start %lld length %lld proc_id %d client_id %s\n",
+		 fid->fid, glock->type, glock->start, glock->length,
+		 glock->proc_id, glock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,
+			    glock->type, glock->start, glock->length,
+			    glock->proc_id, glock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type,
+			  &glock->start, &glock->length, &glock->proc_id,
+			  &glock->client_id);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P,
+		 "<<< RGETLOCK type %i start %lld length %lld proc_id %d client_id %s\n",
+		 glock->type, glock->start, glock->length,
+		 glock->proc_id, glock->client_id);
+error:
+	p9_req_put(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	clnt = fid->clnt;
+	p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		goto error;
+	}
+	p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+error:
+	p9_req_put(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
+
+int __init p9_client_init(void)
+{
+	p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU);
+	return p9_req_cache ? 0 : -ENOMEM;
+}
+
+void __exit p9_client_exit(void)
+{
+	kmem_cache_destroy(p9_req_cache);
+}
diff --git a/net/9p/error.c b/net/9p/error.c
new file mode 100644
index 0000000000..8da744494b
--- /dev/null
+++ b/net/9p/error.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/errno.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct errormap - map string errors from Plan 9 to Linux numeric ids
+ * @name: string sent over 9P
+ * @val: numeric id most closely representing @name
+ * @namelen: length of string
+ * @list: hash-table list for string lookup
+ */
+struct errormap {
+	char *name;
+	int val;
+
+	int namelen;
+	struct hlist_node list;
+};
+
+#define ERRHASHSZ		32
+static struct hlist_head hash_errmap[ERRHASHSZ];
+
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+	{"Operation not permitted", EPERM},
+	{"wstat prohibited", EPERM},
+	{"No such file or directory", ENOENT},
+	{"directory entry not found", ENOENT},
+	{"file not found", ENOENT},
+	{"Interrupted system call", EINTR},
+	{"Input/output error", EIO},
+	{"No such device or address", ENXIO},
+	{"Argument list too long", E2BIG},
+	{"Bad file descriptor", EBADF},
+	{"Resource temporarily unavailable", EAGAIN},
+	{"Cannot allocate memory", ENOMEM},
+	{"Permission denied", EACCES},
+	{"Bad address", EFAULT},
+	{"Block device required", ENOTBLK},
+	{"Device or resource busy", EBUSY},
+	{"File exists", EEXIST},
+	{"Invalid cross-device link", EXDEV},
+	{"No such device", ENODEV},
+	{"Not a directory", ENOTDIR},
+	{"Is a directory", EISDIR},
+	{"Invalid argument", EINVAL},
+	{"Too many open files in system", ENFILE},
+	{"Too many open files", EMFILE},
+	{"Text file busy", ETXTBSY},
+	{"File too large", EFBIG},
+	{"No space left on device", ENOSPC},
+	{"Illegal seek", ESPIPE},
+	{"Read-only file system", EROFS},
+	{"Too many links", EMLINK},
+	{"Broken pipe", EPIPE},
+	{"Numerical argument out of domain", EDOM},
+	{"Numerical result out of range", ERANGE},
+	{"Resource deadlock avoided", EDEADLK},
+	{"File name too long", ENAMETOOLONG},
+	{"No locks available", ENOLCK},
+	{"Function not implemented", ENOSYS},
+	{"Directory not empty", ENOTEMPTY},
+	{"Too many levels of symbolic links", ELOOP},
+	{"No message of desired type", ENOMSG},
+	{"Identifier removed", EIDRM},
+	{"No data available", ENODATA},
+	{"Machine is not on the network", ENONET},
+	{"Package not installed", ENOPKG},
+	{"Object is remote", EREMOTE},
+	{"Link has been severed", ENOLINK},
+	{"Communication error on send", ECOMM},
+	{"Protocol error", EPROTO},
+	{"Bad message", EBADMSG},
+	{"File descriptor in bad state", EBADFD},
+	{"Streams pipe error", ESTRPIPE},
+	{"Too many users", EUSERS},
+	{"Socket operation on non-socket", ENOTSOCK},
+	{"Message too long", EMSGSIZE},
+	{"Protocol not available", ENOPROTOOPT},
+	{"Protocol not supported", EPROTONOSUPPORT},
+	{"Socket type not supported", ESOCKTNOSUPPORT},
+	{"Operation not supported", EOPNOTSUPP},
+	{"Protocol family not supported", EPFNOSUPPORT},
+	{"Network is down", ENETDOWN},
+	{"Network is unreachable", ENETUNREACH},
+	{"Network dropped connection on reset", ENETRESET},
+	{"Software caused connection abort", ECONNABORTED},
+	{"Connection reset by peer", ECONNRESET},
+	{"No buffer space available", ENOBUFS},
+	{"Transport endpoint is already connected", EISCONN},
+	{"Transport endpoint is not connected", ENOTCONN},
+	{"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+	{"Connection timed out", ETIMEDOUT},
+	{"Connection refused", ECONNREFUSED},
+	{"Host is down", EHOSTDOWN},
+	{"No route to host", EHOSTUNREACH},
+	{"Operation already in progress", EALREADY},
+	{"Operation now in progress", EINPROGRESS},
+	{"Is a named type file", EISNAM},
+	{"Remote I/O error", EREMOTEIO},
+	{"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+	{"fid unknown or out of range", EBADF},
+	{"permission denied", EACCES},
+	{"file does not exist", ENOENT},
+	{"authentication failed", ECONNREFUSED},
+	{"bad offset in directory read", ESPIPE},
+	{"bad use of fid", EBADF},
+	{"wstat can't convert between files and directories", EPERM},
+	{"directory is not empty", ENOTEMPTY},
+	{"file exists", EEXIST},
+	{"file already exists", EEXIST},
+	{"file or directory already exists", EEXIST},
+	{"fid already in use", EBADF},
+	{"file in use", ETXTBSY},
+	{"i/o error", EIO},
+	{"file already open for I/O", ETXTBSY},
+	{"illegal mode", EINVAL},
+	{"illegal name", ENAMETOOLONG},
+	{"not a directory", ENOTDIR},
+	{"not a member of proposed group", EPERM},
+	{"not owner", EACCES},
+	{"only owner can change group in wstat", EACCES},
+	{"read only file system", EROFS},
+	{"no access to special file", EPERM},
+	{"i/o count too large", EIO},
+	{"unknown group", EINVAL},
+	{"unknown user", EINVAL},
+	{"bogus wstat buffer", EPROTO},
+	{"exclusive use file already open", EAGAIN},
+	{"corrupted directory entry", EIO},
+	{"corrupted file entry", EIO},
+	{"corrupted block label", EIO},
+	{"corrupted meta data", EIO},
+	{"illegal offset", EINVAL},
+	{"illegal path element", ENOENT},
+	{"root of file system is corrupted", EIO},
+	{"corrupted super block", EIO},
+	{"protocol botch", EPROTO},
+	{"file system is full", ENOSPC},
+	{"file is in use", EAGAIN},
+	{"directory entry is not allocated", ENOENT},
+	{"file is read only", EROFS},
+	{"file has been removed", EIDRM},
+	{"only support truncation to zero length", EPERM},
+	{"cannot remove root", EPERM},
+	{"file too big", EFBIG},
+	{"venti i/o error", EIO},
+	/* these are not errors */
+	{"u9fs rhostsauth: no authentication required", 0},
+	{"u9fs authnone: no authentication required", 0},
+	{NULL, -1}
+};
+
+/**
+ * p9_error_init - preload mappings into hash list
+ *
+ */
+
+int p9_error_init(void)
+{
+	struct errormap *c;
+	int bucket;
+
+	/* initialize hash table */
+	for (bucket = 0; bucket < ERRHASHSZ; bucket++)
+		INIT_HLIST_HEAD(&hash_errmap[bucket]);
+
+	/* load initial error map into hash table */
+	for (c = errmap; c->name; c++) {
+		c->namelen = strlen(c->name);
+		bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ;
+		INIT_HLIST_NODE(&c->list);
+		hlist_add_head(&c->list, &hash_errmap[bucket]);
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(p9_error_init);
+
+/**
+ * p9_errstr2errno - convert error string to error number
+ * @errstr: error string
+ * @len: length of error string
+ *
+ */
+
+int p9_errstr2errno(char *errstr, int len)
+{
+	int errno;
+	struct errormap *c;
+	int bucket;
+
+	errno = 0;
+	c = NULL;
+	bucket = jhash(errstr, len, 0) % ERRHASHSZ;
+	hlist_for_each_entry(c, &hash_errmap[bucket], list) {
+		if (c->namelen == len && !memcmp(c->name, errstr, len)) {
+			errno = c->val;
+			break;
+		}
+	}
+
+	if (errno == 0) {
+		/* TODO: if error isn't found, add it dynamically */
+		errstr[len] = 0;
+		pr_err("%s: server reported unknown error %s\n",
+		       __func__, errstr);
+		errno = ESERVERFAULT;
+	}
+
+	return -errno;
+}
+EXPORT_SYMBOL(p9_errstr2errno);
diff --git a/net/9p/mod.c b/net/9p/mod.c
new file mode 100644
index 0000000000..55576c1866
--- /dev/null
+++ b/net/9p/mod.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  9P entry point
+ *
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <net/9p/9p.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_NET_9P_DEBUG
+unsigned int p9_debug_level;	/* feature-rific global debug level  */
+EXPORT_SYMBOL(p9_debug_level);
+module_param_named(debug, p9_debug_level, uint, 0);
+MODULE_PARM_DESC(debug, "9P debugging level");
+
+void _p9_debug(enum p9_debug_flags level, const char *func,
+	       const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if ((p9_debug_level & level) != level)
+		return;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (level == P9_DEBUG_9P)
+		pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf);
+	else
+		pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf);
+
+	va_end(args);
+}
+EXPORT_SYMBOL(_p9_debug);
+#endif
+
+/* Dynamic Transport Registration Routines */
+
+static DEFINE_SPINLOCK(v9fs_trans_lock);
+static LIST_HEAD(v9fs_trans_list);
+
+/**
+ * v9fs_register_trans - register a new transport with 9p
+ * @m: structure describing the transport module and entry points
+ *
+ */
+void v9fs_register_trans(struct p9_trans_module *m)
+{
+	spin_lock(&v9fs_trans_lock);
+	list_add_tail(&m->list, &v9fs_trans_list);
+	spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_register_trans);
+
+/**
+ * v9fs_unregister_trans - unregister a 9p transport
+ * @m: the transport to remove
+ *
+ */
+void v9fs_unregister_trans(struct p9_trans_module *m)
+{
+	spin_lock(&v9fs_trans_lock);
+	list_del_init(&m->list);
+	spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_unregister_trans);
+
+static struct p9_trans_module *_p9_get_trans_by_name(const char *s)
+{
+	struct p9_trans_module *t, *found = NULL;
+
+	spin_lock(&v9fs_trans_lock);
+
+	list_for_each_entry(t, &v9fs_trans_list, list)
+		if (strcmp(t->name, s) == 0 &&
+		    try_module_get(t->owner)) {
+			found = t;
+			break;
+		}
+
+	spin_unlock(&v9fs_trans_lock);
+
+	return found;
+}
+
+/**
+ * v9fs_get_trans_by_name - get transport with the matching name
+ * @s: string identifying transport
+ *
+ */
+struct p9_trans_module *v9fs_get_trans_by_name(const char *s)
+{
+	struct p9_trans_module *found = NULL;
+
+	found = _p9_get_trans_by_name(s);
+
+#ifdef CONFIG_MODULES
+	if (!found) {
+		request_module("9p-%s", s);
+		found = _p9_get_trans_by_name(s);
+	}
+#endif
+
+	return found;
+}
+EXPORT_SYMBOL(v9fs_get_trans_by_name);
+
+static const char * const v9fs_default_transports[] = {
+	"virtio", "tcp", "fd", "unix", "xen", "rdma",
+};
+
+/**
+ * v9fs_get_default_trans - get the default transport
+ *
+ */
+
+struct p9_trans_module *v9fs_get_default_trans(void)
+{
+	struct p9_trans_module *t, *found = NULL;
+	int i;
+
+	spin_lock(&v9fs_trans_lock);
+
+	list_for_each_entry(t, &v9fs_trans_list, list)
+		if (t->def && try_module_get(t->owner)) {
+			found = t;
+			break;
+		}
+
+	if (!found)
+		list_for_each_entry(t, &v9fs_trans_list, list)
+			if (try_module_get(t->owner)) {
+				found = t;
+				break;
+			}
+
+	spin_unlock(&v9fs_trans_lock);
+
+	for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++)
+		found = v9fs_get_trans_by_name(v9fs_default_transports[i]);
+
+	return found;
+}
+EXPORT_SYMBOL(v9fs_get_default_trans);
+
+/**
+ * v9fs_put_trans - put trans
+ * @m: transport to put
+ *
+ */
+void v9fs_put_trans(struct p9_trans_module *m)
+{
+	if (m)
+		module_put(m->owner);
+}
+
+/**
+ * init_p9 - Initialize module
+ *
+ */
+static int __init init_p9(void)
+{
+	int ret;
+
+	ret = p9_client_init();
+	if (ret)
+		return ret;
+
+	p9_error_init();
+	pr_info("Installing 9P2000 support\n");
+
+	return ret;
+}
+
+/**
+ * exit_p9 - shutdown module
+ *
+ */
+
+static void __exit exit_p9(void)
+{
+	pr_info("Unloading 9P2000 support\n");
+
+	p9_client_exit();
+}
+
+module_init(init_p9)
+module_exit(exit_p9)
+
+MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)");
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
new file mode 100644
index 0000000000..0e6603b1ec
--- /dev/null
+++ b/net/9p/protocol.c
@@ -0,0 +1,801 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 9P Protocol Support Code
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ *  Copyright (C) 2008 by IBM, Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "protocol.h"
+
+#include <trace/events/9p.h>
+
+/* len[2] text[len] */
+#define P9_STRLEN(s) \
+	(2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX))
+
+/**
+ * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the
+ * intended 9p message.
+ * @c: client
+ * @type: message type
+ * @fmt: format template for assembling request message
+ * (see p9pdu_vwritef)
+ * @ap: variable arguments to be fed to passed format template
+ * (see p9pdu_vwritef)
+ *
+ * Note: Even for response types (P9_R*) the format template and variable
+ * arguments must always be for the originating request type (P9_T*).
+ */
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+			const char *fmt, va_list ap)
+{
+	/* size[4] type[1] tag[2] */
+	const int hdr = 4 + 1 + 2;
+	/* ename[s] errno[4] */
+	const int rerror_size = hdr + P9_ERRMAX + 4;
+	/* ecode[4] */
+	const int rlerror_size = hdr + 4;
+	const int err_size =
+		c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size;
+
+	static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes "
+				  "a max. allowed directory entry name length of 4k");
+
+	switch (type) {
+
+	/* message types not used at all */
+	case P9_TERROR:
+	case P9_TLERROR:
+	case P9_TAUTH:
+	case P9_RAUTH:
+		BUG();
+
+	/* variable length & potentially large message types */
+	case P9_TATTACH:
+		BUG_ON(strcmp("ddss?u", fmt));
+		va_arg(ap, int32_t);
+		va_arg(ap, int32_t);
+		{
+			const char *uname = va_arg(ap, const char *);
+			const char *aname = va_arg(ap, const char *);
+			/* fid[4] afid[4] uname[s] aname[s] n_uname[4] */
+			return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4;
+		}
+	case P9_TWALK:
+		BUG_ON(strcmp("ddT", fmt));
+		va_arg(ap, int32_t);
+		va_arg(ap, int32_t);
+		{
+			uint i, nwname = va_arg(ap, int);
+			size_t wname_all;
+			const char **wnames = va_arg(ap, const char **);
+			for (i = 0, wname_all = 0; i < nwname; ++i) {
+				wname_all += P9_STRLEN(wnames[i]);
+			}
+			/* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+			return hdr + 4 + 4 + 2 + wname_all;
+		}
+	case P9_RWALK:
+		BUG_ON(strcmp("ddT", fmt));
+		va_arg(ap, int32_t);
+		va_arg(ap, int32_t);
+		{
+			uint nwname = va_arg(ap, int);
+			/* nwqid[2] nwqid*(wqid[13]) */
+			return max_t(size_t, hdr + 2 + nwname * 13, err_size);
+		}
+	case P9_TCREATE:
+		BUG_ON(strcmp("dsdb?s", fmt));
+		va_arg(ap, int32_t);
+		{
+			const char *name = va_arg(ap, const char *);
+			if (c->proto_version == p9_proto_legacy) {
+				/* fid[4] name[s] perm[4] mode[1] */
+				return hdr + 4 + P9_STRLEN(name) + 4 + 1;
+			} else {
+				va_arg(ap, int32_t);
+				va_arg(ap, int);
+				{
+					const char *ext = va_arg(ap, const char *);
+					/* fid[4] name[s] perm[4] mode[1] extension[s] */
+					return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext);
+				}
+			}
+		}
+	case P9_TLCREATE:
+		BUG_ON(strcmp("dsddg", fmt));
+		va_arg(ap, int32_t);
+		{
+			const char *name = va_arg(ap, const char *);
+			/* fid[4] name[s] flags[4] mode[4] gid[4] */
+			return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4;
+		}
+	case P9_RREAD:
+	case P9_RREADDIR:
+		BUG_ON(strcmp("dqd", fmt));
+		va_arg(ap, int32_t);
+		va_arg(ap, int64_t);
+		{
+			const int32_t count = va_arg(ap, int32_t);
+			/* count[4] data[count] */
+			return max_t(size_t, hdr + 4 + count, err_size);
+		}
+	case P9_TWRITE:
+		BUG_ON(strcmp("dqV", fmt));
+		va_arg(ap, int32_t);
+		va_arg(ap, int64_t);
+		{
+			const int32_t count = va_arg(ap, int32_t);
+			/* fid[4] offset[8] count[4] data[count] */
+			return hdr + 4 + 8 + 4 + count;
+		}
+	case P9_TRENAMEAT:
+		BUG_ON(strcmp("dsds", fmt));
+		va_arg(ap, int32_t);
+		{
+			const char *oldname, *newname;
+			oldname = va_arg(ap, const char *);
+			va_arg(ap, int32_t);
+			newname = va_arg(ap, const char *);
+			/* olddirfid[4] oldname[s] newdirfid[4] newname[s] */
+			return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname);
+		}
+	case P9_TSYMLINK:
+		BUG_ON(strcmp("dssg", fmt));
+		va_arg(ap, int32_t);
+		{
+			const char *name = va_arg(ap, const char *);
+			const char *symtgt = va_arg(ap, const char *);
+			/* fid[4] name[s] symtgt[s] gid[4] */
+			return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4;
+		}
+
+	case P9_RERROR:
+		return rerror_size;
+	case P9_RLERROR:
+		return rlerror_size;
+
+	/* small message types */
+	case P9_TWSTAT:
+	case P9_RSTAT:
+	case P9_RREADLINK:
+	case P9_TXATTRWALK:
+	case P9_TXATTRCREATE:
+	case P9_TLINK:
+	case P9_TMKDIR:
+	case P9_TMKNOD:
+	case P9_TRENAME:
+	case P9_TUNLINKAT:
+	case P9_TLOCK:
+		return 8 * 1024;
+
+	/* tiny message types */
+	default:
+		return 4 * 1024;
+
+	}
+}
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+
+void p9stat_free(struct p9_wstat *stbuf)
+{
+	kfree(stbuf->name);
+	stbuf->name = NULL;
+	kfree(stbuf->uid);
+	stbuf->uid = NULL;
+	kfree(stbuf->gid);
+	stbuf->gid = NULL;
+	kfree(stbuf->muid);
+	stbuf->muid = NULL;
+	kfree(stbuf->extension);
+	stbuf->extension = NULL;
+}
+EXPORT_SYMBOL(p9stat_free);
+
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
+{
+	size_t len = min(pdu->size - pdu->offset, size);
+
+	memcpy(data, &pdu->sdata[pdu->offset], len);
+	pdu->offset += len;
+	return size - len;
+}
+
+static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size)
+{
+	size_t len = min(pdu->capacity - pdu->size, size);
+
+	memcpy(&pdu->sdata[pdu->size], data, len);
+	pdu->size += len;
+	return size - len;
+}
+
+static size_t
+pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size)
+{
+	size_t len = min(pdu->capacity - pdu->size, size);
+
+	if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, from))
+		len = 0;
+
+	pdu->size += len;
+	return size - len;
+}
+
+/*	b - int8_t
+ *	w - int16_t
+ *	d - int32_t
+ *	q - int64_t
+ *	s - string
+ *	u - numeric uid
+ *	g - numeric gid
+ *	S - stat
+ *	Q - qid
+ *	D - data blob (int32_t size followed by void *, results are not freed)
+ *	T - array of strings (int16_t count, followed by strings)
+ *	R - array of qids (int16_t count, followed by qids)
+ *	A - stat for 9p2000.L (p9_stat_dotl)
+ *	? - if optional = 1, continue parsing
+ */
+
+static int
+p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
+	     va_list ap)
+{
+	const char *ptr;
+	int errcode = 0;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':{
+				int8_t *val = va_arg(ap, int8_t *);
+				if (pdu_read(pdu, val, sizeof(*val))) {
+					errcode = -EFAULT;
+					break;
+				}
+			}
+			break;
+		case 'w':{
+				int16_t *val = va_arg(ap, int16_t *);
+				__le16 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le16_to_cpu(le_val);
+			}
+			break;
+		case 'd':{
+				int32_t *val = va_arg(ap, int32_t *);
+				__le32 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le32_to_cpu(le_val);
+			}
+			break;
+		case 'q':{
+				int64_t *val = va_arg(ap, int64_t *);
+				__le64 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le64_to_cpu(le_val);
+			}
+			break;
+		case 's':{
+				char **sptr = va_arg(ap, char **);
+				uint16_t len;
+
+				errcode = p9pdu_readf(pdu, proto_version,
+								"w", &len);
+				if (errcode)
+					break;
+
+				*sptr = kmalloc(len + 1, GFP_NOFS);
+				if (*sptr == NULL) {
+					errcode = -ENOMEM;
+					break;
+				}
+				if (pdu_read(pdu, *sptr, len)) {
+					errcode = -EFAULT;
+					kfree(*sptr);
+					*sptr = NULL;
+				} else
+					(*sptr)[len] = 0;
+			}
+			break;
+		case 'u': {
+				kuid_t *uid = va_arg(ap, kuid_t *);
+				__le32 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*uid = make_kuid(&init_user_ns,
+						 le32_to_cpu(le_val));
+			} break;
+		case 'g': {
+				kgid_t *gid = va_arg(ap, kgid_t *);
+				__le32 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*gid = make_kgid(&init_user_ns,
+						 le32_to_cpu(le_val));
+			} break;
+		case 'Q':{
+				struct p9_qid *qid =
+				    va_arg(ap, struct p9_qid *);
+
+				errcode = p9pdu_readf(pdu, proto_version, "bdq",
+						      &qid->type, &qid->version,
+						      &qid->path);
+			}
+			break;
+		case 'S':{
+				struct p9_wstat *stbuf =
+				    va_arg(ap, struct p9_wstat *);
+
+				memset(stbuf, 0, sizeof(struct p9_wstat));
+				stbuf->n_uid = stbuf->n_muid = INVALID_UID;
+				stbuf->n_gid = INVALID_GID;
+
+				errcode =
+				    p9pdu_readf(pdu, proto_version,
+						"wwdQdddqssss?sugu",
+						&stbuf->size, &stbuf->type,
+						&stbuf->dev, &stbuf->qid,
+						&stbuf->mode, &stbuf->atime,
+						&stbuf->mtime, &stbuf->length,
+						&stbuf->name, &stbuf->uid,
+						&stbuf->gid, &stbuf->muid,
+						&stbuf->extension,
+						&stbuf->n_uid, &stbuf->n_gid,
+						&stbuf->n_muid);
+				if (errcode)
+					p9stat_free(stbuf);
+			}
+			break;
+		case 'D':{
+				uint32_t *count = va_arg(ap, uint32_t *);
+				void **data = va_arg(ap, void **);
+
+				errcode =
+				    p9pdu_readf(pdu, proto_version, "d", count);
+				if (!errcode) {
+					*count =
+					    min_t(uint32_t, *count,
+						  pdu->size - pdu->offset);
+					*data = &pdu->sdata[pdu->offset];
+				}
+			}
+			break;
+		case 'T':{
+				uint16_t *nwname = va_arg(ap, uint16_t *);
+				char ***wnames = va_arg(ap, char ***);
+
+				*wnames = NULL;
+
+				errcode = p9pdu_readf(pdu, proto_version,
+								"w", nwname);
+				if (!errcode) {
+					*wnames =
+					    kmalloc_array(*nwname,
+							  sizeof(char *),
+							  GFP_NOFS);
+					if (!*wnames)
+						errcode = -ENOMEM;
+					else
+						(*wnames)[0] = NULL;
+				}
+
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < *nwname; i++) {
+						errcode =
+						    p9pdu_readf(pdu,
+								proto_version,
+								"s",
+								&(*wnames)[i]);
+						if (errcode) {
+							(*wnames)[i] = NULL;
+							break;
+						}
+					}
+				}
+
+				if (errcode) {
+					if (*wnames) {
+						int i;
+
+						for (i = 0; i < *nwname; i++) {
+							if (!(*wnames)[i])
+								break;
+							kfree((*wnames)[i]);
+						}
+						kfree(*wnames);
+						*wnames = NULL;
+					}
+				}
+			}
+			break;
+		case 'R':{
+				uint16_t *nwqid = va_arg(ap, uint16_t *);
+				struct p9_qid **wqids =
+				    va_arg(ap, struct p9_qid **);
+
+				*wqids = NULL;
+
+				errcode =
+				    p9pdu_readf(pdu, proto_version, "w", nwqid);
+				if (!errcode) {
+					*wqids =
+					    kmalloc_array(*nwqid,
+							  sizeof(struct p9_qid),
+							  GFP_NOFS);
+					if (*wqids == NULL)
+						errcode = -ENOMEM;
+				}
+
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < *nwqid; i++) {
+						errcode =
+						    p9pdu_readf(pdu,
+								proto_version,
+								"Q",
+								&(*wqids)[i]);
+						if (errcode)
+							break;
+					}
+				}
+
+				if (errcode) {
+					kfree(*wqids);
+					*wqids = NULL;
+				}
+			}
+			break;
+		case 'A': {
+				struct p9_stat_dotl *stbuf =
+				    va_arg(ap, struct p9_stat_dotl *);
+
+				memset(stbuf, 0, sizeof(struct p9_stat_dotl));
+				errcode =
+				    p9pdu_readf(pdu, proto_version,
+					"qQdugqqqqqqqqqqqqqqq",
+					&stbuf->st_result_mask,
+					&stbuf->qid,
+					&stbuf->st_mode,
+					&stbuf->st_uid, &stbuf->st_gid,
+					&stbuf->st_nlink,
+					&stbuf->st_rdev, &stbuf->st_size,
+					&stbuf->st_blksize, &stbuf->st_blocks,
+					&stbuf->st_atime_sec,
+					&stbuf->st_atime_nsec,
+					&stbuf->st_mtime_sec,
+					&stbuf->st_mtime_nsec,
+					&stbuf->st_ctime_sec,
+					&stbuf->st_ctime_nsec,
+					&stbuf->st_btime_sec,
+					&stbuf->st_btime_nsec,
+					&stbuf->st_gen,
+					&stbuf->st_data_version);
+			}
+			break;
+		case '?':
+			if ((proto_version != p9_proto_2000u) &&
+				(proto_version != p9_proto_2000L))
+				return 0;
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		if (errcode)
+			break;
+	}
+
+	return errcode;
+}
+
+int
+p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+	va_list ap)
+{
+	const char *ptr;
+	int errcode = 0;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':{
+				int8_t val = va_arg(ap, int);
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'w':{
+				__le16 val = cpu_to_le16(va_arg(ap, int));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'd':{
+				__le32 val = cpu_to_le32(va_arg(ap, int32_t));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'q':{
+				__le64 val = cpu_to_le64(va_arg(ap, int64_t));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 's':{
+				const char *sptr = va_arg(ap, const char *);
+				uint16_t len = 0;
+				if (sptr)
+					len = min_t(size_t, strlen(sptr),
+								USHRT_MAX);
+
+				errcode = p9pdu_writef(pdu, proto_version,
+								"w", len);
+				if (!errcode && pdu_write(pdu, sptr, len))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'u': {
+				kuid_t uid = va_arg(ap, kuid_t);
+				__le32 val = cpu_to_le32(
+						from_kuid(&init_user_ns, uid));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			} break;
+		case 'g': {
+				kgid_t gid = va_arg(ap, kgid_t);
+				__le32 val = cpu_to_le32(
+						from_kgid(&init_user_ns, gid));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			} break;
+		case 'Q':{
+				const struct p9_qid *qid =
+				    va_arg(ap, const struct p9_qid *);
+				errcode =
+				    p9pdu_writef(pdu, proto_version, "bdq",
+						 qid->type, qid->version,
+						 qid->path);
+			} break;
+		case 'S':{
+				const struct p9_wstat *stbuf =
+				    va_arg(ap, const struct p9_wstat *);
+				errcode =
+				    p9pdu_writef(pdu, proto_version,
+						 "wwdQdddqssss?sugu",
+						 stbuf->size, stbuf->type,
+						 stbuf->dev, &stbuf->qid,
+						 stbuf->mode, stbuf->atime,
+						 stbuf->mtime, stbuf->length,
+						 stbuf->name, stbuf->uid,
+						 stbuf->gid, stbuf->muid,
+						 stbuf->extension, stbuf->n_uid,
+						 stbuf->n_gid, stbuf->n_muid);
+			} break;
+		case 'V':{
+				uint32_t count = va_arg(ap, uint32_t);
+				struct iov_iter *from =
+						va_arg(ap, struct iov_iter *);
+				errcode = p9pdu_writef(pdu, proto_version, "d",
+									count);
+				if (!errcode && pdu_write_u(pdu, from, count))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'T':{
+				uint16_t nwname = va_arg(ap, int);
+				const char **wnames = va_arg(ap, const char **);
+
+				errcode = p9pdu_writef(pdu, proto_version, "w",
+									nwname);
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < nwname; i++) {
+						errcode =
+						    p9pdu_writef(pdu,
+								proto_version,
+								 "s",
+								 wnames[i]);
+						if (errcode)
+							break;
+					}
+				}
+			}
+			break;
+		case 'R':{
+				uint16_t nwqid = va_arg(ap, int);
+				struct p9_qid *wqids =
+				    va_arg(ap, struct p9_qid *);
+
+				errcode = p9pdu_writef(pdu, proto_version, "w",
+									nwqid);
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < nwqid; i++) {
+						errcode =
+						    p9pdu_writef(pdu,
+								proto_version,
+								 "Q",
+								 &wqids[i]);
+						if (errcode)
+							break;
+					}
+				}
+			}
+			break;
+		case 'I':{
+				struct p9_iattr_dotl *p9attr = va_arg(ap,
+							struct p9_iattr_dotl *);
+
+				errcode = p9pdu_writef(pdu, proto_version,
+							"ddugqqqqq",
+							p9attr->valid,
+							p9attr->mode,
+							p9attr->uid,
+							p9attr->gid,
+							p9attr->size,
+							p9attr->atime_sec,
+							p9attr->atime_nsec,
+							p9attr->mtime_sec,
+							p9attr->mtime_nsec);
+			}
+			break;
+		case '?':
+			if ((proto_version != p9_proto_2000u) &&
+				(proto_version != p9_proto_2000L))
+				return 0;
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		if (errcode)
+			break;
+	}
+
+	return errcode;
+}
+
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	va_start(ap, fmt);
+	ret = p9pdu_vreadf(pdu, proto_version, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+
+	va_start(ap, fmt);
+	ret = p9pdu_vwritef(pdu, proto_version, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st)
+{
+	struct p9_fcall fake_pdu;
+	int ret;
+
+	fake_pdu.size = len;
+	fake_pdu.capacity = len;
+	fake_pdu.sdata = buf;
+	fake_pdu.offset = 0;
+
+	ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st);
+	if (ret) {
+		p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret);
+		trace_9p_protocol_dump(clnt, &fake_pdu);
+		return ret;
+	}
+
+	return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9stat_read);
+
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type)
+{
+	pdu->id = type;
+	return p9pdu_writef(pdu, 0, "dbw", 0, type, tag);
+}
+
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu)
+{
+	int size = pdu->size;
+	int err;
+
+	pdu->size = 0;
+	err = p9pdu_writef(pdu, 0, "d", size);
+	pdu->size = size;
+
+	trace_9p_protocol_dump(clnt, pdu);
+	p9_debug(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n",
+		 pdu->size, pdu->id, pdu->tag);
+
+	return err;
+}
+
+void p9pdu_reset(struct p9_fcall *pdu)
+{
+	pdu->offset = 0;
+	pdu->size = 0;
+}
+
+int p9dirent_read(struct p9_client *clnt, char *buf, int len,
+		  struct p9_dirent *dirent)
+{
+	struct p9_fcall fake_pdu;
+	int ret;
+	char *nameptr;
+
+	fake_pdu.size = len;
+	fake_pdu.capacity = len;
+	fake_pdu.sdata = buf;
+	fake_pdu.offset = 0;
+
+	ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid,
+			  &dirent->d_off, &dirent->d_type, &nameptr);
+	if (ret) {
+		p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
+		trace_9p_protocol_dump(clnt, &fake_pdu);
+		return ret;
+	}
+
+	ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name));
+	if (ret < 0) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "On the wire dirent name too long: %s\n",
+			 nameptr);
+		kfree(nameptr);
+		return ret;
+	}
+	kfree(nameptr);
+
+	return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
new file mode 100644
index 0000000000..ad2283d1f9
--- /dev/null
+++ b/net/9p/protocol.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * 9P Protocol Support Code
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ *  Copyright (C) 2008 by IBM, Corp.
+ */
+
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+			const char *fmt, va_list ap);
+int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+		  va_list ap);
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu);
+void p9pdu_reset(struct p9_fcall *pdu);
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size);
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
new file mode 100644
index 0000000000..c827f69455
--- /dev/null
+++ b/net/9p/trans_common.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include "trans_common.h"
+
+/**
+ * p9_release_pages - Release pages after the transaction.
+ * @pages: array of pages to be put
+ * @nr_pages: size of array
+ */
+void p9_release_pages(struct page **pages, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++)
+		if (pages[i])
+			put_page(pages[i]);
+}
+EXPORT_SYMBOL(p9_release_pages);
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
new file mode 100644
index 0000000000..32134db6ab
--- /dev/null
+++ b/net/9p/trans_common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ */
+
+void p9_release_pages(struct page **pages, int nr_pages);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
new file mode 100644
index 0000000000..d0eb03ada7
--- /dev/null
+++ b/net/9p/trans_fd.c
@@ -0,0 +1,1205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fd transport layer.  Includes deprecated socket layer.
+ *
+ *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#include <linux/syscalls.h> /* killme */
+
+#define P9_PORT 564
+#define MAX_SOCK_BUF (1024*1024)
+#define MAXPOLLWADDR	2
+
+static struct p9_trans_module p9_tcp_trans;
+static struct p9_trans_module p9_fd_trans;
+
+/**
+ * struct p9_fd_opts - per-transport options
+ * @rfd: file descriptor for reading (trans=fd)
+ * @wfd: file descriptor for writing (trans=fd)
+ * @port: port to connect to (trans=tcp)
+ * @privport: port is privileged
+ */
+
+struct p9_fd_opts {
+	int rfd;
+	int wfd;
+	u16 port;
+	bool privport;
+};
+
+/*
+  * Option Parsing (code inspired by NFS code)
+  *  - a little lazy - parse all fd-transport options
+  */
+
+enum {
+	/* Options that take integer arguments */
+	Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
+	/* Options that take no arguments */
+	Opt_privport,
+};
+
+static const match_table_t tokens = {
+	{Opt_port, "port=%u"},
+	{Opt_rfdno, "rfdno=%u"},
+	{Opt_wfdno, "wfdno=%u"},
+	{Opt_privport, "privport"},
+	{Opt_err, NULL},
+};
+
+enum {
+	Rworksched = 1,		/* read work scheduled or running */
+	Rpending = 2,		/* can read */
+	Wworksched = 4,		/* write work scheduled or running */
+	Wpending = 8,		/* can write */
+};
+
+struct p9_poll_wait {
+	struct p9_conn *conn;
+	wait_queue_entry_t wait;
+	wait_queue_head_t *wait_addr;
+};
+
+/**
+ * struct p9_conn - fd mux connection state information
+ * @mux_list: list link for mux to manage multiple connections (?)
+ * @client: reference to client instance for this connection
+ * @err: error state
+ * @req_lock: lock protecting req_list and requests statuses
+ * @req_list: accounting for requests which have been sent
+ * @unsent_req_list: accounting for requests that haven't been sent
+ * @rreq: read request
+ * @wreq: write request
+ * @req: current request being processed (if any)
+ * @tmp_buf: temporary buffer to read in header
+ * @rc: temporary fcall for reading current frame
+ * @wpos: write position for current frame
+ * @wsize: amount of data to write for current frame
+ * @wbuf: current write buffer
+ * @poll_pending_link: pending links to be polled per conn
+ * @poll_wait: array of wait_q's for various worker threads
+ * @pt: poll state
+ * @rq: current read work
+ * @wq: current write work
+ * @wsched: ????
+ *
+ */
+
+struct p9_conn {
+	struct list_head mux_list;
+	struct p9_client *client;
+	int err;
+	spinlock_t req_lock;
+	struct list_head req_list;
+	struct list_head unsent_req_list;
+	struct p9_req_t *rreq;
+	struct p9_req_t *wreq;
+	char tmp_buf[P9_HDRSZ];
+	struct p9_fcall rc;
+	int wpos;
+	int wsize;
+	char *wbuf;
+	struct list_head poll_pending_link;
+	struct p9_poll_wait poll_wait[MAXPOLLWADDR];
+	poll_table pt;
+	struct work_struct rq;
+	struct work_struct wq;
+	unsigned long wsched;
+};
+
+/**
+ * struct p9_trans_fd - transport state
+ * @rd: reference to file to read from
+ * @wr: reference of file to write to
+ * @conn: connection state reference
+ *
+ */
+
+struct p9_trans_fd {
+	struct file *rd;
+	struct file *wr;
+	struct p9_conn conn;
+};
+
+static void p9_poll_workfn(struct work_struct *work);
+
+static DEFINE_SPINLOCK(p9_poll_lock);
+static LIST_HEAD(p9_poll_pending_list);
+static DECLARE_WORK(p9_poll_work, p9_poll_workfn);
+
+static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT;
+static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT;
+
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+		struct p9_poll_wait *pwait = &m->poll_wait[i];
+
+		if (pwait->wait_addr) {
+			remove_wait_queue(pwait->wait_addr, &pwait->wait);
+			pwait->wait_addr = NULL;
+		}
+	}
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	list_del_init(&m->poll_pending_link);
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+	flush_work(&p9_poll_work);
+}
+
+/**
+ * p9_conn_cancel - cancel all pending requests with error
+ * @m: mux data
+ * @err: error code
+ *
+ */
+
+static void p9_conn_cancel(struct p9_conn *m, int err)
+{
+	struct p9_req_t *req, *rtmp;
+	LIST_HEAD(cancel_list);
+
+	p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
+
+	spin_lock(&m->req_lock);
+
+	if (m->err) {
+		spin_unlock(&m->req_lock);
+		return;
+	}
+
+	m->err = err;
+
+	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
+		list_move(&req->req_list, &cancel_list);
+		WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+	}
+	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
+		list_move(&req->req_list, &cancel_list);
+		WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+	}
+
+	spin_unlock(&m->req_lock);
+
+	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
+		p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
+		list_del(&req->req_list);
+		if (!req->t_err)
+			req->t_err = err;
+		p9_client_cb(m->client, req, REQ_STATUS_ERROR);
+	}
+}
+
+static __poll_t
+p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
+{
+	__poll_t ret;
+	struct p9_trans_fd *ts = NULL;
+
+	if (client && client->status == Connected)
+		ts = client->trans;
+
+	if (!ts) {
+		if (err)
+			*err = -EREMOTEIO;
+		return EPOLLERR;
+	}
+
+	ret = vfs_poll(ts->rd, pt);
+	if (ts->rd != ts->wr)
+		ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
+	return ret;
+}
+
+/**
+ * p9_fd_read- read from a fd
+ * @client: client instance
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int p9_fd_read(struct p9_client *client, void *v, int len)
+{
+	int ret;
+	struct p9_trans_fd *ts = NULL;
+	loff_t pos;
+
+	if (client && client->status != Disconnected)
+		ts = client->trans;
+
+	if (!ts)
+		return -EREMOTEIO;
+
+	if (!(ts->rd->f_flags & O_NONBLOCK))
+		p9_debug(P9_DEBUG_ERROR, "blocking read ...\n");
+
+	pos = ts->rd->f_pos;
+	ret = kernel_read(ts->rd, v, len, &pos);
+	if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+		client->status = Disconnected;
+	return ret;
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ * @work: container of work to be done
+ *
+ */
+
+static void p9_read_work(struct work_struct *work)
+{
+	__poll_t n;
+	int err;
+	struct p9_conn *m;
+
+	m = container_of(work, struct p9_conn, rq);
+
+	if (m->err < 0)
+		return;
+
+	p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
+
+	if (!m->rc.sdata) {
+		m->rc.sdata = m->tmp_buf;
+		m->rc.offset = 0;
+		m->rc.capacity = P9_HDRSZ; /* start by reading header */
+	}
+
+	clear_bit(Rpending, &m->wsched);
+	p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
+		 m, m->rc.offset, m->rc.capacity,
+		 m->rc.capacity - m->rc.offset);
+	err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
+			 m->rc.capacity - m->rc.offset);
+	p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
+	if (err == -EAGAIN)
+		goto end_clear;
+
+	if (err <= 0)
+		goto error;
+
+	m->rc.offset += err;
+
+	/* header read in */
+	if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) {
+		p9_debug(P9_DEBUG_TRANS, "got new header\n");
+
+		/* Header size */
+		m->rc.size = P9_HDRSZ;
+		err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
+		if (err) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "error parsing header: %d\n", err);
+			goto error;
+		}
+
+		p9_debug(P9_DEBUG_TRANS,
+			 "mux %p pkt: size: %d bytes tag: %d\n",
+			 m, m->rc.size, m->rc.tag);
+
+		m->rreq = p9_tag_lookup(m->client, m->rc.tag);
+		if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) {
+			p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
+				 m->rc.tag);
+			err = -EIO;
+			goto error;
+		}
+
+		if (m->rc.size > m->rreq->rc.capacity) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "requested packet size too big: %d for tag %d with capacity %zd\n",
+				 m->rc.size, m->rc.tag, m->rreq->rc.capacity);
+			err = -EIO;
+			goto error;
+		}
+
+		if (!m->rreq->rc.sdata) {
+			p9_debug(P9_DEBUG_ERROR,
+				 "No recv fcall for tag %d (req %p), disconnecting!\n",
+				 m->rc.tag, m->rreq);
+			p9_req_put(m->client, m->rreq);
+			m->rreq = NULL;
+			err = -EIO;
+			goto error;
+		}
+		m->rc.sdata = m->rreq->rc.sdata;
+		memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
+		m->rc.capacity = m->rc.size;
+	}
+
+	/* packet is read in
+	 * not an else because some packets (like clunk) have no payload
+	 */
+	if ((m->rreq) && (m->rc.offset == m->rc.capacity)) {
+		p9_debug(P9_DEBUG_TRANS, "got new packet\n");
+		m->rreq->rc.size = m->rc.offset;
+		spin_lock(&m->req_lock);
+		if (m->rreq->status == REQ_STATUS_SENT) {
+			list_del(&m->rreq->req_list);
+			p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD);
+		} else if (m->rreq->status == REQ_STATUS_FLSHD) {
+			/* Ignore replies associated with a cancelled request. */
+			p9_debug(P9_DEBUG_TRANS,
+				 "Ignore replies associated with a cancelled request\n");
+		} else {
+			spin_unlock(&m->req_lock);
+			p9_debug(P9_DEBUG_ERROR,
+				 "Request tag %d errored out while we were reading the reply\n",
+				 m->rc.tag);
+			err = -EIO;
+			goto error;
+		}
+		spin_unlock(&m->req_lock);
+		m->rc.sdata = NULL;
+		m->rc.offset = 0;
+		m->rc.capacity = 0;
+		p9_req_put(m->client, m->rreq);
+		m->rreq = NULL;
+	}
+
+end_clear:
+	clear_bit(Rworksched, &m->wsched);
+
+	if (!list_empty(&m->req_list)) {
+		if (test_and_clear_bit(Rpending, &m->wsched))
+			n = EPOLLIN;
+		else
+			n = p9_fd_poll(m->client, NULL, NULL);
+
+		if ((n & EPOLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
+			p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+			schedule_work(&m->rq);
+		}
+	}
+
+	return;
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Rworksched, &m->wsched);
+}
+
+/**
+ * p9_fd_write - write to a socket
+ * @client: client instance
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int p9_fd_write(struct p9_client *client, void *v, int len)
+{
+	ssize_t ret;
+	struct p9_trans_fd *ts = NULL;
+
+	if (client && client->status != Disconnected)
+		ts = client->trans;
+
+	if (!ts)
+		return -EREMOTEIO;
+
+	if (!(ts->wr->f_flags & O_NONBLOCK))
+		p9_debug(P9_DEBUG_ERROR, "blocking write ...\n");
+
+	ret = kernel_write(ts->wr, v, len, &ts->wr->f_pos);
+	if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+		client->status = Disconnected;
+	return ret;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ * @work: container for work to be done
+ *
+ */
+
+static void p9_write_work(struct work_struct *work)
+{
+	__poll_t n;
+	int err;
+	struct p9_conn *m;
+	struct p9_req_t *req;
+
+	m = container_of(work, struct p9_conn, wq);
+
+	if (m->err < 0) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (!m->wsize) {
+		spin_lock(&m->req_lock);
+		if (list_empty(&m->unsent_req_list)) {
+			clear_bit(Wworksched, &m->wsched);
+			spin_unlock(&m->req_lock);
+			return;
+		}
+
+		req = list_entry(m->unsent_req_list.next, struct p9_req_t,
+			       req_list);
+		WRITE_ONCE(req->status, REQ_STATUS_SENT);
+		p9_debug(P9_DEBUG_TRANS, "move req %p\n", req);
+		list_move_tail(&req->req_list, &m->req_list);
+
+		m->wbuf = req->tc.sdata;
+		m->wsize = req->tc.size;
+		m->wpos = 0;
+		p9_req_get(req);
+		m->wreq = req;
+		spin_unlock(&m->req_lock);
+	}
+
+	p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
+		 m, m->wpos, m->wsize);
+	clear_bit(Wpending, &m->wsched);
+	err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos);
+	p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err);
+	if (err == -EAGAIN)
+		goto end_clear;
+
+
+	if (err < 0)
+		goto error;
+	else if (err == 0) {
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	m->wpos += err;
+	if (m->wpos == m->wsize) {
+		m->wpos = m->wsize = 0;
+		p9_req_put(m->client, m->wreq);
+		m->wreq = NULL;
+	}
+
+end_clear:
+	clear_bit(Wworksched, &m->wsched);
+
+	if (m->wsize || !list_empty(&m->unsent_req_list)) {
+		if (test_and_clear_bit(Wpending, &m->wsched))
+			n = EPOLLOUT;
+		else
+			n = p9_fd_poll(m->client, NULL, NULL);
+
+		if ((n & EPOLLOUT) &&
+		   !test_and_set_bit(Wworksched, &m->wsched)) {
+			p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+			schedule_work(&m->wq);
+		}
+	}
+
+	return;
+
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Wworksched, &m->wsched);
+}
+
+static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+	struct p9_poll_wait *pwait =
+		container_of(wait, struct p9_poll_wait, wait);
+	struct p9_conn *m = pwait->conn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	if (list_empty(&m->poll_pending_link))
+		list_add_tail(&m->poll_pending_link, &p9_poll_pending_list);
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+	schedule_work(&p9_poll_work);
+	return 1;
+}
+
+/**
+ * p9_pollwait - add poll task to the wait queue
+ * @filp: file pointer being polled
+ * @wait_address: wait_q to block on
+ * @p: poll state
+ *
+ * called by files poll operation to add v9fs-poll task to files wait queue
+ */
+
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	struct p9_conn *m = container_of(p, struct p9_conn, pt);
+	struct p9_poll_wait *pwait = NULL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+		if (m->poll_wait[i].wait_addr == NULL) {
+			pwait = &m->poll_wait[i];
+			break;
+		}
+	}
+
+	if (!pwait) {
+		p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+		return;
+	}
+
+	pwait->conn = m;
+	pwait->wait_addr = wait_address;
+	init_waitqueue_func_entry(&pwait->wait, p9_pollwake);
+	add_wait_queue(wait_address, &pwait->wait);
+}
+
+/**
+ * p9_conn_create - initialize the per-session mux data
+ * @client: client instance
+ *
+ * Note: Creates the polling task if this is the first session.
+ */
+
+static void p9_conn_create(struct p9_client *client)
+{
+	__poll_t n;
+	struct p9_trans_fd *ts = client->trans;
+	struct p9_conn *m = &ts->conn;
+
+	p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize);
+
+	INIT_LIST_HEAD(&m->mux_list);
+	m->client = client;
+
+	spin_lock_init(&m->req_lock);
+	INIT_LIST_HEAD(&m->req_list);
+	INIT_LIST_HEAD(&m->unsent_req_list);
+	INIT_WORK(&m->rq, p9_read_work);
+	INIT_WORK(&m->wq, p9_write_work);
+	INIT_LIST_HEAD(&m->poll_pending_link);
+	init_poll_funcptr(&m->pt, p9_pollwait);
+
+	n = p9_fd_poll(client, &m->pt, NULL);
+	if (n & EPOLLIN) {
+		p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+		set_bit(Rpending, &m->wsched);
+	}
+
+	if (n & EPOLLOUT) {
+		p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+		set_bit(Wpending, &m->wsched);
+	}
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ * @m: connection to poll
+ *
+ */
+
+static void p9_poll_mux(struct p9_conn *m)
+{
+	__poll_t n;
+	int err = -ECONNRESET;
+
+	if (m->err < 0)
+		return;
+
+	n = p9_fd_poll(m->client, NULL, &err);
+	if (n & (EPOLLERR | EPOLLHUP | EPOLLNVAL)) {
+		p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n);
+		p9_conn_cancel(m, err);
+	}
+
+	if (n & EPOLLIN) {
+		set_bit(Rpending, &m->wsched);
+		p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+		if (!test_and_set_bit(Rworksched, &m->wsched)) {
+			p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+			schedule_work(&m->rq);
+		}
+	}
+
+	if (n & EPOLLOUT) {
+		set_bit(Wpending, &m->wsched);
+		p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+		if ((m->wsize || !list_empty(&m->unsent_req_list)) &&
+		    !test_and_set_bit(Wworksched, &m->wsched)) {
+			p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+			schedule_work(&m->wq);
+		}
+	}
+}
+
+/**
+ * p9_fd_request - send 9P request
+ * The function can sleep until the request is scheduled for sending.
+ * The function can be interrupted. Return from the function is not
+ * a guarantee that the request is sent successfully.
+ *
+ * @client: client instance
+ * @req: request to be sent
+ *
+ */
+
+static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
+{
+	__poll_t n;
+	struct p9_trans_fd *ts = client->trans;
+	struct p9_conn *m = &ts->conn;
+
+	p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n",
+		 m, current, &req->tc, req->tc.id);
+	if (m->err < 0)
+		return m->err;
+
+	spin_lock(&m->req_lock);
+	WRITE_ONCE(req->status, REQ_STATUS_UNSENT);
+	list_add_tail(&req->req_list, &m->unsent_req_list);
+	spin_unlock(&m->req_lock);
+
+	if (test_and_clear_bit(Wpending, &m->wsched))
+		n = EPOLLOUT;
+	else
+		n = p9_fd_poll(m->client, NULL, NULL);
+
+	if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
+		schedule_work(&m->wq);
+
+	return 0;
+}
+
+static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_fd *ts = client->trans;
+	struct p9_conn *m = &ts->conn;
+	int ret = 1;
+
+	p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+	spin_lock(&m->req_lock);
+
+	if (req->status == REQ_STATUS_UNSENT) {
+		list_del(&req->req_list);
+		WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+		p9_req_put(client, req);
+		ret = 0;
+	}
+	spin_unlock(&m->req_lock);
+
+	return ret;
+}
+
+static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_fd *ts = client->trans;
+	struct p9_conn *m = &ts->conn;
+
+	p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+	spin_lock(&m->req_lock);
+	/* Ignore cancelled request if message has been received
+	 * before lock.
+	 */
+	if (req->status == REQ_STATUS_RCVD) {
+		spin_unlock(&m->req_lock);
+		return 0;
+	}
+
+	/* we haven't received a response for oldreq,
+	 * remove it from the list.
+	 */
+	list_del(&req->req_list);
+	WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+	spin_unlock(&m->req_lock);
+
+	p9_req_put(client, req);
+
+	return 0;
+}
+
+static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
+{
+	if (clnt->trans_mod == &p9_tcp_trans) {
+		if (clnt->trans_opts.tcp.port != P9_PORT)
+			seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port);
+	} else if (clnt->trans_mod == &p9_fd_trans) {
+		if (clnt->trans_opts.fd.rfd != ~0)
+			seq_printf(m, ",rfd=%u", clnt->trans_opts.fd.rfd);
+		if (clnt->trans_opts.fd.wfd != ~0)
+			seq_printf(m, ",wfd=%u", clnt->trans_opts.fd.wfd);
+	}
+	return 0;
+}
+
+/**
+ * parse_opts - parse mount options into p9_fd_opts structure
+ * @params: options string passed from mount
+ * @opts: fd transport-specific structure to parse options into
+ *
+ * Returns 0 upon success, -ERRNO upon failure
+ */
+
+static int parse_opts(char *params, struct p9_fd_opts *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *options, *tmp_options;
+
+	opts->port = P9_PORT;
+	opts->rfd = ~0;
+	opts->wfd = ~0;
+	opts->privport = false;
+
+	if (!params)
+		return 0;
+
+	tmp_options = kstrdup(params, GFP_KERNEL);
+	if (!tmp_options) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "failed to allocate copy of option string\n");
+		return -ENOMEM;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		int r;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if ((token != Opt_err) && (token != Opt_privport)) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_port:
+			opts->port = option;
+			break;
+		case Opt_rfdno:
+			opts->rfd = option;
+			break;
+		case Opt_wfdno:
+			opts->wfd = option;
+			break;
+		case Opt_privport:
+			opts->privport = true;
+			break;
+		default:
+			continue;
+		}
+	}
+
+	kfree(tmp_options);
+	return 0;
+}
+
+static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
+{
+	struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
+					   GFP_KERNEL);
+	if (!ts)
+		return -ENOMEM;
+
+	ts->rd = fget(rfd);
+	if (!ts->rd)
+		goto out_free_ts;
+	if (!(ts->rd->f_mode & FMODE_READ))
+		goto out_put_rd;
+	/* Prevent workers from hanging on IO when fd is a pipe.
+	 * It's technically possible for userspace or concurrent mounts to
+	 * modify this flag concurrently, which will likely result in a
+	 * broken filesystem. However, just having bad flags here should
+	 * not crash the kernel or cause any other sort of bug, so mark this
+	 * particular data race as intentional so that tooling (like KCSAN)
+	 * can allow it and detect further problems.
+	 */
+	data_race(ts->rd->f_flags |= O_NONBLOCK);
+	ts->wr = fget(wfd);
+	if (!ts->wr)
+		goto out_put_rd;
+	if (!(ts->wr->f_mode & FMODE_WRITE))
+		goto out_put_wr;
+	data_race(ts->wr->f_flags |= O_NONBLOCK);
+
+	client->trans = ts;
+	client->status = Connected;
+
+	return 0;
+
+out_put_wr:
+	fput(ts->wr);
+out_put_rd:
+	fput(ts->rd);
+out_free_ts:
+	kfree(ts);
+	return -EIO;
+}
+
+static int p9_socket_open(struct p9_client *client, struct socket *csocket)
+{
+	struct p9_trans_fd *p;
+	struct file *file;
+
+	p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+	if (!p) {
+		sock_release(csocket);
+		return -ENOMEM;
+	}
+
+	csocket->sk->sk_allocation = GFP_NOIO;
+	csocket->sk->sk_use_task_frag = false;
+	file = sock_alloc_file(csocket, 0, NULL);
+	if (IS_ERR(file)) {
+		pr_err("%s (%d): failed to map fd\n",
+		       __func__, task_pid_nr(current));
+		kfree(p);
+		return PTR_ERR(file);
+	}
+
+	get_file(file);
+	p->wr = p->rd = file;
+	client->trans = p;
+	client->status = Connected;
+
+	p->rd->f_flags |= O_NONBLOCK;
+
+	p9_conn_create(client);
+	return 0;
+}
+
+/**
+ * p9_conn_destroy - cancels all pending requests of mux
+ * @m: mux to destroy
+ *
+ */
+
+static void p9_conn_destroy(struct p9_conn *m)
+{
+	p9_debug(P9_DEBUG_TRANS, "mux %p prev %p next %p\n",
+		 m, m->mux_list.prev, m->mux_list.next);
+
+	p9_mux_poll_stop(m);
+	cancel_work_sync(&m->rq);
+	if (m->rreq) {
+		p9_req_put(m->client, m->rreq);
+		m->rreq = NULL;
+	}
+	cancel_work_sync(&m->wq);
+	if (m->wreq) {
+		p9_req_put(m->client, m->wreq);
+		m->wreq = NULL;
+	}
+
+	p9_conn_cancel(m, -ECONNRESET);
+
+	m->client = NULL;
+}
+
+/**
+ * p9_fd_close - shutdown file descriptor transport
+ * @client: client instance
+ *
+ */
+
+static void p9_fd_close(struct p9_client *client)
+{
+	struct p9_trans_fd *ts;
+
+	if (!client)
+		return;
+
+	ts = client->trans;
+	if (!ts)
+		return;
+
+	client->status = Disconnected;
+
+	p9_conn_destroy(&ts->conn);
+
+	if (ts->rd)
+		fput(ts->rd);
+	if (ts->wr)
+		fput(ts->wr);
+
+	kfree(ts);
+}
+
+/*
+ * stolen from NFS - maybe should be made a generic function?
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] > 255)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static int p9_bind_privport(struct socket *sock)
+{
+	struct sockaddr_in cl;
+	int port, err = -EINVAL;
+
+	memset(&cl, 0, sizeof(cl));
+	cl.sin_family = AF_INET;
+	cl.sin_addr.s_addr = htonl(INADDR_ANY);
+	for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) {
+		cl.sin_port = htons((ushort)port);
+		err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl));
+		if (err != -EADDRINUSE)
+			break;
+	}
+	return err;
+}
+
+
+static int
+p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct socket *csocket;
+	struct sockaddr_in sin_server;
+	struct p9_fd_opts opts;
+
+	err = parse_opts(args, &opts);
+	if (err < 0)
+		return err;
+
+	if (addr == NULL || valid_ipaddr4(addr) < 0)
+		return -EINVAL;
+
+	csocket = NULL;
+
+	client->trans_opts.tcp.port = opts.port;
+	client->trans_opts.tcp.privport = opts.privport;
+	sin_server.sin_family = AF_INET;
+	sin_server.sin_addr.s_addr = in_aton(addr);
+	sin_server.sin_port = htons(opts.port);
+	err = __sock_create(current->nsproxy->net_ns, PF_INET,
+			    SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
+	if (err) {
+		pr_err("%s (%d): problem creating socket\n",
+		       __func__, task_pid_nr(current));
+		return err;
+	}
+
+	if (opts.privport) {
+		err = p9_bind_privport(csocket);
+		if (err < 0) {
+			pr_err("%s (%d): problem binding to privport\n",
+			       __func__, task_pid_nr(current));
+			sock_release(csocket);
+			return err;
+		}
+	}
+
+	err = READ_ONCE(csocket->ops)->connect(csocket,
+				    (struct sockaddr *)&sin_server,
+				    sizeof(struct sockaddr_in), 0);
+	if (err < 0) {
+		pr_err("%s (%d): problem connecting socket to %s\n",
+		       __func__, task_pid_nr(current), addr);
+		sock_release(csocket);
+		return err;
+	}
+
+	return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct socket *csocket;
+	struct sockaddr_un sun_server;
+
+	csocket = NULL;
+
+	if (!addr || !strlen(addr))
+		return -EINVAL;
+
+	if (strlen(addr) >= UNIX_PATH_MAX) {
+		pr_err("%s (%d): address too long: %s\n",
+		       __func__, task_pid_nr(current), addr);
+		return -ENAMETOOLONG;
+	}
+
+	sun_server.sun_family = PF_UNIX;
+	strcpy(sun_server.sun_path, addr);
+	err = __sock_create(current->nsproxy->net_ns, PF_UNIX,
+			    SOCK_STREAM, 0, &csocket, 1);
+	if (err < 0) {
+		pr_err("%s (%d): problem creating socket\n",
+		       __func__, task_pid_nr(current));
+
+		return err;
+	}
+	err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server,
+			sizeof(struct sockaddr_un) - 1, 0);
+	if (err < 0) {
+		pr_err("%s (%d): problem connecting socket: %s: %d\n",
+		       __func__, task_pid_nr(current), addr, err);
+		sock_release(csocket);
+		return err;
+	}
+
+	return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct p9_fd_opts opts;
+
+	err = parse_opts(args, &opts);
+	if (err < 0)
+		return err;
+	client->trans_opts.fd.rfd = opts.rfd;
+	client->trans_opts.fd.wfd = opts.wfd;
+
+	if (opts.rfd == ~0 || opts.wfd == ~0) {
+		pr_err("Insufficient options for proto=fd\n");
+		return -ENOPROTOOPT;
+	}
+
+	err = p9_fd_open(client, opts.rfd, opts.wfd);
+	if (err < 0)
+		return err;
+
+	p9_conn_create(client);
+
+	return 0;
+}
+
+static struct p9_trans_module p9_tcp_trans = {
+	.name = "tcp",
+	.maxsize = MAX_SOCK_BUF,
+	.pooled_rbuffers = false,
+	.def = 0,
+	.create = p9_fd_create_tcp,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.cancelled = p9_fd_cancelled,
+	.show_options = p9_fd_show_options,
+	.owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("tcp");
+
+static struct p9_trans_module p9_unix_trans = {
+	.name = "unix",
+	.maxsize = MAX_SOCK_BUF,
+	.def = 0,
+	.create = p9_fd_create_unix,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.cancelled = p9_fd_cancelled,
+	.show_options = p9_fd_show_options,
+	.owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("unix");
+
+static struct p9_trans_module p9_fd_trans = {
+	.name = "fd",
+	.maxsize = MAX_SOCK_BUF,
+	.def = 0,
+	.create = p9_fd_create,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.cancelled = p9_fd_cancelled,
+	.show_options = p9_fd_show_options,
+	.owner = THIS_MODULE,
+};
+MODULE_ALIAS_9P("fd");
+
+/**
+ * p9_poll_workfn - poll worker thread
+ * @work: work queue
+ *
+ * polls all v9fs transports for new events and queues the appropriate
+ * work to the work queue
+ *
+ */
+
+static void p9_poll_workfn(struct work_struct *work)
+{
+	unsigned long flags;
+
+	p9_debug(P9_DEBUG_TRANS, "start %p\n", current);
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	while (!list_empty(&p9_poll_pending_list)) {
+		struct p9_conn *conn = list_first_entry(&p9_poll_pending_list,
+							struct p9_conn,
+							poll_pending_link);
+		list_del_init(&conn->poll_pending_link);
+		spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+		p9_poll_mux(conn);
+
+		spin_lock_irqsave(&p9_poll_lock, flags);
+	}
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+	p9_debug(P9_DEBUG_TRANS, "finish\n");
+}
+
+static int __init p9_trans_fd_init(void)
+{
+	v9fs_register_trans(&p9_tcp_trans);
+	v9fs_register_trans(&p9_unix_trans);
+	v9fs_register_trans(&p9_fd_trans);
+
+	return 0;
+}
+
+static void __exit p9_trans_fd_exit(void)
+{
+	flush_work(&p9_poll_work);
+	v9fs_unregister_trans(&p9_tcp_trans);
+	v9fs_unregister_trans(&p9_unix_trans);
+	v9fs_unregister_trans(&p9_fd_trans);
+}
+
+module_init(p9_trans_fd_init);
+module_exit(p9_trans_fd_exit);
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Filedescriptor Transport for 9P");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
new file mode 100644
index 0000000000..b84748baf9
--- /dev/null
+++ b/net/9p/trans_rdma.c
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RDMA transport layer based on the trans_fd.c implementation.
+ *
+ *  Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
+ *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define P9_PORT			5640
+#define P9_RDMA_SQ_DEPTH	32
+#define P9_RDMA_RQ_DEPTH	32
+#define P9_RDMA_SEND_SGE	4
+#define P9_RDMA_RECV_SGE	4
+#define P9_RDMA_IRD		0
+#define P9_RDMA_ORD		0
+#define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
+#define P9_RDMA_MAXSIZE		(1024*1024)	/* 1MB */
+
+/**
+ * struct p9_trans_rdma - RDMA transport instance
+ *
+ * @state: tracks the transport state machine for connection setup and tear down
+ * @cm_id: The RDMA CM ID
+ * @pd: Protection Domain pointer
+ * @qp: Queue Pair pointer
+ * @cq: Completion Queue pointer
+ * @timeout: Number of uSecs to wait for connection management events
+ * @privport: Whether a privileged port may be used
+ * @port: The port to use
+ * @sq_depth: The depth of the Send Queue
+ * @sq_sem: Semaphore for the SQ
+ * @rq_depth: The depth of the Receive Queue.
+ * @rq_sem: Semaphore for the RQ
+ * @excess_rc : Amount of posted Receive Contexts without a pending request.
+ *		See rdma_request()
+ * @addr: The remote peer's address
+ * @req_lock: Protects the active request list
+ * @cm_done: Completion event for connection management tracking
+ */
+struct p9_trans_rdma {
+	enum {
+		P9_RDMA_INIT,
+		P9_RDMA_ADDR_RESOLVED,
+		P9_RDMA_ROUTE_RESOLVED,
+		P9_RDMA_CONNECTED,
+		P9_RDMA_FLUSHING,
+		P9_RDMA_CLOSING,
+		P9_RDMA_CLOSED,
+	} state;
+	struct rdma_cm_id *cm_id;
+	struct ib_pd *pd;
+	struct ib_qp *qp;
+	struct ib_cq *cq;
+	long timeout;
+	bool privport;
+	u16 port;
+	int sq_depth;
+	struct semaphore sq_sem;
+	int rq_depth;
+	struct semaphore rq_sem;
+	atomic_t excess_rc;
+	struct sockaddr_in addr;
+	spinlock_t req_lock;
+
+	struct completion cm_done;
+};
+
+struct p9_rdma_req;
+
+/**
+ * struct p9_rdma_context - Keeps track of in-process WR
+ *
+ * @cqe: completion queue entry
+ * @busa: Bus address to unmap when the WR completes
+ * @req: Keeps track of requests (send)
+ * @rc: Keepts track of replies (receive)
+ */
+struct p9_rdma_context {
+	struct ib_cqe cqe;
+	dma_addr_t busa;
+	union {
+		struct p9_req_t *req;
+		struct p9_fcall rc;
+	};
+};
+
+/**
+ * struct p9_rdma_opts - Collection of mount options
+ * @port: port of connection
+ * @privport: Whether a privileged port may be used
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+	short port;
+	bool privport;
+	int sq_depth;
+	int rq_depth;
+	long timeout;
+};
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ */
+enum {
+	/* Options that take integer arguments */
+	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+	/* Options that take no argument */
+	Opt_privport,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_port, "port=%u"},
+	{Opt_sq_depth, "sq=%u"},
+	{Opt_rq_depth, "rq=%u"},
+	{Opt_timeout, "timeout=%u"},
+	{Opt_privport, "privport"},
+	{Opt_err, NULL},
+};
+
+static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
+{
+	struct p9_trans_rdma *rdma = clnt->trans;
+
+	if (rdma->port != P9_PORT)
+		seq_printf(m, ",port=%u", rdma->port);
+	if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
+		seq_printf(m, ",sq=%u", rdma->sq_depth);
+	if (rdma->rq_depth != P9_RDMA_RQ_DEPTH)
+		seq_printf(m, ",rq=%u", rdma->rq_depth);
+	if (rdma->timeout != P9_RDMA_TIMEOUT)
+		seq_printf(m, ",timeout=%lu", rdma->timeout);
+	if (rdma->privport)
+		seq_puts(m, ",privport");
+	return 0;
+}
+
+/**
+ * parse_opts - parse mount options into rdma options structure
+ * @params: options string passed from mount
+ * @opts: rdma transport-specific structure to parse options into
+ *
+ * Returns 0 upon success, -ERRNO upon failure
+ */
+static int parse_opts(char *params, struct p9_rdma_opts *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *options, *tmp_options;
+
+	opts->port = P9_PORT;
+	opts->sq_depth = P9_RDMA_SQ_DEPTH;
+	opts->rq_depth = P9_RDMA_RQ_DEPTH;
+	opts->timeout = P9_RDMA_TIMEOUT;
+	opts->privport = false;
+
+	if (!params)
+		return 0;
+
+	tmp_options = kstrdup(params, GFP_KERNEL);
+	if (!tmp_options) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "failed to allocate copy of option string\n");
+		return -ENOMEM;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		int r;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if ((token != Opt_err) && (token != Opt_privport)) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_port:
+			opts->port = option;
+			break;
+		case Opt_sq_depth:
+			opts->sq_depth = option;
+			break;
+		case Opt_rq_depth:
+			opts->rq_depth = option;
+			break;
+		case Opt_timeout:
+			opts->timeout = option;
+			break;
+		case Opt_privport:
+			opts->privport = true;
+			break;
+		default:
+			continue;
+		}
+	}
+	/* RQ must be at least as large as the SQ */
+	opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
+	kfree(tmp_options);
+	return 0;
+}
+
+static int
+p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct p9_client *c = id->context;
+	struct p9_trans_rdma *rdma = c->trans;
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		BUG_ON(rdma->state != P9_RDMA_INIT);
+		rdma->state = P9_RDMA_ADDR_RESOLVED;
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
+		rdma->state = P9_RDMA_ROUTE_RESOLVED;
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
+		rdma->state = P9_RDMA_CONNECTED;
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		if (rdma)
+			rdma->state = P9_RDMA_CLOSED;
+		c->status = Disconnected;
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		break;
+
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+		c->status = Disconnected;
+		rdma_disconnect(rdma->cm_id);
+		break;
+	default:
+		BUG();
+	}
+	complete(&rdma->cm_done);
+	return 0;
+}
+
+static void
+recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct p9_client *client = cq->cq_context;
+	struct p9_trans_rdma *rdma = client->trans;
+	struct p9_rdma_context *c =
+		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+	struct p9_req_t *req;
+	int err = 0;
+	int16_t tag;
+
+	req = NULL;
+	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
+							 DMA_FROM_DEVICE);
+
+	if (wc->status != IB_WC_SUCCESS)
+		goto err_out;
+
+	c->rc.size = wc->byte_len;
+	err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
+	if (err)
+		goto err_out;
+
+	req = p9_tag_lookup(client, tag);
+	if (!req)
+		goto err_out;
+
+	/* Check that we have not yet received a reply for this request.
+	 */
+	if (unlikely(req->rc.sdata)) {
+		pr_err("Duplicate reply for request %d", tag);
+		goto err_out;
+	}
+
+	req->rc.size = c->rc.size;
+	req->rc.sdata = c->rc.sdata;
+	p9_client_cb(client, req, REQ_STATUS_RCVD);
+
+ out:
+	up(&rdma->rq_sem);
+	kfree(c);
+	return;
+
+ err_out:
+	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
+			req, err, wc->status);
+	rdma->state = P9_RDMA_FLUSHING;
+	client->status = Disconnected;
+	goto out;
+}
+
+static void
+send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct p9_client *client = cq->cq_context;
+	struct p9_trans_rdma *rdma = client->trans;
+	struct p9_rdma_context *c =
+		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+
+	ib_dma_unmap_single(rdma->cm_id->device,
+			    c->busa, c->req->tc.size,
+			    DMA_TO_DEVICE);
+	up(&rdma->sq_sem);
+	p9_req_put(client, c->req);
+	kfree(c);
+}
+
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+	p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
+		 event->event, context);
+}
+
+static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
+{
+	if (!rdma)
+		return;
+
+	if (rdma->qp && !IS_ERR(rdma->qp))
+		ib_destroy_qp(rdma->qp);
+
+	if (rdma->pd && !IS_ERR(rdma->pd))
+		ib_dealloc_pd(rdma->pd);
+
+	if (rdma->cq && !IS_ERR(rdma->cq))
+		ib_free_cq(rdma->cq);
+
+	if (rdma->cm_id && !IS_ERR(rdma->cm_id))
+		rdma_destroy_id(rdma->cm_id);
+
+	kfree(rdma);
+}
+
+static int
+post_recv(struct p9_client *client, struct p9_rdma_context *c)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	struct ib_recv_wr wr;
+	struct ib_sge sge;
+	int ret;
+
+	c->busa = ib_dma_map_single(rdma->cm_id->device,
+				    c->rc.sdata, client->msize,
+				    DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
+		goto error;
+
+	c->cqe.done = recv_done;
+
+	sge.addr = c->busa;
+	sge.length = client->msize;
+	sge.lkey = rdma->pd->local_dma_lkey;
+
+	wr.next = NULL;
+	wr.wr_cqe = &c->cqe;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+
+	ret = ib_post_recv(rdma->qp, &wr, NULL);
+	if (ret)
+		ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+				    client->msize, DMA_FROM_DEVICE);
+	return ret;
+
+ error:
+	p9_debug(P9_DEBUG_ERROR, "EIO\n");
+	return -EIO;
+}
+
+static int rdma_request(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	struct ib_send_wr wr;
+	struct ib_sge sge;
+	int err = 0;
+	unsigned long flags;
+	struct p9_rdma_context *c = NULL;
+	struct p9_rdma_context *rpl_context = NULL;
+
+	/* When an error occurs between posting the recv and the send,
+	 * there will be a receive context posted without a pending request.
+	 * Since there is no way to "un-post" it, we remember it and skip
+	 * post_recv() for the next request.
+	 * So here,
+	 * see if we are this `next request' and need to absorb an excess rc.
+	 * If yes, then drop and free our own, and do not recv_post().
+	 **/
+	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
+		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
+			/* Got one! */
+			p9_fcall_fini(&req->rc);
+			req->rc.sdata = NULL;
+			goto dont_need_post_recv;
+		} else {
+			/* We raced and lost. */
+			atomic_inc(&rdma->excess_rc);
+		}
+	}
+
+	/* Allocate an fcall for the reply */
+	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
+	if (!rpl_context) {
+		err = -ENOMEM;
+		goto recv_error;
+	}
+	rpl_context->rc.sdata = req->rc.sdata;
+
+	/*
+	 * Post a receive buffer for this request. We need to ensure
+	 * there is a reply buffer available for every outstanding
+	 * request. A flushed request can result in no reply for an
+	 * outstanding request, so we must keep a count to avoid
+	 * overflowing the RQ.
+	 */
+	if (down_interruptible(&rdma->rq_sem)) {
+		err = -EINTR;
+		goto recv_error;
+	}
+
+	err = post_recv(client, rpl_context);
+	if (err) {
+		p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
+		goto recv_error;
+	}
+	/* remove posted receive buffer from request structure */
+	req->rc.sdata = NULL;
+
+dont_need_post_recv:
+	/* Post the request */
+	c = kmalloc(sizeof *c, GFP_NOFS);
+	if (!c) {
+		err = -ENOMEM;
+		goto send_error;
+	}
+	c->req = req;
+
+	c->busa = ib_dma_map_single(rdma->cm_id->device,
+				    c->req->tc.sdata, c->req->tc.size,
+				    DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+		err = -EIO;
+		goto send_error;
+	}
+
+	c->cqe.done = send_done;
+
+	sge.addr = c->busa;
+	sge.length = c->req->tc.size;
+	sge.lkey = rdma->pd->local_dma_lkey;
+
+	wr.next = NULL;
+	wr.wr_cqe = &c->cqe;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+
+	if (down_interruptible(&rdma->sq_sem)) {
+		err = -EINTR;
+		goto dma_unmap;
+	}
+
+	/* Mark request as `sent' *before* we actually send it,
+	 * because doing if after could erase the REQ_STATUS_RCVD
+	 * status in case of a very fast reply.
+	 */
+	WRITE_ONCE(req->status, REQ_STATUS_SENT);
+	err = ib_post_send(rdma->qp, &wr, NULL);
+	if (err)
+		goto dma_unmap;
+
+	/* Success */
+	return 0;
+
+dma_unmap:
+	ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+			    c->req->tc.size, DMA_TO_DEVICE);
+ /* Handle errors that happened during or while preparing the send: */
+ send_error:
+	WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+	kfree(c);
+	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
+
+	/* Ach.
+	 *  We did recv_post(), but not send. We have one recv_post in excess.
+	 */
+	atomic_inc(&rdma->excess_rc);
+	return err;
+
+ /* Handle errors that happened during or while preparing post_recv(): */
+ recv_error:
+	kfree(rpl_context);
+	spin_lock_irqsave(&rdma->req_lock, flags);
+	if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
+		rdma->state = P9_RDMA_CLOSING;
+		spin_unlock_irqrestore(&rdma->req_lock, flags);
+		rdma_disconnect(rdma->cm_id);
+	} else
+		spin_unlock_irqrestore(&rdma->req_lock, flags);
+	return err;
+}
+
+static void rdma_close(struct p9_client *client)
+{
+	struct p9_trans_rdma *rdma;
+
+	if (!client)
+		return;
+
+	rdma = client->trans;
+	if (!rdma)
+		return;
+
+	client->status = Disconnected;
+	rdma_disconnect(rdma->cm_id);
+	rdma_destroy_trans(rdma);
+}
+
+/**
+ * alloc_rdma - Allocate and initialize the rdma transport structure
+ * @opts: Mount options structure
+ */
+static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
+{
+	struct p9_trans_rdma *rdma;
+
+	rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
+	if (!rdma)
+		return NULL;
+
+	rdma->port = opts->port;
+	rdma->privport = opts->privport;
+	rdma->sq_depth = opts->sq_depth;
+	rdma->rq_depth = opts->rq_depth;
+	rdma->timeout = opts->timeout;
+	spin_lock_init(&rdma->req_lock);
+	init_completion(&rdma->cm_done);
+	sema_init(&rdma->sq_sem, rdma->sq_depth);
+	sema_init(&rdma->rq_sem, rdma->rq_depth);
+	atomic_set(&rdma->excess_rc, 0);
+
+	return rdma;
+}
+
+static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	/* Nothing to do here.
+	 * We will take care of it (if we have to) in rdma_cancelled()
+	 */
+	return 1;
+}
+
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	atomic_inc(&rdma->excess_rc);
+	return 0;
+}
+
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+	struct sockaddr_in cl = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+	int port, err = -EINVAL;
+
+	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+		cl.sin_port = htons((ushort)port);
+		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+		if (err != -EADDRINUSE)
+			break;
+	}
+	return err;
+}
+
+/**
+ * rdma_create_trans - Transport method for creating a transport instance
+ * @client: client instance
+ * @addr: IP address string
+ * @args: Mount options string
+ */
+static int
+rdma_create_trans(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct p9_rdma_opts opts;
+	struct p9_trans_rdma *rdma;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+
+	if (addr == NULL)
+		return -EINVAL;
+
+	/* Parse the transport specific mount options */
+	err = parse_opts(args, &opts);
+	if (err < 0)
+		return err;
+
+	/* Create and initialize the RDMA transport structure */
+	rdma = alloc_rdma(&opts);
+	if (!rdma)
+		return -ENOMEM;
+
+	/* Create the RDMA CM ID */
+	rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(rdma->cm_id))
+		goto error;
+
+	/* Associate the client with the transport */
+	client->trans = rdma;
+
+	/* Bind to a privileged port if we need to */
+	if (opts.privport) {
+		err = p9_rdma_bind_privport(rdma);
+		if (err < 0) {
+			pr_err("%s (%d): problem binding to privport: %d\n",
+			       __func__, task_pid_nr(current), -err);
+			goto error;
+		}
+	}
+
+	/* Resolve the server's address */
+	rdma->addr.sin_family = AF_INET;
+	rdma->addr.sin_addr.s_addr = in_aton(addr);
+	rdma->addr.sin_port = htons(opts.port);
+	err = rdma_resolve_addr(rdma->cm_id, NULL,
+				(struct sockaddr *)&rdma->addr,
+				rdma->timeout);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);
+	if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
+		goto error;
+
+	/* Resolve the route to the server */
+	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);
+	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
+		goto error;
+
+	/* Create the Completion Queue */
+	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
+				   opts.sq_depth + opts.rq_depth + 1,
+				   IB_POLL_SOFTIRQ);
+	if (IS_ERR(rdma->cq))
+		goto error;
+
+	/* Create the Protection Domain */
+	rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
+	if (IS_ERR(rdma->pd))
+		goto error;
+
+	/* Create the Queue Pair */
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = client;
+	qp_attr.cap.max_send_wr = opts.sq_depth;
+	qp_attr.cap.max_recv_wr = opts.rq_depth;
+	qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
+	qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = rdma->cq;
+	qp_attr.recv_cq = rdma->cq;
+	err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
+	if (err)
+		goto error;
+	rdma->qp = rdma->cm_id->qp;
+
+	/* Request a connection */
+	memset(&conn_param, 0, sizeof(conn_param));
+	conn_param.private_data = NULL;
+	conn_param.private_data_len = 0;
+	conn_param.responder_resources = P9_RDMA_IRD;
+	conn_param.initiator_depth = P9_RDMA_ORD;
+	err = rdma_connect(rdma->cm_id, &conn_param);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);
+	if (err || (rdma->state != P9_RDMA_CONNECTED))
+		goto error;
+
+	client->status = Connected;
+
+	return 0;
+
+error:
+	rdma_destroy_trans(rdma);
+	return -ENOTCONN;
+}
+
+static struct p9_trans_module p9_rdma_trans = {
+	.name = "rdma",
+	.maxsize = P9_RDMA_MAXSIZE,
+	.pooled_rbuffers = true,
+	.def = 0,
+	.owner = THIS_MODULE,
+	.create = rdma_create_trans,
+	.close = rdma_close,
+	.request = rdma_request,
+	.cancel = rdma_cancel,
+	.cancelled = rdma_cancelled,
+	.show_options = p9_rdma_show_options,
+};
+
+/**
+ * p9_trans_rdma_init - Register the 9P RDMA transport driver
+ */
+static int __init p9_trans_rdma_init(void)
+{
+	v9fs_register_trans(&p9_rdma_trans);
+	return 0;
+}
+
+static void __exit p9_trans_rdma_exit(void)
+{
+	v9fs_unregister_trans(&p9_rdma_trans);
+}
+
+module_init(p9_trans_rdma_init);
+module_exit(p9_trans_rdma_exit);
+MODULE_ALIAS_9P("rdma");
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("RDMA Transport for 9P");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
new file mode 100644
index 0000000000..e305071eb7
--- /dev/null
+++ b/net/9p/trans_virtio.c
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * The Virtio 9p transport driver
+ *
+ * This is a block based transport driver based on the lguest block driver
+ * code.
+ *
+ *  Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation
+ *
+ *  Based on virtio console driver
+ *  Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/virtio.h>
+#include <linux/virtio_9p.h>
+#include "trans_common.h"
+
+#define VIRTQUEUE_NUM	128
+
+/* a single mutex to manage channel initialization and attachment */
+static DEFINE_MUTEX(virtio_9p_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
+static atomic_t vp_pinned = ATOMIC_INIT(0);
+
+/**
+ * struct virtio_chan - per-instance transport information
+ * @inuse: whether the channel is in use
+ * @lock: protects multiple elements within this structure
+ * @client: client instance
+ * @vdev: virtio dev associated with this channel
+ * @vq: virtio queue associated with this channel
+ * @ring_bufs_avail: flag to indicate there is some available in the ring buf
+ * @vc_wq: wait queue for waiting for thing to be added to ring buf
+ * @p9_max_pages: maximum number of pinned pages
+ * @sg: scatter gather list which is used to pack a request (protected?)
+ * @chan_list: linked list of channels
+ *
+ * We keep all per-channel information in a structure.
+ * This structure is allocated within the devices dev->mem space.
+ * A pointer to the structure will get put in the transport private.
+ *
+ */
+
+struct virtio_chan {
+	bool inuse;
+
+	spinlock_t lock;
+
+	struct p9_client *client;
+	struct virtio_device *vdev;
+	struct virtqueue *vq;
+	int ring_bufs_avail;
+	wait_queue_head_t *vc_wq;
+	/* This is global limit. Since we don't have a global structure,
+	 * will be placing it in each channel.
+	 */
+	unsigned long p9_max_pages;
+	/* Scatterlist: can be too big for stack. */
+	struct scatterlist sg[VIRTQUEUE_NUM];
+	/**
+	 * @tag: name to identify a mount null terminated
+	 */
+	char *tag;
+
+	struct list_head chan_list;
+};
+
+static struct list_head virtio_chan_list;
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+	return PAGE_SIZE - offset_in_page(data);
+}
+
+/**
+ * p9_virtio_close - reclaim resources of a channel
+ * @client: client instance
+ *
+ * This reclaims a channel by freeing its resources and
+ * resetting its inuse flag.
+ *
+ */
+
+static void p9_virtio_close(struct p9_client *client)
+{
+	struct virtio_chan *chan = client->trans;
+
+	mutex_lock(&virtio_9p_lock);
+	if (chan)
+		chan->inuse = false;
+	mutex_unlock(&virtio_9p_lock);
+}
+
+/**
+ * req_done - callback which signals activity from the server
+ * @vq: virtio queue activity was received on
+ *
+ * This notifies us that the server has triggered some activity
+ * on the virtio channel - most likely a response to request we
+ * sent.  Figure out which requests now have responses and wake up
+ * those threads.
+ *
+ * Bugs: could do with some additional sanity checking, but appears to work.
+ *
+ */
+
+static void req_done(struct virtqueue *vq)
+{
+	struct virtio_chan *chan = vq->vdev->priv;
+	unsigned int len;
+	struct p9_req_t *req;
+	bool need_wakeup = false;
+	unsigned long flags;
+
+	p9_debug(P9_DEBUG_TRANS, ": request done\n");
+
+	spin_lock_irqsave(&chan->lock, flags);
+	while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) {
+		if (!chan->ring_bufs_avail) {
+			chan->ring_bufs_avail = 1;
+			need_wakeup = true;
+		}
+
+		if (len) {
+			req->rc.size = len;
+			p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
+		}
+	}
+	spin_unlock_irqrestore(&chan->lock, flags);
+	/* Wakeup if anyone waiting for VirtIO ring space. */
+	if (need_wakeup)
+		wake_up(chan->vc_wq);
+}
+
+/**
+ * pack_sg_list - pack a scatter gather list from a linear buffer
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: maximum segment to pack data to
+ * @data: data to pack into scatter/gather list
+ * @count: amount of data to pack into the scatter/gather list
+ *
+ * sg_lists have multiple segments of various sizes.  This will pack
+ * arbitrary data into an existing scatter gather list, segmenting the
+ * data as necessary within constraints.
+ *
+ */
+
+static int pack_sg_list(struct scatterlist *sg, int start,
+			int limit, char *data, int count)
+{
+	int s;
+	int index = start;
+
+	while (count) {
+		s = rest_of_page(data);
+		if (s > count)
+			s = count;
+		BUG_ON(index >= limit);
+		/* Make sure we don't terminate early. */
+		sg_unmark_end(&sg[index]);
+		sg_set_buf(&sg[index++], data, s);
+		count -= s;
+		data += s;
+	}
+	if (index-start)
+		sg_mark_end(&sg[index - 1]);
+	return index-start;
+}
+
+/* We don't currently allow canceling of virtio requests */
+static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	return 1;
+}
+
+/* Reply won't come, so drop req ref */
+static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+	p9_req_put(client, req);
+	return 0;
+}
+
+/**
+ * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
+ * this takes a list of pages.
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: maximum number of pages in sg list.
+ * @pdata: a list of pages to add into sg.
+ * @nr_pages: number of pages to pack into the scatter/gather list
+ * @offs: amount of data in the beginning of first page _not_ to pack
+ * @count: amount of data to pack into the scatter/gather list
+ */
+static int
+pack_sg_list_p(struct scatterlist *sg, int start, int limit,
+	       struct page **pdata, int nr_pages, size_t offs, int count)
+{
+	int i = 0, s;
+	int data_off = offs;
+	int index = start;
+
+	BUG_ON(nr_pages > (limit - start));
+	/*
+	 * if the first page doesn't start at
+	 * page boundary find the offset
+	 */
+	while (nr_pages) {
+		s = PAGE_SIZE - data_off;
+		if (s > count)
+			s = count;
+		BUG_ON(index >= limit);
+		/* Make sure we don't terminate early. */
+		sg_unmark_end(&sg[index]);
+		sg_set_page(&sg[index++], pdata[i++], s, data_off);
+		data_off = 0;
+		count -= s;
+		nr_pages--;
+	}
+
+	if (index-start)
+		sg_mark_end(&sg[index - 1]);
+	return index - start;
+}
+
+/**
+ * p9_virtio_request - issue a request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ *
+ */
+
+static int
+p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
+{
+	int err;
+	int in, out, out_sgs, in_sgs;
+	unsigned long flags;
+	struct virtio_chan *chan = client->trans;
+	struct scatterlist *sgs[2];
+
+	p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
+
+	WRITE_ONCE(req->status, REQ_STATUS_SENT);
+req_retry:
+	spin_lock_irqsave(&chan->lock, flags);
+
+	out_sgs = in_sgs = 0;
+	/* Handle out VirtIO ring buffers */
+	out = pack_sg_list(chan->sg, 0,
+			   VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
+	if (out)
+		sgs[out_sgs++] = chan->sg;
+
+	in = pack_sg_list(chan->sg, out,
+			  VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity);
+	if (in)
+		sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+	err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
+				GFP_ATOMIC);
+	if (err < 0) {
+		if (err == -ENOSPC) {
+			chan->ring_bufs_avail = 0;
+			spin_unlock_irqrestore(&chan->lock, flags);
+			err = wait_event_killable(*chan->vc_wq,
+						  chan->ring_bufs_avail);
+			if (err  == -ERESTARTSYS)
+				return err;
+
+			p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+			goto req_retry;
+		} else {
+			spin_unlock_irqrestore(&chan->lock, flags);
+			p9_debug(P9_DEBUG_TRANS,
+				 "virtio rpc add_sgs returned failure\n");
+			return -EIO;
+		}
+	}
+	virtqueue_kick(chan->vq);
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+	return 0;
+}
+
+static int p9_get_mapped_pages(struct virtio_chan *chan,
+			       struct page ***pages,
+			       struct iov_iter *data,
+			       int count,
+			       size_t *offs,
+			       int *need_drop)
+{
+	int nr_pages;
+	int err;
+
+	if (!iov_iter_count(data))
+		return 0;
+
+	if (!iov_iter_is_kvec(data)) {
+		int n;
+		/*
+		 * We allow only p9_max_pages pinned. We wait for the
+		 * Other zc request to finish here
+		 */
+		if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
+			err = wait_event_killable(vp_wq,
+			      (atomic_read(&vp_pinned) < chan->p9_max_pages));
+			if (err == -ERESTARTSYS)
+				return err;
+		}
+		n = iov_iter_get_pages_alloc2(data, pages, count, offs);
+		if (n < 0)
+			return n;
+		*need_drop = 1;
+		nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
+		atomic_add(nr_pages, &vp_pinned);
+		return n;
+	} else {
+		/* kernel buffer, no need to pin pages */
+		int index;
+		size_t len;
+		void *p;
+
+		/* we'd already checked that it's non-empty */
+		while (1) {
+			len = iov_iter_single_seg_count(data);
+			if (likely(len)) {
+				p = data->kvec->iov_base + data->iov_offset;
+				break;
+			}
+			iov_iter_advance(data, 0);
+		}
+		if (len > count)
+			len = count;
+
+		nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
+			   (unsigned long)p / PAGE_SIZE;
+
+		*pages = kmalloc_array(nr_pages, sizeof(struct page *),
+				       GFP_NOFS);
+		if (!*pages)
+			return -ENOMEM;
+
+		*need_drop = 0;
+		p -= (*offs = offset_in_page(p));
+		for (index = 0; index < nr_pages; index++) {
+			if (is_vmalloc_addr(p))
+				(*pages)[index] = vmalloc_to_page(p);
+			else
+				(*pages)[index] = kmap_to_page(p);
+			p += PAGE_SIZE;
+		}
+		iov_iter_advance(data, len);
+		return len;
+	}
+}
+
+static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
+			  size_t offs, struct page **pages)
+{
+	unsigned size, n;
+	void *to = req->rc.sdata + in_hdr_len;
+
+	// Fits entirely into the static data?  Nothing to do.
+	if (req->rc.size < in_hdr_len || !pages)
+		return;
+
+	// Really long error message?  Tough, truncate the reply.  Might get
+	// rejected (we can't be arsed to adjust the size encoded in header,
+	// or string size for that matter), but it wouldn't be anything valid
+	// anyway.
+	if (unlikely(req->rc.size > P9_ZC_HDR_SZ))
+		req->rc.size = P9_ZC_HDR_SZ;
+
+	// data won't span more than two pages
+	size = req->rc.size - in_hdr_len;
+	n = PAGE_SIZE - offs;
+	if (size > n) {
+		memcpy_from_page(to, *pages++, offs, n);
+		offs = 0;
+		to += n;
+		size -= n;
+	}
+	memcpy_from_page(to, *pages, offs, size);
+}
+
+/**
+ * p9_virtio_zc_request - issue a zero copy request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ * @uidata: user buffer that should be used for zero copy read
+ * @uodata: user buffer that should be used for zero copy write
+ * @inlen: read buffer size
+ * @outlen: write buffer size
+ * @in_hdr_len: reader header size, This is the size of response protocol data
+ *
+ */
+static int
+p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
+		     struct iov_iter *uidata, struct iov_iter *uodata,
+		     int inlen, int outlen, int in_hdr_len)
+{
+	int in, out, err, out_sgs, in_sgs;
+	unsigned long flags;
+	int in_nr_pages = 0, out_nr_pages = 0;
+	struct page **in_pages = NULL, **out_pages = NULL;
+	struct virtio_chan *chan = client->trans;
+	struct scatterlist *sgs[4];
+	size_t offs = 0;
+	int need_drop = 0;
+	int kicked = 0;
+
+	p9_debug(P9_DEBUG_TRANS, "virtio request\n");
+
+	if (uodata) {
+		__le32 sz;
+		int n = p9_get_mapped_pages(chan, &out_pages, uodata,
+					    outlen, &offs, &need_drop);
+		if (n < 0) {
+			err = n;
+			goto err_out;
+		}
+		out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+		if (n != outlen) {
+			__le32 v = cpu_to_le32(n);
+			memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
+			outlen = n;
+		}
+		/* The size field of the message must include the length of the
+		 * header and the length of the data.  We didn't actually know
+		 * the length of the data until this point so add it in now.
+		 */
+		sz = cpu_to_le32(req->tc.size + outlen);
+		memcpy(&req->tc.sdata[0], &sz, sizeof(sz));
+	} else if (uidata) {
+		int n = p9_get_mapped_pages(chan, &in_pages, uidata,
+					    inlen, &offs, &need_drop);
+		if (n < 0) {
+			err = n;
+			goto err_out;
+		}
+		in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+		if (n != inlen) {
+			__le32 v = cpu_to_le32(n);
+			memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
+			inlen = n;
+		}
+	}
+	WRITE_ONCE(req->status, REQ_STATUS_SENT);
+req_retry_pinned:
+	spin_lock_irqsave(&chan->lock, flags);
+
+	out_sgs = in_sgs = 0;
+
+	/* out data */
+	out = pack_sg_list(chan->sg, 0,
+			   VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
+
+	if (out)
+		sgs[out_sgs++] = chan->sg;
+
+	if (out_pages) {
+		sgs[out_sgs++] = chan->sg + out;
+		out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
+				      out_pages, out_nr_pages, offs, outlen);
+	}
+
+	/*
+	 * Take care of in data
+	 * For example TREAD have 11.
+	 * 11 is the read/write header = PDU Header(7) + IO Size (4).
+	 * Arrange in such a way that server places header in the
+	 * allocated memory and payload onto the user buffer.
+	 */
+	in = pack_sg_list(chan->sg, out,
+			  VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len);
+	if (in)
+		sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+	if (in_pages) {
+		sgs[out_sgs + in_sgs++] = chan->sg + out + in;
+		pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
+			       in_pages, in_nr_pages, offs, inlen);
+	}
+
+	BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
+	err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
+				GFP_ATOMIC);
+	if (err < 0) {
+		if (err == -ENOSPC) {
+			chan->ring_bufs_avail = 0;
+			spin_unlock_irqrestore(&chan->lock, flags);
+			err = wait_event_killable(*chan->vc_wq,
+						  chan->ring_bufs_avail);
+			if (err  == -ERESTARTSYS)
+				goto err_out;
+
+			p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+			goto req_retry_pinned;
+		} else {
+			spin_unlock_irqrestore(&chan->lock, flags);
+			p9_debug(P9_DEBUG_TRANS,
+				 "virtio rpc add_sgs returned failure\n");
+			err = -EIO;
+			goto err_out;
+		}
+	}
+	virtqueue_kick(chan->vq);
+	spin_unlock_irqrestore(&chan->lock, flags);
+	kicked = 1;
+	p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+	err = wait_event_killable(req->wq,
+			          READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+	// RERROR needs reply (== error string) in static data
+	if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
+	    unlikely(req->rc.sdata[4] == P9_RERROR))
+		handle_rerror(req, in_hdr_len, offs, in_pages);
+
+	/*
+	 * Non kernel buffers are pinned, unpin them
+	 */
+err_out:
+	if (need_drop) {
+		if (in_pages) {
+			p9_release_pages(in_pages, in_nr_pages);
+			atomic_sub(in_nr_pages, &vp_pinned);
+		}
+		if (out_pages) {
+			p9_release_pages(out_pages, out_nr_pages);
+			atomic_sub(out_nr_pages, &vp_pinned);
+		}
+		/* wakeup anybody waiting for slots to pin pages */
+		wake_up(&vp_wq);
+	}
+	kvfree(in_pages);
+	kvfree(out_pages);
+	if (!kicked) {
+		/* reply won't come */
+		p9_req_put(client, req);
+	}
+	return err;
+}
+
+static ssize_t p9_mount_tag_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct virtio_chan *chan;
+	struct virtio_device *vdev;
+	int tag_len;
+
+	vdev = dev_to_virtio(dev);
+	chan = vdev->priv;
+	tag_len = strlen(chan->tag);
+
+	memcpy(buf, chan->tag, tag_len + 1);
+
+	return tag_len + 1;
+}
+
+static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
+
+/**
+ * p9_virtio_probe - probe for existence of 9P virtio channels
+ * @vdev: virtio device to probe
+ *
+ * This probes for existing virtio channels.
+ *
+ */
+
+static int p9_virtio_probe(struct virtio_device *vdev)
+{
+	__u16 tag_len;
+	char *tag;
+	int err;
+	struct virtio_chan *chan;
+
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
+	if (!chan) {
+		pr_err("Failed to allocate virtio 9P channel\n");
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	chan->vdev = vdev;
+
+	/* We expect one virtqueue, for requests. */
+	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
+	if (IS_ERR(chan->vq)) {
+		err = PTR_ERR(chan->vq);
+		goto out_free_chan;
+	}
+	chan->vq->vdev->priv = chan;
+	spin_lock_init(&chan->lock);
+
+	sg_init_table(chan->sg, VIRTQUEUE_NUM);
+
+	chan->inuse = false;
+	if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) {
+		virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len);
+	} else {
+		err = -EINVAL;
+		goto out_free_vq;
+	}
+	tag = kzalloc(tag_len + 1, GFP_KERNEL);
+	if (!tag) {
+		err = -ENOMEM;
+		goto out_free_vq;
+	}
+
+	virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
+			   tag, tag_len);
+	chan->tag = tag;
+	err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+	if (err) {
+		goto out_free_tag;
+	}
+	chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!chan->vc_wq) {
+		err = -ENOMEM;
+		goto out_remove_file;
+	}
+	init_waitqueue_head(chan->vc_wq);
+	chan->ring_bufs_avail = 1;
+	/* Ceiling limit to avoid denial of service attacks */
+	chan->p9_max_pages = nr_free_buffer_pages()/4;
+
+	virtio_device_ready(vdev);
+
+	mutex_lock(&virtio_9p_lock);
+	list_add_tail(&chan->chan_list, &virtio_chan_list);
+	mutex_unlock(&virtio_9p_lock);
+
+	/* Let udev rules use the new mount_tag attribute. */
+	kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+
+	return 0;
+
+out_remove_file:
+	sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr);
+out_free_tag:
+	kfree(tag);
+out_free_vq:
+	vdev->config->del_vqs(vdev);
+out_free_chan:
+	kfree(chan);
+fail:
+	return err;
+}
+
+
+/**
+ * p9_virtio_create - allocate a new virtio channel
+ * @client: client instance invoking this transport
+ * @devname: string identifying the channel to connect to (unused)
+ * @args: args passed from sys_mount() for per-transport options (unused)
+ *
+ * This sets up a transport channel for 9p communication.  Right now
+ * we only match the first available channel, but eventually we could look up
+ * alternate channels by matching devname versus a virtio_config entry.
+ * We use a simple reference count mechanism to ensure that only a single
+ * mount has a channel open at a time.
+ *
+ */
+
+static int
+p9_virtio_create(struct p9_client *client, const char *devname, char *args)
+{
+	struct virtio_chan *chan;
+	int ret = -ENOENT;
+	int found = 0;
+
+	if (devname == NULL)
+		return -EINVAL;
+
+	mutex_lock(&virtio_9p_lock);
+	list_for_each_entry(chan, &virtio_chan_list, chan_list) {
+		if (!strcmp(devname, chan->tag)) {
+			if (!chan->inuse) {
+				chan->inuse = true;
+				found = 1;
+				break;
+			}
+			ret = -EBUSY;
+		}
+	}
+	mutex_unlock(&virtio_9p_lock);
+
+	if (!found) {
+		pr_err("no channels available for device %s\n", devname);
+		return ret;
+	}
+
+	client->trans = (void *)chan;
+	client->status = Connected;
+	chan->client = client;
+
+	return 0;
+}
+
+/**
+ * p9_virtio_remove - clean up resources associated with a virtio device
+ * @vdev: virtio device to remove
+ *
+ */
+
+static void p9_virtio_remove(struct virtio_device *vdev)
+{
+	struct virtio_chan *chan = vdev->priv;
+	unsigned long warning_time;
+
+	mutex_lock(&virtio_9p_lock);
+
+	/* Remove self from list so we don't get new users. */
+	list_del(&chan->chan_list);
+	warning_time = jiffies;
+
+	/* Wait for existing users to close. */
+	while (chan->inuse) {
+		mutex_unlock(&virtio_9p_lock);
+		msleep(250);
+		if (time_after(jiffies, warning_time + 10 * HZ)) {
+			dev_emerg(&vdev->dev,
+				  "p9_virtio_remove: waiting for device in use.\n");
+			warning_time = jiffies;
+		}
+		mutex_lock(&virtio_9p_lock);
+	}
+
+	mutex_unlock(&virtio_9p_lock);
+
+	virtio_reset_device(vdev);
+	vdev->config->del_vqs(vdev);
+
+	sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+	kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+	kfree(chan->tag);
+	kfree(chan->vc_wq);
+	kfree(chan);
+
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static unsigned int features[] = {
+	VIRTIO_9P_MOUNT_TAG,
+};
+
+/* The standard "struct lguest_driver": */
+static struct virtio_driver p9_virtio_drv = {
+	.feature_table  = features,
+	.feature_table_size = ARRAY_SIZE(features),
+	.driver.name    = KBUILD_MODNAME,
+	.driver.owner	= THIS_MODULE,
+	.id_table	= id_table,
+	.probe		= p9_virtio_probe,
+	.remove		= p9_virtio_remove,
+};
+
+static struct p9_trans_module p9_virtio_trans = {
+	.name = "virtio",
+	.create = p9_virtio_create,
+	.close = p9_virtio_close,
+	.request = p9_virtio_request,
+	.zc_request = p9_virtio_zc_request,
+	.cancel = p9_virtio_cancel,
+	.cancelled = p9_virtio_cancelled,
+	/*
+	 * We leave one entry for input and one entry for response
+	 * headers. We also skip one more entry to accommodate, address
+	 * that are not at page boundary, that can result in an extra
+	 * page in zero copy.
+	 */
+	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
+	.pooled_rbuffers = false,
+	.def = 1,
+	.owner = THIS_MODULE,
+};
+
+/* The standard init function */
+static int __init p9_virtio_init(void)
+{
+	int rc;
+
+	INIT_LIST_HEAD(&virtio_chan_list);
+
+	v9fs_register_trans(&p9_virtio_trans);
+	rc = register_virtio_driver(&p9_virtio_drv);
+	if (rc)
+		v9fs_unregister_trans(&p9_virtio_trans);
+
+	return rc;
+}
+
+static void __exit p9_virtio_cleanup(void)
+{
+	unregister_virtio_driver(&p9_virtio_drv);
+	v9fs_unregister_trans(&p9_virtio_trans);
+}
+
+module_init(p9_virtio_init);
+module_exit(p9_virtio_cleanup);
+MODULE_ALIAS_9P("virtio");
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Virtio 9p Transport");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
new file mode 100644
index 0000000000..1fffe2bed5
--- /dev/null
+++ b/net/9p/trans_xen.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/9p/trans_xen
+ *
+ * Xen transport layer.
+ *
+ * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com>
+ */
+
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/9pfs.h>
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#define XEN_9PFS_NUM_RINGS 2
+#define XEN_9PFS_RING_ORDER 9
+#define XEN_9PFS_RING_SIZE(ring)  XEN_FLEX_RING_SIZE(ring->intf->ring_order)
+
+struct xen_9pfs_header {
+	uint32_t size;
+	uint8_t id;
+	uint16_t tag;
+
+	/* uint8_t sdata[]; */
+} __attribute__((packed));
+
+/* One per ring, more than one per 9pfs share */
+struct xen_9pfs_dataring {
+	struct xen_9pfs_front_priv *priv;
+
+	struct xen_9pfs_data_intf *intf;
+	grant_ref_t ref;
+	int evtchn;
+	int irq;
+	/* protect a ring from concurrent accesses */
+	spinlock_t lock;
+
+	struct xen_9pfs_data data;
+	wait_queue_head_t wq;
+	struct work_struct work;
+};
+
+/* One per 9pfs share */
+struct xen_9pfs_front_priv {
+	struct list_head list;
+	struct xenbus_device *dev;
+	char *tag;
+	struct p9_client *client;
+
+	int num_rings;
+	struct xen_9pfs_dataring *rings;
+};
+
+static LIST_HEAD(xen_9pfs_devs);
+static DEFINE_RWLOCK(xen_9pfs_lock);
+
+/* We don't currently allow canceling of requests */
+static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	return 1;
+}
+
+static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
+{
+	struct xen_9pfs_front_priv *priv;
+
+	if (addr == NULL)
+		return -EINVAL;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (!strcmp(priv->tag, addr)) {
+			priv->client = client;
+			read_unlock(&xen_9pfs_lock);
+			return 0;
+		}
+	}
+	read_unlock(&xen_9pfs_lock);
+	return -EINVAL;
+}
+
+static void p9_xen_close(struct p9_client *client)
+{
+	struct xen_9pfs_front_priv *priv;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (priv->client == client) {
+			priv->client = NULL;
+			read_unlock(&xen_9pfs_lock);
+			return;
+		}
+	}
+	read_unlock(&xen_9pfs_lock);
+}
+
+static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size)
+{
+	RING_IDX cons, prod;
+
+	cons = ring->intf->out_cons;
+	prod = ring->intf->out_prod;
+	virt_mb();
+
+	return XEN_9PFS_RING_SIZE(ring) -
+		xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) >= size;
+}
+
+static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
+{
+	struct xen_9pfs_front_priv *priv;
+	RING_IDX cons, prod, masked_cons, masked_prod;
+	unsigned long flags;
+	u32 size = p9_req->tc.size;
+	struct xen_9pfs_dataring *ring;
+	int num;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (priv->client == client)
+			break;
+	}
+	read_unlock(&xen_9pfs_lock);
+	if (list_entry_is_head(priv, &xen_9pfs_devs, list))
+		return -EINVAL;
+
+	num = p9_req->tc.tag % priv->num_rings;
+	ring = &priv->rings[num];
+
+again:
+	while (wait_event_killable(ring->wq,
+				   p9_xen_write_todo(ring, size)) != 0)
+		;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	cons = ring->intf->out_cons;
+	prod = ring->intf->out_prod;
+	virt_mb();
+
+	if (XEN_9PFS_RING_SIZE(ring) -
+	    xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < size) {
+		spin_unlock_irqrestore(&ring->lock, flags);
+		goto again;
+	}
+
+	masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+	masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+
+	xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size,
+			      &masked_prod, masked_cons,
+			      XEN_9PFS_RING_SIZE(ring));
+
+	WRITE_ONCE(p9_req->status, REQ_STATUS_SENT);
+	virt_wmb();			/* write ring before updating pointer */
+	prod += size;
+	ring->intf->out_prod = prod;
+	spin_unlock_irqrestore(&ring->lock, flags);
+	notify_remote_via_irq(ring->irq);
+	p9_req_put(client, p9_req);
+
+	return 0;
+}
+
+static void p9_xen_response(struct work_struct *work)
+{
+	struct xen_9pfs_front_priv *priv;
+	struct xen_9pfs_dataring *ring;
+	RING_IDX cons, prod, masked_cons, masked_prod;
+	struct xen_9pfs_header h;
+	struct p9_req_t *req;
+	int status;
+
+	ring = container_of(work, struct xen_9pfs_dataring, work);
+	priv = ring->priv;
+
+	while (1) {
+		cons = ring->intf->in_cons;
+		prod = ring->intf->in_prod;
+		virt_rmb();
+
+		if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) <
+		    sizeof(h)) {
+			notify_remote_via_irq(ring->irq);
+			return;
+		}
+
+		masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+		masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+
+		/* First, read just the header */
+		xen_9pfs_read_packet(&h, ring->data.in, sizeof(h),
+				     masked_prod, &masked_cons,
+				     XEN_9PFS_RING_SIZE(ring));
+
+		req = p9_tag_lookup(priv->client, h.tag);
+		if (!req || req->status != REQ_STATUS_SENT) {
+			dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag);
+			cons += h.size;
+			virt_mb();
+			ring->intf->in_cons = cons;
+			continue;
+		}
+
+		if (h.size > req->rc.capacity) {
+			dev_warn(&priv->dev->dev,
+				 "requested packet size too big: %d for tag %d with capacity %zd\n",
+				 h.size, h.tag, req->rc.capacity);
+			WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+			goto recv_error;
+		}
+
+		req->rc.size = h.size;
+		req->rc.id = h.id;
+		req->rc.tag = h.tag;
+		req->rc.offset = 0;
+
+		masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
+		/* Then, read the whole packet (including the header) */
+		xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size,
+				     masked_prod, &masked_cons,
+				     XEN_9PFS_RING_SIZE(ring));
+
+recv_error:
+		virt_mb();
+		cons += h.size;
+		ring->intf->in_cons = cons;
+
+		status = (req->status != REQ_STATUS_ERROR) ?
+			REQ_STATUS_RCVD : REQ_STATUS_ERROR;
+
+		p9_client_cb(priv->client, req, status);
+	}
+}
+
+static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r)
+{
+	struct xen_9pfs_dataring *ring = r;
+
+	if (!ring || !ring->priv->client) {
+		/* ignore spurious interrupt */
+		return IRQ_HANDLED;
+	}
+
+	wake_up_interruptible(&ring->wq);
+	schedule_work(&ring->work);
+
+	return IRQ_HANDLED;
+}
+
+static struct p9_trans_module p9_xen_trans = {
+	.name = "xen",
+	.maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
+	.pooled_rbuffers = false,
+	.def = 1,
+	.create = p9_xen_create,
+	.close = p9_xen_close,
+	.request = p9_xen_request,
+	.cancel = p9_xen_cancel,
+	.owner = THIS_MODULE,
+};
+
+static const struct xenbus_device_id xen_9pfs_front_ids[] = {
+	{ "9pfs" },
+	{ "" }
+};
+
+static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
+{
+	int i, j;
+
+	write_lock(&xen_9pfs_lock);
+	list_del(&priv->list);
+	write_unlock(&xen_9pfs_lock);
+
+	for (i = 0; i < priv->num_rings; i++) {
+		struct xen_9pfs_dataring *ring = &priv->rings[i];
+
+		cancel_work_sync(&ring->work);
+
+		if (!priv->rings[i].intf)
+			break;
+		if (priv->rings[i].irq > 0)
+			unbind_from_irqhandler(priv->rings[i].irq, priv->dev);
+		if (priv->rings[i].data.in) {
+			for (j = 0;
+			     j < (1 << priv->rings[i].intf->ring_order);
+			     j++) {
+				grant_ref_t ref;
+
+				ref = priv->rings[i].intf->ref[j];
+				gnttab_end_foreign_access(ref, NULL);
+			}
+			free_pages_exact(priv->rings[i].data.in,
+				   1UL << (priv->rings[i].intf->ring_order +
+					   XEN_PAGE_SHIFT));
+		}
+		gnttab_end_foreign_access(priv->rings[i].ref, NULL);
+		free_page((unsigned long)priv->rings[i].intf);
+	}
+	kfree(priv->rings);
+	kfree(priv->tag);
+	kfree(priv);
+}
+
+static void xen_9pfs_front_remove(struct xenbus_device *dev)
+{
+	struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
+
+	dev_set_drvdata(&dev->dev, NULL);
+	xen_9pfs_front_free(priv);
+}
+
+static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
+					 struct xen_9pfs_dataring *ring,
+					 unsigned int order)
+{
+	int i = 0;
+	int ret = -ENOMEM;
+	void *bytes = NULL;
+
+	init_waitqueue_head(&ring->wq);
+	spin_lock_init(&ring->lock);
+	INIT_WORK(&ring->work, p9_xen_response);
+
+	ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL);
+	if (!ring->intf)
+		return ret;
+	ret = gnttab_grant_foreign_access(dev->otherend_id,
+					  virt_to_gfn(ring->intf), 0);
+	if (ret < 0)
+		goto out;
+	ring->ref = ret;
+	bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT),
+				  GFP_KERNEL | __GFP_ZERO);
+	if (!bytes) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for (; i < (1 << order); i++) {
+		ret = gnttab_grant_foreign_access(
+				dev->otherend_id, virt_to_gfn(bytes) + i, 0);
+		if (ret < 0)
+			goto out;
+		ring->intf->ref[i] = ret;
+	}
+	ring->intf->ring_order = order;
+	ring->data.in = bytes;
+	ring->data.out = bytes + XEN_FLEX_RING_SIZE(order);
+
+	ret = xenbus_alloc_evtchn(dev, &ring->evtchn);
+	if (ret)
+		goto out;
+	ring->irq = bind_evtchn_to_irqhandler(ring->evtchn,
+					      xen_9pfs_front_event_handler,
+					      0, "xen_9pfs-frontend", ring);
+	if (ring->irq >= 0)
+		return 0;
+
+	xenbus_free_evtchn(dev, ring->evtchn);
+	ret = ring->irq;
+out:
+	if (bytes) {
+		for (i--; i >= 0; i--)
+			gnttab_end_foreign_access(ring->intf->ref[i], NULL);
+		free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT));
+	}
+	gnttab_end_foreign_access(ring->ref, NULL);
+	free_page((unsigned long)ring->intf);
+	return ret;
+}
+
+static int xen_9pfs_front_init(struct xenbus_device *dev)
+{
+	int ret, i;
+	struct xenbus_transaction xbt;
+	struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
+	char *versions, *v;
+	unsigned int max_rings, max_ring_order, len = 0;
+
+	versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
+	if (IS_ERR(versions))
+		return PTR_ERR(versions);
+	for (v = versions; *v; v++) {
+		if (simple_strtoul(v, &v, 10) == 1) {
+			v = NULL;
+			break;
+		}
+	}
+	if (v) {
+		kfree(versions);
+		return -EINVAL;
+	}
+	kfree(versions);
+	max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0);
+	if (max_rings < XEN_9PFS_NUM_RINGS)
+		return -EINVAL;
+	max_ring_order = xenbus_read_unsigned(dev->otherend,
+					      "max-ring-page-order", 0);
+	if (max_ring_order > XEN_9PFS_RING_ORDER)
+		max_ring_order = XEN_9PFS_RING_ORDER;
+	if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order))
+		p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2;
+
+	priv->num_rings = XEN_9PFS_NUM_RINGS;
+	priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings),
+			      GFP_KERNEL);
+	if (!priv->rings) {
+		kfree(priv);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < priv->num_rings; i++) {
+		priv->rings[i].priv = priv;
+		ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i],
+						    max_ring_order);
+		if (ret < 0)
+			goto error;
+	}
+
+ again:
+	ret = xenbus_transaction_start(&xbt);
+	if (ret) {
+		xenbus_dev_fatal(dev, ret, "starting transaction");
+		goto error;
+	}
+	ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1);
+	if (ret)
+		goto error_xenbus;
+	ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u",
+			    priv->num_rings);
+	if (ret)
+		goto error_xenbus;
+	for (i = 0; i < priv->num_rings; i++) {
+		char str[16];
+
+		BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9);
+		sprintf(str, "ring-ref%d", i);
+		ret = xenbus_printf(xbt, dev->nodename, str, "%d",
+				    priv->rings[i].ref);
+		if (ret)
+			goto error_xenbus;
+
+		sprintf(str, "event-channel-%d", i);
+		ret = xenbus_printf(xbt, dev->nodename, str, "%u",
+				    priv->rings[i].evtchn);
+		if (ret)
+			goto error_xenbus;
+	}
+	priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL);
+	if (IS_ERR(priv->tag)) {
+		ret = PTR_ERR(priv->tag);
+		goto error_xenbus;
+	}
+	ret = xenbus_transaction_end(xbt, 0);
+	if (ret) {
+		if (ret == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, ret, "completing transaction");
+		goto error;
+	}
+
+	return 0;
+
+ error_xenbus:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, ret, "writing xenstore");
+ error:
+	xen_9pfs_front_free(priv);
+	return ret;
+}
+
+static int xen_9pfs_front_probe(struct xenbus_device *dev,
+				const struct xenbus_device_id *id)
+{
+	struct xen_9pfs_front_priv *priv = NULL;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->dev = dev;
+	dev_set_drvdata(&dev->dev, priv);
+
+	write_lock(&xen_9pfs_lock);
+	list_add_tail(&priv->list, &xen_9pfs_devs);
+	write_unlock(&xen_9pfs_lock);
+
+	return 0;
+}
+
+static int xen_9pfs_front_resume(struct xenbus_device *dev)
+{
+	dev_warn(&dev->dev, "suspend/resume unsupported\n");
+	return 0;
+}
+
+static void xen_9pfs_front_changed(struct xenbus_device *dev,
+				   enum xenbus_state backend_state)
+{
+	switch (backend_state) {
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateUnknown:
+		break;
+
+	case XenbusStateInitWait:
+		if (!xen_9pfs_front_init(dev))
+			xenbus_switch_state(dev, XenbusStateInitialised);
+		break;
+
+	case XenbusStateConnected:
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
+			break;
+		fallthrough;	/* Missed the backend's CLOSING state */
+	case XenbusStateClosing:
+		xenbus_frontend_closed(dev);
+		break;
+	}
+}
+
+static struct xenbus_driver xen_9pfs_front_driver = {
+	.ids = xen_9pfs_front_ids,
+	.probe = xen_9pfs_front_probe,
+	.remove = xen_9pfs_front_remove,
+	.resume = xen_9pfs_front_resume,
+	.otherend_changed = xen_9pfs_front_changed,
+};
+
+static int __init p9_trans_xen_init(void)
+{
+	int rc;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	pr_info("Initialising Xen transport for 9pfs\n");
+
+	v9fs_register_trans(&p9_xen_trans);
+	rc = xenbus_register_frontend(&xen_9pfs_front_driver);
+	if (rc)
+		v9fs_unregister_trans(&p9_xen_trans);
+
+	return rc;
+}
+module_init(p9_trans_xen_init);
+MODULE_ALIAS_9P("xen");
+
+static void __exit p9_trans_xen_exit(void)
+{
+	v9fs_unregister_trans(&p9_xen_trans);
+	return xenbus_unregister_driver(&xen_9pfs_front_driver);
+}
+module_exit(p9_trans_xen_exit);
+
+MODULE_ALIAS("xen:9pfs");
+MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>");
+MODULE_DESCRIPTION("Xen Transport for 9P");
+MODULE_LICENSE("GPL");