diff options
Diffstat (limited to 'drivers/vhost')
-rw-r--r-- | drivers/vhost/Kconfig | 55 | ||||
-rw-r--r-- | drivers/vhost/Kconfig.vringh | 5 | ||||
-rw-r--r-- | drivers/vhost/Makefile | 13 | ||||
-rw-r--r-- | drivers/vhost/net.c | 1593 | ||||
-rw-r--r-- | drivers/vhost/scsi.c | 2151 | ||||
-rw-r--r-- | drivers/vhost/test.c | 337 | ||||
-rw-r--r-- | drivers/vhost/test.h | 8 | ||||
-rw-r--r-- | drivers/vhost/vhost.c | 2549 | ||||
-rw-r--r-- | drivers/vhost/vhost.h | 298 | ||||
-rw-r--r-- | drivers/vhost/vringh.c | 1045 | ||||
-rw-r--r-- | drivers/vhost/vsock.c | 870 |
11 files changed, 8924 insertions, 0 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig new file mode 100644 index 000000000..b58088524 --- /dev/null +++ b/drivers/vhost/Kconfig @@ -0,0 +1,55 @@ +config VHOST_NET + tristate "Host kernel accelerator for virtio net" + depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP) + select VHOST + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest networking with virtio_net. Not to be confused with virtio_net + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_net. + +config VHOST_SCSI + tristate "VHOST_SCSI TCM fabric driver" + depends on TARGET_CORE && EVENTFD + select VHOST + default n + ---help--- + Say M here to enable the vhost_scsi TCM fabric module + for use with virtio-scsi guests + +config VHOST_VSOCK + tristate "vhost virtio-vsock driver" + depends on VSOCKETS && EVENTFD + select VIRTIO_VSOCKETS_COMMON + select VHOST + default n + ---help--- + This kernel module can be loaded in the host kernel to provide AF_VSOCK + sockets for communicating with guests. The guests must have the + virtio_transport.ko driver loaded to use the virtio-vsock device. + + To compile this driver as a module, choose M here: the module will be called + vhost_vsock. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. + +config VHOST_CROSS_ENDIAN_LEGACY + bool "Cross-endian support for vhost" + default n + ---help--- + This option allows vhost to support guests with a different byte + ordering from host while using legacy virtio. + + Userspace programs can control the feature using the + VHOST_SET_VRING_ENDIAN and VHOST_GET_VRING_ENDIAN ioctls. + + This is only useful on a few platforms (ppc64 and arm64). Since it + adds some overhead, it is disabled by default. + + If unsure, say "N". 
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh new file mode 100644 index 000000000..6a4490c09 --- /dev/null +++ b/drivers/vhost/Kconfig.vringh @@ -0,0 +1,5 @@ +config VHOST_RING + tristate + ---help--- + This option is selected by any driver which needs to access + the host side of a virtio ring. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile new file mode 100644 index 000000000..6c6df24f7 --- /dev/null +++ b/drivers/vhost/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VHOST_NET) += vhost_net.o +vhost_net-y := net.o + +obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o +vhost_scsi-y := scsi.o + +obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o +vhost_vsock-y := vsock.o + +obj-$(CONFIG_VHOST_RING) += vringh.o + +obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 000000000..5ea7b0a94 --- /dev/null +++ b/drivers/vhost/net.c @@ -0,0 +1,1593 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-net server in host kernel. 
+ */ + +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/sched/clock.h> +#include <linux/sched/signal.h> +#include <linux/vmalloc.h> + +#include <linux/net.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> +#include <linux/if_tun.h> +#include <linux/if_macvlan.h> +#include <linux/if_tap.h> +#include <linux/if_vlan.h> +#include <linux/skb_array.h> +#include <linux/skbuff.h> + +#include <net/sock.h> +#include <net/xdp.h> + +#include "vhost.h" + +static int experimental_zcopytx = 0; +module_param(experimental_zcopytx, int, 0444); +MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" + " 1 -Enable; 0 - Disable"); + +/* Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. */ +#define VHOST_NET_WEIGHT 0x80000 + +/* Max number of packets transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others with small + * pkts. + */ +#define VHOST_NET_PKT_WEIGHT 256 + +/* MAX number of TX used buffers for outstanding zerocopy */ +#define VHOST_MAX_PEND 128 +#define VHOST_GOODCOPY_LEN 256 + +/* + * For transmit, used buffer len is unused; we override it to track buffer + * status internally; used for zerocopy tx only. 
+ */ +/* Lower device DMA failed */ +#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3) +/* Lower device DMA done */ +#define VHOST_DMA_DONE_LEN ((__force __virtio32)2) +/* Lower device DMA in progress */ +#define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1) +/* Buffer unused */ +#define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0) + +#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) + +enum { + VHOST_NET_FEATURES = VHOST_FEATURES | + (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) +}; + +enum { + VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) +}; + +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + +struct vhost_net_ubuf_ref { + /* refcount follows semantics similar to kref: + * 0: object is released + * 1: no outstanding ubufs + * >1: outstanding ubufs + */ + atomic_t refcount; + wait_queue_head_t wait; + struct vhost_virtqueue *vq; +}; + +#define VHOST_NET_BATCH 64 +struct vhost_net_buf { + void **queue; + int tail; + int head; +}; + +struct vhost_net_virtqueue { + struct vhost_virtqueue vq; + size_t vhost_hlen; + size_t sock_hlen; + /* vhost zerocopy support fields below: */ + /* last used idx for outstanding DMA zerocopy buffers */ + int upend_idx; + /* For TX, first used idx for DMA done zerocopy buffers + * For RX, number of batched heads + */ + int done_idx; + /* an array of userspace buffers info */ + struct ubuf_info *ubuf_info; + /* Reference counting for outstanding ubufs. + * Protected by vq mutex. Writers must also take device mutex. */ + struct vhost_net_ubuf_ref *ubufs; + struct ptr_ring *rx_ring; + struct vhost_net_buf rxq; +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX]; + struct vhost_poll poll[VHOST_NET_VQ_MAX]; + /* Number of TX recently submitted. + * Protected by tx vq lock. 
*/ + unsigned tx_packets; + /* Number of times zerocopy TX recently failed. + * Protected by tx vq lock. */ + unsigned tx_zcopy_err; + /* Flush in progress. Protected by tx vq lock. */ + bool tx_flush; +}; + +static unsigned vhost_net_zcopy_mask __read_mostly; + +static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq) +{ + if (rxq->tail != rxq->head) + return rxq->queue[rxq->head]; + else + return NULL; +} + +static int vhost_net_buf_get_size(struct vhost_net_buf *rxq) +{ + return rxq->tail - rxq->head; +} + +static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq) +{ + return rxq->tail == rxq->head; +} + +static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) +{ + void *ret = vhost_net_buf_get_ptr(rxq); + ++rxq->head; + return ret; +} + +static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + rxq->head = 0; + rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, + VHOST_NET_BATCH); + return rxq->tail; +} + +static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { + ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, + vhost_net_buf_get_size(rxq), + tun_ptr_free); + rxq->head = rxq->tail = 0; + } +} + +static int vhost_net_buf_peek_len(void *ptr) +{ + if (tun_is_xdp_frame(ptr)) { + struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); + + return xdpf->len; + } + + return __skb_array_len_with_tag(ptr); +} + +static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) +{ + struct vhost_net_buf *rxq = &nvq->rxq; + + if (!vhost_net_buf_is_empty(rxq)) + goto out; + + if (!vhost_net_buf_produce(nvq)) + return 0; + +out: + return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq)); +} + +static void vhost_net_buf_init(struct vhost_net_buf *rxq) +{ + rxq->head = rxq->tail = 0; +} + +static void vhost_net_enable_zcopy(int vq) +{ + vhost_net_zcopy_mask |= 0x1 << vq; +} + +static 
struct vhost_net_ubuf_ref * +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) +{ + struct vhost_net_ubuf_ref *ubufs; + /* No zero copy backend? Nothing to count. */ + if (!zcopy) + return NULL; + ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); + if (!ubufs) + return ERR_PTR(-ENOMEM); + atomic_set(&ubufs->refcount, 1); + init_waitqueue_head(&ubufs->wait); + ubufs->vq = vq; + return ubufs; +} + +static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) +{ + int r = atomic_sub_return(1, &ubufs->refcount); + if (unlikely(!r)) + wake_up(&ubufs->wait); + return r; +} + +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) +{ + vhost_net_ubuf_put(ubufs); + wait_event(ubufs->wait, !atomic_read(&ubufs->refcount)); +} + +static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) +{ + vhost_net_ubuf_put_and_wait(ubufs); + kfree(ubufs); +} + +static void vhost_net_clear_ubuf_info(struct vhost_net *n) +{ + int i; + + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + kfree(n->vqs[i].ubuf_info); + n->vqs[i].ubuf_info = NULL; + } +} + +static int vhost_net_set_ubuf_info(struct vhost_net *n) +{ + bool zcopy; + int i; + + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + zcopy = vhost_net_zcopy_mask & (0x1 << i); + if (!zcopy) + continue; + n->vqs[i].ubuf_info = + kmalloc_array(UIO_MAXIOV, + sizeof(*n->vqs[i].ubuf_info), + GFP_KERNEL); + if (!n->vqs[i].ubuf_info) + goto err; + } + return 0; + +err: + vhost_net_clear_ubuf_info(n); + return -ENOMEM; +} + +static void vhost_net_vq_reset(struct vhost_net *n) +{ + int i; + + vhost_net_clear_ubuf_info(n); + + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].done_idx = 0; + n->vqs[i].upend_idx = 0; + n->vqs[i].ubufs = NULL; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + vhost_net_buf_init(&n->vqs[i].rxq); + } + +} + +static void vhost_net_tx_packet(struct vhost_net *net) +{ + ++net->tx_packets; + if (net->tx_packets < 1024) + return; + net->tx_packets = 0; + net->tx_zcopy_err = 0; +} 
+ +static void vhost_net_tx_err(struct vhost_net *net) +{ + ++net->tx_zcopy_err; +} + +static bool vhost_net_tx_select_zcopy(struct vhost_net *net) +{ + /* TX flush waits for outstanding DMAs to be done. + * Don't start new DMAs. + */ + return !net->tx_flush && + net->tx_packets / 64 >= net->tx_zcopy_err; +} + +static bool vhost_sock_zcopy(struct socket *sock) +{ + return unlikely(experimental_zcopytx) && + sock_flag(sock->sk, SOCK_ZEROCOPY); +} + +/* In case of DMA done not in order in lower device driver for some reason. + * upend_idx is used to track end of used idx, done_idx is used to track head + * of used idx. Once lower device DMA done contiguously, we will signal KVM + * guest used idx. + */ +static void vhost_zerocopy_signal_used(struct vhost_net *net, + struct vhost_virtqueue *vq) +{ + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + int i, add; + int j = 0; + + for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { + if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) + vhost_net_tx_err(net); + if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { + vq->heads[i].len = VHOST_DMA_CLEAR_LEN; + ++j; + } else + break; + } + while (j) { + add = min(UIO_MAXIOV - nvq->done_idx, j); + vhost_add_used_and_signal_n(vq->dev, vq, + &vq->heads[nvq->done_idx], add); + nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; + j -= add; + } +} + +static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) +{ + struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; + struct vhost_virtqueue *vq = ubufs->vq; + int cnt; + + rcu_read_lock_bh(); + + /* set len to mark this desc buffers done DMA */ + vq->heads[ubuf->desc].len = success ? + VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; + cnt = vhost_net_ubuf_put(ubufs); + + /* + * Trigger polling thread if guest stopped submitting new buffers: + * in this case, the refcount after decrement will eventually reach 1. 
+ * We also trigger polling periodically after each 16 packets + * (the value 16 here is more or less arbitrary, it's tuned to trigger + * less than 10% of times). + */ + if (cnt <= 1 || !(cnt % 16)) + vhost_poll_queue(&vq->poll); + + rcu_read_unlock_bh(); +} + +static inline unsigned long busy_clock(void) +{ + return local_clock() >> 10; +} + +static bool vhost_can_busy_poll(unsigned long endtime) +{ + return likely(!need_resched() && !time_after(busy_clock(), endtime) && + !signal_pending(current)); +} + +static void vhost_net_disable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); + if (!vq->private_data) + return; + vhost_poll_stop(poll); +} + +static int vhost_net_enable_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + struct vhost_poll *poll = n->poll + (nvq - n->vqs); + struct socket *sock; + + sock = vq->private_data; + if (!sock) + return 0; + + return vhost_poll_start(poll, sock->file); +} + +static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) +{ + struct vhost_virtqueue *vq = &nvq->vq; + struct vhost_dev *dev = vq->dev; + + if (!nvq->done_idx) + return; + + vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); + nvq->done_idx = 0; +} + +static int vhost_net_tx_get_vq_desc(struct vhost_net *net, + struct vhost_net_virtqueue *nvq, + unsigned int *out_num, unsigned int *in_num, + bool *busyloop_intr) +{ + struct vhost_virtqueue *vq = &nvq->vq; + unsigned long uninitialized_var(endtime); + int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + out_num, in_num, NULL, NULL); + + if (r == vq->num && vq->busyloop_timeout) { + if (!vhost_sock_zcopy(vq->private_data)) + vhost_net_signal_used(nvq); + preempt_disable(); + endtime = busy_clock() + vq->busyloop_timeout; + while 
(vhost_can_busy_poll(endtime)) { + if (vhost_has_work(vq->dev)) { + *busyloop_intr = true; + break; + } + if (!vhost_vq_avail_empty(vq->dev, vq)) + break; + cpu_relax(); + } + preempt_enable(); + r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + out_num, in_num, NULL, NULL); + } + + return r; +} + +static bool vhost_exceeds_maxpend(struct vhost_net *net) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; + + return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > + min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); +} + +static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter, + size_t hdr_size, int out) +{ + /* Skip header. TODO: support TSO. */ + size_t len = iov_length(vq->iov, out); + + iov_iter_init(iter, WRITE, vq->iov, out, len); + iov_iter_advance(iter, hdr_size); + + return iov_iter_count(iter); +} + +/* Fetch the next TX descriptor chain and set up msg->msg_iter over it. + * Returns the head index on success, vq->num when nothing is available, + * a negative value passed through from vhost_net_tx_get_vq_desc(), or + * -EFAULT when the chain contains "in" descriptors or no payload is left + * after skipping the vhost header. On success *len is the payload length. + */ +static int get_tx_bufs(struct vhost_net *net, + struct vhost_net_virtqueue *nvq, + struct msghdr *msg, + unsigned int *out, unsigned int *in, + size_t *len, bool *busyloop_intr) +{ + struct vhost_virtqueue *vq = &nvq->vq; + int ret; + + ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr); + + if (ret < 0 || ret == vq->num) + return ret; + + if (*in) { + vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n", + *out, *in); + return -EFAULT; + } + + /* Sanity check */ + *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out); + if (*len == 0) { + vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n", + *len, nvq->vhost_hlen); + return -EFAULT; + } + + return ret; +} + +static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len) +{ + return total_len < VHOST_NET_WEIGHT && + !vhost_vq_avail_empty(vq->dev, vq); +} + +static void handle_tx_copy(struct vhost_net *net, struct socket *sock) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; + unsigned out, in; + 
int head; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT, + }; + size_t len, total_len = 0; + int err; + int sent_pkts = 0; + + do { + bool busyloop_intr = false; + + head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, + &busyloop_intr); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) { + if (unlikely(busyloop_intr)) { + vhost_poll_queue(&vq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, + vq))) { + vhost_disable_notify(&net->dev, vq); + continue; + } + break; + } + + vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); + vq->heads[nvq->done_idx].len = 0; + + total_len += len; + if (tx_can_batch(vq, total_len)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags &= ~MSG_MORE; + + /* TODO: Check specific error and bomb out unless ENOBUFS? */ + err = sock->ops->sendmsg(sock, &msg, len); + if (unlikely(err < 0)) { + vhost_discard_vq_desc(vq, 1); + vhost_net_enable_vq(net, vq); + break; + } + if (err != len) + pr_debug("Truncated TX packet: len %d != %zd\n", + err, len); + if (++nvq->done_idx >= VHOST_NET_BATCH) + vhost_net_signal_used(nvq); + } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + + vhost_net_signal_used(nvq); +} + +static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; + unsigned out, in; + int head; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT, + }; + size_t len, total_len = 0; + int err; + struct vhost_net_ubuf_ref *uninitialized_var(ubufs); + struct ubuf_info *ubuf; + bool zcopy_used; + int sent_pkts = 0; + + do { + bool busyloop_intr; + + /* Release DMAs done buffers first */ + 
vhost_zerocopy_signal_used(net, vq); + + busyloop_intr = false; + head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, + &busyloop_intr); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) { + if (unlikely(busyloop_intr)) { + vhost_poll_queue(&vq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { + vhost_disable_notify(&net->dev, vq); + continue; + } + break; + } + + zcopy_used = len >= VHOST_GOODCOPY_LEN + && !vhost_exceeds_maxpend(net) + && vhost_net_tx_select_zcopy(net); + + /* use msg_control to pass vhost zerocopy ubuf info to skb */ + if (zcopy_used) { + ubuf = nvq->ubuf_info + nvq->upend_idx; + vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); + vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; + ubuf->callback = vhost_zerocopy_callback; + ubuf->ctx = nvq->ubufs; + ubuf->desc = nvq->upend_idx; + refcount_set(&ubuf->refcnt, 1); + msg.msg_control = ubuf; + msg.msg_controllen = sizeof(ubuf); + ubufs = nvq->ubufs; + atomic_inc(&ubufs->refcount); + nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; + } else { + msg.msg_control = NULL; + ubufs = NULL; + } + total_len += len; + if (tx_can_batch(vq, total_len) && + likely(!vhost_exceeds_maxpend(net))) { + msg.msg_flags |= MSG_MORE; + } else { + msg.msg_flags &= ~MSG_MORE; + } + + /* TODO: Check specific error and bomb out unless ENOBUFS? 
*/ + err = sock->ops->sendmsg(sock, &msg, len); + if (unlikely(err < 0)) { + if (zcopy_used) { + if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) + vhost_net_ubuf_put(ubufs); + nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) + % UIO_MAXIOV; + } + vhost_discard_vq_desc(vq, 1); + vhost_net_enable_vq(net, vq); + break; + } + if (err != len) + pr_debug("Truncated TX packet: " + " len %d != %zd\n", err, len); + if (!zcopy_used) + vhost_add_used_and_signal(&net->dev, vq, head, 0); + else + vhost_zerocopy_signal_used(net, vq); + vhost_net_tx_packet(net); + } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); +} + +/* Expects to be always run from workqueue - which acts as + * read-side critical section for our kind of RCU. */ +static void handle_tx(struct vhost_net *net) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *vq = &nvq->vq; + struct socket *sock; + + mutex_lock(&vq->mutex); + sock = vq->private_data; + if (!sock) + goto out; + + if (!vq_iotlb_prefetch(vq)) + goto out; + + vhost_disable_notify(&net->dev, vq); + vhost_net_disable_vq(net, vq); + + if (vhost_sock_zcopy(sock)) + handle_tx_zerocopy(net, sock); + else + handle_tx_copy(net, sock); + +out: + mutex_unlock(&vq->mutex); +} + +/* Length of the packet at the head of the RX queue, or 0 if none is queued. + * Uses the batched rx_ring when one is attached; otherwise peeks the socket + * receive queue under its lock, adding VLAN_HLEN when a VLAN tag is present. + */ +static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) +{ + struct sk_buff *head; + int len = 0; + unsigned long flags; + + if (rvq->rx_ring) + return vhost_net_buf_peek(rvq); + + spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); + head = skb_peek(&sk->sk_receive_queue); + if (likely(head)) { + len = head->len; + if (skb_vlan_tag_present(head)) + len += VLAN_HLEN; + } + + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); + return len; +} + +/* Non-zero when the socket has RX data pending: the socket's ->peek_len() + * result when it provides one, otherwise whether the receive queue is + * non-empty. + */ +static int sk_has_rx_data(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sock->ops->peek_len) + return sock->ops->peek_len(sock); + + /* "has data" means the queue is NOT empty */ + return !skb_queue_empty(&sk->sk_receive_queue); +} + +static int vhost_net_rx_peek_head_len(struct 
vhost_net *net, struct sock *sk, + bool *busyloop_intr) +{ + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; + struct vhost_virtqueue *rvq = &rnvq->vq; + struct vhost_virtqueue *tvq = &tnvq->vq; + unsigned long uninitialized_var(endtime); + int len = peek_head_len(rnvq, sk); + + if (!len && tvq->busyloop_timeout) { + /* Flush batched heads first */ + vhost_net_signal_used(rnvq); + /* Both tx vq and rx socket were polled here */ + mutex_lock_nested(&tvq->mutex, 1); + vhost_disable_notify(&net->dev, tvq); + + preempt_disable(); + endtime = busy_clock() + tvq->busyloop_timeout; + + while (vhost_can_busy_poll(endtime)) { + if (vhost_has_work(&net->dev)) { + *busyloop_intr = true; + break; + } + if ((sk_has_rx_data(sk) && + !vhost_vq_avail_empty(&net->dev, rvq)) || + !vhost_vq_avail_empty(&net->dev, tvq)) + break; + cpu_relax(); + } + + preempt_enable(); + + if (!vhost_vq_avail_empty(&net->dev, tvq)) { + vhost_poll_queue(&tvq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) { + vhost_disable_notify(&net->dev, tvq); + vhost_poll_queue(&tvq->poll); + } + + mutex_unlock(&tvq->mutex); + + len = peek_head_len(rnvq, sk); + } + + return len; +} + +/* This is a multi-buffer version of vhost_get_desc, that works if + * vq has read descriptors only. 
+ * @vq - the relevant virtqueue + * @heads - array filled with one used element (id, len) per buffer head taken + * @datalen - data length we'll be reading + * @iovcount - returned count of io vectors we fill + * @log - vhost log + * @log_num - log offset + * @quota - headcount quota, 1 for big buffer + * returns number of buffer heads allocated, 0 if no descriptor is available, + * UIO_MAXIOV + 1 if the quota ran out before datalen was covered (overrun), + * negative on error. In every case other than success the heads taken so + * far are handed back with vhost_discard_vq_desc(). + */ +static int get_rx_bufs(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + int datalen, + unsigned *iovcount, + struct vhost_log *log, + unsigned *log_num, + unsigned int quota) +{ + unsigned int out, in; + int seg = 0; + int headcount = 0; + unsigned d; + int r, nlogs = 0; + /* len is always initialized before use since we are always called with + * datalen > 0. + */ + u32 uninitialized_var(len); + + while (datalen > 0 && headcount < quota) { + if (unlikely(seg >= UIO_MAXIOV)) { + r = -ENOBUFS; + goto err; + } + r = vhost_get_vq_desc(vq, vq->iov + seg, + ARRAY_SIZE(vq->iov) - seg, &out, + &in, log, log_num); + if (unlikely(r < 0)) + goto err; + + d = r; + if (d == vq->num) { /* no descriptor available: report zero heads */ + r = 0; + goto err; + } + if (unlikely(out || in <= 0)) { + vq_err(vq, "unexpected descriptor format for RX: " + "out %d, in %d\n", out, in); + r = -EINVAL; + goto err; + } + if (unlikely(log)) { + nlogs += *log_num; + log += *log_num; + } + heads[headcount].id = cpu_to_vhost32(vq, d); + len = iov_length(vq->iov + seg, in); + heads[headcount].len = cpu_to_vhost32(vq, len); + datalen -= len; + ++headcount; + seg += in; + } + /* datalen now holds the remainder after the last head; fold it into that head's recorded length */ + heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); + *iovcount = seg; + if (unlikely(log)) + *log_num = nlogs; + + /* Detect overrun */ + if (unlikely(datalen > 0)) { + r = UIO_MAXIOV + 1; + goto err; + } + return headcount; +err: + vhost_discard_vq_desc(vq, headcount); + return r; +} + +/* Expects to be always run from workqueue - which acts as + * read-side critical section for our kind of RCU. 
*/ +static void handle_rx(struct vhost_net *net) +{ + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; + struct vhost_virtqueue *vq = &nvq->vq; + unsigned uninitialized_var(in), log; + struct vhost_log *vq_log; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. */ + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT, + }; + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + size_t total_len = 0; + int err, mergeable; + s16 headcount; + size_t vhost_hlen, sock_hlen; + size_t vhost_len, sock_len; + bool busyloop_intr = false; + struct socket *sock; + struct iov_iter fixup; + __virtio16 num_buffers; + int recv_pkts = 0; + + mutex_lock_nested(&vq->mutex, 0); + sock = vq->private_data; + if (!sock) + goto out; + + if (!vq_iotlb_prefetch(vq)) + goto out; + + vhost_disable_notify(&net->dev, vq); + vhost_net_disable_vq(net, vq); + + vhost_hlen = nvq->vhost_hlen; + sock_hlen = nvq->sock_hlen; + + vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? + vq->log : NULL; + mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + + do { + sock_len = vhost_net_rx_peek_head_len(net, sock->sk, + &busyloop_intr); + if (!sock_len) + break; + sock_len += sock_hlen; + vhost_len = sock_len + vhost_hlen; + headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, + vhost_len, &in, vq_log, &log, + likely(mergeable) ? UIO_MAXIOV : 1); + /* On error, stop handling until the next kick. */ + if (unlikely(headcount < 0)) + goto out; + /* OK, now we need to know about added descriptors. */ + if (!headcount) { + if (unlikely(busyloop_intr)) { + vhost_poll_queue(&vq->poll); + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) { + /* They have slipped one in as we were + * doing that: check again. */ + vhost_disable_notify(&net->dev, vq); + continue; + } + /* Nothing new? Wait for eventfd to tell us + * they refilled. 
*/ + goto out; + } + busyloop_intr = false; + if (nvq->rx_ring) + msg.msg_control = vhost_net_buf_consume(&nvq->rxq); + /* On overrun, truncate and discard */ + if (unlikely(headcount > UIO_MAXIOV)) { + iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); + err = sock->ops->recvmsg(sock, &msg, + 1, MSG_DONTWAIT | MSG_TRUNC); + pr_debug("Discarded rx packet: len %zd\n", sock_len); + continue; + } + /* We don't need to be notified again. */ + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); + fixup = msg.msg_iter; + if (unlikely((vhost_hlen))) { + /* We will supply the header ourselves + * TODO: support TSO. + */ + iov_iter_advance(&msg.msg_iter, vhost_hlen); + } + err = sock->ops->recvmsg(sock, &msg, + sock_len, MSG_DONTWAIT | MSG_TRUNC); + /* Userspace might have consumed the packet meanwhile: + * it's not supposed to do this usually, but might be hard + * to prevent. Discard data we got (if any) and keep going. */ + if (unlikely(err != sock_len)) { + pr_debug("Discarded rx packet: " + " len %d, expected %zd\n", err, sock_len); + vhost_discard_vq_desc(vq, headcount); + continue; + } + /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ + if (unlikely(vhost_hlen)) { + if (copy_to_iter(&hdr, sizeof(hdr), + &fixup) != sizeof(hdr)) { + vq_err(vq, "Unable to write vnet_hdr " + "at addr %p\n", vq->iov->iov_base); + goto out; + } + } else { + /* Header came from socket; we'll need to patch + * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF + */ + iov_iter_advance(&fixup, sizeof(hdr)); + } + /* TODO: Should check and handle checksum. 
*/ + + num_buffers = cpu_to_vhost16(vq, headcount); + if (likely(mergeable) && + copy_to_iter(&num_buffers, sizeof num_buffers, + &fixup) != sizeof num_buffers) { + vq_err(vq, "Failed num_buffers write"); + vhost_discard_vq_desc(vq, headcount); + goto out; + } + nvq->done_idx += headcount; + if (nvq->done_idx > VHOST_NET_BATCH) + vhost_net_signal_used(nvq); + if (unlikely(vq_log)) + vhost_log_write(vq, vq_log, log, vhost_len, + vq->iov, in); + total_len += vhost_len; + } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len))); + + if (unlikely(busyloop_intr)) + vhost_poll_queue(&vq->poll); + else if (!sock_len) + vhost_net_enable_vq(net, vq); +out: + vhost_net_signal_used(nvq); + mutex_unlock(&vq->mutex); +} + +static void handle_tx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + + handle_tx(net); +} + +static void handle_rx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + + handle_rx(net); +} + +static void handle_tx_net(struct vhost_work *work) +{ + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_TX].work); + handle_tx(net); +} + +static void handle_rx_net(struct vhost_work *work) +{ + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_RX].work); + handle_rx(net); +} + +static int vhost_net_open(struct inode *inode, struct file *f) +{ + struct vhost_net *n; + struct vhost_dev *dev; + struct vhost_virtqueue **vqs; + void **queue; + int i; + + n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!n) + return -ENOMEM; + vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + kvfree(n); + return -ENOMEM; + } + + queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *), + 
GFP_KERNEL); + if (!queue) { + kfree(vqs); + kvfree(n); + return -ENOMEM; + } + n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue; + + dev = &n->dev; + vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; + vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; + n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; + for (i = 0; i < VHOST_NET_VQ_MAX; i++) { + n->vqs[i].ubufs = NULL; + n->vqs[i].ubuf_info = NULL; + n->vqs[i].upend_idx = 0; + n->vqs[i].done_idx = 0; + n->vqs[i].vhost_hlen = 0; + n->vqs[i].sock_hlen = 0; + n->vqs[i].rx_ring = NULL; + vhost_net_buf_init(&n->vqs[i].rxq); + } + vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, + UIO_MAXIOV + VHOST_NET_BATCH, + VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT); + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev); + + f->private_data = n; + + return 0; +} + +static struct socket *vhost_net_stop_vq(struct vhost_net *n, + struct vhost_virtqueue *vq) +{ + struct socket *sock; + struct vhost_net_virtqueue *nvq = + container_of(vq, struct vhost_net_virtqueue, vq); + + mutex_lock(&vq->mutex); + sock = vq->private_data; + vhost_net_disable_vq(n, vq); + vq->private_data = NULL; + vhost_net_buf_unproduce(nvq); + nvq->rx_ring = NULL; + mutex_unlock(&vq->mutex); + return sock; +} + +static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, + struct socket **rx_sock) +{ + *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); + *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); +} + +static void vhost_net_flush_vq(struct vhost_net *n, int index) +{ + vhost_poll_flush(n->poll + index); + vhost_poll_flush(&n->vqs[index].vq.poll); +} + +static void vhost_net_flush(struct vhost_net *n) +{ + vhost_net_flush_vq(n, VHOST_NET_VQ_TX); + vhost_net_flush_vq(n, VHOST_NET_VQ_RX); + if (n->vqs[VHOST_NET_VQ_TX].ubufs) { + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 
+ n->tx_flush = true; + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); + /* Wait for all lower device DMAs to complete. */ + vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); + mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); + n->tx_flush = false; + atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1); + mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); + } +} + +/* + * Release handler for the vhost-net file: stops both virtqueues, flushes + * outstanding work, stops and cleans up the vhost device, drops the backend + * socket references and frees the vhost_net state. Always returns 0. + */ +static int vhost_net_release(struct inode *inode, struct file *f) +{ + struct vhost_net *n = f->private_data; + struct socket *tx_sock; + struct socket *rx_sock; + + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + vhost_dev_stop(&n->dev); + vhost_dev_cleanup(&n->dev); + vhost_net_vq_reset(n); + if (tx_sock) + sockfd_put(tx_sock); + if (rx_sock) + sockfd_put(rx_sock); + /* Make sure no callbacks are outstanding */ + synchronize_rcu_bh(); + /* We do an extra flush before freeing memory, + * since jobs can re-queue themselves. */ + vhost_net_flush(n); + kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); + kfree(n->dev.vqs); + kvfree(n); + return 0; +} + +/* + * Look up fd as a socket and accept it only if it is a SOCK_RAW socket of + * family AF_PACKET. Returns the socket, or an ERR_PTR() after dropping the + * reference taken by sockfd_lookup(). + */ +static struct socket *get_raw_socket(int fd) +{ + int r; + struct socket *sock = sockfd_lookup(fd, &r); + + if (!sock) + return ERR_PTR(-ENOTSOCK); + + /* Parameter checking: socket type first, then address family */ + if (sock->sk->sk_type != SOCK_RAW) { + r = -ESOCKTNOSUPPORT; + goto err; + } + + if (sock->sk->sk_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto err; + } + return sock; +err: + sockfd_put(sock); + return ERR_PTR(r); +} + +/* + * Return the ring reported by tun_get_tx_ring() or, failing that, by + * tap_get_ptr_ring(). Returns NULL (never an ERR_PTR) when both fail. + */ +static struct ptr_ring *get_tap_ptr_ring(struct file *file) +{ + struct ptr_ring *ring; + ring = tun_get_tx_ring(file); + if (!IS_ERR(ring)) + goto out; + ring = tap_get_ptr_ring(file); + if (!IS_ERR(ring)) + goto out; + ring = NULL; +out: + return ring; +} + +/* + * Resolve fd to a socket through tun_get_socket() or tap_get_socket(). + * The file reference taken by fget() is kept on success and dropped only + * when both lookups fail, in which case the ERR_PTR() from + * tap_get_socket() is returned. + */ +static struct socket *get_tap_socket(int fd) +{ + struct file *file = fget(fd); + struct socket *sock; + + if (!file) + return ERR_PTR(-EBADF); + sock = tun_get_socket(file); + if (!IS_ERR(sock)) + return sock; + sock = tap_get_socket(file); + if (IS_ERR(sock)) + fput(file); + return sock; +} + 
+/* + * Map a backend fd to a socket: -1 yields NULL (no backend), otherwise a + * raw packet socket is tried first and a tun/tap socket second. Any other + * kind of fd yields ERR_PTR(-ENOTSOCK). + */ +static struct socket *get_socket(int fd) +{ + struct socket *sock; + + /* special case to disable backend */ + if (fd == -1) + return NULL; + sock = get_raw_socket(fd); + if (!IS_ERR(sock)) + return sock; + sock = get_tap_socket(fd); + if (!IS_ERR(sock)) + return sock; + return ERR_PTR(-ENOTSOCK); +} + +/* + * Install the socket referred to by fd as the backend of virtqueue index, + * or remove the backend when fd is -1. Takes the device mutex and, nested + * inside it, the virtqueue mutex. + */ +static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) +{ + struct socket *sock, *oldsock; + struct vhost_virtqueue *vq; + struct vhost_net_virtqueue *nvq; + struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL; + int r; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto err; + + if (index >= VHOST_NET_VQ_MAX) { + r = -ENOBUFS; + goto err; + } + vq = &n->vqs[index].vq; + nvq = &n->vqs[index]; + mutex_lock(&vq->mutex); + + /* Verify that the ring has been set up correctly. */ + if (!vhost_vq_access_ok(vq)) { + r = -EFAULT; + goto err_vq; + } + sock = get_socket(fd); + if (IS_ERR(sock)) { + r = PTR_ERR(sock); + goto err_vq; + } + + /* start polling new socket */ + oldsock = vq->private_data; + if (sock != oldsock) { + ubufs = vhost_net_ubuf_alloc(vq, + sock && vhost_sock_zcopy(sock)); + if (IS_ERR(ubufs)) { + r = PTR_ERR(ubufs); + goto err_ubufs; + } + + vhost_net_disable_vq(n, vq); + vq->private_data = sock; + vhost_net_buf_unproduce(nvq); + r = vhost_vq_init_access(vq); + if (r) + goto err_used; + r = vhost_net_enable_vq(n, vq); + if (r) + goto err_used; + if (index == VHOST_NET_VQ_RX) { + if (sock) + nvq->rx_ring = get_tap_ptr_ring(sock->file); + else + nvq->rx_ring = NULL; + } + + oldubufs = nvq->ubufs; + nvq->ubufs = ubufs; + + n->tx_packets = 0; + n->tx_zcopy_err = 0; + n->tx_flush = false; + } + + mutex_unlock(&vq->mutex); + + /* The wait on the old ubufs is done with the vq mutex released */ + if (oldubufs) { + vhost_net_ubuf_put_wait_and_free(oldubufs); + mutex_lock(&vq->mutex); + vhost_zerocopy_signal_used(n, vq); + mutex_unlock(&vq->mutex); + } + + if (oldsock) { + vhost_net_flush_vq(n, index); + sockfd_put(oldsock); + } + + mutex_unlock(&n->dev.mutex); + return 0; + 
+err_used: + vq->private_data = oldsock; + vhost_net_enable_vq(n, vq); + if (ubufs) + vhost_net_ubuf_put_wait_and_free(ubufs); +err_ubufs: + if (sock) + sockfd_put(sock); +err_vq: + mutex_unlock(&vq->mutex); +err: + mutex_unlock(&n->dev.mutex); + return r; +} + +static long vhost_net_reset_owner(struct vhost_net *n) +{ + struct socket *tx_sock = NULL; + struct socket *rx_sock = NULL; + long err; + struct vhost_umem *umem; + + mutex_lock(&n->dev.mutex); + err = vhost_dev_check_owner(&n->dev); + if (err) + goto done; + umem = vhost_dev_reset_owner_prepare(); + if (!umem) { + err = -ENOMEM; + goto done; + } + vhost_net_stop(n, &tx_sock, &rx_sock); + vhost_net_flush(n); + vhost_dev_stop(&n->dev); + vhost_dev_reset_owner(&n->dev, umem); + vhost_net_vq_reset(n); +done: + mutex_unlock(&n->dev.mutex); + if (tx_sock) + sockfd_put(tx_sock); + if (rx_sock) + sockfd_put(rx_sock); + return err; +} + +static int vhost_net_set_backend_features(struct vhost_net *n, u64 features) +{ + int i; + + mutex_lock(&n->dev.mutex); + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + mutex_lock(&n->vqs[i].vq.mutex); + n->vqs[i].vq.acked_backend_features = features; + mutex_unlock(&n->vqs[i].vq.mutex); + } + mutex_unlock(&n->dev.mutex); + + return 0; +} + +static int vhost_net_set_features(struct vhost_net *n, u64 features) +{ + size_t vhost_hlen, sock_hlen, hdr_len; + int i; + + hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_VERSION_1))) ? 
+ sizeof(struct virtio_net_hdr_mrg_rxbuf) : + sizeof(struct virtio_net_hdr); + if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { + /* vhost provides vnet_hdr */ + vhost_hlen = hdr_len; + sock_hlen = 0; + } else { + /* socket provides vnet_hdr */ + vhost_hlen = 0; + sock_hlen = hdr_len; + } + mutex_lock(&n->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&n->dev)) + goto out_unlock; + + if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) { + if (vhost_init_device_iotlb(&n->dev, true)) + goto out_unlock; + } + + for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { + mutex_lock(&n->vqs[i].vq.mutex); + n->vqs[i].vq.acked_features = features; + n->vqs[i].vhost_hlen = vhost_hlen; + n->vqs[i].sock_hlen = sock_hlen; + mutex_unlock(&n->vqs[i].vq.mutex); + } + mutex_unlock(&n->dev.mutex); + return 0; + +out_unlock: + mutex_unlock(&n->dev.mutex); + return -EFAULT; +} + +static long vhost_net_set_owner(struct vhost_net *n) +{ + int r; + + mutex_lock(&n->dev.mutex); + if (vhost_dev_has_owner(&n->dev)) { + r = -EBUSY; + goto out; + } + r = vhost_net_set_ubuf_info(n); + if (r) + goto out; + r = vhost_dev_set_owner(&n->dev); + if (r) + vhost_net_clear_ubuf_info(n); + vhost_net_flush(n); +out: + mutex_unlock(&n->dev.mutex); + return r; +} + +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_net *n = f->private_data; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + struct vhost_vring_file backend; + u64 features; + int r; + + switch (ioctl) { + case VHOST_NET_SET_BACKEND: + if (copy_from_user(&backend, argp, sizeof backend)) + return -EFAULT; + return vhost_net_set_backend(n, backend.index, backend.fd); + case VHOST_GET_FEATURES: + features = VHOST_NET_FEATURES; + if (copy_to_user(featurep, &features, sizeof features)) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, featurep, sizeof features)) + return -EFAULT; + if (features & 
~VHOST_NET_FEATURES) + return -EOPNOTSUPP; + return vhost_net_set_features(n, features); + case VHOST_GET_BACKEND_FEATURES: + features = VHOST_NET_BACKEND_FEATURES; + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_BACKEND_FEATURES: + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + if (features & ~VHOST_NET_BACKEND_FEATURES) + return -EOPNOTSUPP; + return vhost_net_set_backend_features(n, features); + case VHOST_RESET_OWNER: + return vhost_net_reset_owner(n); + case VHOST_SET_OWNER: + return vhost_net_set_owner(n); + default: + mutex_lock(&n->dev.mutex); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&n->dev, ioctl, argp); + else + vhost_net_flush(n); + mutex_unlock(&n->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + int noblock = file->f_flags & O_NONBLOCK; + + return vhost_chr_read_iter(dev, to, noblock); +} + +static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_write_iter(dev, from); +} + +static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait) +{ + struct vhost_net *n = file->private_data; + struct vhost_dev *dev = &n->dev; + + return vhost_chr_poll(file, dev, wait); +} + +static const struct file_operations vhost_net_fops = { + .owner = THIS_MODULE, + .release = vhost_net_release, + .read_iter = vhost_net_chr_read_iter, + .write_iter = vhost_net_chr_write_iter, + 
.poll = vhost_net_chr_poll, + .unlocked_ioctl = vhost_net_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_net_compat_ioctl, +#endif + .open = vhost_net_open, + .llseek = noop_llseek, +}; + +static struct miscdevice vhost_net_misc = { + .minor = VHOST_NET_MINOR, + .name = "vhost-net", + .fops = &vhost_net_fops, +}; + +static int vhost_net_init(void) +{ + if (experimental_zcopytx) + vhost_net_enable_zcopy(VHOST_NET_VQ_TX); + return misc_register(&vhost_net_misc); +} +module_init(vhost_net_init); + +static void vhost_net_exit(void) +{ + misc_deregister(&vhost_net_misc); +} +module_exit(vhost_net_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); +MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR); +MODULE_ALIAS("devname:vhost-net"); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c new file mode 100644 index 000000000..5e298d928 --- /dev/null +++ b/drivers/vhost/scsi.c @@ -0,0 +1,2151 @@ +/******************************************************************************* + * Vhost kernel TCM fabric driver for virtio SCSI initiators + * + * (C) Copyright 2010-2013 Datera, Inc. + * (C) Copyright 2010-2012 IBM Corp. + * + * Licensed to the Linux Foundation under the General Public License (GPL) version 2. + * + * Authors: Nicholas A. Bellinger <nab@daterainc.com> + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + ****************************************************************************/ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <generated/utsrelease.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/configfs.h> +#include <linux/ctype.h> +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <asm/unaligned.h> +#include <scsi/scsi_common.h> +#include <scsi/scsi_proto.h> +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <linux/vhost.h> +#include <linux/virtio_scsi.h> +#include <linux/llist.h> +#include <linux/bitmap.h> + +#include "vhost.h" + +#define VHOST_SCSI_VERSION "v0.1" +#define VHOST_SCSI_NAMELEN 256 +#define VHOST_SCSI_MAX_CDB_SIZE 32 +#define VHOST_SCSI_DEFAULT_TAGS 256 +#define VHOST_SCSI_PREALLOC_SGLS 2048 +#define VHOST_SCSI_PREALLOC_UPAGES 2048 +#define VHOST_SCSI_PREALLOC_PROT_SGLS 2048 + +/* Max number of requests before requeueing the job. + * Using this limit prevents one virtqueue from starving others with + * request. 
+ */ +#define VHOST_SCSI_WEIGHT 256 + +struct vhost_scsi_inflight { + /* Wait for the flush operation to finish */ + struct completion comp; + /* Refcount for the inflight reqs */ + struct kref kref; +}; + +struct vhost_scsi_cmd { + /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ + int tvc_vq_desc; + /* virtio-scsi initiator task attribute */ + int tvc_task_attr; + /* virtio-scsi response incoming iovecs */ + int tvc_in_iovs; + /* virtio-scsi initiator data direction */ + enum dma_data_direction tvc_data_direction; + /* Expected data transfer length from virtio-scsi header */ + u32 tvc_exp_data_len; + /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */ + u64 tvc_tag; + /* The number of scatterlists associated with this cmd */ + u32 tvc_sgl_count; + u32 tvc_prot_sgl_count; + /* Saved unpacked SCSI LUN for vhost_scsi_submission_work() */ + u32 tvc_lun; + /* Pointer to the SGL formatted memory from virtio-scsi */ + struct scatterlist *tvc_sgl; + struct scatterlist *tvc_prot_sgl; + struct page **tvc_upages; + /* Pointer to response header iovec */ + struct iovec tvc_resp_iov; + /* Pointer to vhost_scsi for our device */ + struct vhost_scsi *tvc_vhost; + /* Pointer to vhost_virtqueue for the cmd */ + struct vhost_virtqueue *tvc_vq; + /* Pointer to vhost nexus memory */ + struct vhost_scsi_nexus *tvc_nexus; + /* The TCM I/O descriptor that is accessed via container_of() */ + struct se_cmd tvc_se_cmd; + /* work item used for cmwq dispatch to vhost_scsi_submission_work() */ + struct work_struct work; + /* Copy of the incoming SCSI command descriptor block (CDB) */ + unsigned char tvc_cdb[VHOST_SCSI_MAX_CDB_SIZE]; + /* Sense buffer that will be mapped into outgoing status */ + unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; + /* Completed commands list, serviced from vhost worker thread */ + struct llist_node tvc_completion_list; + /* Used to track inflight cmd */ + struct vhost_scsi_inflight *inflight; +}; + +struct 
vhost_scsi_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct vhost_scsi_tpg { + /* Vhost port target portal group tag for TCM */ + u16 tport_tpgt; + /* Number of TPG Port/Lun links, tracked with respect to explicit I_T Nexus shutdown */ + int tv_tpg_port_count; + /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_vhost_count; + /* Used for enabling T10-PI with legacy devices */ + int tv_fabric_prot_type; + /* List node for vhost_scsi_list */ + struct list_head tv_tpg_list; + /* Protects access to tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */ + struct vhost_scsi_nexus *tpg_nexus; + /* Pointer back to vhost_scsi_tport */ + struct vhost_scsi_tport *tport; + /* Returned by vhost_scsi_make_tpg() */ + struct se_portal_group se_tpg; + /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */ + struct vhost_scsi *vhost_scsi; +}; + +struct vhost_scsi_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for Vhost Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for Vhost Target port */ + char tport_name[VHOST_SCSI_NAMELEN]; + /* Returned by vhost_scsi_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct vhost_scsi_evt { + /* event to be sent to guest */ + struct virtio_scsi_event event; + /* event list, serviced from vhost worker thread */ + struct llist_node list; +}; + +/* Virtqueue index constants */ +enum { + VHOST_SCSI_VQ_CTL = 0, + VHOST_SCSI_VQ_EVT = 1, + VHOST_SCSI_VQ_IO = 2, +}; + +/* Note: can't set VIRTIO_F_VERSION_1 yet, since that implies ANY_LAYOUT. 
*/ +enum { + VHOST_SCSI_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_SCSI_F_HOTPLUG) | + (1ULL << VIRTIO_SCSI_F_T10_PI) +}; + +#define VHOST_SCSI_MAX_TARGET 256 +#define VHOST_SCSI_MAX_VQ 128 +#define VHOST_SCSI_MAX_EVENT 128 + +struct vhost_scsi_virtqueue { + struct vhost_virtqueue vq; + /* + * Reference counting for inflight reqs, used for flush operation. At + * any time, one entry tracks newly submitted commands, while a flush + * may wait for the other one to reach 0. + */ + struct vhost_scsi_inflight inflights[2]; + /* + * Index of the inflights[] entry currently in use, protected by + * vq->mutex. Writers must also take dev mutex and flush under it. + */ + int inflight_idx; +}; + +struct vhost_scsi { + /* Protected by vhost_scsi->dev.mutex */ + struct vhost_scsi_tpg **vs_tpg; + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + + struct vhost_dev dev; + struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ]; + + struct vhost_work vs_completion_work; /* cmd completion work item */ + struct llist_head vs_completion_list; /* cmd completion queue */ + + struct vhost_work vs_event_work; /* evt injection work item */ + struct llist_head vs_event_list; /* evt injection queue */ + + bool vs_events_missed; /* any missed events, protected by vq->mutex */ + int vs_events_nr; /* num of pending events, protected by vq->mutex */ +}; + +static struct workqueue_struct *vhost_scsi_workqueue; + +/* Global mutex protecting the vhost_scsi TPG list for vhost IOCTL access */ +static DEFINE_MUTEX(vhost_scsi_mutex); +static LIST_HEAD(vhost_scsi_list); + +/* kref release callback: completes the completion embedded in the inflight entry */ +static void vhost_scsi_done_inflight(struct kref *kref) +{ + struct vhost_scsi_inflight *inflight; + + inflight = container_of(kref, struct vhost_scsi_inflight, kref); + complete(&inflight->comp); +} + +/* + * For every virtqueue, under vq->mutex: optionally record the inflight + * entry currently in use in old_inflight[] (which may be NULL), then make + * the other entry current and re-initialise its kref and completion. + */ +static void vhost_scsi_init_inflight(struct vhost_scsi *vs, + struct vhost_scsi_inflight *old_inflight[]) +{ + struct vhost_scsi_inflight *new_inflight; + struct vhost_virtqueue *vq; + int idx, i; + + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + + 
mutex_lock(&vq->mutex); + + /* store old inflight */ + idx = vs->vqs[i].inflight_idx; + if (old_inflight) + old_inflight[i] = &vs->vqs[i].inflights[idx]; + + /* set up new inflight: flip to the other entry and reset it */ + vs->vqs[i].inflight_idx = idx ^ 1; + new_inflight = &vs->vqs[i].inflights[idx ^ 1]; + kref_init(&new_inflight->kref); + init_completion(&new_inflight->comp); + + mutex_unlock(&vq->mutex); + } +} + +/* + * Take a reference on the inflight entry currently selected by + * inflight_idx for this virtqueue and return it. + */ +static struct vhost_scsi_inflight * +vhost_scsi_get_inflight(struct vhost_virtqueue *vq) +{ + struct vhost_scsi_inflight *inflight; + struct vhost_scsi_virtqueue *svq; + + svq = container_of(vq, struct vhost_scsi_virtqueue, vq); + inflight = &svq->inflights[svq->inflight_idx]; + kref_get(&inflight->kref); + + return inflight; +} + +/* Drop a reference; the last put runs vhost_scsi_done_inflight() */ +static void vhost_scsi_put_inflight(struct vhost_scsi_inflight *inflight) +{ + kref_put(&inflight->kref, vhost_scsi_done_inflight); +} + +/* Constant predicate: always returns 1 */ +static int vhost_scsi_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +/* Constant predicate: always returns 0 */ +static int vhost_scsi_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static char *vhost_scsi_get_fabric_name(void) +{ + return "vhost"; +} + +/* Return the ASCII port name stored in the tport that owns this TPG */ +static char *vhost_scsi_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + struct vhost_scsi_tport *tport = tpg->tport; + + return &tport->tport_name[0]; +} + +/* Return the target portal group tag of this TPG */ +static u16 vhost_scsi_get_tpgt(struct se_portal_group *se_tpg) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + return tpg->tport_tpgt; +} + +/* Return the tv_fabric_prot_type value configured on this TPG */ +static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + + return tpg->tv_fabric_prot_type; +} + +static u32 vhost_scsi_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static void vhost_scsi_release_cmd(struct se_cmd *se_cmd) +{ + struct vhost_scsi_cmd *tv_cmd = container_of(se_cmd, + struct vhost_scsi_cmd, tvc_se_cmd); + struct 
se_session *se_sess = tv_cmd->tvc_nexus->tvn_se_sess; + int i; + + if (tv_cmd->tvc_sgl_count) { + for (i = 0; i < tv_cmd->tvc_sgl_count; i++) + put_page(sg_page(&tv_cmd->tvc_sgl[i])); + } + if (tv_cmd->tvc_prot_sgl_count) { + for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++) + put_page(sg_page(&tv_cmd->tvc_prot_sgl[i])); + } + + vhost_scsi_put_inflight(tv_cmd->inflight); + target_free_tag(se_sess, se_cmd); +} + +static u32 vhost_scsi_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static int vhost_scsi_write_pending(struct se_cmd *se_cmd) +{ + /* Go ahead and process the write immediately */ + target_execute_cmd(se_cmd); + return 0; +} + +static int vhost_scsi_write_pending_status(struct se_cmd *se_cmd) +{ + return 0; +} + +static void vhost_scsi_set_default_node_attrs(struct se_node_acl *nacl) +{ + return; +} + +static int vhost_scsi_get_cmd_state(struct se_cmd *se_cmd) +{ + return 0; +} + +static void vhost_scsi_complete_cmd(struct vhost_scsi_cmd *cmd) +{ + struct vhost_scsi *vs = cmd->tvc_vhost; + + llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list); + + vhost_work_queue(&vs->dev, &vs->vs_completion_work); +} + +static int vhost_scsi_queue_data_in(struct se_cmd *se_cmd) +{ + struct vhost_scsi_cmd *cmd = container_of(se_cmd, + struct vhost_scsi_cmd, tvc_se_cmd); + vhost_scsi_complete_cmd(cmd); + return 0; +} + +static int vhost_scsi_queue_status(struct se_cmd *se_cmd) +{ + struct vhost_scsi_cmd *cmd = container_of(se_cmd, + struct vhost_scsi_cmd, tvc_se_cmd); + vhost_scsi_complete_cmd(cmd); + return 0; +} + +static void vhost_scsi_queue_tm_rsp(struct se_cmd *se_cmd) +{ + return; +} + +static void vhost_scsi_aborted_task(struct se_cmd *se_cmd) +{ + return; +} + +static void vhost_scsi_free_evt(struct vhost_scsi *vs, struct vhost_scsi_evt *evt) +{ + vs->vs_events_nr--; + kfree(evt); +} + +static struct vhost_scsi_evt * +vhost_scsi_allocate_evt(struct vhost_scsi *vs, + u32 event, u32 reason) +{ + struct vhost_virtqueue *vq = 
&vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct vhost_scsi_evt *evt; + + if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) { + vs->vs_events_missed = true; + return NULL; + } + + evt = kzalloc(sizeof(*evt), GFP_KERNEL); + if (!evt) { + vq_err(vq, "Failed to allocate vhost_scsi_evt\n"); + vs->vs_events_missed = true; + return NULL; + } + + evt->event.event = cpu_to_vhost32(vq, event); + evt->event.reason = cpu_to_vhost32(vq, reason); + vs->vs_events_nr++; + + return evt; +} + +static void vhost_scsi_free_cmd(struct vhost_scsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->tvc_se_cmd; + + /* TODO locking against target/backend threads? */ + transport_generic_free_cmd(se_cmd, 0); + +} + +static int vhost_scsi_check_stop_free(struct se_cmd *se_cmd) +{ + return target_put_sess_cmd(se_cmd); +} + +static void +vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt) +{ + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct virtio_scsi_event *event = &evt->event; + struct virtio_scsi_event __user *eventp; + unsigned out, in; + int head, ret; + + if (!vq->private_data) { + vs->vs_events_missed = true; + return; + } + +again: + vhost_disable_notify(&vs->dev, vq); + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); + if (head < 0) { + vs->vs_events_missed = true; + return; + } + if (head == vq->num) { + if (vhost_enable_notify(&vs->dev, vq)) + goto again; + vs->vs_events_missed = true; + return; + } + + if ((vq->iov[out].iov_len != sizeof(struct virtio_scsi_event))) { + vq_err(vq, "Expecting virtio_scsi_event, got %zu bytes\n", + vq->iov[out].iov_len); + vs->vs_events_missed = true; + return; + } + + if (vs->vs_events_missed) { + event->event |= cpu_to_vhost32(vq, VIRTIO_SCSI_T_EVENTS_MISSED); + vs->vs_events_missed = false; + } + + eventp = vq->iov[out].iov_base; + ret = __copy_to_user(eventp, event, sizeof(*event)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, head, 0); + else + vq_err(vq, "Faulted on 
vhost_scsi_send_event\n"); +} + +static void vhost_scsi_evt_work(struct vhost_work *work) +{ + struct vhost_scsi *vs = container_of(work, struct vhost_scsi, + vs_event_work); + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + struct vhost_scsi_evt *evt, *t; + struct llist_node *llnode; + + mutex_lock(&vq->mutex); + llnode = llist_del_all(&vs->vs_event_list); + llist_for_each_entry_safe(evt, t, llnode, list) { + vhost_scsi_do_evt_work(vs, evt); + vhost_scsi_free_evt(vs, evt); + } + mutex_unlock(&vq->mutex); +} + +/* Fill in status and signal that we are done processing this command + * + * This is scheduled in the vhost work queue so we are called with the owner + * process mm and can access the vring. + */ +static void vhost_scsi_complete_cmd_work(struct vhost_work *work) +{ + struct vhost_scsi *vs = container_of(work, struct vhost_scsi, + vs_completion_work); + DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ); + struct virtio_scsi_cmd_resp v_rsp; + struct vhost_scsi_cmd *cmd, *t; + struct llist_node *llnode; + struct se_cmd *se_cmd; + struct iov_iter iov_iter; + int ret, vq; + + bitmap_zero(signal, VHOST_SCSI_MAX_VQ); + llnode = llist_del_all(&vs->vs_completion_list); + llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) { + se_cmd = &cmd->tvc_se_cmd; + + pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__, + cmd, se_cmd->residual_count, se_cmd->scsi_status); + + memset(&v_rsp, 0, sizeof(v_rsp)); + v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count); + /* TODO is status_qualifier field needed? 
*/ + v_rsp.status = se_cmd->scsi_status; + v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq, + se_cmd->scsi_sense_length); + memcpy(v_rsp.sense, cmd->tvc_sense_buf, + se_cmd->scsi_sense_length); + + iov_iter_init(&iov_iter, READ, &cmd->tvc_resp_iov, + cmd->tvc_in_iovs, sizeof(v_rsp)); + ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter); + if (likely(ret == sizeof(v_rsp))) { + struct vhost_scsi_virtqueue *q; + vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0); + q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); + vq = q - vs->vqs; + __set_bit(vq, signal); + } else + pr_err("Faulted on virtio_scsi_cmd_resp\n"); + + vhost_scsi_free_cmd(cmd); + } + + vq = -1; + while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1)) + < VHOST_SCSI_MAX_VQ) + vhost_signal(&vs->dev, &vs->vqs[vq].vq); +} + +static struct vhost_scsi_cmd * +vhost_scsi_get_tag(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, + unsigned char *cdb, u64 scsi_tag, u16 lun, u8 task_attr, + u32 exp_data_len, int data_direction) +{ + struct vhost_scsi_cmd *cmd; + struct vhost_scsi_nexus *tv_nexus; + struct se_session *se_sess; + struct scatterlist *sg, *prot_sg; + struct page **pages; + int tag, cpu; + + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + pr_err("Unable to locate active struct vhost_scsi_nexus\n"); + return ERR_PTR(-EIO); + } + se_sess = tv_nexus->tvn_se_sess; + + tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu); + if (tag < 0) { + pr_err("Unable to obtain tag for vhost_scsi_cmd\n"); + return ERR_PTR(-ENOMEM); + } + + cmd = &((struct vhost_scsi_cmd *)se_sess->sess_cmd_map)[tag]; + sg = cmd->tvc_sgl; + prot_sg = cmd->tvc_prot_sgl; + pages = cmd->tvc_upages; + memset(cmd, 0, sizeof(*cmd)); + cmd->tvc_sgl = sg; + cmd->tvc_prot_sgl = prot_sg; + cmd->tvc_upages = pages; + cmd->tvc_se_cmd.map_tag = tag; + cmd->tvc_se_cmd.map_cpu = cpu; + cmd->tvc_tag = scsi_tag; + cmd->tvc_lun = lun; + cmd->tvc_task_attr = task_attr; + cmd->tvc_exp_data_len = exp_data_len; + 
cmd->tvc_data_direction = data_direction; + cmd->tvc_nexus = tv_nexus; + cmd->inflight = vhost_scsi_get_inflight(vq); + + memcpy(cmd->tvc_cdb, cdb, VHOST_SCSI_MAX_CDB_SIZE); + + return cmd; +} + +/* + * Map a user memory range into a scatterlist + * + * Returns the number of scatterlist entries used or -errno on error. + */ +static int +vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, + struct iov_iter *iter, + struct scatterlist *sgl, + bool write) +{ + struct page **pages = cmd->tvc_upages; + struct scatterlist *sg = sgl; + ssize_t bytes; + size_t offset; + unsigned int npages = 0; + + bytes = iov_iter_get_pages(iter, pages, LONG_MAX, + VHOST_SCSI_PREALLOC_UPAGES, &offset); + /* No pages were pinned */ + if (bytes <= 0) + return bytes < 0 ? bytes : -EFAULT; + + iov_iter_advance(iter, bytes); + + while (bytes) { + unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes); + sg_set_page(sg++, pages[npages++], n, offset); + bytes -= n; + offset = 0; + } + return npages; +} + +static int +vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) +{ + int sgl_count = 0; + + if (!iter || !iter->iov) { + pr_err("%s: iter->iov is NULL, but expected bytes: %zu" + " present\n", __func__, bytes); + return -EINVAL; + } + + sgl_count = iov_iter_npages(iter, 0xffff); + if (sgl_count > max_sgls) { + pr_err("%s: requested sgl_count: %d exceeds pre-allocated" + " max_sgls: %d\n", __func__, sgl_count, max_sgls); + return -EINVAL; + } + return sgl_count; +} + +static int +vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write, + struct iov_iter *iter, + struct scatterlist *sg, int sg_count) +{ + struct scatterlist *p = sg; + int ret; + + while (iov_iter_count(iter)) { + ret = vhost_scsi_map_to_sgl(cmd, iter, sg, write); + if (ret < 0) { + while (p < sg) { + struct page *page = sg_page(p++); + if (page) + put_page(page); + } + return ret; + } + sg += ret; + } + return 0; +} + +static int +vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, + size_t prot_bytes, struct 
iov_iter *prot_iter, + size_t data_bytes, struct iov_iter *data_iter) +{ + int sgl_count, ret; + bool write = (cmd->tvc_data_direction == DMA_FROM_DEVICE); + + if (prot_bytes) { + sgl_count = vhost_scsi_calc_sgls(prot_iter, prot_bytes, + VHOST_SCSI_PREALLOC_PROT_SGLS); + if (sgl_count < 0) + return sgl_count; + + sg_init_table(cmd->tvc_prot_sgl, sgl_count); + cmd->tvc_prot_sgl_count = sgl_count; + pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__, + cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); + + ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter, + cmd->tvc_prot_sgl, + cmd->tvc_prot_sgl_count); + if (ret < 0) { + cmd->tvc_prot_sgl_count = 0; + return ret; + } + } + sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes, + VHOST_SCSI_PREALLOC_SGLS); + if (sgl_count < 0) + return sgl_count; + + sg_init_table(cmd->tvc_sgl, sgl_count); + cmd->tvc_sgl_count = sgl_count; + pr_debug("%s data_sg %p data_sgl_count %u\n", __func__, + cmd->tvc_sgl, cmd->tvc_sgl_count); + + ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter, + cmd->tvc_sgl, cmd->tvc_sgl_count); + if (ret < 0) { + cmd->tvc_sgl_count = 0; + return ret; + } + return 0; +} + +static int vhost_scsi_to_tcm_attr(int attr) +{ + switch (attr) { + case VIRTIO_SCSI_S_SIMPLE: + return TCM_SIMPLE_TAG; + case VIRTIO_SCSI_S_ORDERED: + return TCM_ORDERED_TAG; + case VIRTIO_SCSI_S_HEAD: + return TCM_HEAD_TAG; + case VIRTIO_SCSI_S_ACA: + return TCM_ACA_TAG; + default: + break; + } + return TCM_SIMPLE_TAG; +} + +static void vhost_scsi_submission_work(struct work_struct *work) +{ + struct vhost_scsi_cmd *cmd = + container_of(work, struct vhost_scsi_cmd, work); + struct vhost_scsi_nexus *tv_nexus; + struct se_cmd *se_cmd = &cmd->tvc_se_cmd; + struct scatterlist *sg_ptr, *sg_prot_ptr = NULL; + int rc; + + /* FIXME: BIDI operation */ + if (cmd->tvc_sgl_count) { + sg_ptr = cmd->tvc_sgl; + + if (cmd->tvc_prot_sgl_count) + sg_prot_ptr = cmd->tvc_prot_sgl; + else + se_cmd->prot_pto = true; + } else { + sg_ptr = NULL; + } + tv_nexus 
= cmd->tvc_nexus; + + se_cmd->tag = 0; + rc = target_submit_cmd_map_sgls(se_cmd, tv_nexus->tvn_se_sess, + cmd->tvc_cdb, &cmd->tvc_sense_buf[0], + cmd->tvc_lun, cmd->tvc_exp_data_len, + vhost_scsi_to_tcm_attr(cmd->tvc_task_attr), + cmd->tvc_data_direction, TARGET_SCF_ACK_KREF, + sg_ptr, cmd->tvc_sgl_count, NULL, 0, sg_prot_ptr, + cmd->tvc_prot_sgl_count); + if (rc < 0) { + transport_send_check_condition_and_sense(se_cmd, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0); + transport_generic_free_cmd(se_cmd, 0); + } +} + +static void +vhost_scsi_send_bad_target(struct vhost_scsi *vs, + struct vhost_virtqueue *vq, + int head, unsigned out) +{ + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + int ret; + + memset(&rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq->iov[out].iov_base; + ret = __copy_to_user(resp, &rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(&vs->dev, vq, head, 0); + else + pr_err("Faulted on virtio_scsi_cmd_resp\n"); +} + +static void +vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) +{ + struct vhost_scsi_tpg **vs_tpg, *tpg; + struct virtio_scsi_cmd_req v_req; + struct virtio_scsi_cmd_req_pi v_req_pi; + struct vhost_scsi_cmd *cmd; + struct iov_iter out_iter, in_iter, prot_iter, data_iter; + u64 tag; + u32 exp_data_len, data_direction; + unsigned int out = 0, in = 0; + int head, ret, prot_bytes, c = 0; + size_t req_size, rsp_size = sizeof(struct virtio_scsi_cmd_resp); + size_t out_size, in_size; + u16 lun; + u8 *target, *lunp, task_attr; + bool t10_pi = vhost_has_feature(vq, VIRTIO_SCSI_F_T10_PI); + void *req, *cdb; + + mutex_lock(&vq->mutex); + /* + * We can handle the vq only after the endpoint is setup by calling the + * VHOST_SCSI_SET_ENDPOINT ioctl. 
+ */ + vs_tpg = vq->private_data; + if (!vs_tpg) + goto out; + + vhost_disable_notify(&vs->dev, vq); + + do { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), &out, &in, + NULL, NULL); + pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n", + head, out, in); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) { + if (unlikely(vhost_enable_notify(&vs->dev, vq))) { + vhost_disable_notify(&vs->dev, vq); + continue; + } + break; + } + /* + * Check for a sane response buffer so we can report early + * errors back to the guest. + */ + if (unlikely(vq->iov[out].iov_len < rsp_size)) { + vq_err(vq, "Expecting at least virtio_scsi_cmd_resp" + " size, got %zu bytes\n", vq->iov[out].iov_len); + break; + } + /* + * Setup pointers and values based upon different virtio-scsi + * request header if T10_PI is enabled in KVM guest. + */ + if (t10_pi) { + req = &v_req_pi; + req_size = sizeof(v_req_pi); + lunp = &v_req_pi.lun[0]; + target = &v_req_pi.lun[1]; + } else { + req = &v_req; + req_size = sizeof(v_req); + lunp = &v_req.lun[0]; + target = &v_req.lun[1]; + } + /* + * FIXME: Not correct for BIDI operation + */ + out_size = iov_length(vq->iov, out); + in_size = iov_length(&vq->iov[out], in); + + /* + * Copy over the virtio-scsi request header, which for a + * ANY_LAYOUT enabled guest may span multiple iovecs, or a + * single iovec may contain both the header + outgoing + * WRITE payloads. + * + * copy_from_iter() will advance out_iter, so that it will + * point at the start of the outgoing WRITE payload, if + * DMA_TO_DEVICE is set. 
+ */ + iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); + + if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { + vq_err(vq, "Faulted on copy_from_iter\n"); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + /* virtio-scsi spec requires byte 0 of the lun to be 1 */ + if (unlikely(*lunp != 1)) { + vq_err(vq, "Illegal virtio-scsi lun: %u\n", *lunp); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + + tpg = READ_ONCE(vs_tpg[*target]); + if (unlikely(!tpg)) { + /* Target does not exist, fail the request */ + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + /* + * Determine data_direction by calculating the total outgoing + * iovec sizes + incoming iovec sizes vs. virtio-scsi request + + * response headers respectively. + * + * For DMA_TO_DEVICE this is out_iter, which is already pointing + * to the right place. + * + * For DMA_FROM_DEVICE, the iovec will be just past the end + * of the virtio-scsi response header in either the same + * or immediately following iovec. + * + * Any associated T10_PI bytes for the outgoing / incoming + * payloads are included in calculation of exp_data_len here. + */ + prot_bytes = 0; + + if (out_size > req_size) { + data_direction = DMA_TO_DEVICE; + exp_data_len = out_size - req_size; + data_iter = out_iter; + } else if (in_size > rsp_size) { + data_direction = DMA_FROM_DEVICE; + exp_data_len = in_size - rsp_size; + + iov_iter_init(&in_iter, READ, &vq->iov[out], in, + rsp_size + exp_data_len); + iov_iter_advance(&in_iter, rsp_size); + data_iter = in_iter; + } else { + data_direction = DMA_NONE; + exp_data_len = 0; + } + /* + * If T10_PI header + payload is present, setup prot_iter values + * and recalculate data_iter for vhost_scsi_mapal() mapping to + * host scatterlists via get_user_pages_fast(). 
+ */ + if (t10_pi) { + if (v_req_pi.pi_bytesout) { + if (data_direction != DMA_TO_DEVICE) { + vq_err(vq, "Received non zero pi_bytesout," + " but wrong data_direction\n"); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesout); + } else if (v_req_pi.pi_bytesin) { + if (data_direction != DMA_FROM_DEVICE) { + vq_err(vq, "Received non zero pi_bytesin," + " but wrong data_direction\n"); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + prot_bytes = vhost32_to_cpu(vq, v_req_pi.pi_bytesin); + } + /* + * Set prot_iter to data_iter and truncate it to + * prot_bytes, and advance data_iter past any + * preceeding prot_bytes that may be present. + * + * Also fix up the exp_data_len to reflect only the + * actual data payload length. + */ + if (prot_bytes) { + exp_data_len -= prot_bytes; + prot_iter = data_iter; + iov_iter_truncate(&prot_iter, prot_bytes); + iov_iter_advance(&data_iter, prot_bytes); + } + tag = vhost64_to_cpu(vq, v_req_pi.tag); + task_attr = v_req_pi.task_attr; + cdb = &v_req_pi.cdb[0]; + lun = ((v_req_pi.lun[2] << 8) | v_req_pi.lun[3]) & 0x3FFF; + } else { + tag = vhost64_to_cpu(vq, v_req.tag); + task_attr = v_req.task_attr; + cdb = &v_req.cdb[0]; + lun = ((v_req.lun[2] << 8) | v_req.lun[3]) & 0x3FFF; + } + /* + * Check that the received CDB size does not exceeded our + * hardcoded max for vhost-scsi, then get a pre-allocated + * cmd descriptor for the new virtio-scsi tag. + * + * TODO what if cdb was too small for varlen cdb header? 
+ */ + if (unlikely(scsi_command_size(cdb) > VHOST_SCSI_MAX_CDB_SIZE)) { + vq_err(vq, "Received SCSI CDB with command_size: %d that" + " exceeds SCSI_MAX_VARLEN_CDB_SIZE: %d\n", + scsi_command_size(cdb), VHOST_SCSI_MAX_CDB_SIZE); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + cmd = vhost_scsi_get_tag(vq, tpg, cdb, tag, lun, task_attr, + exp_data_len + prot_bytes, + data_direction); + if (IS_ERR(cmd)) { + vq_err(vq, "vhost_scsi_get_tag failed %ld\n", + PTR_ERR(cmd)); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + cmd->tvc_vhost = vs; + cmd->tvc_vq = vq; + cmd->tvc_resp_iov = vq->iov[out]; + cmd->tvc_in_iovs = in; + + pr_debug("vhost_scsi got command opcode: %#02x, lun: %d\n", + cmd->tvc_cdb[0], cmd->tvc_lun); + pr_debug("cmd: %p exp_data_len: %d, prot_bytes: %d data_direction:" + " %d\n", cmd, exp_data_len, prot_bytes, data_direction); + + if (data_direction != DMA_NONE) { + ret = vhost_scsi_mapal(cmd, + prot_bytes, &prot_iter, + exp_data_len, &data_iter); + if (unlikely(ret)) { + vq_err(vq, "Failed to map iov to sgl\n"); + vhost_scsi_release_cmd(&cmd->tvc_se_cmd); + vhost_scsi_send_bad_target(vs, vq, head, out); + continue; + } + } + /* + * Save the descriptor from vhost_get_vq_desc() to be used to + * complete the virtio-scsi request in TCM callback context via + * vhost_scsi_queue_data_in() and vhost_scsi_queue_status() + */ + cmd->tvc_vq_desc = head; + /* + * Dispatch cmd descriptor for cmwq execution in process + * context provided by vhost_scsi_workqueue. This also ensures + * cmd is executed on the same kworker CPU as this vhost + * thread to gain positive L2 cache locality effects. 
+		 */
+		INIT_WORK(&cmd->work, vhost_scsi_submission_work);
+		queue_work(vhost_scsi_workqueue, &cmd->work);
+	} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
+out:
+	mutex_unlock(&vq->mutex);
+}
+/*
+ * Kick handler for the control virtqueue.  It only emits a debug
+ * message; no request is processed here.
+ */
+static void vhost_scsi_ctl_handle_kick(struct vhost_work *work)
+{
+	pr_debug("%s: The handling func for control queue.\n", __func__);
+}
+/*
+ * Allocate an event with vhost_scsi_allocate_evt(), fill in its LUN
+ * address when both @tpg and @lun are given, then add it to
+ * vs->vs_event_list and queue vs->vs_event_work.
+ *
+ * Returns silently when the event cannot be allocated.
+ */
+static void
+vhost_scsi_send_evt(struct vhost_scsi *vs,
+		   struct vhost_scsi_tpg *tpg,
+		   struct se_lun *lun,
+		   u32 event,
+		   u32 reason)
+{
+	struct vhost_scsi_evt *evt;
+
+	evt = vhost_scsi_allocate_evt(vs, event, reason);
+	if (!evt)
+		return;
+
+	if (tpg && lun) {
+		/* TODO: share lun setup code with virtio-scsi.ko */
+		/*
+		 * Note: evt->event is zeroed when we allocate it and
+		 * lun[4-7] need to be zero according to virtio-scsi spec.
+		 */
+		evt->event.lun[0] = 0x01;
+		evt->event.lun[1] = tpg->tport_tpgt;
+		if (lun->unpacked_lun >= 256)	/* high byte only needed above 255 */
+			evt->event.lun[2] = lun->unpacked_lun >> 8 | 0x40 ;
+		evt->event.lun[3] = lun->unpacked_lun & 0xFF;
+	}
+
+	llist_add(&evt->list, &vs->vs_event_list);
+	vhost_work_queue(&vs->dev, &vs->vs_event_work);
+}
+/*
+ * Kick handler for the event virtqueue.  With vq->mutex held: if the vq
+ * has private_data set and vs->vs_events_missed is non-zero, send a
+ * VIRTIO_SCSI_T_NO_EVENT event (no tpg / lun attached).
+ */
+static void vhost_scsi_evt_handle_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						poll.work);
+	struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev);
+
+	mutex_lock(&vq->mutex);
+	if (!vq->private_data)
+		goto out;
+
+	if (vs->vs_events_missed)
+		vhost_scsi_send_evt(vs, NULL, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+out:
+	mutex_unlock(&vq->mutex);
+}
+/*
+ * Kick handler that resolves the virtqueue and the owning vhost_scsi
+ * from @work and processes the queue with vhost_scsi_handle_vq().
+ */
+static void vhost_scsi_handle_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						poll.work);
+	struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev);
+
+	vhost_scsi_handle_vq(vs, vq);
+}
+/* Flush the poll work of the virtqueue at position @index in vs->vqs. */
+static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
+{
+	vhost_poll_flush(&vs->vqs[index].vq.poll);
+}
+
+/* Callers must hold dev mutex */
+static void vhost_scsi_flush(struct vhost_scsi *vs)
+{
+ struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ]; + int i; + + /* Init new inflight and remember the old inflight */ + vhost_scsi_init_inflight(vs, old_inflight); + + /* + * The inflight->kref was initialized to 1. We decrement it here to + * indicate the start of the flush operation so that it will reach 0 + * when all the reqs are finished. + */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight); + + /* Flush both the vhost poll and vhost work */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + vhost_scsi_flush_vq(vs, i); + vhost_work_flush(&vs->dev, &vs->vs_completion_work); + vhost_work_flush(&vs->dev, &vs->vs_event_work); + + /* Wait for all reqs issued before the flush to be finished */ + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) + wait_for_completion(&old_inflight[i]->comp); +} + +/* + * Called from vhost_scsi_ioctl() context to walk the list of available + * vhost_scsi_tpg with an active struct vhost_scsi_nexus + * + * The lock nesting rule is: + * vhost_scsi_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex + */ +static int +vhost_scsi_set_endpoint(struct vhost_scsi *vs, + struct vhost_scsi_target *t) +{ + struct se_portal_group *se_tpg; + struct vhost_scsi_tport *tv_tport; + struct vhost_scsi_tpg *tpg; + struct vhost_scsi_tpg **vs_tpg; + struct vhost_virtqueue *vq; + int index, ret, i, len; + bool match = false; + + mutex_lock(&vhost_scsi_mutex); + mutex_lock(&vs->dev.mutex); + + /* Verify that ring has been setup correctly. */ + for (index = 0; index < vs->dev.nvqs; ++index) { + /* Verify that ring has been setup correctly. 
*/ + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { + ret = -EFAULT; + goto out; + } + } + + len = sizeof(vs_tpg[0]) * VHOST_SCSI_MAX_TARGET; + vs_tpg = kzalloc(len, GFP_KERNEL); + if (!vs_tpg) { + ret = -ENOMEM; + goto out; + } + if (vs->vs_tpg) + memcpy(vs_tpg, vs->vs_tpg, len); + + list_for_each_entry(tpg, &vhost_scsi_list, tv_tpg_list) { + mutex_lock(&tpg->tv_tpg_mutex); + if (!tpg->tpg_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + continue; + } + if (tpg->tv_tpg_vhost_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + continue; + } + tv_tport = tpg->tport; + + if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) { + if (vs->vs_tpg && vs->vs_tpg[tpg->tport_tpgt]) { + kfree(vs_tpg); + mutex_unlock(&tpg->tv_tpg_mutex); + ret = -EEXIST; + goto out; + } + /* + * In order to ensure individual vhost-scsi configfs + * groups cannot be removed while in use by vhost ioctl, + * go ahead and take an explicit se_tpg->tpg_group.cg_item + * dependency now. + */ + se_tpg = &tpg->se_tpg; + ret = target_depend_item(&se_tpg->tpg_group.cg_item); + if (ret) { + pr_warn("configfs_depend_item() failed: %d\n", ret); + kfree(vs_tpg); + mutex_unlock(&tpg->tv_tpg_mutex); + goto out; + } + tpg->tv_tpg_vhost_count++; + tpg->vhost_scsi = vs; + vs_tpg[tpg->tport_tpgt] = tpg; + smp_mb__after_atomic(); + match = true; + } + mutex_unlock(&tpg->tv_tpg_mutex); + } + + if (match) { + memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, + sizeof(vs->vs_vhost_wwpn)); + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + mutex_lock(&vq->mutex); + vq->private_data = vs_tpg; + vhost_vq_init_access(vq); + mutex_unlock(&vq->mutex); + } + ret = 0; + } else { + ret = -EEXIST; + } + + /* + * Act as synchronize_rcu to make sure access to + * old vs->vs_tpg is finished. 
+ */ + vhost_scsi_flush(vs); + kfree(vs->vs_tpg); + vs->vs_tpg = vs_tpg; + +out: + mutex_unlock(&vs->dev.mutex); + mutex_unlock(&vhost_scsi_mutex); + return ret; +} + +static int +vhost_scsi_clear_endpoint(struct vhost_scsi *vs, + struct vhost_scsi_target *t) +{ + struct se_portal_group *se_tpg; + struct vhost_scsi_tport *tv_tport; + struct vhost_scsi_tpg *tpg; + struct vhost_virtqueue *vq; + bool match = false; + int index, ret, i; + u8 target; + + mutex_lock(&vhost_scsi_mutex); + mutex_lock(&vs->dev.mutex); + /* Verify that ring has been setup correctly. */ + for (index = 0; index < vs->dev.nvqs; ++index) { + if (!vhost_vq_access_ok(&vs->vqs[index].vq)) { + ret = -EFAULT; + goto err_dev; + } + } + + if (!vs->vs_tpg) { + ret = 0; + goto err_dev; + } + + for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) { + target = i; + tpg = vs->vs_tpg[target]; + if (!tpg) + continue; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_tport = tpg->tport; + if (!tv_tport) { + ret = -ENODEV; + goto err_tpg; + } + + if (strcmp(tv_tport->tport_name, t->vhost_wwpn)) { + pr_warn("tv_tport->tport_name: %s, tpg->tport_tpgt: %hu" + " does not match t->vhost_wwpn: %s, t->vhost_tpgt: %hu\n", + tv_tport->tport_name, tpg->tport_tpgt, + t->vhost_wwpn, t->vhost_tpgt); + ret = -EINVAL; + goto err_tpg; + } + tpg->tv_tpg_vhost_count--; + tpg->vhost_scsi = NULL; + vs->vs_tpg[target] = NULL; + match = true; + mutex_unlock(&tpg->tv_tpg_mutex); + /* + * Release se_tpg->tpg_group.cg_item configfs dependency now + * to allow vhost-scsi WWPN se_tpg->tpg_group shutdown to occur. + */ + se_tpg = &tpg->se_tpg; + target_undepend_item(&se_tpg->tpg_group.cg_item); + } + if (match) { + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } + } + /* + * Act as synchronize_rcu to make sure access to + * old vs->vs_tpg is finished. 
+ */ + vhost_scsi_flush(vs); + kfree(vs->vs_tpg); + vs->vs_tpg = NULL; + WARN_ON(vs->vs_events_nr); + mutex_unlock(&vs->dev.mutex); + mutex_unlock(&vhost_scsi_mutex); + return 0; + +err_tpg: + mutex_unlock(&tpg->tv_tpg_mutex); +err_dev: + mutex_unlock(&vs->dev.mutex); + mutex_unlock(&vhost_scsi_mutex); + return ret; +} + +static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + if (features & ~VHOST_SCSI_FEATURES) + return -EOPNOTSUPP; + + mutex_lock(&vs->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&vs->dev)) { + mutex_unlock(&vs->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + vq = &vs->vqs[i].vq; + mutex_lock(&vq->mutex); + vq->acked_features = features; + mutex_unlock(&vq->mutex); + } + mutex_unlock(&vs->dev.mutex); + return 0; +} + +static int vhost_scsi_open(struct inode *inode, struct file *f) +{ + struct vhost_scsi *vs; + struct vhost_virtqueue **vqs; + int r = -ENOMEM, i; + + vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); + if (!vs) { + vs = vzalloc(sizeof(*vs)); + if (!vs) + goto err_vs; + } + + vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL); + if (!vqs) + goto err_vqs; + + vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work); + vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work); + + vs->vs_events_nr = 0; + vs->vs_events_missed = false; + + vqs[VHOST_SCSI_VQ_CTL] = &vs->vqs[VHOST_SCSI_VQ_CTL].vq; + vqs[VHOST_SCSI_VQ_EVT] = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; + vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; + for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { + vqs[i] = &vs->vqs[i].vq; + vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + } + vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV, + VHOST_SCSI_WEIGHT, 0); + + 
vhost_scsi_init_inflight(vs, NULL); + + f->private_data = vs; + return 0; + +err_vqs: + kvfree(vs); +err_vs: + return r; +} + +static int vhost_scsi_release(struct inode *inode, struct file *f) +{ + struct vhost_scsi *vs = f->private_data; + struct vhost_scsi_target t; + + mutex_lock(&vs->dev.mutex); + memcpy(t.vhost_wwpn, vs->vs_vhost_wwpn, sizeof(t.vhost_wwpn)); + mutex_unlock(&vs->dev.mutex); + vhost_scsi_clear_endpoint(vs, &t); + vhost_dev_stop(&vs->dev); + vhost_dev_cleanup(&vs->dev); + /* Jobs can re-queue themselves in evt kick handler. Do extra flush. */ + vhost_scsi_flush(vs); + kfree(vs->dev.vqs); + kvfree(vs); + return 0; +} + +static long +vhost_scsi_ioctl(struct file *f, + unsigned int ioctl, + unsigned long arg) +{ + struct vhost_scsi *vs = f->private_data; + struct vhost_scsi_target backend; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + u32 __user *eventsp = argp; + u32 events_missed; + u64 features; + int r, abi_version = VHOST_SCSI_ABI_VERSION; + struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; + + switch (ioctl) { + case VHOST_SCSI_SET_ENDPOINT: + if (copy_from_user(&backend, argp, sizeof backend)) + return -EFAULT; + if (backend.reserved != 0) + return -EOPNOTSUPP; + + return vhost_scsi_set_endpoint(vs, &backend); + case VHOST_SCSI_CLEAR_ENDPOINT: + if (copy_from_user(&backend, argp, sizeof backend)) + return -EFAULT; + if (backend.reserved != 0) + return -EOPNOTSUPP; + + return vhost_scsi_clear_endpoint(vs, &backend); + case VHOST_SCSI_GET_ABI_VERSION: + if (copy_to_user(argp, &abi_version, sizeof abi_version)) + return -EFAULT; + return 0; + case VHOST_SCSI_SET_EVENTS_MISSED: + if (get_user(events_missed, eventsp)) + return -EFAULT; + mutex_lock(&vq->mutex); + vs->vs_events_missed = events_missed; + mutex_unlock(&vq->mutex); + return 0; + case VHOST_SCSI_GET_EVENTS_MISSED: + mutex_lock(&vq->mutex); + events_missed = vs->vs_events_missed; + mutex_unlock(&vq->mutex); + if (put_user(events_missed, eventsp)) 
+			return -EFAULT;
+		return 0;
+	case VHOST_GET_FEATURES:
+		features = VHOST_SCSI_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof features))
+			return -EFAULT;
+		return 0;
+	case VHOST_SET_FEATURES:
+		if (copy_from_user(&features, featurep, sizeof features))
+			return -EFAULT;
+		return vhost_scsi_set_features(vs, features);
+	default:	/* anything else goes to the generic vhost handlers */
+		mutex_lock(&vs->dev.mutex);
+		r = vhost_dev_ioctl(&vs->dev, ioctl, argp);
+		/* TODO: flush backend after dev ioctl. */
+		if (r == -ENOIOCTLCMD)	/* not a device ioctl: try the vring ones */
+			r = vhost_vring_ioctl(&vs->dev, ioctl, argp);
+		mutex_unlock(&vs->dev.mutex);
+		return r;
+	}
+}
+/* Compat ioctl entry: convert @arg with compat_ptr() and forward it. */
+#ifdef CONFIG_COMPAT
+static long vhost_scsi_compat_ioctl(struct file *f, unsigned int ioctl,
+				unsigned long arg)
+{
+	return vhost_scsi_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+/* File operations of the vhost-scsi character device. */
+static const struct file_operations vhost_scsi_fops = {
+	.owner          = THIS_MODULE,
+	.release        = vhost_scsi_release,
+	.unlocked_ioctl = vhost_scsi_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vhost_scsi_compat_ioctl,
+#endif
+	.open           = vhost_scsi_open,
+	.llseek		= noop_llseek,
+};
+/*
+ * Misc device named "vhost-scsi", using MISC_DYNAMIC_MINOR as minor and
+ * vhost_scsi_fops as file operations (positional initializers).
+ */
+static struct miscdevice vhost_scsi_misc = {
+	MISC_DYNAMIC_MINOR,
+	"vhost-scsi",
+	&vhost_scsi_fops,
+};
+/* Register the vhost-scsi misc device; returns misc_register()'s result. */
+static int __init vhost_scsi_register(void)
+{
+	return misc_register(&vhost_scsi_misc);
+}
+/* Remove the vhost-scsi misc device again. */
+static void vhost_scsi_deregister(void)
+{
+	misc_deregister(&vhost_scsi_misc);
+}
+/*
+ * Return a printable name for the protocol identifier of @tport:
+ * "SAS", "FCP" or "iSCSI", and "Unknown" for every other value.
+ */
+static char *vhost_scsi_dump_proto_id(struct vhost_scsi_tport *tport)
+{
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return "SAS";
+	case SCSI_PROTOCOL_FCP:
+		return "FCP";
+	case SCSI_PROTOCOL_ISCSI:
+		return "iSCSI";
+	default:
+		break;
+	}
+
+	return "Unknown";
+}
+/*
+ * Report that @lun on @tpg was plugged (@plug true) or unplugged.
+ *
+ * Does nothing when @tpg has no vhost_scsi instance attached.  Otherwise
+ * takes vs->dev.mutex, then the event virtqueue mutex, and sends a
+ * VIRTIO_SCSI_T_TRANSPORT_RESET event if the event virtqueue has the
+ * VIRTIO_SCSI_F_HOTPLUG feature.
+ */
+static void
+vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
+		  struct se_lun *lun, bool plug)
+{
+
+	struct vhost_scsi *vs = tpg->vhost_scsi;
+	struct vhost_virtqueue *vq;
+	u32 reason;
+
+	if (!vs)
+		return;
+
+	mutex_lock(&vs->dev.mutex);
+
+	if (plug)
+		reason = VIRTIO_SCSI_EVT_RESET_RESCAN;
+	else
+		reason = VIRTIO_SCSI_EVT_RESET_REMOVED;
+
+	vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
+	mutex_lock(&vq->mutex);
+	if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
+		vhost_scsi_send_evt(vs, tpg, lun,
+				   VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
+	mutex_unlock(&vq->mutex);
+	mutex_unlock(&vs->dev.mutex);
+}
+/* Convenience wrapper: vhost_scsi_do_plug() with plug == true. */
+static void vhost_scsi_hotplug(struct vhost_scsi_tpg *tpg, struct se_lun *lun)
+{
+	vhost_scsi_do_plug(tpg, lun, true);
+}
+/* Convenience wrapper: vhost_scsi_do_plug() with plug == false. */
+static void vhost_scsi_hotunplug(struct vhost_scsi_tpg *tpg, struct se_lun *lun)
+{
+	vhost_scsi_do_plug(tpg, lun, false);
+}
+/*
+ * Link callback for a LUN: with vhost_scsi_mutex held, increment the
+ * TPG port count under tv_tpg_mutex and issue a hotplug notification.
+ * Always returns 0.
+ */
+static int vhost_scsi_port_link(struct se_portal_group *se_tpg,
+			       struct se_lun *lun)
+{
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
+				struct vhost_scsi_tpg, se_tpg);
+
+	mutex_lock(&vhost_scsi_mutex);
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_port_count++;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	vhost_scsi_hotplug(tpg, lun);
+
+	mutex_unlock(&vhost_scsi_mutex);
+
+	return 0;
+}
+/*
+ * Unlink callback for a LUN: with vhost_scsi_mutex held, decrement the
+ * TPG port count under tv_tpg_mutex and issue a hot-unplug notification.
+ */
+static void vhost_scsi_port_unlink(struct se_portal_group *se_tpg,
+				  struct se_lun *lun)
+{
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
+				struct vhost_scsi_tpg, se_tpg);
+
+	mutex_lock(&vhost_scsi_mutex);
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_port_count--;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	vhost_scsi_hotunplug(tpg, lun);
+
+	mutex_unlock(&vhost_scsi_mutex);
+}
+/*
+ * Free the tvc_sgl, tvc_prot_sgl and tvc_upages arrays of each of the
+ * VHOST_SCSI_DEFAULT_TAGS entries in se_sess->sess_cmd_map.  Does
+ * nothing when the session has no command map.
+ */
+static void vhost_scsi_free_cmd_map_res(struct se_session *se_sess)
+{
+	struct vhost_scsi_cmd *tv_cmd;
+	unsigned int i;
+
+	if (!se_sess->sess_cmd_map)
+		return;
+
+	for (i = 0; i < VHOST_SCSI_DEFAULT_TAGS; i++) {
+		tv_cmd = &((struct vhost_scsi_cmd *)se_sess->sess_cmd_map)[i];
+
+		kfree(tv_cmd->tvc_sgl);
+		kfree(tv_cmd->tvc_prot_sgl);
+		kfree(tv_cmd->tvc_upages);
+	}
+}
+/*
+ * configfs store handler for the fabric_prot_type attribute: parses
+ * @page as an unsigned long and stores it in the TPG; only the values
+ * 0, 1 and 3 are accepted.
+ */
+static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_store(
+		struct config_item *item, const char *page, size_t count)
+{
+	struct se_portal_group *se_tpg = attrib_to_tpg(item);
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
+				struct vhost_scsi_tpg, se_tpg);
+	unsigned long val;
+	int ret =
kstrtoul(page, 0, &val); + + if (ret) { + pr_err("kstrtoul() returned %d for fabric_prot_type\n", ret); + return ret; + } + if (val != 0 && val != 1 && val != 3) { + pr_err("Invalid vhost_scsi fabric_prot_type: %lu\n", val); + return -EINVAL; + } + tpg->tv_fabric_prot_type = val; + + return count; +} + +static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show( + struct config_item *item, char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + + return sprintf(page, "%d\n", tpg->tv_fabric_prot_type); +} + +CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type); + +static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = { + &vhost_scsi_tpg_attrib_attr_fabric_prot_type, + NULL, +}; + +static int vhost_scsi_nexus_cb(struct se_portal_group *se_tpg, + struct se_session *se_sess, void *p) +{ + struct vhost_scsi_cmd *tv_cmd; + unsigned int i; + + for (i = 0; i < VHOST_SCSI_DEFAULT_TAGS; i++) { + tv_cmd = &((struct vhost_scsi_cmd *)se_sess->sess_cmd_map)[i]; + + tv_cmd->tvc_sgl = kcalloc(VHOST_SCSI_PREALLOC_SGLS, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!tv_cmd->tvc_sgl) { + pr_err("Unable to allocate tv_cmd->tvc_sgl\n"); + goto out; + } + + tv_cmd->tvc_upages = kcalloc(VHOST_SCSI_PREALLOC_UPAGES, + sizeof(struct page *), + GFP_KERNEL); + if (!tv_cmd->tvc_upages) { + pr_err("Unable to allocate tv_cmd->tvc_upages\n"); + goto out; + } + + tv_cmd->tvc_prot_sgl = kcalloc(VHOST_SCSI_PREALLOC_PROT_SGLS, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!tv_cmd->tvc_prot_sgl) { + pr_err("Unable to allocate tv_cmd->tvc_prot_sgl\n"); + goto out; + } + } + return 0; +out: + vhost_scsi_free_cmd_map_res(se_sess); + return -ENOMEM; +} + +static int vhost_scsi_make_nexus(struct vhost_scsi_tpg *tpg, + const char *name) +{ + struct vhost_scsi_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + if (tpg->tpg_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + 
pr_debug("tpg->tpg_nexus already exists\n"); + return -EEXIST; + } + + tv_nexus = kzalloc(sizeof(*tv_nexus), GFP_KERNEL); + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to allocate struct vhost_scsi_nexus\n"); + return -ENOMEM; + } + /* + * Since we are running in 'demo mode' this call with generate a + * struct se_node_acl for the vhost_scsi struct se_portal_group with + * the SCSI Initiator port name of the passed configfs group 'name'. + */ + tv_nexus->tvn_se_sess = target_setup_session(&tpg->se_tpg, + VHOST_SCSI_DEFAULT_TAGS, + sizeof(struct vhost_scsi_cmd), + TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS, + (unsigned char *)name, tv_nexus, + vhost_scsi_nexus_cb); + if (IS_ERR(tv_nexus->tvn_se_sess)) { + mutex_unlock(&tpg->tv_tpg_mutex); + kfree(tv_nexus); + return -ENOMEM; + } + tpg->tpg_nexus = tv_nexus; + + mutex_unlock(&tpg->tv_tpg_mutex); + return 0; +} + +static int vhost_scsi_drop_nexus(struct vhost_scsi_tpg *tpg) +{ + struct se_session *se_sess; + struct vhost_scsi_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + se_sess = tv_nexus->tvn_se_sess; + if (!se_sess) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + if (tpg->tv_tpg_port_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove TCM_vhost I_T Nexus with" + " active TPG port count: %d\n", + tpg->tv_tpg_port_count); + return -EBUSY; + } + + if (tpg->tv_tpg_vhost_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove TCM_vhost I_T Nexus with" + " active TPG vhost count: %d\n", + tpg->tv_tpg_vhost_count); + return -EBUSY; + } + + pr_debug("TCM_vhost_ConfigFS: Removing I_T Nexus to emulated" + " %s Initiator Port: %s\n", vhost_scsi_dump_proto_id(tpg->tport), + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + + vhost_scsi_free_cmd_map_res(se_sess); + /* + * Release the SCSI I_T Nexus to the emulated vhost 
Target Port + */ + target_remove_session(se_sess); + tpg->tpg_nexus = NULL; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(tv_nexus); + return 0; +} + +static ssize_t vhost_scsi_tpg_nexus_show(struct config_item *item, char *page) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + struct vhost_scsi_nexus *tv_nexus; + ssize_t ret; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s\n", + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + mutex_unlock(&tpg->tv_tpg_mutex); + + return ret; +} + +static ssize_t vhost_scsi_tpg_nexus_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = to_tpg(item); + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + struct vhost_scsi_tport *tport_wwn = tpg->tport; + unsigned char i_port[VHOST_SCSI_NAMELEN], *ptr, *port_ptr; + int ret; + /* + * Shutdown the active I_T nexus if 'NULL' is passed.. + */ + if (!strncmp(page, "NULL", 4)) { + ret = vhost_scsi_drop_nexus(tpg); + return (!ret) ? count : ret; + } + /* + * Otherwise make sure the passed virtual Initiator port WWN matches + * the fabric protocol_id set in vhost_scsi_make_tport(), and call + * vhost_scsi_make_nexus(). 
+ */ + if (strlen(page) >= VHOST_SCSI_NAMELEN) { + pr_err("Emulated NAA Sas Address: %s, exceeds" + " max: %d\n", page, VHOST_SCSI_NAMELEN); + return -EINVAL; + } + snprintf(&i_port[0], VHOST_SCSI_NAMELEN, "%s", page); + + ptr = strstr(i_port, "naa."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) { + pr_err("Passed SAS Initiator Port %s does not" + " match target port protoid: %s\n", i_port, + vhost_scsi_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + ptr = strstr(i_port, "fc."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) { + pr_err("Passed FCP Initiator Port %s does not" + " match target port protoid: %s\n", i_port, + vhost_scsi_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[3]; /* Skip over "fc." */ + goto check_newline; + } + ptr = strstr(i_port, "iqn."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) { + pr_err("Passed iSCSI Initiator Port %s does not" + " match target port protoid: %s\n", i_port, + vhost_scsi_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + pr_err("Unable to locate prefix for emulated Initiator Port:" + " %s\n", i_port); + return -EINVAL; + /* + * Clear any trailing newline for the NAA WWN + */ +check_newline: + if (i_port[strlen(i_port)-1] == '\n') + i_port[strlen(i_port)-1] = '\0'; + + ret = vhost_scsi_make_nexus(tpg, port_ptr); + if (ret < 0) + return ret; + + return count; +} + +CONFIGFS_ATTR(vhost_scsi_tpg_, nexus); + +static struct configfs_attribute *vhost_scsi_tpg_attrs[] = { + &vhost_scsi_tpg_attr_nexus, + NULL, +}; + +static struct se_portal_group * +vhost_scsi_make_tpg(struct se_wwn *wwn, const char *name) +{ + struct vhost_scsi_tport *tport = container_of(wwn, + struct vhost_scsi_tport, tport_wwn); + + struct vhost_scsi_tpg *tpg; + u16 tpgt; + int ret; + + if (strstr(name, "tpgt_") != name) + return ERR_PTR(-EINVAL); + if (kstrtou16(name 
+ 5, 10, &tpgt) || tpgt >= VHOST_SCSI_MAX_TARGET)
+		return ERR_PTR(-EINVAL);
+
+	tpg = kzalloc(sizeof(*tpg), GFP_KERNEL);
+	if (!tpg) {
+		/* kernel log messages must be newline terminated */
+		pr_err("Unable to allocate struct vhost_scsi_tpg\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	mutex_init(&tpg->tv_tpg_mutex);
+	INIT_LIST_HEAD(&tpg->tv_tpg_list);
+	tpg->tport = tport;
+	tpg->tport_tpgt = tpgt;
+
+	ret = core_tpg_register(wwn, &tpg->se_tpg, tport->tport_proto_id);
+	if (ret < 0) {
+		kfree(tpg);
+		return NULL;
+	}
+	/* Make the new TPG visible to vhost_scsi_set_endpoint(). */
+	mutex_lock(&vhost_scsi_mutex);
+	list_add_tail(&tpg->tv_tpg_list, &vhost_scsi_list);
+	mutex_unlock(&vhost_scsi_mutex);
+
+	return &tpg->se_tpg;
+}
+
+/*
+ * Tear down a TPG: unlink it from vhost_scsi_list under
+ * vhost_scsi_mutex, drop its I_T nexus, deregister the se_tpg and free
+ * the containing vhost_scsi_tpg.
+ */
+static void vhost_scsi_drop_tpg(struct se_portal_group *se_tpg)
+{
+	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
+				struct vhost_scsi_tpg, se_tpg);
+
+	mutex_lock(&vhost_scsi_mutex);
+	list_del(&tpg->tv_tpg_list);
+	mutex_unlock(&vhost_scsi_mutex);
+	/*
+	 * Release the virtual I_T Nexus for this vhost TPG
+	 */
+	vhost_scsi_drop_nexus(tpg);
+	/*
+	 * Deregister the se_tpg from TCM..
+	 */
+	core_tpg_deregister(se_tpg);
+	kfree(tpg);
+}
+
+/*
+ * Allocate a target port for the configfs directory @name.
+ *
+ * The emulated protocol is derived from the prefix found in @name:
+ * "naa." selects SAS, "fc." selects FCP (the three prefix characters are
+ * not copied into tport_name) and "iqn." selects iSCSI.  @tf and @group
+ * are not used.
+ *
+ * Returns the embedded se_wwn on success, ERR_PTR(-ENOMEM) when the
+ * allocation fails, and ERR_PTR(-EINVAL) when no known prefix is found
+ * or @name does not fit into VHOST_SCSI_NAMELEN.
+ */
+static struct se_wwn *
+vhost_scsi_make_tport(struct target_fabric_configfs *tf,
+		     struct config_group *group,
+		     const char *name)
+{
+	struct vhost_scsi_tport *tport;
+	char *ptr;
+	u64 wwpn = 0;
+	int off = 0;
+
+	/* if (vhost_scsi_parse_wwn(name, &wwpn, 1) < 0)
+		return ERR_PTR(-EINVAL); */
+
+	tport = kzalloc(sizeof(*tport), GFP_KERNEL);
+	if (!tport) {
+		/* kernel log messages must be newline terminated */
+		pr_err("Unable to allocate struct vhost_scsi_tport\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	tport->tport_wwpn = wwpn;
+	/*
+	 * Determine the emulated Protocol Identifier and Target Port Name
+	 * based on the incoming configfs directory name.
+	 */
+	ptr = strstr(name, "naa.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_SAS;
+		goto check_len;
+	}
+	ptr = strstr(name, "fc.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_FCP;
+		/*
+		 * NOTE(review): strstr() matches the prefix anywhere in
+		 * @name, while this offset assumes it is at the start.
+		 */
+		off = 3; /* Skip over "fc." */
+		goto check_len;
+	}
+	ptr = strstr(name, "iqn.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_ISCSI;
+		goto check_len;
+	}
+
+	pr_err("Unable to locate prefix for emulated Target Port:"
+			" %s\n", name);
+	kfree(tport);
+	return ERR_PTR(-EINVAL);
+
+check_len:
+	if (strlen(name) >= VHOST_SCSI_NAMELEN) {
+		/* protocol name first, then the address, as in the format */
+		pr_err("Emulated %s Address: %s, exceeds"
+			" max: %d\n", vhost_scsi_dump_proto_id(tport), name,
+			VHOST_SCSI_NAMELEN);
+		kfree(tport);
+		return ERR_PTR(-EINVAL);
+	}
+	snprintf(&tport->tport_name[0], VHOST_SCSI_NAMELEN, "%s", &name[off]);
+
+	pr_debug("TCM_VHost_ConfigFS: Allocated emulated Target"
+		" %s Address: %s\n", vhost_scsi_dump_proto_id(tport), name);
+
+	return &tport->tport_wwn;
+}
+
+/* Free the vhost_scsi_tport that embeds @wwn. */
+static void vhost_scsi_drop_tport(struct se_wwn *wwn)
+{
+	struct vhost_scsi_tport *tport = container_of(wwn,
+				struct vhost_scsi_tport, tport_wwn);
+
+	pr_debug("TCM_VHost_ConfigFS: Deallocating emulated Target"
+		" %s Address: %s\n", vhost_scsi_dump_proto_id(tport),
+		tport->tport_name);
+
+	kfree(tport);
+}
+
+/*
+ * configfs show handler for the read-only "version" attribute.  Writes
+ * the module version, system name, machine and kernel release to @page
+ * and returns the number of characters written.
+ */
+static ssize_t
+vhost_scsi_wwn_version_show(struct config_item *item, char *page)
+{
+	/* " on " needs its leading space: the literals are concatenated */
+	return sprintf(page, "TCM_VHOST fabric module %s on %s/%s"
+		" on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname,
+		utsname()->machine);
+}
+
+CONFIGFS_ATTR_RO(vhost_scsi_wwn_, version);
+
+static struct configfs_attribute *vhost_scsi_wwn_attrs[] = {
+	&vhost_scsi_wwn_attr_version,
+	NULL,
+};
+
+static const struct target_core_fabric_ops vhost_scsi_ops = {
+	.module				= THIS_MODULE,
+	.name				= "vhost",
+	.get_fabric_name		= vhost_scsi_get_fabric_name,
+	.tpg_get_wwn			= vhost_scsi_get_fabric_wwn,
+	.tpg_get_tag			= vhost_scsi_get_tpgt,
+	.tpg_check_demo_mode		= vhost_scsi_check_true,
+	.tpg_check_demo_mode_cache	= vhost_scsi_check_true,
+	.tpg_check_demo_mode_write_protect = vhost_scsi_check_false,
+	.tpg_check_prod_mode_write_protect = vhost_scsi_check_false,
+	.tpg_check_prot_fabric_only	= vhost_scsi_check_prot_fabric_only,
+	.tpg_get_inst_index		=
vhost_scsi_tpg_get_inst_index, + .release_cmd = vhost_scsi_release_cmd, + .check_stop_free = vhost_scsi_check_stop_free, + .sess_get_index = vhost_scsi_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = vhost_scsi_write_pending, + .write_pending_status = vhost_scsi_write_pending_status, + .set_default_node_attributes = vhost_scsi_set_default_node_attrs, + .get_cmd_state = vhost_scsi_get_cmd_state, + .queue_data_in = vhost_scsi_queue_data_in, + .queue_status = vhost_scsi_queue_status, + .queue_tm_rsp = vhost_scsi_queue_tm_rsp, + .aborted_task = vhost_scsi_aborted_task, + /* + * Setup callers for generic logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = vhost_scsi_make_tport, + .fabric_drop_wwn = vhost_scsi_drop_tport, + .fabric_make_tpg = vhost_scsi_make_tpg, + .fabric_drop_tpg = vhost_scsi_drop_tpg, + .fabric_post_link = vhost_scsi_port_link, + .fabric_pre_unlink = vhost_scsi_port_unlink, + + .tfc_wwn_attrs = vhost_scsi_wwn_attrs, + .tfc_tpg_base_attrs = vhost_scsi_tpg_attrs, + .tfc_tpg_attrib_attrs = vhost_scsi_tpg_attrib_attrs, +}; + +static int __init vhost_scsi_init(void) +{ + int ret = -ENOMEM; + + pr_debug("TCM_VHOST fabric module %s on %s/%s" + " on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, + utsname()->machine); + + /* + * Use our own dedicated workqueue for submitting I/O into + * target core to avoid contention within system_wq. 
+ */ + vhost_scsi_workqueue = alloc_workqueue("vhost_scsi", 0, 0); + if (!vhost_scsi_workqueue) + goto out; + + ret = vhost_scsi_register(); + if (ret < 0) + goto out_destroy_workqueue; + + ret = target_register_template(&vhost_scsi_ops); + if (ret < 0) + goto out_vhost_scsi_deregister; + + return 0; + +out_vhost_scsi_deregister: + vhost_scsi_deregister(); +out_destroy_workqueue: + destroy_workqueue(vhost_scsi_workqueue); +out: + return ret; +}; + +static void vhost_scsi_exit(void) +{ + target_unregister_template(&vhost_scsi_ops); + vhost_scsi_deregister(); + destroy_workqueue(vhost_scsi_workqueue); +}; + +MODULE_DESCRIPTION("VHOST_SCSI series fabric driver"); +MODULE_ALIAS("tcm_vhost"); +MODULE_LICENSE("GPL"); +module_init(vhost_scsi_init); +module_exit(vhost_scsi_exit); diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c new file mode 100644 index 000000000..55090d9f9 --- /dev/null +++ b/drivers/vhost/test.c @@ -0,0 +1,337 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * test virtio server in host kernel. + */ + +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/file.h> +#include <linux/slab.h> + +#include "test.h" +#include "vhost.h" + +/* Max number of bytes transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others. */ +#define VHOST_TEST_WEIGHT 0x80000 + +/* Max number of packets transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving others with + * pkts. 
+ */ +#define VHOST_TEST_PKT_WEIGHT 256 + +enum { + VHOST_TEST_VQ = 0, + VHOST_TEST_VQ_MAX = 1, +}; + +struct vhost_test { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_TEST_VQ_MAX]; +}; + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_vq(struct vhost_test *n) +{ + struct vhost_virtqueue *vq = &n->vqs[VHOST_TEST_VQ]; + unsigned out, in; + int head; + size_t len, total_len = 0; + void *private; + + mutex_lock(&vq->mutex); + private = vq->private_data; + if (!private) { + mutex_unlock(&vq->mutex); + return; + } + + vhost_disable_notify(&n->dev, vq); + + for (;;) { + head = vhost_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), + &out, &in, + NULL, NULL); + /* On error, stop handling until the next kick. */ + if (unlikely(head < 0)) + break; + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) { + if (unlikely(vhost_enable_notify(&n->dev, vq))) { + vhost_disable_notify(&n->dev, vq); + continue; + } + break; + } + if (in) { + vq_err(vq, "Unexpected descriptor format for TX: " + "out %d, int %d\n", out, in); + break; + } + len = iov_length(vq->iov, out); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected 0 len for TX\n"); + break; + } + vhost_add_used_and_signal(&n->dev, vq, head, 0); + total_len += len; + if (unlikely(vhost_exceeds_weight(vq, 0, total_len))) + break; + } + + mutex_unlock(&vq->mutex); +} + +static void handle_vq_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_test *n = container_of(vq->dev, struct vhost_test, dev); + + handle_vq(n); +} + +static int vhost_test_open(struct inode *inode, struct file *f) +{ + struct vhost_test *n = kmalloc(sizeof *n, GFP_KERNEL); + struct vhost_dev *dev; + struct vhost_virtqueue **vqs; + + if (!n) + return -ENOMEM; + vqs = kmalloc_array(VHOST_TEST_VQ_MAX, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { 
+ kfree(n); + return -ENOMEM; + } + + dev = &n->dev; + vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ]; + n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick; + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV, + VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT); + + f->private_data = n; + + return 0; +} + +static void *vhost_test_stop_vq(struct vhost_test *n, + struct vhost_virtqueue *vq) +{ + void *private; + + mutex_lock(&vq->mutex); + private = vq->private_data; + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + return private; +} + +static void vhost_test_stop(struct vhost_test *n, void **privatep) +{ + *privatep = vhost_test_stop_vq(n, n->vqs + VHOST_TEST_VQ); +} + +static void vhost_test_flush_vq(struct vhost_test *n, int index) +{ + vhost_poll_flush(&n->vqs[index].poll); +} + +static void vhost_test_flush(struct vhost_test *n) +{ + vhost_test_flush_vq(n, VHOST_TEST_VQ); +} + +static int vhost_test_release(struct inode *inode, struct file *f) +{ + struct vhost_test *n = f->private_data; + void *private; + + vhost_test_stop(n, &private); + vhost_test_flush(n); + vhost_dev_stop(&n->dev); + vhost_dev_cleanup(&n->dev); + /* We do an extra flush before freeing memory, + * since jobs can re-queue themselves. */ + vhost_test_flush(n); + kfree(n); + return 0; +} + +static long vhost_test_run(struct vhost_test *n, int test) +{ + void *priv, *oldpriv; + struct vhost_virtqueue *vq; + int r, index; + + if (test < 0 || test > 1) + return -EINVAL; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto err; + + for (index = 0; index < n->dev.nvqs; ++index) { + /* Verify that ring has been setup correctly. */ + if (!vhost_vq_access_ok(&n->vqs[index])) { + r = -EFAULT; + goto err; + } + } + + for (index = 0; index < n->dev.nvqs; ++index) { + vq = n->vqs + index; + mutex_lock(&vq->mutex); + priv = test ? 
n : NULL; + + /* start polling new socket */ + oldpriv = vq->private_data; + vq->private_data = priv; + + r = vhost_vq_init_access(&n->vqs[index]); + + mutex_unlock(&vq->mutex); + + if (r) + goto err; + + if (oldpriv) { + vhost_test_flush_vq(n, index); + } + } + + mutex_unlock(&n->dev.mutex); + return 0; + +err: + mutex_unlock(&n->dev.mutex); + return r; +} + +static long vhost_test_reset_owner(struct vhost_test *n) +{ + void *priv = NULL; + long err; + struct vhost_umem *umem; + + mutex_lock(&n->dev.mutex); + err = vhost_dev_check_owner(&n->dev); + if (err) + goto done; + umem = vhost_dev_reset_owner_prepare(); + if (!umem) { + err = -ENOMEM; + goto done; + } + vhost_test_stop(n, &priv); + vhost_test_flush(n); + vhost_dev_stop(&n->dev); + vhost_dev_reset_owner(&n->dev, umem); +done: + mutex_unlock(&n->dev.mutex); + return err; +} + +static int vhost_test_set_features(struct vhost_test *n, u64 features) +{ + struct vhost_virtqueue *vq; + + mutex_lock(&n->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&n->dev)) { + mutex_unlock(&n->dev.mutex); + return -EFAULT; + } + vq = &n->vqs[VHOST_TEST_VQ]; + mutex_lock(&vq->mutex); + vq->acked_features = features; + mutex_unlock(&vq->mutex); + mutex_unlock(&n->dev.mutex); + return 0; +} + +static long vhost_test_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_test *n = f->private_data; + void __user *argp = (void __user *)arg; + u64 __user *featurep = argp; + int test; + u64 features; + int r; + switch (ioctl) { + case VHOST_TEST_RUN: + if (copy_from_user(&test, argp, sizeof test)) + return -EFAULT; + return vhost_test_run(n, test); + case VHOST_GET_FEATURES: + features = VHOST_FEATURES; + if (copy_to_user(featurep, &features, sizeof features)) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + printk(KERN_ERR "1\n"); + if (copy_from_user(&features, featurep, sizeof features)) + return -EFAULT; + printk(KERN_ERR "2\n"); + if (features & ~VHOST_FEATURES) 
+ return -EOPNOTSUPP; + printk(KERN_ERR "3\n"); + return vhost_test_set_features(n, features); + case VHOST_RESET_OWNER: + return vhost_test_reset_owner(n); + default: + mutex_lock(&n->dev.mutex); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&n->dev, ioctl, argp); + vhost_test_flush(n); + mutex_unlock(&n->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long vhost_test_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_test_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations vhost_test_fops = { + .owner = THIS_MODULE, + .release = vhost_test_release, + .unlocked_ioctl = vhost_test_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_test_compat_ioctl, +#endif + .open = vhost_test_open, + .llseek = noop_llseek, +}; + +static struct miscdevice vhost_test_misc = { + MISC_DYNAMIC_MINOR, + "vhost-test", + &vhost_test_fops, +}; +module_misc_device(vhost_test_misc); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel side for virtio simulator"); diff --git a/drivers/vhost/test.h b/drivers/vhost/test.h new file mode 100644 index 000000000..7dd265bfd --- /dev/null +++ b/drivers/vhost/test.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VHOST_TEST_H +#define LINUX_VHOST_TEST_H + +/* Start a given test on the virtio null device. 0 stops all tests. */ +#define VHOST_TEST_RUN _IOW(VHOST_VIRTIO, 0x31, int) + +#endif diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c new file mode 100644 index 000000000..7a58f6291 --- /dev/null +++ b/drivers/vhost/vhost.c @@ -0,0 +1,2549 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. 
Tsirkin <mst@redhat.com> + * + * Inspiration, some code, and most witty comments come from + * Documentation/virtual/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Generic code for virtio server in host kernel. + */ + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/uio.h> +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/kthread.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sort.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/interval_tree_generic.h> +#include <linux/nospec.h> + +#include "vhost.h" + +static ushort max_mem_regions = 64; +module_param(max_mem_regions, ushort, 0444); +MODULE_PARM_DESC(max_mem_regions, + "Maximum number of memory regions in memory map. (default: 64)"); +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries. 
(default: 2048)"); + +enum { + VHOST_MEMORY_F_LOG = 0x1, +}; + +#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) +#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) + +INTERVAL_TREE_DEFINE(struct vhost_umem_node, + rb, __u64, __subtree_last, + START, LAST, static inline, vhost_umem_interval_tree); + +#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY +static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) +{ + vq->user_be = !virtio_legacy_is_little_endian(); +} + +static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq) +{ + vq->user_be = true; +} + +static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq) +{ + vq->user_be = false; +} + +static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) +{ + struct vhost_vring_state s; + + if (vq->private_data) + return -EBUSY; + + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + + if (s.num != VHOST_VRING_LITTLE_ENDIAN && + s.num != VHOST_VRING_BIG_ENDIAN) + return -EINVAL; + + if (s.num == VHOST_VRING_BIG_ENDIAN) + vhost_enable_cross_endian_big(vq); + else + vhost_enable_cross_endian_little(vq); + + return 0; +} + +static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, + int __user *argp) +{ + struct vhost_vring_state s = { + .index = idx, + .num = vq->user_be + }; + + if (copy_to_user(argp, &s, sizeof(s))) + return -EFAULT; + + return 0; +} + +static void vhost_init_is_le(struct vhost_virtqueue *vq) +{ + /* Note for legacy virtio: user_be is initialized at reset time + * according to the host endianness. If userspace does not set an + * explicit endianness, the default behavior is native endian, as + * expected by legacy virtio. 
+ */ + vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be; +} +#else +static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) +{ +} + +static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp) +{ + return -ENOIOCTLCMD; +} + +static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx, + int __user *argp) +{ + return -ENOIOCTLCMD; +} + +static void vhost_init_is_le(struct vhost_virtqueue *vq) +{ + vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) + || virtio_legacy_is_little_endian(); +} +#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */ + +static void vhost_reset_is_le(struct vhost_virtqueue *vq) +{ + vhost_init_is_le(vq); +} + +struct vhost_flush_struct { + struct vhost_work work; + struct completion wait_event; +}; + +static void vhost_flush_work(struct vhost_work *work) +{ + struct vhost_flush_struct *s; + + s = container_of(work, struct vhost_flush_struct, work); + complete(&s->wait_event); +} + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + + poll = container_of(pt, struct vhost_poll, table); + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); + + if (!(key_to_poll(key) & poll->mask)) + return 0; + + vhost_poll_queue(poll); + return 0; +} + +void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) +{ + clear_bit(VHOST_WORK_QUEUED, &work->flags); + work->fn = fn; +} +EXPORT_SYMBOL_GPL(vhost_work_init); + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_dev *dev) +{ + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; + poll->dev = dev; + poll->wqh = NULL; + + vhost_work_init(&poll->work, 
fn); +} +EXPORT_SYMBOL_GPL(vhost_poll_init); + +/* Start polling a file. We add ourselves to file's wait queue. The caller must + * keep a reference to a file until after vhost_poll_stop is called. */ +int vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + __poll_t mask; + int ret = 0; + + if (poll->wqh) + return 0; + + mask = vfs_poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask)); + if (mask & EPOLLERR) { + vhost_poll_stop(poll); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(vhost_poll_start); + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + if (poll->wqh) { + remove_wait_queue(poll->wqh, &poll->wait); + poll->wqh = NULL; + } +} +EXPORT_SYMBOL_GPL(vhost_poll_stop); + +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +{ + struct vhost_flush_struct flush; + + if (dev->worker) { + init_completion(&flush.wait_event); + vhost_work_init(&flush.work, vhost_flush_work); + + vhost_work_queue(dev, &flush.work); + wait_for_completion(&flush.wait_event); + } +} +EXPORT_SYMBOL_GPL(vhost_work_flush); + +/* Flush any work that has been scheduled. When calling this, don't hold any + * locks that are also used by the callback. */ +void vhost_poll_flush(struct vhost_poll *poll) +{ + vhost_work_flush(poll->dev, &poll->work); +} +EXPORT_SYMBOL_GPL(vhost_poll_flush); + +void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) +{ + if (!dev->worker) + return; + + if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { + /* We can only add the work to the list after we're + * sure it was not in the list. + * test_and_set_bit() implies a memory barrier. 
/*
 * Put every per-virtqueue field back to its initial value: ring size 1, no
 * ring/log addresses, zeroed indices and feature masks, no eventfd/file
 * references, no memory tables, and an empty meta-IOTLB cache.
 *
 * The pointers (error_ctx, kick, call_ctx, log_ctx) are only set to NULL
 * here; nothing is released by this function.  @dev is not used.
 */
static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;
	vq->log_used = false;
	vq->log_addr = -1ull;
	vq->private_data = NULL;
	vq->acked_features = 0;
	vq->acked_backend_features = 0;
	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->kick = NULL;
	vq->call_ctx = NULL;
	vq->log_ctx = NULL;
	/*
	 * Order matters: vhost_reset_is_le() recomputes vq->is_le through
	 * vhost_init_is_le(), which in the cross-endian build reads the
	 * user_be value that vhost_disable_cross_endian() has just restored.
	 * NOTE(review): vhost_has_feature() presumably reads acked_features,
	 * which is why that field is cleared above - confirm in vhost.h.
	 */
	vhost_disable_cross_endian(vq);
	vhost_reset_is_le(vq);
	vq->busyloop_timeout = 0;
	vq->umem = NULL;
	vq->iotlb = NULL;
	__vhost_vq_meta_reset(vq);
}
*/ + smp_wmb(); + llist_for_each_entry_safe(work, work_next, node, node) { + clear_bit(VHOST_WORK_QUEUED, &work->flags); + __set_current_state(TASK_RUNNING); + work->fn(work); + if (need_resched()) + schedule(); + } + } + unuse_mm(dev->mm); + set_fs(oldfs); + return 0; +} + +static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) +{ + kfree(vq->indirect); + vq->indirect = NULL; + kfree(vq->log); + vq->log = NULL; + kfree(vq->heads); + vq->heads = NULL; +} + +/* Helper to allocate iovec buffers for all vqs. */ +static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) +{ + struct vhost_virtqueue *vq; + int i; + + for (i = 0; i < dev->nvqs; ++i) { + vq = dev->vqs[i]; + vq->indirect = kmalloc_array(UIO_MAXIOV, + sizeof(*vq->indirect), + GFP_KERNEL); + vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log), + GFP_KERNEL); + vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads), + GFP_KERNEL); + if (!vq->indirect || !vq->log || !vq->heads) + goto err_nomem; + } + return 0; + +err_nomem: + for (; i >= 0; --i) + vhost_vq_free_iovecs(dev->vqs[i]); + return -ENOMEM; +} + +static void vhost_dev_free_iovecs(struct vhost_dev *dev) +{ + int i; + + for (i = 0; i < dev->nvqs; ++i) + vhost_vq_free_iovecs(dev->vqs[i]); +} + +bool vhost_exceeds_weight(struct vhost_virtqueue *vq, + int pkts, int total_len) +{ + struct vhost_dev *dev = vq->dev; + + if ((dev->byte_weight && total_len >= dev->byte_weight) || + pkts >= dev->weight) { + vhost_poll_queue(&vq->poll); + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(vhost_exceeds_weight); + +void vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue **vqs, int nvqs, + int iov_limit, int weight, int byte_weight) +{ + struct vhost_virtqueue *vq; + int i; + + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + dev->log_ctx = NULL; + dev->umem = NULL; + dev->iotlb = NULL; + dev->mm = NULL; + dev->worker = NULL; + dev->iov_limit = iov_limit; + dev->weight = weight; + dev->byte_weight = byte_weight; + 
/*
 * Make the calling process the owner of @dev: take a reference on its mm,
 * start the "vhost-<pid>" worker thread running vhost_worker(), run
 * vhost_attach_cgroups() and allocate the per-virtqueue iovec buffers.
 *
 * Returns 0 on success, -EBUSY if the device already has an owner, or the
 * error from thread creation, cgroup attachment or buffer allocation.  On
 * failure dev->worker and dev->mm are left NULL.
 *
 * Caller should have device mutex
 */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
	struct task_struct *worker;
	int err;

	/* Is there an owner already? */
	if (vhost_dev_has_owner(dev)) {
		err = -EBUSY;
		goto err_mm;
	}

	/* No owner, become one */
	dev->mm = get_task_mm(current);
	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
	if (IS_ERR(worker)) {
		err = PTR_ERR(worker);
		goto err_worker;
	}

	/* dev->worker must be set before work is queued: vhost_work_queue()
	 * does nothing while it is NULL. */
	dev->worker = worker;
	wake_up_process(worker);	/* avoid contributing to loadavg */

	err = vhost_attach_cgroups(dev);
	if (err)
		goto err_cgroup;

	err = vhost_dev_alloc_iovecs(dev);
	if (err)
		goto err_cgroup;

	return 0;
	/* Unwind in reverse order of setup; each label falls through to the
	 * next. */
err_cgroup:
	kthread_stop(worker);
	dev->worker = NULL;
err_worker:
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
err_mm:
	return err;
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
+ */ + for (i = 0; i < dev->nvqs; ++i) + dev->vqs[i]->umem = umem; +} +EXPORT_SYMBOL_GPL(vhost_dev_reset_owner); + +void vhost_dev_stop(struct vhost_dev *dev) +{ + int i; + + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) { + vhost_poll_stop(&dev->vqs[i]->poll); + vhost_poll_flush(&dev->vqs[i]->poll); + } + } +} +EXPORT_SYMBOL_GPL(vhost_dev_stop); + +static void vhost_umem_free(struct vhost_umem *umem, + struct vhost_umem_node *node) +{ + vhost_umem_interval_tree_remove(node, &umem->umem_tree); + list_del(&node->link); + kfree(node); + umem->numem--; +} + +static void vhost_umem_clean(struct vhost_umem *umem) +{ + struct vhost_umem_node *node, *tmp; + + if (!umem) + return; + + list_for_each_entry_safe(node, tmp, &umem->umem_list, link) + vhost_umem_free(umem, node); + + kvfree(umem); +} + +static void vhost_clear_msg(struct vhost_dev *dev) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&dev->iotlb_lock); + + list_for_each_entry_safe(node, n, &dev->read_list, node) { + list_del(&node->node); + kfree(node); + } + + list_for_each_entry_safe(node, n, &dev->pending_list, node) { + list_del(&node->node); + kfree(node); + } + + spin_unlock(&dev->iotlb_lock); +} + +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i]->error_ctx) + eventfd_ctx_put(dev->vqs[i]->error_ctx); + if (dev->vqs[i]->kick) + fput(dev->vqs[i]->kick); + if (dev->vqs[i]->call_ctx) + eventfd_ctx_put(dev->vqs[i]->call_ctx); + vhost_vq_reset(dev, dev->vqs[i]); + } + vhost_dev_free_iovecs(dev); + if (dev->log_ctx) + eventfd_ctx_put(dev->log_ctx); + dev->log_ctx = NULL; + /* No one will access memory at this point */ + vhost_umem_clean(dev->umem); + dev->umem = NULL; + vhost_umem_clean(dev->iotlb); + dev->iotlb = NULL; + vhost_clear_msg(dev); + wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); + WARN_ON(!llist_empty(&dev->work_list)); + if (dev->worker) { + kthread_stop(dev->worker); + 
dev->worker = NULL; + } + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} +EXPORT_SYMBOL_GPL(vhost_dev_cleanup); + +static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +{ + u64 a = addr / VHOST_PAGE_SIZE / 8; + + /* Make sure 64 bit math will not overflow. */ + if (a > ULONG_MAX - (unsigned long)log_base || + a + (unsigned long)log_base > ULONG_MAX) + return false; + + return access_ok(VERIFY_WRITE, log_base + a, + (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); +} + +/* Make sure 64 bit math will not overflow. */ +static bool vhost_overflow(u64 uaddr, u64 size) +{ + if (uaddr > ULONG_MAX || size > ULONG_MAX) + return true; + + if (!size) + return false; + + return uaddr > ULONG_MAX - size + 1; +} + +/* Caller should have vq mutex and device mutex. */ +static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, + int log_all) +{ + struct vhost_umem_node *node; + + if (!umem) + return false; + + list_for_each_entry(node, &umem->umem_list, link) { + unsigned long a = node->userspace_addr; + + if (vhost_overflow(node->userspace_addr, node->size)) + return false; + + + if (!access_ok(VERIFY_WRITE, (void __user *)a, + node->size)) + return false; + else if (log_all && !log_access_ok(log_base, + node->start, + node->size)) + return false; + } + return true; +} + +static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, + u64 addr, unsigned int size, + int type) +{ + const struct vhost_umem_node *node = vq->meta_iotlb[type]; + + if (!node) + return NULL; + + return (void *)(uintptr_t)(node->userspace_addr + addr - node->start); +} + +/* Can we switch to this memory table? 
*/ +/* Caller should have device mutex but not vq mutex */ +static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, + int log_all) +{ + int i; + + for (i = 0; i < d->nvqs; ++i) { + bool ok; + bool log; + + mutex_lock(&d->vqs[i]->mutex); + log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL); + /* If ring is inactive, will check when it's enabled. */ + if (d->vqs[i]->private_data) + ok = vq_memory_access_ok(d->vqs[i]->log_base, + umem, log); + else + ok = true; + mutex_unlock(&d->vqs[i]->mutex); + if (!ok) + return false; + } + return true; +} + +static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, + struct iovec iov[], int iov_size, int access); + +static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to, + const void *from, unsigned size) +{ + int ret; + + if (!vq->iotlb) + return __copy_to_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that all vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. + */ + struct iov_iter t; + void __user *uaddr = vhost_vq_meta_fetch(vq, + (u64)(uintptr_t)to, size, + VHOST_ADDR_USED); + + if (uaddr) + return __copy_to_user(uaddr, from, size); + + ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_WO); + if (ret < 0) + goto out; + iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size); + ret = copy_to_iter(from, size, &t); + if (ret == size) + ret = 0; + } +out: + return ret; +} + +static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to, + void __user *from, unsigned size) +{ + int ret; + + if (!vq->iotlb) + return __copy_from_user(to, from, size); + else { + /* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. 
+ */ + void __user *uaddr = vhost_vq_meta_fetch(vq, + (u64)(uintptr_t)from, size, + VHOST_ADDR_DESC); + struct iov_iter f; + + if (uaddr) + return __copy_from_user(to, uaddr, size); + + ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", from, + (unsigned long long) size); + goto out; + } + iov_iter_init(&f, READ, vq->iotlb_iov, ret, size); + ret = copy_from_iter(to, size, &f); + if (ret == size) + ret = 0; + } + +out: + return ret; +} + +static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq, + void __user *addr, unsigned int size, + int type) +{ + int ret; + + ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov, + ARRAY_SIZE(vq->iotlb_iov), + VHOST_ACCESS_RO); + if (ret < 0) { + vq_err(vq, "IOTLB translation failure: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + if (ret != 1 || vq->iotlb_iov[0].iov_len != size) { + vq_err(vq, "Non atomic userspace memory access: uaddr " + "%p size 0x%llx\n", addr, + (unsigned long long) size); + return NULL; + } + + return vq->iotlb_iov[0].iov_base; +} + +/* This function should be called after iotlb + * prefetch, which means we're sure that vq + * could be access through iotlb. So -EAGAIN should + * not happen in this case. 
+ */ +static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, + void *addr, unsigned int size, + int type) +{ + void __user *uaddr = vhost_vq_meta_fetch(vq, + (u64)(uintptr_t)addr, size, type); + if (uaddr) + return uaddr; + + return __vhost_get_user_slow(vq, addr, size, type); +} + +#define vhost_put_user(vq, x, ptr) \ +({ \ + int ret = -EFAULT; \ + if (!vq->iotlb) { \ + ret = __put_user(x, ptr); \ + } else { \ + __typeof__(ptr) to = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), VHOST_ADDR_USED); \ + if (to != NULL) \ + ret = __put_user(x, to); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +#define vhost_get_user(vq, x, ptr, type) \ +({ \ + int ret; \ + if (!vq->iotlb) { \ + ret = __get_user(x, ptr); \ + } else { \ + __typeof__(ptr) from = \ + (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ + sizeof(*ptr), \ + type); \ + if (from != NULL) \ + ret = __get_user(x, from); \ + else \ + ret = -EFAULT; \ + } \ + ret; \ +}) + +#define vhost_get_avail(vq, x, ptr) \ + vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL) + +#define vhost_get_used(vq, x, ptr) \ + vhost_get_user(vq, x, ptr, VHOST_ADDR_USED) + +static void vhost_dev_lock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_lock_nested(&d->vqs[i]->mutex, i); +} + +static void vhost_dev_unlock_vqs(struct vhost_dev *d) +{ + int i = 0; + for (i = 0; i < d->nvqs; ++i) + mutex_unlock(&d->vqs[i]->mutex); +} + +static int vhost_new_umem_range(struct vhost_umem *umem, + u64 start, u64 size, u64 end, + u64 userspace_addr, int perm) +{ + struct vhost_umem_node *tmp, *node; + + if (!size) + return -EFAULT; + + node = kmalloc(sizeof(*node), GFP_ATOMIC); + if (!node) + return -ENOMEM; + + if (umem->numem == max_iotlb_entries) { + tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); + vhost_umem_free(umem, tmp); + } + + node->start = start; + node->size = size; + node->last = end; + node->userspace_addr = userspace_addr; + node->perm = perm; + 
INIT_LIST_HEAD(&node->link); + list_add_tail(&node->link, &umem->umem_list); + vhost_umem_interval_tree_insert(node, &umem->umem_tree); + umem->numem++; + + return 0; +} + +static void vhost_del_umem_range(struct vhost_umem *umem, + u64 start, u64 end) +{ + struct vhost_umem_node *node; + + while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + start, end))) + vhost_umem_free(umem, node); +} + +static void vhost_iotlb_notify_vq(struct vhost_dev *d, + struct vhost_iotlb_msg *msg) +{ + struct vhost_msg_node *node, *n; + + spin_lock(&d->iotlb_lock); + + list_for_each_entry_safe(node, n, &d->pending_list, node) { + struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb; + if (msg->iova <= vq_msg->iova && + msg->iova + msg->size - 1 >= vq_msg->iova && + vq_msg->type == VHOST_IOTLB_MISS) { + vhost_poll_queue(&node->vq->poll); + list_del(&node->node); + kfree(node); + } + } + + spin_unlock(&d->iotlb_lock); +} + +static bool umem_access_ok(u64 uaddr, u64 size, int access) +{ + unsigned long a = uaddr; + + /* Make sure 64 bit math will not overflow. 
*/ + if (vhost_overflow(uaddr, size)) + return false; + + if ((access & VHOST_ACCESS_RO) && + !access_ok(VERIFY_READ, (void __user *)a, size)) + return false; + if ((access & VHOST_ACCESS_WO) && + !access_ok(VERIFY_WRITE, (void __user *)a, size)) + return false; + return true; +} + +static int vhost_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + int ret = 0; + + mutex_lock(&dev->mutex); + vhost_dev_lock_vqs(dev); + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + if (!dev->iotlb) { + ret = -EFAULT; + break; + } + if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) { + ret = -EFAULT; + break; + } + vhost_vq_meta_reset(dev); + if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { + ret = -ENOMEM; + break; + } + vhost_iotlb_notify_vq(dev, msg); + break; + case VHOST_IOTLB_INVALIDATE: + if (!dev->iotlb) { + ret = -EFAULT; + break; + } + vhost_vq_meta_reset(dev); + vhost_del_umem_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); + break; + default: + ret = -EINVAL; + break; + } + + vhost_dev_unlock_vqs(dev); + mutex_unlock(&dev->mutex); + + return ret; +} +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from) +{ + struct vhost_iotlb_msg msg; + size_t offset; + int type, ret; + + ret = copy_from_iter(&type, sizeof(type), from); + if (ret != sizeof(type)) { + ret = -EINVAL; + goto done; + } + + switch (type) { + case VHOST_IOTLB_MSG: + /* There maybe a hole after type for V1 message type, + * so skip it here. + */ + offset = offsetof(struct vhost_msg, iotlb) - sizeof(int); + break; + case VHOST_IOTLB_MSG_V2: + offset = sizeof(__u32); + break; + default: + ret = -EINVAL; + goto done; + } + + iov_iter_advance(from, offset); + ret = copy_from_iter(&msg, sizeof(msg), from); + if (ret != sizeof(msg)) { + ret = -EINVAL; + goto done; + } + if (vhost_process_iotlb_msg(dev, &msg)) { + ret = -EFAULT; + goto done; + } + + ret = (type == VHOST_IOTLB_MSG) ? 
sizeof(struct vhost_msg) : + sizeof(struct vhost_msg_v2); +done: + return ret; +} +EXPORT_SYMBOL(vhost_chr_write_iter); + +__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(file, &dev->wait, wait); + + if (!list_empty(&dev->read_list)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} +EXPORT_SYMBOL(vhost_chr_poll); + +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock) +{ + DEFINE_WAIT(wait); + struct vhost_msg_node *node; + ssize_t ret = 0; + unsigned size = sizeof(struct vhost_msg); + + if (iov_iter_count(to) < size) + return 0; + + while (1) { + if (!noblock) + prepare_to_wait(&dev->wait, &wait, + TASK_INTERRUPTIBLE); + + node = vhost_dequeue_msg(dev, &dev->read_list); + if (node) + break; + if (noblock) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!dev->iotlb) { + ret = -EBADFD; + break; + } + + schedule(); + } + + if (!noblock) + finish_wait(&dev->wait, &wait); + + if (node) { + struct vhost_iotlb_msg *msg; + void *start = &node->msg; + + switch (node->msg.type) { + case VHOST_IOTLB_MSG: + size = sizeof(node->msg); + msg = &node->msg.iotlb; + break; + case VHOST_IOTLB_MSG_V2: + size = sizeof(node->msg_v2); + msg = &node->msg_v2.iotlb; + break; + default: + BUG(); + break; + } + + ret = copy_to_iter(start, size, to); + if (ret != size || msg->type != VHOST_IOTLB_MISS) { + kfree(node); + return ret; + } + vhost_enqueue_msg(dev, &dev->pending_list, node); + } + + return ret; +} +EXPORT_SYMBOL_GPL(vhost_chr_read_iter); + +static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) +{ + struct vhost_dev *dev = vq->dev; + struct vhost_msg_node *node; + struct vhost_iotlb_msg *msg; + bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2); + + node = vhost_new_msg(vq, v2 ? 
VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG); + if (!node) + return -ENOMEM; + + if (v2) { + node->msg_v2.type = VHOST_IOTLB_MSG_V2; + msg = &node->msg_v2.iotlb; + } else { + msg = &node->msg.iotlb; + } + + msg->type = VHOST_IOTLB_MISS; + msg->iova = iova; + msg->perm = access; + + vhost_enqueue_msg(dev, &dev->read_list, node); + + return 0; +} + +static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) + +{ + size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + + return access_ok(VERIFY_READ, desc, num * sizeof *desc) && + access_ok(VERIFY_READ, avail, + sizeof *avail + num * sizeof *avail->ring + s) && + access_ok(VERIFY_WRITE, used, + sizeof *used + num * sizeof *used->ring + s); +} + +static void vhost_vq_meta_update(struct vhost_virtqueue *vq, + const struct vhost_umem_node *node, + int type) +{ + int access = (type == VHOST_ADDR_USED) ? + VHOST_ACCESS_WO : VHOST_ACCESS_RO; + + if (likely(node->perm & access)) + vq->meta_iotlb[type] = node; +} + +static bool iotlb_access_ok(struct vhost_virtqueue *vq, + int access, u64 addr, u64 len, int type) +{ + const struct vhost_umem_node *node; + struct vhost_umem *umem = vq->iotlb; + u64 s = 0, size, orig_addr = addr, last = addr + len - 1; + + if (vhost_vq_meta_fetch(vq, addr, len, type)) + return true; + + while (len > s) { + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + addr, + last); + if (node == NULL || node->start > addr) { + vhost_iotlb_miss(vq, addr, access); + return false; + } else if (!(node->perm & access)) { + /* Report the possible access violation by + * request another translation from userspace. 
+ */ + return false; + } + + size = node->size - addr + node->start; + + if (orig_addr == addr && size >= len) + vhost_vq_meta_update(vq, node, type); + + s += size; + addr += size; + } + + return true; +} + +int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +{ + size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + unsigned int num = vq->num; + + if (!vq->iotlb) + return 1; + + return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + num * sizeof(*vq->desc), VHOST_ADDR_DESC) && + iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, + sizeof *vq->avail + + num * sizeof(*vq->avail->ring) + s, + VHOST_ADDR_AVAIL) && + iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, + sizeof *vq->used + + num * sizeof(*vq->used->ring) + s, + VHOST_ADDR_USED); +} +EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); + +/* Can we log writes? */ +/* Caller should have device mutex but not vq mutex */ +bool vhost_log_access_ok(struct vhost_dev *dev) +{ + return memory_access_ok(dev, dev->umem, 1); +} +EXPORT_SYMBOL_GPL(vhost_log_access_ok); + +/* Verify access for write logging. */ +/* Caller should have vq mutex and device mutex */ +static bool vq_log_access_ok(struct vhost_virtqueue *vq, + void __user *log_base) +{ + size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + + return vq_memory_access_ok(log_base, vq->umem, + vhost_has_feature(vq, VHOST_F_LOG_ALL)) && + (!vq->log_used || log_access_ok(log_base, vq->log_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring + s)); +} + +/* Can we start vq? 
*/ +/* Caller should have vq mutex and device mutex */ +bool vhost_vq_access_ok(struct vhost_virtqueue *vq) +{ + if (!vq_log_access_ok(vq, vq->log_base)) + return false; + + /* Access validation occurs at prefetch time with IOTLB */ + if (vq->iotlb) + return true; + + return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); +} +EXPORT_SYMBOL_GPL(vhost_vq_access_ok); + +static struct vhost_umem *vhost_umem_alloc(void) +{ + struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL); + + if (!umem) + return NULL; + + umem->umem_tree = RB_ROOT_CACHED; + umem->numem = 0; + INIT_LIST_HEAD(&umem->umem_list); + + return umem; +} + +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) +{ + struct vhost_memory mem, *newmem; + struct vhost_memory_region *region; + struct vhost_umem *newumem, *oldumem; + unsigned long size = offsetof(struct vhost_memory, regions); + int i; + + if (copy_from_user(&mem, m, size)) + return -EFAULT; + if (mem.padding) + return -EOPNOTSUPP; + if (mem.nregions > max_mem_regions) + return -E2BIG; + newmem = kvzalloc(struct_size(newmem, regions, mem.nregions), + GFP_KERNEL); + if (!newmem) + return -ENOMEM; + + memcpy(newmem, &mem, size); + if (copy_from_user(newmem->regions, m->regions, + mem.nregions * sizeof *m->regions)) { + kvfree(newmem); + return -EFAULT; + } + + newumem = vhost_umem_alloc(); + if (!newumem) { + kvfree(newmem); + return -ENOMEM; + } + + for (region = newmem->regions; + region < newmem->regions + mem.nregions; + region++) { + if (vhost_new_umem_range(newumem, + region->guest_phys_addr, + region->memory_size, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_ACCESS_RW)) + goto err; + } + + if (!memory_access_ok(d, newumem, 0)) + goto err; + + oldumem = d->umem; + d->umem = newumem; + + /* All memory accesses are done under some VQ mutex. 
*/ + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->umem = newumem; + mutex_unlock(&d->vqs[i]->mutex); + } + + kvfree(newmem); + vhost_umem_clean(oldumem); + return 0; + +err: + vhost_umem_clean(newumem); + kvfree(newmem); + return -EFAULT; +} + +long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) +{ + struct file *eventfp, *filep = NULL; + bool pollstart = false, pollstop = false; + struct eventfd_ctx *ctx = NULL; + u32 __user *idxp = argp; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + struct vhost_vring_file f; + struct vhost_vring_addr a; + u32 idx; + long r; + + r = get_user(idx, idxp); + if (r < 0) + return r; + if (idx >= d->nvqs) + return -ENOBUFS; + + idx = array_index_nospec(idx, d->nvqs); + vq = d->vqs[idx]; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + /* Resizing ring with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + if (copy_from_user(&s, argp, sizeof s)) { + r = -EFAULT; + break; + } + if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { + r = -EINVAL; + break; + } + vq->num = s.num; + break; + case VHOST_SET_VRING_BASE: + /* Moving base with an active backend? + * You don't want to do that. */ + if (vq->private_data) { + r = -EBUSY; + break; + } + if (copy_from_user(&s, argp, sizeof s)) { + r = -EFAULT; + break; + } + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; + /* Forget the cached index value. */ + vq->avail_idx = vq->last_avail_idx; + break; + case VHOST_GET_VRING_BASE: + s.index = idx; + s.num = vq->last_avail_idx; + if (copy_to_user(argp, &s, sizeof s)) + r = -EFAULT; + break; + case VHOST_SET_VRING_ADDR: + if (copy_from_user(&a, argp, sizeof a)) { + r = -EFAULT; + break; + } + if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { + r = -EOPNOTSUPP; + break; + } + /* For 32bit, verify that the top 32bits of the user + data are set to zero. 
*/ + if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || + (u64)(unsigned long)a.used_user_addr != a.used_user_addr || + (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { + r = -EFAULT; + break; + } + + /* Make sure it's safe to cast pointers to vring types. */ + BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE); + BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); + if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || + (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || + (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) { + r = -EINVAL; + break; + } + + /* We only verify access here if backend is configured. + * If it is not, we don't as size might not have been setup. + * We will verify when backend is configured. */ + if (vq->private_data) { + if (!vq_access_ok(vq, vq->num, + (void __user *)(unsigned long)a.desc_user_addr, + (void __user *)(unsigned long)a.avail_user_addr, + (void __user *)(unsigned long)a.used_user_addr)) { + r = -EINVAL; + break; + } + + /* Also validate log access for used ring if enabled. */ + if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && + !log_access_ok(vq->log_base, a.log_guest_addr, + sizeof *vq->used + + vq->num * sizeof *vq->used->ring)) { + r = -EINVAL; + break; + } + } + + vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); + vq->desc = (void __user *)(unsigned long)a.desc_user_addr; + vq->avail = (void __user *)(unsigned long)a.avail_user_addr; + vq->log_addr = a.log_guest_addr; + vq->used = (void __user *)(unsigned long)a.used_user_addr; + break; + case VHOST_SET_VRING_KICK: + if (copy_from_user(&f, argp, sizeof f)) { + r = -EFAULT; + break; + } + eventfp = f.fd == -1 ? 
NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) { + r = PTR_ERR(eventfp); + break; + } + if (eventfp != vq->kick) { + pollstop = (filep = vq->kick) != NULL; + pollstart = (vq->kick = eventfp) != NULL; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_CALL: + if (copy_from_user(&f, argp, sizeof f)) { + r = -EFAULT; + break; + } + ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd); + if (IS_ERR(ctx)) { + r = PTR_ERR(ctx); + break; + } + swap(ctx, vq->call_ctx); + break; + case VHOST_SET_VRING_ERR: + if (copy_from_user(&f, argp, sizeof f)) { + r = -EFAULT; + break; + } + ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd); + if (IS_ERR(ctx)) { + r = PTR_ERR(ctx); + break; + } + swap(ctx, vq->error_ctx); + break; + case VHOST_SET_VRING_ENDIAN: + r = vhost_set_vring_endian(vq, argp); + break; + case VHOST_GET_VRING_ENDIAN: + r = vhost_get_vring_endian(vq, idx, argp); + break; + case VHOST_SET_VRING_BUSYLOOP_TIMEOUT: + if (copy_from_user(&s, argp, sizeof(s))) { + r = -EFAULT; + break; + } + vq->busyloop_timeout = s.num; + break; + case VHOST_GET_VRING_BUSYLOOP_TIMEOUT: + s.index = idx; + s.num = vq->busyloop_timeout; + if (copy_to_user(argp, &s, sizeof(s))) + r = -EFAULT; + break; + default: + r = -ENOIOCTLCMD; + } + + if (pollstop && vq->handle_kick) + vhost_poll_stop(&vq->poll); + + if (!IS_ERR_OR_NULL(ctx)) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + + if (pollstart && vq->handle_kick) + r = vhost_poll_start(&vq->poll, vq->kick); + + mutex_unlock(&vq->mutex); + + if (pollstop && vq->handle_kick) + vhost_poll_flush(&vq->poll); + return r; +} +EXPORT_SYMBOL_GPL(vhost_vring_ioctl); + +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) +{ + struct vhost_umem *niotlb, *oiotlb; + int i; + + niotlb = vhost_umem_alloc(); + if (!niotlb) + return -ENOMEM; + + oiotlb = d->iotlb; + d->iotlb = niotlb; + + for (i = 0; i < d->nvqs; ++i) { + struct vhost_virtqueue *vq = d->vqs[i]; + + mutex_lock(&vq->mutex); + vq->iotlb = niotlb; + 
__vhost_vq_meta_reset(vq); + mutex_unlock(&vq->mutex); + } + + vhost_umem_clean(oiotlb); + + return 0; +} +EXPORT_SYMBOL_GPL(vhost_init_device_iotlb); + +/* Caller must have device mutex */ +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) +{ + struct eventfd_ctx *ctx; + u64 p; + long r; + int i, fd; + + /* If you are not the owner, you can become one */ + if (ioctl == VHOST_SET_OWNER) { + r = vhost_dev_set_owner(d); + goto done; + } + + /* You must be the owner to do anything else */ + r = vhost_dev_check_owner(d); + if (r) + goto done; + + switch (ioctl) { + case VHOST_SET_MEM_TABLE: + r = vhost_set_memory(d, argp); + break; + case VHOST_SET_LOG_BASE: + if (copy_from_user(&p, argp, sizeof p)) { + r = -EFAULT; + break; + } + if ((u64)(unsigned long)p != p) { + r = -EFAULT; + break; + } + for (i = 0; i < d->nvqs; ++i) { + struct vhost_virtqueue *vq; + void __user *base = (void __user *)(unsigned long)p; + vq = d->vqs[i]; + mutex_lock(&vq->mutex); + /* If ring is inactive, will check when it's enabled. */ + if (vq->private_data && !vq_log_access_ok(vq, base)) + r = -EFAULT; + else + vq->log_base = base; + mutex_unlock(&vq->mutex); + } + break; + case VHOST_SET_LOG_FD: + r = get_user(fd, (int __user *)argp); + if (r < 0) + break; + ctx = fd == -1 ? NULL : eventfd_ctx_fdget(fd); + if (IS_ERR(ctx)) { + r = PTR_ERR(ctx); + break; + } + swap(ctx, d->log_ctx); + for (i = 0; i < d->nvqs; ++i) { + mutex_lock(&d->vqs[i]->mutex); + d->vqs[i]->log_ctx = d->log_ctx; + mutex_unlock(&d->vqs[i]->mutex); + } + if (ctx) + eventfd_ctx_put(ctx); + break; + default: + r = -ENOIOCTLCMD; + break; + } +done: + return r; +} +EXPORT_SYMBOL_GPL(vhost_dev_ioctl); + +/* TODO: This is really inefficient. We need something like get_user() + * (instruction directly accesses the data, with an exception table entry + * returning -EFAULT). See Documentation/x86/exception-tables.txt. 
+ */ +static int set_bit_to_user(int nr, void __user *addr) +{ + unsigned long log = (unsigned long)addr; + struct page *page; + void *base; + int bit = nr + (log % PAGE_SIZE) * 8; + int r; + + r = get_user_pages_fast(log, 1, 1, &page); + if (r < 0) + return r; + BUG_ON(r != 1); + base = kmap_atomic(page); + set_bit(bit, base); + kunmap_atomic(base); + set_page_dirty_lock(page); + put_page(page); + return 0; +} + +static int log_write(void __user *log_base, + u64 write_address, u64 write_length) +{ + u64 write_page = write_address / VHOST_PAGE_SIZE; + int r; + + if (!write_length) + return 0; + write_length += write_address % VHOST_PAGE_SIZE; + for (;;) { + u64 base = (u64)(unsigned long)log_base; + u64 log = base + write_page / 8; + int bit = write_page % 8; + if ((u64)(unsigned long)log != log) + return -EFAULT; + r = set_bit_to_user(bit, (void __user *)(unsigned long)log); + if (r < 0) + return r; + if (write_length <= VHOST_PAGE_SIZE) + break; + write_length -= VHOST_PAGE_SIZE; + write_page += 1; + } + return r; +} + +static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) +{ + struct vhost_umem *umem = vq->umem; + struct vhost_umem_node *u; + u64 start, end, l, min; + int r; + bool hit = false; + + while (len) { + min = len; + /* More than one GPAs can be mapped into a single HVA. So + * iterate all possible umems here to be safe. 
+ */ + list_for_each_entry(u, &umem->umem_list, link) { + if (u->userspace_addr > hva - 1 + len || + u->userspace_addr - 1 + u->size < hva) + continue; + start = max(u->userspace_addr, hva); + end = min(u->userspace_addr - 1 + u->size, + hva - 1 + len); + l = end - start + 1; + r = log_write(vq->log_base, + u->start + start - u->userspace_addr, + l); + if (r < 0) + return r; + hit = true; + min = min(l, min); + } + + if (!hit) + return -EFAULT; + + len -= min; + hva += min; + } + + return 0; +} + +static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) +{ + struct iovec iov[64]; + int i, ret; + + if (!vq->iotlb) + return log_write(vq->log_base, vq->log_addr + used_offset, len); + + ret = translate_desc(vq, (uintptr_t)vq->used + used_offset, + len, iov, 64, VHOST_ACCESS_WO); + if (ret < 0) + return ret; + + for (i = 0; i < ret; i++) { + ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base, + iov[i].iov_len); + if (ret) + return ret; + } + + return 0; +} + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len, struct iovec *iov, int count) +{ + int i, r; + + /* Make sure data written is seen before log. */ + smp_wmb(); + + if (vq->iotlb) { + for (i = 0; i < count; i++) { + r = log_write_hva(vq, (uintptr_t)iov[i].iov_base, + iov[i].iov_len); + if (r < 0) + return r; + } + return 0; + } + + for (i = 0; i < log_num; ++i) { + u64 l = min(log[i].len, len); + r = log_write(vq->log_base, log[i].addr, l); + if (r < 0) + return r; + len -= l; + if (!len) { + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + return 0; + } + } + /* Length written exceeds what we have stored. This is a bug. 
*/ + BUG(); + return 0; +} +EXPORT_SYMBOL_GPL(vhost_log_write); + +static int vhost_update_used_flags(struct vhost_virtqueue *vq) +{ + void __user *used; + if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), + &vq->used->flags) < 0) + return -EFAULT; + if (unlikely(vq->log_used)) { + /* Make sure the flag is seen before log. */ + smp_wmb(); + /* Log used flag write. */ + used = &vq->used->flags; + log_used(vq, (used - (void __user *)vq->used), + sizeof vq->used->flags); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + return 0; +} + +static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) +{ + if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), + vhost_avail_event(vq))) + return -EFAULT; + if (unlikely(vq->log_used)) { + void __user *used; + /* Make sure the event is seen before log. */ + smp_wmb(); + /* Log avail event write */ + used = vhost_avail_event(vq); + log_used(vq, (used - (void __user *)vq->used), + sizeof *vhost_avail_event(vq)); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + return 0; +} + +int vhost_vq_init_access(struct vhost_virtqueue *vq) +{ + __virtio16 last_used_idx; + int r; + bool is_le = vq->is_le; + + if (!vq->private_data) + return 0; + + vhost_init_is_le(vq); + + r = vhost_update_used_flags(vq); + if (r) + goto err; + vq->signalled_used_valid = false; + if (!vq->iotlb && + !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + r = -EFAULT; + goto err; + } + r = vhost_get_used(vq, last_used_idx, &vq->used->idx); + if (r) { + vq_err(vq, "Can't access used idx at %p\n", + &vq->used->idx); + goto err; + } + vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); + return 0; + +err: + vq->is_le = is_le; + return r; +} +EXPORT_SYMBOL_GPL(vhost_vq_init_access); + +static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, + struct iovec iov[], int iov_size, int access) +{ + const struct vhost_umem_node *node; + struct vhost_dev *dev = vq->dev; + struct vhost_umem 
*umem = dev->iotlb ? dev->iotlb : dev->umem; + struct iovec *_iov; + u64 s = 0; + int ret = 0; + + while ((u64)len > s) { + u64 size; + if (unlikely(ret >= iov_size)) { + ret = -ENOBUFS; + break; + } + + node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, + addr, addr + len - 1); + if (node == NULL || node->start > addr) { + if (umem != dev->iotlb) { + ret = -EFAULT; + break; + } + ret = -EAGAIN; + break; + } else if (!(node->perm & access)) { + ret = -EPERM; + break; + } + + _iov = iov + ret; + size = node->size - addr + node->start; + _iov->iov_len = min((u64)len - s, size); + _iov->iov_base = (void __user *)(unsigned long) + (node->userspace_addr + addr - node->start); + s += size; + addr += size; + ++ret; + } + + if (ret == -EAGAIN) + vhost_iotlb_miss(vq, addr, access); + return ret; +} + +/* Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, + * or -1U if we're at the end. */ +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT))) + return -1U; + + /* Check they're not leading us off end of descriptors. 
*/ + next = vhost16_to_cpu(vq, READ_ONCE(desc->next)); + return next; +} + +static int get_indirect(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num, + struct vring_desc *indirect) +{ + struct vring_desc desc; + unsigned int i = 0, count, found = 0; + u32 len = vhost32_to_cpu(vq, indirect->len); + struct iov_iter from; + int ret, access; + + /* Sanity check */ + if (unlikely(len % sizeof desc)) { + vq_err(vq, "Invalid length in indirect descriptor: " + "len 0x%llx not multiple of 0x%zx\n", + (unsigned long long)len, + sizeof desc); + return -EINVAL; + } + + ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect, + UIO_MAXIOV, VHOST_ACCESS_RO); + if (unlikely(ret < 0)) { + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d in indirect.\n", ret); + return ret; + } + iov_iter_init(&from, READ, vq->indirect, ret, len); + + /* We will use the result as an address to read from, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + count = len / sizeof desc; + /* Buffers are chained via a 16 bit next field, so + * we can have at most 2^16 of these. 
*/ + if (unlikely(count > USHRT_MAX + 1)) { + vq_err(vq, "Indirect buffer length too big: %d\n", + indirect->len); + return -E2BIG; + } + + do { + unsigned iov_count = *in_num + *out_num; + if (unlikely(++found > count)) { + vq_err(vq, "Loop detected: last one at %u " + "indirect size %u\n", + i, count); + return -EINVAL; + } + if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) { + vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", + i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); + return -EINVAL; + } + if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) { + vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n", + i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); + return -EINVAL; + } + + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; + + ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), + vhost32_to_cpu(vq, desc.len), iov + iov_count, + iov_size - iov_count, access); + if (unlikely(ret < 0)) { + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d indirect idx %d\n", + ret, i); + return ret; + } + /* If this is an input descriptor, increment that count. */ + if (access == VHOST_ACCESS_WO) { + *in_num += ret; + if (unlikely(log && ret)) { + log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); + log[*log_num].len = vhost32_to_cpu(vq, desc.len); + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (unlikely(*in_num)) { + vq_err(vq, "Indirect descriptor " + "has out after in: idx %d\n", i); + return -EINVAL; + } + *out_num += ret; + } + } while ((i = next_desc(vq, &desc)) != -1); + return 0; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. 
Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which is + * never a valid descriptor number) if none was found. A negative code is + * returned on error. */ +int vhost_get_vq_desc(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num) +{ + struct vring_desc desc; + unsigned int i, head, found = 0; + u16 last_avail_idx; + __virtio16 avail_idx; + __virtio16 ring_head; + int ret, access; + + /* Check it isn't doing very strange things with descriptor numbers. */ + last_avail_idx = vq->last_avail_idx; + + if (vq->avail_idx == vq->last_avail_idx) { + if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) { + vq_err(vq, "Failed to access avail idx at %p\n", + &vq->avail->idx); + return -EFAULT; + } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + + if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, vq->avail_idx); + return -EFAULT; + } + + /* If there's nothing new since last we looked, return + * invalid. + */ + if (vq->avail_idx == last_avail_idx) + return vq->num; + + /* Only get avail ring entries after they have been + * exposed by guest. + */ + smp_rmb(); + } + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + if (unlikely(vhost_get_avail(vq, ring_head, + &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + last_avail_idx, + &vq->avail->ring[last_avail_idx % vq->num]); + return -EFAULT; + } + + head = vhost16_to_cpu(vq, ring_head); + + /* If their number is silly, that's an error. 
*/ + if (unlikely(head >= vq->num)) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return -EINVAL; + } + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + if (unlikely(log)) + *log_num = 0; + + i = head; + do { + unsigned iov_count = *in_num + *out_num; + if (unlikely(i >= vq->num)) { + vq_err(vq, "Desc index is %u > %u, head = %u", + i, vq->num, head); + return -EINVAL; + } + if (unlikely(++found > vq->num)) { + vq_err(vq, "Loop detected: last one at %u " + "vq size %u head %u\n", + i, vq->num, head); + return -EINVAL; + } + ret = vhost_copy_from_user(vq, &desc, vq->desc + i, + sizeof desc); + if (unlikely(ret)) { + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", + i, vq->desc + i); + return -EFAULT; + } + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) { + ret = get_indirect(vq, iov, iov_size, + out_num, in_num, + log, log_num, &desc); + if (unlikely(ret < 0)) { + if (ret != -EAGAIN) + vq_err(vq, "Failure detected " + "in indirect descriptor at idx %d\n", i); + return ret; + } + continue; + } + + if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE)) + access = VHOST_ACCESS_WO; + else + access = VHOST_ACCESS_RO; + ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr), + vhost32_to_cpu(vq, desc.len), iov + iov_count, + iov_size - iov_count, access); + if (unlikely(ret < 0)) { + if (ret != -EAGAIN) + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); + return ret; + } + if (access == VHOST_ACCESS_WO) { + /* If this is an input descriptor, + * increment that count. */ + *in_num += ret; + if (unlikely(log && ret)) { + log[*log_num].addr = vhost64_to_cpu(vq, desc.addr); + log[*log_num].len = vhost32_to_cpu(vq, desc.len); + ++*log_num; + } + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. 
*/ + if (unlikely(*in_num)) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return -EINVAL; + } + *out_num += ret; + } + } while ((i = next_desc(vq, &desc)) != -1); + + /* On success, increment avail index. */ + vq->last_avail_idx++; + + /* Assume notifications from guest are disabled at this point, + * if they aren't we would need to update avail_event index. */ + BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); + return head; +} +EXPORT_SYMBOL_GPL(vhost_get_vq_desc); + +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) +{ + vq->last_avail_idx -= n; +} +EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem heads = { + cpu_to_vhost32(vq, head), + cpu_to_vhost32(vq, len) + }; + + return vhost_add_used_n(vq, &heads, 1); +} +EXPORT_SYMBOL_GPL(vhost_add_used); + +static int __vhost_add_used_n(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + unsigned count) +{ + struct vring_used_elem __user *used; + u16 old, new; + int start; + + start = vq->last_used_idx & (vq->num - 1); + used = vq->used->ring + start; + if (count == 1) { + if (vhost_put_user(vq, heads[0].id, &used->id)) { + vq_err(vq, "Failed to write used id"); + return -EFAULT; + } + if (vhost_put_user(vq, heads[0].len, &used->len)) { + vq_err(vq, "Failed to write used len"); + return -EFAULT; + } + } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) { + vq_err(vq, "Failed to write used"); + return -EFAULT; + } + if (unlikely(vq->log_used)) { + /* Make sure data is seen before log. */ + smp_wmb(); + /* Log used ring entry write. 
*/ + log_used(vq, ((void __user *)used - (void __user *)vq->used), + count * sizeof *used); + } + old = vq->last_used_idx; + new = (vq->last_used_idx += count); + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) + vq->signalled_used_valid = false; + return 0; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, + unsigned count) +{ + int start, n, r; + + start = vq->last_used_idx & (vq->num - 1); + n = vq->num - start; + if (n < count) { + r = __vhost_add_used_n(vq, heads, n); + if (r < 0) + return r; + heads += n; + count -= n; + } + r = __vhost_add_used_n(vq, heads, count); + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), + &vq->used->idx)) { + vq_err(vq, "Failed to increment used idx"); + return -EFAULT; + } + if (unlikely(vq->log_used)) { + /* Make sure used idx is seen before log. */ + smp_wmb(); + /* Log used index update. */ + log_used(vq, offsetof(struct vring_used, idx), + sizeof vq->used->idx); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + return r; +} +EXPORT_SYMBOL_GPL(vhost_add_used_n); + +static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __u16 old, new; + __virtio16 event; + bool v; + /* Flush out used index updates. This is paired + * with the barrier that the Guest executes when enabling + * interrupts. 
*/ + smp_mb(); + + if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) && + unlikely(vq->avail_idx == vq->last_avail_idx)) + return true; + + if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { + __virtio16 flags; + if (vhost_get_avail(vq, flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return true; + } + return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT)); + } + old = vq->signalled_used; + v = vq->signalled_used_valid; + new = vq->signalled_used = vq->last_used_idx; + vq->signalled_used_valid = true; + + if (unlikely(!v)) + return true; + + if (vhost_get_avail(vq, event, vhost_used_event(vq))) { + vq_err(vq, "Failed to get used event idx"); + return true; + } + return vring_need_event(vhost16_to_cpu(vq, event), new, old); +} + +/* This actually signals the guest, using eventfd. */ +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + /* Signal the Guest tell them we used something up. */ + if (vq->call_ctx && vhost_notify(dev, vq)) + eventfd_signal(vq->call_ctx, 1); +} +EXPORT_SYMBOL_GPL(vhost_signal); + +/* And here's the combo meal deal. Supersize me! 
*/ +void vhost_add_used_and_signal(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_signal(dev, vq); +} +EXPORT_SYMBOL_GPL(vhost_add_used_and_signal); + +/* multi-buffer version of vhost_add_used_and_signal */ +void vhost_add_used_and_signal_n(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + struct vring_used_elem *heads, unsigned count) +{ + vhost_add_used_n(vq, heads, count); + vhost_signal(dev, vq); +} +EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); + +/* Return true if we're sure that the available ring is empty. When the + * cached avail index shows nothing pending, it is re-read from the guest + * into vq->avail_idx; a failed read is reported as "not empty". */ +bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __virtio16 avail_idx; + int r; + + if (vq->avail_idx != vq->last_avail_idx) + return false; + + r = vhost_get_avail(vq, avail_idx, &vq->avail->idx); + if (unlikely(r)) + return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + + return vq->avail_idx == vq->last_avail_idx; +} +EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); + +/* OK, now we need to know about added descriptors. */ +bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __virtio16 avail_idx; + int r; + + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) + return false; + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; + if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { + r = vhost_update_used_flags(vq); + if (r) { + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + return false; + } + } else { + r = vhost_update_avail_event(vq, vq->avail_idx); + if (r) { + vq_err(vq, "Failed to update avail event index at %p: %d\n", + vhost_avail_event(vq), r); + return false; + } + } + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. 
*/ + smp_mb(); + r = vhost_get_avail(vq, avail_idx, &vq->avail->idx); + if (r) { + vq_err(vq, "Failed to check avail idx at %p: %d\n", + &vq->avail->idx, r); + return false; + } + + return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx; +} +EXPORT_SYMBOL_GPL(vhost_enable_notify); + +/* We don't need to be notified again. Sets VRING_USED_F_NO_NOTIFY in the + * cached used flags (no-op if it is already set) and, when EVENT_IDX was not + * negotiated, pushes the flags out with vhost_update_used_flags(). A failed + * update is only reported through vq_err(); there is no return value. */ +void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + int r; + + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) + return; + vq->used_flags |= VRING_USED_F_NO_NOTIFY; + if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { + r = vhost_update_used_flags(vq); + if (r) + vq_err(vq, "Failed to disable notification at %p: %d\n", + &vq->used->flags, r); + } +} +EXPORT_SYMBOL_GPL(vhost_disable_notify); + +/* Create a new message. */ +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type) +{ + struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL); + if (!node) + return NULL; + + /* Make sure all padding within the structure is initialized. 
*/ + memset(&node->msg, 0, sizeof node->msg); + node->vq = vq; + node->msg.type = type; + return node; +} +EXPORT_SYMBOL_GPL(vhost_new_msg); + +void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head, + struct vhost_msg_node *node) +{ + spin_lock(&dev->iotlb_lock); + list_add_tail(&node->node, head); + spin_unlock(&dev->iotlb_lock); + + wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); +} +EXPORT_SYMBOL_GPL(vhost_enqueue_msg); + +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head) +{ + struct vhost_msg_node *node = NULL; + + spin_lock(&dev->iotlb_lock); + if (!list_empty(head)) { + node = list_first_entry(head, struct vhost_msg_node, + node); + list_del(&node->node); + } + spin_unlock(&dev->iotlb_lock); + + return node; +} +EXPORT_SYMBOL_GPL(vhost_dequeue_msg); + + +static int __init vhost_init(void) +{ + return 0; +} + +static void __exit vhost_exit(void) +{ +} + +module_init(vhost_init); +module_exit(vhost_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio"); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h new file mode 100644 index 000000000..27a78a9b8 --- /dev/null +++ b/drivers/vhost/vhost.h @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VHOST_H +#define _VHOST_H + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/atomic.h> + +struct vhost_work; +typedef void (*vhost_work_fn_t)(struct vhost_work *work); + +#define VHOST_WORK_QUEUED 1 +struct vhost_work { + struct llist_node node; + vhost_work_fn_t fn; + unsigned long flags; +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. 
*/ +struct vhost_poll { + poll_table table; + wait_queue_head_t *wqh; + wait_queue_entry_t wait; + struct vhost_work work; + __poll_t mask; + struct vhost_dev *dev; +}; + +void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); +void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work); +bool vhost_has_work(struct vhost_dev *dev); + +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + __poll_t mask, struct vhost_dev *dev); +int vhost_poll_start(struct vhost_poll *poll, struct file *file); +void vhost_poll_stop(struct vhost_poll *poll); +void vhost_poll_flush(struct vhost_poll *poll); +void vhost_poll_queue(struct vhost_poll *poll); +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); +long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); + +struct vhost_log { + u64 addr; + u64 len; +}; + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +struct vhost_umem_node { + struct rb_node rb; + struct list_head link; + __u64 start; + __u64 last; + __u64 size; + __u64 userspace_addr; + __u32 perm; + __u32 flags_padding; + __u64 __subtree_last; +}; + +struct vhost_umem { + struct rb_root_cached umem_tree; + struct list_head umem_list; + int numem; +}; + +enum vhost_uaddr_type { + VHOST_ADDR_DESC = 0, + VHOST_ADDR_AVAIL = 1, + VHOST_ADDR_USED = 2, + VHOST_NUM_ADDRS = 3, +}; + +/* The virtqueue structure describes a queue attached to a device. */ +struct vhost_virtqueue { + struct vhost_dev *dev; + + /* The actual ring of buffers. */ + struct mutex mutex; + unsigned int num; + struct vring_desc __user *desc; + struct vring_avail __user *avail; + struct vring_used __user *used; + const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; + struct file *kick; + struct eventfd_ctx *call_ctx; + struct eventfd_ctx *error_ctx; + struct eventfd_ctx *log_ctx; + + struct vhost_poll poll; + + /* The routine to call when the Guest pings us, or timeout. 
*/ + vhost_work_fn_t handle_kick; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* Caches available index value from user. */ + u16 avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* Used flags */ + u16 used_flags; + + /* Last used index value we have signalled on */ + u16 signalled_used; + + /* Whether signalled_used holds a usable value; cleared when the used + * index wraps past it, which makes vhost_notify() signal unconditionally. */ + bool signalled_used_valid; + + /* Log writes to used structure. */ + bool log_used; + u64 log_addr; + + struct iovec iov[UIO_MAXIOV]; + struct iovec iotlb_iov[64]; + struct iovec *indirect; + struct vring_used_elem *heads; + /* Protected by virtqueue mutex. */ + struct vhost_umem *umem; + struct vhost_umem *iotlb; + void *private_data; + u64 acked_features; + u64 acked_backend_features; + /* Log write descriptors */ + void __user *log_base; + struct vhost_log *log; + + /* Ring endianness. Defaults to legacy native endianness. + * Set to true when starting a modern virtio device. */ + bool is_le; +#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY + /* Ring endianness requested by userspace for cross-endian support. 
*/ + bool user_be; +#endif + u32 busyloop_timeout; +}; + +struct vhost_msg_node { + union { + struct vhost_msg msg; + struct vhost_msg_v2 msg_v2; + }; + struct vhost_virtqueue *vq; + struct list_head node; +}; + +struct vhost_dev { + struct mm_struct *mm; + struct mutex mutex; + struct vhost_virtqueue **vqs; + int nvqs; + struct eventfd_ctx *log_ctx; + struct llist_head work_list; + struct task_struct *worker; + struct vhost_umem *umem; + struct vhost_umem *iotlb; + spinlock_t iotlb_lock; + struct list_head read_list; + struct list_head pending_list; + wait_queue_head_t wait; + int iov_limit; + int weight; + int byte_weight; +}; + +bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len); +void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, + int nvqs, int iov_limit, int weight, int byte_weight); +long vhost_dev_set_owner(struct vhost_dev *dev); +bool vhost_dev_has_owner(struct vhost_dev *dev); +long vhost_dev_check_owner(struct vhost_dev *); +struct vhost_umem *vhost_dev_reset_owner_prepare(void); +void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *); +void vhost_dev_cleanup(struct vhost_dev *); +void vhost_dev_stop(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); +long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); +bool vhost_vq_access_ok(struct vhost_virtqueue *vq); +bool vhost_log_access_ok(struct vhost_dev *); + +int vhost_get_vq_desc(struct vhost_virtqueue *, + struct iovec iov[], unsigned int iov_count, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); + +int vhost_vq_init_access(struct vhost_virtqueue *); +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, + unsigned count); +void 
vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, + unsigned int id, int len); +void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, + struct vring_used_elem *heads, unsigned count); +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); +void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); +bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *); +bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); + +int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, + unsigned int log_num, u64 len, + struct iovec *iov, int count); +int vq_iotlb_prefetch(struct vhost_virtqueue *vq); + +struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type); +void vhost_enqueue_msg(struct vhost_dev *dev, + struct list_head *head, + struct vhost_msg_node *node); +struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev, + struct list_head *head); +__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev, + poll_table *wait); +ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to, + int noblock); +ssize_t vhost_chr_write_iter(struct vhost_dev *dev, + struct iov_iter *from); +int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled); + +#define vq_err(vq, fmt, ...) 
do { \ + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +enum { + VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | + (1ULL << VHOST_F_LOG_ALL) | + (1ULL << VIRTIO_F_ANY_LAYOUT) | + (1ULL << VIRTIO_F_VERSION_1) +}; + +static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit) +{ + return vq->acked_features & (1ULL << bit); +} + +static inline bool vhost_backend_has_feature(struct vhost_virtqueue *vq, int bit) +{ + return vq->acked_backend_features & (1ULL << bit); +} + +#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY +static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq) +{ + return vq->is_le; +} +#else +static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq) +{ + return virtio_legacy_is_little_endian() || vq->is_le; +} +#endif + +/* Memory accessors */ +static inline u16 vhost16_to_cpu(struct vhost_virtqueue *vq, __virtio16 val) +{ + return __virtio16_to_cpu(vhost_is_little_endian(vq), val); +} + +static inline __virtio16 cpu_to_vhost16(struct vhost_virtqueue *vq, u16 val) +{ + return __cpu_to_virtio16(vhost_is_little_endian(vq), val); +} + +static inline u32 vhost32_to_cpu(struct vhost_virtqueue *vq, __virtio32 val) +{ + return __virtio32_to_cpu(vhost_is_little_endian(vq), val); +} + +static inline __virtio32 cpu_to_vhost32(struct vhost_virtqueue *vq, u32 val) +{ + return __cpu_to_virtio32(vhost_is_little_endian(vq), val); +} + +static inline u64 vhost64_to_cpu(struct vhost_virtqueue *vq, __virtio64 val) +{ + return __virtio64_to_cpu(vhost_is_little_endian(vq), val); +} + +static inline __virtio64 cpu_to_vhost64(struct vhost_virtqueue *vq, u64 val) +{ + return __cpu_to_virtio64(vhost_is_little_endian(vq), val); +} +#endif diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000..6b2efff1c --- /dev/null +++ b/drivers/vhost/vringh.c 
@@ -0,0 +1,1045 @@ +/* + * Helpers for the host side of a virtio ring. + * + * Since these may be in userspace, we use (inline) accessors. + */ +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/vringh.h> +#include <linux/virtio_ring.h> +#include <linux/kernel.h> +#include <linux/ratelimit.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <uapi/linux/virtio_config.h> + +static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(vringh_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (__ratelimit(&vringh_rs)) { + va_list ap; + va_start(ap, fmt); + printk(KERN_NOTICE "vringh:"); + vprintk(fmt, ap); + va_end(ap); + } +} + +/* Returns vring->num if empty, -ve on error. */ +static inline int __vringh_get_head(const struct vringh *vrh, + int (*getu16)(const struct vringh *vrh, + u16 *val, const __virtio16 *p), + u16 *last_avail_idx) +{ + u16 avail_idx, i, head; + int err; + + err = getu16(vrh, &avail_idx, &vrh->vring.avail->idx); + if (err) { + vringh_bad("Failed to access avail idx at %p", + &vrh->vring.avail->idx); + return err; + } + + if (*last_avail_idx == avail_idx) + return vrh->vring.num; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = *last_avail_idx & (vrh->vring.num - 1); + + err = getu16(vrh, &head, &vrh->vring.avail->ring[i]); + if (err) { + vringh_bad("Failed to read head: idx %d address %p", + *last_avail_idx, &vrh->vring.avail->ring[i]); + return err; + } + + if (head >= vrh->vring.num) { + vringh_bad("Guest says index %u > %u is available", + head, vrh->vring.num); + return -EINVAL; + } + + (*last_avail_idx)++; + return head; +} + +/* Copy some bytes to/from the iovec. Returns num copied. 
*/ +static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, + void *ptr, size_t len, + int (*xfer)(void *addr, void *ptr, + size_t len)) +{ + int err, done = 0; + + while (len && iov->i < iov->used) { + size_t partlen; + + partlen = min(iov->iov[iov->i].iov_len, len); + err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + done += partlen; + len -= partlen; + ptr += partlen; + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + } + return done; +} + +/* May reduce *len if range is shorter. */ +static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + if (addr < range->start || addr > range->end_incl) { + if (!getrange(vrh, addr, range)) + return false; + } + BUG_ON(addr < range->start || addr > range->end_incl); + + /* To end of memory? */ + if (unlikely(addr + *len == 0)) { + if (range->end_incl == -1ULL) + return true; + goto truncate; + } + + /* Otherwise, don't wrap. */ + if (addr + *len < addr) { + vringh_bad("Wrapping descriptor %zu@0x%llx", + *len, (unsigned long long)addr); + return false; + } + + if (unlikely(addr + *len - 1 > range->end_incl)) + goto truncate; + return true; + +truncate: + *len = range->end_incl + 1 - addr; + return true; +} + +static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + return true; +} + +/* No reason for this code to be inline. 
*/ +static int move_to_indirect(const struct vringh *vrh, + int *up_next, u16 *i, void *addr, + const struct vring_desc *desc, + struct vring_desc **descs, int *desc_max) +{ + u32 len; + + /* Indirect tables can't have indirect. */ + if (*up_next != -1) { + vringh_bad("Multilevel indirect %u->%u", *up_next, *i); + return -EINVAL; + } + + /* The table must hold a whole number of descriptors. Report the + * CPU-endian length that was tested, not the raw ring-endian field. */ + len = vringh32_to_cpu(vrh, desc->len); + if (unlikely(len % sizeof(struct vring_desc))) { + vringh_bad("Strange indirect len %u", len); + return -EINVAL; + } + + /* We will check this when we follow it! */ + if (desc->flags & cpu_to_vringh16(vrh, VRING_DESC_F_NEXT)) + *up_next = vringh16_to_cpu(vrh, desc->next); + else + *up_next = -2; + *descs = addr; + *desc_max = len / sizeof(struct vring_desc); + + /* Now, start at the first indirect. */ + *i = 0; + return 0; +} + +static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) +{ + struct kvec *new; + unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; + + if (new_num < 8) + new_num = 8; + + flag = (iov->max_num & VRINGH_IOV_ALLOCATED); + if (flag) + new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); + else { + new = kmalloc_array(new_num, sizeof(struct iovec), gfp); + if (new) { + memcpy(new, iov->iov, + iov->max_num * sizeof(struct iovec)); + flag = VRINGH_IOV_ALLOCATED; + } + } + if (!new) + return -ENOMEM; + iov->iov = new; + iov->max_num = (new_num | flag); + return 0; +} + +static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, + struct vring_desc **descs, int *desc_max) +{ + u16 i = *up_next; + + *up_next = -1; + *descs = vrh->vring.desc; + *desc_max = vrh->vring.num; + return i; +} + +static int slow_copy(struct vringh *vrh, void *dst, const void *src, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *vrh, + u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *vrh, + u64 addr, + struct vringh_range *r), + struct vringh_range 
*range, + int (*copy)(void *dst, const void *src, size_t len)) +{ + size_t part, len = sizeof(struct vring_desc); + + do { + u64 addr; + int err; + + part = len; + addr = (u64)(unsigned long)src - range->offset; + + if (!rcheck(vrh, addr, &part, range, getrange)) + return -EINVAL; + + err = copy(dst, src, part); + if (err) + return err; + + dst += part; + src += part; + len -= part; + } while (len); + return 0; +} + +static inline int +__vringh_iov(struct vringh *vrh, u16 i, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *, u64, struct vringh_range *), + gfp_t gfp, + int (*copy)(void *dst, const void *src, size_t len)) +{ + int err, count = 0, indirect_count = 0, up_next, desc_max; + struct vring_desc desc, *descs; + struct vringh_range range = { -1ULL, 0 }, slowrange; + bool slow = false; + + /* We start traversing vring's descriptor table. */ + descs = vrh->vring.desc; + desc_max = vrh->vring.num; + up_next = -1; + + /* You must want something! */ + if (WARN_ON(!riov && !wiov)) + return -EINVAL; + + if (riov) + riov->i = riov->used = 0; + if (wiov) + wiov->i = wiov->used = 0; + + for (;;) { + void *addr; + struct vringh_kiov *iov; + size_t len; + + if (unlikely(slow)) + err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, + &slowrange, copy); + else + err = copy(&desc, &descs[i], sizeof(desc)); + if (unlikely(err)) + goto fail; + + if (unlikely(desc.flags & + cpu_to_vringh16(vrh, VRING_DESC_F_INDIRECT))) { + u64 a = vringh64_to_cpu(vrh, desc.addr); + + /* Make sure it's OK, and get offset. 
*/ + len = vringh32_to_cpu(vrh, desc.len); + if (!rcheck(vrh, a, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(len != vringh32_to_cpu(vrh, desc.len))) { + slow = true; + /* We need to save this range to use offset */ + slowrange = range; + } + + addr = (void *)(long)(a + range.offset); + err = move_to_indirect(vrh, &up_next, &i, addr, &desc, + &descs, &desc_max); + if (err) + goto fail; + continue; + } + + if (up_next == -1) + count++; + else + indirect_count++; + + if (count > vrh->vring.num || indirect_count > desc_max) { + vringh_bad("Descriptor loop in %p", descs); + err = -ELOOP; + goto fail; + } + + if (desc.flags & cpu_to_vringh16(vrh, VRING_DESC_F_WRITE)) + iov = wiov; + else { + iov = riov; + if (unlikely(wiov && wiov->used)) { + vringh_bad("Readable desc %p after writable", + &descs[i]); + err = -EINVAL; + goto fail; + } + } + + if (!iov) { + vringh_bad("Unexpected %s desc", + !wiov ? "writable" : "readable"); + err = -EPROTO; + goto fail; + } + + again: + /* Make sure it's OK, and get offset. */ + len = vringh32_to_cpu(vrh, desc.len); + if (!rcheck(vrh, vringh64_to_cpu(vrh, desc.addr), &len, &range, + getrange)) { + err = -EINVAL; + goto fail; + } + addr = (void *)(unsigned long)(vringh64_to_cpu(vrh, desc.addr) + + range.offset); + + if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { + err = resize_iovec(iov, gfp); + if (err) + goto fail; + } + + iov->iov[iov->used].iov_base = addr; + iov->iov[iov->used].iov_len = len; + iov->used++; + + if (unlikely(len != vringh32_to_cpu(vrh, desc.len))) { + desc.len = cpu_to_vringh32(vrh, + vringh32_to_cpu(vrh, desc.len) - len); + desc.addr = cpu_to_vringh64(vrh, + vringh64_to_cpu(vrh, desc.addr) + len); + goto again; + } + + if (desc.flags & cpu_to_vringh16(vrh, VRING_DESC_F_NEXT)) { + i = vringh16_to_cpu(vrh, desc.next); + } else { + /* Just in case we need to finish traversing above. 
*/ + if (unlikely(up_next > 0)) { + i = return_from_indirect(vrh, &up_next, + &descs, &desc_max); + slow = false; + indirect_count = 0; + } else + break; + } + + if (i >= desc_max) { + vringh_bad("Chained index %u > %u", i, desc_max); + err = -EINVAL; + goto fail; + } + } + + return 0; + +fail: + return err; +} + +static inline int __vringh_complete(struct vringh *vrh, + const struct vring_used_elem *used, + unsigned int num_used, + int (*putu16)(const struct vringh *vrh, + __virtio16 *p, u16 val), + int (*putused)(struct vring_used_elem *dst, + const struct vring_used_elem + *src, unsigned num)) +{ + struct vring_used *used_ring; + int err; + u16 used_idx, off; + + used_ring = vrh->vring.used; + used_idx = vrh->last_used_idx + vrh->completed; + + off = used_idx % vrh->vring.num; + + /* Compiler knows num_used == 1 sometimes, hence extra check */ + if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { + u16 part = vrh->vring.num - off; + err = putused(&used_ring->ring[off], used, part); + if (!err) + err = putused(&used_ring->ring[0], used + part, + num_used - part); + } else + err = putused(&used_ring->ring[off], used, num_used); + + if (err) { + vringh_bad("Failed to write %u used entries %u at %p", + num_used, off, &used_ring->ring[off]); + return err; + } + + /* Make sure buffer is written before we update index. */ + virtio_wmb(vrh->weak_barriers); + + err = putu16(vrh, &vrh->vring.used->idx, used_idx + num_used); + if (err) { + vringh_bad("Failed to update used index at %p", + &vrh->vring.used->idx); + return err; + } + + vrh->completed += num_used; + return 0; +} + + +static inline int __vringh_need_notify(struct vringh *vrh, + int (*getu16)(const struct vringh *vrh, + u16 *val, + const __virtio16 *p)) +{ + bool notify; + u16 used_event; + int err; + + /* Flush out used index update. This is paired with the + * barrier that the Guest executes when enabling + * interrupts. 
*/ + virtio_mb(vrh->weak_barriers); + + /* Old-style, without event indices. */ + if (!vrh->event_indices) { + u16 flags; + err = getu16(vrh, &flags, &vrh->vring.avail->flags); + if (err) { + vringh_bad("Failed to get flags at %p", + &vrh->vring.avail->flags); + return err; + } + return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); + } + + /* Modern: we know when other side wants to know. */ + err = getu16(vrh, &used_event, &vring_used_event(&vrh->vring)); + if (err) { + vringh_bad("Failed to get used event idx at %p", + &vring_used_event(&vrh->vring)); + return err; + } + + /* Just in case we added so many that we wrap. */ + if (unlikely(vrh->completed > 0xffff)) + notify = true; + else + notify = vring_need_event(used_event, + vrh->last_used_idx + vrh->completed, + vrh->last_used_idx); + + vrh->last_used_idx += vrh->completed; + vrh->completed = 0; + return notify; +} + +static inline bool __vringh_notify_enable(struct vringh *vrh, + int (*getu16)(const struct vringh *vrh, + u16 *val, const __virtio16 *p), + int (*putu16)(const struct vringh *vrh, + __virtio16 *p, u16 val)) +{ + u16 avail; + + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(vrh, &vrh->vring.used->flags, 0) != 0) { + vringh_bad("Clearing used flags %p", + &vrh->vring.used->flags); + return true; + } + } else { + if (putu16(vrh, &vring_avail_event(&vrh->vring), + vrh->last_avail_idx) != 0) { + vringh_bad("Updating avail event index %p", + &vring_avail_event(&vrh->vring)); + return true; + } + } + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + virtio_mb(vrh->weak_barriers); + + if (getu16(vrh, &avail, &vrh->vring.avail->idx) != 0) { + vringh_bad("Failed to check avail idx at %p", + &vrh->vring.avail->idx); + return true; + } + + /* This is unlikely, so we just leave notifications enabled + * (if we're using event_indices, we'll only get one + * notification anyway). 
*/ + return avail == vrh->last_avail_idx; +} + +static inline void __vringh_notify_disable(struct vringh *vrh, + int (*putu16)(const struct vringh *vrh, + __virtio16 *p, u16 val)) +{ + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(vrh, &vrh->vring.used->flags, + VRING_USED_F_NO_NOTIFY)) { + vringh_bad("Setting used flags %p", + &vrh->vring.used->flags); + } + } +} + +/* Userspace access helpers: in this case, addresses are really userspace. */ +static inline int getu16_user(const struct vringh *vrh, u16 *val, const __virtio16 *p) +{ + __virtio16 v = 0; + int rc = get_user(v, (__force __virtio16 __user *)p); + *val = vringh16_to_cpu(vrh, v); + return rc; +} + +static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val) +{ + __virtio16 v = cpu_to_vringh16(vrh, val); + return put_user(v, (__force __virtio16 __user *)p); +} + +static inline int copydesc_user(void *dst, const void *src, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int putused_user(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + return copy_to_user((__force void __user *)dst, src, + sizeof(*dst) * num) ? -EFAULT : 0; +} + +static inline int xfer_from_user(void *src, void *dst, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int xfer_to_user(void *dst, void *src, size_t len) +{ + return copy_to_user((__force void __user *)dst, src, len) ? + -EFAULT : 0; +} + +/** + * vringh_init_user - initialize a vringh for a userspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements (non-zero power of 2, at most 0xffff). + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userspace descriptor pointer. + * @avail: the userspace avail pointer. + * @used: the userspace used pointer.
+ * + * Returns an error if num is invalid: you should check pointers + * yourself! + */ +int vringh_init_user(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->little_endian = (features & (1ULL << VIRTIO_F_VERSION_1)); + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + /* vring expects kernel addresses, but only used via accessors. */ + vrh->vring.desc = (__force struct vring_desc *)desc; + vrh->vring.avail = (__force struct vring_avail *)avail; + vrh->vring.used = (__force struct vring_used *)used; + return 0; +} +EXPORT_SYMBOL(vringh_init_user); + +/** + * vringh_getdesc_user - get next available descriptor from userspace ring. + * @vrh: the userspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @getrange: function to call to check ranges. + * @head: head index we received, for passing to vringh_complete_user(). + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! 
+ */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head) +{ + int err; + + *head = vrh->vring.num; + err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + /* We need the layouts to be the identical for this to work */ + BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != + offsetof(struct vringh_iov, iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != + offsetof(struct vringh_iov, i)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != + offsetof(struct vringh_iov, used)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != + offsetof(struct vringh_iov, max_num)); + BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); + BUILD_BUG_ON(offsetof(struct iovec, iov_base) != + offsetof(struct kvec, iov_base)); + BUILD_BUG_ON(offsetof(struct iovec, iov_len) != + offsetof(struct kvec, iov_len)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) + != sizeof(((struct kvec *)NULL)->iov_base)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) + != sizeof(((struct kvec *)NULL)->iov_len)); + + *head = err; + err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, + (struct vringh_kiov *)wiov, + range_check, getrange, GFP_KERNEL, copydesc_user); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_user); + +/** + * vringh_iov_pull_user - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. 
+ */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)riov, + dst, len, xfer_from_user); +} +EXPORT_SYMBOL(vringh_iov_pull_user); + +/** + * vringh_iov_push_user - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)wiov, + (void *)src, len, xfer_to_user); +} +EXPORT_SYMBOL(vringh_iov_push_user); + +/** + * vringh_abandon_user - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_user() to undo). + * + * The next vringh_get_user() will return the old descriptor(s) again. + */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_user); + +/** + * vringh_complete_user - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_user. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = cpu_to_vringh32(vrh, head); + used.len = cpu_to_vringh32(vrh, len); + return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_user); + +/** + * vringh_complete_multi_user - we've finished with many descriptors. + * @vrh: the vring. + * @used: the head, length pairs. + * @num_used: the number of used elements. 
+ * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used) +{ + return __vringh_complete(vrh, used, num_used, + putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_multi_user); + +/** + * vringh_notify_enable_user - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_user(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_user, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_enable_user); + +/** + * vringh_notify_disable_user - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_user(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_disable_user); + +/** + * vringh_need_notify_user - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_user() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_user(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_user); +} +EXPORT_SYMBOL(vringh_need_notify_user); + +/* Kernelspace access helpers. 
*/ +static inline int getu16_kern(const struct vringh *vrh, + u16 *val, const __virtio16 *p) +{ + *val = vringh16_to_cpu(vrh, READ_ONCE(*p)); + return 0; +} + +static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val) +{ + WRITE_ONCE(*p, cpu_to_vringh16(vrh, val)); + return 0; +} + +static inline int copydesc_kern(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +static inline int putused_kern(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + memcpy(dst, src, num * sizeof(*dst)); + return 0; +} + +static inline int xfer_kern(void *src, void *dst, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +/** + * vringh_init_kern - initialize a vringh for a kernelspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_kern(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->little_endian = (features & (1ULL << VIRTIO_F_VERSION_1)); + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + vrh->vring.desc = desc; + vrh->vring.avail = avail; + vrh->vring.used = used; + return 0; +} +EXPORT_SYMBOL(vringh_init_kern); + +/** + * vringh_getdesc_kern - get next available descriptor from kernelspace ring. + * @vrh: the kernelspace vring. 
+ * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_kern(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_kern); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_kern); + +/** + * vringh_iov_pull_kern - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer(riov, dst, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_pull_kern); + +/** + * vringh_iov_push_kern - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. 
+ */ +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_push_kern); + +/** + * vringh_abandon_kern - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_kern() to undo). + * + * The next vringh_get_kern() will return the old descriptor(s) again. + */ +void vringh_abandon_kern(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_kern); + +/** + * vringh_complete_kern - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_kern. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_kern() after one or more calls + * to this function. + */ +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = cpu_to_vringh32(vrh, head); + used.len = cpu_to_vringh32(vrh, len); + + return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); +} +EXPORT_SYMBOL(vringh_complete_kern); + +/** + * vringh_notify_enable_kern - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_kern(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_enable_kern); + +/** + * vringh_notify_disable_kern - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. 
+ */
+void vringh_notify_disable_kern(struct vringh *vrh)
+{
+	/* Kernelspace ring: the flag store goes through putu16_kern(). */
+	__vringh_notify_disable(vrh, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_disable_kern);
+
+/**
+ * vringh_need_notify_kern - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_kern() on.
+ *
+ * Thin wrapper: the decision is made by __vringh_need_notify(), reading
+ * the ring through getu16_kern().
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_kern(struct vringh *vrh)
+{
+	return __vringh_need_notify(vrh, getu16_kern);
+}
+EXPORT_SYMBOL(vringh_need_notify_kern);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
new file mode 100644
index 000000000..9f67717ea
--- /dev/null
+++ b/drivers/vhost/vsock.c
@@ -0,0 +1,870 @@
+/*
+ * vhost transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ *         Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/miscdevice.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <net/sock.h>
+#include <linux/virtio_vsock.h>
+#include <linux/vhost.h>
+#include <linux/hashtable.h>
+
+#include <net/af_vsock.h>
+#include "vhost.h"
+
+#define VHOST_VSOCK_DEFAULT_HOST_CID	2	/* CID the host side reports as its own */
+/* Max number of bytes transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others. */
+#define VHOST_VSOCK_WEIGHT 0x80000
+/* Max number of packets transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others with
+ * small pkts.
+ */
+#define VHOST_VSOCK_PKT_WEIGHT 256
+
+enum {
+	VHOST_VSOCK_FEATURES = VHOST_FEATURES,	/* feature bits this device offers and accepts */
+};
+
+/* Used to track all the vhost_vsock instances on the system.
*/ +static DEFINE_SPINLOCK(vhost_vsock_lock); +static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8); + +struct vhost_vsock { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; + + /* Link to global vhost_vsock_hash, writes use vhost_vsock_lock */ + struct hlist_node hash; + + struct vhost_work send_pkt_work; + spinlock_t send_pkt_list_lock; + struct list_head send_pkt_list; /* host->guest pending packets */ + + atomic_t queued_replies; + + u32 guest_cid; +}; + +static u32 vhost_transport_get_local_cid(void) +{ + return VHOST_VSOCK_DEFAULT_HOST_CID; +} + +/* Callers that dereference the return value must hold vhost_vsock_lock or the + * RCU read lock. + */ +static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) +{ + struct vhost_vsock *vsock; + + hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) { + u32 other_cid = vsock->guest_cid; + + /* Skip instances that have no CID yet */ + if (other_cid == 0) + continue; + + if (other_cid == guest_cid) + return vsock; + + } + + return NULL; +} + +static void +vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + struct vhost_virtqueue *vq) +{ + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + int pkts = 0, total_len = 0; + bool added = false; + bool restart_tx = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + /* Avoid further vmexits, we're already processing the virtqueue */ + vhost_disable_notify(&vsock->dev, vq); + + do { + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + unsigned out, in; + size_t nbytes; + size_t iov_len, payload_len; + int head; + + spin_lock_bh(&vsock->send_pkt_list_lock); + if (list_empty(&vsock->send_pkt_list)) { + spin_unlock_bh(&vsock->send_pkt_list_lock); + vhost_enable_notify(&vsock->dev, vq); + break; + } + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + head = vhost_get_vq_desc(vq, vq->iov, 
ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + break; + } + + if (head == vq->num) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + /* We cannot finish yet if more buffers snuck in while + * re-enabling notify. + */ + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + if (out) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Expected 0 output buffers, got %u\n", out); + break; + } + + iov_len = iov_length(&vq->iov[out], in); + if (iov_len < sizeof(pkt->hdr)) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Buffer len [%zu] too small\n", iov_len); + break; + } + + iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len); + payload_len = pkt->len - pkt->off; + + /* If the packet is greater than the space available in the + * buffer, we split it using multiple buffers. + */ + if (payload_len > iov_len - sizeof(pkt->hdr)) + payload_len = iov_len - sizeof(pkt->hdr); + + /* Set the correct length in the header */ + pkt->hdr.len = cpu_to_le32(payload_len); + + nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt hdr\n"); + break; + } + + nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len, + &iov_iter); + if (nbytes != payload_len) { + virtio_transport_free_pkt(pkt); + vq_err(vq, "Faulted on copying pkt buf\n"); + break; + } + + /* Deliver to monitoring devices all packets that we + * will transmit. 
+ */ + virtio_transport_deliver_tap_pkt(pkt); + + vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len); + added = true; + + pkt->off += payload_len; + total_len += payload_len; + + /* If we didn't send all the payload we can requeue the packet + * to send it with the next available buffer. + */ + if (pkt->off < pkt->len) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + } else { + if (pkt->reply) { + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we have resources to resume tx + * processing? + */ + if (val + 1 == tx_vq->num) + restart_tx = true; + } + + virtio_transport_free_pkt(pkt); + } + } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); + + if (restart_tx) + vhost_poll_queue(&tx_vq->poll); +} + +static void vhost_transport_send_pkt_work(struct vhost_work *work) +{ + struct vhost_virtqueue *vq; + struct vhost_vsock *vsock; + + vsock = container_of(work, struct vhost_vsock, send_pkt_work); + vq = &vsock->vqs[VSOCK_VQ_RX]; + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int +vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct vhost_vsock *vsock; + int len = pkt->len; + + rcu_read_lock(); + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid)); + if (!vsock) { + rcu_read_unlock(); + virtio_transport_free_pkt(pkt); + return -ENODEV; + } + + if (pkt->reply) + atomic_inc(&vsock->queued_replies); + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add_tail(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + + rcu_read_unlock(); + return len; +} + +static int +vhost_transport_cancel_pkt(struct vsock_sock *vsk) +{ + struct vhost_vsock *vsock; + struct virtio_vsock_pkt *pkt, *n; + int 
cnt = 0; + int ret = -ENODEV; + LIST_HEAD(freeme); + + rcu_read_lock(); + + /* Find the vhost_vsock according to guest context id */ + vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); + if (!vsock) + goto out; + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { + if (pkt->vsk != vsk) + continue; + list_move(&pkt->list, &freeme); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + list_for_each_entry_safe(pkt, n, &freeme, list) { + if (pkt->reply) + cnt++; + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + + if (cnt) { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; + int new_cnt; + + new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); + if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) + vhost_poll_queue(&tx_vq->poll); + } + + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +static struct virtio_vsock_pkt * +vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, + unsigned int out, unsigned int in) +{ + struct virtio_vsock_pkt *pkt; + struct iov_iter iov_iter; + size_t nbytes; + size_t len; + + if (in != 0) { + vq_err(vq, "Expected 0 input buffers, got %u\n", in); + return NULL; + } + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + len = iov_length(vq->iov, out); + iov_iter_init(&iov_iter, WRITE, vq->iov, out, len); + + nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); + if (nbytes != sizeof(pkt->hdr)) { + vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", + sizeof(pkt->hdr), nbytes); + kfree(pkt); + return NULL; + } + + if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) + pkt->len = le32_to_cpu(pkt->hdr.len); + + /* No payload */ + if (!pkt->len) + return pkt; + + /* The pkt is too big */ + if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { + kfree(pkt); + return NULL; + } + + pkt->buf = kmalloc(pkt->len, GFP_KERNEL); + if (!pkt->buf) { + kfree(pkt); + return NULL; + } + + nbytes = 
copy_from_iter(pkt->buf, pkt->len, &iov_iter); + if (nbytes != pkt->len) { + vq_err(vq, "Expected %u byte payload, got %zu bytes\n", + pkt->len, nbytes); + virtio_transport_free_pkt(pkt); + return NULL; + } + + return pkt; +} + +/* Is there space left for replies to rx packets? */ +static bool vhost_vsock_more_replies(struct vhost_vsock *vsock) +{ + struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX]; + int val; + + smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ + val = atomic_read(&vsock->queued_replies); + + return val < vq->num; +} + +static struct virtio_transport vhost_transport = { + .transport = { + .get_local_cid = vhost_transport_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = virtio_transport_shutdown, + .cancel_pkt = vhost_transport_cancel_pkt, + + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_bind = virtio_transport_dgram_bind, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = 
virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + + .set_buffer_size = virtio_transport_set_buffer_size, + .set_min_buffer_size = virtio_transport_set_min_buffer_size, + .set_max_buffer_size = virtio_transport_set_max_buffer_size, + .get_buffer_size = virtio_transport_get_buffer_size, + .get_min_buffer_size = virtio_transport_get_min_buffer_size, + .get_max_buffer_size = virtio_transport_get_max_buffer_size, + }, + + .send_pkt = vhost_transport_send_pkt, +}; + +static void vhost_vsock_handle_tx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + struct virtio_vsock_pkt *pkt; + int head, pkts = 0, total_len = 0; + unsigned int out, in; + bool added = false; + + mutex_lock(&vq->mutex); + + if (!vq->private_data) + goto out; + + vhost_disable_notify(&vsock->dev, vq); + do { + u32 len; + + if (!vhost_vsock_more_replies(vsock)) { + /* Stop tx until the device processes already + * pending replies. Leave tx virtqueue + * callbacks disabled. 
+ */ + goto no_more_replies; + } + + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) + break; + + if (head == vq->num) { + if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { + vhost_disable_notify(&vsock->dev, vq); + continue; + } + break; + } + + pkt = vhost_vsock_alloc_pkt(vq, out, in); + if (!pkt) { + vq_err(vq, "Faulted on pkt\n"); + continue; + } + + len = pkt->len; + + /* Deliver to monitoring devices all received packets */ + virtio_transport_deliver_tap_pkt(pkt); + + /* Only accept correctly addressed packets */ + if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid && + le64_to_cpu(pkt->hdr.dst_cid) == + vhost_transport_get_local_cid()) + virtio_transport_recv_pkt(&vhost_transport, pkt); + else + virtio_transport_free_pkt(pkt); + + len += sizeof(pkt->hdr); + vhost_add_used(vq, head, 0); + total_len += len; + added = true; + } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); + +no_more_replies: + if (added) + vhost_signal(&vsock->dev, vq); + +out: + mutex_unlock(&vq->mutex); +} + +static void vhost_vsock_handle_rx_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); + + vhost_transport_do_send_pkt(vsock, vq); +} + +static int vhost_vsock_start(struct vhost_vsock *vsock) +{ + struct vhost_virtqueue *vq; + size_t i; + int ret; + + mutex_lock(&vsock->dev.mutex); + + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + + if (!vhost_vq_access_ok(vq)) { + ret = -EFAULT; + goto err_vq; + } + + if (!vq->private_data) { + vq->private_data = vsock; + ret = vhost_vq_init_access(vq); + if (ret) + goto err_vq; + } + + mutex_unlock(&vq->mutex); + } + + /* Some packets may have been queued before the device was started, + * let's kick the send 
worker to send them. + */ + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + + mutex_unlock(&vsock->dev.mutex); + return 0; + +err_vq: + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner) +{ + size_t i; + int ret = 0; + + mutex_lock(&vsock->dev.mutex); + + if (check_owner) { + ret = vhost_dev_check_owner(&vsock->dev); + if (ret) + goto err; + } + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + struct vhost_virtqueue *vq = &vsock->vqs[i]; + + mutex_lock(&vq->mutex); + vq->private_data = NULL; + mutex_unlock(&vq->mutex); + } + +err: + mutex_unlock(&vsock->dev.mutex); + return ret; +} + +static void vhost_vsock_free(struct vhost_vsock *vsock) +{ + kvfree(vsock); +} + +static int vhost_vsock_dev_open(struct inode *inode, struct file *file) +{ + struct vhost_virtqueue **vqs; + struct vhost_vsock *vsock; + int ret; + + /* This struct is large and allocation could fail, fall back to vmalloc + * if there is no other way. 
+ */ + vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!vsock) + return -ENOMEM; + + vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + ret = -ENOMEM; + goto out; + } + + vsock->guest_cid = 0; /* no CID assigned yet */ + + atomic_set(&vsock->queued_replies, 0); + + vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX]; + vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX]; + vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; + vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; + + vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), + UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, + VHOST_VSOCK_WEIGHT); + + file->private_data = vsock; + spin_lock_init(&vsock->send_pkt_list_lock); + INIT_LIST_HEAD(&vsock->send_pkt_list); + vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); + return 0; + +out: + vhost_vsock_free(vsock); + return ret; +} + +static void vhost_vsock_flush(struct vhost_vsock *vsock) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) + if (vsock->vqs[i].handle_kick) + vhost_poll_flush(&vsock->vqs[i].poll); + vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); +} + +static void vhost_vsock_reset_orphans(struct sock *sk) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + /* vmci_transport.c doesn't take sk_lock here either. At least we're + * under vsock_table_lock so the sock cannot disappear while we're + * executing. + */ + + /* If the peer is still valid, no need to reset connection */ + if (vhost_vsock_get(vsk->remote_addr.svm_cid)) + return; + + /* If the close timeout is pending, let it expire. This avoids races + * with the timeout callback. 
+ */ + if (vsk->close_work_scheduled) + return; + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); +} + +static int vhost_vsock_dev_release(struct inode *inode, struct file *file) +{ + struct vhost_vsock *vsock = file->private_data; + + spin_lock_bh(&vhost_vsock_lock); + if (vsock->guest_cid) + hash_del_rcu(&vsock->hash); + spin_unlock_bh(&vhost_vsock_lock); + + /* Wait for other CPUs to finish using vsock */ + synchronize_rcu(); + + /* Iterating over all connections for all CIDs to find orphans is + * inefficient. Room for improvement here. */ + vsock_for_each_connected_socket(vhost_vsock_reset_orphans); + + /* Don't check the owner, because we are in the release path, so we + * need to stop the vsock device in any case. + * vhost_vsock_stop() can not fail in this case, so we don't need to + * check the return code. + */ + vhost_vsock_stop(vsock, false); + vhost_vsock_flush(vsock); + vhost_dev_stop(&vsock->dev); + + spin_lock_bh(&vsock->send_pkt_list_lock); + while (!list_empty(&vsock->send_pkt_list)) { + struct virtio_vsock_pkt *pkt; + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + vhost_dev_cleanup(&vsock->dev); + kfree(vsock->dev.vqs); + vhost_vsock_free(vsock); + return 0; +} + +static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) +{ + struct vhost_vsock *other; + + /* Refuse reserved CIDs */ + if (guest_cid <= VMADDR_CID_HOST || + guest_cid == U32_MAX) + return -EINVAL; + + /* 64-bit CIDs are not yet supported */ + if (guest_cid > U32_MAX) + return -EINVAL; + + /* Refuse if CID is already in use */ + spin_lock_bh(&vhost_vsock_lock); + other = vhost_vsock_get(guest_cid); + if (other && other != vsock) { + spin_unlock_bh(&vhost_vsock_lock); + return -EADDRINUSE; + } + + if 
(vsock->guest_cid) + hash_del_rcu(&vsock->hash); + + vsock->guest_cid = guest_cid; + hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid); + spin_unlock_bh(&vhost_vsock_lock); + + return 0; +} + +static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + if (features & ~VHOST_VSOCK_FEATURES) + return -EOPNOTSUPP; + + mutex_lock(&vsock->dev.mutex); + if ((features & (1 << VHOST_F_LOG_ALL)) && + !vhost_log_access_ok(&vsock->dev)) { + mutex_unlock(&vsock->dev.mutex); + return -EFAULT; + } + + for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { + vq = &vsock->vqs[i]; + mutex_lock(&vq->mutex); + vq->acked_features = features; + mutex_unlock(&vq->mutex); + } + mutex_unlock(&vsock->dev.mutex); + return 0; +} + +static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_vsock *vsock = f->private_data; + void __user *argp = (void __user *)arg; + u64 guest_cid; + u64 features; + int start; + int r; + + switch (ioctl) { + case VHOST_VSOCK_SET_GUEST_CID: + if (copy_from_user(&guest_cid, argp, sizeof(guest_cid))) + return -EFAULT; + return vhost_vsock_set_cid(vsock, guest_cid); + case VHOST_VSOCK_SET_RUNNING: + if (copy_from_user(&start, argp, sizeof(start))) + return -EFAULT; + if (start) + return vhost_vsock_start(vsock); + else + return vhost_vsock_stop(vsock, true); + case VHOST_GET_FEATURES: + features = VHOST_VSOCK_FEATURES; + if (copy_to_user(argp, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES: + if (copy_from_user(&features, argp, sizeof(features))) + return -EFAULT; + return vhost_vsock_set_features(vsock, features); + default: + mutex_lock(&vsock->dev.mutex); + r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); + else + vhost_vsock_flush(vsock); + mutex_unlock(&vsock->dev.mutex); + return r; + } +} + +#ifdef CONFIG_COMPAT +static long 
vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations vhost_vsock_fops = { + .owner = THIS_MODULE, + .open = vhost_vsock_dev_open, + .release = vhost_vsock_dev_release, + .llseek = noop_llseek, + .unlocked_ioctl = vhost_vsock_dev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_vsock_dev_compat_ioctl, +#endif +}; + +static struct miscdevice vhost_vsock_misc = { + .minor = VHOST_VSOCK_MINOR, + .name = "vhost-vsock", + .fops = &vhost_vsock_fops, +}; + +static int __init vhost_vsock_init(void) +{ + int ret; + + ret = vsock_core_init(&vhost_transport.transport); + if (ret < 0) + return ret; + return misc_register(&vhost_vsock_misc); +}; + +static void __exit vhost_vsock_exit(void) +{ + misc_deregister(&vhost_vsock_misc); + vsock_core_exit(); +}; + +module_init(vhost_vsock_init); +module_exit(vhost_vsock_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("vhost transport for vsock "); +MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR); +MODULE_ALIAS("devname:vhost-vsock"); |