Diffstat (limited to 'src/spdk/module/sock/uring/uring.c')
-rw-r--r-- | src/spdk/module/sock/uring/uring.c | 1328
1 file changed, 1328 insertions, 0 deletions
diff --git a/src/spdk/module/sock/uring/uring.c b/src/spdk/module/sock/uring/uring.c
new file mode 100644
index 000000000..3066f2d16
--- /dev/null
+++ b/src/spdk/module/sock/uring/uring.c
@@ -0,0 +1,1328 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/config.h"
+
+#include <sys/epoll.h>
+#include <liburing.h>
+
+#include "spdk/barrier.h"
+#include "spdk/likely.h"
+#include "spdk/log.h"
+#include "spdk/pipe.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/sock.h"
+#include "spdk_internal/assert.h"
+
+#define MAX_TMPBUF 1024
+#define PORTNUMLEN 32
+#define SO_RCVBUF_SIZE (2 * 1024 * 1024)
+#define SO_SNDBUF_SIZE (2 * 1024 * 1024)
+#define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096
+#define IOV_BATCH_SIZE 64
+
+enum spdk_sock_task_type {
+	SPDK_SOCK_TASK_POLLIN = 0,
+	SPDK_SOCK_TASK_WRITE,
+	SPDK_SOCK_TASK_CANCEL,
+};
+
+enum spdk_uring_sock_task_status {
+	SPDK_URING_SOCK_TASK_NOT_IN_USE = 0,
+	SPDK_URING_SOCK_TASK_IN_PROCESS,
+};
+
+struct spdk_uring_task {
+	enum spdk_uring_sock_task_status status;
+	enum spdk_sock_task_type type;
+	struct spdk_uring_sock *sock;
+	struct msghdr msg;
+	struct iovec iovs[IOV_BATCH_SIZE];
+	int iov_cnt;
+	struct spdk_sock_request *last_req;
+	STAILQ_ENTRY(spdk_uring_task) link;
+};
+
+struct spdk_uring_sock {
+	struct spdk_sock base;
+	int fd;
+	struct spdk_uring_sock_group_impl *group;
+	struct spdk_uring_task write_task;
+	struct spdk_uring_task pollin_task;
+	struct spdk_uring_task cancel_task;
+	struct spdk_pipe *recv_pipe;
+	void *recv_buf;
+	int recv_buf_sz;
+	bool pending_recv;
+	int connection_status;
+	TAILQ_ENTRY(spdk_uring_sock) link;
+};
+
+struct spdk_uring_sock_group_impl {
+	struct spdk_sock_group_impl base;
+	struct io_uring uring;
+	uint32_t io_inflight;
+	uint32_t io_queued;
+	uint32_t io_avail;
+	TAILQ_HEAD(, spdk_uring_sock) pending_recv;
+};
+
+#define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request)))
+
+static int
+get_addr_str(struct sockaddr *sa, char *host, size_t hlen)
+{
+	const char *result = NULL;
+
+	if (sa == NULL || host == NULL) {
+		return -1;
+	}
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr),
+				   host, hlen);
+		break;
+	case AF_INET6:
+		result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr),
+				   host, hlen);
+		break;
+	default:
+		break;
+	}
+
+	if (result != NULL) {
+		return 0;
+	} else {
+		return -1;
+	}
+}
+
+#define __uring_sock(sock) (struct spdk_uring_sock *)sock
+#define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group
+
+static int
+uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
+		   char *caddr, int clen, uint16_t *cport)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct sockaddr_storage sa;
+	socklen_t salen;
+	int rc;
+
+	assert(sock != NULL);
+
+	memset(&sa, 0, sizeof sa);
+	salen = sizeof sa;
+	rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+	if (rc != 0) {
+		SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+		return -1;
+	}
+
+	switch (sa.ss_family) {
+	case AF_UNIX:
+		/* Acceptable connection types that don't have IPs */
+		return 0;
+	case AF_INET:
+	case AF_INET6:
+		/* Code below will get IP addresses */
+		break;
+	default:
+		/* Unsupported socket family */
+		return -1;
+	}
+
+	rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
+	if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+		return -1;
+	}
+
+	if (sport) {
+		if (sa.ss_family == AF_INET) {
+			*sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+		} else if (sa.ss_family == AF_INET6) {
+			*sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+		}
+	}
+
+	memset(&sa, 0, sizeof sa);
+	salen = sizeof sa;
+	rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
+	if (rc != 0) {
+		SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
+		return -1;
+	}
+
+	rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
+	if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+		return -1;
+	}
+
+	if (cport) {
+		if (sa.ss_family == AF_INET) {
+			*cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+		} else if (sa.ss_family == AF_INET6) {
+			*cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+		}
+	}
+
+	return 0;
+}
+
+enum uring_sock_create_type {
+	SPDK_SOCK_CREATE_LISTEN,
+	SPDK_SOCK_CREATE_CONNECT,
+};
+
+static int
+uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz)
+{
+	uint8_t *new_buf;
+	struct spdk_pipe *new_pipe;
+	struct iovec siov[2];
+	struct iovec diov[2];
+	int sbytes;
+	ssize_t bytes;
+
+	if (sock->recv_buf_sz == sz) {
+		return 0;
+	}
+
+	/* If the new size is 0, just free the pipe */
+	if (sz == 0) {
+		spdk_pipe_destroy(sock->recv_pipe);
+		free(sock->recv_buf);
+		sock->recv_pipe = NULL;
+		sock->recv_buf = NULL;
+		return 0;
+	} else if (sz < MIN_SOCK_PIPE_SIZE) {
+		SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE);
+		return -1;
+	}
+
+	/* Round up to next 64 byte multiple */
+	new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t));
+	if (!new_buf) {
+		SPDK_ERRLOG("socket recv buf allocation failed\n");
+		return -ENOMEM;
+	}
+
+	new_pipe = spdk_pipe_create(new_buf, sz + 1);
+	if (new_pipe == NULL) {
+		SPDK_ERRLOG("socket pipe allocation failed\n");
+		free(new_buf);
+		return -ENOMEM;
+	}
+
+	if (sock->recv_pipe != NULL) {
+		/* Pull all of the data out of the old pipe */
+		sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
+		if (sbytes > sz) {
+			/* Too much data to fit into the new pipe size */
+			spdk_pipe_destroy(new_pipe);
+			free(new_buf);
+			return -EINVAL;
+		}
+
+		sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
+		assert(sbytes == sz);
+
+		bytes = spdk_iovcpy(siov, 2, diov, 2);
+		spdk_pipe_writer_advance(new_pipe, bytes);
+
+		spdk_pipe_destroy(sock->recv_pipe);
+		free(sock->recv_buf);
+	}
+
+	sock->recv_buf_sz = sz;
+	sock->recv_buf = new_buf;
+	sock->recv_pipe = new_pipe;
+
+	return 0;
+}
+
+static int
+uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	int rc;
+
+	assert(sock != NULL);
+
+#ifndef __aarch64__
+	/* On ARM systems, this buffering does not help. Skip it. */
+	/* The size of the pipe is purely derived from benchmarks. It seems to work well.
+	 */
+	rc = uring_sock_alloc_pipe(sock, sz);
+	if (rc) {
+		SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock);
+		return rc;
+	}
+#endif
+
+	if (sz < SO_RCVBUF_SIZE) {
+		sz = SO_RCVBUF_SIZE;
+	}
+
+	rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
+	if (rc < 0) {
+		return rc;
+	}
+
+	return 0;
+}
+
+static int
+uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	int rc;
+
+	assert(sock != NULL);
+
+	if (sz < SO_SNDBUF_SIZE) {
+		sz = SO_SNDBUF_SIZE;
+	}
+
+	rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
+	if (rc < 0) {
+		return rc;
+	}
+
+	return 0;
+}
+
+static struct spdk_uring_sock *
+uring_sock_alloc(int fd)
+{
+	struct spdk_uring_sock *sock;
+
+	sock = calloc(1, sizeof(*sock));
+	if (sock == NULL) {
+		SPDK_ERRLOG("sock allocation failed\n");
+		return NULL;
+	}
+
+	sock->fd = fd;
+	return sock;
+}
+
+static struct spdk_sock *
+uring_sock_create(const char *ip, int port,
+		  enum uring_sock_create_type type,
+		  struct spdk_sock_opts *opts)
+{
+	struct spdk_uring_sock *sock;
+	char buf[MAX_TMPBUF];
+	char portnum[PORTNUMLEN];
+	char *p;
+	struct addrinfo hints, *res, *res0;
+	int fd, flag;
+	int val = 1;
+	int rc;
+
+	if (ip == NULL) {
+		return NULL;
+	}
+	if (ip[0] == '[') {
+		snprintf(buf, sizeof(buf), "%s", ip + 1);
+		p = strchr(buf, ']');
+		if (p != NULL) {
+			*p = '\0';
+		}
+		ip = (const char *) &buf[0];
+	}
+
+	snprintf(portnum, sizeof portnum, "%d", port);
+	memset(&hints, 0, sizeof hints);
+	hints.ai_family = PF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_flags = AI_NUMERICSERV;
+	hints.ai_flags |= AI_PASSIVE;
+	hints.ai_flags |= AI_NUMERICHOST;
+	rc = getaddrinfo(ip, portnum, &hints, &res0);
+	if (rc != 0) {
+		SPDK_ERRLOG("getaddrinfo() failed (errno=%d)\n", errno);
+		return NULL;
+	}
+
+	/* try listen */
+	fd = -1;
+	for (res = res0; res != NULL; res = res->ai_next) {
+retry:
+		fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+		if (fd < 0) {
+			/* error */
+			continue;
+		}
+
+		val = SO_RCVBUF_SIZE;
+		rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val);
+		if (rc) {
+			/* Not fatal */
+		}
+
+		val = SO_SNDBUF_SIZE;
+		rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val);
+		if (rc) {
+			/* Not fatal */
+		}
+
+		rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
+		if (rc != 0) {
+			close(fd);
+			/* error */
+			continue;
+		}
+		rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
+		if (rc != 0) {
+			close(fd);
+			/* error */
+			continue;
+		}
+
+#if defined(SO_PRIORITY)
+		if (opts != NULL && opts->priority) {
+			rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
+			if (rc != 0) {
+				close(fd);
+				/* error */
+				continue;
+			}
+		}
+#endif
+		if (res->ai_family == AF_INET6) {
+			rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
+			if (rc != 0) {
+				close(fd);
+				/* error */
+				continue;
+			}
+		}
+
+		if (type == SPDK_SOCK_CREATE_LISTEN) {
+			rc = bind(fd, res->ai_addr, res->ai_addrlen);
+			if (rc != 0) {
+				SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
+				switch (errno) {
+				case EINTR:
+					/* interrupted? */
+					close(fd);
+					goto retry;
+				case EADDRNOTAVAIL:
+					SPDK_ERRLOG("IP address %s not available. "
" + "Verify IP address in config file " + "and make sure setup script is " + "run before starting spdk app.\n", ip); + /* FALLTHROUGH */ + default: + /* try next family */ + close(fd); + fd = -1; + continue; + } + } + /* bind OK */ + rc = listen(fd, 512); + if (rc != 0) { + SPDK_ERRLOG("listen() failed, errno = %d\n", errno); + close(fd); + fd = -1; + break; + } + } else if (type == SPDK_SOCK_CREATE_CONNECT) { + rc = connect(fd, res->ai_addr, res->ai_addrlen); + if (rc != 0) { + SPDK_ERRLOG("connect() failed, errno = %d\n", errno); + /* try next family */ + close(fd); + fd = -1; + continue; + } + } + + flag = fcntl(fd, F_GETFL); + if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); + close(fd); + fd = -1; + break; + } + break; + } + freeaddrinfo(res0); + + if (fd < 0) { + return NULL; + } + + sock = uring_sock_alloc(fd); + if (sock == NULL) { + SPDK_ERRLOG("sock allocation failed\n"); + close(fd); + return NULL; + } + + return &sock->base; +} + +static struct spdk_sock * +uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) +{ + return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts); +} + +static struct spdk_sock * +uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) +{ + return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts); +} + +static struct spdk_sock * +uring_sock_accept(struct spdk_sock *_sock) +{ + struct spdk_uring_sock *sock = __uring_sock(_sock); + struct sockaddr_storage sa; + socklen_t salen; + int rc, fd; + struct spdk_uring_sock *new_sock; + int flag; + + memset(&sa, 0, sizeof(sa)); + salen = sizeof(sa); + + assert(sock != NULL); + + rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); + + if (rc == -1) { + return NULL; + } + + fd = rc; + + flag = fcntl(fd, F_GETFL); + if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); + close(fd); + return NULL; + } + +#if defined(SO_PRIORITY) + /* The priority is not inherited, so call this function again */ + if (sock->base.opts.priority) { + rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); + if (rc != 0) { + close(fd); + return NULL; + } + } +#endif + + new_sock = uring_sock_alloc(fd); + if (new_sock == NULL) { + close(fd); + return NULL; + } + + return &new_sock->base; +} + +static int +uring_sock_close(struct spdk_sock *_sock) +{ + struct spdk_uring_sock *sock = __uring_sock(_sock); + int rc; + + assert(TAILQ_EMPTY(&_sock->pending_reqs)); + assert(sock->group == NULL); + + spdk_pipe_destroy(sock->recv_pipe); + free(sock->recv_buf); + rc = close(sock->fd); + if (rc == 0) { + free(sock); + } + + return rc; +} + +static ssize_t +uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt) +{ + struct iovec siov[2]; + int sbytes; + ssize_t bytes; + struct spdk_uring_sock_group_impl *group; + + sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); + if (sbytes < 0) { + errno = EINVAL; + return -1; + } else if (sbytes == 0) { + errno = EAGAIN; + return -1; + } + + bytes = spdk_iovcpy(siov, 2, diov, diovcnt); + + if (bytes == 0) { + /* The only way this happens is if diov is 0 length */ + errno = EINVAL; + return -1; + } + + spdk_pipe_reader_advance(sock->recv_pipe, bytes); + + /* If we drained the pipe, take it off the level-triggered list */ + if (sock->base.group_impl && 
+	    spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+		group = __uring_group_impl(sock->base.group_impl);
+		TAILQ_REMOVE(&group->pending_recv, sock, link);
+		sock->pending_recv = false;
+	}
+
+	return bytes;
+}
+
+static inline ssize_t
+uring_sock_read(struct spdk_uring_sock *sock)
+{
+	struct iovec iov[2];
+	int bytes;
+	struct spdk_uring_sock_group_impl *group;
+
+	bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
+
+	if (bytes > 0) {
+		bytes = readv(sock->fd, iov, 2);
+		if (bytes > 0) {
+			spdk_pipe_writer_advance(sock->recv_pipe, bytes);
+			/* Guard against inserting a socket that is already on the list */
+			if (sock->base.group_impl && !sock->pending_recv) {
+				group = __uring_group_impl(sock->base.group_impl);
+				TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+				sock->pending_recv = true;
+			}
+		}
+	}
+
+	return bytes;
+}
+
+static ssize_t
+uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	int rc, i;
+	size_t len;
+
+	if (sock->recv_pipe == NULL) {
+		return readv(sock->fd, iov, iovcnt);
+	}
+
+	len = 0;
+	for (i = 0; i < iovcnt; i++) {
+		len += iov[i].iov_len;
+	}
+
+	if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+		/* If the user is receiving a sufficiently large amount of data,
+		 * receive directly to their buffers. */
+		if (len >= MIN_SOCK_PIPE_SIZE) {
+			return readv(sock->fd, iov, iovcnt);
+		}
+
+		/* Otherwise, do a big read into our pipe */
+		rc = uring_sock_read(sock);
+		if (rc <= 0) {
+			return rc;
+		}
+	}
+
+	return uring_sock_recv_from_pipe(sock, iov, iovcnt);
+}
+
+static ssize_t
+uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
+{
+	struct iovec iov[1];
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = len;
+
+	return uring_sock_readv(sock, iov, 1);
+}
+
+static ssize_t
+uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+
+	if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+		errno = EAGAIN;
+		return -1;
+	}
+
+	return writev(sock->fd, iov, iovcnt);
+}
+
+static int
+sock_prep_reqs(struct spdk_sock *_sock, struct iovec *iovs, int index,
+	       struct spdk_sock_request **last_req)
+{
+	int iovcnt, i;
+	struct spdk_sock_request *req;
+	unsigned int offset;
+
+	/* Gather an iov */
+	iovcnt = index;
+	if (spdk_unlikely(iovcnt >= IOV_BATCH_SIZE)) {
+		goto end;
+	}
+
+	if (last_req != NULL && *last_req != NULL) {
+		req = TAILQ_NEXT(*last_req, internal.link);
+	} else {
+		req = TAILQ_FIRST(&_sock->queued_reqs);
+	}
+
+	while (req) {
+		offset = req->internal.offset;
+
+		for (i = 0; i < req->iovcnt; i++) {
+			/* Consume any offset first */
+			if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+				offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+				continue;
+			}
+
+			iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset;
+			iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+			iovcnt++;
+
+			offset = 0;
+
+			if (iovcnt >= IOV_BATCH_SIZE) {
+				break;
+			}
+		}
+		if (iovcnt >= IOV_BATCH_SIZE) {
+			break;
+		}
+
+		if (last_req != NULL) {
+			*last_req = req;
+		}
+		req = TAILQ_NEXT(req, internal.link);
+	}
+
+end:
+	return iovcnt;
+}
+
+static int
+sock_complete_reqs(struct spdk_sock *_sock, ssize_t rc)
+{
+	struct spdk_sock_request *req;
+	int i, retval;
+	unsigned int offset;
+	size_t len;
+
+	/* Consume the requests that were actually written */
+	req = TAILQ_FIRST(&_sock->queued_reqs);
+	while (req) {
+		offset = req->internal.offset;
+
+		for (i = 0; i < req->iovcnt; i++) {
+			/* Advance by the offset first */
+			if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+				offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+				continue;
+			}
+
+			/* Calculate the remaining length of this element */
+			len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+
+			if (len > (size_t)rc) {
+				/* This element was partially sent. */
+				req->internal.offset += rc;
+				return 0;
+			}
+
+			offset = 0;
+			req->internal.offset += len;
+			rc -= len;
+		}
+
+		/* Handled a full request. */
+		spdk_sock_request_pend(_sock, req);
+
+		retval = spdk_sock_request_put(_sock, req, 0);
+		if (retval) {
+			return retval;
+		}
+
+		if (rc == 0) {
+			break;
+		}
+
+		req = TAILQ_FIRST(&_sock->queued_reqs);
+	}
+
+	return 0;
+}
+
+static void
+_sock_flush(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct spdk_uring_task *task = &sock->write_task;
+	uint32_t iovcnt;
+	struct io_uring_sqe *sqe;
+
+	if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
+		return;
+	}
+
+	iovcnt = sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req);
+	if (!iovcnt) {
+		return;
+	}
+
+	task->iov_cnt = iovcnt;
+	assert(sock->group != NULL);
+	task->msg.msg_iov = task->iovs;
+	task->msg.msg_iovlen = task->iov_cnt;
+
+	sock->group->io_queued++;
+
+	sqe = io_uring_get_sqe(&sock->group->uring);
+	io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, 0);
+	io_uring_sqe_set_data(sqe, task);
+	task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
+
+static void
+_sock_prep_pollin(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct spdk_uring_task *task = &sock->pollin_task;
+	struct io_uring_sqe *sqe;
+
+	/* Do not prepare pollin event */
+	if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS || sock->pending_recv) {
+		return;
+	}
+
+	assert(sock->group != NULL);
+	sock->group->io_queued++;
+
+	sqe = io_uring_get_sqe(&sock->group->uring);
+	io_uring_prep_poll_add(sqe, sock->fd, POLLIN);
+	io_uring_sqe_set_data(sqe, task);
+	task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
+
+static void
+_sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct spdk_uring_task *task = &sock->cancel_task;
+	struct io_uring_sqe *sqe;
+
+	if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
+		return;
+	}
+
+	assert(sock->group != NULL);
+	sock->group->io_queued++;
+
+	sqe = io_uring_get_sqe(&sock->group->uring);
+	io_uring_prep_cancel(sqe, user_data, 0);
+	io_uring_sqe_set_data(sqe, task);
+	task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
+
+static int
+sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events,
+		      struct spdk_sock **socks)
+{
+	int i, count, ret;
+	struct io_uring_cqe *cqe;
+	struct spdk_uring_sock *sock, *tmp;
+	struct spdk_uring_task *task;
+	int status;
+
+	for (i = 0; i < max; i++) {
+		ret = io_uring_peek_cqe(&group->uring, &cqe);
+		if (ret != 0) {
+			break;
+		}
+
+		if (cqe == NULL) {
+			break;
+		}
+
+		task = (struct spdk_uring_task *)cqe->user_data;
+		assert(task != NULL);
+		sock = task->sock;
+		assert(sock != NULL);
+		assert(sock->group != NULL);
+		assert(sock->group == group);
+		sock->group->io_inflight--;
+		sock->group->io_avail++;
+		status = cqe->res;
+		io_uring_cqe_seen(&group->uring, cqe);
+
+		task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE;
+
+		if (spdk_unlikely(status <= 0)) {
+			if (status == -EAGAIN || status == -EWOULDBLOCK) {
+				continue;
+			}
+		}
+
+		switch (task->type) {
+		case SPDK_SOCK_TASK_POLLIN:
+			if ((status & POLLIN) == POLLIN) {
+				if (sock->base.cb_fn != NULL) {
+					assert(sock->pending_recv == false);
+					sock->pending_recv = true;
+					TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+				}
+			}
+			break;
+		case SPDK_SOCK_TASK_WRITE:
+			assert(TAILQ_EMPTY(&sock->base.pending_reqs));
+			task->last_req = NULL;
+			task->iov_cnt = 0;
+			if (spdk_unlikely(status < 0)) {
+				sock->connection_status = status;
+				spdk_sock_abort_requests(&sock->base);
+			} else {
+				sock_complete_reqs(&sock->base, status);
+			}
+
+			break;
+		case SPDK_SOCK_TASK_CANCEL:
+			/* Do nothing */
+			break;
+		default:
+			SPDK_UNREACHABLE();
+		}
+	}
+
+	if (!socks) {
+		return 0;
+	}
+	count = 0;
+	TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) {
+		if (count == max_read_events) {
+			break;
+		}
+
+		socks[count++] = &sock->base;
+	}
+
+	/* Cycle the pending_recv list so that each time we poll things aren't
+	 * in the same order. */
+	for (i = 0; i < count; i++) {
+		sock = __uring_sock(socks[i]);
+
+		TAILQ_REMOVE(&group->pending_recv, sock, link);
+
+		if (sock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+			sock->pending_recv = false;
+		} else {
+			TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+		}
+	}
+
+	return count;
+}
+
+static int
+_sock_flush_client(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct msghdr msg = {};
+	struct iovec iovs[IOV_BATCH_SIZE];
+	int iovcnt;
+	ssize_t rc;
+
+	/* Can't flush from within a callback or we end up with recursive calls */
+	if (_sock->cb_cnt > 0) {
+		return 0;
+	}
+
+	/* Gather an iov */
+	iovcnt = sock_prep_reqs(_sock, iovs, 0, NULL);
+	if (iovcnt == 0) {
+		return 0;
+	}
+
+	/* Perform the vectored write */
+	msg.msg_iov = iovs;
+	msg.msg_iovlen = iovcnt;
+	rc = sendmsg(sock->fd, &msg, 0);
+	if (rc <= 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			return 0;
+		}
+		return rc;
+	}
+
+	sock_complete_reqs(_sock, rc);
+
+	return 0;
+}
+
+static void
+uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	int rc;
+
+	if (spdk_unlikely(sock->connection_status)) {
+		req->cb_fn(req->cb_arg, sock->connection_status);
+		return;
+	}
+
+	spdk_sock_request_queue(_sock, req);
+
+	if (!sock->group) {
+		if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) {
+			rc = _sock_flush_client(_sock);
+			if (rc) {
+				spdk_sock_abort_requests(_sock);
+			}
+		}
+	}
+}
+
+static int
+uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	int val;
+	int rc;
+
+	assert(sock != NULL);
+
+	val = nbytes;
+	rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
+	if (rc != 0) {
+		return -1;
+	}
+	return 0;
+}
+
+static bool
+uring_sock_is_ipv6(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct sockaddr_storage sa;
+	socklen_t salen;
+	int rc;
+
+	assert(sock != NULL);
+
+	memset(&sa, 0, sizeof sa);
+	salen = sizeof sa;
+	rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+	if (rc != 0) {
+		SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+		return false;
+	}
+
+	return (sa.ss_family == AF_INET6);
+}
+
+static bool
+uring_sock_is_ipv4(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct sockaddr_storage sa;
+	socklen_t salen;
+	int rc;
+
+	assert(sock != NULL);
+
+	memset(&sa, 0, sizeof sa);
+	salen = sizeof sa;
+	rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+	if (rc != 0) {
+		SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+		return false;
+	}
+
+	return (sa.ss_family == AF_INET);
+}
+
+static bool
+uring_sock_is_connected(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	uint8_t byte;
+	int rc;
+
+	rc = recv(sock->fd, &byte, 1, MSG_PEEK);
+	if (rc == 0) {
+		return false;
+	}
+
+	if (rc < 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			return true;
+		}
+
+		return false;
+	}
+
+	return true;
+}
+
+static int
+uring_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id)
+{
+	int rc = -1;
+
+#if defined(SO_INCOMING_NAPI_ID)
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	socklen_t salen = sizeof(int);
+
+	rc = getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, placement_id, &salen);
+	if (rc != 0) {
+		SPDK_ERRLOG("getsockopt() failed (errno=%d)\n", errno);
+	}
+
+#endif
+	return rc;
+}
+
+static struct spdk_sock_group_impl *
+uring_sock_group_impl_create(void)
+{
+	struct spdk_uring_sock_group_impl *group_impl;
+
+	group_impl = calloc(1, sizeof(*group_impl));
+	if (group_impl == NULL) {
+		SPDK_ERRLOG("group_impl allocation failed\n");
+		return NULL;
+	}
+
+	group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH;
+
+	if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) {
+		SPDK_ERRLOG("uring I/O context setup failure\n");
+		free(group_impl);
+		return NULL;
+	}
+
+	TAILQ_INIT(&group_impl->pending_recv);
+
+	return &group_impl->base;
+}
+
+static int
+uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group,
+			       struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+	sock->group = group;
+	sock->write_task.sock = sock;
+	sock->write_task.type = SPDK_SOCK_TASK_WRITE;
+
+	sock->pollin_task.sock = sock;
+	sock->pollin_task.type = SPDK_SOCK_TASK_POLLIN;
+
+	sock->cancel_task.sock = sock;
+	sock->cancel_task.type = SPDK_SOCK_TASK_CANCEL;
+
+	/* switched from another polling group due to scheduling */
+	if (spdk_unlikely(sock->recv_pipe != NULL &&
+			  (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
+		assert(sock->pending_recv == false);
+		sock->pending_recv = true;
+		TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+	}
+
+	return 0;
+}
+
+static int
+uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
+			   struct spdk_sock **socks)
+{
+	struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+	int count, ret;
+	int to_complete, to_submit;
+	struct spdk_sock *_sock, *tmp;
+	struct spdk_uring_sock *sock;
+
+	if (spdk_likely(socks)) {
+		TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) {
+			sock = __uring_sock(_sock);
+			if (spdk_unlikely(sock->connection_status)) {
+				continue;
+			}
+			_sock_flush(_sock);
+			_sock_prep_pollin(_sock);
+		}
+	}
+
+	to_submit = group->io_queued;
+
+	/* Network I/O cannot use O_DIRECT, so there is no need to call spdk_io_uring_enter. */
+	if (to_submit > 0) {
+		/* If there are I/O to submit, use io_uring_submit here.
+		 * It will automatically call io_uring_enter appropriately.
+		 */
+		ret = io_uring_submit(&group->uring);
+		if (ret < 0) {
+			return 1;
+		}
+		group->io_queued = 0;
+		group->io_inflight += to_submit;
+		group->io_avail -= to_submit;
+	}
+
+	count = 0;
+	to_complete = group->io_inflight;
+	if (to_complete > 0) {
+		count = sock_uring_group_reap(group, to_complete, max_events, socks);
+	}
+
+	return count;
+}
+
+static int
+uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group,
+				  struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+	struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+	if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+		_sock_prep_cancel_task(_sock, &sock->write_task);
+		/* spdk_sock_group_remove_sock() is not an asynchronous interface,
+		 * so it is fine to busy-wait in a loop here. */
+		while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
+		       (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
+			uring_sock_group_impl_poll(_group, 32, NULL);
+		}
+	}
+
+	if (sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+		_sock_prep_cancel_task(_sock, &sock->pollin_task);
+		/* spdk_sock_group_remove_sock() is not an asynchronous interface,
+		 * so it is fine to busy-wait in a loop here. */
+		while ((sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
+		       (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
+			uring_sock_group_impl_poll(_group, 32, NULL);
+		}
+	}
+
+	if (sock->pending_recv) {
+		TAILQ_REMOVE(&group->pending_recv, sock, link);
+		sock->pending_recv = false;
+	}
+	assert(sock->pending_recv == false);
+
+	sock->group = NULL;
+	return 0;
+}
+
+static int
+uring_sock_group_impl_close(struct spdk_sock_group_impl *_group)
+{
+	struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+	/* try to reap all the active I/O */
+	while (group->io_inflight) {
+		uring_sock_group_impl_poll(_group, 32, NULL);
+	}
+	assert(group->io_inflight == 0);
+	assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH);
+
+	io_uring_queue_exit(&group->uring);
+
+	free(group);
+	return 0;
+}
+
+static int
+uring_sock_flush(struct spdk_sock *_sock)
+{
+	struct spdk_uring_sock *sock = __uring_sock(_sock);
+
+	if (!sock->group) {
+		return _sock_flush_client(_sock);
+	}
+
+	return 0;
+}
+
+static struct spdk_net_impl g_uring_net_impl = {
+	.name = "uring",
+	.getaddr = uring_sock_getaddr,
+	.connect = uring_sock_connect,
+	.listen = uring_sock_listen,
+	.accept = uring_sock_accept,
+	.close = uring_sock_close,
+	.recv = uring_sock_recv,
+	.readv = uring_sock_readv,
+	.writev = uring_sock_writev,
+	.writev_async = uring_sock_writev_async,
+	.flush = uring_sock_flush,
+	.set_recvlowat = uring_sock_set_recvlowat,
+	.set_recvbuf = uring_sock_set_recvbuf,
+	.set_sendbuf = uring_sock_set_sendbuf,
+	.is_ipv6 = uring_sock_is_ipv6,
+	.is_ipv4 = uring_sock_is_ipv4,
+	.is_connected = uring_sock_is_connected,
+	.get_placement_id = uring_sock_get_placement_id,
+	.group_impl_create = uring_sock_group_impl_create,
+	.group_impl_add_sock = uring_sock_group_impl_add_sock,
+	.group_impl_remove_sock = uring_sock_group_impl_remove_sock,
+	.group_impl_poll = uring_sock_group_impl_poll,
+	.group_impl_close = uring_sock_group_impl_close,
+};
+
+SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 1);
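
Editor's note on the receive path: uring_sock_readv() above either reads directly into the caller's buffers (large requests) or batches small reads through an spdk_pipe ring buffer. Below is a minimal standalone sketch of that double-buffering logic, not part of the patch. demo_read() and DEMO_PIPE_SZ are hypothetical names; the spdk_pipe_* and spdk_iovcpy calls are the real SPDK APIs the module uses, and the pipe is assumed to come from spdk_pipe_create().

/* Sketch: double-buffered receive, mirroring uring_sock_readv() */
#include "spdk/pipe.h"
#include "spdk/util.h"
#include <sys/uio.h>

#define DEMO_PIPE_SZ 4096	/* assumption: stands in for sock->recv_buf_sz */

static ssize_t
demo_read(int fd, struct spdk_pipe *pipe, struct iovec *diov, int diovcnt, size_t want)
{
	struct iovec iov[2];
	ssize_t n;
	int avail;

	if (spdk_pipe_reader_bytes_available(pipe) == 0) {
		if (want >= DEMO_PIPE_SZ) {
			/* Large request: bypass the pipe, read straight into caller buffers. */
			return readv(fd, diov, diovcnt);
		}
		/* Small request: do one big readv() into the pipe instead. */
		avail = spdk_pipe_writer_get_buffer(pipe, DEMO_PIPE_SZ, iov);
		if (avail <= 0) {
			return 0;	/* pipe is full */
		}
		n = readv(fd, iov, 2);
		if (n <= 0) {
			return n;
		}
		spdk_pipe_writer_advance(pipe, n);
	}

	/* Copy buffered bytes out of the pipe into the caller's iovs. */
	avail = spdk_pipe_reader_get_buffer(pipe, DEMO_PIPE_SZ, iov);
	if (avail <= 0) {
		return avail;
	}
	n = spdk_iovcpy(iov, 2, diov, diovcnt);
	spdk_pipe_reader_advance(pipe, n);
	return n;
}

The pipe trades one extra memcpy for fewer syscalls: many small recv() calls become a single readv() into the ring buffer plus cheap in-memory copies.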
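Editor's note on write batching: sock_complete_reqs() consumes the byte count returned by sendmsg() across the queued requests' iovs, leaving req->internal.offset pointing at the first unsent byte. A toy version of that bookkeeping, under the assumption of a hypothetical stand-alone helper (demo_consume is not in the patch):

/* Sketch: consume 'sent' bytes from a request's iovs, as sock_complete_reqs() does.
 * Example: two 100-byte iovs and sent == 150 leaves *offset == 150, i.e. iov 0
 * fully consumed and iov 1 half consumed, so the next flush resumes mid-element. */
#include <stddef.h>
#include <sys/uio.h>

static size_t
demo_consume(const struct iovec *iovs, int iovcnt, size_t *offset, size_t sent)
{
	size_t skip = *offset;	/* bytes already sent in earlier flushes */
	size_t len;
	int i;

	for (i = 0; i < iovcnt; i++) {
		if (skip >= iovs[i].iov_len) {	/* element finished previously */
			skip -= iovs[i].iov_len;
			continue;
		}

		len = iovs[i].iov_len - skip;
		if (len > sent) {
			*offset += sent;	/* element only partially sent; stop here */
			return 0;
		}

		skip = 0;
		*offset += len;		/* element fully sent */
		sent -= len;
	}

	return sent;	/* leftover bytes belong to the next queued request */
}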
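Editor's note on the io_uring cycle: each poll of the group submits any prepared SQEs (sendmsg, poll-add, cancel) with one io_uring_submit() and then reaps CQEs. A standalone sketch of one submit/reap round for a single fd, assuming liburing is installed; demo_poll_readable() is a hypothetical name, but every io_uring_* call is the real liburing API:

/* Sketch: one-shot POLLIN via io_uring, mirroring _sock_prep_pollin() + reap */
#include <liburing.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>

static int
demo_poll_readable(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int rc;

	sqe = io_uring_get_sqe(ring);
	if (sqe == NULL) {
		return -1;	/* submission queue is full */
	}
	io_uring_prep_poll_add(sqe, fd, POLLIN);	/* one-shot poll request */
	io_uring_sqe_set_data(sqe, (void *)(uintptr_t)fd);

	rc = io_uring_submit(ring);	/* issues io_uring_enter() as needed */
	if (rc < 0) {
		return rc;
	}

	/* The module calls io_uring_peek_cqe() to stay non-blocking; a plain
	 * wait keeps this sketch short. */
	rc = io_uring_wait_cqe(ring, &cqe);
	if (rc != 0) {
		return rc;
	}
	if (cqe->res & POLLIN) {
		printf("fd %d is readable\n", (int)(uintptr_t)io_uring_cqe_get_data(cqe));
	}
	io_uring_cqe_seen(ring, cqe);	/* mark the completion consumed */
	return 0;
}

io_uring_queue_init() and io_uring_queue_exit() bracket the ring's lifetime, just as uring_sock_group_impl_create() and uring_sock_group_impl_close() do for the group's ring above.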