summaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:49:45 +0000
commit 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree 848558de17fb3008cdf4d861b01ac7781903ce39 /net/rds
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig28
-rw-r--r--net/rds/Makefile17
-rw-r--r--net/rds/af_rds.c963
-rw-r--r--net/rds/bind.c283
-rw-r--r--net/rds/cong.c428
-rw-r--r--net/rds/connection.c948
-rw-r--r--net/rds/ib.c607
-rw-r--r--net/rds/ib.h458
-rw-r--r--net/rds/ib_cm.c1287
-rw-r--r--net/rds/ib_frmr.c446
-rw-r--r--net/rds/ib_mr.h143
-rw-r--r--net/rds/ib_rdma.c701
-rw-r--r--net/rds/ib_recv.c1094
-rw-r--r--net/rds/ib_ring.c168
-rw-r--r--net/rds/ib_send.c1017
-rw-r--r--net/rds/ib_stats.c107
-rw-r--r--net/rds/ib_sysctl.c121
-rw-r--r--net/rds/info.c242
-rw-r--r--net/rds/info.h31
-rw-r--r--net/rds/loop.c254
-rw-r--r--net/rds/loop.h12
-rw-r--r--net/rds/message.c521
-rw-r--r--net/rds/page.c167
-rw-r--r--net/rds/rdma.c958
-rw-r--r--net/rds/rdma_transport.c322
-rw-r--r--net/rds/rdma_transport.h31
-rw-r--r--net/rds/rds.h1019
-rw-r--r--net/rds/rds_single_path.h31
-rw-r--r--net/rds/recv.c831
-rw-r--r--net/rds/send.c1515
-rw-r--r--net/rds/stats.c155
-rw-r--r--net/rds/sysctl.c110
-rw-r--r--net/rds/tcp.c754
-rw-r--r--net/rds/tcp.h98
-rw-r--r--net/rds/tcp_connect.c229
-rw-r--r--net/rds/tcp_listen.c348
-rw-r--r--net/rds/tcp_recv.c349
-rw-r--r--net/rds/tcp_send.c225
-rw-r--r--net/rds/tcp_stats.c74
-rw-r--r--net/rds/threads.c311
-rw-r--r--net/rds/transport.c169
41 files changed, 17572 insertions, 0 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000..75cd69696
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config RDS
+ tristate "The Reliable Datagram Sockets Protocol"
+ depends on INET
+ help
+ The RDS (Reliable Datagram Sockets) protocol provides reliable,
+ sequenced delivery of datagrams over Infiniband or TCP.
+
+config RDS_RDMA
+ tristate "RDS over Infiniband"
+ depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+ help
+ Allow RDS to use Infiniband as a transport.
+ This transport supports RDMA operations.
+
+config RDS_TCP
+ tristate "RDS over TCP"
+ depends on RDS
+ # "IPV6 || !IPV6" evaluates to m when IPV6=m and to y otherwise, so
+ # RDS_TCP cannot be built in while IPv6 support is modular.
+ depends on IPV6 || !IPV6
+ help
+ Allow RDS to use TCP as a transport.
+ This transport does not support RDMA operations.
+
+# When enabled, net/rds/Makefile adds -DRDS_DEBUG to the compiler flags.
+config RDS_DEBUG
+ bool "RDS debugging messages"
+ depends on RDS
+ default n
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000..8fdc118e2
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+# Core RDS objects, linked into rds.o.
+obj-$(CONFIG_RDS) += rds.o
+rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
+ recv.o send.o stats.o sysctl.o threads.o transport.o \
+ loop.o page.o rdma.o
+
+# Objects selected by CONFIG_RDS_RDMA ("RDS over Infiniband"), linked into rds_rdma.o.
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-y := rdma_transport.o \
+ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+ ib_sysctl.o ib_rdma.o ib_frmr.o
+
+
+# Objects selected by CONFIG_RDS_TCP ("RDS over TCP"), linked into rds_tcp.o.
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+ tcp_send.o tcp_stats.o
+
+# CONFIG_RDS_DEBUG=y compiles the objects in this directory with -DRDS_DEBUG.
+ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000..d107f7605
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/ipv6.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+
+#include "rds.h"
+
+/*
+ * Global registry of every RDS socket, walked by the info (stats) handlers
+ * in this file.  rds_sock_lock guards rds_sock_list and rds_sock_count; the
+ * TOS ioctls also take it around their accesses to rs_tos.
+ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+/* Extra wait queue that rds_poll() also waits on once rs_seen_congestion is set. */
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * Tear down an RDS socket when the last descriptor referencing it goes away.
+ * The socket is unbound here so that its address becomes available to other
+ * sockets again.
+ *
+ * The incoming path may still be running while we do this.  sock_orphan()
+ * sets SOCK_DEAD, which the rx path treats as "do not queue any more
+ * messages on this socket".
+ *
+ * Always returns 0.
+ */
+static int rds_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs;
+
+	/* No sock attached to this socket: nothing to release. */
+	if (!sk)
+		return 0;
+
+	rs = rds_sk_to_rs(sk);
+
+	sock_orphan(sk);
+
+	/*
+	 * rds_clear_recv_queue() grabs rs_recv_lock, which ensures the recv
+	 * path has finished touching the socket before we carry on.
+	 */
+	rds_clear_recv_queue(rs);
+	rds_cong_remove_socket(rs);
+
+	rds_remove_bound(rs);
+
+	rds_send_drop_to(rs, NULL);
+	rds_rdma_drop_keys(rs);
+	rds_notify_queue_get(rs, NULL);
+	rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);
+
+	/* Take the socket off the global list walked by the info handlers. */
+	spin_lock_bh(&rds_sock_lock);
+	list_del_init(&rs->rs_item);
+	rds_sock_count--;
+	spin_unlock_bh(&rds_sock_lock);
+
+	rds_trans_put(rs->rs_transport);
+
+	sock->sk = NULL;
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * Wake any sleepers on the socket's wait queue.
+ *
+ * rds_release() -> sock_orphan() clears sk_sleep, so take care not to race
+ * with it.  We are called from interrupt handlers, hence the irqsave lock
+ * variant rather than _bh().  Waking the waitqueue after sk_sleep has been
+ * cleared is probably OK because we hold a sock ref, but taking the lock
+ * seems more conservative.
+ *
+ * NB - normally sk_callback_lock would be used for this, but the network
+ * code only grabs it with _lock_bh while we can get here from interrupts,
+ * so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	unsigned long irq_flags;
+
+	read_lock_irqsave(&rs->rs_recv_lock, irq_flags);
+	__rds_wake_sk_sleep(sk);
+	read_unlock_irqrestore(&rs->rs_recv_lock, irq_flags);
+}
+
+/*
+ * Report the socket's local (bound) address, or the peer (connected) address
+ * when @peer is non-zero, into @uaddr.
+ *
+ * IPv4-mapped addresses are written as a struct sockaddr_in, everything else
+ * as a struct sockaddr_in6.
+ *
+ * Returns the number of bytes written to @uaddr, or -ENOTCONN if the peer
+ * address was requested and no connected address is set.
+ */
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int uaddr_len;
+
+ /* racy, don't care */
+ if (peer) {
+ if (ipv6_addr_any(&rs->rs_conn_addr))
+ return -ENOTCONN;
+
+ if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_conn_port;
+ sin6->sin6_addr = rs->rs_conn_addr;
+ sin6->sin6_flowinfo = 0;
+ /* scope_id is the same as in the bound address. */
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
+ } else {
+ /* If socket is not yet bound and the socket is connected,
+ * set the return address family to be the same as the
+ * connected address, but with 0 address value. If it is not
+ * connected, set the family to be AF_UNSPEC (value 0) and
+ * the address size to be that of an IPv4 address.
+ */
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
+ if (ipv6_addr_any(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_UNSPEC;
+ return sizeof(*sin);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* Connected to a non-mapped (real IPv6) peer: zeroed AF_INET6 address. */
+ if (!(ipv6_addr_type(&rs->rs_conn_addr) &
+ IPV6_ADDR_MAPPED)) {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ memset(sin6, 0, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ return sizeof(*sin6);
+ }
+#endif
+
+ /* Otherwise report a zeroed AF_INET address. */
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ return sizeof(*sin);
+ }
+ /* Bound socket: report the bound address and port. */
+ if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_bound_port;
+ sin6->sin6_addr = rs->rs_bound_addr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
+ }
+
+ return uaddr_len;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * EPOLLIN is asserted if
+ *  - there is data on the receive queue.
+ *  - to signal that a previously congested destination may have become
+ *    uncongested
+ *  - A notification has been queued to the socket (this can be a congestion
+ *    update, or a RDMA completion, or a MSG_ZEROCOPY completion).
+ *
+ * EPOLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ *
+ * EPOLLERR is asserted if sk_err is set or the socket error queue is not
+ * empty.
+ *
+ * Returns the mask of events that are currently ready.
+ */
+static __poll_t rds_poll(struct file *file, struct socket *sock,
+			 poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	__poll_t mask = 0;
+	unsigned long flags;
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	/* A socket that has seen congestion also waits on the global queue. */
+	if (rs->rs_seen_congestion)
+		poll_wait(file, &rds_poll_waitq, wait);
+
+	read_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!rs->rs_cong_monitor) {
+		/* When a congestion map was updated, we signal EPOLLIN for
+		 * "historical" reasons. Applications can also poll for
+		 * WRBAND instead. */
+		if (rds_cong_updated_since(&rs->rs_cong_track))
+			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
+	} else {
+		spin_lock(&rs->rs_lock);
+		if (rs->rs_cong_notify)
+			mask |= (EPOLLIN | EPOLLRDNORM);
+		spin_unlock(&rs->rs_lock);
+	}
+	if (!list_empty(&rs->rs_recv_queue) ||
+	    !list_empty(&rs->rs_notify_queue) ||
+	    !list_empty(&rs->rs_zcookie_queue.zcookie_head))
+		mask |= (EPOLLIN | EPOLLRDNORM);
+	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+		mask |= (EPOLLOUT | EPOLLWRNORM);
+	/*
+	 * Use the __poll_t-typed EPOLLERR like every other bit in this mask;
+	 * the plain-int POLLERR has the same value but the wrong type.
+	 */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= EPOLLERR;
+	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	/* clear state any time we wake a seen-congested socket */
+	if (mask)
+		rs->rs_seen_congestion = 0;
+
+	return mask;
+}
+
+/*
+ * Socket ioctl handler.
+ *
+ * SIOCRDSSETTOS: read a TOS value from user space, map it through the
+ *	transport's get_tos_map() and store the result in rs_tos.  Fails with
+ *	-ENOIOCTLCMD when the socket has no transport or the transport has no
+ *	get_tos_map(), and with -EINVAL when rs_tos is already non-zero or
+ *	rs_conn is set.
+ * SIOCRDSGETTOS: copy the current rs_tos to user space.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy and -ENOIOCTLCMD for
+ * any other command.
+ */
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	rds_tos_t __user *uarg = (rds_tos_t __user *)arg;
+	rds_tos_t utos, tos = 0;
+
+	switch (cmd) {
+	case SIOCRDSSETTOS:
+		if (get_user(utos, uarg))
+			return -EFAULT;
+
+		if (!rs->rs_transport || !rs->rs_transport->get_tos_map)
+			return -ENOIOCTLCMD;
+		tos = rs->rs_transport->get_tos_map(utos);
+
+		spin_lock_bh(&rds_sock_lock);
+		if (rs->rs_tos || rs->rs_conn) {
+			spin_unlock_bh(&rds_sock_lock);
+			return -EINVAL;
+		}
+		rs->rs_tos = tos;
+		spin_unlock_bh(&rds_sock_lock);
+		return 0;
+
+	case SIOCRDSGETTOS:
+		spin_lock_bh(&rds_sock_lock);
+		tos = rs->rs_tos;
+		spin_unlock_bh(&rds_sock_lock);
+
+		if (put_user(tos, uarg))
+			return -EFAULT;
+		return 0;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+/*
+ * RDS_CANCEL_SENT_TO: copy a destination address from user space and hand
+ * it to rds_send_drop_to().
+ *
+ * A buffer shorter than struct sockaddr_in6 is read as a struct sockaddr_in
+ * and converted to an IPv4-mapped address; in that case only the address
+ * and port of the sockaddr_in6 passed on are filled in.
+ *
+ * Returns 0 on success, -ENOTCONN if the socket is not bound, -EINVAL if
+ * @len is shorter than struct sockaddr_in, -EFAULT on a failed user copy.
+ */
+static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
+{
+	struct sockaddr_in6 dest6;
+	struct sockaddr_in dest4;
+
+	/* racing with another thread binding seems ok here */
+	if (ipv6_addr_any(&rs->rs_bound_addr))
+		return -ENOTCONN; /* XXX not a great errno */
+
+	if (len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (len < sizeof(struct sockaddr_in6)) {
+		/* Assume IPv4 */
+		if (copy_from_sockptr(&dest4, optval,
+				      sizeof(struct sockaddr_in)))
+			return -EFAULT;
+		ipv6_addr_set_v4mapped(dest4.sin_addr.s_addr,
+				       &dest6.sin6_addr);
+		dest6.sin6_port = dest4.sin_port;
+	} else if (copy_from_sockptr(&dest6, optval,
+				     sizeof(struct sockaddr_in6))) {
+		return -EFAULT;
+	}
+
+	rds_send_drop_to(rs, &dest6);
+	return 0;
+}
+
+/*
+ * Read an int from user space and store its truth value (0 or 1) in
+ * *@optvar.
+ *
+ * Returns 0 on success, -EINVAL if @optlen is smaller than an int, -EFAULT
+ * if the user copy fails.  *@optvar is left untouched on error.
+ */
+static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
+			       int optlen)
+{
+	int user_val;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&user_val, optval, sizeof(int)))
+		return -EFAULT;
+
+	*optvar = user_val ? 1 : 0;
+	return 0;
+}
+
+/*
+ * RDS_CONG_MONITOR: switch congestion monitoring on or off for the socket.
+ *
+ * Turning it on calls rds_cong_add_socket(); turning it off calls
+ * rds_cong_remove_socket() and clears rs_cong_mask and rs_cong_notify.
+ *
+ * Returns 0 on success or the error from rds_set_bool_option().
+ */
+static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
+{
+	int err;
+
+	err = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+	if (err)
+		return err;
+
+	if (rs->rs_cong_monitor) {
+		rds_cong_add_socket(rs);
+		return 0;
+	}
+
+	rds_cong_remove_socket(rs);
+	rs->rs_cong_mask = 0;
+	rs->rs_cong_notify = 0;
+	return 0;
+}
+
+/*
+ * SO_RDS_TRANSPORT: attach the transport selected by the user to a socket
+ * that does not have one yet.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if a transport is already attached,
+ * -EINVAL for a bad @optlen or an out-of-range type, -EFAULT on a failed
+ * user copy, -ENOPROTOOPT if rds_trans_get() returns no transport.
+ */
+static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
+{
+	int t_type;
+
+	/* previously attached to transport */
+	if (rs->rs_transport)
+		return -EOPNOTSUPP;
+
+	if (optlen != sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
+		return -EFAULT;
+
+	if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
+		return -EINVAL;
+
+	rs->rs_transport = rds_trans_get(t_type);
+	if (!rs->rs_transport)
+		return -ENOPROTOOPT;
+
+	return 0;
+}
+
+/*
+ * SO_TIMESTAMP_OLD / SO_TIMESTAMP_NEW: enable or disable receive
+ * timestamps on the socket.
+ *
+ * When timestamps are enabled, SOCK_TSTAMP_NEW is made to follow @optname:
+ * set for SO_TIMESTAMP_NEW and cleared for SO_TIMESTAMP_OLD.  Previously the
+ * flag was only ever set, so a socket that had once used SO_TIMESTAMP_NEW
+ * kept the new-format flag after switching back to SO_TIMESTAMP_OLD.
+ * Disabling timestamps only clears SOCK_RCVTSTAMP.
+ *
+ * Returns 0 on success, -EFAULT if @optlen is not sizeof(int) or the user
+ * copy fails.
+ */
+static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
+				 int optlen, int optname)
+{
+	int val;
+
+	if (optlen != sizeof(int))
+		return -EFAULT;
+
+	if (copy_from_sockptr(&val, optval, sizeof(int)))
+		return -EFAULT;
+
+	if (!val) {
+		sock_reset_flag(sk, SOCK_RCVTSTAMP);
+		return 0;
+	}
+
+	/* Keep the timestamp format in step with the option last used. */
+	if (optname == SO_TIMESTAMP_NEW)
+		sock_set_flag(sk, SOCK_TSTAMP_NEW);
+	else
+		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
+
+	sock_set_flag(sk, SOCK_RCVTSTAMP);
+
+	return 0;
+}
+
+/*
+ * SO_RDS_MSG_RXPATH_LATENCY: install the rx trace positions supplied by the
+ * user in a struct rds_rx_trace_so.
+ *
+ * rs_rx_traces is updated before the positions are checked; if a position
+ * is out of range it is reset to 0 and the call fails.
+ *
+ * Returns 0 on success and -EFAULT for every failure: wrong @optlen, failed
+ * user copy, too many traces, or an out-of-range trace position.
+ */
+static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
+				  int optlen)
+{
+	struct rds_rx_trace_so req;
+	int idx;
+
+	if (optlen != sizeof(struct rds_rx_trace_so))
+		return -EFAULT;
+
+	if (copy_from_sockptr(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	if (req.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
+		return -EFAULT;
+
+	rs->rs_rx_traces = req.rx_traces;
+
+	idx = 0;
+	while (idx < rs->rs_rx_traces) {
+		if (req.rx_trace_pos[idx] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
+			rs->rs_rx_traces = 0;
+			return -EFAULT;
+		}
+		rs->rs_rx_trace[idx] = req.rx_trace_pos[idx];
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * setsockopt() entry point: dispatch SOL_RDS options to their handlers.
+ *
+ * The transport and timestamp options run under the socket lock; the other
+ * handlers are called without it.
+ *
+ * Returns the handler's result, or -ENOPROTOOPT for a level other than
+ * SOL_RDS or an unknown option name.
+ */
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+			  sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	int ret;
+
+	if (level != SOL_RDS)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case RDS_CANCEL_SENT_TO:
+		return rds_cancel_sent_to(rs, optval, optlen);
+	case RDS_GET_MR:
+		return rds_get_mr(rs, optval, optlen);
+	case RDS_GET_MR_FOR_DEST:
+		return rds_get_mr_for_dest(rs, optval, optlen);
+	case RDS_FREE_MR:
+		return rds_free_mr(rs, optval, optlen);
+	case RDS_RECVERR:
+		return rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+	case RDS_CONG_MONITOR:
+		return rds_cong_monitor(rs, optval, optlen);
+	case SO_RDS_TRANSPORT:
+		lock_sock(sk);
+		ret = rds_set_transport(rs, optval, optlen);
+		release_sock(sk);
+		return ret;
+	case SO_TIMESTAMP_OLD:
+	case SO_TIMESTAMP_NEW:
+		lock_sock(sk);
+		ret = rds_enable_recvtstamp(sk, optval, optlen, optname);
+		release_sock(sk);
+		return ret;
+	case SO_RDS_MSG_RXPATH_LATENCY:
+		return rds_recv_track_latency(rs, optval, optlen);
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ * getsockopt() entry point for SOL_RDS.
+ *
+ * Options in the RDS_INFO_FIRST..RDS_INFO_LAST range are forwarded to
+ * rds_info_getsockopt().  RDS_RECVERR and SO_RDS_TRANSPORT write a single
+ * int to @optval and set *@optlen to sizeof(int).
+ *
+ * Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown option,
+ * -EINVAL if the user buffer is smaller than an int, -EFAULT on a failed
+ * user access.
+ */
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	int __user *uval = (int __user *)optval;
+	int len;
+	int trans;
+
+	if (level != SOL_RDS)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	switch (optname) {
+	case RDS_INFO_FIRST ... RDS_INFO_LAST:
+		return rds_info_getsockopt(sock, optname, optval, optlen);
+
+	case RDS_RECVERR:
+		if (len < sizeof(int))
+			return -EINVAL;
+		if (put_user(rs->rs_recverr, uval) ||
+		    put_user(sizeof(int), optlen))
+			return -EFAULT;
+		return 0;
+
+	case SO_RDS_TRANSPORT:
+		if (len < sizeof(int))
+			return -EINVAL;
+		if (rs->rs_transport)
+			trans = rs->rs_transport->t_type;
+		else
+			trans = RDS_TRANS_NONE; /* unbound */
+		if (put_user(trans, uval) ||
+		    put_user(sizeof(int), optlen))
+			return -EFAULT;
+		return 0;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ * Record the peer address and port for the socket.
+ *
+ * This function only validates @uaddr and stores it in rs_conn_addr and
+ * rs_conn_port under the socket lock; IPv4 addresses are stored in
+ * IPv4-mapped form.  For an IPv6 link-local peer the scope ID is also saved
+ * in rs_bound_scope_id.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL       @addr_len too short, IPv4 multicast/broadcast peer, or a
+ *                 link-local peer with a missing or mismatching scope ID
+ *   -EDESTADDRREQ IPv4 peer is INADDR_ANY
+ *   -EPROTOTYPE   IPv6 peer is neither unicast nor a usable mapped address
+ *   -EAFNOSUPPORT any other address family
+ */
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ int ret = 0;
+
+ /* sa_family must be readable before we can switch on it. */
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (uaddr->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ break;
+ }
+ if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
+ ret = -EINVAL;
+ break;
+ }
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+ rs->rs_conn_port = sin->sin_port;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6;
+ int addr_type;
+
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in6)) {
+ ret = -EINVAL;
+ break;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+
+ /* It is a mapped address. Need to do some sanity
+ * checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ ipv4_is_multicast(addr4)) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+ }
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ /* If socket is already bound to a link local address,
+ * the peer address must be on the same link.
+ */
+ if (sin6->sin6_scope_id == 0 ||
+ (!ipv6_addr_any(&rs->rs_bound_addr) &&
+ rs->rs_bound_scope_id &&
+ sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Remember the connected address scope ID. It will
+ * be checked against the binding local address when
+ * the socket is bound.
+ */
+ rs->rs_bound_scope_id = sin6->sin6_scope_id;
+ }
+ rs->rs_conn_addr = sin6->sin6_addr;
+ rs->rs_conn_port = sin6->sin6_port;
+ break;
+ }
+#endif
+
+ default:
+ ret = -EAFNOSUPPORT;
+ break;
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+/*
+ * Protocol descriptor for RDS sockets: passed to sk_alloc() in rds_create()
+ * and registered in rds_init().
+ */
+static struct proto rds_proto = {
+	.owner		= THIS_MODULE,
+	.name		= "RDS",
+	.obj_size	= sizeof(struct rds_sock),
+};
+
+/* Socket operations installed on every RDS socket by __rds_create(). */
+static const struct proto_ops rds_proto_ops = {
+	.family		= AF_RDS,
+	.owner		= THIS_MODULE,
+
+	/* operations implemented by RDS */
+	.release	= rds_release,
+	.bind		= rds_bind,
+	.connect	= rds_connect,
+	.getname	= rds_getname,
+	.poll		= rds_poll,
+	.ioctl		= rds_ioctl,
+	.setsockopt	= rds_setsockopt,
+	.getsockopt	= rds_getsockopt,
+	.sendmsg	= rds_sendmsg,
+	.recvmsg	= rds_recvmsg,
+
+	/* operations wired to the generic sock_no_*() handlers */
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+/*
+ * sk_destruct callback: warn if the socket's rs_item list node does not
+ * point back at itself, i.e. it is still linked on a list.
+ */
+static void rds_sock_destruct(struct sock *sk)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct list_head *item = &rs->rs_item;
+
+	WARN_ON(!(item->next == item && item->prev == item));
+}
+
+/*
+ * Initialise a freshly allocated sock as an RDS socket: install the RDS
+ * socket ops and destructor, set up the rds_sock locks, queues and default
+ * field values, then add the socket to the global socket list.
+ *
+ * Always returns 0.
+ */
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+
+	/* generic sock setup */
+	sock_init_data(sock, sk);
+	sock->ops = &rds_proto_ops;
+	sk->sk_protocol = protocol;
+	sk->sk_destruct = rds_sock_destruct;
+
+	/* RDS-private state */
+	spin_lock_init(&rs->rs_lock);
+	rwlock_init(&rs->rs_recv_lock);
+	INIT_LIST_HEAD(&rs->rs_send_queue);
+	INIT_LIST_HEAD(&rs->rs_recv_queue);
+	INIT_LIST_HEAD(&rs->rs_notify_queue);
+	INIT_LIST_HEAD(&rs->rs_cong_list);
+	rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
+	spin_lock_init(&rs->rs_rdma_lock);
+	rs->rs_rdma_keys = RB_ROOT;
+	rs->rs_rx_traces = 0;
+	rs->rs_tos = 0;
+	rs->rs_conn = NULL;
+
+	/* make the socket visible to the info handlers */
+	spin_lock_bh(&rds_sock_lock);
+	list_add_tail(&rs->rs_item, &rds_sock_list);
+	rds_sock_count++;
+	spin_unlock_bh(&rds_sock_lock);
+
+	return 0;
+}
+
+/*
+ * net_proto_family create hook for AF_RDS.
+ *
+ * Only SOCK_SEQPACKET sockets with protocol 0 are accepted.
+ *
+ * Returns 0 on success, -ESOCKTNOSUPPORT for any other type or protocol,
+ * -ENOMEM if the sock cannot be allocated.
+ */
+static int rds_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+
+	if (sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+	if (protocol)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	return __rds_create(sock, sk, protocol);
+}
+
+/* Take a reference on the sock embedded in @rs. */
+void rds_sock_addref(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+
+	sock_hold(sk);
+}
+
+/* Drop a reference on the sock embedded in @rs. */
+void rds_sock_put(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+
+	sock_put(sk);
+}
+
+/* AF_RDS address family, registered with sock_register() in rds_init(). */
+static const struct net_proto_family rds_family_ops = {
+	.owner	= THIS_MODULE,
+	.family	= AF_RDS,
+	.create	= rds_create,
+};
+
+/*
+ * RDS_INFO_RECV_MESSAGES handler: report the messages sitting on the receive
+ * queues of all sockets bound to an IPv4-mapped address.
+ *
+ * At most @len / sizeof(struct rds_info_message) entries are copied through
+ * @iter, but every queued message is counted, so lens->nr reports the total
+ * number found.
+ */
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+			      struct rds_info_iterator *iter,
+			      struct rds_info_lengths *lens)
+{
+	unsigned int max_entries = len / sizeof(struct rds_info_message);
+	unsigned int seen = 0;
+	struct rds_incoming *inc;
+	struct rds_sock *rs;
+
+	spin_lock_bh(&rds_sock_lock);
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		/* This option only supports IPv4 sockets. */
+		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
+			continue;
+
+		read_lock(&rs->rs_recv_lock);
+
+		/* XXX too lazy to maintain counts.. */
+		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+			/* keep counting once the user buffer is full */
+			if (++seen > max_entries)
+				continue;
+
+			rds_inc_info_copy(inc, iter,
+					  inc->i_saddr.s6_addr32[3],
+					  rs->rs_bound_addr_v4,
+					  1);
+		}
+
+		read_unlock(&rs->rs_recv_lock);
+	}
+
+	spin_unlock_bh(&rds_sock_lock);
+
+	lens->nr = seen;
+	lens->each = sizeof(struct rds_info_message);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * RDS6_INFO_RECV_MESSAGES handler: report the messages sitting on the
+ * receive queues of all sockets, using struct rds6_info_message entries.
+ *
+ * At most @len / sizeof(struct rds6_info_message) entries are copied through
+ * @iter, but every queued message is counted, so lens->nr reports the total
+ * number found.
+ */
+static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
+			       struct rds_info_iterator *iter,
+			       struct rds_info_lengths *lens)
+{
+	unsigned int max_entries = len / sizeof(struct rds6_info_message);
+	unsigned int seen = 0;
+	struct rds_incoming *inc;
+	struct rds_sock *rs;
+
+	spin_lock_bh(&rds_sock_lock);
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		read_lock(&rs->rs_recv_lock);
+
+		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+			/* keep counting once the user buffer is full */
+			if (++seen > max_entries)
+				continue;
+
+			rds6_inc_info_copy(inc, iter, &inc->i_saddr,
+					   &rs->rs_bound_addr, 1);
+		}
+
+		read_unlock(&rs->rs_recv_lock);
+	}
+
+	spin_unlock_bh(&rds_sock_lock);
+
+	lens->nr = seen;
+	lens->each = sizeof(struct rds6_info_message);
+}
+#endif
+
+/*
+ * RDS_INFO_SOCKETS handler: emit one struct rds_info_socket per socket bound
+ * to an IPv4-mapped address.
+ *
+ * If the user buffer cannot hold rds_sock_count entries nothing is copied
+ * and lens->nr is set to rds_sock_count; otherwise lens->nr is the number of
+ * entries actually copied.
+ */
+static void rds_sock_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	unsigned int max_entries = len / sizeof(struct rds_info_socket);
+	struct rds_info_socket sinfo;
+	unsigned int cnt = 0;
+	struct rds_sock *rs;
+
+	spin_lock_bh(&rds_sock_lock);
+
+	if (max_entries < rds_sock_count) {
+		/* buffer too small: only report the total socket count */
+		cnt = rds_sock_count;
+	} else {
+		list_for_each_entry(rs, &rds_sock_list, rs_item) {
+			/* This option only supports IPv4 sockets. */
+			if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
+				continue;
+
+			sinfo.sndbuf = rds_sk_sndbuf(rs);
+			sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+			sinfo.bound_addr = rs->rs_bound_addr_v4;
+			sinfo.connected_addr = rs->rs_conn_addr_v4;
+			sinfo.bound_port = rs->rs_bound_port;
+			sinfo.connected_port = rs->rs_conn_port;
+			sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+			rds_info_copy(iter, &sinfo, sizeof(sinfo));
+			cnt++;
+		}
+	}
+
+	lens->nr = cnt;
+	lens->each = sizeof(struct rds_info_socket);
+
+	spin_unlock_bh(&rds_sock_lock);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * RDS6_INFO_SOCKETS handler: emit one struct rds6_info_socket per socket on
+ * the global list.
+ *
+ * Nothing is copied if the user buffer cannot hold rds_sock_count entries.
+ * lens->nr is set to rds_sock_count in either case.
+ */
+static void rds6_sock_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	unsigned int max_entries = len / sizeof(struct rds6_info_socket);
+	struct rds6_info_socket sinfo6;
+	struct rds_sock *rs;
+
+	spin_lock_bh(&rds_sock_lock);
+
+	if (max_entries >= rds_sock_count) {
+		list_for_each_entry(rs, &rds_sock_list, rs_item) {
+			sinfo6.sndbuf = rds_sk_sndbuf(rs);
+			sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
+			sinfo6.bound_addr = rs->rs_bound_addr;
+			sinfo6.connected_addr = rs->rs_conn_addr;
+			sinfo6.bound_port = rs->rs_bound_port;
+			sinfo6.connected_port = rs->rs_conn_port;
+			sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+			rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
+		}
+	}
+
+	lens->nr = rds_sock_count;
+	lens->each = sizeof(struct rds6_info_socket);
+
+	spin_unlock_bh(&rds_sock_lock);
+}
+#endif
+
+/*
+ * Module unload: unregister the socket family and protocol, shut down the
+ * RDS subsystems and remove the info handlers registered in rds_init().
+ */
+static void rds_exit(void)
+{
+ /* Stop new AF_RDS sockets from being created before tearing down. */
+ sock_unregister(rds_family_ops.family);
+ proto_unregister(&rds_proto);
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_sysctl_exit();
+ rds_threads_exit();
+ rds_stats_exit();
+ rds_page_exit();
+ rds_bind_lock_destroy();
+ rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+ rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
+#endif
+}
+module_exit(rds_exit);
+
+/* Filled with random bytes once, at the start of rds_init(). */
+u32 rds_gen_num;
+
+/*
+ * Module load: bring up the RDS subsystems in order, register the protocol
+ * and the AF_RDS socket family, then register the socket info handlers.
+ *
+ * On failure the steps already completed are undone in reverse order via the
+ * out_* labels.
+ *
+ * Returns 0 on success or the error from the step that failed.
+ */
+static int __init rds_init(void)
+{
+ int ret;
+
+ net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
+
+ ret = rds_bind_lock_init();
+ if (ret)
+ goto out;
+
+ ret = rds_conn_init();
+ if (ret)
+ goto out_bind;
+
+ ret = rds_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rds_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rds_stats_init();
+ if (ret)
+ goto out_sysctl;
+ ret = proto_register(&rds_proto, 1);
+ if (ret)
+ goto out_stats;
+ ret = sock_register(&rds_family_ops);
+ if (ret)
+ goto out_proto;
+
+ rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+ rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
+#endif
+
+ /* Success: skip the unwind labels (ret is 0 here). */
+ goto out;
+
+out_proto:
+ proto_unregister(&rds_proto);
+out_stats:
+ rds_stats_exit();
+out_sysctl:
+ rds_sysctl_exit();
+out_threads:
+ rds_threads_exit();
+out_conn:
+ rds_conn_exit();
+ /* NOTE(review): no matching cong/page init call is visible in this
+ * function - confirm these two exits are safe on this path.
+ */
+ rds_cong_exit();
+ rds_page_exit();
+out_bind:
+ rds_bind_lock_destroy();
+out:
+ return ret;
+}
+module_init(rds_init);
+
+/* Version and release date, used only to build the module strings below. */
+#define DRV_VERSION "4.0"
+#define DRV_RELDATE "Feb 12, 2009"
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+ " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000..97a29172a
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/ipv6.h>
+#include <linux/if_arp.h>
+#include <linux/jhash.h>
+#include <linux/ratelimit.h>
+#include "rds.h"
+
+/* Hash table of bound RDS sockets, keyed by rs_bound_key (the key layout is
+ * produced by __rds_create_bind_key()).
+ */
+static struct rhashtable bind_hash_table;
+
+/* Table parameters: the fixed-length key (rs_bound_key) and the hash linkage
+ * (rs_bound_node) are both embedded in struct rds_sock.
+ */
+static const struct rhashtable_params ht_parms = {
+ .nelem_hint = 768,
+ .key_len = RDS_BOUND_KEY_LEN,
+ .key_offset = offsetof(struct rds_sock, rs_bound_key),
+ .head_offset = offsetof(struct rds_sock, rs_bound_node),
+ .max_size = 16384,
+ .min_size = 1024,
+};
+
+/* Fill @key with the bind hash table key for (@addr, @port, @scope_id).
+ * The three fields are packed back to back, in that order.  @port is copied
+ * as given, i.e. in network byte order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+					 __be16 port, __u32 scope_id)
+{
+	u8 *port_pos = key + sizeof(*addr);
+	u8 *scope_pos = port_pos + sizeof(port);
+
+	memcpy(key, addr, sizeof(*addr));
+	memcpy(port_pos, &port, sizeof(port));
+	memcpy(scope_pos, &scope_id, sizeof(scope_id));
+}
+
+/*
+ * Look up the socket bound to (@addr, @port, @scope_id).  On a hit the
+ * socket is returned with an extra reference on its struct sock; NULL is
+ * returned when nothing usable is bound there.
+ *
+ * The rx path can race with rds_release(): a socket already flagged
+ * SOCK_DEAD, or whose refcount has already dropped to zero, is reported as
+ * not found so the rx path never gets a reference to it.
+ */
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+				__u32 scope_id)
+{
+	u8 key[RDS_BOUND_KEY_LEN];
+	struct rds_sock *rs;
+
+	__rds_create_bind_key(key, addr, port, scope_id);
+
+	rcu_read_lock();
+	rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
+	if (rs) {
+		struct sock *sk = rds_rs_to_sk(rs);
+
+		if (sock_flag(sk, SOCK_DEAD) ||
+		    !refcount_inc_not_zero(&sk->sk_refcnt))
+			rs = NULL;
+	}
+	rcu_read_unlock();
+
+	rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+		 ntohs(port));
+
+	return rs;
+}
+
+/* Insert @rs into the bind hash table for @addr.
+ *
+ * A non-zero *port is the only port tried. With *port == 0 the search starts
+ * at a random port and walks the 16-bit port space once; port 0 and
+ * RDS_FLAG_PROBE_PORT are never handed out.
+ *
+ * Returns 0 on success, with *port set to the bound port (network byte
+ * order) and a socket reference taken on behalf of the table. Returns
+ * -EINVAL if RDS_FLAG_PROBE_PORT was requested, -EADDRINUSE if no candidate
+ * port was free, or -ENOMEM if the table insert failed.
+ */
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+ __be16 *port, __u32 scope_id)
+{
+ int ret = -EADDRINUSE;
+ u16 rover, last;
+ u8 key[RDS_BOUND_KEY_LEN];
+
+ if (*port != 0) {
+ rover = be16_to_cpu(*port);
+ if (rover == RDS_FLAG_PROBE_PORT)
+ return -EINVAL;
+ last = rover;
+ } else {
+ rover = max_t(u16, get_random_u16(), 2);
+ last = rover - 1;
+ }
+
+ /* Each "continue" below jumps to the loop condition, which advances
+ * rover and stops once "last" has been tried.
+ */
+ do {
+ if (rover == 0)
+ rover++;
+
+ if (rover == RDS_FLAG_PROBE_PORT)
+ continue;
+ __rds_create_bind_key(key, addr, cpu_to_be16(rover),
+ scope_id);
+ if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
+ continue;
+
+ memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+ rs->rs_bound_addr = *addr;
+ net_get_random_once(&rs->rs_hash_initval,
+ sizeof(rs->rs_hash_initval));
+ rs->rs_bound_port = cpu_to_be16(rover);
+ rs->rs_bound_node.next = NULL;
+ /* Reference owned by the hash table; rds_remove_bound() drops it. */
+ rds_sock_addref(rs);
+ if (!rhashtable_insert_fast(&bind_hash_table,
+ &rs->rs_bound_node, ht_parms)) {
+ *port = rs->rs_bound_port;
+ rs->rs_bound_scope_id = scope_id;
+ ret = 0;
+ rdsdebug("rs %p binding to %pI6c:%d\n",
+ rs, addr, (int)ntohs(*port));
+ break;
+ } else {
+ /* NOTE(review): every insert failure is reported as -ENOMEM,
+ * whatever rhashtable_insert_fast() actually returned.
+ */
+ rs->rs_bound_addr = in6addr_any;
+ rds_sock_put(rs);
+ ret = -ENOMEM;
+ break;
+ }
+ } while (rover++ != last);
+
+ return ret;
+}
+
+/* Remove @rs from the bind hash table and release the reference the table
+ * held on it.  Does nothing when the socket is not bound, i.e. when
+ * rs_bound_addr is the unspecified address.
+ */
+void rds_remove_bound(struct rds_sock *rs)
+{
+	if (ipv6_addr_any(&rs->rs_bound_addr))
+		return;
+
+	rdsdebug("rs %p unbinding from %pI6c:%d\n",
+		 rs, &rs->rs_bound_addr,
+		 ntohs(rs->rs_bound_port));
+
+	rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
+
+	/* Mark the socket unbound before dropping the table's reference, so
+	 * that @rs is never written to after the put.
+	 */
+	rs->rs_bound_addr = in6addr_any;
+	rds_sock_put(rs);
+}
+
+/*
+ * bind() handler for RDS sockets.
+ *
+ * Accepts an AF_INET address or, with CONFIG_IPV6, an AF_INET6 address; an
+ * IPv4 address is handled internally as a v4-mapped IPv6 address.
+ * Unspecified, broadcast and multicast addresses are rejected, a link-local
+ * IPv6 address must carry a non-zero scope id, and a socket can be bound
+ * only once.  If no transport has been selected for the socket yet, one
+ * that accepts the address is looked up and attached.
+ *
+ * Returns 0 on success; -EINVAL for a short, unsupported or unacceptable
+ * address, a re-bind, or a scope id conflicting with the connected address;
+ * -ENOPROTOOPT if the already selected transport does not accept the
+ * address; -EADDRNOTAVAIL if no transport accepts it; otherwise the error
+ * from rds_add_bound().
+ */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct in6_addr v6addr, *binding_addr;
+	struct rds_transport *trans;
+	__u32 scope_id = 0;
+	int ret = 0;
+	__be16 port;
+
+	/* We allow an RDS socket to be bound to either IPv4 or IPv6
+	 * address.
+	 */
+	if (addr_len < offsetofend(struct sockaddr, sa_family))
+		return -EINVAL;
+	if (uaddr->sa_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+		if (addr_len < sizeof(struct sockaddr_in) ||
+		    sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+		    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+		    ipv4_is_multicast(sin->sin_addr.s_addr))
+			return -EINVAL;
+		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+		binding_addr = &v6addr;
+		port = sin->sin_port;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (uaddr->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+		int addr_type;
+
+		if (addr_len < sizeof(struct sockaddr_in6))
+			return -EINVAL;
+		addr_type = ipv6_addr_type(&sin6->sin6_addr);
+		if (!(addr_type & IPV6_ADDR_UNICAST)) {
+			__be32 addr4;
+
+			if (!(addr_type & IPV6_ADDR_MAPPED))
+				return -EINVAL;
+
+			/* It is a mapped address.  Need to do some sanity
+			 * checks.
+			 */
+			addr4 = sin6->sin6_addr.s6_addr32[3];
+			if (addr4 == htonl(INADDR_ANY) ||
+			    addr4 == htonl(INADDR_BROADCAST) ||
+			    ipv4_is_multicast(addr4))
+				return -EINVAL;
+		}
+		/* The scope ID must be specified for link local address. */
+		if (addr_type & IPV6_ADDR_LINKLOCAL) {
+			if (sin6->sin6_scope_id == 0)
+				return -EINVAL;
+			scope_id = sin6->sin6_scope_id;
+		}
+		binding_addr = &sin6->sin6_addr;
+		port = sin6->sin6_port;
+#endif
+	} else {
+		return -EINVAL;
+	}
+	lock_sock(sk);
+
+	/* RDS socket does not allow re-binding. */
+	if (!ipv6_addr_any(&rs->rs_bound_addr)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Socket is connected.  The binding address should have the same
+	 * scope ID as the connected address, except the case when one is
+	 * non-link local address (scope_id is 0).
+	 */
+	if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
+	    rs->rs_bound_scope_id &&
+	    scope_id != rs->rs_bound_scope_id) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The transport can be set using SO_RDS_TRANSPORT option before the
+	 * socket is bound.
+	 */
+	if (rs->rs_transport) {
+		trans = rs->rs_transport;
+		if (!trans->laddr_check ||
+		    trans->laddr_check(sock_net(sock->sk),
+				       binding_addr, scope_id) != 0) {
+			ret = -ENOPROTOOPT;
+			goto out;
+		}
+	} else {
+		trans = rds_trans_get_preferred(sock_net(sock->sk),
+						binding_addr, scope_id);
+		if (!trans) {
+			ret = -EADDRNOTAVAIL;
+			pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+					    __func__, binding_addr);
+			goto out;
+		}
+		rs->rs_transport = trans;
+	}
+
+	sock_set_flag(sk, SOCK_RCU_FREE);
+	ret = rds_add_bound(rs, binding_addr, &port, scope_id);
+	if (ret) {
+		/* The socket is about to forget its transport, so give back
+		 * the transport reference it holds instead of leaking it.
+		 * NOTE(review): rds_trans_get_preferred() above takes that
+		 * reference; confirm the SO_RDS_TRANSPORT path does too.
+		 */
+		rds_trans_put(rs->rs_transport);
+		rs->rs_transport = NULL;
+	}
+
+out:
+	release_sock(sk);
+	return ret;
+}
+
+/* Tear down the bind hash table; counterpart of rds_bind_lock_init(). */
+void rds_bind_lock_destroy(void)
+{
+ rhashtable_destroy(&bind_hash_table);
+}
+
+/* Set up the bind hash table using ht_parms. Returns whatever
+ * rhashtable_init() returns.
+ */
+int rds_bind_lock_init(void)
+{
+ return rhashtable_init(&bind_hash_table, &ht_parms);
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000..8b689ebbd
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs. An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested. As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up. This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently. This is much easier to implement than some
+ * finer-grained communication of per-port congestion. The sender does a very
+ * inexpensive bit test to test if the port it's about to send to is congested
+ * or not.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ *
+ * Sockets on this list are linked through rs_cong_list. The list is changed
+ * with the lock held for writing; rds_cong_map_updated() walks it under the
+ * read side to update each socket's rs_cong_notify mask.
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock. It's used so infrequently that it's worth keeping it
+ * global to simplify the locking. It's only used in the following
+ * circumstances:
+ *
+ * - on connection buildup to associate a conn with its maps
+ * - on map changes to inform conns of a new map to send
+ *
+ * It's sadly ordered under the socket callback lock and the connection lock.
+ * Receive paths can mark ports congested from interrupt context so the
+ * lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+/*
+ * Search the congestion map tree for @addr.  Returns the map already stored
+ * for that address, or NULL when there is none.  On a miss, a non-NULL
+ * @insert is linked into the tree at the position where the search ended
+ * (NULL is still returned).  The callers in this file hold rds_cong_lock
+ * around the call.
+ */
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
+					       struct rds_cong_map *insert)
+{
+	struct rb_node **link = &rds_cong_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct rds_cong_map *cur;
+		int cmp;
+
+		parent = *link;
+		cur = rb_entry(parent, struct rds_cong_map, m_rb_node);
+
+		cmp = rds_addr_cmp(addr, &cur->m_addr);
+		if (cmp == 0)
+			return cur;
+
+		link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	if (insert) {
+		rb_link_node(&insert->m_rb_node, parent, link);
+		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+	}
+	return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address.  Connections try and allocate
+ * these bitmaps in the process getting pointers to them.  The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ *
+ * Returns the map for @addr, creating it if needed, or NULL if a new map
+ * could not be allocated.  May sleep (GFP_KERNEL allocations).
+ */
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
+{
+	struct rds_cong_map *map = NULL;
+	struct rds_cong_map *ret;
+	unsigned long zp;
+	unsigned long i;
+	unsigned long flags;
+
+	/* Common case: the address already has a map.  Look it up first so we
+	 * do not allocate, zero and then free a complete bitmap for nothing.
+	 */
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	ret = rds_cong_tree_walk(addr, NULL);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+	if (ret)
+		goto out;
+
+	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+	if (!map)
+		return NULL;
+
+	map->m_addr = *addr;
+	init_waitqueue_head(&map->m_waitq);
+	INIT_LIST_HEAD(&map->m_conn_list);
+
+	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+		zp = get_zeroed_page(GFP_KERNEL);
+		if (zp == 0)
+			goto out;
+		map->m_page_addrs[i] = zp;
+	}
+
+	/* The lock was dropped while allocating, so someone else may have
+	 * inserted a map for this address meanwhile; the walk returns theirs
+	 * in that case and leaves ours unlinked.
+	 */
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	ret = rds_cong_tree_walk(addr, map);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+	if (!ret) {
+		/* Ours went into the tree; it must not be freed below. */
+		ret = map;
+		map = NULL;
+	}
+
+out:
+	if (map) {
+		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+			free_page(map->m_page_addrs[i]);
+		kfree(map);
+	}
+
+	rdsdebug("map %p for addr %pI6c\n", ret, addr);
+
+	return ret;
+}
+
+/*
+ * Put the conn on its local map's list (conn->c_lcong->m_conn_list), which
+ * is the list rds_cong_queue_updates() walks. This is called when the conn
+ * is really added to the hash. It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+/*
+ * Take the conn off the congestion map connection list it is linked on,
+ * under rds_cong_lock. list_del_init() re-initialises c_map_item afterwards.
+ */
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_del_init(&conn->c_map_item);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+/*
+ * Attach to @conn the congestion maps for its local (c_lcong) and foreign
+ * (c_fcong) addresses.  Returns 0 when both are available and -ENOMEM when
+ * either lookup came back NULL.
+ */
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+	struct rds_cong_map *local = rds_cong_from_addr(&conn->c_laddr);
+	struct rds_cong_map *foreign = rds_cong_from_addr(&conn->c_faddr);
+
+	conn->c_lcong = local;
+	conn->c_fcong = foreign;
+
+	if (!local || !foreign)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Schedule the send worker on path 0 of every connection on @map's
+ * connection list. A connection is skipped when bit 0 of its c_map_queued
+ * was already set or when it is being destroyed. Runs with rds_cong_lock
+ * held and interrupts masked.
+ */
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+ struct rds_connection *conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+
+ list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+ struct rds_conn_path *cp = &conn->c_path[0];
+
+ rcu_read_lock();
+ if (!test_and_set_bit(0, &conn->c_map_queued) &&
+ !rds_destroy_pending(cp->cp_conn)) {
+ rds_stats_inc(s_cong_update_queued);
+ /* We cannot inline the call to rds_send_xmit() here
+ * for two reasons (both pertaining to a TCP transport):
+ * 1. When we get here from the receive path, we
+ * are already holding the sock_lock (held by
+ * tcp_v4_rcv()). So inlining calls to
+ * tcp_setsockopt and/or tcp_sendmsg will deadlock
+ * when it tries to get the sock_lock())
+ * 2. Interrupts are masked so that we can mark the
+ * port congested from both send and recv paths.
+ * (See comment around declaration of rds_cong_lock).
+ * An attempt to get the sock_lock() here will
+ * therefore trigger warnings.
+ * Defer the xmit to rds_send_worker() instead.
+ */
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ }
+ rcu_read_unlock();
+ }
+
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+/*
+ * Note that @map has changed: bump the global congestion generation, then
+ * wake sleepers on the map's wait queue and on rds_poll_waitq.
+ *
+ * When @portmask is non-zero, every socket on the congestion monitor list
+ * has the bits it was waiting for (rs_cong_mask & @portmask) moved into
+ * rs_cong_notify, and sockets with pending notifications are woken.
+ */
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+	/* m_addr is a struct in6_addr, so it has to be printed with %pI6c;
+	 * %pI4 would show only its first four bytes.
+	 */
+	rdsdebug("waking map %p for %pI6c\n",
+		 map, &map->m_addr);
+	rds_stats_inc(s_cong_update_received);
+	atomic_inc(&rds_cong_generation);
+	if (waitqueue_active(&map->m_waitq))
+		wake_up(&map->m_waitq);
+	if (waitqueue_active(&rds_poll_waitq))
+		wake_up_all(&rds_poll_waitq);
+
+	if (portmask && !list_empty(&rds_cong_monitor)) {
+		unsigned long flags;
+		struct rds_sock *rs;
+
+		read_lock_irqsave(&rds_cong_monitor_lock, flags);
+		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+			spin_lock(&rs->rs_lock);
+			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+			rs->rs_cong_mask &= ~portmask;
+			spin_unlock(&rs->rs_lock);
+			if (rs->rs_cong_notify)
+				rds_wake_sk_sleep(rs);
+		}
+		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+	}
+}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
+
+/*
+ * Compare the caller's cached congestion generation (*recent) with the
+ * global one.  Returns 0 if they match; otherwise stores the current
+ * generation in *recent and returns 1.
+ */
+int rds_cong_updated_since(unsigned long *recent)
+{
+	unsigned long latest = atomic_read(&rds_cong_generation);
+
+	if (unlikely(*recent != latest)) {
+		*recent = latest;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * We're called under the locking that protects the sockets receive buffer
+ * consumption. This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long i;
+	unsigned long off;
+
+	/* m_addr is a struct in6_addr: print it with %pI6c, not %pI4. */
+	rdsdebug("setting congestion for %pI6c:%u in map %p\n",
+		 &map->m_addr, ntohs(port), map);
+
+	/* Page holding the port's bit, and the bit's offset in that page. */
+	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+	set_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+/* Clear the congestion bit for @port (network byte order) in @map. */
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long i;
+	unsigned long off;
+
+	/* m_addr is a struct in6_addr: print it with %pI6c, not %pI4. */
+	rdsdebug("clearing congestion for %pI6c:%u in map %p\n",
+		 &map->m_addr, ntohs(port), map);
+
+	/* Page holding the port's bit, and the bit's offset in that page. */
+	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+	clear_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+/* Return non-zero if the bit for @port (network byte order) is set in @map. */
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long hport = be16_to_cpu(port);
+	unsigned long page_idx = hport / RDS_CONG_MAP_PAGE_BITS;
+	unsigned long bit = hport % RDS_CONG_MAP_PAGE_BITS;
+
+	return test_bit_le(bit, (void *)map->m_page_addrs[page_idx]);
+}
+
+/*
+ * Put @rs on the congestion monitor list, unless its rs_cong_list entry is
+ * already linked somewhere (i.e. is not an empty list head).
+ */
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ if (list_empty(&rs->rs_cong_list))
+ list_add(&rs->rs_cong_list, &rds_cong_monitor);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+/*
+ * Take @rs off the congestion monitor list. Then, if the map for its bound
+ * address still has the bit for its bound port set, clear that bit and queue
+ * congestion updates for the map.
+ */
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+ struct rds_cong_map *map;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_del_init(&rs->rs_cong_list);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+ /* update congestion map for now-closed port */
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ /* Lookup only: a NULL insert argument never adds a map. */
+ map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+ rds_cong_clear_bit(map, rs->rs_bound_port);
+ rds_cong_queue_updates(map);
+ }
+}
+
+/*
+ * Wait, or refuse to wait, for @port to become uncongested in @map.
+ *
+ * Returns 0 straight away if the port is not congested.  A blocking caller
+ * sleeps interruptibly on the map's wait queue and gets the result of
+ * wait_event_interruptible().  A non-blocking caller gets -ENOBUFS; if @rs
+ * is a congestion-monitoring socket the port is first added to its
+ * rs_cong_mask and the bit tested once more, returning 0 if it has cleared.
+ */
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+		  struct rds_sock *rs)
+{
+	if (!rds_cong_test_bit(map, port))
+		return 0;
+
+	if (!nonblock) {
+		rds_stats_inc(s_cong_send_blocked);
+		rdsdebug("waiting on map %p for port %u\n", map,
+			 be16_to_cpu(port));
+
+		return wait_event_interruptible(map->m_waitq,
+						!rds_cong_test_bit(map, port));
+	}
+
+	if (rs && rs->rs_cong_monitor) {
+		unsigned long flags;
+
+		/* There is no atomic set_bit for a uint64_t, hence the
+		 * spinlock around the mask update.
+		 */
+		spin_lock_irqsave(&rs->rs_lock, flags);
+		rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+		spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+		/* A congestion update may have arrived while the mask was
+		 * being set, so look again before giving up.
+		 */
+		if (!rds_cong_test_bit(map, port))
+			return 0;
+	}
+
+	rds_stats_inc(s_cong_send_error);
+	return -ENOBUFS;
+}
+
+/*
+ * Empty the congestion map tree: unlink every map, free its bitmap pages
+ * (stopping at the first unset page slot) and free the map itself.
+ */
+void rds_cong_exit(void)
+{
+	struct rb_node *node;
+
+	for (node = rb_first(&rds_cong_tree); node;
+	     node = rb_first(&rds_cong_tree)) {
+		struct rds_cong_map *map =
+			rb_entry(node, struct rds_cong_map, m_rb_node);
+		unsigned long pg;
+
+		rdsdebug("freeing map %p\n", map);
+		rb_erase(&map->m_rb_node, &rds_cong_tree);
+
+		for (pg = 0; pg < RDS_CONG_MAP_PAGES; pg++) {
+			if (!map->m_page_addrs[pg])
+				break;
+			free_page(map->m_page_addrs[pg]);
+		}
+		kfree(map);
+	}
+}
+
+/*
+ * Build an RDS message that carries @conn's local congestion bitmap.  The
+ * result of rds_message_map_pages() is returned as is; when it is not an
+ * error pointer the header flags are set to RDS_FLAG_CONG_BITMAP first.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+	struct rds_cong_map *local_map = conn->c_lcong;
+	struct rds_message *msg;
+
+	msg = rds_message_map_pages(local_map->m_page_addrs,
+				    RDS_CONG_MAP_BYTES);
+	if (IS_ERR(msg))
+		return msg;
+
+	msg->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+	return msg;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000..b4cc699c5
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
+
+#include "rds.h"
+#include "loop.h"
+
+/* The connection hash has 2^12 = 4096 buckets; the mask reduces a hash value
+ * to a bucket index.
+ */
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+/* Incremented under rds_conn_lock each time __rds_conn_create() registers a conn. */
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+/*
+ * Pick the rds_conn_hash bucket for the (@laddr, @faddr) pair.  The local
+ * side contributes the last 32 bits of its address; the foreign side
+ * contributes a jhash of the whole address when IPv6 is enabled and its last
+ * 32 bits otherwise.  Both hash secrets are randomised once.
+ */
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+					  const struct in6_addr *faddr)
+{
+	static u32 rds6_hash_secret __read_mostly;
+	static u32 rds_hash_secret __read_mostly;
+	u32 local_part, foreign_part, bucket;
+
+	net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+	net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+#if IS_ENABLED(CONFIG_IPV6)
+	foreign_part = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+#else
+	foreign_part = (__force u32)faddr->s6_addr32[3];
+#endif
+	local_part = (__force u32)laddr->s6_addr32[3];
+
+	bucket = __inet_ehashfn(local_part, 0, foreign_part, 0,
+				rds_hash_secret);
+	bucket &= RDS_CONNECTION_HASH_MASK;
+
+	return &rds_conn_hash[bucket];
+}
+
+/* OR RDS_INFO_CONNECTION_FLAG_<suffix> into @var when @test is true. */
+#define rds_conn_info_set(var, test, suffix) do { \
+ if (test) \
+ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
+} while (0)
+
+/*
+ * Find in bucket @head the connection matching foreign and local address,
+ * transport, tos, network namespace and device index.  Returns it, or NULL
+ * if there is none.  The caller must hold either the rcu read lock or the
+ * connection spinlock.
+ */
+static struct rds_connection *rds_conn_lookup(struct net *net,
+					      struct hlist_head *head,
+					      const struct in6_addr *laddr,
+					      const struct in6_addr *faddr,
+					      struct rds_transport *trans,
+					      u8 tos, int dev_if)
+{
+	struct rds_connection *match = NULL;
+	struct rds_connection *conn;
+
+	hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+		if (!ipv6_addr_equal(&conn->c_faddr, faddr))
+			continue;
+		if (!ipv6_addr_equal(&conn->c_laddr, laddr))
+			continue;
+		if (conn->c_trans != trans)
+			continue;
+		if (conn->c_tos != tos)
+			continue;
+		if (net != rds_conn_net(conn))
+			continue;
+		if (conn->c_dev_if != dev_if)
+			continue;
+
+		match = conn;
+		break;
+	}
+	rdsdebug("returning conn %p for %pI6c -> %pI6c\n", match,
+		 laddr, faddr);
+	return match;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future. It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+static void rds_conn_path_reset(struct rds_conn_path *cp)
+{
+ struct rds_connection *conn = cp->cp_conn;
+
+ rdsdebug("connection %pI6c to %pI6c reset\n",
+ &conn->c_laddr, &conn->c_faddr);
+
+ rds_stats_inc(s_conn_reset);
+ /* Reset the send side of the path, then drop every flag bit on it. */
+ rds_send_path_reset(cp);
+ cp->cp_flags = 0;
+
+ /* Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS. */
+}
+
+/*
+ * Put one path of @conn into its initial state: state DOWN, empty send and
+ * retransmit queues, next transmit sequence number 1, and the send, receive,
+ * connect and shutdown work items bound to their handlers.  Also sets the
+ * connection's proposed protocol version.  @is_outgoing is not used here.
+ */
+static void __rds_conn_path_init(struct rds_connection *conn,
+				 struct rds_conn_path *cp, bool is_outgoing)
+{
+	/* Back pointer and connection-wide version proposal. */
+	cp->cp_conn = conn;
+	conn->c_proposed_version = RDS_PROTOCOL_VERSION;
+
+	/* Locks and wait queue. */
+	spin_lock_init(&cp->cp_lock);
+	mutex_init(&cp->cp_cm_lock);
+	init_waitqueue_head(&cp->cp_waitq);
+
+	/* Message queues. */
+	INIT_LIST_HEAD(&cp->cp_send_queue);
+	INIT_LIST_HEAD(&cp->cp_retrans);
+
+	/* Scalar state. */
+	atomic_set(&cp->cp_state, RDS_CONN_DOWN);
+	cp->cp_next_tx_seq = 1;
+	cp->cp_send_gen = 0;
+	cp->cp_reconnect_jiffies = 0;
+	cp->cp_flags = 0;
+
+	/* Deferred work. */
+	INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
+	INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
+	INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
+	INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ *
+ * Returns the existing or newly created connection, or an ERR_PTR() on
+ * failure.
+ */
+static struct rds_connection *__rds_conn_create(struct net *net,
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, u8 tos,
+ int is_outgoing,
+ int dev_if)
+{
+ struct rds_connection *conn, *parent = NULL;
+ struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+ struct rds_transport *loop_trans;
+ unsigned long flags;
+ int ret, i;
+ int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
+
+ rcu_read_lock();
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rds_loop_transport &&
+ ipv6_addr_equal(laddr, faddr) &&
+ !is_outgoing) {
+ /* This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP. */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ rcu_read_unlock();
+ if (conn)
+ goto out;
+
+ /* Nothing usable exists yet: build a new conn without holding
+ * rds_conn_lock.
+ */
+ conn = kmem_cache_zalloc(rds_conn_slab, gfp);
+ if (!conn) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp);
+ if (!conn->c_path) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_laddr = *laddr;
+ conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+ conn->c_faddr = *faddr;
+ conn->c_dev_if = dev_if;
+ conn->c_tos = tos;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* If the local address is link local, set c_bound_if to be the
+ * index used for this connection. Otherwise, set it to 0 as
+ * the socket is not bound to an interface. c_bound_if is used
+ * to look up a socket when a packet is received
+ */
+ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
+ conn->c_bound_if = dev_if;
+ else
+#endif
+ conn->c_bound_if = 0;
+
+ rds_conn_net_set(conn, net);
+
+ ret = rds_cong_get_maps(conn);
+ if (ret) {
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+ * can bind to the destination address then we'd rather the messages
+ * flow through loopback rather than either transport.
+ */
+ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
+ if (loop_trans) {
+ /* Only the answer matters here; give the reference straight back. */
+ rds_trans_put(loop_trans);
+ conn->c_loopback = 1;
+ if (trans->t_prefer_loopback) {
+ if (likely(is_outgoing)) {
+ /* "outgoing" connection to local address.
+ * Protocol says it wants the connection
+ * handled by the loopback transport.
+ * This is what TCP does.
+ */
+ trans = &rds_loop_transport;
+ } else {
+ /* No transport currently in use
+ * should end up here, but if it
+ * does, reset/destroy the connection.
+ */
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(-EOPNOTSUPP);
+ goto out;
+ }
+ }
+ }
+
+ conn->c_trans = trans;
+
+ init_waitqueue_head(&conn->c_hs_waitq);
+ for (i = 0; i < npaths; i++) {
+ __rds_conn_path_init(conn, &conn->c_path[i],
+ is_outgoing);
+ conn->c_path[i].cp_index = i;
+ }
+ /* conn_alloc runs inside the rcu read section, which is held until the
+ * conn has been published (or discarded) below; hence GFP_ATOMIC
+ * rather than @gfp.
+ */
+ rcu_read_lock();
+ if (rds_destroy_pending(conn))
+ ret = -ENETDOWN;
+ else
+ ret = trans->conn_alloc(conn, GFP_ATOMIC);
+ if (ret) {
+ rcu_read_unlock();
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+ conn, laddr, faddr,
+ strnlen(trans->t_name, sizeof(trans->t_name)) ?
+ trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
+
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock. If we won, we complete
+ * init and return our conn. If we lost, we rollback and return the
+ * other one.
+ */
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ if (parent) {
+ /* Creating passive conn */
+ if (parent->c_passive) {
+ /* NOTE(review): only path 0's transport data is freed
+ * here, unlike the loop over all paths in the "found"
+ * branch below.
+ */
+ trans->conn_free(conn->c_path[0].cp_transport_data);
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = parent->c_passive;
+ } else {
+ parent->c_passive = conn;
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+ } else {
+ /* Creating normal conn */
+ struct rds_connection *found;
+
+ found = rds_conn_lookup(net, head, laddr, faddr, trans,
+ tos, dev_if);
+ if (found) {
+ struct rds_conn_path *cp;
+ int i;
+
+ for (i = 0; i < npaths; i++) {
+ cp = &conn->c_path[i];
+ /* The ->conn_alloc invocation may have
+ * allocated resource for all paths, so all
+ * of them may have to be freed here.
+ */
+ if (cp->cp_transport_data)
+ trans->conn_free(cp->cp_transport_data);
+ }
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = found;
+ } else {
+ conn->c_my_gen_num = rds_gen_num;
+ conn->c_peer_gen_num = 0;
+ hlist_add_head_rcu(&conn->c_hash_node, head);
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+ }
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+ rcu_read_unlock();
+
+out:
+ return conn;
+}
+
+/*
+ * Look up or create the connection for (@laddr, @faddr) over @trans, with
+ * is_outgoing = 0. Note the argument order: this wrapper takes @tos before
+ * @gfp, while __rds_conn_create() takes gfp before tos.
+ */
+struct rds_connection *rds_conn_create(struct net *net,
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, u8 tos,
+ gfp_t gfp, int dev_if)
+{
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
+}
+EXPORT_SYMBOL_GPL(rds_conn_create);
+
+/* Thin wrapper around __rds_conn_create() that passes 1 for the flag
+ * argument which rds_conn_create() leaves at 0.
+ */
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+						const struct in6_addr *laddr,
+						const struct in6_addr *faddr,
+						struct rds_transport *trans,
+						u8 tos, gfp_t gfp, int dev_if)
+{
+	struct rds_connection *conn;
+
+	conn = __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1,
+				 dev_if);
+	return conn;
+}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
+
+/*
+ * Shut down one connection path and, when the owning connection is still
+ * in the conn hash, queue a reconnect for it.
+ *
+ * Unless the path is already RDS_CONN_DOWN, it is moved from UP or ERROR to
+ * DISCONNECTING under cp_cm_lock, the function waits for the RDS_IN_XMIT and
+ * RDS_RECV_REFILL flag bits to clear, invokes the transport's
+ * conn_path_shutdown() hook, resets the path and finally moves it to DOWN.
+ * If either state transition fails, the problem is reported through
+ * rds_conn_path_error() and the function returns without queueing a
+ * reconnect.
+ */
+void rds_conn_shutdown(struct rds_conn_path *cp)
+{
+ struct rds_connection *conn = cp->cp_conn;
+
+ /* shut it down unless it's down already */
+ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+ * deadlocking with the CM handler. Instead, the CM event
+ * handler is supposed to check for state DISCONNECTING
+ */
+ mutex_lock(&cp->cp_cm_lock);
+ if (!rds_conn_path_transition(cp, RDS_CONN_UP,
+ RDS_CONN_DISCONNECTING) &&
+ !rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DISCONNECTING)) {
+ rds_conn_path_error(cp,
+ "shutdown called in state %d\n",
+ atomic_read(&cp->cp_state));
+ mutex_unlock(&cp->cp_cm_lock);
+ return;
+ }
+ mutex_unlock(&cp->cp_cm_lock);
+
+ /* block until the RDS_IN_XMIT and RDS_RECV_REFILL bits are clear */
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_IN_XMIT, &cp->cp_flags));
+ wait_event(cp->cp_waitq,
+ !test_bit(RDS_RECV_REFILL, &cp->cp_flags));
+
+ conn->c_trans->conn_path_shutdown(cp);
+ rds_conn_path_reset(cp);
+
+ if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
+ RDS_CONN_DOWN) &&
+ !rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DOWN)) {
+ /* This can happen - eg when we're in the middle of tearing
+ * down the connection, and someone unloads the rds module.
+ * Quite reproducible with loopback connections.
+ * Mostly harmless.
+ *
+ * Note that this also happens with rds-tcp because
+ * we could have triggered rds_conn_path_drop in irq
+ * mode from rds_tcp_state change on the receipt of
+ * a FIN, thus we need to recheck for RDS_CONN_ERROR
+ * here.
+ */
+ rds_conn_path_error(cp, "%s: failed to transition "
+ "to state DOWN, current state "
+ "is %d\n", __func__,
+ atomic_read(&cp->cp_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work_sync(&cp->cp_conn_w);
+ rcu_read_lock();
+ /* the RCU read lock is dropped before rds_queue_reconnect() is called */
+ if (!hlist_unhashed(&conn->c_hash_node)) {
+ rcu_read_unlock();
+ rds_queue_reconnect(cp);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+/* destroy a single rds_conn_path. rds_conn_destroy() iterates over
+ * all paths using rds_conn_path_destroy()
+ *
+ * Paths without transport data are left untouched. For the others the
+ * send/recv work is cancelled, the path is dropped with destroy == true,
+ * cp_down_w is flushed, the queued messages are released and finally the
+ * transport's conn_free() hook is called on cp_transport_data.
+ */
+static void rds_conn_path_destroy(struct rds_conn_path *cp)
+{
+ struct rds_message *rm, *rtmp;
+
+ /* nothing to tear down when no transport data is attached */
+ if (!cp->cp_transport_data)
+ return;
+
+ /* make sure lingering queued work won't try to ref the conn */
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ cancel_delayed_work_sync(&cp->cp_recv_w);
+
+ /* queue cp_down_w via rds_conn_path_drop() and wait for it to finish */
+ rds_conn_path_drop(cp, true);
+ flush_work(&cp->cp_down_w);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &cp->cp_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (cp->cp_xmit_rm)
+ rds_message_put(cp->cp_xmit_rm);
+
+ /* no work item of this path may still be pending at this point */
+ WARN_ON(delayed_work_pending(&cp->cp_send_w));
+ WARN_ON(delayed_work_pending(&cp->cp_recv_w));
+ WARN_ON(delayed_work_pending(&cp->cp_conn_w));
+ WARN_ON(work_pending(&cp->cp_down_w));
+
+ cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ *
+ * The conn is removed from the hash first (followed by an RCU grace period),
+ * then every path is destroyed, the conn is detached from the congestion
+ * map, the path array and the conn itself are freed, and rds_conn_count is
+ * decremented under rds_conn_lock.
+ */
+void rds_conn_destroy(struct rds_connection *conn)
+{
+	unsigned long flags;
+	int i;
+	struct rds_conn_path *cp;
+	int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
+
+	/* c_laddr and c_faddr are struct in6_addr, so they must be printed
+	 * with an IPv6 specifier; %pI4 would only show their first four
+	 * bytes.
+	 */
+	rdsdebug("freeing conn %p for %pI6c -> "
+		 "%pI6c\n", conn, &conn->c_laddr,
+		 &conn->c_faddr);
+
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
+	hlist_del_init_rcu(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
+	synchronize_rcu();
+
+	/* shut the connection down */
+	for (i = 0; i < npaths; i++) {
+		cp = &conn->c_path[i];
+		rds_conn_path_destroy(cp);
+		BUG_ON(!list_empty(&cp->cp_retrans));
+	}
+
+	/*
+	 * The congestion maps aren't freed up here.  They're
+	 * freed by rds_cong_exit() after all the connections
+	 * have been freed.
+	 */
+	rds_cong_remove_conn(conn);
+
+	kfree(conn->c_path);
+	kmem_cache_free(rds_conn_slab, conn);
+
+	spin_lock_irqsave(&rds_conn_lock, flags);
+	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
+
+/* Forward @inc to the IPv6 or IPv4 info-copy helper.
+ *
+ * With CONFIG_IPV6 enabled and @isv6 set, @saddr and @daddr are handed to
+ * rds6_inc_info_copy() unchanged. Otherwise both pointers are dereferenced
+ * as __be32 and the resulting values are passed to rds_inc_info_copy(), so
+ * in that case they have to point at the 32-bit addresses themselves.
+ */
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ void *saddr, void *daddr, int flip, bool isv6)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (isv6)
+ rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+ else
+#endif
+ /* this statement is the else branch when CONFIG_IPV6 is enabled */
+ rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+ *(__be32 *)daddr, flip);
+}
+
+/*
+ * Common worker for the send-queue / retransmit-queue info handlers.
+ *
+ * @len:       size of the caller's buffer in bytes; converted to a number
+ *             of records below
+ * @iter:      destination iterator for the copied records
+ * @lens:      receives the total number of messages found and the size of
+ *             one record
+ * @want_send: non-zero walks cp_send_queue, zero walks cp_retrans
+ * @isv6:      selects the rds6_info_message layout; when false, IPv6
+ *             connections are skipped
+ *
+ * Every message is counted, but only the first @len of them are copied, so
+ * the caller can learn how large a buffer it needs.
+ */
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+				      struct rds_info_iterator *iter,
+				      struct rds_info_lengths *lens,
+				      int want_send, bool isv6)
+{
+	struct hlist_head *head;
+	struct list_head *list;
+	struct rds_connection *conn;
+	struct rds_message *rm;
+	unsigned int total = 0;
+	unsigned long flags;
+	size_t i;
+	int j;
+
+	if (isv6)
+		len /= sizeof(struct rds6_info_message);
+	else
+		len /= sizeof(struct rds_info_message);
+
+	rcu_read_lock();
+
+	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+	     i++, head++) {
+		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+			struct rds_conn_path *cp;
+			void *saddr, *daddr;
+			int npaths;
+
+			if (!isv6 && conn->c_isv6)
+				continue;
+
+			/* __rds_inc_msg_cp() reads a __be32 through these
+			 * pointers in the IPv4 case, so point them at the
+			 * word that holds the IPv4 address (s6_addr32[3])
+			 * rather than at the start of the in6_addr.
+			 */
+			if (isv6) {
+				saddr = &conn->c_laddr;
+				daddr = &conn->c_faddr;
+			} else {
+				saddr = &conn->c_laddr.s6_addr32[3];
+				daddr = &conn->c_faddr.s6_addr32[3];
+			}
+
+			npaths = (conn->c_trans->t_mp_capable ?
+				  RDS_MPATH_WORKERS : 1);
+
+			for (j = 0; j < npaths; j++) {
+				cp = &conn->c_path[j];
+				if (want_send)
+					list = &cp->cp_send_queue;
+				else
+					list = &cp->cp_retrans;
+
+				spin_lock_irqsave(&cp->cp_lock, flags);
+
+				/* XXX too lazy to maintain counts.. */
+				list_for_each_entry(rm, list, m_conn_item) {
+					total++;
+					if (total <= len)
+						__rds_inc_msg_cp(&rm->m_inc,
+								 iter,
+								 saddr,
+								 daddr,
+								 0, isv6);
+				}
+
+				spin_unlock_irqrestore(&cp->cp_lock, flags);
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	lens->nr = total;
+	if (isv6)
+		lens->each = sizeof(struct rds6_info_message);
+	else
+		lens->each = sizeof(struct rds_info_message);
+}
+
+/* IPv4 flavour of rds_conn_message_info_cmn(): isv6 is false. */
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+				  struct rds_info_iterator *iter,
+				  struct rds_info_lengths *lens,
+				  int want_send)
+{
+	const bool isv6 = false;
+
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, isv6);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 flavour of rds_conn_message_info_cmn(): isv6 is true. */
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+				   struct rds_info_iterator *iter,
+				   struct rds_info_lengths *lens,
+				   int want_send)
+{
+	const bool isv6 = true;
+
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, isv6);
+}
+#endif
+
+/* RDS_INFO_SEND_MESSAGES handler: report the send queues (want_send = 1). */
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+				       struct rds_info_iterator *iter,
+				       struct rds_info_lengths *lens)
+{
+	const int want_send = 1;
+
+	rds_conn_message_info(sock, len, iter, lens, want_send);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* RDS6_INFO_SEND_MESSAGES handler: report the send queues (want_send = 1). */
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+					struct rds_info_iterator *iter,
+					struct rds_info_lengths *lens)
+{
+	const int want_send = 1;
+
+	rds6_conn_message_info(sock, len, iter, lens, want_send);
+}
+#endif
+
+/* RDS_INFO_RETRANS_MESSAGES handler: report the retransmit queues
+ * (want_send = 0).
+ */
+static void rds_conn_message_info_retrans(struct socket *sock,
+					  unsigned int len,
+					  struct rds_info_iterator *iter,
+					  struct rds_info_lengths *lens)
+{
+	const int want_send = 0;
+
+	rds_conn_message_info(sock, len, iter, lens, want_send);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* RDS6_INFO_RETRANS_MESSAGES handler: report the retransmit queues
+ * (want_send = 0).
+ */
+static void rds6_conn_message_info_retrans(struct socket *sock,
+					   unsigned int len,
+					   struct rds_info_iterator *iter,
+					   struct rds_info_lengths *lens)
+{
+	const int want_send = 0;
+
+	rds6_conn_message_info(sock, len, iter, lens, want_send);
+}
+#endif
+
+/*
+ * Call @visitor on every connection in the conn hash, under the RCU read
+ * lock.  When the visitor returns non-zero, @buffer is expected to hold one
+ * record of @item_len bytes: it is copied to @iter while @len still has
+ * room for it, and it is counted in lens->nr in any case so the caller can
+ * resize its buffer.
+ */
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens,
+			    int (*visitor)(struct rds_connection *, void *),
+			    u64 *buffer,
+			    size_t item_len)
+{
+	struct rds_connection *conn;
+	size_t bucket;
+
+	rcu_read_lock();
+
+	lens->each = item_len;
+	lens->nr = 0;
+
+	for (bucket = 0; bucket < ARRAY_SIZE(rds_conn_hash); bucket++) {
+		struct hlist_head *head = &rds_conn_hash[bucket];
+
+		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+			/* XXX no c_lock usage.. */
+			if (visitor(conn, buffer)) {
+				/* Copy only what still fits, but count
+				 * every accepted item.
+				 */
+				if (len >= item_len) {
+					rds_info_copy(iter, buffer, item_len);
+					len -= item_len;
+				}
+				lens->nr++;
+			}
+		}
+	}
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
+
+/*
+ * Per-path variant of rds_for_each_conn_info(): @visitor is called with the
+ * first path of every connection in the conn hash.  Accepted records are
+ * copied to @iter while @len has room and are always counted in lens->nr.
+ */
+static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+				    struct rds_info_iterator *iter,
+				    struct rds_info_lengths *lens,
+				    int (*visitor)(struct rds_conn_path *, void *),
+				    u64 *buffer,
+				    size_t item_len)
+{
+	struct rds_connection *conn;
+	size_t bucket;
+
+	rcu_read_lock();
+
+	lens->each = item_len;
+	lens->nr = 0;
+
+	for (bucket = 0; bucket < ARRAY_SIZE(rds_conn_hash); bucket++) {
+		struct hlist_head *head = &rds_conn_hash[bucket];
+
+		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+			/* XXX We only copy the information from the first
+			 * path for now.  The problem is that if there are
+			 * more than one underlying paths, we cannot report
+			 * information of all of them using the existing
+			 * API.  For example, there is only one next_tx_seq,
+			 * which path's next_tx_seq should we report?  It is
+			 * a bug in the design of MPRDS.
+			 */
+			struct rds_conn_path *first = &conn->c_path[0];
+
+			/* XXX no cp_lock usage.. */
+			if (visitor(first, buffer)) {
+				/* Copy only what still fits, but count
+				 * every accepted item.
+				 */
+				if (len >= item_len) {
+					rds_info_copy(iter, buffer, item_len);
+					len -= item_len;
+				}
+				lens->nr++;
+			}
+		}
+	}
+
+	rcu_read_unlock();
+}
+
+/* rds_walk_conn_path_info() visitor for IPv4: fill @buffer, treated as a
+ * struct rds_info_connection, from @cp and its connection.  Returns 0 to
+ * skip IPv6 connections and 1 once the record has been written.
+ */
+static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	struct rds_info_connection *cinfo;
+
+	if (conn->c_isv6)
+		return 0;
+
+	cinfo = buffer;
+
+	cinfo->laddr = conn->c_laddr.s6_addr32[3];
+	cinfo->faddr = conn->c_faddr.s6_addr32[3];
+	cinfo->tos = conn->c_tos;
+	cinfo->next_tx_seq = cp->cp_next_tx_seq;
+	cinfo->next_rx_seq = cp->cp_next_rx_seq;
+	strncpy(cinfo->transport, conn->c_trans->t_name,
+		sizeof(cinfo->transport));
+
+	/* XXX Future: return the state rather than these funky bits */
+	cinfo->flags = 0;
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+			  SENDING);
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_UP,
+			  CONNECTED);
+
+	return 1;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* rds_walk_conn_path_info() visitor for the IPv6 record layout: fill
+ * @buffer, treated as a struct rds6_info_connection, from @cp and its
+ * connection.  There is no skip or error case, so this always returns 1.
+ */
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	struct rds6_info_connection *cinfo6 = buffer;
+
+	cinfo6->laddr = conn->c_laddr;
+	cinfo6->faddr = conn->c_faddr;
+	cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+	cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+	strncpy(cinfo6->transport, conn->c_trans->t_name,
+		sizeof(cinfo6->transport));
+
+	/* XXX Future: return the state rather than these funky bits */
+	cinfo6->flags = 0;
+	rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+			  SENDING);
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_UP,
+			  CONNECTED);
+
+	return 1;
+}
+#endif
+
+/* RDS_INFO_CONNECTIONS handler: walk all connections with
+ * rds_conn_info_visitor(), using an on-stack scratch record rounded up to
+ * whole u64 words.
+ */
+static void rds_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8];
+	const size_t item_len = sizeof(struct rds_info_connection);
+
+	rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor,
+				buffer, item_len);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* RDS6_INFO_CONNECTIONS handler: walk all connections with
+ * rds6_conn_info_visitor(), using an on-stack scratch record rounded up to
+ * whole u64 words.
+ */
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
+	const size_t item_len = sizeof(struct rds6_info_connection);
+
+	rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor,
+				buffer, item_len);
+}
+#endif
+
+/* Set up the connection layer: pernet callback, the rds_connection slab
+ * cache and the connection/message info handlers.  Returns 0 on success,
+ * the rds_loop_net_init() error, or -ENOMEM if the cache cannot be created.
+ */
+int rds_conn_init(void)
+{
+	int err;
+
+	err = rds_loop_net_init(); /* register pernet callback */
+	if (err)
+		return err;
+
+	rds_conn_slab = kmem_cache_create("rds_connection",
+					  sizeof(struct rds_connection),
+					  0, 0, NULL);
+	if (rds_conn_slab) {
+		rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+		rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+				       rds_conn_message_info_send);
+		rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+				       rds_conn_message_info_retrans);
+#if IS_ENABLED(CONFIG_IPV6)
+		rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+		rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+				       rds6_conn_message_info_send);
+		rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+				       rds6_conn_message_info_retrans);
+#endif
+		return 0;
+	}
+
+	/* cache creation failed: undo the pernet registration */
+	rds_loop_net_exit();
+	return -ENOMEM;
+}
+
+/* Tear down what rds_conn_init() set up: the pernet callback, the
+ * rds_connection slab cache and the info handlers it registered. The
+ * loopback transport is shut down here as well via rds_loop_exit().
+ */
+void rds_conn_exit(void)
+{
+ rds_loop_net_exit(); /* unregister pernet callback */
+ rds_loop_exit();
+
+ /* NOTE(review): rds_conn_hash is an array (see the ARRAY_SIZE() walks
+ * in this file), so this expression only examines its first bucket.
+ */
+ WARN_ON(!hlist_empty(rds_conn_hash));
+
+ kmem_cache_destroy(rds_conn_slab);
+
+ rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+ rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+ rds6_conn_message_info_send);
+ rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+ rds6_conn_message_info_retrans);
+#endif
+}
+
+/*
+ * Force a disconnect: mark the path RDS_CONN_ERROR and queue its cp_down_w
+ * work.  When @destroy is false the work is not queued if
+ * rds_destroy_pending() reports that the connection is being destroyed.
+ */
+void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
+{
+	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
+
+	rcu_read_lock();
+	if (destroy || !rds_destroy_pending(cp->cp_conn))
+		queue_work(rds_wq, &cp->cp_down_w);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_conn_path_drop);
+
+/* Drop path 0 of @conn; warns if the transport is flagged t_mp_capable. */
+void rds_conn_drop(struct rds_connection *conn)
+{
+	struct rds_conn_path *cp;
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+
+	cp = &conn->c_path[0];
+	rds_conn_path_drop(cp, false);
+}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
+
+/*
+ * Trigger a connect if the path is down.  A delayed reconnect may already
+ * have been scheduled, in which case RDS_RECONNECT_PENDING is set and we
+ * must not interfere.  Nothing is queued while the connection is being
+ * destroyed.
+ */
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
+{
+	rcu_read_lock();
+	if (!rds_destroy_pending(cp->cp_conn) &&
+	    rds_conn_path_state(cp) == RDS_CONN_DOWN &&
+	    !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
+		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
+
+/* Check connectivity of all paths.  Path 0 is always checked, the others
+ * only while their index is below conn->c_npaths.
+ */
+void rds_check_all_paths(struct rds_connection *conn)
+{
+	int i;
+
+	rds_conn_path_connect_if_down(&conn->c_path[0]);
+	for (i = 1; i < conn->c_npaths; i++)
+		rds_conn_path_connect_if_down(&conn->c_path[i]);
+}
+
+/* Single-path convenience wrapper around rds_conn_path_connect_if_down()
+ * for path 0; warns if the transport is flagged t_mp_capable.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	struct rds_conn_path *cp;
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+
+	cp = &conn->c_path[0];
+	rds_conn_path_connect_if_down(cp);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/* Print the printf-style message given by @fmt and its arguments with
+ * vprintk(), then drop the path (destroy == false).
+ */
+void
+__rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	rds_conn_path_drop(cp, false);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000..9826fe7f9
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/addrconf.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "ib.h"
+#include "ib_mr.h"
+
+/* MR pool size limits; rds_ib_add_one() additionally caps them by the
+ * device's max_mr when that is non-zero.
+ */
+static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+/* set to 1 by rds_ib_exit(); read by rds_ib_is_unloading() */
+static atomic_t rds_ib_unloading;
+
+/* The three variables are unsigned int, so they are registered with the
+ * matching "uint" parameter type; with "int" a negative value would be
+ * accepted and stored as a huge unsigned number.
+ */
+module_param(rds_ib_mr_1m_pool_size, uint, 0444);
+MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
+module_param(rds_ib_mr_8k_pool_size, uint, 0444);
+MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
+module_param(rds_ib_retry_count, uint, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
+
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
+/* the list head is initialised at run time by rds_ib_init() */
+struct list_head rds_ib_devices;
+
+/* NOTE: if also grabbing ibdev lock, grab this first */
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+/* walked by rds_ib_nodev_connect() each time a device has been added */
+LIST_HEAD(ib_nodev_conns);
+
+/* Ask every connection on the ib_nodev_conns list to connect if it is
+ * currently down.  The list is walked under ib_nodev_conns_lock.
+ */
+static void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *nodev_ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(nodev_ic, &ib_nodev_conns, ib_node) {
+		rds_conn_connect_if_down(nodev_ic->conn);
+	}
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+/* Drop path 0 (with destroy == true) of every connection on the device's
+ * conn_list, holding the device spinlock with interrupts disabled.
+ */
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *dev_ic;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, irqflags);
+	list_for_each_entry(dev_ic, &rds_ibdev->conn_list, ib_node) {
+		struct rds_conn_path *cp = &dev_ic->conn->c_path[0];
+
+		rds_conn_path_drop(cp, true);
+	}
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, irqflags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freeing off into a work struct in krdsd.
+ */
+/* Work function (free_work) that releases an rds_ib_device: both MR pools
+ * and the PD when present, every entry on ipaddr_list, the vector_load
+ * array and finally the device structure itself.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_ipaddr *addr, *next_addr;
+
+	rds_ibdev = container_of(work, struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_8k_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
+	if (rds_ibdev->mr_1m_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(addr, next_addr, &rds_ibdev->ipaddr_list,
+				 list) {
+		list_del(&addr->list);
+		kfree(addr);
+	}
+
+	kfree(rds_ibdev->vector_load);
+	kfree(rds_ibdev);
+}
+
+/* Release one reference on @rds_ibdev.  Dropping the last one queues
+ * free_work (rds_ib_dev_free()) on rds_wq.
+ */
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(refcount_read(&rds_ibdev->refcount) == 0);
+
+	if (!refcount_dec_and_test(&rds_ibdev->refcount))
+		return;
+
+	queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+/*
+ * ib_client "add" callback (see rds_ib_client): allocate and set up the
+ * rds_ib_device state for @device.
+ *
+ * Returns -EOPNOTSUPP for devices that are not RDMA_NODE_IB_CA or that lack
+ * IB_DEVICE_MEM_MGT_EXTENSIONS, -ENOMEM or the PD / MR pool error code when
+ * setup fails, and 0 on success.
+ */
+static int rds_ib_add_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ int ret;
+
+ /* Only handle IB (no iWARP) devices */
+ if (device->node_type != RDMA_NODE_IB_CA)
+ return -EOPNOTSUPP;
+
+ /* Device must support FRWR */
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return -EOPNOTSUPP;
+
+ rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+ ibdev_to_node(device));
+ if (!rds_ibdev)
+ return -ENOMEM;
+
+ spin_lock_init(&rds_ibdev->spinlock);
+ /* initial reference; released through put_dev on failure */
+ refcount_set(&rds_ibdev->refcount, 1);
+ INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+
+ INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+
+ rds_ibdev->max_wrs = device->attrs.max_qp_wr;
+ rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
+
+ /* ODP needs the kernel capability flag plus RC read and write support */
+ rds_ibdev->odp_capable =
+ !!(device->attrs.kernel_cap_flags &
+ IBK_ON_DEMAND_PAGING) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_WRITE) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_READ);
+
+ /* when the device reports max_mr, cap each pool by a share of it;
+ * otherwise use the module parameter value as is
+ */
+ rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
+ min_t(unsigned int, (device->attrs.max_mr / 2),
+ rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
+
+ rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
+ rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
+
+ rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
+
+ /* one int per completion vector */
+ rds_ibdev->vector_load = kcalloc(device->num_comp_vectors,
+ sizeof(int),
+ GFP_KERNEL);
+ if (!rds_ibdev->vector_load) {
+ pr_err("RDS/IB: %s failed to allocate vector memory\n",
+ __func__);
+ ret = -ENOMEM;
+ goto put_dev;
+ }
+
+ rds_ibdev->dev = device;
+ rds_ibdev->pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(rds_ibdev->pd)) {
+ ret = PTR_ERR(rds_ibdev->pd);
+ /* reset to NULL so rds_ib_dev_free() skips ib_dealloc_pd() */
+ rds_ibdev->pd = NULL;
+ goto put_dev;
+ }
+
+ rds_ibdev->mr_1m_pool =
+ rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
+ if (IS_ERR(rds_ibdev->mr_1m_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_1m_pool);
+ /* reset to NULL so rds_ib_dev_free() skips this pool */
+ rds_ibdev->mr_1m_pool = NULL;
+ goto put_dev;
+ }
+
+ rds_ibdev->mr_8k_pool =
+ rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
+ if (IS_ERR(rds_ibdev->mr_8k_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_8k_pool);
+ /* reset to NULL so rds_ib_dev_free() skips this pool */
+ rds_ibdev->mr_8k_pool = NULL;
+ goto put_dev;
+ }
+
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
+ device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs);
+
+ pr_info("RDS/IB: %s: added\n", device->name);
+
+ down_write(&rds_ib_devices_lock);
+ list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+ up_write(&rds_ib_devices_lock);
+ /* second reference; rds_ib_remove_one() drops two */
+ refcount_inc(&rds_ibdev->refcount);
+
+ ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+
+ /* give connections on the ib_nodev_conns list a chance to connect */
+ rds_ib_nodev_connect();
+ return 0;
+
+put_dev:
+ /* drops the initial reference; at zero rds_ib_dev_put() queues
+ * rds_ib_dev_free(), which releases whatever was set up so far
+ */
+ rds_ib_dev_put(rds_ibdev);
+ return ret;
+}
+
+/*
+ * New connections use this to find the device to associate with the
+ * connection. It's not in the fast path so we're not concerned about the
+ * performance of the IB call. (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period. The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if it's arriving on a
+ * device that is in the process of being removed.
+ */
+/* Return the rds_ib_device registered as client data for @device with its
+ * refcount incremented, or NULL when no client data is set.  The lookup
+ * and the increment both happen inside an RCU read-side section.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *found;
+
+	rcu_read_lock();
+	found = ib_get_client_data(device, &rds_ib_client);
+	if (found != NULL)
+		refcount_inc(&found->refcount);
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away. This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device, void *client_data)
+{
+ struct rds_ib_device *rds_ibdev = client_data;
+
+ /* drop path 0 of every connection on this device's conn_list */
+ rds_ib_dev_shutdown(rds_ibdev);
+
+ /* stop connection attempts from getting a reference to this device. */
+ ib_set_client_data(device, &rds_ib_client, NULL);
+
+ down_write(&rds_ib_devices_lock);
+ list_del_rcu(&rds_ibdev->list);
+ up_write(&rds_ib_devices_lock);
+
+ /*
+ * This synchronize rcu is waiting for readers of both the ib
+ * client data and the devices list to finish before we drop
+ * both of those references.
+ */
+ synchronize_rcu();
+ /* two puts: rds_ib_add_one() set the refcount to 1 and incremented
+ * it once more after adding the device to rds_ib_devices
+ */
+ rds_ib_dev_put(rds_ibdev);
+ rds_ib_dev_put(rds_ibdev);
+}
+
+/* IB client descriptor; registered by rds_ib_init() through
+ * ib_register_client() and unregistered by rds_ib_unregister_client().
+ */
+struct ib_client rds_ib_client = {
+ .name = "rds_ib",
+ .add = rds_ib_add_one,
+ .remove = rds_ib_remove_one
+};
+
+/*
+ * rds_for_each_conn_info() visitor that fills @buffer, treated as a
+ * struct rds_info_rdma_connection, with the IB details of @conn.
+ *
+ * Returns 0 (skip) for connections that do not use the IB transport or that
+ * are IPv6, and 1 once @buffer has been filled in.
+ */
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+				    void *buffer)
+{
+	struct rds_info_rdma_connection *iinfo = buffer;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+	if (conn->c_isv6)
+		return 0;
+
+	/* The caller hands in an uninitialised on-stack buffer that is
+	 * reused for every connection and then copied out.  Clear the whole
+	 * record so that fields which are only set for some connections
+	 * (tos/sl, and everything in the RDS_CONN_UP branch) never carry
+	 * stack contents or a previous connection's values.  This also
+	 * zeroes src_gid and dst_gid.
+	 */
+	memset(iinfo, 0, sizeof(*iinfo));
+
+	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
+	if (ic) {
+		iinfo->tos = conn->c_tos;
+		iinfo->sl = ic->i_sl;
+	}
+
+	/* ic is tested here too, consistent with the NULL check above */
+	if (ic && rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+
+		rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid,
+			       (union ib_gid *)&iinfo->dst_gid);
+
+		rds_ibdev = ic->rds_ibdev;
+		iinfo->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo->max_send_sge = rds_ibdev->max_sge;
+		rds_ib_get_mr_info(rds_ibdev, iinfo);
+		iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
+	}
+	return 1;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * IPv6 version of rds_ib_conn_info_visitor(): fills @buffer, treated as a
+ * struct rds6_info_rdma_connection, with the IB details of @conn.
+ *
+ * Returns 0 (skip) for connections that do not use the IB transport and 1
+ * once @buffer has been filled in.
+ */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+				     void *buffer)
+{
+	struct rds6_info_rdma_connection *iinfo6 = buffer;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+
+	/* The caller hands in an uninitialised on-stack buffer that is
+	 * reused for every connection and then copied out.  Clear the whole
+	 * record so that fields which are only set for some connections
+	 * (tos/sl, and everything in the RDS_CONN_UP branch) never carry
+	 * stack contents or a previous connection's values.  This also
+	 * zeroes src_gid and dst_gid.
+	 */
+	memset(iinfo6, 0, sizeof(*iinfo6));
+
+	iinfo6->src_addr = conn->c_laddr;
+	iinfo6->dst_addr = conn->c_faddr;
+	if (ic) {
+		iinfo6->tos = conn->c_tos;
+		iinfo6->sl = ic->i_sl;
+	}
+
+	/* ic is tested here too, consistent with the NULL check above */
+	if (ic && rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+
+		rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid,
+			       (union ib_gid *)&iinfo6->dst_gid);
+		rds_ibdev = ic->rds_ibdev;
+		iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo6->max_send_sge = rds_ibdev->max_sge;
+		rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+		iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
+	}
+	return 1;
+}
+#endif
+
+/* RDS_INFO_IB_CONNECTIONS handler: walk all connections with
+ * rds_ib_conn_info_visitor(), using an on-stack scratch record rounded up
+ * to whole u64 words.
+ */
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8];
+	const size_t item_len = sizeof(struct rds_info_rdma_connection);
+
+	rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor,
+			       buffer, item_len);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 version of rds_ib_ic_info(): walk all connections with
+ * rds6_ib_conn_info_visitor(), using an on-stack scratch record rounded up
+ * to whole u64 words.
+ */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
+	const size_t item_len = sizeof(struct rds6_info_rdma_connection);
+
+	rds_for_each_conn_info(sock, len, iter, lens,
+			       rds6_ib_conn_info_visitor, buffer, item_len);
+}
+#endif
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+/*
+ * laddr_check hook of rds_ib_transport: decide whether @addr can be used as
+ * a local address for the IB transport.
+ *
+ * A temporary RDMA CM id is created and bound to the address; the check
+ * passes only if the bind succeeds and the bound device has node type
+ * RDMA_NODE_IB_CA. An IPv6 link-local address additionally needs a non-zero
+ * @scope_id that names a device on which ipv6_chk_addr() accepts the
+ * address. Without CONFIG_IPV6, every non-v4-mapped address is rejected.
+ *
+ * Returns 0 when the address is usable, -EADDRNOTAVAIL when it is not, or
+ * the rdma_create_id() error. @net is not used; init_net is used instead.
+ */
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6;
+#endif
+ struct sockaddr_in sin;
+ struct sockaddr *sa;
+ bool isv4;
+
+ isv4 = ipv6_addr_v4mapped(addr);
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler,
+ NULL, RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ if (isv4) {
+ /* v4-mapped address: the IPv4 address is the last 32-bit word */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr->s6_addr32[3];
+ sa = (struct sockaddr *)&sin;
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = *addr;
+ sin6.sin6_scope_id = scope_id;
+ sa = (struct sockaddr *)&sin6;
+
+ /* XXX Do a special IPv6 link local address check here. The
+ * reason is that rdma_bind_addr() always succeeds with an IPv6
+ * link local address, regardless of whether it is actually
+ * configured on the system.
+ */
+ if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+ struct net_device *dev;
+
+ if (scope_id == 0) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* Use init_net for now as RDS is not network
+ * name space aware.
+ */
+ dev = dev_get_by_index(&init_net, scope_id);
+ if (!dev) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
+ dev_put(dev);
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ dev_put(dev);
+ }
+#else
+ ret = -EADDRNOTAVAIL;
+ goto out;
+#endif
+ }
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, sa);
+ /* due to this, we will claim to support iWARP devices unless we
+ check node_type. */
+ if (ret || !cm_id->device ||
+ cm_id->device->node_type != RDMA_NODE_IB_CA)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+ addr, scope_id, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+out:
+ /* the CM id only served as a probe and is always destroyed */
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+/* Unregister rds_ib_client, then flush rds_wq so that any queued
+ * rds_ib_dev_free() work has completed before returning.
+ */
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
+/* Raise the flag that rds_ib_is_unloading() reports. */
+static void rds_ib_set_unloading(void)
+{
+	const int unloading = 1;
+
+	atomic_set(&rds_ib_unloading, unloading);
+}
+
+/* t_unloading hook: true when path 0 of @conn has RDS_DESTROY_PENDING set
+ * or when rds_ib_unloading is non-zero.
+ */
+static bool rds_ib_is_unloading(struct rds_connection *conn)
+{
+	if (test_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags))
+		return true;
+
+	return atomic_read(&rds_ib_unloading) != 0;
+}
+
+/* Exit hook of rds_ib_transport (.exit): sets the unloading flag, waits
+ * for an RDCU grace period is NOT implied beyond the synchronize_rcu() call
+ * below, then undoes the registrations made by rds_ib_init().
+ */
+void rds_ib_exit(void)
+{
+ /* from here on rds_ib_is_unloading() returns true */
+ rds_ib_set_unloading();
+ synchronize_rcu();
+ rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
+ /* also flushes rds_wq so queued rds_ib_dev_free() work has run */
+ rds_ib_unregister_client();
+ rds_ib_destroy_nodev_conns();
+ rds_ib_sysctl_exit();
+ rds_ib_recv_exit();
+ rds_trans_unregister(&rds_ib_transport);
+ rds_ib_mr_exit();
+}
+
+/* get_tos_map hook: the RDMA transport maps user TOS values 1:1, so the
+ * input is returned unchanged.  If a custom map is ever wanted, this hook
+ * could export a user-configurable one.
+ */
+static u8 rds_ib_get_tos_map(u8 tos)
+{
+	u8 mapped = tos;
+
+	return mapped;
+}
+
+/* Transport operations table for RDS over InfiniBand. It is registered with
+ * rds_trans_register() in rds_ib_init() and unregistered in rds_ib_exit().
+ */
+struct rds_transport rds_ib_transport = {
+ .laddr_check = rds_ib_laddr_check,
+ .xmit_path_complete = rds_ib_xmit_path_complete,
+ .xmit = rds_ib_xmit,
+ .xmit_rdma = rds_ib_xmit_rdma,
+ .xmit_atomic = rds_ib_xmit_atomic,
+ .recv_path = rds_ib_recv_path,
+ .conn_alloc = rds_ib_conn_alloc,
+ .conn_free = rds_ib_conn_free,
+ .conn_path_connect = rds_ib_conn_path_connect,
+ .conn_path_shutdown = rds_ib_conn_path_shutdown,
+ .inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_free = rds_ib_inc_free,
+ .cm_initiate_connect = rds_ib_cm_initiate_connect,
+ .cm_handle_connect = rds_ib_cm_handle_connect,
+ .cm_connect_complete = rds_ib_cm_connect_complete,
+ .stats_info_copy = rds_ib_stats_info_copy,
+ .exit = rds_ib_exit,
+ .get_mr = rds_ib_get_mr,
+ .sync_mr = rds_ib_sync_mr,
+ .free_mr = rds_ib_free_mr,
+ .flush_mrs = rds_ib_flush_mrs,
+ .get_tos_map = rds_ib_get_tos_map,
+ .t_owner = THIS_MODULE,
+ .t_name = "infiniband",
+ .t_unloading = rds_ib_is_unloading,
+ .t_type = RDS_TRANS_IB
+};
+
+/* Bring up the IB transport: MR support, the IB client, sysctls and the
+ * receive side, then register the transport and its info handlers.  On
+ * failure the steps already completed are undone in reverse order and the
+ * error code of the failing step is returned; 0 is returned on success.
+ */
+int rds_ib_init(void)
+{
+	int err;
+
+	INIT_LIST_HEAD(&rds_ib_devices);
+
+	err = rds_ib_mr_init();
+	if (err)
+		return err;
+
+	err = ib_register_client(&rds_ib_client);
+	if (err)
+		goto undo_mr;
+
+	err = rds_ib_sysctl_init();
+	if (err)
+		goto undo_client;
+
+	err = rds_ib_recv_init();
+	if (err)
+		goto undo_sysctl;
+
+	rds_trans_register(&rds_ib_transport);
+
+	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+	rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
+
+	return err;
+
+undo_sysctl:
+	rds_ib_sysctl_exit();
+undo_client:
+	rds_ib_unregister_client();
+undo_mr:
+	rds_ib_mr_exit();
+	return err;
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000..2ba71102b
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,458 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_IB_MAX_SGE 8
+#define RDS_IB_RECV_SGE 2
+
+#define RDS_IB_DEFAULT_RECV_WR 1024
+#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_FR_WR 512
+
+#define RDS_IB_DEFAULT_RETRY_COUNT 1
+
+#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+#define RDS_IB_RECYCLE_BATCH_COUNT 32
+
+#define RDS_IB_WC_MAX 32
+
+extern struct rw_semaphore rds_ib_devices_lock;
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try to minimize the amount of memory tied up in both the device and
+ * the socket receive queues.
+ */
+struct rds_page_frag {
+ struct list_head f_item;
+ struct list_head f_cache_entry;
+ struct scatterlist f_sg;
+};
+
+struct rds_ib_incoming {
+ struct list_head ii_frags;
+ struct list_head ii_cache_entry;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_ib_cache_head {
+ struct list_head *first;
+ unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+ struct rds_ib_cache_head __percpu *percpu;
+ struct list_head *xfer;
+ struct list_head *ready;
+};
+
+/* This is the common structure for the IB private data exchange in setting up
+ * an RDS connection. The exchange is different for IPv4 and IPv6 connections:
+ * the addresses are exchanged at the beginning of the private data and their
+ * size differs between the two address families. A single structure for both
+ * would therefore break interoperability, so only the fields that follow the
+ * addresses are shared here.
+ */
+struct rds_ib_conn_priv_cmn {
+ u8 ricpc_protocol_major;
+ u8 ricpc_protocol_minor;
+ __be16 ricpc_protocol_minor_mask; /* bitmask */
+ u8 ricpc_dp_toss;
+ u8 ripc_reserved1;
+ __be16 ripc_reserved2;
+ __be64 ricpc_ack_seq;
+ __be32 ricpc_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ struct in6_addr dp_saddr;
+ struct in6_addr dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+#define dp_protocol_major dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq dp_cmn.ricpc_ack_seq
+#define dp_credit dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+ struct rds_ib_connect_private ricp_v4;
+ struct rds6_ib_connect_private ricp_v6;
+};
+
+struct rds_ib_send_work {
+ void *s_op;
+ union {
+ struct ib_send_wr s_wr;
+ struct ib_rdma_wr s_rdma_wr;
+ struct ib_atomic_wr s_atomic_wr;
+ };
+ struct ib_sge s_sge[RDS_IB_MAX_SGE];
+ unsigned long s_queued;
+};
+
+/*
+ * State kept for one posted receive work request: the incoming message and
+ * page fragment it is attached to, the work request itself and its
+ * scatter/gather entries.
+ */
+struct rds_ib_recv_work {
+	struct rds_ib_incoming	*r_ibinc;
+	struct rds_page_frag	*r_frag;
+	struct ib_recv_wr	r_wr;
+	/* Sized by RDS_IB_RECV_SGE so the array cannot drift away from the
+	 * max_recv_sge value the QP is created with.
+	 */
+	struct ib_sge		r_sge[RDS_IB_RECV_SGE];
+};
+
+struct rds_ib_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+/* Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+
+struct rds_ib_device;
+
+struct rds_ib_connection {
+
+ struct list_head ib_node;
+ struct rds_ib_device *rds_ibdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+ struct ib_wc i_send_wc[RDS_IB_WC_MAX];
+ struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
+
+ /* To control the number of wrs from fastreg */
+ atomic_t i_fastreg_wrs;
+ atomic_t i_fastreg_inuse_count;
+
+ /* interrupt handling */
+ struct tasklet_struct i_send_tasklet;
+ struct tasklet_struct i_recv_tasklet;
+
+ /* tx */
+ struct rds_ib_work_ring i_send_ring;
+ struct rm_data_op *i_data_op;
+ struct rds_header **i_send_hdrs;
+ dma_addr_t *i_send_hdrs_dma;
+ struct rds_ib_send_work *i_sends;
+ atomic_t i_signaled_sends;
+
+ /* rx */
+ struct mutex i_recv_mutex;
+ struct rds_ib_work_ring i_recv_ring;
+ struct rds_ib_incoming *i_ibinc;
+ u32 i_recv_data_rem;
+ struct rds_header **i_recv_hdrs;
+ dma_addr_t *i_recv_hdrs_dma;
+ struct rds_ib_recv_work *i_recvs;
+ u64 i_ack_recv; /* last ACK received */
+ struct rds_ib_refill_cache i_cache_incs;
+ struct rds_ib_refill_cache i_cache_frags;
+ atomic_t i_cache_allocs;
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t i_ack_next; /* next ACK to send */
+#else
+ spinlock_t i_ack_lock; /* protect i_ack_next */
+ u64 i_ack_next; /* next ACK to send */
+#endif
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ dma_addr_t i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair of variables that we need to access
+ * atomically - one for the send credits, and one for the posted
+ * recv credits we need to transfer to remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+
+ /* Endpoint role in connection */
+ bool i_active_side;
+ atomic_t i_cq_quiesce;
+
+ /* Send/Recv vectors */
+ int i_scq_vector;
+ int i_rcq_vector;
+ u8 i_sl;
+};
+
+/* This assumes that atomic_t is at least 32 bits.
+ *
+ * Two counters share one word: the send credits occupy the low 16 bits and
+ * the posted credits the bits above them. The GET macros extract a counter
+ * from the packed word; the SET macros move a counter value into position.
+ */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
+
+struct rds_ib_ipaddr {
+ struct list_head list;
+ __be32 ipaddr;
+ struct rcu_head rcu;
+};
+
+enum {
+ RDS_IB_MR_8K_POOL,
+ RDS_IB_MR_1M_POOL,
+};
+
+struct rds_ib_device {
+ struct list_head list;
+ struct list_head ipaddr_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ u8 odp_capable:1;
+
+ unsigned int max_mrs;
+ struct rds_ib_mr_pool *mr_1m_pool;
+ struct rds_ib_mr_pool *mr_8k_pool;
+ unsigned int max_8k_mrs;
+ unsigned int max_1m_mrs;
+ int max_sge;
+ unsigned int max_wrs;
+ unsigned int max_initiator_depth;
+ unsigned int max_responder_resources;
+ spinlock_t spinlock; /* protect the above */
+ refcount_t refcount;
+ struct work_struct free_work;
+ int *vector_load;
+};
+
+/* Forward to ibdev_to_node() for the ib_device behind @rdsibdev. The argument
+ * is parenthesized so that any pointer-valued expression can be passed.
+ */
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node((rdsibdev)->dev)
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID (~(u64) 0)
+
+struct rds_ib_statistics {
+ uint64_t s_ib_connect_raced;
+ uint64_t s_ib_listen_closed_stale;
+ uint64_t s_ib_evt_handler_call;
+ uint64_t s_ib_tasklet_call;
+ uint64_t s_ib_tx_cq_event;
+ uint64_t s_ib_tx_ring_full;
+ uint64_t s_ib_tx_throttle;
+ uint64_t s_ib_tx_sg_mapping_failure;
+ uint64_t s_ib_tx_stalled;
+ uint64_t s_ib_tx_credit_updates;
+ uint64_t s_ib_rx_cq_event;
+ uint64_t s_ib_rx_ring_empty;
+ uint64_t s_ib_rx_refill_from_cq;
+ uint64_t s_ib_rx_refill_from_thread;
+ uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_total_frags;
+ uint64_t s_ib_rx_total_incs;
+ uint64_t s_ib_rx_credit_updates;
+ uint64_t s_ib_ack_sent;
+ uint64_t s_ib_ack_send_failure;
+ uint64_t s_ib_ack_send_delayed;
+ uint64_t s_ib_ack_send_piggybacked;
+ uint64_t s_ib_ack_received;
+ uint64_t s_ib_rdma_mr_8k_alloc;
+ uint64_t s_ib_rdma_mr_8k_free;
+ uint64_t s_ib_rdma_mr_8k_used;
+ uint64_t s_ib_rdma_mr_8k_pool_flush;
+ uint64_t s_ib_rdma_mr_8k_pool_wait;
+ uint64_t s_ib_rdma_mr_8k_pool_depleted;
+ uint64_t s_ib_rdma_mr_1m_alloc;
+ uint64_t s_ib_rdma_mr_1m_free;
+ uint64_t s_ib_rdma_mr_1m_used;
+ uint64_t s_ib_rdma_mr_1m_pool_flush;
+ uint64_t s_ib_rdma_mr_1m_pool_wait;
+ uint64_t s_ib_rdma_mr_1m_pool_depleted;
+ uint64_t s_ib_rdma_mr_8k_reused;
+ uint64_t s_ib_rdma_mr_1m_reused;
+ uint64_t s_ib_atomic_cswp;
+ uint64_t s_ib_atomic_fadd;
+ uint64_t s_ib_recv_added_to_cache;
+ uint64_t s_ib_recv_removed_from_cache;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ *
+ * Walks the first @sg_dma_len entries of @sglist and hands each entry's DMA
+ * address and length to ib_dma_sync_single_for_cpu() with @direction.
+ *
+ * NOTE(review): the @sg_dma_len parameter is spelled exactly like the
+ * sg_dma_len() accessor used in the loop body; only the call parentheses
+ * tell the two apart.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sglist,
+ unsigned int sg_dma_len,
+ int direction)
+{
+ struct scatterlist *sg;
+ unsigned int i;
+
+ for_each_sg(sglist, sg, sg_dma_len, i) {
+ ib_dma_sync_single_for_cpu(dev, sg_dma_address(sg),
+ sg_dma_len(sg), direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sglist,
+ unsigned int sg_dma_len,
+ int direction)
+{ /* Sync each of the first @sg_dma_len entries of @sglist for device access, one entry at a time */
+ struct scatterlist *sg;
+ unsigned int i;
+
+ for_each_sg(sglist, sg, sg_dma_len, i) {
+ ib_dma_sync_single_for_device(dev, sg_dma_address(sg),
+ sg_dma_len(sg), direction); /* sg_dma_len(sg) is the per-entry accessor, not the parameter */
+ }
+}
+#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int rds_ib_retry_count;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_path_connect(struct rds_conn_path *cp);
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
+void rds_ib_state_change(struct sock *sk);
+int rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+__printf(2, 3)
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+#define rds_ib_conn_error(conn, fmt...) \
+ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_destroy_nodev_conns(void);
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
+
+/* ib_recv.c */
+int rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv_path(struct rds_conn_path *conn);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc,
+ struct rds_ib_ack_state *state);
+void rds_ib_recv_tasklet_fn(unsigned long data);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+
+/* ib_stats.c */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+#define rds_ib_stats_add(member, count) \
+ rds_stats_add_which(rds_ib_stats, member, count)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000..26b069e19
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,1287 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+#include <net/addrconf.h>
+#include <rdma/ib_cm.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "ib.h"
+#include "ib_mr.h"
+
+/*
+ * Set the selected protocol version
+ *
+ * Stores @version in conn->c_version; nothing else is touched here.
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ *
+ * Flow control is switched on only when the rds_ib_sysctl_flow_control knob
+ * is set and @credits is non-zero; the credits are then handed to
+ * rds_ib_send_add_credits(). In every other case it is switched off.
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (!rds_ib_sysctl_flow_control || credits == 0) {
+		ic->i_flowctl = 0;
+		return;
+	}
+
+	/* We're doing flow control */
+	ic->i_flowctl = 1;
+	rds_ib_send_add_credits(conn, credits);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ *
+ * The peer's protocol version, credit count and ack sequence are read from
+ * the CM private data, but only when that data is at least as large as the
+ * private-data structure of the address family in use; otherwise they stay
+ * zero. A non-zero major version selects the protocol version and flow
+ * control. A resulting version below RDS_PROTOCOL_VERSION that is not
+ * RDS_PROTOCOL_COMPAT_VERSION destroys the connection and returns early.
+ * Otherwise the rings are initialised, receives are posted, the local
+ * address is recorded on the device, a non-zero peer ack sequence is
+ * processed and the connect is completed.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ const union rds_ib_conn_priv *dp = NULL;
+ __be64 ack_seq = 0;
+ __be32 credit = 0;
+ u8 major = 0;
+ u8 minor = 0;
+ int err;
+
+ dp = event->param.conn.private_data;
+ if (conn->c_isv6) {
+ if (event->param.conn.private_data_len >=
+ sizeof(struct rds6_ib_connect_private)) {
+ major = dp->ricp_v6.dp_protocol_major;
+ minor = dp->ricp_v6.dp_protocol_minor;
+ credit = dp->ricp_v6.dp_credit;
+ /* dp structure start is not guaranteed to be 8 bytes
+ * aligned. Since dp_ack_seq is 64-bit extended load
+ * operations can be used so go through get_unaligned
+ * to avoid unaligned errors.
+ */
+ ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
+ }
+ } else if (event->param.conn.private_data_len >=
+ sizeof(struct rds_ib_connect_private)) {
+ major = dp->ricp_v4.dp_protocol_major;
+ minor = dp->ricp_v4.dp_protocol_minor;
+ credit = dp->ricp_v4.dp_credit;
+ ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
+ }
+
+ /* make sure it isn't empty data */
+ if (major) {
+ rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(credit));
+ }
+
+ if (conn->c_version < RDS_PROTOCOL_VERSION) {
+ if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
+ pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
+ rds_conn_destroy(conn);
+ return;
+ }
+ }
+
+ pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
+ ic->i_active_side ? "Active" : "Passive",
+ &conn->c_laddr, &conn->c_faddr, conn->c_tos,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ /* receive sl from the peer */
+ ic->i_sl = ic->i_cm_id->route.path_rec->sl;
+
+ atomic_set(&ic->i_cq_quiesce, 0);
+
+ /* Init rings and fill recv. this needs to wait until protocol
+ * negotiation is complete, since ring layout is different
+ * from 3.1 to 4.1.
+ */
+ rds_ib_send_init_ring(ic);
+ rds_ib_recv_init_ring(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, 1, GFP_KERNEL);
+
+ /* update ib_device with this local ipaddr; a failure is only logged */
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
+ if (err)
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+ err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp) {
+ if (ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
+ NULL);
+ }
+
+ conn->c_proposed_version = conn->c_version;
+ rds_connect_complete(conn);
+}
+
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ union rds_ib_conn_priv *dp,
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth,
+ bool isv6)
+{ /* Zero and fill @conn_param; when @dp is non-NULL also build the v4 or v6 private data in it and attach it */
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+
+ conn_param->responder_resources =
+ min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+ conn_param->initiator_depth =
+ min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
+ conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); /* capped at 7 */
+ conn_param->rnr_retry_count = 7;
+
+ if (dp) {
+ memset(dp, 0, sizeof(*dp));
+ if (isv6) {
+ dp->ricp_v6.dp_saddr = conn->c_laddr;
+ dp->ricp_v6.dp_daddr = conn->c_faddr;
+ dp->ricp_v6.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v6.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+ dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
+
+ conn_param->private_data = &dp->ricp_v6;
+ conn_param->private_data_len = sizeof(dp->ricp_v6);
+ } else {
+ dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3]; /* IPv4 address is taken from the last 32-bit word */
+ dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
+ dp->ricp_v4.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v4.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+ dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
+
+ conn_param->private_data = &dp->ricp_v4;
+ conn_param->private_data_len = sizeof(dp->ricp_v4);
+ }
+
+ /* Advertise flow control: put the posted credits from i_credits
+ * into dp_credit and subtract them from i_credits.
+ */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS
+ (atomic_read(&ic->i_credits));
+ if (isv6)
+ dp->ricp_v6.dp_credit = cpu_to_be32(credits);
+ else
+ dp->ricp_v4.dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits),
+ &ic->i_credits);
+ }
+ }
+}
+
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{ /* CQ event callback given to ib_create_cq(); it only logs the event through rdsdebug() */
+ rdsdebug("event %u (%s) data %p\n",
+ event->event, ib_event_msg(event->event), data);
+}
+
+/* Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ *
+ * The handler itself only counts the call and schedules i_recv_tasklet.
+ * @context is the rds_connection that was supplied when the CQ was created.
+ */
+static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_evt_handler_call);
+
+ tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+/*
+ * Drain @cq in batches of up to RDS_IB_WC_MAX completions, using @wcs as
+ * the batch buffer, until ib_poll_cq() reports nothing more.
+ *
+ * A completion whose wr_id is within the send ring range, or equals the
+ * ACK marker RDS_IB_ACK_WR_ID, goes to the send completion handler; every
+ * other completion goes to the MR completion handler.
+ */
+static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
+		     struct ib_wc *wcs)
+{
+	int count;
+
+	for (;;) {
+		struct ib_wc *cqe, *end;
+
+		count = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs);
+		if (count <= 0)
+			break;
+
+		end = wcs + count;
+		for (cqe = wcs; cqe < end; cqe++) {
+			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+				 (unsigned long long)cqe->wr_id, cqe->status,
+				 cqe->byte_len, be32_to_cpu(cqe->ex.imm_data));
+
+			if (cqe->wr_id > ic->i_send_ring.w_nr &&
+			    cqe->wr_id != RDS_IB_ACK_WR_ID)
+				rds_ib_mr_cqe_handler(ic, cqe);
+			else
+				rds_ib_send_cqe_handler(ic, cqe);
+		}
+	}
+}
+
+/*
+ * Send-side tasklet body. @data carries the rds_ib_connection pointer.
+ *
+ * Unless the CQ has been quiesced, the send CQ is drained, re-armed for the
+ * next completion and drained once more. Transmission is then restarted
+ * when the connection is up and either the send-full flag is clear or bit 0
+ * of c_map_queued is set.
+ */
+static void rds_ib_tasklet_fn_send(unsigned long data)
+{
+	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
+	struct rds_connection *conn = ic->conn;
+
+	rds_ib_stats_inc(s_ib_tasklet_call);
+
+	/* if cq has been already reaped, ignore incoming cq event */
+	if (atomic_read(&ic->i_cq_quiesce))
+		return;
+
+	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
+	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+	/* second pass after re-arming the CQ */
+	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
+
+	if (!rds_conn_up(conn))
+		return;
+
+	/* nothing to restart: the send side is full and no map is queued */
+	if (test_bit(RDS_LL_SEND_FULL, &conn->c_flags) &&
+	    !test_bit(0, &conn->c_map_queued))
+		return;
+
+	rds_send_xmit(&ic->conn->c_path[0]);
+}
+
+/*
+ * Drain @cq in batches of up to RDS_IB_WC_MAX completions, using @wcs as
+ * the batch buffer, and pass every completion to the receive completion
+ * handler together with @ack_state.
+ */
+static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
+		     struct ib_wc *wcs,
+		     struct rds_ib_ack_state *ack_state)
+{
+	int got, idx;
+
+	got = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs);
+	while (got > 0) {
+		for (idx = 0; idx < got; idx++) {
+			struct ib_wc *cqe = &wcs[idx];
+
+			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+				 (unsigned long long)cqe->wr_id, cqe->status,
+				 cqe->byte_len, be32_to_cpu(cqe->ex.imm_data));
+
+			rds_ib_recv_cqe_handler(ic, cqe, ack_state);
+		}
+		got = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs);
+	}
+}
+
+static void rds_ib_tasklet_fn_recv(unsigned long data)
+{ /* Recv-side tasklet body: drain the recv CQ, re-arm it, drain again, then act on the ack state gathered while polling */
+ struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+ struct rds_ib_ack_state state;
+
+ if (!rds_ibdev)
+ rds_conn_drop(conn); /* NOTE(review): no return follows, so the polling below still runs without a device - confirm this is intended */
+
+ rds_ib_stats_inc(s_ib_tasklet_call);
+
+ /* if cq has been already reaped, ignore incoming cq event */
+ if (atomic_read(&ic->i_cq_quiesce))
+ return;
+
+ memset(&state, 0, sizeof(state));
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); /* second pass after re-arming the CQ */
+
+ if (state.ack_next_valid)
+ rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { /* only a newer ack than the last one recorded is processed */
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+}
+
+/*
+ * QP event callback; @data is the rds_connection set as qp_context.
+ *
+ * IB_EVENT_COMM_EST is passed on to the CM through rdma_notify(). Every
+ * other event is treated as fatal for the connection, which is dropped.
+ */
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+	struct rds_connection *conn = data;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 ib_event_msg(event->event));
+
+	if (event->event == IB_EVENT_COMM_EST) {
+		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+		return;
+	}
+
+	rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
+		 event->event, ib_event_msg(event->event),
+		 &conn->c_laddr, &conn->c_faddr);
+	rds_conn_drop(conn);
+}
+
+static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
+{ /* Send CQ completion callback: count the call and schedule i_send_tasklet; @context is the rds_connection */
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_evt_handler_call);
+
+ tasklet_schedule(&ic->i_send_tasklet);
+}
+
+/*
+ * Pick the completion vector of @rds_ibdev with the smallest load count,
+ * add one to that count and return the vector's index.
+ *
+ * The scan runs from the highest index downwards and only a strictly
+ * smaller load replaces the current choice, so ties go to the highest index.
+ */
+static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
+{
+	int *load = rds_ibdev->vector_load;
+	int best = rds_ibdev->dev->num_comp_vectors - 1;
+	int v;
+
+	for (v = best - 1; v >= 0; v--) {
+		if (load[v] < load[best])
+			best = v;
+	}
+
+	load[best]++;
+	return best;
+}
+
+static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
+{ /* Give back one use of vector @index: the reverse of the increment done in ibdev_get_unused_vector() */
+ rds_ibdev->vector_load[index]--;
+}
+
+/*
+ * Unmap and free one header obtained from rds_dma_hdr_alloc().
+ *
+ * @dev: IB device the header was mapped on
+ * @hdr: header memory to release
+ * @dma_addr: DMA address that was returned for @hdr when it was mapped
+ * @dir: direction requested by the caller; kept so the signature stays
+ *       unchanged, but not used for the unmap (see below)
+ *
+ * rds_dma_hdr_alloc() always creates the mapping as DMA_BIDIRECTIONAL, and
+ * an unmap has to name the same direction as the map it undoes. The unmap
+ * therefore uses DMA_BIDIRECTIONAL whatever @dir says.
+ */
+static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
+			     dma_addr_t dma_addr, enum dma_data_direction dir)
+{
+	ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), DMA_BIDIRECTIONAL);
+	kfree(hdr);
+}
+
+static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
+ dma_addr_t *dma_addr, enum dma_data_direction dir)
+{ /* Allocate one zeroed rds_header on the device's node and DMA-map it into *dma_addr; NULL on allocation or mapping failure */
+ struct rds_header *hdr;
+
+ hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
+ DMA_BIDIRECTIONAL); /* NOTE(review): @dir is not used here; the mapping is always bidirectional */
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ kfree(hdr); /* mapping failed: release the header again */
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @dev: the RDS IB device
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdrs: number of headers to free.
+ * @dir: DMA direction handed to rds_dma_hdr_free() for every header
+ *
+ * The first @num_hdrs headers are unmapped and freed one by one, then both
+ * arrays themselves are released with kvfree().
+ */
+static void rds_dma_hdrs_free(struct rds_ib_device *dev,
+ struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
+
+/* Allocate DMA-mapped memory to be used to store struct rds_header for
+ * sending/receiving packets. The pointers to the header memory and the
+ * associated DMA addresses are stored in two arrays.
+ *
+ * @dev: the RDS IB device
+ * @dma_addrs: on success, set to the array storing the DMA addresses
+ * @num_hdrs: number of headers to allocate
+ * @dir: DMA direction handed to rds_dma_hdr_alloc() for every header
+ *
+ * It returns the pointer to the array storing the header pointers. On
+ * error, NULL is returned, everything allocated so far is released again
+ * and *@dma_addrs is left untouched.
+ */
+static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
+					      dma_addr_t **dma_addrs, u32 num_hdrs,
+					      enum dma_data_direction dir)
+{
+	struct rds_header **hdrs;
+	dma_addr_t *hdr_daddrs;
+	u32 i;
+
+	/* array_size() saturates when the multiplication would overflow, so
+	 * an oversized request makes the allocation fail instead of silently
+	 * producing an array that is too small.
+	 */
+	hdrs = kvmalloc_node(array_size(sizeof(*hdrs), num_hdrs), GFP_KERNEL,
+			     ibdev_to_node(dev->dev));
+	if (!hdrs)
+		return NULL;
+
+	hdr_daddrs = kvmalloc_node(array_size(sizeof(*hdr_daddrs), num_hdrs),
+				   GFP_KERNEL, ibdev_to_node(dev->dev));
+	if (!hdr_daddrs) {
+		kvfree(hdrs);
+		return NULL;
+	}
+
+	for (i = 0; i < num_hdrs; i++) {
+		hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
+		if (!hdrs[i]) {
+			/* frees the i headers set up so far plus both arrays */
+			rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
+			return NULL;
+		}
+	}
+
+	*dma_addrs = hdr_daddrs;
+	return hdrs;
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct ib_cq_init_attr cq_attr = {};
+ struct rds_ib_device *rds_ibdev;
+ unsigned long max_wrs;
+ int ret, fr_queue_space;
+
+ /*
+ * It's normal to see a null device if an incoming connection races
+ * with device removal, so we don't print a warning.
+ */
+ rds_ibdev = rds_ib_get_client_data(dev);
+ if (!rds_ibdev)
+ return -EOPNOTSUPP;
+
+ /* The fr_queue_space is currently set to 512, to add extra space on
+ * completion queue and send queue. This extra space is used for FRWR
+ * registration and invalidation work requests
+ */
+ fr_queue_space = RDS_IB_DEFAULT_FR_WR;
+
+ /* add the conn now so that connection establishment has the dev */
+ rds_ib_add_conn(rds_ibdev, conn);
+
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
+ if (ic->i_send_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
+
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
+ if (ic->i_recv_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_ibdev->pd;
+
+ ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
+ cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
+ cq_attr.comp_vector = ic->i_scq_vector;
+ ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
+ rds_ib_cq_event_handler, conn,
+ &cq_attr);
+ if (IS_ERR(ic->i_send_cq)) {
+ ret = PTR_ERR(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+ ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto rds_ibdev_out;
+ }
+
+ ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
+ cq_attr.cqe = ic->i_recv_ring.w_nr;
+ cq_attr.comp_vector = ic->i_rcq_vector;
+ ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
+ rds_ib_cq_event_handler, conn,
+ &cq_attr);
+ if (IS_ERR(ic->i_recv_cq)) {
+ ret = PTR_ERR(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+ ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
+ rdsdebug("ib_create_cq recv failed: %d\n", ret);
+ goto send_cq_out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto recv_cq_out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto recv_cq_out;
+ }
+
+ /* XXX negotiate max send/recv with remote? */
+ memset(&attr, 0, sizeof(attr));
+ attr.event_handler = rds_ib_qp_event_handler;
+ attr.qp_context = conn;
+ /* + 1 to allow for the single ack message */
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
+ attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+ attr.cap.max_send_sge = rds_ibdev->max_sge;
+ attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = ic->i_send_cq;
+ attr.recv_cq = ic->i_recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto recv_cq_out;
+ }
+
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
+ if (!ic->i_send_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("DMA send hdrs alloc failed\n");
+ goto qp_out;
+ }
+
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
+ if (!ic->i_recv_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("DMA recv hdrs alloc failed\n");
+ goto send_hdrs_dma_out;
+ }
+
+ ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
+ DMA_TO_DEVICE);
+ if (!ic->i_ack) {
+ ret = -ENOMEM;
+ rdsdebug("DMA ack header alloc failed\n");
+ goto recv_hdrs_dma_out;
+ }
+
+ ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
+ ic->i_send_ring.w_nr),
+ ibdev_to_node(dev));
+ if (!ic->i_sends) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto ack_dma_out;
+ }
+
+ ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
+ ic->i_recv_ring.w_nr),
+ ibdev_to_node(dev));
+ if (!ic->i_recvs) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto sends_out;
+ }
+
+ rds_ib_recv_init_ack(ic);
+
+ rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
+ ic->i_send_cq, ic->i_recv_cq);
+
+ goto out;
+
+sends_out:
+ vfree(ic->i_sends);
+
+ack_dma_out:
+ rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
+ DMA_TO_DEVICE);
+ ic->i_ack = NULL;
+
+recv_hdrs_dma_out:
+ rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+
+send_hdrs_dma_out:
+ rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr, DMA_TO_DEVICE);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+
+qp_out:
+ rdma_destroy_qp(ic->i_cm_id);
+recv_cq_out:
+ ib_destroy_cq(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+send_cq_out:
+ ib_destroy_cq(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+rds_ibdev_out:
+ rds_ib_remove_conn(rds_ibdev, conn);
+out:
+ rds_ib_dev_put(rds_ibdev);
+
+ return ret;
+}
+
+/*
+ * Work out which RDS protocol version to speak with the peer that sent
+ * this connect request.
+ *
+ * Returns the selected version, or 0 when the request has to be rejected
+ * (no private data at all, or a major/minor pair we cannot talk to).
+ */
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
+{
+	const union rds_ib_conn_priv *priv = event->param.conn.private_data;
+	u8 need_len, major, minor;
+	__be16 minor_mask;
+	u16 shared;
+	u32 version;
+
+	/*
+	 * rdma_cm hands us a generously sized private data buffer whenever
+	 * the request carried any private data, without reporting the size
+	 * the sender actually used. The unused part is zero filled, so the
+	 * contents are the only hint: protocol version fields left at zero
+	 * mean the peer is an older implementation (3.0 or 2.0, we cannot
+	 * tell which).
+	 */
+
+	/* RDS always sends private data; a request without any is bogus. */
+	if (event->param.conn.private_data_len == 0) {
+		printk(KERN_NOTICE "RDS incoming connection has no private data, "
+			"rejecting\n");
+		return 0;
+	}
+
+	/* Pick the header layout that matches the address family. */
+	if (!isv6) {
+		need_len = sizeof(struct rds_ib_connect_private);
+		major = priv->ricp_v4.dp_protocol_major;
+		minor = priv->ricp_v4.dp_protocol_minor;
+		minor_mask = priv->ricp_v4.dp_protocol_minor_mask;
+	} else {
+		need_len = sizeof(struct rds6_ib_connect_private);
+		major = priv->ricp_v6.dp_protocol_major;
+		minor = priv->ricp_v6.dp_protocol_minor;
+		minor_mask = priv->ricp_v6.dp_protocol_minor_mask;
+	}
+
+	/*
+	 * A zero major version, or private data shorter than our header,
+	 * is answered with RDS_PROTOCOL_4_0.
+	 */
+	if (major == 0 || event->param.conn.private_data_len < need_len)
+		return RDS_PROTOCOL_4_0;
+
+	shared = be16_to_cpu(minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+	if (major == 4 && shared) {
+		/* Bump the version once per bit position below the top set bit. */
+		version = RDS_PROTOCOL_4_0;
+		for (shared >>= 1; shared != 0; shared >>= 1)
+			version++;
+		return version;
+	}
+
+	if (RDS_PROTOCOL(major, minor) == RDS_PROTOCOL_COMPAT_VERSION)
+		return RDS_PROTOCOL_COMPAT_VERSION;
+
+	/* Nothing in common: log the offender and report "incompatible". */
+	if (isv6)
+		printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
+				   &priv->ricp_v6.dp_saddr, major, minor);
+	else
+		printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+				   &priv->ricp_v4.dp_saddr, major, minor);
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* Look up the interface index of the net_device in @net that owns the
+ * IPv6 address @addr. rds_ib_cm_handle_connect() uses this to learn the
+ * interface an incoming request belongs to when a link local address is
+ * involved.
+ *
+ * Returns 0 when no device in @net has the address.
+ *
+ * Known limitation: two interfaces may carry the same link local address,
+ * in which case the first match wins. That cannot be resolved unless the
+ * underlying layer tells us which interface the RDMA connect request
+ * arrived on.
+ */
+static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
+{
+	struct net_device *ndev;
+	int ifidx = 0;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, ndev) {
+		if (!ipv6_chk_addr(net, addr, ndev, 1))
+			continue;
+
+		ifidx = ndev->ifindex;
+		break;
+	}
+	rcu_read_unlock();
+
+	return ifidx;
+}
+#endif
+
+/*
+ * Handle an incoming RDMA CM connect request for RDS/IB.
+ *
+ * Checks the peer's protocol version, takes the source and destination
+ * addresses from the request's private data, obtains the rds_connection
+ * through rds_conn_create() and, if that connection can be moved from
+ * RDS_CONN_DOWN to RDS_CONN_CONNECTING, sets up the QP and accepts the
+ * request.
+ *
+ * Returns the "destroy" flag: 1 until cm_id has been attached to the
+ * connection (ic->i_cm_id), 0 afterwards. Whenever err is non-zero on
+ * exit the request is additionally rejected, with err passed to the peer
+ * as the reject private data.
+ */
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event, bool isv6)
+{
+ __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+ __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+ const struct rds_ib_conn_priv_cmn *dp_cmn;
+ struct rds_connection *conn = NULL;
+ struct rds_ib_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ const union rds_ib_conn_priv *dp;
+ union rds_ib_conn_priv dp_rep;
+ struct in6_addr s_mapped_addr;
+ struct in6_addr d_mapped_addr;
+ const struct in6_addr *saddr6;
+ const struct in6_addr *daddr6;
+ int destroy = 1;
+ u32 ifindex = 0;
+ u32 version;
+ /* Non-zero by default so that every early exit rejects the request. */
+ int err = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_ib_protocol_compatible(event, isv6);
+ if (!version) {
+ err = RDS_RDMA_REJ_INCOMPAT;
+ goto out;
+ }
+
+ dp = event->param.conn.private_data;
+ if (isv6) {
+#if IS_ENABLED(CONFIG_IPV6)
+ dp_cmn = &dp->ricp_v6.dp_cmn;
+ saddr6 = &dp->ricp_v6.dp_saddr;
+ daddr6 = &dp->ricp_v6.dp_daddr;
+ /* If either address is link local, need to find the
+ * interface index in order to create a proper RDS
+ * connection.
+ */
+ if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Using init_net for now .. */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Use our address to find the correct index. */
+ /* NOTE(review): the condition tests saddr6 but the lookup
+ * uses daddr6, which matches the "our address" comment
+ * above - confirm this is intended.
+ */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+#else
+ err = -EOPNOTSUPP;
+ goto out;
+#endif
+ } else {
+ /* IPv4 peers are handled as IPv4-mapped IPv6 addresses. */
+ dp_cmn = &dp->ricp_v4.dp_cmn;
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
+ saddr6 = &s_mapped_addr;
+ daddr6 = &d_mapped_addr;
+ }
+
+ rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
+ saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
+ RDS_PROTOCOL_MINOR(version),
+ (unsigned long long)be64_to_cpu(lguid),
+ (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
+
+ /* RDS/IB is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, daddr6, saddr6,
+ &rds_ib_transport, dp_cmn->ricpc_dp_toss,
+ GFP_KERNEL, ifindex);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ /* Reset to NULL so the out: path does not unlock a mutex
+ * that was never taken.
+ */
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exist, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ /* NOTE(review): this message is emitted when the state
+ * is RDS_CONN_UP, not RDS_CONN_CONNECTING.
+ */
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_ib_stats_inc(s_ib_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_ib_stats_inc(s_ib_connect_raced);
+ }
+ /* err is still 1 here, so the request gets rejected below. */
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_ib_set_protocol(conn, version);
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp_cmn->ricpc_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
+ NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ /* We got halfway through setting up the ib_connection, if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ /* From here on err holds the setup result: 0 means "do not reject". */
+ err = rds_ib_setup_qp(conn);
+ if (err) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth, isv6);
+
+ rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ if (rdma_accept(cm_id, &conn_param))
+ rds_ib_conn_error(conn, "rdma_accept failed\n");
+
+out:
+ /* conn is non-NULL only on paths that took c_cm_lock. */
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err)
+ rdma_reject(cm_id, &err, sizeof(int),
+ IB_CM_REJ_CONSUMER_DEFINED);
+ return destroy;
+}
+
+
+/*
+ * Active side: set up the QP for the connection attached to @cm_id and
+ * send the connect request.
+ *
+ * Returns 0 on success. On failure the error is returned only if the
+ * connection no longer refers to @cm_id; a non-zero return tells the
+ * rdma_cm to destroy the cm_id, which must not happen while we still
+ * "own" it.
+ */
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
+{
+	struct rds_connection *conn = cm_id->context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rdma_conn_param conn_param;
+	union rds_ib_conn_priv dp;
+	int ret;
+
+	/*
+	 * Protocol version assumed until the peer negotiates one.
+	 * NOTE(review): the historical comment here said "default to
+	 * RDSv3.0" while the code sets RDS_PROTOCOL_4_1 - confirm which
+	 * is intended.
+	 */
+	rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
+	/* advertise flow control */
+	ic->i_flowctl = rds_ib_sysctl_flow_control;
+
+	ret = rds_ib_setup_qp(conn);
+	if (ret) {
+		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+	} else {
+		rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
+					  conn->c_proposed_version,
+					  UINT_MAX, UINT_MAX, isv6);
+		ret = rdma_connect_locked(cm_id, &conn_param);
+		if (ret)
+			rds_ib_conn_error(conn,
+					  "rdma_connect_locked failed (%d)\n",
+					  ret);
+	}
+
+	/* Swallow the error while the connection still owns this cm_id. */
+	if (ret && ic->i_cm_id == cm_id)
+		ret = 0;
+
+	ic->i_active_side = true;
+	return ret;
+}
+
+/*
+ * Start an outgoing connection for the path's rds_connection: create an
+ * RDMA CM id and kick off address resolution towards the peer.
+ *
+ * Returns 0 when resolution was started, otherwise the error from
+ * rdma_create_id() or rdma_resolve_addr(). On error ic->i_cm_id is left
+ * NULL.
+ */
+int rds_ib_conn_path_connect(struct rds_conn_path *cp)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	rdma_cm_event_handler handler = rds_rdma_cm_event_handler;
+	struct sockaddr_storage src, dest;
+	int ret;
+
+	/* XXX I wonder what affect the port space has */
+	/* CM events are delegated to the handlers in rdma_transport. */
+#if IS_ENABLED(CONFIG_IPV6)
+	if (conn->c_isv6)
+		handler = rds6_rdma_cm_event_handler;
+#endif
+
+	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		rdsdebug("rdma_create_id() failed: %d\n", ret);
+		return ret;
+	}
+
+	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+	/*
+	 * Build source and destination socket addresses.
+	 * NOTE(review): the IPv4 branch uses RDS_PORT while the IPv6 branch
+	 * uses RDS_CM_PORT; their definitions are not visible here - confirm
+	 * both are intended.
+	 */
+	if (ipv6_addr_v4mapped(&conn->c_faddr)) {
+		struct sockaddr_in *local4 = (struct sockaddr_in *)&src;
+		struct sockaddr_in *peer4 = (struct sockaddr_in *)&dest;
+
+		/* The IPv4 address is the last 32-bit word of the mapped one. */
+		local4->sin_family = AF_INET;
+		local4->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+		local4->sin_port = 0;
+
+		peer4->sin_family = AF_INET;
+		peer4->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+		peer4->sin_port = htons(RDS_PORT);
+	} else {
+		struct sockaddr_in6 *local6 = (struct sockaddr_in6 *)&src;
+		struct sockaddr_in6 *peer6 = (struct sockaddr_in6 *)&dest;
+
+		local6->sin6_family = AF_INET6;
+		local6->sin6_addr = conn->c_laddr;
+		local6->sin6_port = 0;
+		local6->sin6_scope_id = conn->c_dev_if;
+
+		peer6->sin6_family = AF_INET6;
+		peer6->sin6_addr = conn->c_faddr;
+		peer6->sin6_port = htons(RDS_CM_PORT);
+		peer6->sin6_scope_id = conn->c_dev_if;
+	}
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+				(struct sockaddr *)&dest,
+				RDS_RDMA_RESOLVE_TIMEOUT_MS);
+	if (!ret)
+		return 0;
+
+	/* Resolution could not be started: drop the cm id again. */
+	rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+		 ret);
+	rdma_destroy_id(ic->i_cm_id);
+	ic->i_cm_id = NULL;
+	return ret;
+}
+
+/*
+ * Tear down the IB state of a connection path and reset the per-connection
+ * bookkeeping so the connection can be set up again later.
+ *
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ *
+ * The IB objects are only touched when a cm_id is attached; the state
+ * reset after that block (pending transmit, ACK state, flow control,
+ * rings, i_sends/i_recvs) runs unconditionally.
+ */
+void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
+{
+ struct rds_connection *conn = cp->cp_conn;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int err = 0;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("failed to disconnect, cm: %p err %d\n",
+ ic->i_cm_id, err);
+ }
+
+ /* kick off "flush_worker" for all pools in order to reap
+ * all FRMR registrations that are still marked "FRMR_IS_INUSE"
+ */
+ rds_ib_flush_mrs();
+
+ /*
+ * We want to wait for tx and rx completion to finish
+ * before we tear down the connection, but we have to be
+ * careful not to get stuck waiting on a send ring that
+ * only has unsignaled sends in it. We've shutdown new
+ * sends before getting here so by waiting for signaled
+ * sends to complete we're ensured that there will be no
+ * more tx processing.
+ */
+ /* The last two conditions additionally wait until no FRMR is
+ * in use and every fast-registration WR credit has been
+ * returned.
+ */
+ wait_event(rds_ib_ring_empty_wait,
+ rds_ib_ring_empty(&ic->i_recv_ring) &&
+ (atomic_read(&ic->i_signaled_sends) == 0) &&
+ (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+ tasklet_kill(&ic->i_send_tasklet);
+ tasklet_kill(&ic->i_recv_tasklet);
+
+ /* NOTE(review): presumably tells the CQ handlers to stop
+ * processing completions - confirm against the readers of
+ * i_cq_quiesce, which are not visible here.
+ */
+ atomic_set(&ic->i_cq_quiesce, 1);
+
+ /* first destroy the ib state that generates callbacks */
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq) {
+ if (ic->rds_ibdev)
+ ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
+ ib_destroy_cq(ic->i_send_cq);
+ }
+
+ if (ic->i_recv_cq) {
+ if (ic->rds_ibdev)
+ ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
+ ib_destroy_cq(ic->i_recv_cq);
+ }
+
+ if (ic->rds_ibdev) {
+ /* then free the resources that ib callbacks use */
+ if (ic->i_send_hdrs) {
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+ }
+
+ if (ic->i_recv_hdrs) {
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+ }
+
+ if (ic->i_ack) {
+ rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
+ ic->i_ack_dma, DMA_TO_DEVICE);
+ ic->i_ack = NULL;
+ }
+ } else {
+ /* Without a device none of the DMA header buffers should
+ * exist; warn if any of them does.
+ */
+ WARN_ON(ic->i_send_hdrs);
+ WARN_ON(ic->i_send_hdrs_dma);
+ WARN_ON(ic->i_recv_hdrs);
+ WARN_ON(ic->i_recv_hdrs_dma);
+ WARN_ON(ic->i_ack);
+ }
+
+ if (ic->i_sends)
+ rds_ib_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_ib_recv_clear_ring(ic);
+
+ rdma_destroy_id(ic->i_cm_id);
+
+ /*
+ * Move connection back to the nodev list.
+ */
+ if (ic->rds_ibdev)
+ rds_ib_remove_conn(ic->rds_ibdev, conn);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ }
+ /* NOTE(review): relies on rds_ib_remove_conn() having cleared
+ * ic->rds_ibdev; its body is not visible here - confirm.
+ */
+ BUG_ON(ic->rds_ibdev);
+
+ /* Clear pending transmit */
+ if (ic->i_data_op) {
+ struct rds_message *rm;
+
+ rm = container_of(ic->i_data_op, struct rds_message, data);
+ rds_message_put(rm);
+ ic->i_data_op = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_set(&ic->i_ack_next, 0);
+#else
+ ic->i_ack_next = 0;
+#endif
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ /* Re-init rings, but retain sizes. */
+ rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
+ rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
+
+ if (ic->i_ibinc) {
+ rds_inc_put(&ic->i_ibinc->ii_inc);
+ ic->i_ibinc = NULL;
+ }
+
+ /* vfree(NULL) is a no-op, so no NULL checks are needed here. */
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+ ic->i_active_side = false;
+}
+
+/*
+ * Allocate and initialise the IB transport data for @conn, attach it as
+ * conn->c_transport_data and queue it on the "no device" list.
+ *
+ * Returns 0 on success, -ENOMEM if the structure cannot be allocated, or
+ * the error reported by rds_ib_recv_alloc_caches().
+ */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_ib_connection *ic;
+	unsigned long irq_flags;
+	int err;
+
+	/* XXX too lazy? */
+	ic = kzalloc(sizeof(*ic), gfp);
+	if (!ic)
+		return -ENOMEM;
+
+	err = rds_ib_recv_alloc_caches(ic, gfp);
+	if (err) {
+		kfree(ic);
+		return err;
+	}
+
+	INIT_LIST_HEAD(&ic->ib_node);
+	tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
+		     (unsigned long)ic);
+	tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
+		     (unsigned long)ic);
+	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
+	atomic_set(&ic->i_signaled_sends, 0);
+	/* Start with the full budget of fast-registration WR credits. */
+	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+
+	/*
+	 * The shutdown path (rds_ib_conn_path_shutdown()) re-initialises
+	 * and inspects these rings, so they have to be valid before it can
+	 * be called.
+	 */
+	rds_ib_ring_init(&ic->i_send_ring, 0);
+	rds_ib_ring_init(&ic->i_recv_ring, 0);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	/* No device is bound yet, so the connection lives on the nodev list. */
+	spin_lock_irqsave(&ib_nodev_conns_lock, irq_flags);
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+	spin_unlock_irqrestore(&ib_nodev_conns_lock, irq_flags);
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Release the IB transport data of a connection. The connection must
+ * already be shut down and must not be scheduled for reconnect.
+ */
+void rds_ib_conn_free(void *arg)
+{
+	struct rds_ib_connection *ic = arg;
+	spinlock_t *list_lock;
+
+	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * The connection sits either on its device's list or on the nodev
+	 * list; take the lock that protects whichever list it is on. A
+	 * concurrent shutdown() or connect() would change rds_ibdev under
+	 * us, but that is not supposed to happen at this point.
+	 */
+	if (ic->rds_ibdev)
+		list_lock = &ic->rds_ibdev->spinlock;
+	else
+		list_lock = &ib_nodev_conns_lock;
+
+	spin_lock_irq(list_lock);
+	list_del(&ic->ib_node);
+	spin_unlock_irq(list_lock);
+
+	rds_ib_recv_free_caches(ic);
+
+	kfree(ic);
+}
+
+
+/*
+ * Report an error on @conn: drop the connection first, then print the
+ * printf-style message described by @fmt and the arguments after it.
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list args;
+
+	rds_conn_drop(conn);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
new file mode 100644
index 000000000..28c1b0022
--- /dev/null
+++ b/net/rds/ib_frmr.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+/*
+ * Atomically move @ibmr's FRMR state from @old_state to @new_state.
+ *
+ * Nothing else happens unless the swap succeeded and the MR was leaving
+ * FRMR_IS_INUSE; in that case the connection's in-use counter is dropped
+ * and anybody sleeping on rds_ib_ring_empty_wait is woken.
+ */
+static inline void
+rds_transition_frwr_state(struct rds_ib_mr *ibmr,
+			  enum rds_ib_fr_state old_state,
+			  enum rds_ib_fr_state new_state)
+{
+	if (cmpxchg(&ibmr->u.frmr.fr_state, old_state, new_state) != old_state)
+		return;
+
+	if (old_state != FRMR_IS_INUSE)
+		return;
+
+	/* The fr_state update must be ordered before the counter
+	 * decrement below.
+	 */
+	smp_mb__before_atomic();
+	atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
+
+	if (waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+}
+
+/*
+ * Get an MR for a mapping of @npages pages on @rds_ibdev.
+ *
+ * Mappings of up to RDS_MR_8K_MSG_SIZE pages are served from the 8K pool,
+ * larger ones from the 1M pool. A recycled MR from the pool is returned
+ * when rds_ib_try_reuse_ibmr() finds one; otherwise a new one is
+ * allocated and initialised in state FRMR_IS_FREE.
+ *
+ * Returns the MR, or ERR_PTR(-ENOMEM) / the ib_alloc_mr() error.
+ */
+static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
+					   int npages)
+{
+	struct rds_ib_mr_pool *pool;
+	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_frmr *frmr;
+	int err = 0;
+
+	if (npages <= RDS_MR_8K_MSG_SIZE)
+		pool = rds_ibdev->mr_8k_pool;
+	else
+		pool = rds_ibdev->mr_1m_pool;
+
+	ibmr = rds_ib_try_reuse_ibmr(pool);
+	if (ibmr)
+		return ibmr;
+
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+			    rdsibdev_to_node(rds_ibdev));
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	frmr = &ibmr->u.frmr;
+	frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
+			       pool->max_pages);
+	if (IS_ERR(frmr->mr)) {
+		/* Terminate the message so it is emitted as a line of its own. */
+		pr_warn("RDS/IB: %s failed to allocate MR\n", __func__);
+		err = PTR_ERR(frmr->mr);
+		goto out_no_cigar;
+	}
+
+	ibmr->pool = pool;
+	if (pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+	/* Once the pool has grown past its soft limit, raise the soft
+	 * limit to the hard one.
+	 */
+	if (atomic_read(&pool->item_count) > pool->max_items_soft)
+		pool->max_items_soft = pool->max_items;
+
+	frmr->fr_state = FRMR_IS_FREE;
+	init_waitqueue_head(&frmr->fr_inv_done);
+	init_waitqueue_head(&frmr->fr_reg_done);
+	return ibmr;
+
+out_no_cigar:
+	/* kfree(NULL) is fine when the allocation itself failed. */
+	kfree(ibmr);
+	/* NOTE(review): presumably undoes an item_count increment made by
+	 * rds_ib_try_reuse_ibmr(), whose body is not visible here - confirm.
+	 */
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+/*
+ * Hand @ibmr back to its pool: onto the drop list when @drop is set,
+ * onto the free list otherwise. The pool's pinned-page and dirty counters
+ * are updated and a flush is scheduled once either crosses its threshold.
+ */
+static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
+{
+	struct rds_ib_mr_pool *pool = ibmr->pool;
+
+	llist_add(&ibmr->llnode, drop ? &pool->drop_list : &pool->free_list);
+
+	atomic_add(ibmr->sg_len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	/* Nothing more to do while both counters are below their limits. */
+	if (atomic_read(&pool->free_pinned) < pool->max_free_pinned &&
+	    atomic_read(&pool->dirty_count) < pool->max_items / 5)
+		return;
+
+	/* Too many pinned pages or dirty MRs: ask for a flush. */
+	queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+}
+
+/*
+ * Post a fast-registration (IB_WR_REG_MR) work request for @ibmr and wait
+ * until its completion has been processed.
+ *
+ * One i_fastreg_wrs credit is taken for the work request. On success the
+ * credit is returned by rds_ib_mr_cqe_handler() when the completion
+ * arrives; on every failure path it is returned here. Leaking it would
+ * keep i_fastreg_wrs below RDS_IB_DEFAULT_FR_WR for good, which is the
+ * value rds_ib_conn_path_shutdown() waits for.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	struct ib_reg_wr reg_wr;
+	int ret, off = 0;
+
+	/* Spin until a work request credit is available, then hold it. */
+	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		cpu_relax();
+	}
+
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len,
+				&off, PAGE_SIZE);
+	if (unlikely(ret != ibmr->sg_dma_len)) {
+		/* A short mapping without an error code is reported as -EINVAL. */
+		ret = ret < 0 ? ret : -EINVAL;
+		goto out_inc;
+	}
+
+	if (cmpxchg(&frmr->fr_state,
+		    FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) {
+		ret = -EBUSY;
+		goto out_inc;
+	}
+
+	atomic_inc(&ibmr->ic->i_fastreg_inuse_count);
+
+	/* Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR. The key used is a rolling 8bit
+	 * counter, which should guarantee uniqueness.
+	 */
+	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
+	frmr->fr_reg = true;
+
+	memset(&reg_wr, 0, sizeof(reg_wr));
+	/* The completion handler recovers the MR from wr_id. */
+	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
+	reg_wr.wr.opcode = IB_WR_REG_MR;
+	reg_wr.wr.num_sge = 0;
+	reg_wr.mr = frmr->mr;
+	reg_wr.key = frmr->mr->rkey;
+	reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+			IB_ACCESS_REMOTE_READ |
+			IB_ACCESS_REMOTE_WRITE;
+	reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
+	if (unlikely(ret)) {
+		/* Failure here can be because of -ENOMEM as well */
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
+
+		if (printk_ratelimit())
+			pr_warn("RDS/IB: %s returned error(%d)\n",
+				__func__, ret);
+		goto out_inc;
+	}
+
+	/* Wait for the registration to complete in order to prevent an invalid
+	 * access error resulting from a race between the memory region already
+	 * being accessed while registration is still pending.
+	 */
+	wait_event(frmr->fr_reg_done, !frmr->fr_reg);
+
+	return 0;
+
+out_inc:
+	/* No completion will arrive for this attempt: return the credit. */
+	atomic_inc(&ibmr->ic->i_fastreg_wrs);
+	return ret;
+}
+
+/*
+ * DMA-map the scatterlist @sg (@sg_len entries) for @ibmr and register
+ * it through rds_ib_post_reg_frmr().
+ *
+ * Only the first entry may start off a page boundary and only the last
+ * entry may end off one; any other layout is rejected with -EINVAL.
+ * Returns 0 on success, -EBUSY if the DMA mapping fails, -EMSGSIZE if the
+ * mapping needs more pages than the pool allows, or the error from the
+ * registration. On failure after mapping, the scatterlist is unmapped.
+ */
+static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
+			   struct rds_ib_mr_pool *pool,
+			   struct rds_ib_mr *ibmr,
+			   struct scatterlist *sg, unsigned int sg_len)
+{
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	struct ib_device *ibdev = rds_ibdev->dev;
+	u32 total_len = 0;
+	int ret = -EINVAL;
+	int i;
+
+	/* Drop whatever the MR was mapping before and start over with
+	 * the new scatterlist.
+	 */
+	rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = sg;
+	ibmr->sg_len = sg_len;
+	ibmr->sg_dma_len = 0;
+	frmr->sg_byte_len = 0;
+	WARN_ON(ibmr->sg_dma_len);
+	ibmr->sg_dma_len = ib_dma_map_sg(ibdev, ibmr->sg, ibmr->sg_len,
+					 DMA_BIDIRECTIONAL);
+	if (unlikely(!ibmr->sg_dma_len)) {
+		pr_warn("RDS/IB: %s failed!\n", __func__);
+		return -EBUSY;
+	}
+
+	frmr->sg_byte_len = 0;
+	frmr->dma_npages = 0;
+
+	for (i = 0; i < ibmr->sg_dma_len; ++i) {
+		unsigned int seg_len = sg_dma_len(&ibmr->sg[i]);
+		u64 seg_addr = sg_dma_address(&ibmr->sg[i]);
+
+		frmr->sg_byte_len += seg_len;
+
+		/* An unaligned start is tolerated on the first entry only
+		 * and costs one extra page.
+		 */
+		if (seg_addr & ~PAGE_MASK) {
+			if (i > 0)
+				goto out_unmap;
+			++frmr->dma_npages;
+		}
+
+		/* Likewise, only the last entry may end mid-page. */
+		if ((seg_addr + seg_len) & ~PAGE_MASK) {
+			if (i < ibmr->sg_dma_len - 1)
+				goto out_unmap;
+			++frmr->dma_npages;
+		}
+
+		total_len += seg_len;
+	}
+	frmr->dma_npages += total_len >> PAGE_SHIFT;
+
+	if (frmr->dma_npages > ibmr->pool->max_pages) {
+		ret = -EMSGSIZE;
+		goto out_unmap;
+	}
+
+	ret = rds_ib_post_reg_frmr(ibmr);
+	if (ret)
+		goto out_unmap;
+
+	if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+
+	return ret;
+
+out_unmap:
+	ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
+			DMA_BIDIRECTIONAL);
+	ibmr->sg_dma_len = 0;
+	return ret;
+}
+
+/*
+ * Post a local invalidate (IB_WR_LOCAL_INV) for @ibmr and wait until the
+ * MR has left FRMR_IS_INUSE.
+ *
+ * Returns 0 on success, -EINVAL when there is no QP or MR to work with or
+ * the MR is not in use, or the error from ib_post_send().
+ */
+static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+	struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
+	struct ib_send_wr *inv_wr = &frmr->fr_wr;
+	int ret;
+
+	if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
+		return -EINVAL;
+
+	if (frmr->fr_state != FRMR_IS_INUSE)
+		return -EINVAL;
+
+	/* Spin until a work request credit is available, then hold it. */
+	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		cpu_relax();
+	}
+
+	frmr->fr_inv = true;
+
+	memset(inv_wr, 0, sizeof(*inv_wr));
+	/* The completion handler recovers the MR from wr_id. */
+	inv_wr->wr_id = (unsigned long)(void *)ibmr;
+	inv_wr->opcode = IB_WR_LOCAL_INV;
+	inv_wr->ex.invalidate_rkey = frmr->mr->rkey;
+	inv_wr->send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(i_cm_id->qp, inv_wr, NULL);
+	if (unlikely(ret)) {
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
+		frmr->fr_inv = false;
+		/* The fr_inv update must be ordered before the credit
+		 * is handed back.
+		 */
+		smp_mb__before_atomic();
+		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
+		return ret;
+	}
+
+	/* Block until the MR has become FRMR_IS_FREE (or FRMR_IS_STALE):
+	 * 1) rds_ib_reg_frmr() immediately releases MRs that are not
+	 *    FRMR_IS_FREE, so returning early would only bounce this MR
+	 *    between the clean and drop lists;
+	 * 2) a still-pending IB_WR_LOCAL_INV must not race with the
+	 *    teardown (dma_unmap_sg, put_page) and de-registration
+	 *    (ib_dereg_mr) of the memory region, which would cause an
+	 *    invalid access error.
+	 */
+	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+
+	return ret;
+}
+
+/*
+ * Completion handler for the FRMR work requests posted by
+ * rds_ib_post_reg_frmr() and rds_ib_post_inv(). Both store the
+ * rds_ib_mr pointer in wr_id, which is how the MR is recovered here.
+ *
+ * A failed completion marks the MR stale and, if the connection is up,
+ * reports a connection error. Waiters for a pending invalidate or
+ * registration are woken, and the work request credit is returned to
+ * ic->i_fastreg_wrs.
+ */
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
+{
+	struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
+	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+	if (wc->status != IB_WC_SUCCESS) {
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
+		/* c_laddr and c_faddr are struct in6_addr, so they have to
+		 * be printed with %pI6c; %pI4 would only show their first
+		 * four bytes.
+		 */
+		if (rds_conn_up(ic->conn))
+			rds_ib_conn_error(ic->conn,
+					  "frmr completion <%pI6c,%pI6c> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
+					  &ic->conn->c_laddr,
+					  &ic->conn->c_faddr,
+					  wc->status,
+					  ib_wc_status_msg(wc->status),
+					  wc->vendor_err);
+	}
+
+	if (frmr->fr_inv) {
+		/* Has no effect if the MR was already marked stale above. */
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
+		frmr->fr_inv = false;
+		wake_up(&frmr->fr_inv_done);
+	}
+
+	if (frmr->fr_reg) {
+		frmr->fr_reg = false;
+		wake_up(&frmr->fr_reg_done);
+	}
+
+	/* enforce order of frmr->{fr_reg,fr_inv} update
+	 * before incrementing i_fastreg_wrs
+	 */
+	smp_mb__before_atomic();
+	atomic_inc(&ic->i_fastreg_wrs);
+}
+
+/*
+ * Invalidate and tear down every MR on @list (linked through unmap_list).
+ *
+ * @nfreed:   in/out count of MRs freed so far; updated on return.
+ * @unpinned: incremented by sg_len of each MR that is torn down.
+ * @goal:     MRs are freed while fewer than @goal have been freed; stale
+ *            MRs are freed regardless. MRs still in use are never freed.
+ *
+ * MRs that are not freed stay on @list.
+ */
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+		       unsigned long *unpinned, unsigned int goal)
+{
+	struct rds_ib_mr *ibmr, *tmp;
+	struct rds_ib_frmr *frmr;
+	unsigned int freed = *nfreed;
+	int first_err = 0;
+
+	/* Pass 1: invalidate every MR that still has a DMA mapping and
+	 * remember the first error.
+	 */
+	list_for_each_entry(ibmr, list, unmap_list) {
+		int rc;
+
+		if (!ibmr->sg_dma_len)
+			continue;
+
+		rc = rds_ib_post_inv(ibmr);
+		if (!first_err)
+			first_err = rc;
+	}
+
+	if (first_err)
+		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, first_err);
+
+	/* Pass 2: destroy the DMA mappings, unpin the pages and free what
+	 * may be freed.
+	 */
+	list_for_each_entry_safe(ibmr, tmp, list, unmap_list) {
+		*unpinned += ibmr->sg_len;
+		frmr = &ibmr->u.frmr;
+		__rds_ib_teardown_mr(ibmr);
+
+		/* Goal reached and the MR is not stale: keep it. */
+		if (freed >= goal && frmr->fr_state != FRMR_IS_STALE)
+			continue;
+
+		/* Don't de-allocate if the MR is not free yet */
+		if (frmr->fr_state == FRMR_IS_INUSE)
+			continue;
+
+		if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+			rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+		else
+			rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+
+		list_del(&ibmr->unmap_list);
+		if (frmr->mr)
+			ib_dereg_mr(frmr->mr);
+		kfree(ibmr);
+		freed++;
+	}
+
+	*nfreed = freed;
+}
+
+/*
+ * Register the scatterlist @sg (@nents entries) as an FRMR on connection
+ * @ic and store the resulting rkey in *@key.
+ *
+ * Returns the MR on success or an ERR_PTR: -EOPNOTSUPP without a
+ * connection, the error from rds_ib_alloc_frmr(), or the error from
+ * rds_ib_map_frmr().
+ */
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+				  struct rds_ib_connection *ic,
+				  struct scatterlist *sg,
+				  unsigned long nents, u32 *key)
+{
+	struct rds_ib_mr *ibmr;
+	struct rds_ib_frmr *frmr;
+	int err;
+
+	/* TODO: Add FRWR support for RDS_GET_MR using proxy qp*/
+	if (!ic)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/* Keep asking for MRs until one is FRMR_IS_FREE; the ones that are
+	 * not go straight onto the drop list.
+	 */
+	for (;;) {
+		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
+		if (IS_ERR(ibmr))
+			return ibmr;
+
+		frmr = &ibmr->u.frmr;
+		if (frmr->fr_state == FRMR_IS_FREE)
+			break;
+
+		rds_ib_free_frmr(ibmr, true);
+	}
+
+	ibmr->ic = ic;
+	ibmr->device = rds_ibdev;
+
+	err = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
+	if (err) {
+		rds_ib_free_frmr(ibmr, false);
+		return ERR_PTR(err);
+	}
+
+	*key = frmr->mr->rkey;
+	return ibmr;
+}
+
+/*
+ * Put @ibmr back on one of its pool's lists: the drop list when the MR is
+ * stale, the free list otherwise.
+ */
+void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_mr_pool *pool = ibmr->pool;
+	bool stale = ibmr->u.frmr.fr_state == FRMR_IS_STALE;
+
+	llist_add(&ibmr->llnode, stale ? &pool->drop_list : &pool->free_list);
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
new file mode 100644
index 000000000..ea5e9aee4
--- /dev/null
+++ b/net/rds/ib_mr.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _RDS_IB_MR_H
+#define _RDS_IB_MR_H
+
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/* Item-count constant for the 1M pool (4096); its user is outside this header. */
+#define RDS_MR_1M_POOL_SIZE (8192 / 2)
+/* rds_ib_create_mr_pool() sets the 1M pool's max_pages to this value + 1. */
+#define RDS_MR_1M_MSG_SIZE 256
+/* rds_ib_create_mr_pool() sets the 8K pool's max_pages to this value + 1. */
+#define RDS_MR_8K_MSG_SIZE 2
+/* 256 / (RDS_MR_8K_MSG_SIZE + 1), i.e. 85 with integer division. */
+#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
+/* Item-count constant for the 8K pool: RDS_MR_8K_SCALE * 4096. */
+#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
+
+/*
+ * State of a fast-registration MR, kept in rds_ib_frmr.fr_state.
+ * rds_ib_reg_frmr() only uses MRs that are FRMR_IS_FREE, and
+ * rds_ib_free_frmr_list() routes FRMR_IS_STALE MRs to the pool's drop_list.
+ */
+enum rds_ib_fr_state {
+ FRMR_IS_FREE, /* mr invalidated & ready for use */
+ FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
+ FRMR_IS_STALE, /* Stale MR and needs to be dropped */
+};
+
+/*
+ * Fast-registration state embedded in struct rds_ib_mr as u.frmr.
+ */
+struct rds_ib_frmr {
+ struct ib_mr *mr; /* verbs MR; rds_ib_reg_frmr() reports its rkey */
+ enum rds_ib_fr_state fr_state; /* see enum rds_ib_fr_state */
+ /* NOTE(review): fr_inv/fr_reg with their wait queues look like
+ * "invalidate pending" / "registration pending" flags plus waiters;
+ * their users are not in this header - confirm in ib_frmr.c.
+ */
+ bool fr_inv;
+ wait_queue_head_t fr_inv_done;
+ bool fr_reg;
+ wait_queue_head_t fr_reg_done;
+ struct ib_send_wr fr_wr; /* send work request kept with the MR */
+ /* NOTE(review): presumably page count and byte length of the current
+ * mapping - confirm against rds_ib_map_frmr().
+ */
+ unsigned int dma_npages;
+ unsigned int sg_byte_len;
+};
+
+/* This is stored as mr->r_trans_private. */
+struct rds_ib_mr {
+ struct delayed_work work; /* used by rds_ib_free_mr() to defer ODP dereg */
+ struct rds_ib_device *device; /* device the MR was registered on */
+ struct rds_ib_mr_pool *pool; /* owning pool (not set on the ODP path) */
+ struct rds_ib_connection *ic; /* connection set by rds_ib_reg_frmr() */
+
+ struct llist_node llnode; /* link on the pool's drop/free/clean llists */
+
+ /* unmap_list is for freeing */
+ struct list_head unmap_list;
+ unsigned int remap_count; /* NOTE(review): not used in ib_rdma.c - confirm users */
+
+ struct scatterlist *sg; /* kfree()d by __rds_ib_teardown_mr() */
+ unsigned int sg_len; /* entries in sg; one page is released per entry */
+ int sg_dma_len; /* non-zero while sg is DMA-mapped */
+
+ u8 odp:1; /* set for on-demand-paging MRs, which use u.mr */
+ union {
+ struct rds_ib_frmr frmr;
+ struct ib_mr *mr;
+ } u;
+};
+
+/* Our own little MR pool */
+struct rds_ib_mr_pool {
+ unsigned int pool_type; /* RDS_IB_MR_8K_POOL or RDS_IB_MR_1M_POOL */
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct delayed_work flush_worker; /* flush worker */
+
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # of dirty MRs */
+
+ struct llist_head drop_list; /* stale MRs, see rds_ib_free_frmr_list() */
+ struct llist_head free_list; /* unused MRs */
+ struct llist_head clean_list; /* unused & unmapped MRs */
+ wait_queue_head_t flush_wait; /* waiters for a flush in progress */
+ spinlock_t clean_lock; /* "clean_list" concurrency */
+
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items; /* hard limit checked by rds_ib_try_reuse_ibmr() */
+ unsigned long max_items_soft; /* set to 3/4 of the device's max_mrs */
+ unsigned long max_free_pinned; /* free_pinned level that triggers a flush */
+ unsigned int max_pages; /* RDS_MR_*_MSG_SIZE + 1 */
+};
+
+/* Workqueue used for pool flushes and deferred ODP MR deregistration. */
+extern struct workqueue_struct *rds_ib_mr_wq;
+extern bool prefer_frmr;
+
+/* Pool lifetime; @pool_type is RDS_IB_MR_1M_POOL or RDS_IB_MR_8K_POOL. */
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+					     int pool_type);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool);
+
+/* Transport MR operations; trans_private is a struct rds_ib_mr. */
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret,
+		    struct rds_connection *conn, u64 start, u64 length,
+		    int need_odp);
+void rds_ib_sync_mr(void *trans_private, int direction);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+int rds_ib_mr_init(void);
+void rds_ib_mr_exit(void);
+u32 rds_ib_get_lkey(void *trans_private);
+
+/* Pool internals shared between ib_rdma.c and ib_frmr.c. */
+void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool);
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all,
+			 struct rds_ib_mr **ibmr_ret);
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool);
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+				  struct rds_ib_connection *ic,
+				  struct scatterlist *sg,
+				  unsigned long nents, u32 *key);
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+		       unsigned long *unpinned, unsigned int goal);
+void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr);
+#endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000..8f070ee7e
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/llist.h>
+
+#include "rds_single_path.h"
+#include "ib_mr.h"
+#include "rds.h"
+
+/* Created in rds_ib_mr_init(); runs pool flushes and deferred ODP dereg. */
+struct workqueue_struct *rds_ib_mr_wq;
+/* NOTE(review): not referenced anywhere else in this file - confirm it is still needed. */
+struct rds_ib_dereg_odp_mr {
+ struct work_struct work;
+ struct ib_mr *mr;
+};
+
+/* Defined at the end of the file; queued by rds_ib_free_mr() for ODP MRs. */
+static void rds_ib_odp_mr_worker(struct work_struct *work);
+
+/*
+ * Find the RDS/IB device that has @ipaddr on its ipaddr_list.
+ *
+ * Both lists are walked under rcu_read_lock().  On a match the device's
+ * refcount is raised before the read section ends, so the caller owns a
+ * reference (callers in this file release it with rds_ib_dev_put()).
+ * Returns NULL when no device carries the address.
+ */
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+	struct rds_ib_device *dev;
+	struct rds_ib_device *found = NULL;
+	struct rds_ib_ipaddr *entry;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(dev, &rds_ib_devices, list) {
+		list_for_each_entry_rcu(entry, &dev->ipaddr_list, list) {
+			if (entry->ipaddr != ipaddr)
+				continue;
+			refcount_inc(&dev->refcount);
+			found = dev;
+			goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Append @ipaddr to @rds_ibdev's RCU-protected address list.
+ * Returns 0, or -ENOMEM if the list entry cannot be allocated.
+ */
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+	entry->ipaddr = ipaddr;
+
+	/* writers are serialized by the device spinlock */
+	spin_lock_irq(&rds_ibdev->spinlock);
+	list_add_tail_rcu(&entry->list, &rds_ibdev->ipaddr_list);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	return 0;
+}
+
+/*
+ * Remove the first entry matching @ipaddr from @rds_ibdev's address list.
+ *
+ * The entry is unlinked with list_del_rcu() under the device spinlock and
+ * freed with kfree_rcu() after the lock is dropped, so RCU readers such as
+ * rds_ib_get_device() may still be looking at it.  Nothing happens if the
+ * address is not on the list.
+ */
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *i_ipaddr;
+	struct rds_ib_ipaddr *to_free = NULL;
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	/*
+	 * This is the update side: the spinlock excludes every other writer,
+	 * so the plain iterator is the right one.  The _rcu iterator is meant
+	 * for read-side sections and is not inside one here.
+	 */
+	list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+		if (i_ipaddr->ipaddr == ipaddr) {
+			list_del_rcu(&i_ipaddr->list);
+			to_free = i_ipaddr;
+			break;
+		}
+	}
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (to_free)
+		kfree_rcu(to_free, rcu);
+}
+
+/*
+ * Make @rds_ibdev the owner of the address held in the last 32 bits of
+ * @ipaddr.  If another device currently lists the address it is removed
+ * there first; if @rds_ibdev already lists it nothing changes.
+ * Returns 0 or the error from rds_ib_add_ipaddr().
+ */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+			 struct in6_addr *ipaddr)
+{
+	__be32 addr4 = ipaddr->s6_addr32[3];
+	struct rds_ib_device *owner = rds_ib_get_device(addr4);
+
+	if (!owner)
+		return rds_ib_add_ipaddr(rds_ibdev, addr4);
+
+	if (owner == rds_ibdev) {
+		/* already ours: just drop the lookup reference */
+		rds_ib_dev_put(owner);
+		return 0;
+	}
+
+	rds_ib_remove_ipaddr(owner, addr4);
+	rds_ib_dev_put(owner);
+	return rds_ib_add_ipaddr(rds_ibdev, addr4);
+}
+
+/*
+ * Move @conn's IB connection from the global ib_nodev_conns list onto
+ * @rds_ibdev's conn_list and take a device reference for it.
+ *
+ * Lock nesting: ib_nodev_conns_lock (IRQs disabled) is the outer lock and
+ * rds_ibdev->spinlock is taken inside it.
+ */
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ BUG_ON(list_empty(&ib_nodev_conns));
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+
+ spin_lock(&rds_ibdev->spinlock);
+ list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+ spin_unlock(&rds_ibdev->spinlock);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ /* the connection now pins the device until rds_ib_remove_conn() */
+ ic->rds_ibdev = rds_ibdev;
+ refcount_inc(&rds_ibdev->refcount);
+}
+
+/*
+ * Move @conn's IB connection from @rds_ibdev's conn_list back onto the
+ * global ib_nodev_conns list and drop the device reference taken by
+ * rds_ib_add_conn().
+ *
+ * Lock nesting mirrors rds_ib_add_conn(): ib_nodev_conns_lock is taken with
+ * IRQs disabled and rds_ibdev->spinlock nests inside it.  Disabling IRQs on
+ * the outer lock (rather than the inner one) keeps them off for the whole
+ * time ib_nodev_conns_lock is held.  Must be called with IRQs enabled.
+ */
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock_irq(&ib_nodev_conns_lock);
+
+	spin_lock(&rds_ibdev->spinlock);
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+	spin_unlock(&rds_ibdev->spinlock);
+
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
+}
+
+/*
+ * Destroy every connection that is not attached to a device.
+ *
+ * The connections are first moved to a private list under
+ * ib_nodev_conns_lock so that rds_conn_destroy() runs with IRQs enabled.
+ * list_splice_init() is used so that ib_nodev_conns is left as a valid
+ * empty list; a plain list_splice() would leave its head pointing at
+ * entries that are destroyed below.
+ */
+void rds_ib_destroy_nodev_conns(void)
+{
+	struct rds_ib_connection *ic, *_ic;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice_init(&ib_nodev_conns, &tmp_list);
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+		rds_conn_destroy(ic->conn);
+}
+
+/*
+ * Report the 1M pool's item limit and per-MR page limit in @iinfo.
+ */
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+	const struct rds_ib_mr_pool *pool = rds_ibdev->mr_1m_pool;
+
+	iinfo->rdma_mr_size = pool->max_pages;
+	iinfo->rdma_mr_max = pool->max_items;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * IPv6 counterpart of rds_ib_get_mr_info(): report the 1M pool's item
+ * limit and per-MR page limit in @iinfo6.
+ */
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6)
+{
+	const struct rds_ib_mr_pool *pool = rds_ibdev->mr_1m_pool;
+
+	iinfo6->rdma_mr_size = pool->max_pages;
+	iinfo6->rdma_mr_max = pool->max_items;
+}
+#endif
+
+/*
+ * Take one MR off @pool's clean_list, or return NULL if the list is empty.
+ * The removal is done under clean_lock; a successful reuse is counted in
+ * the per-pool-type "reused" statistic.
+ */
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
+{
+	struct llist_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->clean_lock, flags);
+	node = llist_del_first(&pool->clean_list);
+	spin_unlock_irqrestore(&pool->clean_lock, flags);
+	if (!node)
+		return NULL;
+
+	if (pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
+
+	return llist_entry(node, struct rds_ib_mr, llnode);
+}
+
+/*
+ * Synchronize the MR's DMA mapping for the CPU (DMA_FROM_DEVICE) or for
+ * the device (DMA_TO_DEVICE).  ODP MRs and any other @direction value are
+ * left alone.
+ */
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+
+	if (ibmr->odp)
+		return;
+
+	if (direction == DMA_FROM_DEVICE)
+		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
+				       ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+	else if (direction == DMA_TO_DEVICE)
+		ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
+					  ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+}
+
+/*
+ * Undo the DMA mapping of @ibmr and release its scatterlist.
+ *
+ * Every page in the scatterlist is marked dirty and released, then the
+ * scatterlist itself is freed.  sg, sg_len and sg_dma_len are reset so a
+ * second call is a no-op.  Pool accounting is not touched here; see
+ * rds_ib_teardown_mr() for the variant that adjusts free_pinned.
+ */
+void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ if (ibmr->sg_dma_len) {
+ /* unmap with the original entry count (sg_len), not sg_dma_len */
+ ib_dma_unmap_sg(rds_ibdev->dev,
+ ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ }
+
+ /* Release the s/g list */
+ if (ibmr->sg_len) {
+ unsigned int i;
+
+ for (i = 0; i < ibmr->sg_len; ++i) {
+ struct page *page = sg_page(&ibmr->sg[i]);
+
+ /* FIXME we need a way to tell a r/w MR
+ * from a r/o MR */
+ WARN_ON(!page->mapping && irqs_disabled());
+ set_page_dirty(page);
+ put_page(page);
+ }
+ kfree(ibmr->sg);
+
+ ibmr->sg = NULL;
+ ibmr->sg_len = 0;
+ }
+}
+
+/*
+ * Tear down @ibmr and subtract the pages it had pinned from its pool's
+ * free_pinned counter.
+ */
+void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+	/* sample before teardown: __rds_ib_teardown_mr() zeroes sg_len */
+	unsigned int npinned = ibmr->sg_len;
+
+	__rds_ib_teardown_mr(ibmr);
+	if (!npinned)
+		return;
+
+	atomic_sub(npinned, &ibmr->pool->free_pinned);
+}
+
+/*
+ * Number of MRs a flush should try to free: every MR in the pool when
+ * @free_all is set, otherwise none.
+ */
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+	return free_all ? atomic_read(&pool->item_count) : 0;
+}
+
+/*
+ * Detach every MR from @llist and append it, via its unmap_list member,
+ * to the tail of @list.  Returns how many MRs were moved.
+ */
+static unsigned int llist_append_to_list(struct llist_head *llist,
+					 struct list_head *list)
+{
+	struct llist_node *pos;
+	struct llist_node *following;
+	unsigned int moved = 0;
+
+	for (pos = llist_del_all(llist); pos; pos = following) {
+		struct rds_ib_mr *ibmr = llist_entry(pos, struct rds_ib_mr, llnode);
+
+		/* remember the successor before moving on */
+		following = pos->next;
+		list_add_tail(&ibmr->unmap_list, list);
+		moved++;
+	}
+
+	return moved;
+}
+
+/*
+ * Chain the llnode of every MR on @list (linked through unmap_list) into
+ * one NULL-terminated run of llist nodes, in list order.
+ *
+ * @nodes_head receives the first node and @nodes_tail the last; both are
+ * set to NULL when @list is empty.  The result is suitable for
+ * llist_add_batch().
+ */
+static void list_to_llist_nodes(struct list_head *list,
+ struct llist_node **nodes_head,
+ struct llist_node **nodes_tail)
+{
+ struct rds_ib_mr *ibmr;
+ struct llist_node *cur = NULL;
+ /* points at the link to fill next: first the head, then cur->next */
+ struct llist_node **next = nodes_head;
+
+ list_for_each_entry(ibmr, list, unmap_list) {
+ cur = &ibmr->llnode;
+ *next = cur;
+ next = &cur->next;
+ }
+ *next = NULL;
+ *nodes_tail = cur;
+}
+
+/*
+ * Flush our pool of MRs.
+ *
+ * All MRs on the drop_list and free_list (and, with @free_all, the
+ * clean_list too) are handed to rds_ib_unreg_frmr().  Unless @free_all is
+ * set the free goal is 0, so only the MRs that rds_ib_unreg_frmr() decides
+ * to release regardless of the goal are freed.  Whatever is left on the
+ * list afterwards is put on the clean_list for reuse.
+ *
+ * @pool:     pool to flush
+ * @free_all: non-zero when tearing the pool down
+ * @ibmr_ret: optional.  When non-NULL the caller wants one clean MR back:
+ *            instead of blocking on flush_lock it waits on flush_wait and
+ *            takes the first MR that appears on the clean_list.  It is
+ *            only written when an MR is found, so the caller must
+ *            initialize it to NULL.
+ *
+ * Always returns 0.  May sleep.
+ */
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			 int free_all, struct rds_ib_mr **ibmr_ret)
+{
+	struct rds_ib_mr *ibmr;
+	struct llist_node *clean_nodes;
+	struct llist_node *clean_tail;
+	LIST_HEAD(unmap_list);
+	unsigned long unpinned = 0;
+	unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
+
+	if (pool->pool_type == RDS_IB_MR_8K_POOL)
+		rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
+	else
+		rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);
+
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		while (!mutex_trylock(&pool->flush_lock)) {
+			/* someone else is flushing: maybe they left us an MR */
+			ibmr = rds_ib_reuse_mr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (llist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_mr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_mr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
+		}
+	}
+
+	/* Get the list of all MRs to be dropped. Ordering matters -
+	 * we want to put drop_list ahead of free_list.
+	 */
+	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
+	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
+	if (free_all) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->clean_lock, flags);
+		llist_append_to_list(&pool->clean_list, &unmap_list);
+		spin_unlock_irqrestore(&pool->clean_lock, flags);
+	}
+
+	free_goal = rds_ib_flush_goal(pool, free_all);
+
+	if (list_empty(&unmap_list))
+		goto out;
+
+	rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
+
+	if (!list_empty(&unmap_list)) {
+		unsigned long flags;
+
+		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
+		if (ibmr_ret) {
+			/* hand the first survivor straight to the caller */
+			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
+			clean_nodes = clean_nodes->next;
+		}
+		/* more than one entry in llist nodes */
+		if (clean_nodes) {
+			spin_lock_irqsave(&pool->clean_lock, flags);
+			llist_add_batch(clean_nodes, clean_tail,
+					&pool->clean_list);
+			spin_unlock_irqrestore(&pool->clean_lock, flags);
+		}
+	}
+
+	atomic_sub(unpinned, &pool->free_pinned);
+	atomic_sub(dirty_to_clean, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+out:
+	mutex_unlock(&pool->flush_lock);
+	/*
+	 * wq_has_sleeper() issues the full barrier that a bare
+	 * waitqueue_active() needs: the unlock and clean_list updates above
+	 * must be visible before we look for waiters, otherwise a waiter that
+	 * has just queued itself and seen an empty clean_list could be missed.
+	 */
+	if (wq_has_sleeper(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
+	return 0;
+}
+
+/*
+ * Try to obtain an already-allocated MR from @pool.
+ *
+ * Returns a clean MR if one can be found, directly or after flushing the
+ * pool.  Returns NULL in two different situations:
+ *  - item_count was incremented and is still within max_items
+ *    (NOTE(review): presumably the caller then allocates a new MR against
+ *    the slot just reserved - confirm in the caller);
+ *  - the pool stayed over its limit for more than two attempts, which is
+ *    counted as "pool depleted"; item_count is not left incremented here.
+ */
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ int iter = 0;
+
+ while (1) {
+ ibmr = rds_ib_reuse_mr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* optimistically reserve a slot; undone below if over the limit */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
+ break;
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
+
+ /* ibmr is NULL here, so it only changes if the flush finds an MR */
+ rds_ib_flush_mr_pool(pool, 0, &ibmr);
+ if (ibmr)
+ return ibmr;
+ }
+
+ return NULL;
+}
+
+/*
+ * Delayed-work handler for a pool's flush_worker: run a normal
+ * (not free-all) flush on the owning pool.
+ */
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_ib_mr_pool *pool;
+
+	pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+	rds_ib_flush_mr_pool(pool, 0, NULL);
+}
+
+/*
+ * Release an MR obtained from rds_ib_get_mr().
+ *
+ * ODP MRs are deregistered and freed from the rds_ib_mr_wq workqueue.
+ * Pool MRs go back onto the pool's free or drop list, the pool's pinned
+ * and dirty counters are updated, and a flush is scheduled once either
+ * crosses its threshold.  With @invalidate set the pool is flushed right
+ * away, or via the workqueue when called from interrupt context.
+ *
+ * NOTE(review): the ODP branch returns before rds_ib_dev_put(), while the
+ * ODP path of rds_ib_get_mr() returns with the reference from
+ * rds_ib_get_device() still held - confirm whether that reference is
+ * dropped elsewhere.
+ */
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+ if (ibmr->odp) {
+ /* A MR created and marked as use_once. We use delayed work,
+ * because there is a chance that we are in interrupt and can't
+ * call to ib_dereg_mr() directly.
+ */
+ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+ queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+ return;
+ }
+
+ /* Return it to the pool's free list */
+ rds_ib_free_frmr_list(ibmr);
+
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_ib_flush_mr_pool(pool, 0, NULL);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time.
+ */
+ queue_delayed_work(rds_ib_mr_wq,
+ &pool->flush_worker, 10);
+ }
+ }
+
+ /* drop the device reference that was taken when the MR was created */
+ rds_ib_dev_put(rds_ibdev);
+}
+
+/*
+ * Run a normal (not free-all) flush on the 8K and 1M pools of every
+ * device on rds_ib_devices.  The device list is held stable with
+ * rds_ib_devices_lock taken for reading; pools that do not exist are
+ * skipped.
+ */
+void rds_ib_flush_mrs(void)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ down_read(&rds_ib_devices_lock);
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ if (rds_ibdev->mr_8k_pool)
+ rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);
+
+ if (rds_ibdev->mr_1m_pool)
+ rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
+ }
+ up_read(&rds_ib_devices_lock);
+}
+
+/*
+ * Return the lkey of the verbs MR behind @trans_private
+ * (a struct rds_ib_mr), read through the u.mr member.
+ */
+u32 rds_ib_get_lkey(void *trans_private)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+	struct ib_mr *mr = ibmr->u.mr;
+
+	return mr->lkey;
+}
+
+/*
+ * Create an MR for an RDS socket.
+ *
+ * The device is looked up from the last 32 bits of the socket's bound
+ * address; the reference taken by that lookup is dropped on every error
+ * path below.
+ *
+ * With @need_odp equal to ODP_ZEROBASED or ODP_VIRTUAL, a user MR with
+ * IB_ACCESS_ON_DEMAND is registered for @start/@length (at virtual
+ * address 0 for ODP_ZEROBASED) and a prefetch is requested for it.
+ * Otherwise @sg/@nents are registered through rds_ib_reg_frmr() on
+ * @conn's IB connection.
+ *
+ * @key_ret receives the rkey on success.
+ *
+ * Returns the new struct rds_ib_mr, or an ERR_PTR(): -ENODEV (no device
+ * for the bound address, or pools missing), -EOPNOTSUPP (device is not
+ * ODP capable), -ENOMEM, or the error reported by ib_reg_user_mr() /
+ * rds_ib_reg_frmr().
+ */
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret,
+		    struct rds_connection *conn,
+		    u64 start, u64 length, int need_odp)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_mr *ibmr = NULL;
+	struct rds_ib_connection *ic = NULL;
+	int ret;
+
+	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
+	if (!rds_ibdev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+		u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+		int access_flags =
+			(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+			 IB_ACCESS_ON_DEMAND);
+		struct ib_sge sge = {};
+		struct ib_mr *ib_mr;
+
+		if (!rds_ibdev->odp_capable) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+
+		ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+				       access_flags);
+
+		if (IS_ERR(ib_mr)) {
+			/* log the real error code, not the IS_ERR() boolean */
+			ret = PTR_ERR(ib_mr);
+			rdsdebug("rds_ib_get_user_mr returned %d\n", ret);
+			goto out;
+		}
+		if (key_ret)
+			*key_ret = ib_mr->rkey;
+
+		ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+		if (!ibmr) {
+			ib_dereg_mr(ib_mr);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ibmr->u.mr = ib_mr;
+		ibmr->odp = 1;
+
+		sge.addr = virt_addr;
+		sge.length = length;
+		sge.lkey = ib_mr->lkey;
+
+		/* best-effort prefetch; the result is deliberately ignored */
+		ib_advise_mr(rds_ibdev->pd,
+			     IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+			     IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+		return ibmr;
+	}
+
+	if (conn)
+		ic = conn->c_transport_data;
+
+	if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
+	if (IS_ERR(ibmr)) {
+		ret = PTR_ERR(ibmr);
+		pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
+	} else {
+		return ibmr;
+	}
+
+out:
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Destroy @pool: stop its flush worker, run a free-all flush and free the
+ * pool itself.  Warns if any MR or pinned page is still accounted to the
+ * pool after the flush.
+ */
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ /* wait for a running flush and prevent it from being requeued */
+ cancel_delayed_work_sync(&pool->flush_worker);
+ rds_ib_flush_mr_pool(pool, 1, NULL);
+ WARN_ON(atomic_read(&pool->item_count));
+ WARN_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+/*
+ * Allocate and initialize an MR pool of the given @pool_type for
+ * @rds_ibdev.  RDS_IB_MR_1M_POOL selects the 1M limits; any other value
+ * is treated as RDS_IB_MR_8K_POOL.  Returns the pool or
+ * ERR_PTR(-ENOMEM).
+ */
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+					     int pool_type)
+{
+	struct rds_ib_mr_pool *pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	pool->pool_type = pool_type;
+
+	init_llist_head(&pool->free_list);
+	init_llist_head(&pool->drop_list);
+	init_llist_head(&pool->clean_list);
+	spin_lock_init(&pool->clean_lock);
+	mutex_init(&pool->flush_lock);
+	init_waitqueue_head(&pool->flush_wait);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+	/* the "+ 1" on max_pages allows for unaligned MRs */
+	switch (pool_type) {
+	case RDS_IB_MR_1M_POOL:
+		pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
+		pool->max_items = rds_ibdev->max_1m_mrs;
+		break;
+	default:
+		/* pool_type == RDS_IB_MR_8K_POOL */
+		pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
+		pool->max_items = rds_ibdev->max_8k_mrs;
+		break;
+	}
+
+	pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
+	pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
+
+	return pool;
+}
+
+/*
+ * Create the "rds_mr_flushd" workqueue used for MR pool flushing.
+ * Returns 0 on success or -ENOMEM.
+ */
+int rds_ib_mr_init(void)
+{
+	rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0);
+
+	return rds_ib_mr_wq ? 0 : -ENOMEM;
+}
+
+/* By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ *
+ * Destroys the workqueue created by rds_ib_mr_init().
+ */
+void rds_ib_mr_exit(void)
+{
+ destroy_workqueue(rds_ib_mr_wq);
+}
+
+/*
+ * Work handler queued by rds_ib_free_mr() for ODP MRs: deregister the
+ * verbs MR and free the wrapping rds_ib_mr.
+ */
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+	struct rds_ib_mr *odp_mr = container_of(work, struct rds_ib_mr, work.work);
+
+	ib_dereg_mr(odp_mr->u.mr);
+	kfree(odp_mr);
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000..cfbf0e129
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,1094 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "ib.h"
+
+/* slab that struct rds_ib_incoming objects are allocated from */
+static struct kmem_cache *rds_ib_incoming_slab;
+/* slab that struct rds_page_frag objects are allocated from */
+static struct kmem_cache *rds_ib_frag_slab;
+/* number of live rds_ib_incoming objects, capped by rds_ib_sysctl_max_recv_allocation */
+static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+
+/*
+ * Initialize every receive work request of @ic's ring: no inc or frag
+ * attached, wr_id equal to the ring index, and two SGEs - the header
+ * buffer for that slot and a data fragment whose address is filled in
+ * later.
+ */
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_recv_ring.w_nr; i++) {
+		struct rds_ib_recv_work *recv = &ic->i_recvs[i];
+		struct ib_sge *hdr_sge = &recv->r_sge[0];
+		struct ib_sge *data_sge = &recv->r_sge[1];
+
+		recv->r_ibinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = i;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+		hdr_sge->addr = ic->i_recv_hdrs_dma[i];
+		hdr_sge->length = sizeof(struct rds_header);
+		hdr_sge->lkey = ic->i_pd->local_dma_lkey;
+
+		data_sge->addr = 0;
+		data_sge->length = RDS_FRAG_SIZE;
+		data_sge->lkey = ic->i_pd->local_dma_lkey;
+	}
+}
+
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ *
+ * 'from' is an element of a circular list, not a separate list head.
+ * Its last element (from->prev) is used as a stand-in head so that
+ * list_splice_tail() moves every other element, then that last element
+ * is appended itself.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+ struct list_head *to)
+{
+ struct list_head *from_last = from->prev;
+
+ list_splice_tail(from_last, to);
+ list_add_tail(from_last, to);
+}
+
+/*
+ * Atomically take whatever is parked on cache->xfer and make it available
+ * on cache->ready, appending to the existing ready list if there is one.
+ */
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *batch = xchg(&cache->xfer, NULL);
+
+	if (!batch)
+		return;
+
+	if (!cache->ready) {
+		cache->ready = batch;
+		return;
+	}
+
+	list_splice_entire_tail(batch, cache->ready);
+}
+
+/*
+ * Allocate the per-CPU heads of @cache and reset the cache to empty.
+ * Returns 0 or -ENOMEM.
+ */
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
+{
+	int cpu;
+
+	cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
+	if (!cache->percpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct rds_ib_cache_head *head = per_cpu_ptr(cache->percpu, cpu);
+
+		head->first = NULL;
+		head->count = 0;
+	}
+
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+/*
+ * Allocate the inc and frag refill caches of @ic.  If the frag cache
+ * cannot be allocated the inc cache's per-CPU memory is released again.
+ * Returns 0 or the error from rds_ib_recv_alloc_cache().
+ */
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
+	if (ret)
+		return ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
+	if (ret)
+		free_percpu(ic->i_cache_incs.percpu);
+
+	return ret;
+}
+
+/*
+ * Move every cached item of @cache - each possible CPU's list and the
+ * ready list - onto the tail of @caller_list, leaving those lists empty.
+ * cache->xfer is not touched here.
+ */
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+ struct list_head *caller_list)
+{
+ struct rds_ib_cache_head *head;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(cache->percpu, cpu);
+ if (head->first) {
+ list_splice_entire_tail(head->first, caller_list);
+ head->first = NULL;
+ }
+ }
+
+ if (cache->ready) {
+ list_splice_entire_tail(cache->ready, caller_list);
+ cache->ready = NULL;
+ }
+}
+
+/*
+ * Free both refill caches of @ic.  For each cache the xfer list is folded
+ * into ready, all lists are collected, the per-CPU heads are released and
+ * every cached object is returned to its slab.
+ */
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+ struct rds_ib_incoming *inc;
+ struct rds_ib_incoming *inc_tmp;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *frag_tmp;
+ LIST_HEAD(list);
+
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+ free_percpu(ic->i_cache_incs.percpu);
+
+ list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+ list_del(&inc->ii_cache_entry);
+ WARN_ON(!list_empty(&inc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, inc);
+ /* matches the increment in rds_ib_refill_one_inc() */
+ atomic_dec(&rds_ib_allocation);
+ }
+
+ /* 'list' is empty again here and is reused for the frags */
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+ free_percpu(ic->i_cache_frags.percpu);
+
+ list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+ list_del(&frag->f_cache_entry);
+ WARN_ON(!list_empty(&frag->f_item));
+ kmem_cache_free(rds_ib_frag_slab, frag);
+ }
+}
+
+/* Forward declarations: the cache put/get helpers are used below before
+ * their definitions.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg
+ *
+ * The frag is put on the connection's frag cache rather than freed;
+ * i_cache_allocs (kept in KiB) and the added-to-cache statistic are
+ * bumped by one fragment.
+ */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+ struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+ rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+ atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+ rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
+}
+
+/* Recycle inc after freeing attached frags
+ *
+ * @inc is the generic part embedded in a struct rds_ib_incoming.  Every
+ * frag on its ii_frags list is recycled through rds_ib_frag_free(), then
+ * the rds_ib_incoming itself is put on the connection's inc cache.
+ */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+ struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+ /* Free attached frags */
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_free(ic, frag);
+ }
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
+/*
+ * Release whatever @recv still holds: drop the reference on its inc and
+ * unmap and recycle its data fragment.
+ */
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+				  struct rds_ib_recv_work *recv)
+{
+	if (recv->r_ibinc) {
+		rds_inc_put(&recv->r_ibinc->ii_inc);
+		recv->r_ibinc = NULL;
+	}
+
+	if (!recv->r_frag)
+		return;
+
+	ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+	rds_ib_frag_free(ic, recv->r_frag);
+	recv->r_frag = NULL;
+}
+
+/*
+ * Release the inc and frag held by every entry of @ic's receive ring.
+ */
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+	struct rds_ib_recv_work *recv;
+	u32 i;
+
+	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++)
+		rds_ib_recv_clear_one(ic, recv);
+}
+
+/*
+ * Get an rds_ib_incoming for a receive slot: from the connection's inc
+ * cache if possible, otherwise from the slab using @slab_mask.
+ *
+ * Fresh allocations are counted in rds_ib_allocation and refused once
+ * that reaches rds_ib_sysctl_max_recv_allocation.  The returned inc has
+ * an empty frag list and is initialized for @ic's connection.
+ * Returns NULL if the limit is hit or the allocation fails.
+ */
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+ gfp_t slab_mask)
+{
+ struct rds_ib_incoming *ibinc;
+ struct list_head *cache_item;
+ int avail_allocs;
+
+ cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+ if (cache_item) {
+ ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+ } else {
+ /* increments unless already at the limit; 0 means "at the limit" */
+ avail_allocs = atomic_add_unless(&rds_ib_allocation,
+ 1, rds_ib_sysctl_max_recv_allocation);
+ if (!avail_allocs) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ return NULL;
+ }
+ ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+ if (!ibinc) {
+ /* give back the slot reserved above */
+ atomic_dec(&rds_ib_allocation);
+ return NULL;
+ }
+ rds_ib_stats_inc(s_ib_rx_total_incs);
+ }
+ INIT_LIST_HEAD(&ibinc->ii_frags);
+ rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
+
+ return ibinc;
+}
+
+/*
+ * Get a data fragment for a receive slot: from the connection's frag
+ * cache if possible, otherwise a new rds_page_frag from the slab
+ * (@slab_mask) backed by RDS_FRAG_SIZE bytes of page memory
+ * (@page_mask).
+ *
+ * Taking a frag out of the cache reverses the accounting done by
+ * rds_ib_frag_free(): i_cache_allocs is decremented and the
+ * removed-from-cache statistic is bumped.
+ *
+ * Returns the frag with an initialized f_item, or NULL on allocation
+ * failure.
+ */
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+		/*
+		 * Count the removal, not another addition.
+		 * NOTE(review): s_ib_recv_removed_from_cache is expected next
+		 * to s_ib_recv_added_to_cache in struct rds_ib_statistics
+		 * (ib.h) - confirm.
+		 */
+		rds_ib_stats_add(s_ib_recv_removed_from_cache, RDS_FRAG_SIZE);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
+		rds_ib_stats_inc(s_ib_rx_total_frags);
+	}
+
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+/*
+ * Give one receive work request everything it needs before it is posted:
+ * an rds_ib_incoming (kept from before if the slot still has one) and a
+ * freshly DMA-mapped data fragment, with both SGEs pointing at the
+ * right buffers.
+ *
+ * @gfp: allocation context; when direct reclaim is allowed the slab
+ *       objects are allocated with GFP_KERNEL and the pages with
+ *       GFP_HIGHUSER.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation or the DMA mapping
+ * fails.  On a mapping failure the fragment is recycled and the slot is
+ * left without one, so an unmapped buffer is never posted or unmapped.
+ */
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, gfp_t gfp)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = gfp;
+	gfp_t page_mask = gfp;
+
+	if (gfp & __GFP_DIRECT_RECLAIM) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
+	}
+
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
+	/*
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
+	 */
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
+
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	if (WARN_ON(ret != 1)) {
+		/* nothing was mapped: recycle the frag and report failure */
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sge = &recv->r_sge[0];
+	sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
+	sge->length = sizeof(struct rds_header);
+
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/* Try to take the connection's refill bit; non-zero means we now own it. */
+static int acquire_refill(struct rds_connection *conn)
+{
+	return !test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags);
+}
+
+/* Drop the refill bit and wake anyone sleeping on the connection's waitq. */
+static void release_refill(struct rds_connection *conn)
+{
+	clear_bit(RDS_RECV_REFILL, &conn->c_flags);
+	smp_mb__after_atomic();
+
+	/*
+	 * wait_on_bit()/wake_up_bit() are deliberately not used: this wake
+	 * sits in a hot path and waiters are very rare, so walking the
+	 * system-wide hashed waitqueue buckets here would almost always be
+	 * wasted work.
+	 */
+	if (!waitqueue_active(&conn->c_waitq))
+		return;
+
+	wake_up_all(&conn->c_waitq);
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.
+ *
+ * @prefill: when non-zero, post even if the connection is not up yet.
+ * @gfp: allocation mask handed to rds_ib_recv_refill_one(); whether it
+ * contains __GFP_DIRECT_RECLAIM also decides the requeue policy at the end.
+ */
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_recv_work *recv;
+ unsigned int posted = 0;
+ int ret = 0;
+ bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
+ bool must_wake = false;
+ u32 pos;
+
+ /* the goal here is to just make sure that someone, somewhere
+ * is posting buffers. If we can't get the refill lock,
+ * let them do their thing
+ */
+ if (!acquire_refill(conn))
+ return;
+
+ while ((prefill || rds_conn_up(conn)) &&
+ rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_ib_recv_refill_one(conn, recv, gfp);
+ if (ret) {
+ /* Allocation failed: ask for a retry from the workqueue. */
+ must_wake = true;
+ break;
+ }
+
+ rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv,
+ recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+ (long)sg_dma_address(&recv->r_frag->f_sg));
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
+ if (ret) {
+ rds_ib_conn_error(conn, "recv post on "
+ "%pI6c returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ break;
+ }
+
+ posted++;
+
+ /* Bound the time spent in a single call. */
+ if ((posted > 128 && need_resched()) || posted > 8192) {
+ must_wake = true;
+ break;
+ }
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_ib_advertise_credits(conn, posted);
+
+ /* The last slot was taken from the ring but never posted; return it. */
+ if (ret)
+ rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+
+ release_refill(conn);
+
+ /* if we're called from the softirq handler, we'll be GFP_NOWAIT.
+ * in this case the ring being low is going to lead to more interrupts
+ * and we can safely let the softirq code take care of it unless the
+ * ring is completely empty.
+ *
+ * if we're called from krdsd, we'll be GFP_KERNEL. In this case
+ * we might have raced with the softirq code while we had the refill
+ * lock held. Use rds_ib_ring_low() instead of ring_empty to decide
+ * if we should requeue.
+ */
+ if (rds_conn_up(conn) &&
+ (must_wake ||
+ (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
+ rds_ib_ring_empty(&ic->i_recv_ring))) {
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ }
+ if (can_wait)
+ cond_resched();
+}
+
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * we move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/cmpxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true when one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache)
+{
+ unsigned long flags;
+ struct list_head *old, *chpfirst;
+
+ /* The per-cpu list is manipulated with local interrupts disabled. */
+ local_irq_save(flags);
+
+ chpfirst = __this_cpu_read(cache->percpu->first);
+ if (!chpfirst)
+ INIT_LIST_HEAD(new_item);
+ else /* put on front */
+ list_add_tail(new_item, chpfirst);
+
+ __this_cpu_write(cache->percpu->first, new_item);
+ __this_cpu_inc(cache->percpu->count);
+
+ /* Below the batch threshold the item simply stays on this cpu's list. */
+ if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
+ goto end;
+
+ /*
+ * Return our per-cpu first list to the cache's xfer by atomically
+ * grabbing the current xfer list, appending it to our per-cpu list,
+ * and then atomically returning that entire list back to the
+ * cache's xfer list as long as it's still empty.
+ */
+ do {
+ old = xchg(&cache->xfer, NULL);
+ if (old)
+ list_splice_entire_tail(old, chpfirst);
+ old = cmpxchg(&cache->xfer, NULL, chpfirst);
+ } while (old);
+
+
+ /* The batch now lives on xfer; start this cpu's list afresh. */
+ __this_cpu_write(cache->percpu->first, NULL);
+ __this_cpu_write(cache->percpu->count, 0);
+end:
+ local_irq_restore(flags);
+}
+
+/*
+ * Pop one entry off the cache's ready list. The anchor is a bare pointer to
+ * the first element, so a NULL anchor means "nothing cached" and a
+ * list_empty() first element is the last one left. Returns the entry, or
+ * NULL when the ready list has nothing.
+ */
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *entry = cache->ready;
+
+	if (!entry)
+		return NULL;
+
+	if (list_empty(entry)) {
+		/* That was the only element; the ready list is now drained. */
+		cache->ready = NULL;
+		return entry;
+	}
+
+	cache->ready = entry->next;
+	list_del_init(entry);
+
+	return entry;
+}
+
+/*
+ * Copy the payload of an incoming message to the iov_iter, walking the inc's
+ * fragment list one RDS_FRAG_SIZE piece at a time. Copying stops when the
+ * iterator is full or h_len bytes have been copied. Returns the number of
+ * bytes copied, or -EFAULT if copy_page_to_iter() copies less than asked.
+ */
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (iov_iter_count(to) && copied < len) {
+ /* Current fragment exhausted: step to the next one on the list. */
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ /* Limited by iterator space, fragment remainder and message length. */
+ to_copy = min_t(unsigned long, iov_iter_count(to),
+ RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ /* XXX needs + offset for multiple recvs per page */
+ rds_stats_add(s_copy_to_user, to_copy);
+ ret = copy_page_to_iter(sg_page(&frag->f_sg),
+ frag->f_sg.offset + frag_off,
+ to_copy,
+ to);
+ if (ret != to_copy)
+ return -EFAULT;
+
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/*
+ * Fill in the connection's ACK send work request and the single SGE it
+ * points at. ic starts out kzalloc()ed.
+ */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+	/* The one SGE covers the ACK header buffer. */
+	ic->i_ack_sge.addr = ic->i_ack_dma;
+	ic->i_ack_sge.length = sizeof(struct rds_header);
+	ic->i_ack_sge.lkey = ic->i_pd->local_dma_lkey;
+
+	ic->i_ack_wr.sg_list = &ic->i_ack_sge;
+	ic->i_ack_wr.num_sge = 1;
+	ic->i_ack_wr.opcode = IB_WR_SEND;
+	ic->i_ack_wr.wr_id = RDS_IB_ACK_WR_ID;
+	ic->i_ack_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+/*
+ * Spinlock variant (no KERNEL_HAS_ATOMIC64): record @seq as the next sequence
+ * number to acknowledge and, if @ack_required, set IB_ACK_REQUESTED. Both
+ * happen under i_ack_lock with interrupts disabled.
+ */
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ ic->i_ack_next = seq;
+ if (ack_required)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+/*
+ * Spinlock variant: clear IB_ACK_REQUESTED, then return i_ack_next as read
+ * under i_ack_lock.
+ */
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ unsigned long flags;
+ u64 seq;
+
+ /* The flag is cleared before the lock is taken. */
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ seq = ic->i_ack_next;
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+ return seq;
+}
+#else
+/*
+ * atomic64 variant (KERNEL_HAS_ATOMIC64): store @seq in i_ack_next and, if
+ * @ack_required, set IB_ACK_REQUESTED after a barrier so the sequence store
+ * is ordered before the flag.
+ */
+void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
+{
+ atomic64_set(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_atomic();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+/*
+ * atomic64 variant: clear IB_ACK_REQUESTED, issue a barrier, then read and
+ * return i_ack_next.
+ */
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_atomic();
+
+ return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+/*
+ * Build an ACK-only header in ic->i_ack (ack sequence from rds_ib_get_ack(),
+ * credits from @adv_credits) and post the connection's ACK work request.
+ * If the post fails, IB_ACK_IN_FLIGHT is cleared, IB_ACK_REQUESTED is set
+ * again and a connection error is raised.
+ */
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ u64 seq;
+ int ret;
+
+ seq = rds_ib_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+ /* The header is rewritten between the for_cpu and for_device syncs. */
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_ib_stats_inc(s_ib_ack_send_failure);
+
+ rds_ib_conn_error(ic->conn, "sending ack failed\n");
+ } else
+ rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_ib_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ *
+ * Nothing is sent unless IB_ACK_REQUESTED is set, the ACK WR is not already
+ * in flight, and a send credit can be grabbed.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ /* Only one ACK WR may be outstanding; claim it or count the delay. */
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_ib_stats_inc(s_ib_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ /* No credit: release the WR claim taken above. */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_ib_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ *
+ * The ACK WR is marked free again and rds_ib_attempt_ack() is called, which
+ * sends another ACK if one was requested in the meantime.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ *
+ * Returns the ack sequence from rds_ib_get_ack(). The piggyback statistic is
+ * bumped only if IB_ACK_REQUESTED was actually set.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+ return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ *
+ * The copy walks the inc's fragments and the map's pages in lockstep, 64 bits
+ * at a time, and then reports the accumulated changes through
+ * rds_cong_map_updated().
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+ struct rds_ib_incoming *ibinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ __le64 uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ __le64 *src, *dst;
+ unsigned int k;
+
+ /* Copy up to whichever ends first: this fragment or this map page. */
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(sg_page(&frag->f_sg));
+
+ src = addr + frag->f_sg.offset + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, ie bits that
+ * are set in the old map (*dst) but clear in the
+ * incoming one (*src). */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ rds_cong_map_updated(map, le64_to_cpu(uncongested));
+}
+
+/*
+ * Handle one successfully completed receive: validate the header, record the
+ * peer's ACK and credits in @state / the connection, then either drop an
+ * ACK-only frame or attach the fragment to the message being assembled in
+ * ic->i_ibinc. When the last fragment of a message arrives the message is
+ * handed to rds_ib_cong_recv() or rds_recv_incoming().
+ *
+ * @data_len is the completion's byte count, header included.
+ */
+static void rds_ib_process_recv(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, u32 data_len,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_incoming *ibinc = ic->i_ibinc;
+ struct rds_header *ihdr, *hdr;
+ dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+ data_len);
+
+ if (data_len < sizeof(struct rds_header)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI6c didn't include a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ /* From here on data_len counts payload bytes only. */
+ data_len -= sizeof(struct rds_header);
+
+ ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Paired with the sync_for_device at the "done" label. */
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI6c has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ goto done;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+ /* This is an ACK-only packet. The fact that it gets
+ * special treatment here is that historically, ACKs
+ * were rather special beasts.
+ */
+ rds_ib_stats_inc(s_ib_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
+ goto done;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message.. copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (!ibinc) {
+ ibinc = recv->r_ibinc;
+ recv->r_ibinc = NULL;
+ ic->i_ibinc = ibinc;
+
+ hdr = &ibinc->ii_inc.i_hdr;
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
+
+ rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &ibinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence ||
+ hdr->h_len != ihdr->h_len ||
+ hdr->h_sport != ihdr->h_sport ||
+ hdr->h_dport != ihdr->h_dport) {
+ rds_ib_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ goto done;
+ }
+ }
+
+ /* The fragment now belongs to the inc, not to the ring slot. */
+ list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ /* Last fragment of the message: deliver it. */
+ ic->i_recv_data_rem = 0;
+ ic->i_ibinc = NULL;
+
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
+ rds_ib_cong_recv(conn, ibinc);
+ } else {
+ rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
+ &ibinc->ii_inc, GFP_ATOMIC);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&ibinc->ii_inc);
+ }
+done:
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
+}
+
+/*
+ * Handle one receive completion. The completion is matched to the oldest
+ * entry of the receive ring; its fragment is unmapped, the payload is
+ * processed on success (or a connection error raised on failure while the
+ * connection is up or connecting), and the ring slot is freed. A refill is
+ * kicked off from here when the ring runs low.
+ */
+void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
+ struct ib_wc *wc,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_recv_work *recv;
+
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ ib_wc_status_msg(wc->status), wc->byte_len,
+ be32_to_cpu(wc->ex.imm_data));
+
+ rds_ib_stats_inc(s_ib_rx_cq_event);
+ /* The slot is taken from the ring's oldest entry, not from wc->wr_id. */
+ recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
+ DMA_FROM_DEVICE);
+
+ /* Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (wc->status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc->byte_len, state);
+ } else {
+ /* We expect errors as the qp is drained during shutdown */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn))
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
+ &conn->c_laddr, &conn->c_faddr,
+ conn->c_tos, wc->status,
+ ib_wc_status_msg(wc->status),
+ wc->vendor_err);
+ }
+
+ /* rds_ib_process_recv() doesn't always consume the frag, and
+ * we might not have called it at all if the wc didn't indicate
+ * success. We already unmapped the frag's pages, though, and
+ * the following rds_ib_ring_free() call tells the refill path
+ * that it will not find an allocated frag here. Make sure we
+ * keep that promise by freeing a frag that's still on the ring.
+ */
+ if (recv->r_frag) {
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
+ }
+ rds_ib_ring_free(&ic->i_recv_ring, 1);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_ib_ring_empty(&ic->i_recv_ring))
+ rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+ /* Refill without sleeping and without allocation-failure warnings. */
+ if (rds_ib_ring_low(&ic->i_recv_ring)) {
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
+ rds_ib_stats_inc(s_ib_rx_refill_from_cq);
+ }
+}
+
+/*
+ * Receive-side work for a connection path: while the connection is up, try
+ * to send a pending ACK and refill the receive ring with GFP_KERNEL
+ * allocations. Always returns 0.
+ */
+int rds_ib_recv_path(struct rds_conn_path *cp)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p\n", conn);
+
+	if (!rds_conn_up(conn))
+		return 0;
+
+	rds_ib_attempt_ack(ic);
+	rds_ib_recv_refill(conn, 0, GFP_KERNEL);
+	rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+
+	return 0;
+}
+
+/*
+ * Set the default receive allocation ceiling and create the two slab caches
+ * used by the receive path. Returns 0 on success or -ENOMEM if either cache
+ * cannot be created; on failure no cache is left behind.
+ */
+int rds_ib_recv_init(void)
+{
+	struct sysinfo si;
+
+	/*
+	 * Default ceiling: one third of si.totalram, scaled by
+	 * PAGE_SIZE / RDS_FRAG_SIZE.
+	 */
+	si_meminfo(&si);
+	rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+	rds_ib_incoming_slab =
+		kmem_cache_create_usercopy("rds_ib_incoming",
+					   sizeof(struct rds_ib_incoming),
+					   0, SLAB_HWCACHE_ALIGN,
+					   offsetof(struct rds_ib_incoming,
+						    ii_inc.i_usercopy),
+					   sizeof(struct rds_inc_usercopy),
+					   NULL);
+	if (!rds_ib_incoming_slab)
+		return -ENOMEM;
+
+	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+					     sizeof(struct rds_page_frag),
+					     0, SLAB_HWCACHE_ALIGN, NULL);
+	if (rds_ib_frag_slab)
+		return 0;
+
+	/* Second cache failed: undo the first one. */
+	kmem_cache_destroy(rds_ib_incoming_slab);
+	rds_ib_incoming_slab = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the receive-path slab caches. Warns if rds_ib_allocation is still
+ * non-zero at this point.
+ */
+void rds_ib_recv_exit(void)
+{
+ WARN_ON(atomic_read(&rds_ib_allocation));
+
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000..006b2e441
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * Wait queue that rds_ib_ring_free() wakes when a ring drains to empty.
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+/* Zero the whole ring structure and set its capacity to @nr entries. */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+/*
+ * Number of ring entries currently allocated: the difference between the
+ * free-running allocation and free counters. BUGs if that exceeds w_nr.
+ */
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
+
+/* Change the ring capacity to @nr. BUGs if any entry is still allocated. */
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_ib_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+/* Non-zero when no ring entry is currently allocated. */
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+	return !__rds_ib_ring_used(ring);
+}
+
+/*
+ * Allocate up to @val consecutive ring entries. On success *@pos receives
+ * the index of the first entry and the number actually allocated (which may
+ * be fewer than @val) is returned. Returns 0, leaving *@pos untouched, when
+ * @val is 0 or the ring is full.
+ */
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+	u32 avail = ring->w_nr - __rds_ib_ring_used(ring);
+	u32 taken;
+
+	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+		 ring->w_alloc_ptr, avail);
+
+	if (!val || !avail)
+		return 0;
+
+	taken = min(val, avail);
+	*pos = ring->w_alloc_ptr;
+
+	ring->w_alloc_ptr = (ring->w_alloc_ptr + taken) % ring->w_nr;
+	ring->w_alloc_ctr += taken;
+
+	return taken;
+}
+
+/*
+ * Release the @val oldest ring entries. If that leaves the ring empty and
+ * somebody is sleeping on rds_ib_ring_empty_wait, wake them.
+ */
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+	atomic_add(val, &ring->w_free_ctr);
+
+	if (!__rds_ib_ring_empty(ring))
+		return;
+
+	if (waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+}
+
+/*
+ * Undo the most recent allocation of @val entries: step the allocation
+ * pointer back by @val slots (modulo the ring size) and lower the allocation
+ * counter accordingly.
+ */
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+	u32 back = val % ring->w_nr;
+
+	/*
+	 * w_alloc_ptr is unsigned, so "ptr - val" would wrap to a huge value
+	 * whose remainder is only the right slot when w_nr divides 2^32.
+	 * Wrap around the ring explicitly instead; w_alloc_ptr is always
+	 * kept below w_nr, so neither branch can overflow.
+	 */
+	if (ring->w_alloc_ptr >= back)
+		ring->w_alloc_ptr -= back;
+	else
+		ring->w_alloc_ptr += ring->w_nr - back;
+
+	/* The counter is free-running; wrap-around here is intended. */
+	ring->w_alloc_ctr -= val;
+}
+
+/* Exported wrapper: non-zero when no ring entry is currently allocated. */
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_empty(ring);
+}
+
+/* Non-zero when at most half of the ring's entries are allocated. */
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+	u32 in_use = __rds_ib_ring_used(ring);
+
+	return in_use <= ring->w_nr / 2;
+}
+
+/*
+ * Returns the index of the oldest allocated ring entry (w_free_ptr). This
+ * will be the next one freed. This can't be called if there are none
+ * allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * Returns the number of completed work requests: the count of ring slots
+ * from @oldest up to and including @wr_id, wrapping past the end of the
+ * ring when @wr_id lies before @oldest.
+ */
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+	u32 done;
+
+	if (wr_id >= oldest)
+		done = wr_id - oldest + 1;
+	else
+		done = ring->w_nr - oldest + wr_id + 1;
+
+	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, done,
+		 wr_id, oldest);
+
+	return done;
+}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000..4190b90ff
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,1017 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/ratelimit.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "ib.h"
+#include "ib_mr.h"
+
+/*
+ * Translate an IB work-completion status into an RDS notification status
+ * and hand it to the core completion handler @complete.
+ *
+ * IB_WC_WR_FLUSH_ERR completions are dropped: @complete is not called for
+ * them.  IB_WC_SUCCESS maps to RDS_RDMA_SUCCESS, IB_WC_REM_ACCESS_ERR to
+ * RDS_RDMA_REMOTE_ERROR, and anything else to RDS_RDMA_OTHER_ERROR.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+				 int wc_status,
+				 void (*complete)(struct rds_message *rm, int status))
+{
+	int notify_status = RDS_RDMA_OTHER_ERROR;
+
+	if (wc_status == IB_WC_WR_FLUSH_ERR)
+		return;
+
+	if (wc_status == IB_WC_SUCCESS)
+		notify_status = RDS_RDMA_SUCCESS;
+	else if (wc_status == IB_WC_REM_ACCESS_ERR)
+		notify_status = RDS_RDMA_REMOTE_ERROR;
+
+	complete(rm, notify_status);
+}
+
+/*
+ * Undo the DMA_TO_DEVICE scatterlist mapping of a data op.  An op without
+ * scatterlist entries (op_nents == 0) has nothing to unmap.  @wc_status is
+ * not used here.
+ */
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	if (!op->op_nents)
+		return;
+
+	ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents,
+			DMA_TO_DEVICE);
+}
+
+/*
+ * Finish an RDMA op on send completion: unmap its scatterlist if it is
+ * still mapped, report @wc_status through rds_rdma_send_complete(), and
+ * add op_bytes to the send (write) or receive (read) RDMA byte counter.
+ */
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+				   struct rm_rdma_op *op,
+				   int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, rdma);
+
+	if (op->op_mapped) {
+		enum dma_data_direction dir;
+
+		dir = op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents,
+				dir);
+		op->op_mapped = 0;
+	}
+
+	/* If the user asked for a completion notification on this
+	 * message, we can implement three different semantics:
+	 *  1.	Notify when we received the ACK on the RDS message
+	 *	that was queued with the RDMA. This provides reliable
+	 *	notification of RDMA status at the expense of a one-way
+	 *	packet delay.
+	 *  2.	Notify when the IB stack gives us the completion event for
+	 *	the RDMA operation.
+	 *  3.	Notify when the IB stack gives us the completion event for
+	 *	the accompanying RDS messages.
+	 * Here, we implement approach #3. To implement approach #2,
+	 * we would need to take an event for the rdma WR. To implement #1,
+	 * don't call rds_rdma_send_complete at all, and fall back to the
+	 * notify handling in the ACK processing code.
+	 *
+	 * Note: There's no need to explicitly sync any RDMA buffers using
+	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+	 * operation itself unmapped the RDMA buffers, which takes care
+	 * of synching.
+	 */
+	rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete);
+
+	if (op->op_write)
+		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+	else
+		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
+}
+
+/*
+ * Finish an atomic op on send completion: unmap the single-entry result
+ * buffer if it is still mapped, report @wc_status through
+ * rds_atomic_send_complete(), and bump the compare-and-swap or
+ * fetch-and-add statistic according to op_type.
+ */
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+				     struct rm_atomic_op *op,
+				     int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, atomic);
+
+	/* unmap atomic recvbuf */
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+				DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+
+	rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete);
+
+	if (op->op_type != RDS_ATOMIC_TYPE_CSWP)
+		rds_ib_stats_inc(s_ib_atomic_fadd);
+	else
+		rds_ib_stats_inc(s_ib_atomic_cswp);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * The kind of op is chosen from the opcode stored in the work request
+ * itself (in the error case, wc.opcode sometimes contains garbage).
+ *
+ * The masked atomic opcodes are accepted next to the plain ones because
+ * rds_ib_xmit_atomic() posts IB_WR_MASKED_ATOMIC_CMP_AND_SWP and
+ * IB_WR_MASKED_ATOMIC_FETCH_AND_ADD.  Without them an atomic completion
+ * fell into the "unexpected opcode" branch: the op was neither unmapped
+ * nor completed, and NULL was returned for an entry whose s_op was still
+ * set.  (This assumes s_wr and s_atomic_wr.wr share storage, as the way
+ * rds_ib_xmit_atomic() mixes the two suggests - confirm against
+ * struct rds_ib_send_work.)
+ *
+ * After dispatch the opcode is overwritten with 0xdead, the value
+ * rds_ib_send_clear_ring() tests for to skip entries.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.  NULL is returned when s_op is not set
+ * or the opcode is not recognised.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+						struct rds_ib_send_work *send,
+						int wc_status)
+{
+	struct rds_message *rm = NULL;
+
+	/* In the error case, wc.opcode sometimes contains garbage */
+	switch (send->s_wr.opcode) {
+	case IB_WR_SEND:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, data);
+			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, rdma);
+			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, atomic);
+			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+		}
+		break;
+	default:
+		printk_ratelimited(KERN_NOTICE
+			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+			       __func__, send->s_wr.opcode);
+		break;
+	}
+
+	/* mark the entry as already torn down */
+	send->s_wr.opcode = 0xdead;
+
+	return rm;
+}
+
+/*
+ * Put every entry of the send ring into its initial state: no op attached,
+ * wr_id equal to the entry's ring index, the work request pointing at the
+ * entry's own sge array, and sge 0 describing the header buffer with the
+ * same index.  Both sges use the protection domain's local DMA lkey.
+ */
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
+		struct rds_ib_send_work *send = &ic->i_sends[i];
+
+		send->s_op = NULL;
+
+		send->s_wr.wr_id = i;
+		send->s_wr.sg_list = send->s_sge;
+		send->s_wr.ex.imm_data = 0;
+
+		/* sge 0: the struct rds_header slot that belongs to entry i */
+		send->s_sge[0].addr = ic->i_send_hdrs_dma[i];
+		send->s_sge[0].length = sizeof(struct rds_header);
+		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+
+		send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
+	}
+}
+
+/*
+ * Walk the whole send ring and unmap, with status IB_WC_WR_FLUSH_ERR, every
+ * entry that still has an op attached and whose opcode has not already been
+ * replaced by the 0xdead marker.
+ */
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
+		struct rds_ib_send_work *send = &ic->i_sends[i];
+
+		if (!send->s_op)
+			continue;
+		if (send->s_wr.opcode == 0xdead)
+			continue;
+
+		rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+	}
+}
+
+/*
+ * Subtract @nr from the connection's count of signaled sends and wake any
+ * waiter on rds_ib_ring_empty_wait when the count reaches zero.  A negative
+ * count is treated as a fatal bug.
+ *
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+	int remaining = atomic_sub_return(nr, &ic->i_signaled_sends);
+
+	if (remaining == 0 && waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+
+	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ *
+ * Handles one send-side work completion @wc for connection @ic. A
+ * completion whose wr_id is RDS_IB_ACK_WR_ID is passed to
+ * rds_ib_ack_send_complete() and nothing else is done. Otherwise every
+ * ring entry from the oldest allocated one through wc->wr_id is unmapped
+ * with wc->status, the message reference reachable through s_op is put,
+ * the entries are freed, the signaled-send count is reduced, and the send
+ * worker is queued if RDS_LL_SEND_FULL was set or bit 0 of c_map_queued
+ * is set. A non-success status on a connection that is up ends in
+ * rds_ib_conn_error().
+ */
+void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
+{
+ struct rds_message *rm = NULL;
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i = 0;
+ int nr_sig = 0;
+
+
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ ib_wc_status_msg(wc->status), wc->byte_len,
+ be32_to_cpu(wc->ex.imm_data));
+ rds_ib_stats_inc(s_ib_tx_cq_event);
+
+ if (wc->wr_id == RDS_IB_ACK_WR_ID) { /* ACK send: handled apart from the send ring */
+ if (time_after(jiffies, ic->i_ack_queued + HZ / 2)) /* completion came more than HZ/2 jiffies after i_ack_queued */
+ rds_ib_stats_inc(s_ib_tx_stalled);
+ rds_ib_ack_send_complete(ic);
+ return;
+ }
+
+ oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest); /* entries oldest..wr_id, inclusive */
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED) /* counted so i_signaled_sends can be reduced below */
+ nr_sig++;
+
+ rm = rds_ib_send_unmap_op(ic, send, wc->status);
+
+ if (time_after(jiffies, send->s_queued + HZ / 2))
+ rds_ib_stats_inc(s_ib_tx_stalled);
+
+ if (send->s_op) { /* NOTE(review): rm is NULL here if unmap_op hit its "unexpected opcode" branch - confirm that cannot happen with s_op set */
+ if (send->s_op == rm->m_final_op) {
+ /* If anyone waited for this message to get
+ * flushed out, wake them up now
+ */
+ rds_message_unmapped(rm);
+ }
+ rds_message_put(rm);
+ send->s_op = NULL;
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_ib_ring_free(&ic->i_send_ring, completed);
+ rds_ib_sub_signaled(ic, nr_sig);
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0); /* ring space was just freed: let the send worker run */
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
+ &conn->c_laddr, &conn->c_faddr,
+ conn->c_tos, wc->status,
+ ib_wc_status_msg(wc->status), wc->vendor_err);
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overrunning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ *
+ * @wanted is the number of send credits requested; the return value
+ * is the number actually taken, which is all of @wanted when flow
+ * control (i_flowctl) is off. When fewer than @wanted are available,
+ * RDS_LL_SEND_FULL is set on the connection. *@adv_credits receives
+ * the number of posted credits removed from the counter, at most
+ * @max_posted; they are removed only if a send credit was granted or
+ * @need_posted is non-zero, and *@adv_credits is 0 otherwise.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl) /* no flow control: grant everything, advertise nothing */
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits); /* snapshot of both counters packed in one atomic */
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("wanted=%u credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants
+ * the posted regardless of whether any send credits are
+ * available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, max_posted);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) /* i_credits changed since the snapshot: redo from a fresh read */
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
+
+/*
+ * Add @credits send credits to the connection's packed credit counter.
+ * A zero @credits is a no-op.  If the connection was marked
+ * RDS_LL_SEND_FULL the flag is cleared and the send worker is queued.
+ * The credit-update statistic is bumped for every non-zero call.
+ */
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+	struct rds_ib_connection *ic;
+
+	if (!credits)
+		return;
+
+	ic = conn->c_transport_data;
+
+	rdsdebug("credits=%u current=%u%s\n",
+			credits,
+			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+
+	/* the sender may have stalled waiting for credits: kick it */
+	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+	rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+/*
+ * Add @posted to the posted-credits half of the connection's packed credit
+ * counter and, once at least 16 posted credits have accumulated, set
+ * IB_ACK_REQUESTED in i_ack_flags.  A zero @posted is a no-op.
+ */
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+	struct rds_ib_connection *ic;
+
+	if (!posted)
+		return;
+
+	ic = conn->c_transport_data;
+	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+	/* Decide whether to send an update to the peer now.
+	 * If we would send a credit update for every single buffer we
+	 * post, we would end up with an ACK storm (ACK arrives,
+	 * consumes buffer, we refill the ring, send ACK to remote
+	 * advertising the newly posted buffer... ad inf)
+	 *
+	 * Performance pretty much depends on how often we send
+	 * credit updates - too frequent updates mean lots of ACKs.
+	 * Too infrequent updates, and the peer will run out of
+	 * credits and has to throttle.
+	 * For the time being, 16 seems to be a good compromise.
+	 */
+	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) < 16)
+		return;
+
+	set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+/*
+ * Decide whether @send must generate a completion.
+ *
+ * The per-connection countdown i_unsignaled_wrs is decremented on every
+ * call.  When it was already 0, or when @notify is set, the countdown is
+ * reloaded from rds_ib_sysctl_max_unsig_wrs, IB_SEND_SIGNALED is OR-ed into
+ * the work request's send_flags, and 1 is returned.  Otherwise the work
+ * request is left alone and 0 is returned.
+ */
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					     struct rds_ib_send_work *send,
+					     bool notify)
+{
+	/*
+	 * We want to delay signaling completions just enough to get
+	 * the batching benefits but not so much that we create dead time
+	 * on the wire.
+	 */
+	bool quota_spent = (ic->i_unsignaled_wrs-- == 0);
+
+	if (!quota_spent && !notify)
+		return 0;
+
+	ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+	send->s_wr.send_flags |= IB_SEND_SIGNALED;
+	return 1;
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ *
+ * @hdr_off must be 0 or sizeof(struct rds_header); the header size is
+ * added to the returned byte count only when it is 0. @sg is read only
+ * in the loopback congestion-update shortcut, and @off is only checked
+ * for being a multiple of RDS_FRAG_SIZE.
+ *
+ * Returns the number of bytes queued by this call, -ENOMEM when no ring
+ * entry or send credit could be obtained or the DMA mapping failed, or
+ * the error returned by ib_post_send(). For a congestion bitmap on a
+ * loopback connection nothing is posted and the return value is the
+ * header size plus max(RDS_CONG_MAP_BYTES, length of op_sg[@sg]).
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ const struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc = 0;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int bytes_sent = 0;
+ int ret;
+ int flow_controlled = 0;
+ int nr_sig = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* Do not send cong updates to IB loopback */
+ if (conn->c_loopback
+ && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+ scat = &rm->data.op_sg[sg];
+ ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
+ return sizeof(struct rds_header) + ret;
+ }
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1; /* a zero-length message still needs one WR for its header */
+ else
+ i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); /* may grant fewer than i entries */
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (ic->i_flowctl) {
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) { /* fewer credits than ring entries: give the surplus entries back */
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled = 1;
+ }
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (!ic->i_data_op) {
+ if (rm->data.op_nents) {
+ rm->data.op_count = ib_dma_map_sg(dev,
+ rm->data.op_sg,
+ rm->data.op_nents,
+ DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+ if (rm->data.op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out; /* NOTE(review): credits taken by rds_ib_send_grab_credits() above
+ * are not handed back on this path, and a non-zero adv_credits
+ * would trip BUG_ON(adv_credits) at out - confirm.
+ */
+ }
+ } else {
+ rm->data.op_count = 0;
+ }
+
+ rds_message_addref(rm); /* presumably the reference the completion handler puts via s_op - confirm */
+ rm->data.op_dmasg = 0;
+ rm->data.op_dmaoff = 0;
+ ic->i_data_op = &rm->data; /* marks this message as in progress for later calls */
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->rdma.op_active) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_ib_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ if (ic->i_flowctl) {
+ rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); /* wanted == 0, need_posted == 1: collect posted credits only */
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ }
+ }
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->rdma.op_active && rm->rdma.op_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /* Each frag gets a header. Msgs may be 0 bytes */
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &ic->i_data_op->op_sg[rm->data.op_dmasg]; /* resume at the scatterlist entry where the previous call stopped */
+ i = 0;
+ do {
+ unsigned int len = 0;
+
+ /* Set up the header */
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 1;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
+
+ send->s_sge[0].length = sizeof(struct rds_header);
+ send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
+ memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
+ sizeof(struct rds_header));
+
+
+ /* Set up the data, if present */
+ if (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]) {
+ len = min(RDS_FRAG_SIZE,
+ sg_dma_len(scat) - rm->data.op_dmaoff);
+ send->s_wr.num_sge = 2;
+
+ send->s_sge[1].addr = sg_dma_address(scat);
+ send->s_sge[1].addr += rm->data.op_dmaoff;
+ send->s_sge[1].length = len;
+ send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
+
+ bytes_sent += len;
+ rm->data.op_dmaoff += len;
+ if (rm->data.op_dmaoff == sg_dma_len(scat)) { /* this scatterlist entry is fully consumed: move on */
+ scat++;
+ rm->data.op_dmasg++;
+ rm->data.op_dmaoff = 0;
+ }
+ }
+
+ rds_ib_set_wr_signal_state(ic, send, false);
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) {
+ rds_ib_set_wr_signal_state(ic, send, true);
+ send->s_wr.send_flags |= IB_SEND_SOLICITED;
+ }
+
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ if (ic->i_flowctl && adv_credits) { /* adv_credits is zeroed below, so only one fragment carries the update */
+ struct rds_header *hdr = ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_ib_stats_inc(s_ib_tx_credit_updates);
+ }
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr; /* chain the WRs so one ib_post_send() posts them all */
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ send = &ic->i_sends[pos];
+ i++;
+
+ } while (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]);
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ bytes_sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ prev->s_op = ic->i_data_op;
+ prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+ if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED))
+ nr_sig += rds_ib_set_wr_signal_state(ic, prev, true);
+ ic->i_data_op = NULL; /* the next call starts a new message */
+ }
+
+ /* Put back wrs & credits we didn't use */
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_ib_send_add_credits(conn, credit_alloc - i);
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ if (prev->s_op) { /* take ownership of the message back from the ring entry */
+ ic->i_data_op = prev->s_op;
+ prev->s_op = NULL;
+ }
+
+ rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+ goto out;
+ }
+
+ ret = bytes_sent;
+out:
+ BUG_ON(adv_credits); /* credits collected for advertising must have been written into a header */
+ return ret;
+}
+
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ *
+ * One send ring entry is allocated, filled in as a masked
+ * compare-and-swap or masked fetch-and-add work request according to
+ * op->op_type, and posted with ib_post_send().
+ *
+ * Returns 0 when the work request was posted, -ENOMEM when no ring
+ * entry was available or the result buffer could not be mapped, or the
+ * error returned by ib_post_send().
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ const struct ib_send_wr *failed_wr;
+ u32 pos;
+ u32 work_alloc;
+ int ret;
+ int nr_sig = 0;
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+ if (work_alloc != 1) {
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* address of send request in ring */
+ send = &ic->i_sends[pos];
+ send->s_queued = jiffies;
+
+ if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
+ send->s_atomic_wr.swap = op->op_m_cswp.swap;
+ send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
+ send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
+ } else { /* FADD */
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ send->s_atomic_wr.compare_add = op->op_m_fadd.add;
+ send->s_atomic_wr.swap = 0;
+ send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
+ send->s_atomic_wr.swap_mask = 0;
+ }
+ send->s_wr.send_flags = 0; /* NOTE(review): written via s_wr while the WR posted is s_atomic_wr.wr - presumably both name the same storage; confirm */
+ nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+ send->s_atomic_wr.wr.num_sge = 1;
+ send->s_atomic_wr.wr.next = NULL;
+ send->s_atomic_wr.remote_addr = op->op_remote_addr;
+ send->s_atomic_wr.rkey = op->op_rkey;
+ send->s_op = op;
+ rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+ /* map 8 byte retval buffer to the device */
+ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+ if (ret != 1) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out; /* NOTE(review): s_op and the message reference taken above are left in place here - confirm this is intended */
+ }
+
+ /* Convert our struct scatterlist to struct ib_sge */
+ send->s_sge[0].addr = sg_dma_address(op->op_sg);
+ send->s_sge[0].length = sg_dma_len(op->op_sg);
+ send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+
+ rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+ send->s_sge[0].addr, send->s_sge[0].length);
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ failed_wr = &send->s_atomic_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
+ rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+ send, &send->s_atomic_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &send->s_atomic_wr.wr)) { /* NOTE(review): looks redundant - the BUG_ON above already tested this condition */
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
+ }
+
+out:
+ return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{ /*
+ * Post the RDMA READ or WRITE described by @op as a chain of work
+ * requests, each carrying at most max_sge scatter/gather entries.
+ * Enough ring entries for the whole op must be available; partial
+ * posting is not attempted.
+ *
+ * Returns 0 when the chain was posted, -ENOMEM when the scatterlist
+ * could not be mapped or the ring is too full, or the error returned
+ * by ib_post_send().
+ */
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ const struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->op_remote_addr;
+ u32 max_sge = ic->rds_ibdev->max_sge;
+ u32 pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent; /* running byte total, used only in the debug output */
+ int ret;
+ int num_sge;
+ int nr_sig = 0;
+ u64 odp_addr = op->op_odp_addr;
+ u32 odp_lkey = 0;
+
+ /* map the op the first time we see it */
+ if (!op->op_odp_mr) {
+ if (!op->op_mapped) {
+ op->op_count =
+ ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
+ op->op_nents,
+ (op->op_write) ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op,
+ op->op_count);
+ if (op->op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ op->op_mapped = 1;
+ }
+ } else { /* op has an ODP MR: no DMA mapping here, sges use odp_addr and the MR's lkey */
+ op->op_count = op->op_nents;
+ odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = DIV_ROUND_UP(op->op_count, max_sge);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &op->op_sg[0];
+ sent = 0;
+ num_sge = op->op_count; /* sges still to be distributed over the WRs */
+
+ for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (!op->op_notify) /* inside this branch op_notify is zero, so signaling follows only the unsignaled-WR countdown */
+ nr_sig += rds_ib_set_wr_signal_state(ic, send,
+ op->op_notify);
+
+ send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_rdma_wr.remote_addr = remote_addr;
+ send->s_rdma_wr.rkey = op->op_rkey;
+
+ if (num_sge > max_sge) {
+ send->s_rdma_wr.wr.num_sge = max_sge;
+ num_sge -= max_sge;
+ } else {
+ send->s_rdma_wr.wr.num_sge = num_sge;
+ }
+
+ send->s_rdma_wr.wr.next = NULL;
+
+ if (prev)
+ prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;
+
+ for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
+ scat != &op->op_sg[op->op_count]; j++) {
+ len = sg_dma_len(scat);
+ if (!op->op_odp_mr) {
+ send->s_sge[j].addr = sg_dma_address(scat);
+ send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
+ } else {
+ send->s_sge[j].addr = odp_addr;
+ send->s_sge[j].lkey = odp_lkey;
+ }
+ send->s_sge[j].length = len;
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+ remote_addr += len; /* the next WR starts where this entry ends on the remote side */
+ odp_addr += len;
+ scat++;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_rdma_wr.wr,
+ send->s_rdma_wr.wr.num_sge,
+ send->s_rdma_wr.wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) /* wrap to the start of the ring */
+ send = ic->i_sends;
+ }
+
+ /* give a reference to the last op */
+ if (scat == &op->op_sg[op->op_count]) {
+ prev->s_op = op;
+ rds_message_addref(container_of(op, struct rds_message, rdma));
+ }
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ failed_wr = &first->s_rdma_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_rdma_wr.wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ goto out; /* NOTE(review): prev->s_op and the message reference taken above stay in place here - confirm this is intended */
+ }
+
+ if (unlikely(failed_wr != &first->s_rdma_wr.wr)) { /* NOTE(review): looks redundant - the BUG_ON above already tested this condition */
+ printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
+ }
+
+
+out:
+ return ret;
+}
+
+/*
+ * Called for connection path @cp; looks up the IB connection state of the
+ * path's connection and makes one more attempt to send an ACK.
+ */
+void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
+{
+	struct rds_ib_connection *ic = cp->cp_conn->c_transport_data;
+
+	/* We may have a pending ACK or window update we were unable
+	 * to send previously (due to flow control). Try again. */
+	rds_ib_attempt_ack(ic);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000..ac46d8961
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/* Per-CPU IB counters; rds_ib_stats_info_copy() sums them over online CPUs. */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+
+/*
+ * Names reported for the IB counters.  rds_ib_stats_info_copy() passes
+ * this table to rds_stats_info_copy() next to the summed
+ * struct rds_ib_statistics viewed as a flat uint64_t array, so the order
+ * here presumably has to follow the field order of that struct --
+ * verify against ib.h before adding or moving an entry.
+ */
+static const char *const rds_ib_stat_names[] = {
+ "ib_connect_raced",
+ "ib_listen_closed_stale",
+ "ib_evt_handler_call",
+ "ib_tasklet_call",
+ "ib_tx_cq_event",
+ "ib_tx_ring_full",
+ "ib_tx_throttle",
+ "ib_tx_sg_mapping_failure",
+ "ib_tx_stalled",
+ "ib_tx_credit_updates",
+ "ib_rx_cq_event",
+ "ib_rx_ring_empty",
+ "ib_rx_refill_from_cq",
+ "ib_rx_refill_from_thread",
+ "ib_rx_alloc_limit",
+ "ib_rx_total_frags",
+ "ib_rx_total_incs",
+ "ib_rx_credit_updates",
+ "ib_ack_sent",
+ "ib_ack_send_failure",
+ "ib_ack_send_delayed",
+ "ib_ack_send_piggybacked",
+ "ib_ack_received",
+ "ib_rdma_mr_8k_alloc",
+ "ib_rdma_mr_8k_free",
+ "ib_rdma_mr_8k_used",
+ "ib_rdma_mr_8k_pool_flush",
+ "ib_rdma_mr_8k_pool_wait",
+ "ib_rdma_mr_8k_pool_depleted",
+ "ib_rdma_mr_1m_alloc",
+ "ib_rdma_mr_1m_free",
+ "ib_rdma_mr_1m_used",
+ "ib_rdma_mr_1m_pool_flush",
+ "ib_rdma_mr_1m_pool_wait",
+ "ib_rdma_mr_1m_pool_depleted",
+ "ib_rdma_mr_8k_reused",
+ "ib_rdma_mr_1m_reused",
+ "ib_atomic_cswp",
+ "ib_atomic_fadd",
+};
+
+/*
+ * Sum the per-CPU IB counters over all online CPUs and copy them, with
+ * their names, into @iter.  Nothing is summed or copied when @avail is
+ * smaller than the number of counter names.  The number of counter names
+ * is returned in every case.
+ */
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail)
+{
+	const size_t nr_words = sizeof(struct rds_ib_statistics) / sizeof(uint64_t);
+	struct rds_ib_statistics stats = {0, };
+	uint64_t *total = (uint64_t *)&stats;
+	int cpu;
+
+	if (avail >= ARRAY_SIZE(rds_ib_stat_names)) {
+		for_each_online_cpu(cpu) {
+			uint64_t *percpu = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+			size_t idx;
+
+			/* treat both structs as flat arrays of 64-bit counters */
+			for (idx = 0; idx < nr_words; idx++)
+				total[idx] += percpu[idx];
+		}
+
+		rds_stats_info_copy(iter, total, rds_ib_stat_names,
+				    ARRAY_SIZE(rds_ib_stat_names));
+	}
+
+	return ARRAY_SIZE(rds_ib_stat_names);
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000..e4e41b3af
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+/* Handle from register_net_sysctl(); NULL until rds_ib_sysctl_init() succeeds. */
+static struct ctl_table_header *rds_ib_sysctl_hdr;
+
+/* Tunables exposed through rds_ib_sysctl_table below. */
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+/* default: the number of RDS_FRAG_SIZE units in 128 MiB */
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+/* lower/upper bounds (extra1/extra2) for max_send_wr and max_recv_wr */
+static unsigned long rds_ib_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+/* lower/upper bounds (extra1/extra2) for max_unsignaled_wr */
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
+
+/*
+ * sysctl entries registered under "net/rds/ib" by rds_ib_sysctl_init().
+ * All entries are mode 0644.  The table ends with an empty sentinel entry.
+ */
+static struct ctl_table rds_ib_sysctl_table[] = {
+ {
+ .procname = "max_send_wr",
+ .data = &rds_ib_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_recv_wr",
+ .data = &rds_ib_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_unsignaled_wr",
+ .data = &rds_ib_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
+ },
+ {
+ /* no extra1/extra2: this entry sets no min/max bounds */
+ .procname = "max_recv_allocation",
+ .data = &rds_ib_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ /*
+ * NOTE(review): the backing variable is unsigned int but the
+ * handler is proc_dointvec -- confirm this is intentional.
+ */
+ .procname = "flow_control",
+ .data = &rds_ib_sysctl_flow_control,
+ .maxlen = sizeof(rds_ib_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/* Unregister the IB sysctl table if rds_ib_sysctl_init() registered one. */
+void rds_ib_sysctl_exit(void)
+{
+	if (!rds_ib_sysctl_hdr)
+		return;
+
+	unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+/*
+ * Register rds_ib_sysctl_table under "net/rds/ib" in init_net.
+ * Returns 0 on success or -ENOMEM if registration fails.
+ */
+int rds_ib_sysctl_init(void)
+{
+	rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib",
+						rds_ib_sysctl_table);
+
+	return rds_ib_sysctl_hdr ? 0 : -ENOMEM;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000..b6b46a821
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time. The structs are only copied if the user-specified
+ * buffer is big enough. The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+/*
+ * Write cursor into the destination pages, advanced by rds_info_copy().
+ */
+struct rds_info_iterator {
+ struct page **pages; /* page currently being filled; stepped when it is full */
+ void *addr; /* kmap_atomic() mapping of *pages, or NULL when unmapped */
+ unsigned long offset; /* byte offset of the next write within *pages */
+};
+
+/* Held by the register/deregister functions while they update rds_info_funcs[]. */
+static DEFINE_SPINLOCK(rds_info_lock);
+/* One slot per option in RDS_INFO_FIRST..RDS_INFO_LAST, indexed by optname - RDS_INFO_FIRST. */
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+/*
+ * Install @func as the info source for @optname.  @optname must lie in
+ * RDS_INFO_FIRST..RDS_INFO_LAST and its slot must still be empty; both
+ * conditions are enforced with BUG_ON().
+ */
+void rds_info_register_func(int optname, rds_info_func func)
+{
+	int slot;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+	slot = optname - RDS_INFO_FIRST;
+
+	spin_lock(&rds_info_lock);
+	BUG_ON(rds_info_funcs[slot]);
+	rds_info_funcs[slot] = func;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
+
+/*
+ * Remove the info source for @optname.  @optname must lie in
+ * RDS_INFO_FIRST..RDS_INFO_LAST and the slot must currently hold @func;
+ * both conditions are enforced with BUG_ON().
+ */
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+	int slot;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+	slot = optname - RDS_INFO_FIRST;
+
+	spin_lock(&rds_info_lock);
+	BUG_ON(rds_info_funcs[slot] != func);
+	rds_info_funcs[slot] = NULL;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
+
+/*
+ * The atomic kmap is expensive, so it is normally kept across several
+ * rds_info_copy() calls.  Drop it with this helper before doing anything
+ * that may block while the mapping is held, and when the iterator is
+ * torn down.  Does nothing if no page is currently mapped.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+	if (!iter->addr)
+		return;
+
+	kunmap_atomic(iter->addr);
+	iter->addr = NULL;
+}
+
+/*
+ * Copy @bytes bytes from @data to the iterator's current position,
+ * mapping each destination page on demand and moving on to the next
+ * page whenever the current one has been filled.
+ *
+ * get_user_pages() called flush_dcache_page() on the pages for us.
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+		   unsigned long bytes)
+{
+	unsigned long chunk;
+
+	for (; bytes; bytes -= chunk, data += chunk) {
+		if (!iter->addr)
+			iter->addr = kmap_atomic(*iter->pages);
+
+		/* never write past the end of the current page */
+		chunk = min(bytes, PAGE_SIZE - iter->offset);
+
+		rdsdebug("page %p addr %p offset %lu this %lu data %p "
+			 "bytes %lu\n", *iter->pages, iter->addr,
+			 iter->offset, chunk, data, bytes);
+
+		memcpy(iter->addr + iter->offset, data, chunk);
+		iter->offset += chunk;
+
+		if (iter->offset != PAGE_SIZE)
+			continue;
+
+		/* page is full: unmap it and step to the next one */
+		kunmap_atomic(iter->addr);
+		iter->addr = NULL;
+		iter->offset = 0;
+		iter->pages++;
+	}
+}
+EXPORT_SYMBOL_GPL(rds_info_copy);
+
+/*
+ * Pin the user buffer, run the info function registered for @optname
+ * against it, and report the snapshot size back through @optlen.
+ *
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace. @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ *
+ * Other errors: -EFAULT if @optlen cannot be read or written, -EINVAL if
+ * the length fails the range/wrap check, -ENOMEM if the page array cannot
+ * be allocated, -EAGAIN if not all pages could be pinned, and -ENOPROTOOPT
+ * if no function is registered for @optname.  Apart from the write-back
+ * -EFAULT, these paths jump to out: before @optlen is written.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct rds_info_iterator iter;
+ struct rds_info_lengths lens;
+ unsigned long nr_pages = 0;
+ unsigned long start;
+ rds_info_func func;
+ struct page **pages = NULL;
+ int ret;
+ int len;
+ int total;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* check for all kinds of wrapping and the like */
+ start = (unsigned long)optval;
+ if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* a 0 len call is just trying to probe its length */
+ if (len == 0)
+ goto call_func;
+
+ /* number of pages touched by [start, start + len) */
+ nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+ >> PAGE_SHIFT;
+
+ pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
+ if (ret != nr_pages) {
+ /* keep only the count actually pinned so out: unpins exactly those */
+ if (ret > 0)
+ nr_pages = ret;
+ else
+ nr_pages = 0;
+ ret = -EAGAIN; /* XXX ? */
+ goto out;
+ }
+
+ rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+ /*
+ * NOTE(review): optname is not range-checked here; presumably the
+ * caller only passes RDS_INFO_FIRST..RDS_INFO_LAST -- confirm.
+ */
+ func = rds_info_funcs[optname - RDS_INFO_FIRST];
+ if (!func) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ iter.pages = pages;
+ iter.addr = NULL;
+ /* byte offset of the user buffer within its first page */
+ iter.offset = start & (PAGE_SIZE - 1);
+
+ func(sock, len, &iter, &lens);
+ BUG_ON(lens.each == 0);
+
+ /* size in bytes of the whole snapshot as reported by func */
+ total = lens.nr * lens.each;
+
+ rds_info_iter_unmap(&iter);
+
+ /* either way the snapshot size is what goes back through optlen */
+ if (total > len) {
+ len = total;
+ ret = -ENOSPC;
+ } else {
+ len = total;
+ ret = lens.each;
+ }
+
+ if (put_user(len, optlen))
+ ret = -EFAULT;
+
+out:
+ if (pages)
+ unpin_user_pages(pages, nr_pages);
+ kfree(pages);
+
+ return ret;
+}
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000..a069b51c4
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+/* Size of an info snapshot, filled in by an rds_info_func. */
+struct rds_info_lengths {
+ unsigned int nr; /* number of elements in the snapshot */
+ unsigned int each; /* size of one element in bytes */
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source. If the snapshot fits in @len then it
+ * should be copied using @iter. The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000..1d73ad79c
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/ipv6.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "loop.h"
+
+/* loop_conns_lock is held around every change to the loop_conns list. */
+static DEFINE_SPINLOCK(loop_conns_lock);
+static LIST_HEAD(loop_conns);
+/* Set to 1 by rds_loop_exit() before the connections are destroyed. */
+static atomic_t rds_loop_unloading = ATOMIC_INIT(0);
+
+/* Flag the loopback transport as unloading. */
+static void rds_loop_set_unloading(void)
+{
+ atomic_set(&rds_loop_unloading, 1);
+}
+
+/* .t_unloading hook: the flag is global, so @conn is not consulted. */
+static bool rds_loop_is_unloading(struct rds_connection *conn)
+{
+ return atomic_read(&rds_loop_unloading) != 0;
+}
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport. At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Normally a message passes through the sender's conn and then the
+ * receiver's conn.  Over loopback the receive path is given the sending
+ * conn instead, so the sense of the addresses is reversed.
+ *
+ * Returns the header size plus the payload length from the header.  For
+ * congestion bitmap messages, which are not delivered, that value is
+ * capped to what remains of the current scatterlist entry.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+			 unsigned int hdr_off, unsigned int sg,
+			 unsigned int off)
+{
+	int ret = sizeof(struct rds_header) +
+		  be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	/* Do not send cong updates to loopback */
+	if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		struct scatterlist *cur_sg = &rm->data.op_sg[sg];
+
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		return min_t(int, ret, cur_sg->length - conn->c_xmit_data_off);
+	}
+
+	BUG_ON(hdr_off || sg || off);
+
+	rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);
+
+	rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
+			  GFP_KERNEL);
+
+	rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+			    NULL);
+
+	rds_inc_put(&rm->m_inc);
+
+	return ret;
+}
+
+/*
+ * The inc handed to the receive path is embedded in its rds_message, and
+ * rds_loop_xmit() took a message reference on its behalf so the message
+ * outlives the inc.  Drop that reference here.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
+{
+	rds_message_put(container_of(inc, struct rds_message, m_inc));
+}
+
+/*
+ * we need to at least give the thread something to succeed
+ *
+ * .recv_path hook: does no work and always returns 0.
+ */
+static int rds_loop_recv_path(struct rds_conn_path *cp)
+{
+ return 0;
+}
+
+/* Per-connection loopback state, stored in conn->c_transport_data. */
+struct rds_loop_connection {
+ struct list_head loop_node; /* entry on the loop_conns list */
+ struct rds_connection *conn; /* connection this state belongs to */
+};
+
+/*
+ * The loopback transport tracks its connections too, so that
+ * rds_conn_destroy() can be called on each of them on exit.  N.B. there
+ * are 1+ loopback addresses (127.*.*.*), so having several loopback
+ * conns allocated is not a bug, although rather useless.
+ *
+ * Returns 0 on success or -ENOMEM if the tracking struct cannot be
+ * allocated.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_loop_connection *lc = kzalloc(sizeof(*lc), gfp);
+	unsigned long irq_flags;
+
+	if (!lc)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&lc->loop_node);
+	lc->conn = conn;
+	conn->c_transport_data = lc;
+
+	spin_lock_irqsave(&loop_conns_lock, irq_flags);
+	list_add_tail(&lc->loop_node, &loop_conns);
+	spin_unlock_irqrestore(&loop_conns_lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * .conn_free hook: @arg is the struct rds_loop_connection to release.
+ * Unlink it from its list under loop_conns_lock, then free it.
+ */
+static void rds_loop_conn_free(void *arg)
+{
+	struct rds_loop_connection *lc = arg;
+	unsigned long irq_flags;
+
+	rdsdebug("lc %p\n", lc);
+
+	spin_lock_irqsave(&loop_conns_lock, irq_flags);
+	list_del(&lc->loop_node);
+	spin_unlock_irqrestore(&loop_conns_lock, irq_flags);
+
+	kfree(lc);
+}
+
+/* .conn_path_connect hook: report the connection as complete right away; always returns 0. */
+static int rds_loop_conn_path_connect(struct rds_conn_path *cp)
+{
+ rds_connect_complete(cp->cp_conn);
+ return 0;
+}
+
+/* .conn_path_shutdown hook: intentionally empty. */
+static void rds_loop_conn_path_shutdown(struct rds_conn_path *cp)
+{
+}
+
+/*
+ * Flag the transport as unloading, wait for an RCU grace period, then
+ * detach every tracked loopback connection and destroy it.
+ */
+void rds_loop_exit(void)
+{
+	struct rds_loop_connection *lc, *next;
+	LIST_HEAD(doomed);
+
+	rds_loop_set_unloading();
+	synchronize_rcu();
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&loop_conns_lock);
+	list_splice_init(&loop_conns, &doomed);
+	spin_unlock_irq(&loop_conns_lock);
+
+	list_for_each_entry_safe(lc, next, &doomed, loop_node) {
+		WARN_ON(lc->conn->c_passive);
+		rds_conn_destroy(lc->conn);
+	}
+}
+
+/*
+ * Destroy every loopback connection that belongs to @net.  Matching
+ * entries are moved to a private list under the lock and destroyed
+ * after the lock has been dropped.
+ */
+static void rds_loop_kill_conns(struct net *net)
+{
+	struct rds_loop_connection *lc, *next;
+	LIST_HEAD(doomed);
+
+	spin_lock_irq(&loop_conns_lock);
+	list_for_each_entry_safe(lc, next, &loop_conns, loop_node) {
+		if (read_pnet(&lc->conn->c_net) == net)
+			list_move_tail(&lc->loop_node, &doomed);
+	}
+	spin_unlock_irq(&loop_conns_lock);
+
+	list_for_each_entry_safe(lc, next, &doomed, loop_node) {
+		WARN_ON(lc->conn->c_passive);
+		rds_conn_destroy(lc->conn);
+	}
+}
+
+/* pernet .exit hook: destroy the loopback connections that belong to @net. */
+static void __net_exit rds_loop_exit_net(struct net *net)
+{
+ rds_loop_kill_conns(net);
+}
+
+/* Only .exit is set; no .init hook is provided. */
+static struct pernet_operations rds_loop_net_ops = {
+ .exit = rds_loop_exit_net,
+};
+
+/* Register the pernet ops; returns what register_pernet_device() returns. */
+int rds_loop_net_init(void)
+{
+ return register_pernet_device(&rds_loop_net_ops);
+}
+
+/* Undo rds_loop_net_init(). */
+void rds_loop_net_exit(void)
+{
+ unregister_pernet_device(&rds_loop_net_ops);
+}
+
+/*
+ * The only transmit hook set here is .xmit (rds_loop_xmit(), which hands
+ * the message to rds_recv_incoming() itself); no other .xmit_* hooks are
+ * provided.
+ * NOTE(review): this comment used to say .xmit_* is missing because loop
+ * doesn't go through generic rds_send_xmit() and doesn't call
+ * rds_recv_incoming(), which does not match the code below -- confirm
+ * the intended wording.  .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+ .xmit = rds_loop_xmit,
+ .recv_path = rds_loop_recv_path,
+ .conn_alloc = rds_loop_conn_alloc,
+ .conn_free = rds_loop_conn_free,
+ .conn_path_connect = rds_loop_conn_path_connect,
+ .conn_path_shutdown = rds_loop_conn_path_shutdown,
+ .inc_copy_to_user = rds_message_inc_copy_to_user,
+ .inc_free = rds_loop_inc_free,
+ .t_name = "loopback",
+ .t_type = RDS_TRANS_LOOP,
+ .t_unloading = rds_loop_is_unloading,
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000..bbc8cdd03
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c */
+extern struct rds_transport rds_loop_transport;
+
+int rds_loop_net_init(void);
+void rds_loop_net_exit(void);
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000..f71e1237e
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2006, 2020 Oracle and/or its affiliates.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/errqueue.h>
+
+#include "rds.h"
+
+/*
+ * Payload size in bytes of each extension header, indexed by extension
+ * type.  The add and next-extension helpers in this file use it both to
+ * validate a caller-supplied length and to derive the length from the
+ * type byte.
+ */
+static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
+[RDS_EXTHDR_NONE] = 0,
+[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_NPATHS] = sizeof(u16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
+};
+
+/* Take an additional reference on @rm; the debug line prints the count before the increment. */
+void rds_message_addref(struct rds_message *rm)
+{
+ rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount));
+ refcount_inc(&rm->m_refcount);
+}
+EXPORT_SYMBOL_GPL(rds_message_addref);
+
+/*
+ * Append @cookie to the cookie array in @info.  Returns false, leaving
+ * @info untouched, when the array already holds RDS_MAX_ZCOOKIES entries;
+ * returns true otherwise.
+ */
+static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie)
+{
+	struct rds_zcopy_cookies *ck = &info->zcookies;
+
+	if (ck->num == RDS_MAX_ZCOOKIES)
+		return false;
+
+	ck->cookies[ck->num++] = cookie;
+	return true;
+}
+
+/* Map an embedded znotifier back to the rds_msg_zcopy_info that contains it. */
+static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif)
+{
+ return container_of(znotif, struct rds_msg_zcopy_info, znotif);
+}
+
+/*
+ * Free every entry queued on @q.  The entries are detached onto a
+ * private list under the queue lock and freed once the lock has been
+ * released.
+ */
+void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q)
+{
+	struct rds_msg_zcopy_info *info, *next;
+	unsigned long irq_flags;
+	LIST_HEAD(detached);
+
+	spin_lock_irqsave(&q->lock, irq_flags);
+	list_splice_init(&q->zcookie_head, &detached);
+	spin_unlock_irqrestore(&q->lock, irq_flags);
+
+	list_for_each_entry_safe(info, next, &detached, rs_zcookie_next) {
+		list_del(&info->rs_zcookie_next);
+		kfree(info);
+	}
+}
+
+/*
+ * Queue the completion cookie carried by @znotif on @rs's zerocopy
+ * cookie queue.
+ *
+ * If the first entry already on the queue still has room, the cookie is
+ * added to it and the info struct containing @znotif is freed.  Otherwise
+ * that info struct is reused as a new entry holding just this cookie and
+ * is appended to the tail of the queue.
+ */
+static void rds_rm_zerocopy_callback(struct rds_sock *rs,
+ struct rds_znotifier *znotif)
+{
+ struct rds_msg_zcopy_info *info;
+ struct rds_msg_zcopy_queue *q;
+ u32 cookie = znotif->z_cookie;
+ struct rds_zcopy_cookies *ck;
+ struct list_head *head;
+ unsigned long flags;
+
+ mm_unaccount_pinned_pages(&znotif->z_mmp);
+ q = &rs->rs_zcookie_queue;
+ spin_lock_irqsave(&q->lock, flags);
+ head = &q->zcookie_head;
+ if (!list_empty(head)) {
+ info = list_first_entry(head, struct rds_msg_zcopy_info,
+ rs_zcookie_next);
+ if (rds_zcookie_add(info, cookie)) {
+ spin_unlock_irqrestore(&q->lock, flags);
+ /* cookie now lives in the queued entry; ours is not needed */
+ kfree(rds_info_from_znotifier(znotif));
+ /* caller invokes rds_wake_sk_sleep() */
+ return;
+ }
+ }
+
+ /* queue empty or first entry full: turn our own info into a new entry */
+ info = rds_info_from_znotifier(znotif);
+ ck = &info->zcookies;
+ memset(ck, 0, sizeof(*ck));
+ WARN_ON(!rds_zcookie_add(info, cookie));
+ list_add_tail(&info->rs_zcookie_next, &q->zcookie_head);
+
+ spin_unlock_irqrestore(&q->lock, flags);
+ /* caller invokes rds_wake_sk_sleep() */
+}
+
+/*
+ * Release what @rm holds: its socket reference (delivering a pending
+ * zerocopy notification first), its data pages, and any rdma/atomic op
+ * state and MR references.  Messages flagged RDS_MSG_PAGEVEC are left
+ * untouched.
+ *
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+ unsigned long i, flags;
+ bool zcopy = false;
+
+ if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+ return;
+
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+ if (rm->m_rs) {
+ struct rds_sock *rs = rm->m_rs;
+
+ if (rm->data.op_mmp_znotifier) {
+ /* remember this was a zerocopy message for the page loop below */
+ zcopy = true;
+ rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+ rds_wake_sk_sleep(rs);
+ rm->data.op_mmp_znotifier = NULL;
+ }
+ sock_put(rds_rs_to_sk(rs));
+ rm->m_rs = NULL;
+ }
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ /* zerocopy pages are released with put_page(), all others with __free_page() */
+ for (i = 0; i < rm->data.op_nents; i++) {
+ /* XXX will have to put_page for page refs */
+ if (!zcopy)
+ __free_page(sg_page(&rm->data.op_sg[i]));
+ else
+ put_page(sg_page(&rm->data.op_sg[i]));
+ }
+ rm->data.op_nents = 0;
+
+ if (rm->rdma.op_active)
+ rds_rdma_free_op(&rm->rdma);
+ if (rm->rdma.op_rdma_mr)
+ kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final);
+
+ if (rm->atomic.op_active)
+ rds_atomic_free_op(&rm->atomic);
+ if (rm->atomic.op_rdma_mr)
+ kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final);
+}
+
+/*
+ * Drop one reference on @rm.  When the last reference goes away the
+ * message must already be off its socket and connection lists; it is
+ * then purged and freed.
+ */
+void rds_message_put(struct rds_message *rm)
+{
+	rdsdebug("put rm %p ref %d\n", rm, refcount_read(&rm->m_refcount));
+	WARN(!refcount_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+
+	if (!refcount_dec_and_test(&rm->m_refcount))
+		return;
+
+	BUG_ON(!list_empty(&rm->m_sock_item));
+	BUG_ON(!list_empty(&rm->m_conn_item));
+	rds_message_purge(rm);
+
+	kfree(rm);
+}
+EXPORT_SYMBOL_GPL(rds_message_put);
+
+/*
+ * Initialise @hdr: clear the flags, store the ports as given (already
+ * big-endian), store @seq converted to big-endian, and mark the
+ * extension area as empty.
+ */
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq)
+{
+ hdr->h_flags = 0;
+ hdr->h_sport = sport;
+ hdr->h_dport = dport;
+ hdr->h_sequence = cpu_to_be64(seq);
+ hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+EXPORT_SYMBOL_GPL(rds_message_populate_header);
+
+/*
+ * Write a single extension header of @type, with @len bytes of payload
+ * taken from @data, at the start of @hdr's extension area and terminate
+ * it with RDS_EXTHDR_NONE.
+ *
+ * Returns 1 on success.  Returns 0 without modifying @hdr when an
+ * extension is already present, when @type is unknown, when @len is not
+ * the size registered for @type, or when the extension would not fit.
+ */
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+			      const void *data, unsigned int len)
+{
+	unsigned int needed = sizeof(u8) + len;
+	unsigned char *ext = hdr->h_exthdr;
+
+	/* For now, refuse to add more than one extension header */
+	if (ext[0] != RDS_EXTHDR_NONE)
+		return 0;
+
+	if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+		return 0;
+
+	/* type byte + payload must leave room for the terminator */
+	if (needed >= RDS_HEADER_EXT_SPACE)
+		return 0;
+
+	ext[0] = type;
+	memcpy(&ext[1], data, len);
+	ext[1 + len] = RDS_EXTHDR_NONE;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rds_message_add_extension);
+
+/*
+ * Iterate over the extension headers of a message.  Usage:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ *	buflen = sizeof(buffer);
+ *	type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ *	if (type == RDS_EXTHDR_NONE)
+ *		break;
+ *	...
+ * }
+ *
+ * On a hit the extension type is returned, up to *buflen bytes of its
+ * payload are copied to @buf, *buflen is set to the number copied and
+ * *pos is moved past the extension.  At the end of the list, or on a
+ * malformed entry, RDS_EXTHDR_NONE is returned with *buflen set to 0 and
+ * *pos set to RDS_HEADER_EXT_SPACE.
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+		unsigned int *pos, void *buf, unsigned int *buflen)
+{
+	unsigned int cursor = *pos;
+	unsigned int type, size;
+	u8 *ext = hdr->h_exthdr;
+
+	if (cursor >= RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	/* Get the extension type and length. For now, the
+	 * length is implied by the extension type. */
+	type = ext[cursor++];
+	if (type == RDS_EXTHDR_NONE || type >= __RDS_EXTHDR_MAX)
+		goto none;
+
+	size = rds_exthdr_size[type];
+	if (cursor + size > RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	*pos = cursor + size;
+	/* never copy more than the caller's buffer can take */
+	if (size < *buflen)
+		*buflen = size;
+	memcpy(buf, ext + cursor, *buflen);
+	return type;
+
+none:
+	*pos = RDS_HEADER_EXT_SPACE;
+	*buflen = 0;
+	return RDS_EXTHDR_NONE;
+}
+
+/*
+ * Add an RDS_EXTHDR_RDMA_DEST extension carrying @r_key and @offset, both
+ * converted to big-endian.  Returns what rds_message_add_extension()
+ * returns: 1 on success, 0 if the extension could not be added.
+ */
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+ struct rds_ext_header_rdma_dest ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
+ ext_hdr.h_rdma_offset = cpu_to_be32(offset);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+}
+EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
+
+/*
+ * Allocate a zeroed rds_message followed by @extra_len bytes that serve
+ * as its scatterlist pool, so the ops can take their SGs from the same
+ * allocation (see rds_message_alloc_sgs()) instead of allocating again.
+ *
+ * Returns the message holding one reference, or NULL if @extra_len is
+ * too large or the allocation fails.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+{
+	struct rds_message *rm;
+
+	if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
+		return NULL;
+
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+	if (!rm)
+		return NULL;
+
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
+	refcount_set(&rm->m_refcount, 1);
+	INIT_LIST_HEAD(&rm->m_sock_item);
+	INIT_LIST_HEAD(&rm->m_conn_item);
+	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);
+
+	return rm;
+}
+
+/*
+ * Claim @nents consecutive scatterlist entries from the pool that trails
+ * @rm.  The claimed entries are initialized with sg_init_table().
+ *
+ * Returns the first claimed entry, ERR_PTR(-EINVAL) if @nents is not
+ * positive, or ERR_PTR(-ENOMEM) if the pool does not have @nents entries
+ * left.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *pool;
+	struct scatterlist *chunk;
+
+	if (nents <= 0) {
+		pr_warn("rds: alloc sgs failed! nents <= 0\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (rm->m_used_sgs + nents > rm->m_total_sgs) {
+		pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n",
+			rm->m_total_sgs, rm->m_used_sgs, nents);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* The pool starts right behind the rds_message itself. */
+	pool = (struct scatterlist *)(rm + 1);
+	chunk = pool + rm->m_used_sgs;
+
+	sg_init_table(chunk, nents);
+	rm->m_used_sgs += nents;
+
+	return chunk;
+}
+
+/*
+ * Build a message whose data scatterlist points at existing pages.
+ *
+ * @page_addrs: kernel virtual addresses, one per page of payload
+ * @total_len:  payload length in bytes
+ *
+ * The message is flagged RDS_MSG_PAGEVEC and every SG entry spans a full
+ * PAGE_SIZE.  Returns the message, or an ERR_PTR on allocation failure.
+ */
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+	int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
+	struct scatterlist *sgl;
+	struct rds_message *rm;
+	unsigned int i;
+
+	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+	if (!rm)
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+	rm->data.op_nents = num_sgs;
+
+	sgl = rds_message_alloc_sgs(rm, num_sgs);
+	rm->data.op_sg = sgl;
+	if (IS_ERR(sgl)) {
+		/* The error lives in a local, so @rm may be dropped first. */
+		rds_message_put(rm);
+		return ERR_CAST(sgl);
+	}
+
+	for (i = 0; i < rm->data.op_nents; ++i) {
+		struct page *pg = virt_to_page((void *)page_addrs[i]);
+
+		sg_set_page(&rm->data.op_sg[i], pg, PAGE_SIZE, 0);
+	}
+
+	return rm;
+}
+
+/*
+ * Zero-copy variant of rds_message_copy_from_user(): instead of copying the
+ * payload, take page references on the user pages described by @from and
+ * record them in @rm's data scatterlist (one SG entry per page fragment).
+ *
+ * A rds_msg_zcopy_info is allocated to carry the completion notifier, and
+ * the pinned bytes are charged through mm_account_pinned_pages().
+ *
+ * Returns 0 on success, -ENOMEM if the notifier cannot be allocated or the
+ * accounting is refused, and -EFAULT if the user pages cannot be obtained.
+ * On failure the notifier is freed and rm->data.op_mmp_znotifier is NULL.
+ */
+static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
+{
+	struct scatterlist *sg;
+	int ret = 0;
+	int length = iov_iter_count(from);
+	struct rds_msg_zcopy_info *info;
+
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+
+	/*
+	 * now allocate and copy in the data payload.
+	 */
+	sg = rm->data.op_sg;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&info->rs_zcookie_next);
+	rm->data.op_mmp_znotifier = &info->znotif;
+	if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+				    length)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	while (iov_iter_count(from)) {
+		struct page *pages;
+		size_t start;
+		ssize_t copied;
+
+		/* Grab at most one page worth of the iterator per SG entry. */
+		copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE,
+					     1, &start);
+		if (copied < 0) {
+			struct mmpin *mmp;
+			int i;
+
+			/* Undo the page references taken so far. */
+			for (i = 0; i < rm->data.op_nents; i++)
+				put_page(sg_page(&rm->data.op_sg[i]));
+			/*
+			 * NOTE(review): op_nents is left non-zero here although
+			 * the pages were just released; confirm that the later
+			 * message teardown does not drop them a second time.
+			 */
+			mmp = &rm->data.op_mmp_znotifier->z_mmp;
+			mm_unaccount_pinned_pages(mmp);
+			ret = -EFAULT;
+			goto err;
+		}
+		length -= copied;
+		sg_set_page(sg, pages, copied, start);
+		rm->data.op_nents++;
+		sg++;
+	}
+	/* Every byte counted up front must have been consumed. */
+	WARN_ON_ONCE(length != 0);
+	return ret;
+err:
+	kfree(info);
+	rm->data.op_mmp_znotifier = NULL;
+	return ret;
+}
+
+/*
+ * Fill @rm's data scatterlist with the payload described by @from.
+ *
+ * The header length is set from the iterator.  With @zcopy the work is
+ * delegated to rds_message_zcopy_from_user(); otherwise backing pages are
+ * obtained through rds_page_remainder_alloc() as needed and the payload is
+ * copied into them.
+ *
+ * Returns 0 on success, the allocator's error if a page cannot be obtained,
+ * or -EFAULT if the copy from the iterator comes up short.
+ */
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy)
+{
+	struct scatterlist *cur;
+	unsigned long filled = 0;
+	int ret;
+
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+
+	if (zcopy)
+		return rds_message_zcopy_from_user(rm, from);
+
+	cur = rm->data.op_sg;
+
+	while (iov_iter_count(from)) {
+		unsigned long chunk, done;
+
+		/* An entry without a page has not been backed yet. */
+		if (!sg_page(cur)) {
+			ret = rds_page_remainder_alloc(cur, iov_iter_count(from),
+						       GFP_HIGHUSER);
+			if (ret)
+				return ret;
+			rm->data.op_nents++;
+			filled = 0;
+		}
+
+		chunk = min_t(unsigned long, iov_iter_count(from),
+			      cur->length - filled);
+
+		rds_stats_add(s_copy_from_user, chunk);
+		done = copy_page_from_iter(sg_page(cur), cur->offset + filled,
+					   chunk, from);
+		if (done != chunk)
+			return -EFAULT;
+
+		filled += chunk;
+
+		/* Entry is full: continue with the next one. */
+		if (filled == cur->length)
+			cur++;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the payload of the message that embeds @inc into @to.
+ *
+ * Copies until either @to has no room left or the length recorded in the
+ * message header has been delivered.  Returns the number of bytes copied,
+ * or -EFAULT if a page copy comes up short.
+ */
+int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+	struct scatterlist *cur = rm->data.op_sg;
+	unsigned long seg_off = 0;
+	int copied = 0;
+
+	while (iov_iter_count(to) && copied < len) {
+		unsigned long chunk;
+		int ret;
+
+		/* Bounded by: room in @to, rest of this entry, rest of msg. */
+		chunk = min_t(unsigned long, iov_iter_count(to),
+			      cur->length - seg_off);
+		chunk = min_t(unsigned long, chunk, len - copied);
+
+		rds_stats_add(s_copy_to_user, chunk);
+		ret = copy_page_to_iter(sg_page(cur), cur->offset + seg_off,
+					chunk, to);
+		if (ret != chunk)
+			return -EFAULT;
+
+		seg_off += chunk;
+		copied += chunk;
+
+		if (seg_off == cur->length) {
+			seg_off = 0;
+			cur++;
+		}
+	}
+
+	return copied;
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ *
+ * Sleeps on rm->m_flush_wait until RDS_MSG_MAPPED is clear in rm->m_flags.
+ * The sleep is interruptible and the result of wait_event_interruptible()
+ * is discarded, so the flag may still be set on return if the wait was
+ * cut short.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+ wait_event_interruptible(rm->m_flush_wait,
+ !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+/*
+ * Clear RDS_MSG_MAPPED on @rm and wake any task sleeping on
+ * rm->m_flush_wait (the wait queue rds_message_wait() blocks on).
+ */
+void rds_message_unmapped(struct rds_message *rm)
+{
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
+}
+EXPORT_SYMBOL_GPL(rds_message_unmapped);
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000..7cc57e098
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * Cache of one partially handed-out page; one instance exists per CPU.
+ *
+ * r_page:   the cached page, or NULL when nothing is cached
+ * r_offset: offset within r_page at which the next fragment will start
+ */
+struct rds_page_remainder {
+ struct page *r_page;
+ unsigned long r_offset;
+};
+
+/* The per-CPU remainder caches used by rds_page_remainder_alloc(). */
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
+
+/**
+ * rds_page_remainder_alloc - build up regions of a message.
+ *
+ * @scat: Scatter list for message
+ * @bytes: the number of bytes needed.
+ * @gfp: the waiting behaviour of the allocation
+ *
+ * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
+ * kmap the pages, etc.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page(). The scatterlist entry then covers exactly PAGE_SIZE bytes,
+ * even when @bytes is larger.
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure. Future partial-page allocations may be
+ * satisfied from that cached region. This lets us waste less memory on
+ * small allocations with minimal complexity. It works because the transmit
+ * path passes read-only page regions down to devices. They hold a page
+ * reference until they are done with the region.
+ *
+ * Return: 0 on success with @scat describing the region, or -ENOMEM if
+ * alloc_page() fails.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp)
+{
+ struct rds_page_remainder *rem;
+ unsigned long flags;
+ struct page *page;
+ int ret;
+
+ gfp |= __GFP_HIGHMEM;
+
+ /* a request for a whole page or more bypasses the per-cpu cache */
+ if (bytes >= PAGE_SIZE) {
+ page = alloc_page(gfp);
+ if (!page) {
+ ret = -ENOMEM;
+ } else {
+ sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ /* pick this CPU's cache and keep interrupts off while touching it */
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ while (1) {
+ /* avoid a tiny region getting stuck by tossing it */
+ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+ rds_stats_inc(s_page_remainder_miss);
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+
+ /* hand out a fragment from the cached page */
+ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+ sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+ /* the fragment gets its own page reference */
+ get_page(sg_page(scat));
+
+ /* offset 0 means the page was installed for this request */
+ if (rem->r_offset != 0)
+ rds_stats_inc(s_page_remainder_hit);
+
+ /* the next fragment starts at an 8-byte aligned offset */
+ rem->r_offset += ALIGN(bytes, 8);
+ if (rem->r_offset >= PAGE_SIZE) {
+ /* page fully handed out: the cache lets go of it */
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+ ret = 0;
+ break;
+ }
+
+ /* alloc if there is nothing for us to use */
+ /* (irqs and the cpu pin are released around alloc_page(),
+ * presumably because the allocation may sleep) */
+ local_irq_restore(flags);
+ put_cpu();
+
+ page = alloc_page(gfp);
+
+ /* re-read the cache pointer: get_cpu() is called afresh here */
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* did someone race to fill the remainder before us? */
+ if (rem->r_page) {
+ __free_page(page);
+ continue;
+ }
+
+ /* otherwise install our page and loop around to alloc */
+ rem->r_page = page;
+ rem->r_offset = 0;
+ }
+
+ local_irq_restore(flags);
+ put_cpu();
+out:
+ rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+ ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+ ret ? 0 : scat->length);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
+
+/*
+ * Release the page cached in every possible CPU's remainder slot and
+ * leave each slot empty.
+ */
+void rds_page_exit(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct rds_page_remainder *slot =
+			&per_cpu(rds_page_remainders, cpu);
+
+		rdsdebug("cpu %u\n", cpu);
+
+		if (slot->r_page) {
+			__free_page(slot->r_page);
+			slot->r_page = NULL;
+		}
+	}
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000..fba82d365
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rds.h"
+
+/*
+ * XXX
+ * - build with sparse
+ * - should we detect duplicate keys on a socket? hmm.
+ * - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * Number of pages touched by @vec, derived from the page indices of its
+ * first byte and of the byte just past its end.
+ *
+ * Returns 0 for an invalid vec: one whose length is zero or makes the end
+ * address wrap, or whose length does not fit in an unsigned int (the
+ * length ends up in the 'length' member of 'struct scatterlist').
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+	u64 first = vec->addr;
+	u64 end = first + vec->bytes;
+
+	if (end <= first)
+		return 0;
+	if (vec->bytes > (u64)UINT_MAX)
+		return 0;
+
+	return ((end + PAGE_SIZE - 1) >> PAGE_SHIFT) - (first >> PAGE_SHIFT);
+}
+
+/*
+ * Search the rbtree of MRs rooted at @root for @key.
+ *
+ * Returns the rds_mr whose r_key equals @key, or NULL if there is none.
+ * When nothing matches and @insert is non-NULL, @insert is linked into the
+ * tree at the spot where the search ended and a reference is taken on it
+ * on behalf of the tree; NULL is still returned in that case. When a match
+ * is found, @insert is left untouched.
+ *
+ * The insertion spot is derived from @key, so callers are expected to pass
+ * @insert->r_key as @key. The callers visible in this file hold
+ * rs->rs_rdma_lock around the call.
+ */
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+ struct rds_mr *insert)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_mr *mr;
+
+ /* descend, remembering the link slot and its parent for insertion */
+ while (*p) {
+ parent = *p;
+ mr = rb_entry(parent, struct rds_mr, r_rb_node);
+
+ if (key < mr->r_key)
+ p = &(*p)->rb_left;
+ else if (key > mr->r_key)
+ p = &(*p)->rb_right;
+ else
+ return mr;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->r_rb_node, parent, p);
+ rb_insert_color(&insert->r_rb_node, root);
+ /* the tree now holds its own reference */
+ kref_get(&insert->r_kref);
+ }
+ return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ *
+ * Under rs_rdma_lock the MR is removed from the socket's key tree (if it
+ * is still linked) and its r_trans_private pointer is taken and cleared.
+ * The transport's free_mr() is then called outside the lock, and only if
+ * there was a private pointer to free.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+ struct rds_sock *rs = mr->r_sock;
+ void *trans_private = NULL;
+ unsigned long flags;
+
+ rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+ mr->r_key, kref_read(&mr->r_kref));
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ if (!RB_EMPTY_NODE(&mr->r_rb_node))
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ /* claim the private pointer so it is handed to free_mr() only once */
+ trans_private = mr->r_trans_private;
+ mr->r_trans_private = NULL;
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (trans_private)
+ mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+/*
+ * kref release callback for a rds_mr (passed to kref_put() throughout this
+ * file): tears down the MR via rds_destroy_mr() and frees the rds_mr.
+ */
+void __rds_put_mr_final(struct kref *kref)
+{
+ struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
+
+ rds_destroy_mr(mr);
+ kfree(mr);
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ *
+ * Empties rs->rs_rdma_keys, dropping the tree's reference on every MR, and
+ * finally asks the transport to flush its MRs if it provides flush_mrs().
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+ struct rds_mr *mr;
+ struct rb_node *node;
+ unsigned long flags;
+
+ /* Release any MRs associated with this socket */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ while ((node = rb_first(&rs->rs_rdma_keys))) {
+ mr = rb_entry(node, struct rds_mr, r_rb_node);
+ /* MRs of the socket's own transport are freed without invalidate */
+ if (mr->r_trans == rs->rs_transport)
+ mr->r_invalidate = 0;
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ /* kref_put() may end in rds_destroy_mr(), which takes rs_rdma_lock
+ * itself, so the lock is dropped around it */
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (rs->rs_transport && rs->rs_transport->flush_mrs)
+ rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ *
+ * Pins @nr_pages pages starting at @user_addr with FOLL_LONGTERM (plus
+ * FOLL_WRITE when @write is set) and stores them in @pages.  A partial pin
+ * is undone and reported as -EFAULT, so the result is either the full
+ * @nr_pages or a negative errno from pin_user_pages_fast().
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+			 struct page **pages, int write)
+{
+	unsigned int gup_flags = write ? (FOLL_LONGTERM | FOLL_WRITE)
+				       : FOLL_LONGTERM;
+	int pinned;
+
+	pinned = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
+	if (pinned < 0 || pinned >= nr_pages)
+		return pinned;
+
+	/* Only part of the range could be pinned: release it and fail. */
+	unpin_user_pages(pages, pinned);
+	return -EFAULT;
+}
+
+/*
+ * Register the user memory range described by @args as an RDMA memory
+ * region (MR) on socket @rs.
+ *
+ * @rs:         socket that will own the MR; must be bound and have a
+ *              transport providing get_mr()
+ * @args:       address range, flags and optional user address for the cookie
+ * @cookie_ret: if non-NULL, receives the <R_Key, offset> cookie
+ * @mr_ret:     if non-NULL, receives the MR with an extra reference held
+ * @cp:         connection path whose connection is passed to get_mr(), or
+ *              NULL
+ *
+ * The range is pinned and handed to the transport as a scatterlist.  If
+ * pinning reports -EOPNOTSUPP the transport is asked for an on-demand
+ * paging (ODP) MR instead and no pages are pinned.  On success the MR is
+ * inserted into rs->rs_rdma_keys, which holds its own reference.
+ *
+ * Returns 0 on success or a negative errno: -ENOTCONN (unbound socket or
+ * no transport), -EOPNOTSUPP (transport without get_mr), -EINVAL (bad
+ * range), -EPERM (mlock not permitted), -EMSGSIZE (range too large),
+ * -ENOMEM, -EFAULT (cookie could not be written to user space), or the
+ * error from pinning / get_mr().
+ */
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+			  u64 *cookie_ret, struct rds_mr **mr_ret,
+			  struct rds_conn_path *cp)
+{
+	struct rds_mr *mr = NULL, *found;
+	struct scatterlist *sg = NULL;
+	unsigned int nr_pages;
+	struct page **pages = NULL;
+	void *trans_private;
+	unsigned long flags;
+	rds_rdma_cookie_t cookie;
+	unsigned int nents = 0;
+	int need_odp = 0;
+	long i;
+	int ret;
+
+	if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	if (!rs->rs_transport->get_mr) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* If the combination of the addr and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
+	    PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
+		    (args->vec.addr + args->vec.bytes)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!can_do_mlock()) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	nr_pages = rds_pages_in_vec(&args->vec);
+	if (nr_pages == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Restrict the size of mr irrespective of underlying transport
+	 * To account for unaligned mr regions, subtract one from nr_pages
+	 */
+	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
+	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+		 args->vec.addr, args->vec.bytes, nr_pages);
+
+	/* XXX clamp nr_pages to limit the size of this alloc? */
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+	if (!mr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* This initial reference is dropped at 'out'. */
+	kref_init(&mr->r_kref);
+	RB_CLEAR_NODE(&mr->r_rb_node);
+	mr->r_trans = rs->rs_transport;
+	mr->r_sock = rs;
+
+	if (args->flags & RDS_RDMA_USE_ONCE)
+		mr->r_use_once = 1;
+	if (args->flags & RDS_RDMA_INVALIDATE)
+		mr->r_invalidate = 1;
+	if (args->flags & RDS_RDMA_READWRITE)
+		mr->r_write = 1;
+
+	/*
+	 * Pin the pages that make up the user buffer and transfer the page
+	 * pointers to the mr's sg array.  We check to see if we've mapped
+	 * the whole region after transferring the partial page references
+	 * to the sg array so that we can have one page ref cleanup path.
+	 *
+	 * For now we have no flag that tells us whether the mapping is
+	 * r/o or r/w.  We need to assume r/w, or we'll do a lot of RDMA to
+	 * the zero page.
+	 */
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+	if (ret == -EOPNOTSUPP) {
+		need_odp = 1;
+	} else if (ret <= 0) {
+		goto out;
+	} else {
+		nents = ret;
+		sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
+		if (!sg) {
+			/* Nothing owns the pinned pages yet, so they must be
+			 * released here or they would stay pinned forever.
+			 */
+			unpin_user_pages(pages, nents);
+			ret = -ENOMEM;
+			goto out;
+		}
+		WARN_ON(!nents);
+		sg_init_table(sg, nents);
+
+		/* Stick all pages into the scatterlist */
+		for (i = 0 ; i < nents; i++)
+			sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+		rdsdebug("RDS: trans_private nents is %u\n", nents);
+	}
+	/* Obtain a transport specific MR. If this succeeds, the
+	 * s/g list is now owned by the MR.
+	 * Note that dma_map() implies that pending writes are
+	 * flushed to RAM, so no dma_sync is needed here. */
+	trans_private = rs->rs_transport->get_mr(
+		sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
+		args->vec.addr, args->vec.bytes,
+		need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
+
+	if (IS_ERR(trans_private)) {
+		/* In ODP case, we don't GUP pages, so don't need
+		 * to release anything.
+		 */
+		if (!need_odp) {
+			unpin_user_pages(pages, nr_pages);
+			kfree(sg);
+		}
+		ret = PTR_ERR(trans_private);
+		goto out;
+	}
+
+	mr->r_trans_private = trans_private;
+
+	rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+		 mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+	/* The user may pass us an unaligned address, but we can only
+	 * map page aligned regions. So we keep the offset, and build
+	 * a 64bit cookie containing <R_Key, offset> and pass that
+	 * around. */
+	if (need_odp)
+		cookie = rds_rdma_make_cookie(mr->r_key, 0);
+	else
+		cookie = rds_rdma_make_cookie(mr->r_key,
+					      args->vec.addr & ~PAGE_MASK);
+	if (cookie_ret)
+		*cookie_ret = cookie;
+
+	if (args->cookie_addr &&
+	    put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
+		/*
+		 * NOTE(review): mr->r_trans_private is already set, so the
+		 * kref_put() at 'out' reaches free_mr() for an MR that owns
+		 * this s/g list; confirm the transport does not release the
+		 * pages and sg a second time.
+		 */
+		if (!need_odp) {
+			unpin_user_pages(pages, nr_pages);
+			kfree(sg);
+		}
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Inserting the new MR into the rbtree bumps its
+	 * reference count. */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	BUG_ON(found && found != mr);
+
+	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+	if (mr_ret) {
+		/* The caller gets a reference of its own. */
+		kref_get(&mr->r_kref);
+		*mr_ret = mr;
+	}
+
+	ret = 0;
+out:
+	kfree(pages);
+	if (mr)
+		kref_put(&mr->r_kref, __rds_put_mr_final);
+	return ret;
+}
+
+/*
+ * Socket-option handler: copy a struct rds_get_mr_args from @optval and
+ * register the memory region it describes on @rs.
+ *
+ * Returns -EINVAL if @optlen is not the size of the argument structure,
+ * -EFAULT if it cannot be copied in, otherwise the result of
+ * __rds_rdma_map().
+ */
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
+{
+	struct rds_get_mr_args mr_args;
+
+	if (optlen != sizeof(mr_args))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&mr_args, optval, sizeof(mr_args)))
+		return -EFAULT;
+
+	return __rds_rdma_map(rs, &mr_args, NULL, NULL, NULL);
+}
+
+/*
+ * Socket-option handler taking a struct rds_get_mr_for_dest_args.
+ *
+ * Initially, just behave like get_mr(): only the vec, cookie_addr and
+ * flags members are forwarded to __rds_rdma_map().
+ * TODO: Implement get_mr as wrapper around this and deprecate it.
+ *
+ * Returns -EINVAL on a wrong @optlen, -EFAULT if the arguments cannot be
+ * copied in, otherwise the result of __rds_rdma_map().
+ */
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen)
+{
+	struct rds_get_mr_for_dest_args dest_args;
+	struct rds_get_mr_args mr_args;
+
+	if (optlen != sizeof(dest_args))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&dest_args, optval, sizeof(dest_args)))
+		return -EFAULT;
+
+	mr_args.vec = dest_args.vec;
+	mr_args.cookie_addr = dest_args.cookie_addr;
+	mr_args.flags = dest_args.flags;
+
+	return __rds_rdma_map(rs, &mr_args, NULL, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ *
+ * Socket-option handler taking a struct rds_free_mr_args. Returns 0 on
+ * success, -EINVAL if @optlen is wrong, the key is unknown, or a flush was
+ * requested on a transport without flush_mrs(), and -EFAULT if the
+ * arguments cannot be copied in.
+ */
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
+{
+ struct rds_free_mr_args args;
+ struct rds_mr *mr;
+ unsigned long flags;
+
+ if (optlen != sizeof(struct rds_free_mr_args))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args)))
+ return -EFAULT;
+
+ /* Special case - a null cookie means flush all unused MRs */
+ if (args.cookie == 0) {
+ if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+ return -EINVAL;
+ rs->rs_transport->flush_mrs();
+ return 0;
+ }
+
+ /* Look up the MR given its R_key and remove it from the rbtree
+ * so nobody else finds it.
+ * This should also prevent races with rds_rdma_unuse.
+ */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+ if (mr) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ if (args.flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (!mr)
+ return -EINVAL;
+
+ /* drop the reference the tree held on the MR */
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+ return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ *
+ * @rs: socket owning the MR
+ * @r_key: key of the MR that was used
+ * @force: non-zero to drop the MR from the socket even if it was not
+ * registered as use-once
+ *
+ * An unknown @r_key is only logged at debug level.
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+ struct rds_mr *mr;
+ unsigned long flags;
+ int zot_me = 0;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (!mr) {
+ pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+ r_key);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ return;
+ }
+
+ /* Get a reference so that the MR won't go away before calling
+ * sync_mr() below.
+ */
+ kref_get(&mr->r_kref);
+
+ /* If it is going to be freed, remove it from the tree now so
+ * that no other thread can find it and free it.
+ */
+ if (mr->r_use_once || force) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ zot_me = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ /* May have to issue a dma_sync on this memory region.
+ * Note we could avoid this if the operation was a RDMA READ,
+ * but at this point we can't tell. */
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* Release the reference held above. */
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ /* (this put gives up the reference the tree held before the erase) */
+ if (zot_me)
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+}
+
+/*
+ * Release the resources held by an RDMA op and mark it inactive.
+ *
+ * An op backed by an ODP MR only drops its reference on that MR;
+ * otherwise every page in the op's scatterlist is unpinned. The notifier
+ * is freed in both cases.
+ */
+void rds_rdma_free_op(struct rm_rdma_op *ro)
+{
+ unsigned int i;
+
+ if (ro->op_odp_mr) {
+ kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);
+ } else {
+ for (i = 0; i < ro->op_nents; i++) {
+ struct page *page = sg_page(&ro->op_sg[i]);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory
+ * (op_write is clear for such an op, hence the negation)
+ */
+ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
+ }
+ }
+
+ kfree(ro->op_notifier);
+ ro->op_notifier = NULL;
+ ro->op_active = 0;
+ ro->op_odp_mr = NULL;
+}
+
+/*
+ * Release the resources held by an atomic op and mark it inactive: unpin
+ * the page referenced by its scatterlist entry and free the notifier.
+ */
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+ struct page *page = sg_page(ao->op_sg);
+
+ /* The page is always marked dirty when it is unpinned, presumably
+ * because an atomic operation may have written to the local
+ * memory. */
+ unpin_user_pages_dirty_lock(&page, 1, true);
+
+ kfree(ao->op_notifier);
+ ao->op_notifier = NULL;
+ ao->op_active = 0;
+}
+
+
+/*
+ * Count the number of pages needed to describe an incoming iovec array.
+ *
+ * Returns the page total over all @nr_iovecs entries of @iov, or -EINVAL
+ * if any entry is invalid or the running total overflows.
+ */
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	struct rds_iovec *cur = iov;
+	int total = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < nr_iovecs; idx++, cur++) {
+		unsigned int n = rds_pages_in_vec(cur);
+
+		if (!n)
+			return -EINVAL;
+
+		total += n;
+
+		/*
+		 * A single entry contributes at most (UINT_MAX>>PAGE_SHIFT)+1
+		 * pages, so the sum turns negative before it can wrap all
+		 * the way around.
+		 */
+		if (total < 0)
+			return -EINVAL;
+	}
+
+	return total;
+}
+
+/*
+ * Work out how many bytes of scatterlist space an RDMA request needs.
+ *
+ * @args: RDMA arguments; nr_local and local_vec_addr describe the user
+ *        array of struct rds_iovec
+ * @iov:  receives a kernel copy of that array in iov->iov (allocated here;
+ *        it is not freed on error, so the caller owns it as soon as it is
+ *        set) and, once the copy succeeded, its length in iov->len
+ *
+ * Returns the number of bytes needed for the scatterlist entries, or a
+ * negative errno: -EINVAL (no local vectors, or an invalid vector),
+ * -EMSGSIZE (too many vectors, or a page total whose scatterlist size
+ * does not fit the int return value), -ENOMEM, -EFAULT.
+ */
+int rds_rdma_extra_size(struct rds_rdma_args *args,
+			struct rds_iov_vector *iov)
+{
+	struct rds_iovec __user *local_vec;
+	int tot_pages;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	if (args->nr_local == 0)
+		return -EINVAL;
+
+	if (args->nr_local > UIO_MAXIOV)
+		return -EMSGSIZE;
+
+	iov->iov = kcalloc(args->nr_local,
+			   sizeof(struct rds_iovec),
+			   GFP_KERNEL);
+	if (!iov->iov)
+		return -ENOMEM;
+
+	if (copy_from_user(iov->iov, local_vec, args->nr_local *
+			   sizeof(struct rds_iovec)))
+		return -EFAULT;
+	iov->len = args->nr_local;
+
+	/* figure out the number of pages in the vector */
+	tot_pages = rds_rdma_pages(iov->iov, args->nr_local);
+	if (tot_pages < 0)
+		return tot_pages;
+
+	/* The byte count is returned as an int; refuse totals that would be
+	 * truncated by that conversion.
+	 */
+	if ((size_t)tot_pages > INT_MAX / sizeof(struct scatterlist))
+		return -EMSGSIZE;
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ *
+ * @rs:   sending socket; must be bound
+ * @rm:   message whose rdma op (rm->rdma) is filled in
+ * @cmsg: control message carrying a struct rds_rdma_args
+ * @vec:  kernel copy of the local iovec array; its length must equal
+ *        args->nr_local
+ *
+ * Every local iovec is pinned and described in the op's scatterlist,
+ * which is taken from @rm's SG pool.  With exactly one local iovec, a
+ * pin attempt that reports -EOPNOTSUPP falls back to an on-demand paging
+ * MR obtained from the transport.
+ *
+ * Returns 0 on success.  On any failure the op is released with
+ * rds_rdma_free_op() and a negative errno is returned.
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+		       struct cmsghdr *cmsg,
+		       struct rds_iov_vector *vec)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
+	unsigned int nr_bytes;
+	struct page **pages = NULL;
+	struct rds_iovec *iovs;
+	unsigned int i, j;
+	int ret = 0;
+	bool odp_supported = true;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	if (ipv6_addr_any(&rs->rs_bound_addr)) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out_ret;
+	}
+
+	if (args->nr_local > UIO_MAXIOV) {
+		ret = -EMSGSIZE;
+		goto out_ret;
+	}
+
+	if (vec->len != args->nr_local) {
+		ret = -EINVAL;
+		goto out_ret;
+	}
+	/* odp-mr is not supported for multiple requests within one message */
+	if (args->nr_local != 1)
+		odp_supported = false;
+
+	iovs = vec->iov;
+
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
+		goto out_ret;
+	}
+
+	/* Scratch array, reused for each iovec in turn. */
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out_ret;
+	}
+
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
+	op->op_odp_mr = NULL;
+
+	WARN_ON(!nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (IS_ERR(op->op_sg)) {
+		ret = PTR_ERR(op->op_sg);
+		goto out_pages;
+	}
+
+	if (op->op_notify || op->op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
+			ret = -ENOMEM;
+			goto out_pages;
+		}
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	/* The cookie contains the R_Key of the remote memory region, and
+	 * optionally an offset into it. This is how we implement RDMA into
+	 * unaligned memory.
+	 * When setting up the RDMA, we need to add that offset to the
+	 * destination address (which is really an offset into the MR)
+	 * FIXME: We may want to move this into ib_rdma.c
+	 */
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+	nr_bytes = 0;
+
+	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+		 (unsigned long long)args->nr_local,
+		 (unsigned long long)args->remote_vec.addr,
+		 op->op_rkey);
+
+	for (i = 0; i < args->nr_local; i++) {
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
+
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
+
+		/* If it's a WRITE operation, we want to pin the pages for reading.
+		 * If it's a READ operation, we need to pin the pages for writing.
+		 */
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+		if ((!odp_supported && ret <= 0) ||
+		    (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
+			goto out_pages;
+
+		/* Pinning is not possible for this range: use an ODP MR. */
+		if (ret == -EOPNOTSUPP) {
+			struct rds_mr *local_odp_mr;
+
+			if (!rs->rs_transport->get_mr) {
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			local_odp_mr =
+				kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
+			if (!local_odp_mr) {
+				ret = -ENOMEM;
+				goto out_pages;
+			}
+			RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
+			kref_init(&local_odp_mr->r_kref);
+			local_odp_mr->r_trans = rs->rs_transport;
+			local_odp_mr->r_sock = rs;
+			local_odp_mr->r_trans_private =
+				rs->rs_transport->get_mr(
+					NULL, 0, rs, &local_odp_mr->r_key, NULL,
+					iov->addr, iov->bytes, ODP_VIRTUAL);
+			if (IS_ERR(local_odp_mr->r_trans_private)) {
+				/* The real error is only logged; the caller
+				 * always sees -EOPNOTSUPP.
+				 */
+				ret = PTR_ERR(local_odp_mr->r_trans_private);
+				rdsdebug("get_mr ret %d %p\n", ret,
+					 local_odp_mr->r_trans_private);
+				kfree(local_odp_mr);
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
+				 local_odp_mr, local_odp_mr->r_trans_private);
+			op->op_odp_mr = local_odp_mr;
+			op->op_odp_addr = iov->addr;
+		}
+
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			 nr_bytes, nr, iov->bytes, iov->addr);
+
+		nr_bytes += iov->bytes;
+
+		/* One SG entry per page; the first may start mid-page and
+		 * the last may end mid-page.
+		 */
+		for (j = 0; j < nr; j++) {
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
+
+			sg = &op->op_sg[op->op_nents + j];
+			sg_set_page(sg, pages[j],
+				    min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+				    offset);
+
+			sg_dma_len(sg) = sg->length;
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+				 sg->offset, sg->length, iov->addr, iov->bytes);
+
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
+		}
+
+		op->op_nents += nr;
+	}
+
+	if (nr_bytes > args->remote_vec.bytes) {
+		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+			 nr_bytes,
+			 (unsigned int) args->remote_vec.bytes);
+		ret = -EINVAL;
+		goto out_pages;
+	}
+	op->op_bytes = nr_bytes;
+	ret = 0;
+
+out_pages:
+	kfree(pages);
+out_ret:
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
+
+	return ret;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ *
+ * @rs:   sending socket owning the MR
+ * @rm:   message that will carry the cookie
+ * @cmsg: control message carrying a rds_rdma_cookie_t
+ *
+ * The cookie is stored in rm->m_rdma_cookie.  If its R_Key names an MR of
+ * @rs, a reference on that MR is taken and kept in rm->rdma.op_rdma_mr.
+ *
+ * Returns 0 on success, or -EINVAL if the control message is too short,
+ * @rm already has a cookie, or the R_Key is unknown.
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+		       struct cmsghdr *cmsg)
+{
+	unsigned long flags;
+	struct rds_mr *mr;
+	u32 r_key;
+	int err = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
+	    rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+	/* We are reusing a previously mapped MR here. Most likely, the
+	 * application has written to the buffer, so we need to explicitly
+	 * flush those writes to RAM. Otherwise the HCA may not see them
+	 * when doing a DMA from that buffer.
+	 */
+	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+	if (!mr)
+		err = -EINVAL;	/* invalid r_key */
+	else
+		kref_get(&mr->r_kref);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (mr) {
+		/* sync_mr is optional for a transport (rds_rdma_unuse()
+		 * checks it as well), so only call it when provided.
+		 */
+		if (mr->r_trans->sync_mr)
+			mr->r_trans->sync_mr(mr->r_trans_private,
+					     DMA_TO_DEVICE);
+		rm->rdma.op_rdma_mr = mr;
+	}
+	return err;
+}
+
+/*
+ * The application hands us an address range it wants to make available
+ * for RDMA. The range is mapped by __rds_rdma_map() and the resulting
+ * <R_Key,offset> pair is stored in rm->m_rdma_cookie, which causes it to
+ * be sent along to the peer in an extension header.
+ *
+ * Returns -EINVAL if the cmsg is too short for a struct rds_get_mr_args or
+ * the message already carries a cookie; otherwise whatever
+ * __rds_rdma_map() returns.
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+		      struct cmsghdr *cmsg)
+{
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)))
+		return -EINVAL;
+	if (rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie,
+			      &rm->rdma.op_rdma_mr, rm->m_conn_path);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ *
+ * Decodes the struct rds_atomic_args carried in @cmsg into rm->atomic, pins
+ * the user page holding the 8-byte local operand and, when a completion
+ * has to be reported (op_notify or op_recverr), pre-allocates the notifier.
+ *
+ * Returns 0 on success. Returns -EINVAL if the cmsg is too short or an
+ * atomic op is already active on @rm, -EFAULT if local_addr is not 8-byte
+ * aligned, -ENOMEM if the notifier cannot be allocated, or the error
+ * produced by rds_message_alloc_sgs() / rds_pin_pages(). On failure the
+ * pinned page (if any) is released, op_active is cleared and the notifier
+ * is freed and its pointer reset.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	 || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (IS_ERR(rm->atomic.op_sg)) {
+		ret = PTR_ERR(rm->atomic.op_sg);
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	/* Exactly one page must be pinned; anything else is a failure. */
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		unpin_user_page(page);
+	rm->atomic.op_active = 0;
+	kfree(rm->atomic.op_notifier);
+	/* Do not leave a pointer to freed memory behind in the message. */
+	rm->atomic.op_notifier = NULL;
+
+	return ret;
+}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000..b15cf316b
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds_single_path.h"
+#include "rdma_transport.h"
+#include "ib.h"
+
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id.
+ * Both are filled in by rds_rdma_listen_init() and are destroyed and reset
+ * to NULL by rds_rdma_listen_stop().
+ */
+static struct rdma_cm_id *rds_rdma_listen_id;
+#if IS_ENABLED(CONFIG_IPV6)
+static struct rdma_cm_id *rds6_rdma_listen_id;
+#endif
+
+/* Per IB specification 7.7.3, service level is a 4-bit field.
+ * TOS_TO_SL() therefore keeps only the low four bits of the TOS value.
+ */
+#define TOS_TO_SL(tos) ((tos) & 0xF)
+
+/*
+ * Common RDMA CM event handler behind the IPv4 and IPv6 entry points.
+ *
+ * @cm_id: the cm_id the event was raised on; cm_id->context holds the
+ *         rds_connection, or NULL in the listening path.
+ * @event: the RDMA CM event to process.
+ * @isv6:  passed through to the transport's connect hooks.
+ *
+ * While a connection is attached, its c_cm_lock is held for the duration
+ * of the handler so that shutdown cannot tear the connection down
+ * underneath us.
+ *
+ * Returns 0 in the normal case. A non-zero value is returned when a
+ * connect request is refused (connection being torn down, or no transport
+ * for the device) or when a transport hook / rdma_resolve_route() fails.
+ */
+static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+					 struct rdma_cm_event *event,
+					 bool isv6)
+{
+	/* this can be null in the listening path */
+	struct rds_connection *conn = cm_id->context;
+	/* Stays NULL unless the device is an IB CA; checked before use. */
+	struct rds_transport *trans = NULL;
+	int ret = 0;
+	int *err;
+	u8 len;
+
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rdma_event_msg(event->event));
+
+	if (cm_id->device->node_type == RDMA_NODE_IB_CA)
+		trans = &rds_ib_transport;
+
+	/* Prevent shutdown from tearing down the connection
+	 * while we're executing. */
+	if (conn) {
+		mutex_lock(&conn->c_cm_lock);
+
+		/* If the connection is being shut down, bail out
+		 * right away. We return 0 so cm_id doesn't get
+		 * destroyed prematurely */
+		if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+			/* Reject incoming connections while we're tearing
+			 * down an existing one. */
+			if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+				ret = 1;
+			goto out;
+		}
+	}
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		/* Without a transport for this device the request is
+		 * refused the same way as during teardown above.
+		 */
+		if (trans)
+			ret = trans->cm_handle_connect(cm_id, event, isv6);
+		else
+			ret = 1;
+		break;
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		if (conn) {
+			rdma_set_service_type(cm_id, conn->c_tos);
+			rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
+			/* XXX do we need to clean up if this fails? */
+			ret = rdma_resolve_route(cm_id,
+						 RDS_RDMA_RESOLVE_TIMEOUT_MS);
+		}
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		/* Connection could have been dropped so make sure the
+		 * cm_id is valid before proceeding
+		 */
+		if (conn) {
+			struct rds_ib_connection *ibic;
+
+			ibic = conn->c_transport_data;
+			if (trans && ibic && ibic->i_cm_id == cm_id) {
+				cm_id->route.path_rec[0].sl =
+					TOS_TO_SL(conn->c_tos);
+				ret = trans->cm_initiate_connect(cm_id, isv6);
+			} else {
+				rds_conn_drop(conn);
+			}
+		}
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		if (!conn)
+			break;
+		if (trans)
+			trans->cm_connect_complete(conn, event);
+		else
+			rds_conn_drop(conn);
+		break;
+
+	case RDMA_CM_EVENT_REJECTED:
+		if (!conn)
+			break;
+		err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
+		/* Drop when no reject data was supplied, or when it carries
+		 * a reason code no greater than RDS_RDMA_REJ_INCOMPAT.
+		 */
+		if (!err ||
+		    (len >= sizeof(*err) &&
+		     ((*err) <= RDS_RDMA_REJ_INCOMPAT))) {
+			pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
+				&conn->c_laddr, &conn->c_faddr);
+
+			if (!conn->c_tos)
+				conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
+
+			rds_conn_drop(conn);
+		}
+		rdsdebug("Connection rejected: %s\n",
+			 rdma_reject_msg(cm_id, event->status));
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		if (conn)
+			rds_conn_drop(conn);
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		if (!conn)
+			break;
+		rdsdebug("DISCONNECT event - dropping connection "
+			 "%pI6c->%pI6c\n", &conn->c_laddr,
+			 &conn->c_faddr);
+		rds_conn_drop(conn);
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		if (conn) {
+			pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
+				&conn->c_laddr, &conn->c_faddr);
+			rds_conn_drop(conn);
+		}
+		break;
+
+	default:
+		/* things like device disconnect? */
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rdma_event_msg(event->event));
+		break;
+	}
+
+out:
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rdma_event_msg(event->event), ret);
+
+	return ret;
+}
+
+/* RDMA CM callback for the IPv4 listener: common handler with isv6 off. */
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	const bool isv6 = false;
+
+	return rds_rdma_cm_event_handler_cmn(cm_id, event, isv6);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* RDMA CM callback for the IPv6 listener: common handler with isv6 on. */
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			       struct rdma_cm_event *event)
+{
+	const bool isv6 = true;
+
+	return rds_rdma_cm_event_handler_cmn(cm_id, event, isv6);
+}
+#endif
+
+/*
+ * Create an RDMA CM id, bind it to @sa and start listening on it.
+ *
+ * On success the new id is stored in *@ret_cm_id and 0 is returned. On
+ * failure the id (if it was created) is destroyed, *@ret_cm_id is left
+ * untouched and the error from the failing rdma_*() call is returned.
+ */
+static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
+				       struct sockaddr *sa,
+				       struct rdma_cm_id **ret_cm_id)
+{
+	struct rdma_cm_id *listener;
+	int rc;
+
+	listener = rdma_create_id(&init_net, handler, NULL,
+				  RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(listener)) {
+		rc = PTR_ERR(listener);
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_create_id() returned %d\n", rc);
+		return rc;
+	}
+
+	/*
+	 * XXX I bet this binds the cm_id to a device. If we want to support
+	 * fail-over we'll have to take this into consideration.
+	 */
+	rc = rdma_bind_addr(listener, sa);
+	if (rc) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_bind_addr() returned %d\n", rc);
+		goto err_destroy;
+	}
+
+	rc = rdma_listen(listener, 128);
+	if (rc) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_listen() returned %d\n", rc);
+		goto err_destroy;
+	}
+
+	/* NOTE(review): this always prints RDS_PORT, including for the
+	 * listener that rds_rdma_listen_init() binds to RDS_CM_PORT.
+	 */
+	rdsdebug("cm %p listening on port %u\n", listener, RDS_PORT);
+
+	*ret_cm_id = listener;
+	return rc;
+
+err_destroy:
+	rdma_destroy_id(listener);
+	return rc;
+}
+
+/* Initialize the RDS RDMA listeners. We create two listeners for
+ * compatibility reasons. The one on RDS_PORT is used for IPv4
+ * requests only. The one on RDS_CM_PORT is used for IPv6 requests
+ * only. So only an IPv6 enabled RDS module will communicate using this
+ * port.
+ *
+ * Returns 0 on success or the error from setting up the IPv4 listener.
+ * A failure to set up the IPv6 listener is only logged.
+ */
+static int rds_rdma_listen_init(void)
+{
+	int ret;
+	/* Designated initialisers zero every member that is not named,
+	 * so no uninitialised stack bytes are handed to rdma_bind_addr().
+	 */
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 sin6 = {
+		.sin6_family = AF_INET6,
+		.sin6_port = htons(RDS_CM_PORT),
+		.sin6_addr = in6addr_any,
+	};
+#endif
+	struct sockaddr_in sin = {
+		.sin_family = AF_INET,
+		.sin_port = htons(RDS_PORT),
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+
+	ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
+					  (struct sockaddr *)&sin,
+					  &rds_rdma_listen_id);
+	if (ret != 0)
+		return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+					  (struct sockaddr *)&sin6,
+					  &rds6_rdma_listen_id);
+	/* Keep going even when IPv6 is not enabled in the system. */
+	if (ret != 0)
+		rdsdebug("Cannot set up IPv6 RDMA listener\n");
+#endif
+	return 0;
+}
+
+/* Destroy whichever RDMA listeners exist and reset their global pointers. */
+static void rds_rdma_listen_stop(void)
+{
+	struct rdma_cm_id *id;
+
+	id = rds_rdma_listen_id;
+	if (id) {
+		rdsdebug("cm %p\n", id);
+		rdma_destroy_id(id);
+		rds_rdma_listen_id = NULL;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	id = rds6_rdma_listen_id;
+	if (id) {
+		rdsdebug("cm %p\n", id);
+		rdma_destroy_id(id);
+		rds6_rdma_listen_id = NULL;
+	}
+#endif
+}
+
+/*
+ * Module init: bring up the IB transport, then start the RDMA listeners.
+ * If the listeners cannot be started the IB transport is shut down again.
+ */
+static int __init rds_rdma_init(void)
+{
+	int rc = rds_ib_init();
+
+	if (rc)
+		return rc;
+
+	rc = rds_rdma_listen_init();
+	if (rc)
+		rds_ib_exit();
+
+	return rc;
+}
+module_init(rds_rdma_init);
+
+static void __exit rds_rdma_exit(void)
+{
+ /* stop listening first to ensure no new connections are attempted,
+ * then shut the IB transport down; this is rds_rdma_init() in
+ * reverse order.
+ */
+ rds_rdma_listen_stop();
+ rds_ib_exit();
+}
+module_exit(rds_rdma_exit);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: IB transport");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000..ca4c3a667
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+/* RDMA_CM also uses 16385 as the listener port.
+ * rdma_transport.c binds its IPv6 listener to this port.
+ */
+#define RDS_CM_PORT 16385
+
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 /* timeout passed to rdma_resolve_route() */
+
+/* Below reject reason is for legacy interoperability issue with non-linux
+ * RDS endpoints where older version incompatibility is conveyed via value 1.
+ * For future version(s), proper encoded reject reason should be used.
+ * The CM event handler drops the connection when the peer's reject code is
+ * less than or equal to this value.
+ */
+#define RDS_RDMA_REJ_INCOMPAT 1
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000..d35d1fc39
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,1019 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+#include <linux/rhashtable.h>
+#include <linux/refcount.h>
+#include <linux/in6.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ *
+ * A version is encoded as (major << 8) | minor. RDS_PROTOCOL() builds such
+ * a value; RDS_PROTOCOL_MAJOR() and RDS_PROTOCOL_MINOR() take it apart.
+ */
+#define RDS_PROTOCOL_3_0 0x0300
+#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_4_0 0x0400
+#define RDS_PROTOCOL_4_1 0x0401
+#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
+/* Both arguments are parenthesised so expression arguments expand safely. */
+#define RDS_PROTOCOL(maj, min) (((maj) << 8) | (min))
+#define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1
+
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
+ * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value
+ * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
+ * to ensure compatibility with older RDS modules. Those ports are defined
+ * in each transport's header file.
+ */
+#define RDS_PORT 18634
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+#ifdef RDS_DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* sigh, pr_debug() causes unused variable warnings.
+ * Without RDS_DEBUG this is an empty, format-checked stub, so the
+ * arguments still count as used and the format string is still verified.
+ */
+static inline __printf(1, 2)
+void rdsdebug(const char *fmt, ...)
+{
+}
+#endif
+
+#define RDS_FRAG_SHIFT 12
+#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) /* 4096 bytes */
+
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20))
+
+#define RDS_CONG_MAP_BYTES (65536 / 8) /* 65536 bits, expressed in bytes */
+#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) /* whole pages needed for the map */
+#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) /* bits held by one page */
+
+struct rds_cong_map { /* a congestion bitmap spread over RDS_CONG_MAP_PAGES pages */
+ struct rb_node m_rb_node;
+ struct in6_addr m_addr; /* presumably the address this map belongs to — confirm */
+ wait_queue_head_t m_waitq;
+ struct list_head m_conn_list;
+ unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; /* one entry per map page */
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+ RDS_CONN_DOWN = 0,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DISCONNECTING,
+ RDS_CONN_UP,
+ RDS_CONN_RESETTING,
+ RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags.
+ * NOTE(review): the flags word declared in this header is
+ * rds_conn_path.cp_flags; c_flags is presumably an alias for it — confirm.
+ */
+#define RDS_LL_SEND_FULL 0
+#define RDS_RECONNECT_PENDING 1
+#define RDS_IN_XMIT 2
+#define RDS_RECV_REFILL 3
+#define RDS_DESTROY_PENDING 4
+
+/* Max number of multipaths per RDS connection. Must be a power of 2,
+ * because RDS_MPATH_HASH() reduces the hash with a mask of (n - 1).
+ * The hash is computed from the socket's bound port and rs_hash_initval.
+ */
+#define RDS_MPATH_WORKERS 8
+#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
+ (rs)->rs_hash_initval) & ((n) - 1))
+
+#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
+
+/* Per mpath connection state.
+ * struct rds_connection refers to its paths through c_path; each path
+ * carries its own transmit cursor, queues, work items and state.
+ */
+struct rds_conn_path {
+ struct rds_connection *cp_conn; /* presumably the owning connection — confirm */
+ struct rds_message *cp_xmit_rm;
+ unsigned long cp_xmit_sg;
+ unsigned int cp_xmit_hdr_off;
+ unsigned int cp_xmit_data_off;
+ unsigned int cp_xmit_atomic_sent;
+ unsigned int cp_xmit_rdma_sent;
+ unsigned int cp_xmit_data_sent;
+
+ spinlock_t cp_lock; /* protect msg queues */
+ u64 cp_next_tx_seq;
+ struct list_head cp_send_queue;
+ struct list_head cp_retrans;
+
+ u64 cp_next_rx_seq;
+
+ void *cp_transport_data; /* opaque to this header; type is transport-defined */
+
+ atomic_t cp_state; /* presumably one of the RDS_CONN_* states — confirm */
+ unsigned long cp_send_gen;
+ unsigned long cp_flags;
+ unsigned long cp_reconnect_jiffies;
+ struct delayed_work cp_send_w;
+ struct delayed_work cp_recv_w;
+ struct delayed_work cp_conn_w;
+ struct work_struct cp_down_w;
+ struct mutex cp_cm_lock; /* protect cp_state & cm */
+ wait_queue_head_t cp_waitq;
+
+ unsigned int cp_unacked_packets;
+ unsigned int cp_unacked_bytes;
+ unsigned int cp_index;
+};
+
+/* One rds_connection per RDS address pair.
+ * The pair is c_laddr/c_faddr, both held as struct in6_addr.
+ */
+struct rds_connection {
+ struct hlist_node c_hash_node;
+ struct in6_addr c_laddr;
+ struct in6_addr c_faddr;
+ int c_dev_if; /* ifindex used for this conn */
+ int c_bound_if; /* ifindex of c_laddr */
+ unsigned int c_loopback:1,
+ c_isv6:1,
+ c_ping_triggered:1,
+ c_pad_to_32:29; /* the four bit-fields add up to 32 bits */
+ int c_npaths;
+ struct rds_connection *c_passive;
+ struct rds_transport *c_trans;
+
+ struct rds_cong_map *c_lcong;
+ struct rds_cong_map *c_fcong;
+
+ /* Protocol version. The RDMA CM handler resets c_proposed_version to
+ * RDS_PROTOCOL_COMPAT_VERSION when a connect is rejected and c_tos is 0.
+ */
+ unsigned int c_proposed_version;
+ unsigned int c_version;
+ possible_net_t c_net; /* read and written via rds_conn_net() / rds_conn_net_set() */
+
+ /* TOS. The RDMA CM handler passes it to rdma_set_service_type() and
+ * maps it to an IB service level with TOS_TO_SL().
+ */
+ u8 c_tos;
+
+ struct list_head c_map_item;
+ unsigned long c_map_queued;
+
+ struct rds_conn_path *c_path;
+ wait_queue_head_t c_hs_waitq; /* handshake waitq */
+
+ u32 c_my_gen_num;
+ u32 c_peer_gen_num;
+};
+
+/* Return the network namespace recorded in conn->c_net. */
+static inline struct net *rds_conn_net(struct rds_connection *conn)
+{
+	struct net *ns = read_pnet(&conn->c_net);
+
+	return ns;
+}
+
+/* Record @net as the network namespace of @conn (stored in conn->c_net). */
+static inline void rds_conn_net_set(struct rds_connection *conn,
+				    struct net *net)
+{
+	possible_net_t *slot = &conn->c_net;
+
+	write_pnet(slot, net);
+}
+
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_MAX_ADV_CREDIT 255
+
+/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
+ * probe to exchange control information before establishing a connection.
+ * Currently the control information that is exchanged is the number of
+ * supported paths. If the peer is a legacy (older kernel revision) peer,
+ * it would return a pong message without additional control information
+ * that would then alert the sender that the peer was an older rev.
+ *
+ * RDS_HS_PROBE() is true when exactly one of the two ports is
+ * RDS_FLAG_PROBE_PORT and the other one is 0. The arguments are
+ * parenthesised so that expression arguments expand safely.
+ */
+#define RDS_FLAG_PROBE_PORT 1
+#define RDS_HS_PROBE(sport, dport) \
+ (((sport) == RDS_FLAG_PROBE_PORT && (dport) == 0) || \
+ ((sport) == 0 && (dport) == RDS_FLAG_PROBE_PORT))
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE 16
+
+struct rds_header {
+ __be64 h_sequence;
+ __be64 h_ack;
+ __be32 h_len;
+ __be16 h_sport;
+ __be16 h_dport;
+ u8 h_flags;
+ u8 h_credit;
+ u8 h_padding[4];
+ __sum16 h_csum;
+
+ u8 h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE 0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION 1
+struct rds_ext_header_version {
+ __be32 h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA 2
+struct rds_ext_header_rdma {
+ __be32 h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST 3
+struct rds_ext_header_rdma_dest {
+ __be32 h_rdma_rkey;
+ __be32 h_rdma_offset;
+};
+
+/* Extension header announcing number of paths.
+ * Implicit length = 2 bytes.
+ */
+#define RDS_EXTHDR_NPATHS 5
+#define RDS_EXTHDR_GEN_NUM 6
+
+#define __RDS_EXTHDR_MAX 16 /* for now */
+#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define RDS_MSG_RX_HDR 0
+#define RDS_MSG_RX_START 1
+#define RDS_MSG_RX_END 2
+#define RDS_MSG_RX_CMSG 3
+
+/* The following values are whitelisted for usercopy */
+struct rds_inc_usercopy {
+ rds_rdma_cookie_t rdma_cookie;
+ ktime_t rx_tstamp;
+};
+
+struct rds_incoming {
+ refcount_t i_refcount;
+ struct list_head i_item;
+ struct rds_connection *i_conn;
+ struct rds_conn_path *i_conn_path;
+ struct rds_header i_hdr;
+ unsigned long i_rx_jiffies;
+ struct in6_addr i_saddr;
+
+ struct rds_inc_usercopy i_usercopy;
+ u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
+};
+
+struct rds_mr { /* an RDMA memory region (MR) known to a socket */
+ struct rb_node r_rb_node; /* presumably links the MR into rs_rdma_keys — confirm */
+ struct kref r_kref; /* reference count; rds_cmsg_rdma_dest() takes one with kref_get() */
+ u32 r_key; /* key that rds_mr_tree_walk() is searched with */
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans; /* transport whose sync_mr() hook is called on this MR */
+ void *r_trans_private; /* value returned by the transport's get_mr(); handed back to sync_mr() */
+};
+
+/* Pack @r_key into the low 32 bits and @offset into the high 32 bits. */
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	const u64 high = ((u64) offset) << 32;
+
+	return high | r_key;
+}
+
+/* Extract the R_Key: the cookie truncated to its low 32 bits. */
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	u32 r_key = cookie;
+
+	return r_key;
+}
+
+/* Extract the offset: the cookie shifted down by 32 bits. */
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	u32 offset = cookie >> 32;
+
+	return offset;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP 0
+#define RDS_ATOMIC_TYPE_FADD 1
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock. m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path. It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue. m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting. As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly. That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate. They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK 1
+#define RDS_MSG_ON_CONN 2
+#define RDS_MSG_HAS_ACK_SEQ 3
+#define RDS_MSG_ACK_REQUIRED 4
+#define RDS_MSG_RETRANSMITTED 5
+#define RDS_MSG_MAPPED 6
+#define RDS_MSG_PAGEVEC 7
+#define RDS_MSG_FLUSH 8
+
+struct rds_znotifier {
+ struct mmpin z_mmp;
+ u32 z_cookie;
+};
+
+struct rds_msg_zcopy_info {
+ struct list_head rs_zcookie_next;
+ union {
+ struct rds_znotifier znotif;
+ struct rds_zcopy_cookies zcookies;
+ };
+};
+
+struct rds_msg_zcopy_queue {
+ struct list_head zcookie_head;
+ spinlock_t lock; /* protects zcookie_head queue */
+};
+
+/* Prepare @q for use: an empty cookie list and an initialised lock. */
+static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
+{
+	INIT_LIST_HEAD(&q->zcookie_head);
+	spin_lock_init(&q->lock);
+}
+
+struct rds_iov_vector {
+ struct rds_iovec *iov;
+ int len;
+};
+
+struct rds_iov_vector_arr {
+ struct rds_iov_vector *vec;
+ int len;
+ int indx;
+ int incr;
+};
+
+struct rds_message {
+ refcount_t m_refcount;
+ struct list_head m_sock_item;
+ struct list_head m_conn_item;
+ struct rds_incoming m_inc;
+ u64 m_ack_seq;
+ struct in6_addr m_daddr;
+ unsigned long m_flags; /* presumably holds the RDS_MSG_* bits — confirm */
+
+ /* Never access m_rs without holding m_rs_lock.
+ * Lock nesting is
+ * rm->m_rs_lock
+ * -> rs->rs_lock
+ */
+ spinlock_t m_rs_lock;
+ wait_queue_head_t m_flush_wait;
+
+ struct rds_sock *m_rs;
+
+ /* cookie to send to remote, in rds header.
+ * rds_cmsg_rdma_dest() copies it from the cmsg payload and
+ * rds_cmsg_rdma_map() lets __rds_rdma_map() fill it in; both refuse
+ * to run when it is already non-zero.
+ */
+ rds_rdma_cookie_t m_rdma_cookie;
+
+ unsigned int m_used_sgs;
+ unsigned int m_total_sgs;
+
+ void *m_final_op;
+
+ struct {
+ struct rm_atomic_op {
+ int op_type; /* RDS_ATOMIC_TYPE_CSWP or RDS_ATOMIC_TYPE_FADD */
+ union {
+ struct {
+ uint64_t compare;
+ uint64_t swap;
+ uint64_t compare_mask;
+ uint64_t swap_mask;
+ } op_m_cswp;
+ struct {
+ uint64_t add;
+ uint64_t nocarry_mask;
+ } op_m_fadd;
+ };
+
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1; /* copied from the socket's rs_recverr by rds_cmsg_atomic() */
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1; /* set by rds_cmsg_atomic(), cleared again on its error path */
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier; /* allocated by rds_cmsg_atomic() when op_notify or op_recverr is set */
+
+ struct rds_mr *op_rdma_mr;
+ } atomic;
+ struct rm_rdma_op {
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_write:1;
+ unsigned int op_fence:1;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1;
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1;
+ unsigned int op_bytes;
+ unsigned int op_nents;
+ unsigned int op_count;
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier;
+
+ struct rds_mr *op_rdma_mr; /* set by rds_cmsg_rdma_dest(); passed to __rds_rdma_map() by rds_cmsg_rdma_map() */
+
+ u64 op_odp_addr;
+ struct rds_mr *op_odp_mr;
+ } rdma;
+ struct rm_data_op {
+ unsigned int op_active:1;
+ unsigned int op_nents;
+ unsigned int op_count;
+ unsigned int op_dmasg;
+ unsigned int op_dmaoff;
+ struct rds_znotifier *op_mmp_znotifier;
+ struct scatterlist *op_sg;
+ } data;
+ };
+
+ struct rds_conn_path *m_conn_path;
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ *
+ * For atomic operations rds_cmsg_atomic() allocates the notifier up front,
+ * stores the caller's user_token in n_user_token and presets n_status to
+ * RDS_RDMA_SUCCESS.
+ */
+struct rds_notifier {
+ struct list_head n_list;
+ uint64_t n_user_token;
+ int n_status;
+};
+
+/* Available as part of RDS core, so doesn't need to participate
+ * in get_preferred transport etc
+ */
+#define RDS_TRANS_LOOP 3
+
+/**
+ * struct rds_transport - transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ *
+ * @conn_path_shutdown: conn_path_shutdown stops traffic on the given
+ * connection path. Once it returns the connection can not call
+ * rds_recv_incoming(). This will only be called once after
+ * conn_path_connect returns non-zero success. The caller serializes
+ * this with the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rds_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ */
+
+struct rds_transport {
+ char t_name[TRANSNAMSIZ];
+ struct list_head t_item;
+ struct module *t_owner;
+ unsigned int t_prefer_loopback:1,
+ t_mp_capable:1;
+ unsigned int t_type;
+
+ int (*laddr_check)(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id);
+ int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+ void (*conn_free)(void *data);
+ int (*conn_path_connect)(struct rds_conn_path *cp);
+ void (*conn_path_shutdown)(struct rds_conn_path *conn);
+ void (*xmit_path_prepare)(struct rds_conn_path *cp);
+ void (*xmit_path_complete)(struct rds_conn_path *cp);
+ int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+ int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+ int (*recv_path)(struct rds_conn_path *cp);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
+ void (*inc_free)(struct rds_incoming *inc);
+
+ /* The three cm_* hooks are called from the RDMA CM event handler in
+ * rdma_transport.c for the CONNECT_REQUEST, ROUTE_RESOLVED and
+ * ESTABLISHED events respectively.
+ */
+ int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event, bool isv6);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
+ void (*cm_connect_complete)(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+ unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+ unsigned int avail);
+ void (*exit)(void);
+ void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+ struct rds_sock *rs, u32 *key_ret,
+ struct rds_connection *conn,
+ u64 start, u64 length, int need_odp); /* callers test the result with IS_ERR() and keep it in rds_mr.r_trans_private */
+ void (*sync_mr)(void *trans_private, int direction); /* rds_cmsg_rdma_dest() calls this with DMA_TO_DEVICE */
+ void (*free_mr)(void *trans_private, int invalidate);
+ void (*flush_mrs)(void);
+ bool (*t_unloading)(struct rds_connection *conn);
+ u8 (*get_tos_map)(u8 tos);
+};
+
+/* Bind hash table key length. It is the sum of the size of a struct
+ * in6_addr, a scope_id and a port.
+ */
+#define RDS_BOUND_KEY_LEN \
+ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
+
+struct rds_sock {
+ struct sock rs_sk; /* embedded sock; rds_sk_to_rs() / rds_rs_to_sk() convert between the two */
+
+ u64 rs_user_addr;
+ u64 rs_user_bytes;
+
+ /*
+ * bound_addr used for both incoming and outgoing, no INADDR_ANY
+ * support.
+ */
+ struct rhash_head rs_bound_node;
+ u8 rs_bound_key[RDS_BOUND_KEY_LEN];
+ struct sockaddr_in6 rs_bound_sin6;
+#define rs_bound_addr rs_bound_sin6.sin6_addr
+#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3]
+#define rs_bound_port rs_bound_sin6.sin6_port
+#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id
+ struct in6_addr rs_conn_addr;
+#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
+ __be16 rs_conn_port;
+ struct rds_transport *rs_transport; /* transport hooks, e.g. get_mr() is called through this */
+
+ /*
+ * rds_sendmsg caches the conn it used the last time around.
+ * This helps avoid costly lookups.
+ */
+ struct rds_connection *rs_conn;
+
+ /* flag indicating we were congested or not */
+ int rs_congested;
+ /* seen congestion (ENOBUFS) when sending? */
+ int rs_seen_congestion;
+
+ /* rs_lock protects all these adjacent members before the newline */
+ spinlock_t rs_lock;
+ struct list_head rs_send_queue;
+ u32 rs_snd_bytes;
+ int rs_rcv_bytes;
+ struct list_head rs_notify_queue; /* currently used for failed RDMAs */
+
+ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+ * to decide whether the application should be woken up.
+ * If not set, we use rs_cong_track to find out whether a cong map
+ * update arrived.
+ */
+ uint64_t rs_cong_mask;
+ uint64_t rs_cong_notify;
+ struct list_head rs_cong_list;
+ unsigned long rs_cong_track;
+
+ /*
+ * rs_recv_lock protects the receive queue, and is
+ * used to serialize with rds_release.
+ */
+ rwlock_t rs_recv_lock;
+ struct list_head rs_recv_queue;
+
+ /* just for stats reporting */
+ struct list_head rs_item;
+
+ /* these have their own lock: rds_cmsg_rdma_dest() holds rs_rdma_lock
+ * (irqsave) while it searches rs_rdma_keys for an MR by r_key.
+ */
+ spinlock_t rs_rdma_lock;
+ struct rb_root rs_rdma_keys;
+
+ /* Socket options - in case there will be more */
+ unsigned char rs_recverr,
+ rs_cong_monitor;
+ u32 rs_hash_initval; /* second argument of the hash in RDS_MPATH_HASH() */
+
+ /* Socket receive path trace points */
+ u8 rs_rx_traces;
+ u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+ struct rds_msg_zcopy_queue rs_zcookie_queue;
+ u8 rs_tos;
+};
+
+/* Map a generic socket back to the RDS socket that embeds it as rs_sk. */
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+	struct rds_sock *owner = container_of(sk, struct rds_sock, rs_sk);
+
+	return owner;
+}
+/* Return the generic socket embedded in an RDS socket. */
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+	struct sock *embedded = &rs->rs_sk;
+
+	return embedded;
+}
+
+/*
+ * The stack stores twice the requested value in sk_sndbuf and sk_rcvbuf to
+ * leave room for overhead. RDS does not account for overhead, so these
+ * helpers halve the stored value to get back the payload byte budget.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+
+	return sk->sk_sndbuf / 2;
+}
+/* Receive-side payload budget: half of the socket's sk_rcvbuf. */
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+
+	return sk->sk_rcvbuf / 2;
+}
+
+/*
+ * Event counters for the RDS core. A per-CPU instance named rds_stats is
+ * declared with DECLARE_PER_CPU_SHARED_ALIGNED() further down in this header
+ * and is updated through rds_stats_inc() / rds_stats_add().
+ */
+struct rds_statistics {
+ uint64_t s_conn_reset;
+ uint64_t s_recv_drop_bad_checksum;
+ uint64_t s_recv_drop_old_seq;
+ uint64_t s_recv_drop_no_sock;
+ uint64_t s_recv_drop_dead_sock;
+ uint64_t s_recv_deliver_raced;
+ uint64_t s_recv_delivered;
+ uint64_t s_recv_queued;
+ uint64_t s_recv_immediate_retry;
+ uint64_t s_recv_delayed_retry;
+ uint64_t s_recv_ack_required;
+ uint64_t s_recv_rdma_bytes;
+ uint64_t s_recv_ping;
+ uint64_t s_send_queue_empty;
+ uint64_t s_send_queue_full;
+ uint64_t s_send_lock_contention;
+ uint64_t s_send_lock_queue_raced;
+ uint64_t s_send_immediate_retry;
+ uint64_t s_send_delayed_retry;
+ uint64_t s_send_drop_acked;
+ uint64_t s_send_ack_required;
+ uint64_t s_send_queued;
+ uint64_t s_send_rdma;
+ uint64_t s_send_rdma_bytes;
+ uint64_t s_send_pong;
+ uint64_t s_page_remainder_hit;
+ uint64_t s_page_remainder_miss;
+ uint64_t s_copy_to_user;
+ uint64_t s_copy_from_user;
+ uint64_t s_cong_update_queued;
+ uint64_t s_cong_update_received;
+ uint64_t s_cong_send_error;
+ uint64_t s_cong_send_blocked;
+ uint64_t s_recv_bytes_added_to_socket;
+ uint64_t s_recv_bytes_removed_from_socket;
+ uint64_t s_send_stuck_rm;
+};
+
+/* af_rds.c */
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+/*
+ * Wake whoever sleeps on @sk's wait queue, unless the socket is already
+ * marked SOCK_DEAD or has no wait queue.
+ */
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	if (sock_flag(sk, SOCK_DEAD))
+		return;
+	if (!wq)
+		return;
+
+	wake_up(wq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id);
+int rds_bind_lock_init(void);
+void rds_bind_lock_destroy(void);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* connection.c */
+extern u32 rds_gen_num;
+int rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(struct net *net,
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ u8 tos, gfp_t gfp,
+ int dev_if);
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ u8 tos, gfp_t gfp, int dev_if);
+void rds_conn_shutdown(struct rds_conn_path *cpath);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy);
+void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
+void rds_check_all_paths(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ u64 *buffer,
+ size_t item_len);
+
+/*
+ * rds_conn_path_error() forwards to __rds_conn_path_error() with
+ * KERN_WARNING "RDS: " pasted in front of the caller's format string by
+ * string-literal concatenation, so the format passed to the macro must be
+ * a string literal. __printf(2, 3) marks argument 2 as the printf-style
+ * format and the variadic arguments as starting at position 3.
+ */
+__printf(2, 3)
+void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
+#define rds_conn_path_error(cp, fmt...) \
+ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
+
+/*
+ * Atomically switch @cp from state @old to state @new. Returns non-zero
+ * only when the path was in @old, i.e. when the exchange took place.
+ */
+static inline int
+rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
+{
+	int seen = atomic_cmpxchg(&cp->cp_state, old, new);
+
+	return seen == old;
+}
+
+/*
+ * Single-path form of rds_conn_path_transition(): operates on path 0 and
+ * warns if the transport is multipath capable.
+ */
+static inline int
+rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+	struct rds_conn_path *path0 = &conn->c_path[0];
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+	return rds_conn_path_transition(path0, old, new);
+}
+
+/* Snapshot of the path's current connection state. */
+static inline int
+rds_conn_path_state(struct rds_conn_path *cp)
+{
+	int state = atomic_read(&cp->cp_state);
+
+	return state;
+}
+
+/*
+ * Single-path form of rds_conn_path_state(): reads path 0 and warns if the
+ * transport is multipath capable.
+ */
+static inline int
+rds_conn_state(struct rds_connection *conn)
+{
+	struct rds_conn_path *path0 = &conn->c_path[0];
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+	return rds_conn_path_state(path0);
+}
+
+/* Non-zero when the path is in RDS_CONN_UP. */
+static inline int
+rds_conn_path_up(struct rds_conn_path *cp)
+{
+	return rds_conn_path_state(cp) == RDS_CONN_UP;
+}
+
+/* Non-zero when the path is in RDS_CONN_DOWN. */
+static inline int
+rds_conn_path_down(struct rds_conn_path *cp)
+{
+	return rds_conn_path_state(cp) == RDS_CONN_DOWN;
+}
+
+/*
+ * Single-path form of rds_conn_path_up(): checks path 0 and warns if the
+ * transport is multipath capable.
+ */
+static inline int
+rds_conn_up(struct rds_connection *conn)
+{
+	struct rds_conn_path *path0 = &conn->c_path[0];
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+	return rds_conn_path_up(path0);
+}
+
+/* Non-zero when the path is in RDS_CONN_CONNECTING. */
+static inline int
+rds_conn_path_connecting(struct rds_conn_path *cp)
+{
+	return rds_conn_path_state(cp) == RDS_CONN_CONNECTING;
+}
+
+/*
+ * Single-path form of rds_conn_path_connecting(): checks path 0 and warns
+ * if the transport is multipath capable.
+ */
+static inline int
+rds_conn_connecting(struct rds_connection *conn)
+{
+	struct rds_conn_path *path0 = &conn->c_path[0];
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+	return rds_conn_path_connecting(path0);
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+ bool zcopy);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info);
+
+/*
+ * Stamp @hdr with its checksum. h_csum is cleared first so that the sum is
+ * computed over a header whose checksum field is zero.
+ */
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+	/* header length expressed in 32-bit words */
+	const unsigned int hdr_words = sizeof(*hdr) >> 2;
+
+	hdr->h_csum = 0;
+	hdr->h_csum = ip_fast_csum((void *) hdr, hdr_words);
+}
+
+/*
+ * Returns non-zero when @hdr passes the checksum test. A header whose
+ * h_csum is zero is accepted without being summed.
+ */
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
+	if (!hdr->h_csum)
+		return 1;
+
+	return ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp);
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ struct in6_addr *saddr);
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
+ struct in6_addr *saddr);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ struct rds_incoming *inc, gfp_t gfp);
+int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip);
+void rds6_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ int flip);
+
+/* send.c */
+int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
+void rds_send_path_reset(struct rds_conn_path *conn);
+int rds_send_xmit(struct rds_conn_path *cp);
+/* forward declaration matching the type rds_send_drop_to() actually takes */
+struct sockaddr_in6;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
+/* callback deciding whether message @rm is covered by acknowledgement @ack */
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked);
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+			      is_acked_func is_acked);
+void rds_send_ping(struct rds_connection *conn, int cp_index);
+int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args,
+ struct rds_iov_vector *iov);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg,
+ struct rds_iov_vector *vec);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+
+void __rds_put_mr_final(struct kref *kref);
+
+/*
+ * True when @conn is on its way out: either check_net() fails for the
+ * connection's network namespace, or the transport provides a t_unloading
+ * hook and that hook returns true for this connection.
+ */
+static inline bool rds_destroy_pending(struct rds_connection *conn)
+{
+	if (!check_net(rds_conn_net(conn)))
+		return true;
+
+	if (!conn->c_trans->t_unloading)
+		return false;
+
+	return conn->c_trans->t_unloading(conn);
+}
+
+/*
+ * On-demand-paging modes.
+ * NOTE(review): presumably the values passed as the need_odp argument of
+ * the transport get_mr() hook - confirm against rdma.c.
+ */
+enum {
+ ODP_NOT_NEEDED,
+ ODP_ZEROBASED,
+ ODP_VIRTUAL
+};
+
+/* stats.c */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+/*
+ * Bump counter @member in the current CPU's copy of per-CPU variable @which.
+ * get_cpu()/put_cpu() bracket the update so it is applied to one CPU's copy.
+ */
+#define rds_stats_inc_which(which, member) do { \
+ per_cpu(which, get_cpu()).member++; \
+ put_cpu(); \
+} while (0)
+/* Increment counter @member of the core rds_stats. */
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+/* Same as rds_stats_inc_which() but adds @count instead of 1. */
+#define rds_stats_add_which(which, member, count) do { \
+ per_cpu(which, get_cpu()).member += count; \
+ put_cpu(); \
+} while (0)
+/* Add @count to counter @member of the core rds_stats. */
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
+int rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, const char *const *names,
+ size_t nr);
+
+/* sysctl.c */
+int rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int rds_sysctl_max_unacked_packets;
+extern unsigned int rds_sysctl_max_unacked_bytes;
+extern unsigned int rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int rds_sysctl_trace_level;
+
+/* threads.c */
+int rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_conn_path *cp);
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
+void rds_connect_complete(struct rds_connection *conn);
+int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
+
+/* transport.c */
+void rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+ const struct in6_addr *addr,
+ __u32 scope_id);
+void rds_trans_put(struct rds_transport *trans);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+struct rds_transport *rds_trans_get(int t_type);
+int rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
diff --git a/net/rds/rds_single_path.h b/net/rds/rds_single_path.h
new file mode 100644
index 000000000..9521f6e99
--- /dev/null
+++ b/net/rds/rds_single_path.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_RDS_SINGLE_H
+#define _RDS_RDS_SINGLE_H
+
+/*
+ * Aliases that map per-connection c_* names onto the corresponding cp_*
+ * field of path 0 (c_path[0]), so code including this header can keep
+ * writing conn->c_foo and have it resolve to conn->c_path[0].cp_foo.
+ */
+#define c_xmit_rm c_path[0].cp_xmit_rm
+#define c_xmit_sg c_path[0].cp_xmit_sg
+#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off
+#define c_xmit_data_off c_path[0].cp_xmit_data_off
+#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent
+#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent
+#define c_xmit_data_sent c_path[0].cp_xmit_data_sent
+#define c_lock c_path[0].cp_lock
+#define c_next_tx_seq c_path[0].cp_next_tx_seq
+#define c_send_queue c_path[0].cp_send_queue
+#define c_retrans c_path[0].cp_retrans
+#define c_next_rx_seq c_path[0].cp_next_rx_seq
+#define c_transport_data c_path[0].cp_transport_data
+#define c_state c_path[0].cp_state
+#define c_send_gen c_path[0].cp_send_gen
+#define c_flags c_path[0].cp_flags
+#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies
+#define c_send_w c_path[0].cp_send_w
+#define c_recv_w c_path[0].cp_recv_w
+#define c_conn_w c_path[0].cp_conn_w
+#define c_down_w c_path[0].cp_down_w
+#define c_cm_lock c_path[0].cp_cm_lock
+#define c_waitq c_path[0].cp_waitq
+#define c_unacked_packets c_path[0].cp_unacked_packets
+#define c_unacked_bytes c_path[0].cp_unacked_bytes
+
+#endif /* _RDS_RDS_SINGLE_H */
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000..5b426dc36
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/rds.h>
+
+#include "rds.h"
+
+/*
+ * Initialise an incoming message received on @conn from @saddr: a single
+ * reference, an empty list linkage, no RDMA cookie, a zero receive
+ * timestamp and an all-zero latency trace.
+ */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  struct in6_addr *saddr)
+{
+	memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
+
+	inc->i_conn = conn;
+	inc->i_saddr = *saddr;
+
+	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
+	inc->i_usercopy.rdma_cookie = 0;
+
+	INIT_LIST_HEAD(&inc->i_item);
+	refcount_set(&inc->i_refcount, 1);
+}
+EXPORT_SYMBOL_GPL(rds_inc_init);
+
+/*
+ * Path-aware counterpart of rds_inc_init(): initialise an incoming message
+ * received on connection path @cp from @saddr. Both the owning connection
+ * (cp->cp_conn) and the path itself are recorded in the inc.
+ */
+void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
+		       struct in6_addr *saddr)
+{
+	refcount_set(&inc->i_refcount, 1);
+	INIT_LIST_HEAD(&inc->i_item);
+	inc->i_conn = cp->cp_conn;
+	inc->i_conn_path = cp;
+	inc->i_saddr = *saddr;
+	inc->i_usercopy.rdma_cookie = 0;
+	inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
+
+	/*
+	 * Zero the latency trace exactly as rds_inc_init() does.
+	 * rds_cmsg_recv() subtracts neighbouring trace slots and hands the
+	 * result to user space, so no slot may be left uninitialised.
+	 */
+	memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
+}
+EXPORT_SYMBOL_GPL(rds_inc_path_init);
+
+/*
+ * Take an additional reference on @inc. The debug line prints the count as
+ * it was before the increment.
+ */
+static void rds_inc_addref(struct rds_incoming *inc)
+{
+ rdsdebug("addref inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
+ refcount_inc(&inc->i_refcount);
+}
+
+/*
+ * Drop one reference on @inc. When the last reference goes away the inc
+ * must no longer be linked on any list; it is then handed back to its
+ * transport through the inc_free hook.
+ */
+void rds_inc_put(struct rds_incoming *inc)
+{
+	rdsdebug("put inc %p ref %d\n", inc, refcount_read(&inc->i_refcount));
+
+	if (!refcount_dec_and_test(&inc->i_refcount))
+		return;
+
+	BUG_ON(!list_empty(&inc->i_item));
+	inc->i_conn->c_trans->inc_free(inc);
+}
+EXPORT_SYMBOL_GPL(rds_inc_put);
+
+/*
+ * Apply @delta bytes to the socket's receive accounting (rs_rcv_bytes and
+ * the added/removed statistics) and, for every transport except loopback,
+ * update the congestion bit for @port in @map when the socket crosses into
+ * or out of the congested state.
+ */
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+				  struct rds_cong_map *map,
+				  int delta, __be16 port)
+{
+	int now_congested;
+
+	if (!delta)
+		return;
+
+	rs->rs_rcv_bytes += delta;
+	if (delta > 0)
+		rds_stats_add(s_recv_bytes_added_to_socket, delta);
+	else
+		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
+
+	/* the loop transport neither sends nor receives congestion updates */
+	if (rs->rs_transport->t_type == RDS_TRANS_LOOP)
+		return;
+
+	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
+		 "now_cong %d delta %d\n",
+		 rs, &rs->rs_bound_addr,
+		 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+		 rds_sk_rcvbuf(rs), now_congested, delta);
+
+	if (rs->rs_congested) {
+		/*
+		 * Congested -> uncongested. Only report this once usage has
+		 * fallen below half the buffer, so the state does not flap.
+		 */
+		if (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2)) {
+			rs->rs_congested = 0;
+			rds_cong_clear_bit(map, port);
+			rds_cong_queue_updates(map);
+		}
+	} else if (now_congested) {
+		/* uncongested -> congested */
+		rs->rs_congested = 1;
+		rds_cong_set_bit(map, port);
+		rds_cong_queue_updates(map);
+	}
+
+	/* any other combination leaves the congestion state unchanged */
+}
+
+/*
+ * Record the generation number announced by the peer.
+ *
+ * A @peer_gen_num of zero is ignored. If a non-zero generation was already
+ * known and the new one differs, every path has its sequence numbers reset
+ * (next tx = 1, next rx = 0) and every message on its retransmit list is
+ * marked RDS_MSG_FLUSH, all under that path's cp_lock. The new generation
+ * is then stored in conn->c_peer_gen_num.
+ *
+ * Warns if called for a transport other than RDS_TRANS_TCP.
+ */
+static void rds_conn_peer_gen_update(struct rds_connection *conn,
+				     u32 peer_gen_num)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	int i;
+
+	WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP);
+
+	if (peer_gen_num == 0)
+		return;
+
+	if (conn->c_peer_gen_num != 0 &&
+	    peer_gen_num != conn->c_peer_gen_num) {
+		for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+			struct rds_conn_path *cp = &conn->c_path[i];
+
+			spin_lock_irqsave(&cp->cp_lock, flags);
+			cp->cp_next_tx_seq = 1;
+			cp->cp_next_rx_seq = 0;
+			/* nothing is unlinked here, so the plain iterator is enough */
+			list_for_each_entry(rm, &cp->cp_retrans, m_conn_item)
+				set_bit(RDS_MSG_FLUSH, &rm->m_flags);
+			spin_unlock_irqrestore(&cp->cp_lock, flags);
+		}
+	}
+
+	conn->c_peer_gen_num = peer_gen_num;
+}
+
+/*
+ * Walk every extension header carried by @inc and act on the ones the
+ * receive path cares about; all other types are skipped silently.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+	struct rds_header *hdr = &inc->i_hdr;
+	unsigned int pos = 0, type, len;
+	union {
+		struct rds_ext_header_version version;
+		struct rds_ext_header_rdma rdma;
+		struct rds_ext_header_rdma_dest rdma_dest;
+	} buffer;
+
+	for (;;) {
+		len = sizeof(buffer);
+		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+		if (type == RDS_EXTHDR_NONE)
+			return;
+
+		if (type == RDS_EXTHDR_RDMA) {
+			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+		} else if (type == RDS_EXTHDR_RDMA_DEST) {
+			/*
+			 * The size is ignored for now. It could be stashed
+			 * somewhere and used for error checking.
+			 */
+			inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
+					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+		}
+	}
+}
+
+/*
+ * Process the extension headers of a handshake ping/pong.
+ *
+ * RDS_EXTHDR_NPATHS sets conn->c_npaths to the advertised value, capped at
+ * RDS_MPATH_WORKERS. RDS_EXTHDR_GEN_NUM supplies the peer generation number
+ * that is passed to rds_conn_peer_gen_update(). Unknown types are logged
+ * (rate limited) and skipped. Afterwards c_npaths is forced to at least 1
+ * and c_ping_triggered is cleared.
+ */
+static void rds_recv_hs_exthdrs(struct rds_header *hdr,
+				struct rds_connection *conn)
+{
+	unsigned int pos = 0, type, len;
+	/* values arrive in wire (big-endian) order, so annotate them as such */
+	union {
+		struct rds_ext_header_version version;
+		__be16 rds_npaths;
+		__be32 rds_gen_num;
+	} buffer;
+	u32 new_peer_gen_num = 0;
+
+	while (1) {
+		len = sizeof(buffer);
+		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+		if (type == RDS_EXTHDR_NONE)
+			break;
+		/* Process extension header here */
+		switch (type) {
+		case RDS_EXTHDR_NPATHS:
+			conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
+					       be16_to_cpu(buffer.rds_npaths));
+			break;
+		case RDS_EXTHDR_GEN_NUM:
+			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
+			break;
+		default:
+			pr_warn_ratelimited("ignoring unknown exthdr type "
+					     "0x%x\n", type);
+			break;
+		}
+	}
+	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
+	conn->c_npaths = max_t(int, conn->c_npaths, 1);
+	conn->c_ping_triggered = 0;
+	rds_conn_peer_gen_update(conn, new_peer_gen_num);
+}
+
+/* rds_start_mprds() will synchronously start multiple paths when appropriate.
+ * The scheme is based on the following rules:
+ *
+ * 1. rds_sendmsg on the first connect attempt sends the probe ping, carrying
+ *    the sender's npaths (s_npaths).
+ * 2. The receiver of the probe ping knows that
+ *    mprds_paths = min(s_npaths, r_npaths). It sends back a probe pong with
+ *    r_npaths. After that, if the receiver has the smaller IP address, it
+ *    starts rds_conn_path_connect_if_down on all mprds_paths.
+ * 3. The sender gets woken up and can move on to
+ *    rds_conn_path_connect_if_down. If it has the smaller IP address,
+ *    rds_conn_path_connect_if_down can be called on all mprds_paths after
+ *    the probe pong has been received. Otherwise (the sender of the probe
+ *    ping does not have the smaller IP address) it just calls
+ *    rds_conn_path_connect_if_down on the hashed path (see rule 4).
+ * 4. rds_connect_worker must only trigger a connection if laddr < faddr.
+ * 5. The sender may end up queuing the packet on the cp; it will get sent
+ *    out later, when the connection is completed.
+ */
+static void rds_start_mprds(struct rds_connection *conn)
+{
+	int idx;
+
+	/* a single path needs no extra connects */
+	if (conn->c_npaths <= 1)
+		return;
+
+	/* only the side with the smaller address initiates */
+	if (rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
+		return;
+
+	for (idx = 0; idx < conn->c_npaths; idx++)
+		rds_conn_path_connect_if_down(&conn->c_path[idx]);
+}
+
+/*
+ * Entry point used by transports to hand a received message to the RDS
+ * core. The message is either dropped (stale retransmit, ping/pong traffic,
+ * no bound socket, dead socket) or queued on the receive queue of the socket
+ * bound to (daddr, dport).
+ *
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time. This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival. It does mean
+ * that small messages will wait behind large ones. Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn. This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ struct rds_incoming *inc, gfp_t gfp)
+{
+ struct rds_sock *rs = NULL;
+ struct sock *sk;
+ unsigned long flags;
+ struct rds_conn_path *cp;
+
+ inc->i_conn = conn;
+ inc->i_rx_jiffies = jiffies;
+ /* multipath-capable transports record the receiving path in the inc;
+ * every other transport uses path 0 */
+ if (conn->c_trans->t_mp_capable)
+ cp = inc->i_conn_path;
+ else
+ cp = &conn->c_path[0];
+
+ rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+ "flags 0x%x rx_jiffies %lu\n", conn,
+ (unsigned long long)cp->cp_next_rx_seq,
+ inc,
+ (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+ be32_to_cpu(inc->i_hdr.h_len),
+ be16_to_cpu(inc->i_hdr.h_sport),
+ be16_to_cpu(inc->i_hdr.h_dport),
+ inc->i_hdr.h_flags,
+ inc->i_rx_jiffies);
+
+ /*
+ * Sequence numbers should only increase. Messages get their
+ * sequence number as they're queued in a sending conn. They
+ * can be dropped, though, if the sending socket is closed before
+ * they hit the wire. So sequence numbers can skip forward
+ * under normal operation. They can also drop back in the conn
+ * failover case as previously sent messages are resent down the
+ * new instance of a conn. We drop those, otherwise we have
+ * to assume that the next valid seq does not come after a
+ * hole in the fragment stream.
+ *
+ * The headers don't give us a way to realize if fragments of
+ * a message have been dropped. We assume that frags that arrive
+ * to a flow are part of the current message on the flow that is
+ * being reassembled. This means that senders can't drop messages
+ * from the sending conn until all their frags are sent.
+ *
+ * XXX we could spend more on the wire to get more robust failure
+ * detection, arguably worth it to avoid data corruption.
+ */
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
+ (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ rds_stats_inc(s_recv_drop_old_seq);
+ goto out;
+ }
+ cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+ /* dport 0 is a ping (when pings are enabled): answer it with a pong
+ * to the sender's port and never deliver it to a socket */
+ if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ if (inc->i_hdr.h_sport == 0) {
+ rdsdebug("ignore ping with 0 sport from %pI6c\n",
+ saddr);
+ goto out;
+ }
+ rds_stats_inc(s_recv_ping);
+ rds_send_pong(cp, inc->i_hdr.h_sport);
+ /* if this is a handshake ping, start multipath if necessary */
+ if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport),
+ be16_to_cpu(inc->i_hdr.h_dport))) {
+ rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
+ rds_start_mprds(cp->cp_conn);
+ }
+ goto out;
+ }
+
+ /* a message to RDS_FLAG_PROBE_PORT with sport 0 is a handshake pong:
+ * consume it here and wake anyone waiting on c_hs_waitq */
+ if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT &&
+ inc->i_hdr.h_sport == 0) {
+ rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
+ /* if this is a handshake pong, start multipath if necessary */
+ rds_start_mprds(cp->cp_conn);
+ wake_up(&cp->cp_conn->c_hs_waitq);
+ goto out;
+ }
+
+ /* a non-NULL result is released with rds_sock_put() at the out label */
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
+ if (!rs) {
+ rds_stats_inc(s_recv_drop_no_sock);
+ goto out;
+ }
+
+ /* Process extension headers */
+ rds_recv_incoming_exthdrs(inc, rs);
+
+ /* We can be racing with rds_release() which marks the socket dead. */
+ sk = rds_rs_to_sk(rs);
+
+ /* serialize with rds_release -> sock_orphan */
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+ rds_stats_inc(s_recv_queued);
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ if (sock_flag(sk, SOCK_RCVTSTAMP))
+ inc->i_usercopy.rx_tstamp = ktime_get_real();
+ /* the receive queue holds its own reference on the inc */
+ rds_inc_addref(inc);
+ inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
+ list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+ __rds_wake_sk_sleep(sk);
+ } else {
+ rds_stats_inc(s_recv_drop_dead_sock);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+ if (rs)
+ rds_sock_put(rs);
+}
+EXPORT_SYMBOL_GPL(rds_recv_incoming);
+
+/*
+ * Be very careful here: this is used as the condition of wait_event_*()
+ * and so has to cope with being called many times.
+ *
+ * If *inc is still NULL, peek at the head of the socket's receive queue
+ * under rs_recv_lock and, when there is one, store it in *inc with an extra
+ * reference taken. Returns non-zero once *inc is set.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+	unsigned long irq_flags;
+
+	/* an earlier call already picked a message */
+	if (*inc)
+		return 1;
+
+	read_lock_irqsave(&rs->rs_recv_lock, irq_flags);
+	if (!list_empty(&rs->rs_recv_queue)) {
+		*inc = list_first_entry(&rs->rs_recv_queue,
+					struct rds_incoming, i_item);
+		rds_inc_addref(*inc);
+	}
+	read_unlock_irqrestore(&rs->rs_recv_lock, irq_flags);
+
+	return *inc != NULL;
+}
+
+/*
+ * Report whether @inc is still linked on a queue (its i_item is not empty).
+ * When @drop is set and the inc is still queued, it is also unlinked: its
+ * length is subtracted from the socket's receive accounting and one
+ * reference on the inc is dropped. Everything happens under rs_recv_lock
+ * taken for writing.
+ *
+ * Returns 1 if the inc was still queued on entry, 0 otherwise.
+ */
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+ int drop)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ int ret = 0;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&inc->i_item)) {
+ ret = 1;
+ if (drop) {
+ /* XXX make sure this i_conn is reliable */
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+ return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ *
+ * Returns 0 on success (including when the queue was empty), or the error
+ * returned by put_cmsg(). On error, notifications that were not delivered
+ * are put back at the head of rs_notify_queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ struct rds_notifier *notifier;
+ struct rds_rdma_notify cmsg;
+ unsigned int count = 0, max_messages = ~0U;
+ unsigned long flags;
+ LIST_HEAD(copy);
+ int err = 0;
+
+ memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */
+
+ /* put_cmsg copies to user space and thus may sleep. We can't do this
+ * with rs_lock held, so first grab as many notifications as we can stuff
+ * in the user provided cmsg buffer. We don't try to copy more, to avoid
+ * losing notifications - except when the buffer is so small that it wouldn't
+ * even hold a single notification. Then we give the caller as much of this
+ * single msg as we can squeeze in, and set MSG_CTRUNC.
+ */
+ if (msghdr) {
+ max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+ if (!max_messages)
+ max_messages = 1;
+ }
+
+ /* move up to max_messages notifiers onto the private list "copy" */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+ notifier = list_entry(rs->rs_notify_queue.next,
+ struct rds_notifier, n_list);
+ list_move(&notifier->n_list, &copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (!count)
+ return 0;
+
+ /* deliver (if a msghdr was given) and free each notifier, lock not held */
+ while (!list_empty(&copy)) {
+ notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+ if (msghdr) {
+ cmsg.user_token = notifier->n_user_token;
+ cmsg.status = notifier->n_status;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+ sizeof(cmsg), &cmsg);
+ if (err)
+ break;
+ }
+
+ list_del_init(&notifier->n_list);
+ kfree(notifier);
+ }
+
+ /* If we bailed out because of an error in put_cmsg,
+ * we may be left with one or more notifications that we
+ * didn't process. Return them to the head of the list. */
+ if (!list_empty(&copy)) {
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ list_splice(&copy, &rs->rs_notify_queue);
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+ }
+
+ return err;
+}
+
+/*
+ * Deliver the pending congestion notification bits to the caller.
+ *
+ * A snapshot of rs_cong_notify is handed over as an RDS_CMSG_CONG_UPDATE
+ * control message. Only if that succeeds are the reported bits cleared from
+ * rs_cong_notify, under rs_lock; bits set after the snapshot was taken stay
+ * pending. Returns 0 on success or the put_cmsg() error.
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ uint64_t notify = rs->rs_cong_notify;
+ unsigned long flags;
+ int err;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+ sizeof(notify), &notify);
+ if (err)
+ return err;
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_notify &= ~notify;
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ return 0;
+}
+
+/*
+ * Emit the control messages that accompany a delivered incoming message:
+ *
+ *  - RDS_CMSG_RDMA_DEST, when the message carries an RDMA cookie;
+ *  - SO_TIMESTAMP_OLD or SO_TIMESTAMP_NEW (chosen by SOCK_TSTAMP_NEW), when
+ *    a receive timestamp was recorded and the socket has SOCK_RCVTSTAMP;
+ *  - RDS_CMSG_RXPATH_LATENCY, when the socket has rx traces configured.
+ *
+ * Processing stops at the first put_cmsg() failure and that error is
+ * returned; 0 is returned when every applicable cmsg was emitted.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
+			 struct rds_sock *rs)
+{
+	int rc;
+
+	if (inc->i_usercopy.rdma_cookie) {
+		rc = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+			      sizeof(inc->i_usercopy.rdma_cookie),
+			      &inc->i_usercopy.rdma_cookie);
+		if (rc)
+			return rc;
+	}
+
+	if (inc->i_usercopy.rx_tstamp != 0 &&
+	    sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
+		struct __kernel_old_timeval old_tv =
+			ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);
+
+		if (sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
+			struct __kernel_sock_timeval new_tv;
+
+			new_tv.tv_sec = old_tv.tv_sec;
+			new_tv.tv_usec = old_tv.tv_usec;
+
+			rc = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+				      sizeof(new_tv), &new_tv);
+		} else {
+			rc = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+				      sizeof(old_tv), &old_tv);
+		}
+
+		if (rc)
+			return rc;
+	}
+
+	if (rs->rs_rx_traces) {
+		struct rds_cmsg_rx_trace trace;
+		int idx;
+
+		memset(&trace, 0, sizeof(trace));
+		/* stamp the "cmsg" trace point before computing the deltas */
+		inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+		trace.rx_traces = rs->rs_rx_traces;
+		for (idx = 0; idx < rs->rs_rx_traces; idx++) {
+			int pos = rs->rs_rx_trace[idx];
+
+			/*
+			 * NOTE(review): pos + 1 indexes i_rx_lat_trace[]; this
+			 * relies on rs_rx_trace[] entries having been range
+			 * checked where they are configured - confirm.
+			 */
+			trace.rx_trace_pos[idx] = pos;
+			trace.rx_trace[idx] = inc->i_rx_lat_trace[pos + 1] -
+					      inc->i_rx_lat_trace[pos];
+		}
+
+		return put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+				sizeof(trace), &trace);
+	}
+
+	return 0;
+}
+
+/*
+ * Try to deliver one batch of zerocopy completion cookies to userspace as
+ * an RDS_CMSG_ZCOPY_COMPLETION control message.
+ *
+ * Nothing is attempted unless the caller supplied a control buffer, the
+ * socket has SOCK_ZEROCOPY set, and the buffer can hold one
+ * struct rds_zcopy_cookies.  The first entry of the socket's cookie queue
+ * is unlinked under the queue lock; should put_cmsg() fail, the entry is
+ * put back at the head of the queue.
+ *
+ * Returns true when a batch was delivered (and its entry freed), false
+ * otherwise.
+ */
+static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
+{
+	struct rds_msg_zcopy_queue *zq = &rs->rs_zcookie_queue;
+	struct rds_msg_zcopy_info *entry = NULL;
+	struct rds_zcopy_cookies *cookies;
+	unsigned long irqflags;
+
+	if (!msg->msg_control ||
+	    !sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
+	    msg->msg_controllen < CMSG_SPACE(sizeof(*cookies)))
+		return false;
+
+	/* detach the oldest queued entry, if there is one */
+	spin_lock_irqsave(&zq->lock, irqflags);
+	if (!list_empty(&zq->zcookie_head)) {
+		entry = list_entry(zq->zcookie_head.next,
+				   struct rds_msg_zcopy_info, rs_zcookie_next);
+		list_del(&entry->rs_zcookie_next);
+	}
+	spin_unlock_irqrestore(&zq->lock, irqflags);
+
+	if (!entry)
+		return false;
+
+	cookies = &entry->zcookies;
+	if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION,
+		     sizeof(*cookies), cookies) == 0) {
+		kfree(entry);
+		return true;
+	}
+
+	/* could not copy it out: requeue at the head */
+	spin_lock_irqsave(&zq->lock, irqflags);
+	list_add(&entry->rs_zcookie_next, &zq->zcookie_head);
+	spin_unlock_irqrestore(&zq->lock, irqflags);
+
+	return false;
+}
+/*
+ * recvmsg() implementation for RDS sockets.
+ *
+ * MSG_OOB returns 0 without doing anything; MSG_ERRQUEUE is handed to
+ * sock_recv_errqueue().  Otherwise each call delivers at most one item,
+ * looked for in this order:
+ *   1. queued notifications (rds_notify_queue_get()),
+ *   2. a pending congestion update (rds_notify_cong()),
+ *   3. the next incoming message.
+ *
+ * When no incoming message is available a non-blocking caller gets
+ * -EAGAIN, unless a zerocopy completion could be reaped, in which case 0
+ * is returned.  A blocking caller sleeps until one of the three sources
+ * has something, the receive timeout expires (-ETIMEDOUT) or the wait is
+ * interrupted (the negative value from the wait is returned).
+ *
+ * For a delivered message the return value is what the transport's
+ * inc_copy_to_user() returned, or the full message length from the header
+ * if the copy was shorter and the caller passed MSG_TRUNC; MSG_TRUNC is
+ * set in msg->msg_flags whenever the copy was shorter.  -EFAULT is
+ * returned if the accompanying control messages could not be emitted.
+ * When msg->msg_name is set it is filled with the sender's address, as
+ * sockaddr_in for v4-mapped addresses and sockaddr_in6 otherwise.
+ */
+int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int msg_flags)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ long timeo;
+ int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct rds_incoming *inc = NULL;
+
+ /* udp_recvmsg()->sock_rcvtimeo() gets away without locking too.. */
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+ if (msg_flags & MSG_OOB)
+ goto out; /* ret is still 0 here */
+ if (msg_flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
+
+ while (1) {
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ break;
+ }
+
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ break;
+ }
+
+ if (!rds_next_incoming(rs, &inc)) {
+ if (nonblock) {
+ bool reaped = rds_recvmsg_zcookie(rs, msg);
+
+ ret = reaped ? 0 : -EAGAIN;
+ break;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ (!list_empty(&rs->rs_notify_queue) ||
+ rs->rs_cong_notify ||
+ rds_next_incoming(rs, &inc)), timeo);
+ rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+ timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue; /* something is ready: re-run the checks above */
+
+ ret = timeo; /* 0 on timeout, negative if the wait was interrupted */
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
+ &inc->i_conn->c_faddr,
+ ntohs(inc->i_hdr.h_sport));
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
+ if (ret < 0)
+ break;
+
+ /*
+ * if the message we just copied isn't at the head of the
+ * recv queue then someone else raced us to return it, try
+ * to get the next message.  The bytes already copied are
+ * rolled back out of the iterator before retrying.
+ */
+ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+ rds_inc_put(inc);
+ inc = NULL;
+ rds_stats_inc(s_recv_deliver_raced);
+ iov_iter_revert(&msg->msg_iter, ret);
+ continue;
+ }
+
+ if (ret < be32_to_cpu(inc->i_hdr.h_len)) { /* short copy */
+ if (msg_flags & MSG_TRUNC)
+ ret = be32_to_cpu(inc->i_hdr.h_len);
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (rds_cmsg_recv(inc, msg, rs)) {
+ ret = -EFAULT;
+ break;
+ }
+ rds_recvmsg_zcookie(rs, msg); /* return value not checked here */
+
+ rds_stats_inc(s_recv_delivered);
+
+ if (msg->msg_name) {
+ if (ipv6_addr_v4mapped(&inc->i_saddr)) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr =
+ inc->i_saddr.s6_addr32[3];
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ msg->msg_namelen = sizeof(*sin);
+ } else {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = inc->i_hdr.h_sport;
+ sin6->sin6_addr = inc->i_saddr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ msg->msg_namelen = sizeof(*sin6);
+ }
+ }
+ break;
+ }
+
+ if (inc)
+ rds_inc_put(inc);
+
+out:
+ return ret;
+}
+
+/*
+ * Drop every message still queued for recvmsg on a socket that is being
+ * shut down.  The caller has already unbound the socket, so the receive
+ * path will not queue further fragments or messages while we do this.
+ *
+ * Runs with rs_recv_lock held for writing; each message is unlinked from
+ * rs_recv_queue and its queue reference is dropped.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+	struct rds_incoming *cur, *next;
+	struct sock *sk = rds_rs_to_sk(rs);
+	unsigned long irqflags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, irqflags);
+	list_for_each_entry_safe(cur, next, &rs->rs_recv_queue, i_item) {
+		/* negative delta: presumably un-charges the message length */
+		rds_recv_rcvbuf_delta(rs, sk, cur->i_conn->c_lcong,
+				      -be32_to_cpu(cur->i_hdr.h_len),
+				      cur->i_hdr.h_dport);
+		list_del_init(&cur->i_item);
+		rds_inc_put(cur);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, irqflags);
+}
+
+/*
+ * Fill a struct rds_info_message from an incoming message and copy it to
+ * the info iterator.
+ *
+ * The addresses come from the caller (@saddr/@daddr) rather than from
+ * inc->i_saddr, because that field is only set in the receive path.
+ * When @flip is non-zero the source/destination address and port pairs
+ * are swapped, i.e. the destination is reported as "local" and the source
+ * as "foreign"; otherwise the source is reported as local.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip)
+{
+	struct rds_info_message minfo;
+
+	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+	minfo.tos = inc->i_conn->c_tos;
+
+	minfo.laddr = flip ? daddr : saddr;
+	minfo.faddr = flip ? saddr : daddr;
+	minfo.lport = flip ? inc->i_hdr.h_dport : inc->i_hdr.h_sport;
+	minfo.fport = flip ? inc->i_hdr.h_sport : inc->i_hdr.h_dport;
+
+	minfo.flags = 0;
+
+	rds_info_copy(iter, &minfo, sizeof(minfo));
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * IPv6 counterpart of rds_inc_info_copy(): fill a struct rds6_info_message
+ * from an incoming message and copy it to the info iterator.
+ *
+ * When @flip is non-zero the destination address/port are reported as the
+ * local side and the source as the foreign side; otherwise the source is
+ * reported as local.
+ */
+void rds6_inc_info_copy(struct rds_incoming *inc,
+			struct rds_info_iterator *iter,
+			struct in6_addr *saddr, struct in6_addr *daddr,
+			int flip)
+{
+	struct in6_addr *local = flip ? daddr : saddr;
+	struct in6_addr *foreign = flip ? saddr : daddr;
+	struct rds6_info_message minfo6;
+
+	minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+	minfo6.tos = inc->i_conn->c_tos;
+
+	minfo6.laddr = *local;
+	minfo6.faddr = *foreign;
+	minfo6.lport = flip ? inc->i_hdr.h_dport : inc->i_hdr.h_sport;
+	minfo6.fport = flip ? inc->i_hdr.h_sport : inc->i_hdr.h_dport;
+
+	minfo6.flags = 0;
+
+	rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
+#endif
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000..0c5504068
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/ratelimit.h>
+#include <linux/export.h>
+#include <linux/sizes.h>
+
+#include "rds.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the soft-lockup
+ * watchdog will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ *
+ * NOTE(review): rds_send_xmit() bumps its batch counter each time it goes
+ * to fetch the next message and bails out when
+ * "batch_count >= send_batch_count", so a value of 0 looks like it makes
+ * every pass back off immediately rather than loop until the queue is
+ * drained - confirm before relying on the sentence above.
+ *
+ * The parameter is registered with mode 0444 and defaults to SZ_1K.
+ */
+static int send_batch_count = SZ_1K;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
+/*
+ * Reset the send state. Callers must ensure that this doesn't race with
+ * rds_send_xmit().
+ *
+ * Drops the message currently being transmitted (if any), zeroes the
+ * per-path transmit cursor and "sent" flags, clears the connection's
+ * c_map_queued, re-arms the unacked packet/byte budgets from the sysctl
+ * values, and moves everything on cp_retrans back onto cp_send_queue
+ * flagged as a retransmission that requires an ACK.
+ */
+void rds_send_path_reset(struct rds_conn_path *cp)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ if (cp->cp_xmit_rm) {
+ rm = cp->cp_xmit_rm;
+ cp->cp_xmit_rm = NULL;
+ /* Tell the user the RDMA op is no longer mapped by the
+ * transport. This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory */
+ rds_message_unmapped(rm);
+ rds_message_put(rm); /* drop the reference cp_xmit_rm was holding */
+ }
+
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_data_sent = 0;
+
+ cp->cp_conn->c_map_queued = 0;
+
+ cp->cp_unacked_packets = rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q.
+ * list_splice_init() puts them at the front of cp_send_queue and
+ * leaves cp_retrans empty. */
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+ }
+ list_splice_init(&cp->cp_retrans, &cp->cp_send_queue);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rds_send_path_reset);
+
+/*
+ * Try to become the single transmitter on this path by atomically setting
+ * RDS_IN_XMIT in cp_flags.  Returns 1 if the bit was previously clear (we
+ * now own it) and 0 if somebody else already holds it.
+ */
+static int acquire_in_xmit(struct rds_conn_path *cp)
+{
+	return !test_and_set_bit(RDS_IN_XMIT, &cp->cp_flags);
+}
+
+/*
+ * Give up the transmit ownership taken by acquire_in_xmit(): clear
+ * RDS_IN_XMIT and, if anybody is sleeping on cp_waitq, wake them all.
+ */
+static void release_in_xmit(struct rds_conn_path *cp)
+{
+	clear_bit(RDS_IN_XMIT, &cp->cp_flags);
+	/* order the clear before the waitqueue check below */
+	smp_mb__after_atomic();
+
+	/*
+	 * wait_on_bit()/wake_up_bit() are not used here: this wake-up sits
+	 * in a hot path where waiters are very rare, and we do not want to
+	 * walk the system-wide hashed waitqueue buckets on every release
+	 * only to find nobody there almost every time.
+	 */
+	if (!waitqueue_active(&cp->cp_waitq))
+		return;
+
+	wake_up_all(&cp->cp_waitq);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ *
+ * Push queued messages of one connection path down to the transport.
+ * Only one caller at a time may do this per path; ownership is the
+ * RDS_IN_XMIT bit in cp_flags.
+ *
+ * Returns 0 when the pass ended without error (this includes the path
+ * not being up), -ENOMEM when another caller already owns RDS_IN_XMIT,
+ * -ENETUNREACH when the connection is being destroyed, -EAGAIN when the
+ * same message was still the current one after 4096 loop iterations, or
+ * the error returned by the congestion-map allocation or by the
+ * transport's xmit_rdma/xmit_atomic/xmit hooks.
+ */
+int rds_send_xmit(struct rds_conn_path *cp)
+{
+ struct rds_connection *conn = cp->cp_conn;
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int tmp;
+ struct scatterlist *sg;
+ int ret = 0;
+ LIST_HEAD(to_be_dropped);
+ int batch_count;
+ unsigned long send_gen = 0;
+ int same_rm = 0;
+
+restart:
+ batch_count = 0;
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ */
+ if (!acquire_in_xmit(cp)) {
+ rds_stats_inc(s_send_lock_contention);
+ ret = -ENOMEM; /* somebody else is already transmitting on this path */
+ goto out;
+ }
+
+ if (rds_destroy_pending(cp->cp_conn)) {
+ release_in_xmit(cp);
+ ret = -ENETUNREACH; /* don't requeue send work */
+ goto out;
+ }
+
+ /*
+ * we record the send generation after doing the xmit acquire.
+ * if someone else manages to jump in and do some work, we'll use
+ * this to avoid a goto restart farther down.
+ *
+ * The acquire_in_xmit() check above ensures that only one
+ * caller can increment cp_send_gen at any time.
+ */
+ send_gen = READ_ONCE(cp->cp_send_gen) + 1;
+ WRITE_ONCE(cp->cp_send_gen, send_gen);
+
+ /*
+ * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+ * we do the opposite to avoid races.
+ */
+ if (!rds_conn_path_up(cp)) {
+ release_in_xmit(cp);
+ ret = 0;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_path_prepare)
+ conn->c_trans->xmit_path_prepare(cp);
+
+ /*
+ * spin trying to push headers and data down the connection until
+ * the connection doesn't make forward progress.
+ */
+ while (1) {
+
+ rm = cp->cp_xmit_rm;
+
+ if (!rm) {
+ same_rm = 0;
+ } else {
+ same_rm++;
+ if (same_rm >= 4096) { /* still on the same message: give up this pass */
+ rds_stats_inc(s_send_stuck_rm);
+ ret = -EAGAIN;
+ break;
+ }
+ }
+
+ /*
+ * If between sending messages, we can send a pending congestion
+ * map update.
+ */
+ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+ rm = rds_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+ rm->data.op_active = 1;
+ rm->m_inc.i_conn_path = cp;
+ rm->m_inc.i_conn = cp->cp_conn;
+
+ cp->cp_xmit_rm = rm;
+ }
+
+ /*
+ * If not already working on one, grab the next message.
+ *
+ * cp_xmit_rm holds a ref while we're sending this message down
+ * the connection. We can use this ref while holding the
+ * send_sem.. rds_send_reset() is serialized with it.
+ *
+ * NOTE(review): "send_sem" and rds_send_reset() look like
+ * older names; in this function the exclusion is the
+ * RDS_IN_XMIT bit and the reset routine in this file is
+ * rds_send_path_reset() - confirm.
+ */
+ if (!rm) {
+ unsigned int len;
+
+ batch_count++;
+
+ /* we want to process as big a batch as we can, but
+ * we also want to avoid softlockups. If we've been
+ * through a lot of messages, let's back off and see
+ * if anyone else jumps in
+ */
+ if (batch_count >= send_batch_count)
+ goto over_batch;
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+
+ if (!list_empty(&cp->cp_send_queue)) {
+ rm = list_entry(cp->cp_send_queue.next,
+ struct rds_message,
+ m_conn_item);
+ rds_message_addref(rm);
+
+ /*
+ * Move the message from the send queue to the retransmit
+ * list right away.
+ */
+ list_move_tail(&rm->m_conn_item,
+ &cp->cp_retrans);
+ }
+
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+
+ if (!rm)
+ break; /* send queue is empty */
+
+ /* Unfortunately, the way Infiniband deals with
+ * RDMA to a bad MR key is by moving the entire
+ * queue pair to error state. We could possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with RDMA ops.
+ * Messages flagged RDS_MSG_FLUSH are dropped here too.
+ */
+ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) ||
+ (rm->rdma.op_active &&
+ test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) {
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ list_move(&rm->m_conn_item, &to_be_dropped);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (cp->cp_unacked_packets == 0 ||
+ cp->cp_unacked_bytes < len) {
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ cp->cp_unacked_packets =
+ rds_sysctl_max_unacked_packets;
+ cp->cp_unacked_bytes =
+ rds_sysctl_max_unacked_bytes;
+ rds_stats_inc(s_send_ack_required);
+ } else {
+ cp->cp_unacked_bytes -= len;
+ cp->cp_unacked_packets--;
+ }
+
+ cp->cp_xmit_rm = rm;
+ }
+
+ /* The transport either sends the whole rdma or none of it */
+ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) {
+ rm->m_final_op = &rm->rdma;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
+ break;
+ }
+ cp->cp_xmit_rdma_sent = 1;
+
+ }
+
+ if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) {
+ rm->m_final_op = &rm->atomic;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue
+ */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+ if (ret) {
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
+ break;
+ }
+ cp->cp_xmit_atomic_sent = 1;
+
+ }
+
+ /*
+ * A number of cases require an RDS header to be sent
+ * even if there is no data.
+ * We permit 0-byte sends; rds-ping depends on this.
+ * However, if there are exclusively attached silent ops,
+ * we skip the hdr/data send, to enable silent operation.
+ */
+ if (rm->data.op_nents == 0) {
+ int ops_present;
+ int all_ops_are_silent = 1;
+
+ ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+ if (rm->atomic.op_active && !rm->atomic.op_silent)
+ all_ops_are_silent = 0;
+ if (rm->rdma.op_active && !rm->rdma.op_silent)
+ all_ops_are_silent = 0;
+
+ if (ops_present && all_ops_are_silent
+ && !rm->m_rdma_cookie)
+ rm->data.op_active = 0;
+ }
+
+ if (rm->data.op_active && !cp->cp_xmit_data_sent) {
+ rm->m_final_op = &rm->data;
+
+ ret = conn->c_trans->xmit(conn, rm,
+ cp->cp_xmit_hdr_off,
+ cp->cp_xmit_sg,
+ cp->cp_xmit_data_off);
+ if (ret <= 0) /* error or nothing sent: stop this pass */
+ break;
+
+ /* ret > 0 is the byte count sent: account header bytes first ... */
+ if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) {
+ tmp = min_t(int, ret,
+ sizeof(struct rds_header) -
+ cp->cp_xmit_hdr_off);
+ cp->cp_xmit_hdr_off += tmp;
+ ret -= tmp;
+ }
+
+ /* ... then advance through the data scatterlist */
+ sg = &rm->data.op_sg[cp->cp_xmit_sg];
+ while (ret) {
+ tmp = min_t(int, ret, sg->length -
+ cp->cp_xmit_data_off);
+ cp->cp_xmit_data_off += tmp;
+ ret -= tmp;
+ if (cp->cp_xmit_data_off == sg->length) {
+ cp->cp_xmit_data_off = 0;
+ sg++;
+ cp->cp_xmit_sg++;
+ BUG_ON(ret != 0 && cp->cp_xmit_sg ==
+ rm->data.op_nents);
+ }
+ }
+
+ if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) &&
+ (cp->cp_xmit_sg == rm->data.op_nents))
+ cp->cp_xmit_data_sent = 1;
+ }
+
+ /*
+ * A rm will only take multiple times through this loop
+ * if there is a data op. Thus, if the data is sent (or there was
+ * none), then we're done with the rm.
+ */
+ if (!rm->data.op_active || cp->cp_xmit_data_sent) {
+ cp->cp_xmit_rm = NULL;
+ cp->cp_xmit_sg = 0;
+ cp->cp_xmit_hdr_off = 0;
+ cp->cp_xmit_data_off = 0;
+ cp->cp_xmit_rdma_sent = 0;
+ cp->cp_xmit_atomic_sent = 0;
+ cp->cp_xmit_data_sent = 0;
+
+ rds_message_put(rm);
+ }
+ }
+
+over_batch:
+ if (conn->c_trans->xmit_path_complete)
+ conn->c_trans->xmit_path_complete(cp);
+ release_in_xmit(cp);
+
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped)) {
+ /* irqs on here, so we can put(), unlike above */
+ list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+ rds_message_put(rm);
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+ }
+
+ /*
+ * Other senders can queue a message after we last test the send queue
+ * but before we clear RDS_IN_XMIT. In that case they'd back off and
+ * not try and send their newly queued message. We need to check the
+ * send queue after having cleared RDS_IN_XMIT so that their message
+ * doesn't get stuck on the send queue.
+ *
+ * If the transport cannot continue (i.e. ret != 0), then it must
+ * call us when more room is available, such as from the tx
+ * completion handler.
+ *
+ * We have an extra generation check here so that if someone manages
+ * to jump in after our release_in_xmit, we'll see that they have done
+ * some work and we will skip our goto
+ */
+ if (ret == 0) {
+ bool raced;
+
+ smp_mb();
+ raced = send_gen != READ_ONCE(cp->cp_send_gen);
+
+ if ((test_bit(0, &conn->c_map_queued) ||
+ !list_empty(&cp->cp_send_queue)) && !raced) {
+ if (batch_count < send_batch_count)
+ goto restart;
+ rcu_read_lock();
+ if (rds_destroy_pending(cp->cp_conn))
+ ret = -ENETUNREACH;
+ else
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ rcu_read_unlock();
+ } else if (raced) {
+ rds_stats_inc(s_send_lock_queue_raced);
+ }
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rds_send_xmit);
+
+/*
+ * Subtract a message's length (h_len from its header) from the socket's
+ * rs_snd_bytes counter.  The caller must hold rs->rs_lock.  Removing more
+ * bytes than are accounted is treated as a fatal bug.  The
+ * s_send_queue_empty statistic is bumped when the counter reaches zero.
+ */
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+	u32 msg_bytes;
+
+	assert_spin_locked(&rs->rs_lock);
+
+	msg_bytes = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+	BUG_ON(msg_bytes > rs->rs_snd_bytes);
+	rs->rs_snd_bytes -= msg_bytes;
+
+	if (!rs->rs_snd_bytes)
+		rds_stats_inc(s_send_queue_empty);
+}
+
+/*
+ * Decide whether @rm is covered by the acknowledgement @ack.  A transport
+ * supplied @is_acked callback takes precedence; without one, the message
+ * counts as acked when its header sequence number is <= @ack.
+ */
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+				    is_acked_func is_acked)
+{
+	if (!is_acked)
+		return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+
+	return is_acked(rm, ack);
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ *
+ * If the message is still on its socket and its RDMA op is active, asked
+ * for notification and still has a notifier attached, store @status in
+ * the notifier, append it to the socket's rs_notify_queue and detach it
+ * from the op; afterwards wake anyone sleeping on that socket.  In all
+ * other cases nothing is queued and nobody is woken.
+ *
+ * Lock order here: rm->m_rs_lock (irqsave) outside, rs->rs_lock inside.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rm_rdma_op *ro;
+ struct rds_notifier *notifier;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ ro = &rm->rdma;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+ ro->op_active && ro->op_notify && ro->op_notifier) {
+ notifier = ro->op_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs)); /* released after the wake-up below */
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ro->op_notifier = NULL; /* the notify queue owns it now */
+ }
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+
+/*
+ * Just like above, except looks at atomic op
+ *
+ * If the message is still on its socket and its atomic op is active,
+ * asked for notification and still has a notifier attached, store
+ * @status in the notifier, append it to the socket's rs_notify_queue and
+ * detach it from the op; afterwards wake anyone sleeping on that socket.
+ *
+ * Lock order here: rm->m_rs_lock (irqsave) outside, rs->rs_lock inside.
+ */
+void rds_atomic_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rm_atomic_op *ao;
+ struct rds_notifier *notifier;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ ao = &rm->atomic;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+ && ao->op_active && ao->op_notify && ao->op_notifier) {
+ notifier = ao->op_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs)); /* released after the wake-up below */
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ao->op_notifier = NULL; /* the notify queue owns it now */
+ }
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ *
+ * Handles both the RDMA op and the atomic op of @rm: for each one that is
+ * active, asked for notification and still has a notifier, @status is
+ * stored in the notifier, which is then appended to rs->rs_notify_queue
+ * and detached from the op.  Calling this again for the same message is
+ * harmless because the notifier pointers are cleared the first time.
+ */
+static inline void
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+ struct rm_rdma_op *ro;
+ struct rm_atomic_op *ao;
+
+ ro = &rm->rdma;
+ if (ro->op_active && ro->op_notify && ro->op_notifier) {
+ ro->op_notifier->n_status = status;
+ list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+ ro->op_notifier = NULL;
+ }
+
+ ao = &rm->atomic;
+ if (ao->op_active && ao->op_notify && ao->op_notifier) {
+ ao->op_notifier->n_status = status;
+ list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+ ao->op_notifier = NULL;
+ }
+
+ /* No need to wake the app - caller does this */
+}
+
+/*
+ * This removes messages from the socket's list if they're on it. The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks. The messages must have a reference held for their
+ * position on the list. This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ *
+ * @status is stored in a queued RDMA notifier only if the notifier does
+ * not already carry a non-zero status.
+ *
+ * Wake-ups are batched per socket: the current socket is remembered in
+ * 'rs' (with a sock_hold) and is woken and released only when a message
+ * for a different socket shows up, or once the list is exhausted.
+ */
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+ unsigned long flags;
+ struct rds_sock *rs = NULL;
+ struct rds_message *rm;
+
+ while (!list_empty(messages)) {
+ int was_on_sock = 0;
+
+ rm = list_entry(messages->next, struct rds_message,
+ m_conn_item);
+ list_del_init(&rm->m_conn_item);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the sock. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ *
+ * The message spinlock makes sure nobody clears rm->m_rs
+ * while we're messing with it. It does not prevent the
+ * message from being removed from the socket, though.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+ if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+ goto unlock_and_drop;
+
+ if (rs != rm->m_rs) { /* switching sockets: flush the previous one */
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ rs = rm->m_rs;
+ if (rs)
+ sock_hold(rds_rs_to_sk(rs));
+ }
+ if (!rs)
+ goto unlock_and_drop;
+ spin_lock(&rs->rs_lock);
+
+ if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+ struct rm_rdma_op *ro = &rm->rdma;
+ struct rds_notifier *notifier;
+
+ list_del_init(&rm->m_sock_item);
+ rds_send_sndbuf_remove(rs, rm);
+
+ if (ro->op_active && ro->op_notifier &&
+ (ro->op_notify || (ro->op_recverr && status))) {
+ notifier = ro->op_notifier;
+ list_add_tail(&notifier->n_list,
+ &rs->rs_notify_queue);
+ if (!notifier->n_status)
+ notifier->n_status = status;
+ rm->rdma.op_notifier = NULL;
+ }
+ was_on_sock = 1;
+ }
+ spin_unlock(&rs->rs_lock);
+
+unlock_and_drop:
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ rds_message_put(rm); /* the caller's list reference */
+ if (was_on_sock)
+ rds_message_put(rm); /* the socket send-queue reference */
+ }
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number. Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * Walks cp_retrans from the front under cp_lock and stops at the first
+ * message that is not acked; every acked message before it is moved to a
+ * private list with RDS_MSG_ON_CONN cleared.  That private list is then
+ * passed to rds_send_remove_from_sock() with RDS_RDMA_SUCCESS, outside
+ * cp_lock.
+ */
+void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
+ is_acked_func is_acked)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
+ if (!rds_send_is_acked(rm, ack, is_acked))
+ break;
+
+ list_move(&rm->m_conn_item, &list);
+ clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ }
+
+ /* order flag updates with spin locks */
+ if (!list_empty(&list))
+ smp_mb__after_atomic();
+
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+
+ /* now remove the messages from the sock list as needed */
+ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
+EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);
+
+/*
+ * Connection-level wrapper around rds_send_path_drop_acked() that always
+ * operates on path 0.  Warns if used on a transport that declares itself
+ * multipath capable.
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked)
+{
+	struct rds_conn_path *cp;
+
+	WARN_ON(conn->c_trans->t_mp_capable);
+
+	cp = &conn->c_path[0];
+	rds_send_path_drop_acked(cp, ack, is_acked);
+}
+EXPORT_SYMBOL_GPL(rds_send_drop_acked);
+/*
+ * Drop messages queued for sending on socket @rs.  With @dest set only
+ * messages whose destination address and port both match @dest are
+ * dropped; with @dest NULL every message on rs_send_queue is dropped.
+ *
+ * Three phases:
+ *  1. under rs_lock, move the matching messages to a private list, undo
+ *     their send-buffer accounting and clear RDS_MSG_ON_SOCK;
+ *  2. for each one, if it is still on its connection path, unlink it
+ *     there, complete its notifiers with RDS_RDMA_CANCELED and drop a
+ *     message reference;
+ *  3. after waking the socket, call rds_message_wait() on each message,
+ *     complete its notifiers again (covers messages skipped in phase 2)
+ *     and drop another message reference.
+ */
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
+{
+ struct rds_message *rm, *tmp;
+ struct rds_connection *conn;
+ struct rds_conn_path *cp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ /* get all the messages we're dropping under the rs lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+ if (dest &&
+ (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
+ dest->sin6_port != rm->m_inc.i_hdr.h_dport))
+ continue;
+
+ list_move(&rm->m_sock_item, &list);
+ rds_send_sndbuf_remove(rs, rm);
+ clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ }
+
+ /* order flag updates with the rs lock */
+ smp_mb__after_atomic();
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (list_empty(&list))
+ return;
+
+ /* Remove the messages from the conn */
+ list_for_each_entry(rm, &list, m_sock_item) {
+
+ conn = rm->m_inc.i_conn;
+ if (conn->c_trans->t_mp_capable)
+ cp = rm->m_inc.i_conn_path;
+ else
+ cp = &conn->c_path[0];
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ /*
+ * Maybe someone else beat us to removing rm from the conn.
+ * If we race with their flag update we'll get the lock and
+ * then really see that the flag has been cleared.
+ */
+ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+ continue;
+ }
+ list_del_init(&rm->m_conn_item);
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+
+ /*
+ * Couldn't grab m_rs_lock in top loop (lock ordering),
+ * but we can now.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ rds_message_put(rm);
+ }
+
+ rds_wake_sk_sleep(rs);
+
+ while (!list_empty(&list)) {
+ rm = list_entry(list.next, struct rds_message, m_sock_item);
+ list_del_init(&rm->m_sock_item);
+ rds_message_wait(rm);
+
+ /* just in case the code above skipped this message
+ * because RDS_MSG_ON_CONN wasn't set, run it again here.
+ * Taking m_rs_lock is the only thing that keeps us
+ * from racing with ack processing.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ rds_message_put(rm);
+ }
+}
+
+/*
+ * we only want this to fire once so we use the callers 'queued'. It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ *
+ * Queue @rm on both the socket's rs_send_queue and the path's
+ * cp_send_queue, provided the socket's send buffer is not already full.
+ * On success the header is populated with @sport/@dport, a sequence
+ * number is assigned from cp_next_tx_seq, and the message gains one
+ * reference for each of the two lists; *queued is set to 1.
+ *
+ * Returns the value of *queued: non-zero if the message is (or already
+ * was) queued, 0 if there was no room.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_conn_path *cp,
+ struct rds_message *rm, __be16 sport,
+ __be16 dport, int *queued)
+{
+ unsigned long flags;
+ u32 len;
+
+ if (*queued)
+ goto out;
+
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ /* this is the only place which holds both the socket's rs_lock
+ * and the connection path's cp_lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ /*
+ * If there is a little space in sndbuf, we don't queue anything,
+ * and userspace gets -EAGAIN. But poll() indicates there's send
+ * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+ * freed up by incoming acks. So we check the *old* value of
+ * rs_snd_bytes here to allow the last msg to exceed the buffer,
+ * and poll() now knows no more data can be sent.
+ */
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+ rs->rs_snd_bytes += len;
+
+ /* let recv side know we are close to send space exhaustion.
+ * This is probably not the optimal way to do it, as this
+ * means we set the flag on *all* messages as soon as our
+ * throughput hits a certain threshold.
+ */
+ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+ set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ rds_message_addref(rm); /* reference for the socket send queue */
+ sock_hold(rds_rs_to_sk(rs));
+ rm->m_rs = rs;
+
+ /* The code ordering is a little weird, but we're
+ trying to minimize the time we hold cp_lock */
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+ rm->m_inc.i_conn = conn;
+ rm->m_inc.i_conn_path = cp;
+ rds_message_addref(rm); /* reference for the path send queue */
+
+ spin_lock(&cp->cp_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ spin_unlock(&cp->cp_lock);
+
+ rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+ rm, len, rs, rs->rs_snd_bytes,
+ (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+ *queued = 1;
+ }
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+ return *queued;
+}
+
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ *
+ * Walks every SOL_RDS control message in @msg and adds up the extra space
+ * each one needs, plus one scatterlist entry per data sg (@num_sgs).
+ * For each RDS_CMSG_RDMA_ARGS a zeroed slot is appended to @vct (growing
+ * vct->vec by vct->incr entries when it is full) and handed to
+ * rds_rdma_extra_size().
+ *
+ * Returns the size in bytes, or a negative errno:
+ * -EINVAL for a negative @num_sgs, a malformed or unknown cmsg,
+ * MSG_ZEROCOPY without a zcopy cookie cmsg, or a mix of the
+ * (DEST, MAP) and (ARGS, ATOMIC) cmsg groups;
+ * -ENOMEM when growing vct->vec fails;
+ * or whatever negative value rds_rdma_extra_size() returns.
+ *
+ * Slots already appended to @vct are left in place on error; this function
+ * does not free them.
+ */
+static int rds_rm_size(struct msghdr *msg, int num_sgs,
+ struct rds_iov_vector_arr *vct)
+{
+ struct cmsghdr *cmsg;
+ int size = 0;
+ /* bit 0: ARGS/ATOMIC cmsgs seen, bit 1: DEST/MAP/ZCOPY_COOKIE seen */
+ int cmsg_groups = 0;
+ int retval;
+ bool zcopy_cookie = false;
+ struct rds_iov_vector *iov, *tmp_iov;
+
+ if (num_sgs < 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ if (vct->indx >= vct->len) {
+ vct->len += vct->incr;
+ tmp_iov =
+ krealloc(vct->vec,
+ vct->len *
+ sizeof(struct rds_iov_vector),
+ GFP_KERNEL);
+ if (!tmp_iov) {
+ /* old vct->vec is still valid: undo the length bump */
+ vct->len -= vct->incr;
+ return -ENOMEM;
+ }
+ vct->vec = tmp_iov;
+ }
+ iov = &vct->vec[vct->indx];
+ memset(iov, 0, sizeof(struct rds_iov_vector));
+ vct->indx++;
+ cmsg_groups |= 1;
+ retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov);
+ if (retval < 0)
+ return retval;
+ size += retval;
+
+ break;
+
+ case RDS_CMSG_ZCOPY_COOKIE:
+ zcopy_cookie = true;
+ fallthrough;
+
+ case RDS_CMSG_RDMA_DEST:
+ case RDS_CMSG_RDMA_MAP:
+ cmsg_groups |= 2;
+ /* these are valid but do not add any size */
+ break;
+
+ case RDS_CMSG_ATOMIC_CSWP:
+ case RDS_CMSG_ATOMIC_FADD:
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ cmsg_groups |= 1;
+ size += sizeof(struct scatterlist);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ }
+
+ /* a zerocopy send must carry a cookie cmsg */
+ if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
+ return -EINVAL;
+
+ size += num_sgs * sizeof(struct scatterlist);
+
+ /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+ if (cmsg_groups == 3)
+ return -EINVAL;
+
+ return size;
+}
+
+/*
+ * Store the u32 cookie carried by an RDS_CMSG_ZCOPY_COOKIE control message
+ * in the message's zerocopy notifier.
+ *
+ * Returns 0 on success, -EINVAL when the cmsg is too short to hold the
+ * cookie or when @rm has no zerocopy notifier attached.
+ */
+static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	u32 *cookie;
+
+	/* the payload must be large enough to hold one cookie */
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
+		return -EINVAL;
+
+	/* nowhere to record the cookie without a notifier */
+	if (!rm->data.op_mmp_znotifier)
+		return -EINVAL;
+
+	cookie = CMSG_DATA(cmsg);
+	rm->data.op_mmp_znotifier->z_cookie = *cookie;
+
+	return 0;
+}
+
+/*
+ * Apply every SOL_RDS control message in @msg to @rm.
+ *
+ * @vct holds the per-RDMA_ARGS iov vectors; the n-th RDS_CMSG_RDMA_ARGS
+ * cmsg is paired with vct->vec[n]. *@allocated_mr is set to 1 when an
+ * RDS_CMSG_RDMA_MAP cmsg was processed successfully.
+ *
+ * Returns 0 when all cmsgs were handled, otherwise the first error:
+ * -EINVAL for a malformed or unknown cmsg, -ENOMEM when there are more
+ * RDMA_ARGS cmsgs than entries in @vct, or the handler's own error
+ * (with -ENODEV from the RDMA_MAP handler reported as -EAGAIN).
+ */
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+			 struct msghdr *msg, int *allocated_mr,
+			 struct rds_iov_vector_arr *vct)
+{
+	struct cmsghdr *cmsg;
+	int rdma_idx = 0;
+	int err;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		/* As a side effect, RDMA_DEST and RDMA_MAP will set
+		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+		 */
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			if (rdma_idx >= vct->indx)
+				return -ENOMEM;
+			err = rds_cmsg_rdma_args(rs, rm, cmsg,
+						 &vct->vec[rdma_idx]);
+			rdma_idx++;
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+			err = rds_cmsg_rdma_dest(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_MAP:
+			err = rds_cmsg_rdma_map(rs, rm, cmsg);
+			if (err == -ENODEV) {
+				/* Accommodate the get_mr() case which can fail
+				 * if connection isn't established yet.
+				 */
+				err = -EAGAIN;
+			} else if (!err) {
+				*allocated_mr = 1;
+			}
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			err = rds_cmsg_atomic(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_ZCOPY_COOKIE:
+			err = rds_cmsg_zcopy(rs, rm, cmsg);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+		/* stop at the first handler failure */
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the index of the connection path to use for a send from @rs.
+ *
+ * While conn->c_npaths is still 0 the hash is taken over
+ * RDS_MPATH_WORKERS; if that selects a non-zero path a ping is sent and,
+ * unless @nonblock is set, the caller sleeps on c_hs_waitq until
+ * c_npaths becomes non-zero. Path 0 is used when @nonblock is set, when
+ * the wait is interrupted, or when c_npaths turns out to be 1.
+ */
+static int rds_send_mprds_hash(struct rds_sock *rs,
+			       struct rds_connection *conn, int nonblock)
+{
+	int hash;
+
+	if (conn->c_npaths != 0)
+		hash = RDS_MPATH_HASH(rs, conn->c_npaths);
+	else
+		hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
+
+	/* path count already known, or base path chosen: nothing to wait for */
+	if (conn->c_npaths != 0 || hash == 0)
+		return hash;
+
+	rds_send_ping(conn, 0);
+
+	/* The underlying connection is not up yet. Need to wait
+	 * until it is up to be sure that the non-zero c_path can be
+	 * used. But if we are interrupted, we have to use the zero
+	 * c_path in case the connection ends up being non-MP capable.
+	 */
+	if (conn->c_npaths == 0) {
+		/* Cannot wait for the connection to be made, so just use
+		 * the base c_path.
+		 */
+		if (nonblock)
+			return 0;
+		if (wait_event_interruptible(conn->c_hs_waitq,
+					     conn->c_npaths != 0))
+			hash = 0;
+	}
+	if (conn->c_npaths == 1)
+		hash = 0;
+
+	return hash;
+}
+
+/*
+ * Add the remote_vec.bytes of every RDS_CMSG_RDMA_ARGS control message in
+ * @msg to *@rdma_bytes.
+ *
+ * Returns 0 on success, -EINVAL for a malformed cmsg or an RDMA_ARGS cmsg
+ * too short to contain a struct rds_rdma_args.
+ */
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+	struct rds_rdma_args *args;
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		/* only RDS RDMA argument cmsgs contribute */
+		if (cmsg->cmsg_level != SOL_RDS ||
+		    cmsg->cmsg_type != RDS_CMSG_RDMA_ARGS)
+			continue;
+
+		if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)))
+			return -EINVAL;
+
+		args = CMSG_DATA(cmsg);
+		*rdma_bytes += args->remote_vec.bytes;
+	}
+
+	return 0;
+}
+
+/*
+ * sendmsg() entry point for RDS sockets.
+ *
+ * In order, this function:
+ * - rejects any msg_flags other than MSG_DONTWAIT, MSG_CMSG_COMPAT and
+ * MSG_ZEROCOPY (-EOPNOTSUPP);
+ * - takes the destination from msg_name (AF_INET or, when IPv6 is
+ * enabled, AF_INET6) or, when no name is given, from the address stored
+ * on the socket;
+ * - fails with -ENOTCONN when the socket is unbound or the destination is
+ * the unspecified address, and validates address family / scope id
+ * combinations;
+ * - bounds the payload and RDMA sizes (-EMSGSIZE);
+ * - allocates the rds_message, copies the payload in, looks up or creates
+ * the outgoing connection and selects a connection path;
+ * - applies the control messages, waits for congestion to clear and for
+ * send buffer space, queues the message and kicks rds_send_xmit().
+ *
+ * Returns @payload_len on success or a negative errno. On every exit path
+ * the per-cmsg iov vectors collected in the local 'vct' are freed.
+ */
+int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ __be16 dport;
+ struct rds_message *rm = NULL;
+ struct rds_connection *conn;
+ int ret = 0;
+ int queued = 0, allocated_mr = 0;
+ int nonblock = msg->msg_flags & MSG_DONTWAIT;
+ long timeo = sock_sndtimeo(sk, nonblock);
+ struct rds_conn_path *cpath;
+ struct in6_addr daddr;
+ __u32 scope_id = 0;
+ size_t total_payload_len = payload_len, rdma_payload_len = 0;
+ /* zerocopy needs both the per-call flag and the socket option */
+ bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
+ sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
+ int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE);
+ int namelen;
+ struct rds_iov_vector_arr vct;
+ int ind;
+
+ memset(&vct, 0, sizeof(vct));
+
+ /* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */
+ vct.incr = 1;
+
+ /* Mirror Linux UDP mirror of BSD error message compatibility */
+ /* XXX: Perhaps MSG_MORE someday */
+ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ namelen = msg->msg_namelen;
+ if (namelen != 0) {
+ /* must at least hold a sockaddr_in before sin_family is read */
+ if (namelen < sizeof(*usin)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ switch (usin->sin_family) {
+ case AF_INET:
+ if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+ usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+ ipv4_is_multicast(usin->sin_addr.s_addr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* IPv4 destinations are carried as v4-mapped IPv6 addresses */
+ ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
+ dport = usin->sin_port;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ int addr_type;
+
+ if (namelen < sizeof(*sin6)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* It is a mapped address. Need to do some
+ * sanity checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ ipv4_is_multicast(addr4)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ /* a link-local destination must name its interface */
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ scope_id = sin6->sin6_scope_id;
+ }
+
+ daddr = sin6->sin6_addr;
+ dport = sin6->sin6_port;
+ break;
+ }
+#endif
+
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ /* We only care about consistency with ->connect() */
+ lock_sock(sk);
+ daddr = rs->rs_conn_addr;
+ dport = rs->rs_conn_port;
+ scope_id = rs->rs_bound_scope_id;
+ release_sock(sk);
+ }
+
+ lock_sock(sk);
+ if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
+ release_sock(sk);
+ ret = -ENOTCONN;
+ goto out;
+ } else if (namelen != 0) {
+ /* Cannot send to an IPv4 address using an IPv6 source
+ * address and cannot send to an IPv6 address using an
+ * IPv4 source address.
+ */
+ if (ipv6_addr_v4mapped(&daddr) ^
+ ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ release_sock(sk);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ /* If the socket is already bound to a link local address,
+ * it can only send to peers on the same link. But allow
+ * communicating between link local and non-link local address.
+ */
+ if (scope_id != rs->rs_bound_scope_id) {
+ if (!scope_id) {
+ scope_id = rs->rs_bound_scope_id;
+ } else if (rs->rs_bound_scope_id) {
+ release_sock(sk);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+ release_sock(sk);
+
+ ret = rds_rdma_bytes(msg, &rdma_payload_len);
+ if (ret)
+ goto out;
+
+ /* NOTE(review): total_payload_len is updated here but not read again
+ * anywhere in this function.
+ */
+ total_payload_len += rdma_payload_len;
+ if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ if (zcopy) {
+ if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ /* zerocopy sizes the sg list from the iterator's page count
+ * instead of from payload_len
+ */
+ num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
+ }
+ /* size of rm including all sgs */
+ ret = rds_rm_size(msg, num_sgs, &vct);
+ if (ret < 0)
+ goto out;
+
+ rm = rds_message_alloc(ret, GFP_KERNEL);
+ if (!rm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Attach data to the rm */
+ if (payload_len) {
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ if (IS_ERR(rm->data.op_sg)) {
+ ret = PTR_ERR(rm->data.op_sg);
+ goto out;
+ }
+ ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
+ if (ret)
+ goto out;
+ }
+ rm->data.op_active = 1;
+
+ rm->m_daddr = daddr;
+
+ /* rds_conn_create has a spinlock that runs with IRQ off.
+ * Caching the conn in the socket helps a lot. */
+ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
+ rs->rs_tos == rs->rs_conn->c_tos) {
+ conn = rs->rs_conn;
+ } else {
+ conn = rds_conn_create_outgoing(sock_net(sock->sk),
+ &rs->rs_bound_addr, &daddr,
+ rs->rs_transport, rs->rs_tos,
+ sock->sk->sk_allocation,
+ scope_id);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+ rs->rs_conn = conn;
+ }
+
+ /* multipath-capable transports hash the socket onto one of the paths */
+ if (conn->c_trans->t_mp_capable)
+ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
+ else
+ cpath = &conn->c_path[0];
+
+ rm->m_conn_path = cpath;
+
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct);
+ if (ret) {
+ /* Trigger connection so that its ready for the next retry */
+ if (ret == -EAGAIN)
+ rds_conn_connect_if_down(conn);
+ goto out;
+ }
+
+ if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
+ printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ &rm->rdma, conn->c_trans->xmit_rdma);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+ printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+ &rm->atomic, conn->c_trans->xmit_atomic);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (rds_destroy_pending(conn)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (rds_conn_path_down(cpath))
+ rds_check_all_paths(conn);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+ if (ret) {
+ rs->rs_seen_congestion = 1;
+ goto out;
+ }
+ /* keep trying to queue until it succeeds, the caller is non-blocking,
+ * the timeout expires, or the wait is interrupted
+ */
+ while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
+ dport, &queued)) {
+ rds_stats_inc(s_send_queue_full);
+
+ if (nonblock) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ rds_send_queue_rm(rs, conn, cpath, rm,
+ rs->rs_bound_port,
+ dport,
+ &queued),
+ timeo);
+ rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ /* timeo == 0 means the timeout ran out; negative is the wait's error */
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
+ /*
+ * By now we've committed to the send. We reuse rds_send_worker()
+ * to retry sends in the rds thread if the transport asks us to.
+ */
+ rds_stats_inc(s_send_queued);
+
+ ret = rds_send_xmit(cpath);
+ if (ret == -ENOMEM || ret == -EAGAIN) {
+ /* transient failure: hand the retry to the send worker unless the
+ * connection is being destroyed
+ */
+ ret = 0;
+ rcu_read_lock();
+ if (rds_destroy_pending(cpath->cp_conn))
+ ret = -ENETUNREACH;
+ else
+ queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+ rcu_read_unlock();
+ }
+ if (ret)
+ goto out;
+ /* drop the reference obtained from rds_message_alloc() */
+ rds_message_put(rm);
+
+ for (ind = 0; ind < vct.indx; ind++)
+ kfree(vct.vec[ind].iov);
+ kfree(vct.vec);
+
+ return payload_len;
+
+out:
+ for (ind = 0; ind < vct.indx; ind++)
+ kfree(vct.vec[ind].iov);
+ kfree(vct.vec);
+
+ /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+ * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+ * or in any other way, we need to destroy the MR again */
+ if (allocated_mr)
+ rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
+
+/*
+ * send out a probe. Can be shared by rds_send_ping,
+ * rds_send_pong, rds_send_hb.
+ * rds_send_hb should use h_flags
+ * RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED
+ * or
+ * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
+ *
+ * Allocates a zero-length message with GFP_ATOMIC, queues it on @cp's send
+ * queue with the next tx sequence number and schedules the path's send
+ * work. For a handshake probe (RDS_HS_PROBE) on a multipath-capable
+ * transport the NPATHS and GEN_NUM extension headers are added.
+ *
+ * Returns 0 on success, -ENOMEM when the message cannot be allocated, or
+ * the error returned by rds_cong_wait().
+ */
+static int
+rds_send_probe(struct rds_conn_path *cp, __be16 sport,
+ __be16 dport, u8 h_flags)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ int ret = 0;
+
+ rm = rds_message_alloc(0, GFP_ATOMIC);
+ if (!rm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = cp->cp_conn->c_faddr;
+ rm->data.op_active = 1;
+
+ rds_conn_path_connect_if_down(cp);
+
+ /* third argument is the nonblock flag: never sleep here */
+ ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ spin_lock_irqsave(&cp->cp_lock, flags);
+ list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ /* extra reference taken alongside the queue insertion */
+ rds_message_addref(rm);
+ rm->m_inc.i_conn = cp->cp_conn;
+ rm->m_inc.i_conn_path = cp;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
+ cp->cp_next_tx_seq);
+ rm->m_inc.i_hdr.h_flags |= h_flags;
+ cp->cp_next_tx_seq++;
+
+ if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
+ cp->cp_conn->c_trans->t_mp_capable) {
+ u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
+ u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_NPATHS, &npaths,
+ sizeof(npaths));
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_GEN_NUM,
+ &my_gen_num,
+ sizeof(u32));
+ }
+ spin_unlock_irqrestore(&cp->cp_lock, flags);
+
+ rds_stats_inc(s_send_queued);
+ /* NOTE(review): counted for every probe, including the pings sent
+ * through rds_send_ping().
+ */
+ rds_stats_inc(s_send_pong);
+
+ /* schedule the send work on rds_wq */
+ rcu_read_lock();
+ if (!rds_destroy_pending(cp->cp_conn))
+ queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ rcu_read_unlock();
+
+ /* drop the allocation reference */
+ rds_message_put(rm);
+ return 0;
+
+out:
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
+
+/*
+ * Queue a probe towards @dport on @cp with source port 0 and no extra
+ * header flags. Returns whatever rds_send_probe() returns.
+ */
+int
+rds_send_pong(struct rds_conn_path *cp, __be16 dport)
+{
+	const __be16 sport = 0;
+	const u8 h_flags = 0;
+
+	return rds_send_probe(cp, sport, dport, h_flags);
+}
+
+/*
+ * Send the handshake ping for @conn on path @cp_index, at most once per
+ * connection: conn->c_ping_triggered is tested and set under that path's
+ * cp_lock, and only the caller that sets it goes on to send the probe.
+ */
+void
+rds_send_ping(struct rds_connection *conn, int cp_index)
+{
+	struct rds_conn_path *cp = &conn->c_path[cp_index];
+	unsigned long flags;
+	bool already_triggered;
+
+	spin_lock_irqsave(&cp->cp_lock, flags);
+	already_triggered = conn->c_ping_triggered;
+	if (!already_triggered)
+		conn->c_ping_triggered = 1;
+	spin_unlock_irqrestore(&cp->cp_lock, flags);
+
+	if (already_triggered)
+		return;
+
+	rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0);
+}
+EXPORT_SYMBOL_GPL(rds_send_ping);
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000..9e87da43c
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+/*
+ * Names reported for the global counters. rds_stats_info() treats
+ * struct rds_statistics as a flat array of uint64_t and pairs element i
+ * with rds_stat_names[i], so the order here has to follow the order of the
+ * fields in that structure.
+ */
+static const char *const rds_stat_names[] = {
+ "conn_reset",
+ "recv_drop_bad_checksum",
+ "recv_drop_old_seq",
+ "recv_drop_no_sock",
+ "recv_drop_dead_sock",
+ "recv_deliver_raced",
+ "recv_delivered",
+ "recv_queued",
+ "recv_immediate_retry",
+ "recv_delayed_retry",
+ "recv_ack_required",
+ "recv_rdma_bytes",
+ "recv_ping",
+ "send_queue_empty",
+ "send_queue_full",
+ "send_lock_contention",
+ "send_lock_queue_raced",
+ "send_immediate_retry",
+ "send_delayed_retry",
+ "send_drop_acked",
+ "send_ack_required",
+ "send_queued",
+ "send_rdma",
+ "send_rdma_bytes",
+ "send_pong",
+ "page_remainder_hit",
+ "page_remainder_miss",
+ "copy_to_user",
+ "copy_from_user",
+ "cong_update_queued",
+ "cong_update_received",
+ "cong_send_error",
+ "cong_send_blocked",
+ "recv_bytes_added_to_sock",
+ "recv_bytes_freed_fromsock",
+ "send_stuck_rm",
+};
+
+/*
+ * Emit @nr counters through @iter. Entry i is built from names[i] and
+ * values[i] and copied out as one struct rds_info_counter.
+ *
+ * A name that does not fit in the counter's name field (including its
+ * terminating NUL) triggers BUG_ON().
+ */
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, const char *const *names, size_t nr)
+{
+	struct rds_info_counter ctr;
+	size_t idx = 0;
+
+	while (idx < nr) {
+		const char *name = names[idx];
+
+		BUG_ON(strlen(name) >= sizeof(ctr.name));
+		/* strncpy zero-fills the rest of the field; terminate explicitly */
+		strncpy(ctr.name, name, sizeof(ctr.name) - 1);
+		ctr.name[sizeof(ctr.name) - 1] = '\0';
+		ctr.value = values[idx];
+
+		rds_info_copy(iter, &ctr, sizeof(ctr));
+		idx++;
+	}
+}
+EXPORT_SYMBOL_GPL(rds_stats_info_copy);
+
+/*
+ * This gives global counters across all the transports. The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting. Some are pretty implementation dependent
+ * and may change over time. That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte granular
+ * length in userspace. It converts it to number of stat entries that the
+ * rest of the functions operate in.
+ *
+ * When the caller's buffer cannot hold all global counters nothing is
+ * copied and the transports are asked to copy nothing either (avail 0);
+ * lens->nr still reports the total number of entries.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	const size_t nr_global = ARRAY_SIZE(rds_stat_names);
+	struct rds_statistics stats = {0, };
+	uint64_t *total = (uint64_t *)&stats;
+	unsigned int avail = len / sizeof(struct rds_info_counter);
+	int cpu;
+
+	if (avail >= nr_global) {
+		/* fold every online CPU's counters into the local total */
+		for_each_online_cpu(cpu) {
+			uint64_t *percpu = (uint64_t *)&(per_cpu(rds_stats, cpu));
+			size_t i;
+
+			for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+				total[i] += percpu[i];
+		}
+
+		rds_stats_info_copy(iter, total, rds_stat_names, nr_global);
+		avail -= nr_global;
+	} else {
+		avail = 0;
+	}
+
+	lens->each = sizeof(struct rds_info_counter);
+	lens->nr = rds_trans_stats_info_copy(iter, avail) + nr_global;
+}
+
+/* Unregister rds_stats_info() as the handler for RDS_INFO_COUNTERS. */
+void rds_stats_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+/*
+ * Register rds_stats_info() as the handler for RDS_INFO_COUNTERS.
+ * Always returns 0.
+ */
+int rds_stats_init(void)
+{
+ rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+ return 0;
+}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000..e381bbcd9
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/* handle returned by register_net_sysctl(); needed to unregister again */
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+/* lower and upper bounds wired into the reconnect delay sysctl entries;
+ * rds_sysctl_init() overwrites the minimum with msecs_to_jiffies(1)
+ */
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+/* current reconnect delay limits, exposed as reconnect_{min,max}_delay_ms */
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+/* exposed as max_unacked_packets and max_unacked_bytes (default 16 MiB) */
+unsigned int rds_sysctl_max_unacked_packets = 8;
+unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
+
+/* exposed as ping_enable */
+unsigned int rds_sysctl_ping_enable = 1;
+
+/*
+ * Entries registered under "net/rds" by rds_sysctl_init().
+ *
+ * The two reconnect delay entries bound each other: the minimum may not
+ * exceed the current maximum and the maximum may not drop below the
+ * current minimum.
+ *
+ * NOTE(review): the last three entries point at unsigned int variables but
+ * use proc_dointvec, the signed int handler.
+ */
+static struct ctl_table rds_sysctl_rds_table[] = {
+ {
+ .procname = "reconnect_min_delay_ms",
+ .data = &rds_sysctl_reconnect_min_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min,
+ .extra2 = &rds_sysctl_reconnect_max_jiffies,
+ },
+ {
+ .procname = "reconnect_max_delay_ms",
+ .data = &rds_sysctl_reconnect_max_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min_jiffies,
+ .extra2 = &rds_sysctl_reconnect_max,
+ },
+ {
+ .procname = "max_unacked_packets",
+ .data = &rds_sysctl_max_unacked_packets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_unacked_bytes",
+ .data = &rds_sysctl_max_unacked_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ping_enable",
+ .data = &rds_sysctl_ping_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ /* empty entry terminates the table */
+ { }
+};
+
+/* Unregister the sysctl table registered by rds_sysctl_init(). */
+void rds_sysctl_exit(void)
+{
+ unregister_net_sysctl_table(rds_sysctl_reg_table);
+}
+
+/*
+ * Set the minimum reconnect delay (both the bound and the current value)
+ * to one millisecond expressed in jiffies, then register the RDS sysctl
+ * table under "net/rds" in the initial network namespace.
+ *
+ * Returns 0 on success, -ENOMEM when registration fails.
+ */
+int rds_sysctl_init(void)
+{
+	struct ctl_table_header *hdr;
+
+	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+	hdr = register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table);
+	rds_sysctl_reg_table = hdr;
+
+	return hdr ? 0 : -ENOMEM;
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 000000000..4444fd82b
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/addrconf.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* only for info exporting */
+/* list of rds_tcp_connection structs that currently have a socket attached
+ * (entries are added in rds_tcp_set_callbacks() and removed in
+ * rds_tcp_restore_callbacks()), protected by rds_tcp_tc_list_lock
+ */
+static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
+static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
+static unsigned int rds_tcp_tc_count;
+#if IS_ENABLED(CONFIG_IPV6)
+static unsigned int rds6_tcp_tc_count;
+#endif
+
+/* Track rds_tcp_connection structs so they can be cleaned up */
+static DEFINE_SPINLOCK(rds_tcp_conn_lock);
+static LIST_HEAD(rds_tcp_conn_list);
+static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
+
+static struct kmem_cache *rds_tcp_conn_slab;
+
+/* forward declaration: the sysctl table below refers to the handler */
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *fpos);
+
+/* lower bounds wired into the rds_tcp_sndbuf / rds_tcp_rcvbuf entries */
+static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+
+/*
+ * Template for the RDS/TCP buffer size sysctls. The .data pointers are
+ * left unset here ("per-net pointer"); RDS_TCP_SNDBUF and RDS_TCP_RCVBUF
+ * name the index of each entry in the table. Both entries are handled by
+ * rds_tcp_skbuf_handler() with a minimum value supplied through .extra1.
+ */
+static struct ctl_table rds_tcp_sysctl_table[] = {
+#define RDS_TCP_SNDBUF 0
+ {
+ .procname = "rds_tcp_sndbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_sndbuf,
+ },
+#define RDS_TCP_RCVBUF 1
+ {
+ .procname = "rds_tcp_rcvbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_rcvbuf,
+ },
+ /* empty entry terminates the table */
+ { }
+};
+
+/* Return write_seq of the TCP socket attached to @tc. */
+u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
+{
+	struct tcp_sock *tp = tcp_sk(tc->t_sock->sk);
+
+	/* seq# of the last byte of data in tcp send buffer */
+	return tp->write_seq;
+}
+
+/* Return snd_una of the TCP socket attached to @tc. */
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
+{
+	struct tcp_sock *tp = tcp_sk(tc->t_sock->sk);
+
+	return tp->snd_una;
+}
+
+/*
+ * Detach @sock from @tc: take tc off rds_tcp_tc_list (adjusting the
+ * connection counters), clear tc->t_sock, and put the socket's original
+ * write_space / data_ready / state_change callbacks back. Everything is
+ * done with the socket's sk_callback_lock held for writing.
+ */
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc)
+{
+ rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+
+ /* done under the callback_lock to serialize with write_space */
+ spin_lock(&rds_tcp_tc_list_lock);
+ list_del_init(&tc->t_list_item);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds6_tcp_tc_count--;
+#endif
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count--;
+ spin_unlock(&rds_tcp_tc_list_lock);
+
+ /* tc is already off the list here, so list walkers no longer reach it */
+ tc->t_sock = NULL;
+
+ sock->sk->sk_write_space = tc->t_orig_write_space;
+ sock->sk->sk_data_ready = tc->t_orig_data_ready;
+ sock->sk->sk_state_change = tc->t_orig_state_change;
+ sock->sk->sk_user_data = NULL;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * rds_tcp_reset_callbacks() switches the tc to the new sock. If the tc
+ * already had a socket, that socket's callbacks are restored and the
+ * socket is released before the new one is installed.
+ *
+ * The only functions that set tc->t_sock are rds_tcp_set_callbacks
+ * and rds_tcp_reset_callbacks. Send and receive trust that
+ * it is set. The absence of RDS_CONN_UP bit protects those paths
+ * from being called while it isn't set.
+ */
+void rds_tcp_reset_callbacks(struct socket *sock,
+ struct rds_conn_path *cp)
+{
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
+ struct socket *osock = tc->t_sock;
+
+ /* no previous socket: nothing to quiesce or tear down */
+ if (!osock)
+ goto newsock;
+
+ /* Need to resolve a duelling SYN between peers.
+ * We have an outstanding SYN to this peer, which may
+ * potentially have transitioned to the RDS_CONN_UP state,
+ * so we must quiesce any send threads before resetting
+ * cp_transport_data. We quiesce these threads by setting
+ * cp_state to something other than RDS_CONN_UP, and then
+ * waiting for any existing threads in rds_send_xmit to
+ * complete release_in_xmit(). (Subsequent threads entering
+ * rds_send_xmit() will bail on !rds_conn_up().)
+ *
+ * However an incoming syn-ack at this point would end up
+ * marking the conn as RDS_CONN_UP, and would again permit
+ * rds_send_xmit() threads through, so ideally we would
+ * synchronize on RDS_CONN_UP after lock_sock(), but cannot
+ * do that: waiting on !RDS_IN_XMIT after lock_sock() may
+ * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT
+ * would not get set. As a result, we set c_state to
+ * RDS_CONN_RESETTING, to ensure that rds_tcp_state_change
+ * cannot mark rds_conn_path_up() in the window before lock_sock()
+ */
+ atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
+ wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
+ /* reset receive side state for rds_tcp_data_recv() for osock */
+ cancel_delayed_work_sync(&cp->cp_send_w);
+ cancel_delayed_work_sync(&cp->cp_recv_w);
+ lock_sock(osock->sk);
+ /* drop any partially received incoming message */
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ rds_tcp_restore_callbacks(osock, tc);
+ release_sock(osock->sk);
+ sock_release(osock);
+newsock:
+ rds_send_path_reset(cp);
+ lock_sock(sock->sk);
+ rds_tcp_set_callbacks(sock, cp);
+ release_sock(sock->sk);
+}
+
+/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments
+ * above rds_tcp_reset_callbacks for notes about synchronization
+ * with data path
+ *
+ * The socket's original data_ready / write_space / state_change callbacks
+ * are saved in tc and replaced by the RDS/TCP ones, and sk_user_data is
+ * pointed at @cp. Everything is done with the socket's sk_callback_lock
+ * held for writing.
+ */
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+
+	rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* Set t_sock before tc becomes reachable through rds_tcp_tc_list:
+	 * the info handlers walk that list under rds_tcp_tc_list_lock and
+	 * dereference tc->t_sock->sk, so an entry must never be visible
+	 * on the list with a NULL or stale socket pointer.
+	 */
+	tc->t_sock = sock;
+
+	/* done under the callback_lock to serialize with write_space */
+	spin_lock(&rds_tcp_tc_list_lock);
+	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
+#if IS_ENABLED(CONFIG_IPV6)
+	rds6_tcp_tc_count++;
+#endif
+	if (!tc->t_cpath->cp_conn->c_isv6)
+		rds_tcp_tc_count++;
+	spin_unlock(&rds_tcp_tc_list_lock);
+
+	/* accepted sockets need our listen data ready undone */
+	if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
+		sock->sk->sk_data_ready = sock->sk->sk_user_data;
+
+	tc->t_cpath = cp;
+	tc->t_orig_data_ready = sock->sk->sk_data_ready;
+	tc->t_orig_write_space = sock->sk->sk_write_space;
+	tc->t_orig_state_change = sock->sk->sk_state_change;
+
+	sock->sk->sk_user_data = cp;
+	sock->sk->sk_data_ready = rds_tcp_data_ready;
+	sock->sk->sk_write_space = rds_tcp_write_space;
+	sock->sk->sk_state_change = rds_tcp_state_change;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
+ * connections for backward compatibility.
+ *
+ * One struct rds_info_tcp_socket is copied out per IPv4 entry on
+ * rds_tcp_tc_list, but only when the caller's buffer (@len bytes) can
+ * hold rds_tcp_tc_count entries. lens->nr and lens->each are filled in
+ * either way.
+ */
+static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	struct rds_info_tcp_socket tsinfo;
+	struct rds_tcp_connection *tc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+	/* copy entries out only if all of them fit */
+	if (len / sizeof(tsinfo) >= rds_tcp_tc_count) {
+		list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+			struct inet_sock *inet = inet_sk(tc->t_sock->sk);
+			struct rds_connection *conn = tc->t_cpath->cp_conn;
+
+			if (conn->c_isv6)
+				continue;
+
+			tsinfo.local_addr = inet->inet_saddr;
+			tsinfo.local_port = inet->inet_sport;
+			tsinfo.peer_addr = inet->inet_daddr;
+			tsinfo.peer_port = inet->inet_dport;
+
+			tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
+			tsinfo.data_rem = tc->t_tinc_data_rem;
+			tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
+			tsinfo.last_expected_una = tc->t_last_expected_una;
+			tsinfo.last_seen_una = tc->t_last_seen_una;
+			tsinfo.tos = tc->t_cpath->cp_conn->c_tos;
+
+			rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
+		}
+	}
+
+	lens->nr = rds_tcp_tc_count;
+	lens->each = sizeof(tsinfo);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* RDS6_INFO_TCP_SOCKETS handler.  Emits one rds6_info_tcp_socket record for
+ * every connection on rds_tcp_tc_list, IPv4 and IPv6 alike; addresses are
+ * taken from the socket's sk_v6_rcv_saddr / sk_v6_daddr.  Records are copied
+ * only when @len has room for rds6_tcp_tc_count of them; @lens always
+ * receives the count and the per-record size.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+			     struct rds_info_iterator *iter,
+			     struct rds_info_lengths *lens)
+{
+	struct rds6_info_tcp_socket rec;
+	struct rds_tcp_connection *tc;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, irqflags);
+
+	if (len / sizeof(rec) >= rds6_tcp_tc_count) {
+		list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+			struct sock *sk = tc->t_sock->sk;
+			struct inet_sock *isk = inet_sk(sk);
+
+			/* TCP endpoints */
+			rec.local_addr = sk->sk_v6_rcv_saddr;
+			rec.local_port = isk->inet_sport;
+			rec.peer_addr = sk->sk_v6_daddr;
+			rec.peer_port = isk->inet_dport;
+
+			/* receive reassembly and sequence tracking state */
+			rec.hdr_rem = tc->t_tinc_hdr_rem;
+			rec.data_rem = tc->t_tinc_data_rem;
+			rec.last_sent_nxt = tc->t_last_sent_nxt;
+			rec.last_expected_una = tc->t_last_expected_una;
+			rec.last_seen_una = tc->t_last_seen_una;
+
+			rds_info_copy(iter, &rec, sizeof(rec));
+		}
+	}
+
+	lens->nr = rds6_tcp_tc_count;
+	lens->each = sizeof(rec);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, irqflags);
+}
+#endif
+
+/* Check whether @addr is a local address in @net.
+ *
+ * A v4-mapped @addr is looked up as an IPv4 address.  Any other address is
+ * checked with ipv6_chk_addr(); when @scope_id is non-zero only the
+ * interface with that index is considered.
+ *
+ * Returns 0 if the address is local, -EADDRNOTAVAIL otherwise (including an
+ * invalid @scope_id, or a non-mapped address on a kernel without IPv6).
+ */
+int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+			__u32 scope_id)
+{
+	struct net_device *dev = NULL;
+	int ret = -EADDRNOTAVAIL;
+
+	if (ipv6_addr_v4mapped(addr)) {
+		if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
+			return 0;
+		return -EADDRNOTAVAIL;
+	}
+
+	/* dev_get_by_index_rcu() takes no reference on the device, so the
+	 * RCU read-side section has to cover every use of @dev, including
+	 * the ipv6_chk_addr() call below.
+	 */
+	rcu_read_lock();
+
+	/* If the scope_id is specified, check only those addresses
+	 * hosted on the specified interface.
+	 */
+	if (scope_id != 0) {
+		dev = dev_get_by_index_rcu(net, scope_id);
+		/* scope_id is not valid... */
+		if (!dev)
+			goto out;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ipv6_chk_addr(net, addr, dev, 0))
+		ret = 0;
+#endif
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+/* Transport conn_free hook: unlink the rds_tcp_connection passed as @arg
+ * from rds_tcp_conn_list (unless it is marked detached) and return it to
+ * rds_tcp_conn_slab.
+ */
+static void rds_tcp_conn_free(void *arg)
+{
+	struct rds_tcp_connection *tconn = arg;
+	unsigned long irqflags;
+
+	rdsdebug("freeing tc %p\n", tconn);
+
+	spin_lock_irqsave(&rds_tcp_conn_lock, irqflags);
+	/* detached entries are not on the list, so there is nothing to unlink */
+	if (!tconn->t_tcp_node_detached)
+		list_del(&tconn->t_tcp_node);
+	spin_unlock_irqrestore(&rds_tcp_conn_lock, irqflags);
+
+	kmem_cache_free(rds_tcp_conn_slab, tconn);
+}
+
+/* Transport conn_alloc hook: allocate one rds_tcp_connection for each of
+ * the RDS_MPATH_WORKERS paths of @conn and, once every allocation has
+ * succeeded, link them all on rds_tcp_conn_list.
+ *
+ * Returns 0 on success.  If an allocation fails, the connections allocated
+ * so far are freed and -ENOMEM is returned.
+ */
+static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_tcp_connection *tc;
+	int i, j;
+
+	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+		struct rds_conn_path *cp = &conn->c_path[i];
+
+		tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+		if (!tc)
+			goto unwind;
+
+		mutex_init(&tc->t_conn_path_lock);
+		tc->t_sock = NULL;
+		tc->t_tinc = NULL;
+		tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+		tc->t_tinc_data_rem = 0;
+
+		/* not on rds_tcp_conn_list until all paths are allocated */
+		tc->t_tcp_node_detached = true;
+		tc->t_cpath = cp;
+		cp->cp_transport_data = tc;
+
+		rdsdebug("rds_conn_path [%d] tc %p\n", i,
+			 cp->cp_transport_data);
+	}
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+		tc = conn->c_path[i].cp_transport_data;
+		tc->t_tcp_node_detached = false;
+		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+	return 0;
+
+unwind:
+	/* paths [0, i) were set up before the failure */
+	for (j = 0; j < i; j++)
+		rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
+	return -ENOMEM;
+}
+
+/* Return true if @list (linked through t_tcp_node) already holds an
+ * rds_tcp_connection whose path belongs to @conn.
+ */
+static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
+{
+	struct rds_tcp_connection *pos;
+	bool found = false;
+
+	list_for_each_entry(pos, list, t_tcp_node) {
+		if (pos->t_cpath->cp_conn == conn) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+/* Raise the module-wide unloading flag read by rds_tcp_is_unloading(). */
+static void rds_tcp_set_unloading(void)
+{
+	atomic_set(&rds_tcp_unloading, 1);
+}
+
+/* Transport t_unloading hook: report whether rds_tcp_set_unloading() has
+ * been called.  The flag is module-wide, so @conn is not consulted.
+ */
+static bool rds_tcp_is_unloading(struct rds_connection *conn)
+{
+	return !!atomic_read(&rds_tcp_unloading);
+}
+
+/* Destroy every RDS connection that still has an entry on
+ * rds_tcp_conn_list.  One entry per connection is moved to a private list
+ * under the lock; rds_conn_destroy() is then called with the lock dropped.
+ */
+static void rds_tcp_destroy_conns(void)
+{
+	struct rds_tcp_connection *tc, *next;
+	LIST_HEAD(doomed);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, next, &rds_tcp_conn_list, t_tcp_node) {
+		struct rds_connection *conn = tc->t_cpath->cp_conn;
+
+		/* one representative per connection is enough */
+		if (list_has_conn(&doomed, conn))
+			continue;
+		list_move_tail(&tc->t_tcp_node, &doomed);
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	list_for_each_entry_safe(tc, next, &doomed, t_tcp_node)
+		rds_conn_destroy(tc->t_cpath->cp_conn);
+}
+
+static void rds_tcp_exit(void);
+
+/* Transport get_tos_map hook.  The TCP transport maps every user supplied
+ * @tos value to the default, 0.
+ */
+static u8 rds_tcp_get_tos_map(u8 tos)
+{
+	return 0;
+}
+
+/* Operations table for the RDS "tcp" transport; registered with
+ * rds_trans_register() in rds_tcp_init().
+ */
+struct rds_transport rds_tcp_transport = {
+	/* identity and capabilities */
+	.t_owner		= THIS_MODULE,
+	.t_name			= "tcp",
+	.t_type			= RDS_TRANS_TCP,
+	.t_prefer_loopback	= 1,
+	.t_mp_capable		= 1,
+	.t_unloading		= rds_tcp_is_unloading,
+
+	/* address checks and connection lifetime */
+	.laddr_check		= rds_tcp_laddr_check,
+	.conn_alloc		= rds_tcp_conn_alloc,
+	.conn_free		= rds_tcp_conn_free,
+	.conn_path_connect	= rds_tcp_conn_path_connect,
+	.conn_path_shutdown	= rds_tcp_conn_path_shutdown,
+
+	/* send side */
+	.xmit_path_prepare	= rds_tcp_xmit_path_prepare,
+	.xmit_path_complete	= rds_tcp_xmit_path_complete,
+	.xmit			= rds_tcp_xmit,
+
+	/* receive side */
+	.recv_path		= rds_tcp_recv_path,
+	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
+	.inc_free		= rds_tcp_inc_free,
+
+	/* misc */
+	.stats_info_copy	= rds_tcp_stats_info_copy,
+	.get_tos_map		= rds_tcp_get_tos_map,
+	.exit			= rds_tcp_exit,
+};
+
+static unsigned int rds_tcp_netid;	/* pernet id, registered through rds_tcp_net_ops.id */
+
+/* per-network namespace private data for this module; looked up with
+ * net_generic(net, rds_tcp_netid)
+ */
+struct rds_tcp_net {
+ struct socket *rds_tcp_listen_sock;	/* from rds_tcp_listen_init(); set to NULL by rds_tcp_kill_sock() */
+ struct work_struct rds_tcp_accept_w;	/* runs rds_tcp_accept_worker(); queued by rds_tcp_accept_work() */
+ struct ctl_table_header *rds_tcp_sysctl;	/* handle returned by register_net_sysctl() */
+ struct ctl_table *ctl_table;	/* kmemdup'ed sysctl table; only set for namespaces other than init_net */
+ int sndbuf_size;	/* if > 0, rds_tcp_tune() applies it to sk_sndbuf; 0 lets the stack pick */
+ int rcvbuf_size;	/* if > 0, rds_tcp_tune() applies it to sk_rcvbuf; 0 lets the stack pick */
+};
+
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation.
+ *
+ * Disables Nagle, makes sure the socket holds a reference on its network
+ * namespace, and applies the per-netns sndbuf/rcvbuf overrides when they
+ * are set.  Returns false only if the namespace reference cannot be taken.
+ */
+bool rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct rds_tcp_net *rtn;
+	bool tuned = false;
+
+	tcp_sock_set_nodelay(sk);
+	lock_sock(sk);
+
+	/* TCP timer functions might access net namespace even after
+	 * a process which created this net namespace terminated.
+	 */
+	if (!sk->sk_net_refcnt) {
+		if (!maybe_get_net(net))
+			goto unlock;
+		sk->sk_net_refcnt = 1;
+		netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
+		sock_inuse_add(net, 1);
+	}
+
+	rtn = net_generic(net, rds_tcp_netid);
+
+	/* a non-positive size leaves the stack's own buffer sizing alone */
+	if (rtn->sndbuf_size > 0) {
+		sk->sk_sndbuf = rtn->sndbuf_size;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rtn->rcvbuf_size > 0) {
+		sk->sk_rcvbuf = rtn->rcvbuf_size;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+	tuned = true;
+unlock:
+	release_sock(sk);
+	return tuned;
+}
+
+/* Work function for rds_tcp_net.rds_tcp_accept_w: keep calling
+ * rds_tcp_accept_one() on the namespace's listening socket until it returns
+ * non-zero, yielding the CPU between calls.
+ */
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+	struct rds_tcp_net *rtn;
+
+	rtn = container_of(work, struct rds_tcp_net, rds_tcp_accept_w);
+
+	for (;;) {
+		/* the listen socket pointer is re-read on every pass */
+		if (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) != 0)
+			break;
+		cond_resched();
+	}
+}
+
+/* Queue the accept work of the network namespace that owns @sk on rds_wq. */
+void rds_tcp_accept_work(struct sock *sk)
+{
+	struct rds_tcp_net *rtn = net_generic(sock_net(sk), rds_tcp_netid);
+
+	queue_work(rds_wq, &rtn->rds_tcp_accept_w);
+}
+
+/* Per-netns init: register the net/rds/tcp sysctls and start the RDS-TCP
+ * listening socket for @net.
+ *
+ * Returns 0 on success, -ENOMEM if the sysctl table cannot be duplicated or
+ * registered, or -EAFNOSUPPORT if no listening socket could be set up.
+ */
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table *tbl;
+	int err = 0;
+
+	memset(rtn, 0, sizeof(*rtn));
+
+	/* The accept work has to be initialised before the listener exists:
+	 * rds_tcp_listen_data_ready() queues it (via rds_tcp_accept_work())
+	 * as soon as a connection shows up on the listening socket.
+	 */
+	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
+
+	/* {snd, rcv}buf_size default to 0, which implies we let the
+	 * stack pick the value, and permit auto-tuning of buffer size.
+	 */
+	if (net == &init_net) {
+		tbl = rds_tcp_sysctl_table;
+	} else {
+		tbl = kmemdup(rds_tcp_sysctl_table,
+			      sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
+		if (!tbl) {
+			pr_warn("could not allocate sysctl table\n");
+			return -ENOMEM;
+		}
+		rtn->ctl_table = tbl;
+	}
+	tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
+	tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
+	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
+	if (!rtn->rds_tcp_sysctl) {
+		pr_warn("could not register sysctl\n");
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	/* Prefer an IPv6 listener when the kernel is built with IPv6. */
+	rtn->rds_tcp_listen_sock =
+		rds_tcp_listen_init(net, IS_ENABLED(CONFIG_IPV6));
+	if (!rtn->rds_tcp_listen_sock && IS_ENABLED(CONFIG_IPV6)) {
+		pr_warn("could not set up IPv6 listen sock\n");
+
+		/* Try IPv4 as some systems disable IPv6 */
+		rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+	}
+	if (!rtn->rds_tcp_listen_sock) {
+		pr_warn("could not set up IPv4 listen sock\n");
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+		rtn->rds_tcp_sysctl = NULL;
+		err = -EAFNOSUPPORT;
+		goto fail;
+	}
+	return 0;
+
+fail:
+	/* only non-init namespaces own a duplicated table */
+	if (net != &init_net)
+		kfree(tbl);
+	return err;
+}
+
+/* Tear down the RDS-TCP state of @net: stop its listening socket, then
+ * destroy every RDS connection in that namespace that has entries on
+ * rds_tcp_conn_list.
+ */
+static void rds_tcp_kill_sock(struct net *net)
+{
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct socket *lsock = rtn->rds_tcp_listen_sock;
+	struct rds_tcp_connection *tc, *next;
+	LIST_HEAD(doomed);
+
+	rtn->rds_tcp_listen_sock = NULL;
+	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, next, &rds_tcp_conn_list, t_tcp_node) {
+		struct rds_connection *conn = tc->t_cpath->cp_conn;
+
+		if (read_pnet(&conn->c_net) != net)
+			continue;
+
+		if (list_has_conn(&doomed, conn)) {
+			/* connection already queued: just unlink this path and
+			 * mark it so rds_tcp_conn_free() skips the list_del
+			 */
+			list_del(&tc->t_tcp_node);
+			tc->t_tcp_node_detached = true;
+		} else {
+			list_move_tail(&tc->t_tcp_node, &doomed);
+		}
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	list_for_each_entry_safe(tc, next, &doomed, t_tcp_node)
+		rds_conn_destroy(tc->t_cpath->cp_conn);
+}
+
+/* Per-netns exit: kill the namespace's sockets and connections, then drop
+ * its sysctl registration and, for namespaces other than init_net, the
+ * duplicated sysctl table.
+ */
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table_header *hdr;
+
+	rds_tcp_kill_sock(net);
+
+	hdr = rtn->rds_tcp_sysctl;
+	if (hdr)
+		unregister_net_sysctl_table(hdr);
+
+	if (net != &init_net)
+		kfree(rtn->ctl_table);
+}
+
+/* Per-network-namespace hooks; .size asks for a struct rds_tcp_net per
+ * namespace, addressed through rds_tcp_netid.
+ */
+static struct pernet_operations rds_tcp_net_ops = {
+	.id	= &rds_tcp_netid,
+	.size	= sizeof(struct rds_tcp_net),
+	.init	= rds_tcp_init_net,
+	.exit	= rds_tcp_exit_net,
+};
+
+/* Return the sk_user_data of @net's listening socket (rds_tcp_listen_init()
+ * stores the socket's original data_ready callback there), or NULL when the
+ * namespace has no listening socket.
+ */
+void *rds_tcp_listen_sock_def_readable(struct net *net)
+{
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct socket *lsock = rtn->rds_tcp_listen_sock;
+
+	return lsock ? lsock->sk->sk_user_data : NULL;
+}
+
+/* When sysctl is used to modify some kernel socket parameters, this
+ * function resets the RDS connections in that netns so that we can
+ * restart with new parameters. The assumption is that such reset
+ * events are few and far-between.
+ *
+ * Only paths that belong to @net and currently have a socket are dropped.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+	struct rds_tcp_connection *tc, *next;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, next, &rds_tcp_conn_list, t_tcp_node) {
+		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
+
+		if (c_net == net && tc->t_sock) {
+			/* reconnect with new parameters */
+			rds_conn_path_drop(tc->t_cpath, false);
+		}
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+/* sysctl handler for the RDS-TCP buffer-size knobs.  Parses the value with
+ * proc_dointvec_minmax() and, after a successful write, resets the RDS-TCP
+ * connections of the caller's network namespace so they reconnect with the
+ * new setting.  Returns 0 on success or the negative error from parsing.
+ *
+ * NOTE(review): the namespace comes from the calling task
+ * (current->nsproxy), not from @ctl - confirm the two always agree.
+ */
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+				 void *buffer, size_t *lenp, loff_t *fpos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	int rc;
+
+	rc = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
+	if (rc < 0) {
+		pr_warn("Invalid input. Must be >= %d\n",
+			*(int *)(ctl->extra1));
+		return rc;
+	}
+
+	/* reads leave the connections alone */
+	if (write)
+		rds_tcp_sysctl_reset(net);
+
+	return 0;
+}
+
+static void rds_tcp_exit(void)	/* module unload path; also installed as rds_tcp_transport.exit */
+{
+ rds_tcp_set_unloading();	/* rds_tcp_is_unloading() reports true from here on */
+ synchronize_rcu();	/* NOTE(review): presumably waits out RCU readers that sampled the flag earlier - confirm */
+ rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
+ unregister_pernet_device(&rds_tcp_net_ops);	/* drops the rds_tcp_init_net()/rds_tcp_exit_net() hooks */
+ rds_tcp_destroy_conns();	/* rds_conn_destroy() for whatever is still on rds_tcp_conn_list */
+ rds_trans_unregister(&rds_tcp_transport);
+ rds_tcp_recv_exit();
+ kmem_cache_destroy(rds_tcp_conn_slab);	/* slab created in rds_tcp_init() */
+}
+module_exit(rds_tcp_exit);
+
+/* Module init: create the rds_tcp_connection slab, set up the receive
+ * side, register the per-netns operations, then register the transport and
+ * the info handlers.  Returns 0 on success; on failure everything set up so
+ * far is undone and a negative errno is returned.
+ */
+static int __init rds_tcp_init(void)
+{
+	int ret;
+
+	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
+					      sizeof(struct rds_tcp_connection),
+					      0, 0, NULL);
+	if (!rds_tcp_conn_slab)
+		return -ENOMEM;
+
+	ret = rds_tcp_recv_init();
+	if (ret)
+		goto out_slab;
+
+	ret = register_pernet_device(&rds_tcp_net_ops);
+	if (ret)
+		goto out_recv;
+
+	rds_trans_register(&rds_tcp_transport);
+
+	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+	rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
+	return 0;
+
+out_recv:
+	rds_tcp_recv_exit();
+out_slab:
+	kmem_cache_destroy(rds_tcp_conn_slab);
+	return ret;
+}
+module_init(rds_tcp_init);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: TCP transport");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 000000000..f8b5930d7
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _RDS_TCP_H
+#define _RDS_TCP_H
+
+#define RDS_TCP_PORT 16385
+
+struct rds_tcp_incoming {	/* TCP-transport wrapper around a generic rds_incoming */
+ struct rds_incoming ti_inc;	/* embedded generic part; container_of() maps it back to this struct */
+ struct sk_buff_head ti_skb_list;	/* skb queue; emptied by rds_tcp_inc_purge() */
+};
+
+struct rds_tcp_connection {	/* per-path TCP transport state; stored in rds_conn_path.cp_transport_data */
+
+ struct list_head t_tcp_node;	/* link on rds_tcp_conn_list, protected by rds_tcp_conn_lock */
+ bool t_tcp_node_detached;	/* true while t_tcp_node is not on rds_tcp_conn_list */
+ struct rds_conn_path *t_cpath;	/* back pointer to the owning path */
+ /* t_conn_path_lock synchronizes the connection establishment between
+ * rds_tcp_accept_one and rds_tcp_conn_path_connect
+ */
+ struct mutex t_conn_path_lock;
+ struct socket *t_sock;	/* NULL after allocation; set by rds_tcp_set_callbacks() */
+ void *t_orig_write_space;	/* socket callbacks saved by rds_tcp_set_callbacks() ... */
+ void *t_orig_data_ready;	/* ... before the RDS callbacks are installed */
+ void *t_orig_state_change;	/* chained to from rds_tcp_state_change() */
+
+ struct rds_tcp_incoming *t_tinc;	/* incoming message held by this path, if any; released on path shutdown */
+ size_t t_tinc_hdr_rem;	/* initialised/reset to sizeof(struct rds_header) */
+ size_t t_tinc_data_rem;	/* initialised/reset to 0 */
+
+ /* XXX error report? */
+ struct work_struct t_conn_w;
+ struct work_struct t_send_w;
+ struct work_struct t_down_w;
+ struct work_struct t_recv_w;
+
+ /* for info exporting only */
+ struct list_head t_list_item;	/* link on rds_tcp_tc_list, protected by rds_tcp_tc_list_lock */
+ u32 t_last_sent_nxt;
+ u32 t_last_expected_una;
+ u32 t_last_seen_una;
+};
+
+struct rds_tcp_statistics {	/* counters behind the per-CPU rds_tcp_stats; bumped with rds_tcp_stats_inc() */
+ uint64_t s_tcp_data_ready_calls;
+ uint64_t s_tcp_write_space_calls;
+ uint64_t s_tcp_sndbuf_full;
+ uint64_t s_tcp_connect_raced;
+ uint64_t s_tcp_listen_closed_stale;
+};
+
+/* tcp.c */
+bool rds_tcp_tune(struct socket *sock);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
+void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc);
+u32 rds_tcp_write_seq(struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
+u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
+extern struct rds_transport rds_tcp_transport;
+void rds_tcp_accept_work(struct sock *sk);
+int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id);
+/* tcp_connect.c */
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp);
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
+void rds_tcp_state_change(struct sock *sk);
+
+/* tcp_listen.c */
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
+void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
+void rds_tcp_listen_data_ready(struct sock *sk);
+int rds_tcp_accept_one(struct socket *sock);
+void rds_tcp_keepalive(struct socket *sock);
+void *rds_tcp_listen_sock_def_readable(struct net *net);
+
+/* tcp_recv.c */
+int rds_tcp_recv_init(void);
+void rds_tcp_recv_exit(void);
+void rds_tcp_data_ready(struct sock *sk);
+int rds_tcp_recv_path(struct rds_conn_path *cp);
+void rds_tcp_inc_free(struct rds_incoming *inc);
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+
+/* tcp_send.c */
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp);
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp);
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_tcp_write_space(struct sock *sk);
+
+/* tcp_stats.c */
+DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
+#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 000000000..a0046e99d
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* sk_state_change callback installed by rds_tcp_set_callbacks().  Maps the
+ * TCP state of @sk onto the RDS path stored in sk_user_data and then chains
+ * to the state_change handler that was saved when the callbacks were set.
+ * If no path is attached, the socket's current handler is called instead.
+ */
+void rds_tcp_state_change(struct sock *sk)
+{
+	void (*orig_state_change)(struct sock *sk);
+	struct rds_tcp_connection *tc;
+	struct rds_conn_path *cp;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	cp = sk->sk_user_data;
+	if (!cp) {
+		orig_state_change = sk->sk_state_change;
+	} else {
+		tc = cp->cp_transport_data;
+		orig_state_change = tc->t_orig_state_change;
+
+		rdsdebug("sock %p state_change to %d\n", tc->t_sock,
+			 sk->sk_state);
+
+		switch (sk->sk_state) {
+		case TCP_ESTABLISHED:
+			/* Force the peer to reconnect so that we have the
+			 * TCP ports going from <smaller-ip>.<transient> to
+			 * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
+			 * RDS connection as RDS_CONN_UP until the reconnect,
+			 * to avoid RDS datagram loss.
+			 */
+			if (rds_addr_cmp(&cp->cp_conn->c_laddr,
+					 &cp->cp_conn->c_faddr) < 0 ||
+			    !rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+						      RDS_CONN_ERROR))
+				rds_connect_path_complete(cp,
+							  RDS_CONN_CONNECTING);
+			else
+				rds_conn_path_drop(cp, false);
+			break;
+		case TCP_CLOSE_WAIT:
+		case TCP_CLOSE:
+			rds_conn_path_drop(cp, false);
+			break;
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			/* ignore connecting sockets as they make progress */
+		default:
+			break;
+		}
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	orig_state_change(sk);
+}
+
+/* Transport conn_path_connect hook: create a kernel TCP socket for @cp,
+ * tune it, bind it to the connection's local address and start a
+ * non-blocking connect to the peer on RDS_TCP_PORT.
+ *
+ * Returns 0 when the path is already up or the connect has been started
+ * (-EINPROGRESS counts as success), -EAGAIN for a path with index > 0 while
+ * the connection reports fewer than two paths, or a negative errno.
+ */
+int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct socket *sock = NULL;
+	struct sockaddr_in6 sin6;
+	struct sockaddr_in sin;
+	struct sockaddr *addr;
+	int addrlen;
+	bool isv6;
+	int ret;
+
+	/* for multipath rds, we only trigger the connection after
+	 * the handshake probe has determined the number of paths.
+	 */
+	if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
+		return -EAGAIN;
+
+	mutex_lock(&tc->t_conn_path_lock);
+
+	if (rds_conn_path_up(cp)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* a v4-mapped local address gets a plain IPv4 socket */
+	isv6 = !ipv6_addr_v4mapped(&conn->c_laddr);
+	ret = sock_create_kern(rds_conn_net(conn), isv6 ? PF_INET6 : PF_INET,
+			       SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0)
+		goto out;
+
+	if (!rds_tcp_tune(sock)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* local end: our address, port picked by the stack */
+	if (isv6) {
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = conn->c_laddr;
+		sin6.sin6_port = 0;
+		sin6.sin6_flowinfo = 0;
+		sin6.sin6_scope_id = conn->c_dev_if;
+		addr = (struct sockaddr *)&sin6;
+		addrlen = sizeof(sin6);
+	} else {
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+		sin.sin_port = 0;
+		addr = (struct sockaddr *)&sin;
+		addrlen = sizeof(sin);
+	}
+
+	ret = kernel_bind(sock, addr, addrlen);
+	if (ret) {
+		rdsdebug("bind failed with %d at address %pI6c\n",
+			 ret, &conn->c_laddr);
+		goto out;
+	}
+
+	/* remote end: peer address, well-known RDS-TCP port */
+	if (isv6) {
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = conn->c_faddr;
+		sin6.sin6_port = htons(RDS_TCP_PORT);
+		sin6.sin6_flowinfo = 0;
+		sin6.sin6_scope_id = conn->c_dev_if;
+		addr = (struct sockaddr *)&sin6;
+		addrlen = sizeof(sin6);
+	} else {
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+		sin.sin_port = htons(RDS_TCP_PORT);
+		addr = (struct sockaddr *)&sin;
+		addrlen = sizeof(sin);
+	}
+
+	/*
+	 * once we call connect() we can start getting callbacks and they
+	 * own the socket
+	 */
+	rds_tcp_set_callbacks(sock, cp);
+	ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK);
+
+	rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
+	if (ret == 0 || ret == -EINPROGRESS) {
+		ret = 0;
+		rds_tcp_keepalive(sock);
+		/* the socket now belongs to the path; do not release it */
+		sock = NULL;
+	} else {
+		rds_tcp_restore_callbacks(sock, cp->cp_transport_data);
+	}
+
+out:
+	mutex_unlock(&tc->t_conn_path_lock);
+	if (sock)
+		sock_release(sock);
+	return ret;
+}
+
+/*
+ * Before killing the tcp socket this needs to serialize with callbacks. The
+ * caller has already grabbed the sending sem so we're serialized with other
+ * senders.
+ *
+ * TCP calls the callbacks with the sock lock so we hold it while we reset the
+ * callbacks to those set by TCP. Our callbacks won't execute again once we
+ * hold the sock lock.
+ *
+ * After the socket (if any) has been shut down and released, the partially
+ * received message is dropped and the receive counters are reset.
+ */
+void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct socket *sock = tc->t_sock;
+
+	rdsdebug("shutting down conn %p tc %p sock %p\n",
+		 cp->cp_conn, tc, sock);
+
+	if (sock) {
+		struct sock *sk = sock->sk;
+
+		if (rds_destroy_pending(cp->cp_conn))
+			sock_no_linger(sk);
+		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
+
+		lock_sock(sk);
+		rds_tcp_restore_callbacks(sock, tc); /* tc->t_sock = NULL */
+		release_sock(sk);
+
+		sock_release(sock);
+	}
+
+	/* forget any half-received message */
+	if (tc->t_tinc) {
+		rds_inc_put(&tc->t_tinc->ti_inc);
+		tc->t_tinc = NULL;
+	}
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 000000000..b576bd252
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* Enable TCP keepalive on @sock: first probe after 5 idle seconds, 5 second
+ * spacing between probes, 5 unanswered probes allowed.
+ */
+void rds_tcp_keepalive(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	/* values below based on xs_udp_default_timeout */
+	const int idle_secs = 5;	/* send a probe this many secs after last data */
+	const int max_probes = 5;	/* number of unack'ed probes before declaring dead */
+
+	sock_set_keepalive(sk);
+	tcp_sock_set_keepcnt(sk, max_probes);
+	tcp_sock_set_keepidle(sk, idle_secs);
+	/* KEEPINTVL is the interval between successive probes. We follow
+	 * the model in xs_tcp_finish_connecting() and re-use keepidle.
+	 */
+	tcp_sock_set_keepintvl(sk, idle_secs);
+}
+
+/* rds_tcp_accept_one_path(): pick the path of @conn that an accepted socket
+ * should be attached to.
+ *
+ * Paths may only be accepted from a peer whose address is smaller than
+ * ours.  Otherwise NULL is returned so that the caller closes the accepted
+ * socket and the connection is re-established from smaller -> larger ip
+ * addr; if only one path is in use, path 0 is (re)connected from this side
+ * when it is down.
+ *
+ * Since reconnects are only initiated from the node with the numerically
+ * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
+ * by moving them to CONNECTING in this function.
+ *
+ * Returns the rds_tcp_connection of the first path that could be moved from
+ * DOWN or ERROR to CONNECTING, or NULL if there is none.
+ */
+static
+struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+{
+	int npaths = max_t(int, 1, conn->c_npaths);
+	int idx;
+
+	/* for mprds, all paths MUST be initiated by the peer
+	 * with the smaller address.
+	 */
+	if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
+		/* Make sure we initiate at least one path if this
+		 * has not already been done; rds_start_mprds() will
+		 * take care of additional paths, if necessary.
+		 */
+		if (npaths == 1)
+			rds_conn_path_connect_if_down(&conn->c_path[0]);
+		return NULL;
+	}
+
+	for (idx = 0; idx < npaths; idx++) {
+		struct rds_conn_path *cp = &conn->c_path[idx];
+
+		if (!rds_conn_path_transition(cp, RDS_CONN_DOWN,
+					      RDS_CONN_CONNECTING) &&
+		    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
+					      RDS_CONN_CONNECTING))
+			continue;
+
+		return cp->cp_transport_data;
+	}
+	return NULL;
+}
+
+/* Accept one pending connection from the listening socket @sock and attach
+ * it to a path of the matching RDS connection.
+ *
+ * Returns 0 when a connection was accepted (whether it was attached to a
+ * path or reset), -ENETUNREACH when @sock is NULL, or another negative
+ * errno when accepting, tuning or connection lookup fails.
+ */
+int rds_tcp_accept_one(struct socket *sock)
+{
+	struct socket *nsock = NULL;
+	struct rds_tcp_connection *tc = NULL;
+	struct rds_connection *conn;
+	struct rds_conn_path *cp;
+	struct inet_sock *isk;
+	struct in6_addr *my_addr, *peer_addr;
+#if !IS_ENABLED(CONFIG_IPV6)
+	struct in6_addr saddr, daddr;
+#endif
+	int dev_if = 0;
+	int state;
+	int ret;
+
+	if (!sock) /* module unload or netns delete in progress */
+		return -ENETUNREACH;
+
+	ret = sock_create_lite(sock->sk->sk_family,
+			       sock->sk->sk_type, sock->sk->sk_protocol,
+			       &nsock);
+	if (ret)
+		goto out;
+
+	ret = sock->ops->accept(sock, nsock, O_NONBLOCK, true);
+	if (ret < 0)
+		goto out;
+
+	/* sock_create_lite() does not get a hold on the owner module so we
+	 * need to do it here. Note that sock_release() uses sock->ops to
+	 * determine if it needs to decrement the reference count. So set
+	 * sock->ops after calling accept() in case that fails. And there's
+	 * no need to do try_module_get() as the listener should have a hold
+	 * already.
+	 */
+	nsock->ops = sock->ops;
+	__module_get(nsock->ops->owner);
+
+	rds_tcp_keepalive(nsock);
+	if (!rds_tcp_tune(nsock)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	isk = inet_sk(nsock->sk);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	my_addr = &nsock->sk->sk_v6_rcv_saddr;
+	peer_addr = &nsock->sk->sk_v6_daddr;
+#else
+	/* without IPv6 the endpoints are expressed as v4-mapped addresses */
+	ipv6_addr_set_v4mapped(isk->inet_saddr, &saddr);
+	ipv6_addr_set_v4mapped(isk->inet_daddr, &daddr);
+	my_addr = &saddr;
+	peer_addr = &daddr;
+#endif
+	rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
+		 sock->sk->sk_family,
+		 my_addr, ntohs(isk->inet_sport),
+		 peer_addr, ntohs(isk->inet_dport));
+
+#if IS_ENABLED(CONFIG_IPV6)
+	/* sk_bound_dev_if is not set if the peer address is not link local
+	 * address. In this case, it happens that mcast_oif is set. So
+	 * just use it.
+	 */
+	if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
+	    !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
+		struct ipv6_pinfo *inet6 = inet6_sk(nsock->sk);
+
+		dev_if = inet6->mcast_oif;
+	} else {
+		dev_if = nsock->sk->sk_bound_dev_if;
+	}
+#endif
+
+	/* rds_tcp_laddr_check() returns 0 when the peer address is local */
+	if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) {
+		/* local address connection is only allowed via loopback */
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	conn = rds_conn_create(sock_net(sock->sk),
+			       my_addr, peer_addr,
+			       &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
+
+	if (IS_ERR(conn)) {
+		ret = PTR_ERR(conn);
+		goto out;
+	}
+	/* An incoming SYN request came in, and TCP just accepted it.
+	 *
+	 * If the client reboots, this conn will need to be cleaned up.
+	 * rds_tcp_state_change() will do that cleanup
+	 */
+	tc = rds_tcp_accept_one_path(conn);
+	if (!tc)
+		goto rst_nsk;
+	mutex_lock(&tc->t_conn_path_lock);
+	cp = tc->t_cpath;
+	state = rds_conn_path_state(cp);
+	WARN_ON(state == RDS_CONN_UP);
+	if (state != RDS_CONN_CONNECTING && state != RDS_CONN_ERROR)
+		goto rst_nsk;
+	if (!tc->t_sock) {
+		rds_tcp_set_callbacks(nsock, cp);
+		rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+	} else {
+		/* Duelling SYN has been handled in rds_tcp_accept_one() */
+		rds_tcp_reset_callbacks(nsock, cp);
+		/* rds_connect_path_complete() marks RDS_CONN_UP */
+		rds_connect_path_complete(cp, RDS_CONN_RESETTING);
+	}
+	/* the path owns the accepted socket from here on */
+	nsock = NULL;
+	ret = 0;
+	if (conn->c_npaths == 0)
+		rds_send_ping(cp->cp_conn, cp->cp_index);
+	goto out;
+rst_nsk:
+	/* reset the newly returned accept sock and bail.
+	 * It is safe to set linger on new_sock because the RDS connection
+	 * has not been brought up on new_sock, so no RDS-level data could
+	 * be pending on it. By setting linger, we achieve the side-effect
+	 * of avoiding TIME_WAIT state on new_sock.
+	 */
+	sock_no_linger(nsock->sk);
+	kernel_sock_shutdown(nsock, SHUT_RDWR);
+	ret = 0;
+out:
+	if (tc)
+		mutex_unlock(&tc->t_conn_path_lock);
+	if (nsock)
+		sock_release(nsock);
+	return ret;
+}
+
+/* sk_data_ready callback installed on the listening socket by
+ * rds_tcp_listen_init().  Queues the accept work when @sk is the listener
+ * itself, then chains to the data_ready callback selected below, if any.
+ */
+void rds_tcp_listen_data_ready(struct sock *sk)
+{
+	void (*ready)(struct sock *sk);
+
+	rdsdebug("listen data ready sk %p\n", sk);
+
+	read_lock_bh(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (!ready) {
+		/* check for teardown race */
+		ready = sk->sk_data_ready;
+	} else if (sk->sk_state == TCP_LISTEN) {
+		/*
+		 * ->sk_data_ready is also called for a newly established
+		 * child socket before it has been accepted and the accepter
+		 * has set up their data_ready.. we only want to queue listen
+		 * work for our listening socket
+		 */
+		rds_tcp_accept_work(sk);
+	} else {
+		/*
+		 * (*ready)() may be null if we are racing with netns delete,
+		 * and the listen socket is being torn down.
+		 */
+		ready = rds_tcp_listen_sock_def_readable(sock_net(sk));
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	if (ready)
+		ready(sk);
+}
+
+/* Create the RDS-TCP listening socket for @net: an IPv6 socket when @isv6
+ * is true, IPv4 otherwise, bound to the wildcard address on RDS_TCP_PORT
+ * with a backlog of 64.  The socket's original data_ready callback is kept
+ * in sk_user_data and replaced by rds_tcp_listen_data_ready().
+ *
+ * Returns the listening socket, or NULL on any failure.
+ */
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
+{
+	const char *family = isv6 ? "IPv6" : "IPv4";
+	struct socket *lsock = NULL;
+	struct sockaddr_storage ss;
+	struct sock *sk;
+	int addr_len;
+	int ret;
+
+	ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+			       IPPROTO_TCP, &lsock);
+	if (ret < 0) {
+		rdsdebug("could not create %s listener socket: %d\n",
+			 family, ret);
+		goto out;
+	}
+
+	sk = lsock->sk;
+	sk->sk_reuse = SK_CAN_REUSE;
+	tcp_sock_set_nodelay(sk);
+
+	/* stash the stock data_ready so it can be chained to and restored */
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_user_data = sk->sk_data_ready;
+	sk->sk_data_ready = rds_tcp_listen_data_ready;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	if (isv6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;
+
+		sin6->sin6_family = PF_INET6;
+		sin6->sin6_addr = in6addr_any;
+		sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+		sin6->sin6_scope_id = 0;
+		sin6->sin6_flowinfo = 0;
+		addr_len = sizeof(*sin6);
+	} else {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
+
+		sin->sin_family = PF_INET;
+		sin->sin_addr.s_addr = INADDR_ANY;
+		sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+		addr_len = sizeof(*sin);
+	}
+
+	ret = kernel_bind(lsock, (struct sockaddr *)&ss, addr_len);
+	if (ret < 0) {
+		rdsdebug("could not bind %s listener socket: %d\n",
+			 family, ret);
+		goto out;
+	}
+
+	ret = lsock->ops->listen(lsock, 64);
+	if (ret < 0)
+		goto out;
+
+	return lsock;
+out:
+	if (lsock)
+		sock_release(lsock);
+	return NULL;
+}
+
+/* Stop the listening socket @sock: put back the data_ready callback saved
+ * in sk_user_data, wait for rds_wq and the @acceptor work to finish, then
+ * release the socket.  A NULL @sock is a no-op.
+ */
+void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor)
+{
+	struct sock *sk;
+
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+
+	/* serialize with and prevent further callbacks */
+	lock_sock(sk);
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	release_sock(sk);
+
+	/* wait for accepts to stop and close the socket */
+	flush_workqueue(rds_wq);
+	flush_work(acceptor);
+	sock_release(sock);
+}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 000000000..f4ee13da9
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static struct kmem_cache *rds_tcp_incoming_slab;
+
+/* Drop every skb queued on the TCP incoming that embeds @inc. */
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc =
+		container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
+	skb_queue_purge(&tinc->ti_skb_list);
+}
+
+/*
+ * Purge the skbs held by the TCP incoming that embeds @inc and hand the
+ * structure back to rds_tcp_incoming_slab.
+ */
+void rds_tcp_inc_free(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc =
+		container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+	rds_tcp_inc_purge(inc);
+
+	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
+	kmem_cache_free(rds_tcp_incoming_slab, tinc);
+}
+
+/*
+ * Copy the payload of @inc, one queued skb after another, into the
+ * iterator @to until either the message or the iterator runs out.
+ *
+ * Returns the number of bytes copied, or -EFAULT as soon as a single copy
+ * fails (bytes copied before the failure are not reported).
+ */
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+	struct rds_tcp_incoming *tinc;
+	struct sk_buff *skb;
+	int copied = 0;
+
+	if (iov_iter_count(to) == 0)
+		return 0;
+
+	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		unsigned long chunk, pos = 0;
+
+		while (pos < skb->len) {
+			/* bounded by both the room left in @to and this skb */
+			chunk = iov_iter_count(to);
+			chunk = min(chunk, skb->len - pos);
+
+			if (skb_copy_datagram_iter(skb, pos, to, chunk))
+				return -EFAULT;
+
+			rds_stats_add(s_copy_to_user, chunk);
+			copied += chunk;
+
+			if (iov_iter_count(to) == 0)
+				return copied;
+
+			pos += chunk;
+		}
+	}
+
+	return copied;
+}
+
+/*
+ * We have a series of skbs that have fragmented pieces of the congestion
+ * bitmap. They must add up to the exact size of the congestion bitmap. We
+ * use the skb helpers to copy those into the pages that make up the in-memory
+ * congestion bitmap for the remote address of this connection. We then tell
+ * the congestion core that the bitmap has been changed so that it can wake up
+ * sleepers.
+ *
+ * This is racing with sending paths which are using test_bit to see if the
+ * bitmap indicates that their recipient is congested.
+ */
+
+static void rds_tcp_cong_recv(struct rds_connection *conn,
+			      struct rds_tcp_incoming *tinc)
+{
+	struct rds_cong_map *map;
+	struct sk_buff *skb;
+	unsigned int page_idx = 0;	/* bitmap page currently being filled */
+	unsigned int page_off = 0;	/* write position inside that page */
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map = conn->c_fcong;
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		unsigned int pos;
+		unsigned int chunk;
+		int err;
+
+		for (pos = 0; pos < skb->len; pos += chunk) {
+			/* never cross a bitmap page boundary in one copy */
+			chunk = min_t(unsigned int, PAGE_SIZE - page_off,
+				      skb->len - pos);
+
+			BUG_ON(page_idx >= RDS_CONG_MAP_PAGES);
+
+			/* only returns 0 or -error */
+			err = skb_copy_bits(skb, pos,
+					    (void *)map->m_page_addrs[page_idx] + page_off,
+					    chunk);
+			BUG_ON(err != 0);
+
+			page_off += chunk;
+			if (page_off == PAGE_SIZE) {
+				page_off = 0;
+				page_idx++;
+			}
+		}
+	}
+
+	rds_cong_map_updated(map, ~(u64) 0);
+}
+
+struct rds_tcp_desc_arg { /* context handed to rds_tcp_data_recv() via read_descriptor_t arg.data */
+ struct rds_conn_path *conn_path; /* path whose socket is being read */
+ gfp_t gfp; /* allocation flags used by the receive callback */
+};
+/*
+ * tcp_read_sock() callback (installed by rds_tcp_read_sock()): consume up to
+ * @len bytes of @skb starting at @offset and feed them into the RDS message
+ * currently being reassembled on this path (tc->t_tinc).
+ *
+ * Bytes first complete the RDS header; payload bytes are then attached as
+ * extracted skbs on tinc->ti_skb_list.  Once neither header nor payload
+ * bytes remain, the message goes to rds_tcp_cong_recv() or
+ * rds_recv_incoming() and the per-connection state is reset so that the
+ * next bytes are treated as a new header.
+ *
+ * Returns the number of bytes consumed.  On allocation failure desc->error
+ * is set to -ENOMEM and the count consumed so far is returned.
+ */
+static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct rds_tcp_desc_arg *arg = desc->arg.data;
+ struct rds_conn_path *cp = arg->conn_path;
+ struct rds_tcp_connection *tc = cp->cp_transport_data;
+ struct rds_tcp_incoming *tinc = tc->t_tinc;
+ struct sk_buff *clone;
+ size_t left = len, to_copy;
+
+ rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
+ len);
+
+ /*
+ * tcp_read_sock() interprets partial progress as an indication to stop
+ * processing.
+ */
+ while (left) {
+ if (!tinc) { /* no message in progress: start a new one */
+ tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
+ arg->gfp);
+ if (!tinc) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+ tc->t_tinc = tinc;
+ rdsdebug("allocated tinc %p\n", tinc);
+ rds_inc_path_init(&tinc->ti_inc, cp,
+ &cp->cp_conn->c_faddr);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
+
+ /*
+ * XXX * we might be able to use the __ variants when
+ * we've already serialized at a higher level.
+ */
+ skb_queue_head_init(&tinc->ti_skb_list);
+ }
+
+ if (left && tc->t_tinc_hdr_rem) { /* header still incomplete */
+ to_copy = min(tc->t_tinc_hdr_rem, left);
+ rdsdebug("copying %zu header from skb %p\n", to_copy,
+ skb);
+ skb_copy_bits(skb, offset,
+ (char *)&tinc->ti_inc.i_hdr +
+ sizeof(struct rds_header) -
+ tc->t_tinc_hdr_rem,
+ to_copy); /* destination is the first header byte not yet filled */
+ tc->t_tinc_hdr_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+
+ if (tc->t_tinc_hdr_rem == 0) {
+ /* could be 0 for a 0 len message */
+ tc->t_tinc_data_rem =
+ be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
+ }
+ }
+
+ if (left && tc->t_tinc_data_rem) { /* payload bytes still expected */
+ to_copy = min(tc->t_tinc_data_rem, left);
+
+ clone = pskb_extract(skb, offset, to_copy, arg->gfp);
+ if (!clone) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+
+ skb_queue_tail(&tinc->ti_skb_list, clone);
+
+ rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
+ "clone %p data %p len %d\n",
+ skb, skb->data, skb->len, offset, to_copy,
+ clone, clone->data, clone->len);
+
+ tc->t_tinc_data_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+ }
+
+ if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { /* message complete */
+ struct rds_connection *conn = cp->cp_conn;
+
+ if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_tcp_cong_recv(conn, tinc);
+ else
+ rds_recv_incoming(conn, &conn->c_faddr,
+ &conn->c_laddr,
+ &tinc->ti_inc,
+ arg->gfp);
+
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header); /* expect a fresh header next */
+ tc->t_tinc_data_rem = 0;
+ tc->t_tinc = NULL;
+ rds_inc_put(&tinc->ti_inc);
+ tinc = NULL; /* forces a new allocation if bytes remain */
+ }
+ }
+out:
+ rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
+ len, left, skb->len,
+ skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
+ return len - left; /* bytes consumed from this skb */
+}
+
+/*
+ * Hand the path's socket to tcp_read_sock() with rds_tcp_data_recv() as the
+ * callback.  The caller has to hold the sock lock.
+ *
+ * Returns the error field of the read descriptor after the call: 0, or the
+ * value the callback stored there.
+ */
+static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct rds_tcp_desc_arg cb_arg = {
+		.conn_path = cp,
+		.gfp = gfp,
+	};
+	read_descriptor_t rd_desc;
+
+	/* It's like glib in the kernel! */
+	rd_desc.count = 1; /* give more than one skb per call */
+	rd_desc.error = 0;
+	rd_desc.arg.data = &cb_arg;
+
+	tcp_read_sock(tc->t_sock->sk, &rd_desc, rds_tcp_data_recv);
+	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+		 rd_desc.error);
+
+	return rd_desc.error;
+}
+
+/*
+ * Receive entry point for a connection path.  The sock lock is held around
+ * the read so that our rds_tcp_read_sock() call is serialized against
+ * data_ready.
+ *
+ * Allocations use GFP_KERNEL here.  The result of rds_tcp_read_sock() is
+ * returned unchanged; if an allocation failed the caller is expected to
+ * wait a while before trying again, in the hope that the VM can free up
+ * something in the meantime.
+ */
+int rds_tcp_recv_path(struct rds_conn_path *cp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct socket *sock = tc->t_sock;
+	struct sock *sk;
+	int rc;
+
+	rdsdebug("recv worker path [%d] tc %p sock %p\n",
+		 cp->cp_index, tc, sock);
+
+	sk = sock->sk;
+	lock_sock(sk);
+	rc = rds_tcp_read_sock(cp, GFP_KERNEL);
+	release_sock(sk);
+
+	return rc;
+}
+
+/*
+ * data_ready callback for RDS/TCP sockets.  While the socket still points
+ * at a connection path, incoming data is read with GFP_ATOMIC; if that
+ * fails with -ENOMEM the path's receive work is queued instead.  The
+ * callback saved in the connection (or, during teardown, the socket's
+ * current one) is always invoked afterwards.
+ */
+void rds_tcp_data_ready(struct sock *sk)
+{
+	void (*orig_ready)(struct sock *sk);
+	struct rds_tcp_connection *tc;
+	struct rds_conn_path *cp;
+
+	rdsdebug("data ready sk %p\n", sk);
+
+	read_lock_bh(&sk->sk_callback_lock);
+	cp = sk->sk_user_data;
+	if (cp) {
+		tc = cp->cp_transport_data;
+		orig_ready = tc->t_orig_data_ready;
+		rds_tcp_stats_inc(s_tcp_data_ready_calls);
+
+		if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
+			rcu_read_lock();
+			if (!rds_destroy_pending(cp->cp_conn))
+				queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+			rcu_read_unlock();
+		}
+	} else {
+		/* check for teardown race */
+		orig_ready = sk->sk_data_ready;
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	orig_ready(sk);
+}
+
+/*
+ * Create the slab cache that backs struct rds_tcp_incoming.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int rds_tcp_recv_init(void)
+{
+	struct kmem_cache *slab;
+
+	slab = kmem_cache_create("rds_tcp_incoming",
+				 sizeof(struct rds_tcp_incoming),
+				 0, 0, NULL);
+	rds_tcp_incoming_slab = slab;
+
+	return slab ? 0 : -ENOMEM;
+}
+
+void rds_tcp_recv_exit(void)
+{
+ kmem_cache_destroy(rds_tcp_incoming_slab); /* counterpart of the kmem_cache_create() in rds_tcp_recv_init() */
+}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 000000000..8c4d1d6e9
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds_single_path.h"
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * Turn corking on for the path's TCP socket; rds_tcp_xmit_path_complete()
+ * turns it off again.
+ */
+void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct sock *sk = tc->t_sock->sk;
+
+	tcp_sock_set_cork(sk, true);
+}
+
+/*
+ * Turn corking off for the path's TCP socket, undoing
+ * rds_tcp_xmit_path_prepare().
+ */
+void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
+{
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct sock *sk = tc->t_sock->sk;
+
+	tcp_sock_set_cork(sk, false);
+}
+
+/*
+ * Send @len bytes starting at @data on @sock with MSG_DONTWAIT and
+ * MSG_NOSIGNAL set.  Returns whatever kernel_sendmsg() returns.
+ *
+ * the core send_sem serializes this with other xmit and shutdown
+ */
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+{
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
+	};
+	struct kvec iov = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+
+	return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
+}
+
+/*
+ * Transmit (the rest of) @rm on the TCP socket of its connection path.
+ *
+ * @conn:    connection, used here only for the peer address in the warning
+ * @rm:      message to send; its path is taken from rm->m_inc.i_conn_path
+ * @hdr_off: bytes of the RDS header already sent; 0 marks a fresh message
+ * @sg:      index of the first scatterlist entry that still has data to send
+ * @off:     bytes of that entry already sent
+ *
+ * Returns the number of bytes queued by this call.  When nothing was
+ * queued, returns 0 if TCP reported -EAGAIN (write_space will fire later),
+ * otherwise the value returned by the failing send call; in that case the
+ * path is dropped if it was still up.
+ *
+ * the core send_sem serializes this with other xmit and shutdown
+ */
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+		 unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_conn_path *cp = rm->m_inc.i_conn_path;
+	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	int done = 0;
+	int ret = 0;
+	int more;
+
+	if (hdr_off == 0) {
+		/*
+		 * m_ack_seq is set to the sequence number of the last byte of
+		 * header and data.  see rds_tcp_is_acked().
+		 */
+		tc->t_last_sent_nxt = rds_tcp_write_seq(tc);
+		rm->m_ack_seq = tc->t_last_sent_nxt +
+				sizeof(struct rds_header) +
+				be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
+		/* m_ack_seq must be visible before the flag that announces it */
+		smp_mb__before_atomic();
+		set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
+		tc->t_last_expected_una = rm->m_ack_seq + 1;
+
+		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+		rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
+			 rm, rds_tcp_write_seq(tc),
+			 (unsigned long long)rm->m_ack_seq);
+	}
+
+	if (hdr_off < sizeof(struct rds_header)) {
+		/* see rds_tcp_write_space() */
+		set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
+
+		ret = rds_tcp_sendmsg(tc->t_sock,
+				      (void *)&rm->m_inc.i_hdr + hdr_off,
+				      sizeof(rm->m_inc.i_hdr) - hdr_off);
+		if (ret < 0)
+			goto out;
+		done += ret;
+		/* a short header write ends this attempt; data waits for later */
+		if (hdr_off + done != sizeof(struct rds_header))
+			goto out;
+	}
+
+	/* hint that more pages follow until the last scatterlist entry */
+	more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0;
+	while (sg < rm->data.op_nents) {
+		int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+
+		ret = tc->t_sock->ops->sendpage(tc->t_sock,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
+						flags);
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
+			 ret);
+		if (ret <= 0)
+			break;
+
+		off += ret;
+		done += ret;
+		if (off == rm->data.op_sg[sg].length) {
+			off = 0;
+			sg++;
+		}
+		if (sg == rm->data.op_nents - 1)
+			more = 0;
+	}
+
+out:
+	if (ret <= 0) {
+		/* write_space will hit after EAGAIN, all else fatal */
+		if (ret == -EAGAIN) {
+			rds_tcp_stats_inc(s_tcp_sndbuf_full);
+			ret = 0;
+		} else {
+			/* No need to disconnect/reconnect if path_drop
+			 * has already been triggered, because, e.g., of
+			 * an incoming RST.
+			 */
+			if (rds_conn_path_up(cp)) {
+				/* one literal: the split form lost the space before "returned" */
+				pr_warn("RDS/tcp: send to %pI6c on cp [%d] returned %d, disconnecting and reconnecting\n",
+					&conn->c_faddr, cp->cp_index, ret);
+				rds_conn_path_drop(cp, false);
+			}
+		}
+	}
+	/* report progress if there was any, otherwise the (adjusted) status */
+	if (done == 0)
+		done = ret;
+	return done;
+}
+
+/*
+ * rm->m_ack_seq holds the TCP sequence number of the final byte of the
+ * message, header included.  The whole message has been received once that
+ * number lies strictly "before" @ack, the next unacked byte, in TCP
+ * sequence space.  Sequence numbers are 32 bits wide and wrap, so the test
+ * is done on the sign of the 32bit difference.
+ *
+ * Returns non-zero if @rm is acked; messages without a recorded ack
+ * sequence are never acked.
+ */
+static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+{
+	u32 last_byte_seq;
+	u32 next_unacked;
+
+	if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
+		return 0;
+
+	last_byte_seq = (u32)rm->m_ack_seq;
+	next_unacked = (u32)ack;
+
+	return (__s32)(last_byte_seq - next_unacked) < 0;
+}
+
+/*
+ * write_space callback for RDS/TCP sockets.  While the socket still points
+ * at a connection path, the current snd_una is recorded, fully acked
+ * messages are dropped from the path, and the send worker is queued once
+ * twice the allocated write memory no longer exceeds sk_sndbuf.  The saved
+ * write_space callback (or the socket's current one when no path is
+ * attached) runs afterwards.
+ */
+void rds_tcp_write_space(struct sock *sk)
+{
+	void (*orig_write_space)(struct sock *sk);
+	struct rds_tcp_connection *tc;
+	struct rds_conn_path *cp;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	cp = sk->sk_user_data;
+	if (cp) {
+		tc = cp->cp_transport_data;
+		rdsdebug("write_space for tc %p\n", tc);
+		orig_write_space = tc->t_orig_write_space;
+		rds_tcp_stats_inc(s_tcp_write_space_calls);
+
+		rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
+		tc->t_last_seen_una = rds_tcp_snd_una(tc);
+		rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc),
+					 rds_tcp_is_acked);
+
+		rcu_read_lock();
+		if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
+		    !rds_destroy_pending(cp->cp_conn))
+			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+		rcu_read_unlock();
+	} else {
+		orig_write_space = sk->sk_write_space;
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	/*
+	 * write_space is only called when data leaves tcp's send queue if
+	 * SOCK_NOSPACE is set.  We set SOCK_NOSPACE every time we put
+	 * data in tcp's send queue because we use write_space to parse the
+	 * sequence numbers and notice that rds messages have been fully
+	 * received.
+	 *
+	 * tcp's write_space clears SOCK_NOSPACE if the send queue has more
+	 * than a certain amount of space. So we need to set it again *after*
+	 * we call tcp's write_space or else we might only get called on the
+	 * first of a series of incoming tcp acks.
+	 */
+	orig_write_space(sk);
+
+	if (sk->sk_socket)
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 000000000..f8a7954f1
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
+ ____cacheline_aligned; /* per-CPU counters; rds_tcp_stats_info_copy() sums them over online CPUs */
+
+static const char * const rds_tcp_stat_names[] = { /* passed with the summed counters to rds_stats_info_copy(); presumably paired by position with the u64 fields of struct rds_tcp_statistics -- confirm */
+ "tcp_data_ready_calls",
+ "tcp_write_space_calls",
+ "tcp_sndbuf_full",
+ "tcp_connect_raced",
+ "tcp_listen_closed_stale",
+};
+
+/*
+ * Sum the per-CPU TCP statistics over all online CPUs and pass the totals,
+ * together with rds_tcp_stat_names, to rds_stats_info_copy().  Nothing is
+ * copied when @avail is smaller than the number of names.
+ *
+ * Always returns the number of entries in rds_tcp_stat_names.
+ */
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+				     unsigned int avail)
+{
+	struct rds_tcp_statistics total = {0, };
+	uint64_t *sum = (uint64_t *)&total;
+	int cpu;
+
+	if (avail >= ARRAY_SIZE(rds_tcp_stat_names)) {
+		for_each_online_cpu(cpu) {
+			uint64_t *src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
+			size_t i;
+
+			/* the statistics struct is walked as an array of u64 */
+			for (i = 0; i < sizeof(total) / sizeof(uint64_t); i++)
+				sum[i] += src[i];
+		}
+
+		rds_stats_info_copy(iter, sum, rds_tcp_stat_names,
+				    ARRAY_SIZE(rds_tcp_stat_names));
+	}
+
+	return ARRAY_SIZE(rds_tcp_stat_names);
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000..1f424cbfc
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!) so we have
+ * a thread around to block allocating if the receive fast path sees an
+ * allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ * ANY -> ERROR
+ * UP -> DISCONNECTING
+ * ERROR -> DISCONNECTING
+ * DISCONNECTING -> DOWN
+ * DOWN -> CONNECTING
+ * CONNECTING -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ * - Inside the shutdown worker; synchronizes with xmit path
+ * through RDS_IN_XMIT, and with connection management callbacks
+ * via c_cm_lock.
+ *
+ * For receive callbacks, we rely on the underlying transport
+ * (TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq; /* created in rds_threads_init(), destroyed in rds_threads_exit() */
+EXPORT_SYMBOL_GPL(rds_wq);
+
+/*
+ * Move @cp from state @curr to RDS_CONN_UP and queue its send and receive
+ * work.  If the state transition is refused, a warning is logged and the
+ * path is dropped instead.
+ */
+void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	int went_up = rds_conn_path_transition(cp, curr, RDS_CONN_UP);
+
+	if (!went_up) {
+		printk(KERN_WARNING "%s: Cannot transition to state UP, "
+		       "current state is %d\n",
+		       __func__,
+		       atomic_read(&cp->cp_state));
+		rds_conn_path_drop(cp, false);
+		return;
+	}
+
+	rdsdebug("conn %p for %pI6c to %pI6c complete\n",
+		 conn, &conn->c_laddr, &conn->c_faddr);
+
+	/* the path is up: reset the reconnect backoff */
+	cp->cp_reconnect_jiffies = 0;
+	set_bit(0, &conn->c_map_queued);
+
+	rcu_read_lock();
+	if (!rds_destroy_pending(conn)) {
+		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+	}
+	rcu_read_unlock();
+
+	conn->c_proposed_version = RDS_PROTOCOL_VERSION;
+}
+EXPORT_SYMBOL_GPL(rds_connect_path_complete);
+
+/*
+ * Complete the connect on path 0 of @conn, expecting it to be in state
+ * RDS_CONN_CONNECTING.
+ */
+void rds_connect_complete(struct rds_connection *conn)
+{
+	struct rds_conn_path *first_path = &conn->c_path[0];
+
+	rds_connect_path_complete(first_path, RDS_CONN_CONNECTING);
+}
+EXPORT_SYMBOL_GPL(rds_connect_complete);
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again. Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects. This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+void rds_queue_reconnect(struct rds_conn_path *cp)
+{
+	struct rds_connection *conn = cp->cp_conn;
+	unsigned long delay = 0;
+	unsigned long rand;
+	bool first_attempt;
+
+	rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
+		 conn, &conn->c_laddr, &conn->c_faddr,
+		 cp->cp_reconnect_jiffies);
+
+	/* let peer with smaller addr initiate reconnect, to avoid duels */
+	if (conn->c_trans->t_type == RDS_TRANS_TCP &&
+	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
+		return;
+
+	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+
+	first_attempt = (cp->cp_reconnect_jiffies == 0);
+	if (first_attempt) {
+		/* no backoff yet: arm the minimum and connect right away */
+		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+	} else {
+		/* random delay below the current backoff ceiling */
+		get_random_bytes(&rand, sizeof(rand));
+		delay = rand % cp->cp_reconnect_jiffies;
+		rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
+			 delay, cp->cp_reconnect_jiffies,
+			 conn, &conn->c_laddr, &conn->c_faddr);
+	}
+
+	rcu_read_lock();
+	if (!rds_destroy_pending(conn))
+		queue_delayed_work(rds_wq, &cp->cp_conn_w, delay);
+	rcu_read_unlock();
+
+	/* double the ceiling for next time, capped at the sysctl maximum */
+	if (!first_attempt)
+		cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
+					       rds_sysctl_reconnect_max_jiffies);
+}
+
+/*
+ * Work handler for cp_conn_w: try to take the path from DOWN to CONNECTING
+ * and ask the transport to connect it.  If the transport reports an error
+ * the path goes back to DOWN and a reconnect is queued; if even that
+ * transition is refused the path is flagged with an error.
+ */
+void rds_connect_worker(struct work_struct *work)
+{
+	struct rds_conn_path *cp =
+		container_of(work, struct rds_conn_path, cp_conn_w.work);
+	struct rds_connection *conn = cp->cp_conn;
+	int err;
+
+	/* on paths other than 0 only the peer with the smaller address connects */
+	if (cp->cp_index > 0 &&
+	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
+		return;
+
+	clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
+
+	if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING))
+		return;
+
+	err = conn->c_trans->conn_path_connect(cp);
+	rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
+		 conn, &conn->c_laddr, &conn->c_faddr, err);
+	if (!err)
+		return;
+
+	if (rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+		rds_queue_reconnect(cp);
+	else
+		rds_conn_path_error(cp, "connect failed\n");
+}
+
+/*
+ * Work handler for cp_send_w: while the path is UP, run rds_send_xmit()
+ * and requeue itself immediately on -EAGAIN or after 2 jiffies on -ENOMEM.
+ */
+void rds_send_worker(struct work_struct *work)
+{
+	struct rds_conn_path *cp =
+		container_of(work, struct rds_conn_path, cp_send_w.work);
+	int rc;
+
+	if (rds_conn_path_state(cp) != RDS_CONN_UP)
+		return;
+
+	clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
+	rc = rds_send_xmit(cp);
+	cond_resched();
+	rdsdebug("conn %p ret %d\n", cp->cp_conn, rc);
+
+	if (rc == -EAGAIN) {
+		rds_stats_inc(s_send_immediate_retry);
+		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+	} else if (rc == -ENOMEM) {
+		rds_stats_inc(s_send_delayed_retry);
+		queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
+	}
+}
+
+/*
+ * Work handler for cp_recv_w: while the path is UP, call the transport's
+ * recv_path() and requeue itself immediately on -EAGAIN or after 2 jiffies
+ * on -ENOMEM.
+ */
+void rds_recv_worker(struct work_struct *work)
+{
+	struct rds_conn_path *cp =
+		container_of(work, struct rds_conn_path, cp_recv_w.work);
+	int rc;
+
+	if (rds_conn_path_state(cp) != RDS_CONN_UP)
+		return;
+
+	rc = cp->cp_conn->c_trans->recv_path(cp);
+	rdsdebug("conn %p ret %d\n", cp->cp_conn, rc);
+
+	if (rc == -EAGAIN) {
+		rds_stats_inc(s_recv_immediate_retry);
+		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+	} else if (rc == -ENOMEM) {
+		rds_stats_inc(s_recv_delayed_retry);
+		queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
+	}
+}
+
+/* Work handler for cp_down_w: shut down the connection path that owns @work. */
+void rds_shutdown_worker(struct work_struct *work)
+{
+	rds_conn_shutdown(container_of(work, struct rds_conn_path, cp_down_w));
+}
+
+void rds_threads_exit(void)
+{
+ destroy_workqueue(rds_wq); /* tears down the "krdsd" queue made by rds_threads_init() */
+}
+
+/*
+ * Create the single-threaded "krdsd" workqueue and publish it as rds_wq.
+ * Returns 0 on success or -ENOMEM if the workqueue could not be created.
+ */
+int rds_threads_init(void)
+{
+	rds_wq = create_singlethread_workqueue("krdsd");
+
+	return rds_wq ? 0 : -ENOMEM;
+}
+
+/* Compare two IPv6 addresses, treating each as one big-endian number.
+ * Returns 0 when they are equal, 1 when @addr1 is greater and -1 when
+ * @addr2 is greater.
+ */
+int rds_addr_cmp(const struct in6_addr *addr1,
+		 const struct in6_addr *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	/* two 64bit halves, most significant half first */
+	const __be64 *half1 = (__be64 *)addr1;
+	const __be64 *half2 = (__be64 *)addr2;
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		u64 x = be64_to_cpu(half1[i]);
+		u64 y = be64_to_cpu(half2[i]);
+
+		if (x < y)
+			return -1;
+		if (x > y)
+			return 1;
+	}
+	return 0;
+#else
+	/* four 32bit words, most significant word first */
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		u32 a = ntohl(addr1->s6_addr32[i]);
+		u32 b = ntohl(addr2->s6_addr32[i]);
+
+		if (a < b)
+			return -1;
+		if (a > b)
+			return 1;
+	}
+	return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(rds_addr_cmp);
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000..f8001ec80
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ipv6.h>
+
+#include "rds.h"
+#include "loop.h"
+
+/*
+ * Module to request for each transport type when rds_trans_get() finds no
+ * transport registered for it.  A NULL entry means no module is requested.
+ */
+static char * const rds_trans_modules[] = {
+ [RDS_TRANS_IB] = "rds_rdma",
+ [RDS_TRANS_GAP] = NULL,
+ [RDS_TRANS_TCP] = "rds_tcp",
+};
+
+/* Registered transports, indexed by t_type; an empty slot is NULL. */
+static struct rds_transport *transports[RDS_TRANS_COUNT];
+/* Held for write by register/unregister and for read by the lookups below. */
+static DECLARE_RWSEM(rds_trans_sem);
+
+/*
+ * Add @trans to the transport table, in the slot selected by its t_type.
+ *
+ * A transport name that does not fit in TRANSNAMSIZ bytes, terminating NUL
+ * included, trips BUG_ON().  If the slot is already occupied the existing
+ * entry is kept and an error is logged; the caller is not told either way.
+ */
+void rds_trans_register(struct rds_transport *trans)
+{
+	BUG_ON(strlen(trans->t_name) >= TRANSNAMSIZ);
+
+	down_write(&rds_trans_sem);
+
+	if (!transports[trans->t_type]) {
+		transports[trans->t_type] = trans;
+		printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+	} else {
+		printk(KERN_ERR "RDS Transport type %d already registered\n",
+		       trans->t_type);
+	}
+
+	up_write(&rds_trans_sem);
+}
+EXPORT_SYMBOL_GPL(rds_trans_register);
+
+/*
+ * Remove @trans from the transport table.
+ *
+ * rds_trans_register() keeps the existing entry (and only logs an error) when
+ * the slot for a type is already taken, and it has no way to report that to
+ * its caller.  The slot is therefore cleared only when it really holds
+ * @trans; otherwise a transport whose registration was refused would wipe
+ * out the registration of the transport that owns the slot.
+ */
+void rds_trans_unregister(struct rds_transport *trans)
+{
+	down_write(&rds_trans_sem);
+
+	if (transports[trans->t_type] == trans) {
+		transports[trans->t_type] = NULL;
+		printk(KERN_INFO "Unregistered RDS/%s transport\n",
+		       trans->t_name);
+	}
+
+	up_write(&rds_trans_sem);
+}
+EXPORT_SYMBOL_GPL(rds_trans_unregister);
+
+/*
+ * Drop the module reference held on @trans's owner.  A NULL @trans is
+ * accepted and ignored.
+ */
+void rds_trans_put(struct rds_transport *trans)
+{
+	if (!trans)
+		return;
+
+	module_put(trans->t_owner);
+}
+
+/*
+ * Pick the transport to use for local address @addr.
+ *
+ * Loopback addresses map straight to rds_loop_transport: a v4-mapped address
+ * whose first IPv4 octet is IN_LOOPBACKNET, or else the IPv6 loopback
+ * address.  No module reference is taken on that path.
+ *
+ * Otherwise the table is scanned in index order and the first transport
+ * whose laddr_check() accepts (@net, @addr, @scope_id) by returning 0 is
+ * chosen, provided a reference on its owning module (if it has one) can be
+ * taken.
+ *
+ * Returns the selected transport, or NULL if none qualifies.
+ */
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+					      const struct in6_addr *addr,
+					      __u32 scope_id)
+{
+	struct rds_transport *found = NULL;
+	unsigned int idx;
+
+	if (ipv6_addr_v4mapped(addr)) {
+		/* Byte 12 is the first octet of the embedded IPv4 address. */
+		if (addr->s6_addr[12] == IN_LOOPBACKNET)
+			return &rds_loop_transport;
+	} else if (ipv6_addr_loopback(addr)) {
+		return &rds_loop_transport;
+	}
+
+	down_read(&rds_trans_sem);
+	for (idx = 0; idx < RDS_TRANS_COUNT; idx++) {
+		struct rds_transport *cand = transports[idx];
+
+		if (!cand)
+			continue;
+		if (cand->laddr_check(net, addr, scope_id) != 0)
+			continue;
+		if (cand->t_owner && !try_module_get(cand->t_owner))
+			continue;
+
+		found = cand;
+		break;
+	}
+	up_read(&rds_trans_sem);
+
+	return found;
+}
+
+/*
+ * Look up the transport registered for @t_type and take a reference on its
+ * owning module, if it has one.
+ *
+ * When nothing is registered for @t_type, the module named in
+ * rds_trans_modules[] (if any) is requested and the table is checked once
+ * more.  rds_trans_sem is released around request_module(); presumably the
+ * loaded module registers itself through rds_trans_register(), which takes
+ * the semaphore for writing.
+ *
+ * Returns the transport, or NULL if @t_type is out of range, no matching
+ * transport is registered, or the module reference could not be taken.
+ * A non-NULL result is released with rds_trans_put().
+ */
+struct rds_transport *rds_trans_get(int t_type)
+{
+	struct rds_transport *ret = NULL;
+	struct rds_transport *trans;
+
+	/* t_type indexes the tables below; reject anything out of range. */
+	if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
+		return NULL;
+
+	down_read(&rds_trans_sem);
+	trans = transports[t_type];
+	if (!trans) {
+		up_read(&rds_trans_sem);
+		/*
+		 * request_module() takes a printf-style format, so the name
+		 * is passed as an argument rather than as the format itself.
+		 */
+		if ((size_t)t_type < ARRAY_SIZE(rds_trans_modules) &&
+		    rds_trans_modules[t_type])
+			request_module("%s", rds_trans_modules[t_type]);
+		down_read(&rds_trans_sem);
+		trans = transports[t_type];
+	}
+	if (trans && trans->t_type == t_type &&
+	    (!trans->t_owner || try_module_get(trans->t_owner)))
+		ret = trans;
+
+	up_read(&rds_trans_sem);
+
+	return ret;
+}
+
+/*
+ * Collect statistics from every registered transport that provides a
+ * stats_info_copy() hook.
+ *
+ * Each hook is called with @iter and the space still available.  After each
+ * call the available space is reduced by the count the hook reported,
+ * stopping at zero.  Whether a hook actually copies anything when space is
+ * short is decided by the hook itself.  The iterator is unmapped before
+ * rds_trans_sem is taken for reading.
+ *
+ * Returns the sum of the counts reported by all hooks; this may be larger
+ * than the @avail that was passed in.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+				       unsigned int avail)
+{
+	unsigned int total = 0;
+	int idx;
+
+	rds_info_iter_unmap(iter);
+	down_read(&rds_trans_sem);
+
+	for (idx = 0; idx < RDS_TRANS_COUNT; idx++) {
+		struct rds_transport *trans = transports[idx];
+		unsigned int count;
+
+		if (trans && trans->stats_info_copy) {
+			count = trans->stats_info_copy(iter, avail);
+			avail -= min(avail, count);
+			total += count;
+		}
+	}
+
+	up_read(&rds_trans_sem);
+
+	return total;
+}