summary refs log tree commit diff stats
path: root/net/smc
diff options
context:
space:
mode:
Diffstat (limited to 'net/smc')
-rw-r--r-- net/smc/Kconfig 22
-rw-r--r-- net/smc/Makefile 8
-rw-r--r-- net/smc/af_smc.c 3606
-rw-r--r-- net/smc/smc.h 385
-rw-r--r-- net/smc/smc_cdc.c 493
-rw-r--r-- net/smc/smc_cdc.h 305
-rw-r--r-- net/smc/smc_clc.c 1278
-rw-r--r-- net/smc/smc_clc.h 446
-rw-r--r-- net/smc/smc_close.c 506
-rw-r--r-- net/smc/smc_close.h 30
-rw-r--r-- net/smc/smc_core.c 2633
-rw-r--r-- net/smc/smc_core.h 596
-rw-r--r-- net/smc/smc_diag.c 271
-rw-r--r-- net/smc/smc_ib.c 1018
-rw-r--r-- net/smc/smc_ib.h 119
-rw-r--r-- net/smc/smc_ism.c 554
-rw-r--r-- net/smc/smc_ism.h 59
-rw-r--r-- net/smc/smc_llc.c 2365
-rw-r--r-- net/smc/smc_llc.h 120
-rw-r--r-- net/smc/smc_netlink.c 157
-rw-r--r-- net/smc/smc_netlink.h 34
-rw-r--r-- net/smc/smc_netns.h 21
-rw-r--r-- net/smc/smc_pnet.c 1210
-rw-r--r-- net/smc/smc_pnet.h 70
-rw-r--r-- net/smc/smc_rx.c 515
-rw-r--r-- net/smc/smc_rx.h 31
-rw-r--r-- net/smc/smc_stats.c 413
-rw-r--r-- net/smc/smc_stats.h 270
-rw-r--r-- net/smc/smc_sysctl.c 118
-rw-r--r-- net/smc/smc_sysctl.h 33
-rw-r--r-- net/smc/smc_tracepoint.c 9
-rw-r--r-- net/smc/smc_tracepoint.h 125
-rw-r--r-- net/smc/smc_tx.c 762
-rw-r--r-- net/smc/smc_tx.h 40
-rw-r--r-- net/smc/smc_wr.c 939
-rw-r--r-- net/smc/smc_wr.h 139
36 files changed, 19700 insertions, 0 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000..746be3996
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config SMC
+ tristate "SMC socket protocol family"
+ depends on INET && INFINIBAND
+ depends on m || ISM != m
+ help
+ SMC-R provides a "sockets over RDMA" solution making use of
+ RDMA over Converged Ethernet (RoCE) technology to upgrade
+ AF_INET TCP connections transparently.
+ The Linux implementation of the SMC-R solution is designed as
+ a separate socket family SMC.
+
+ Select this option if you want to run SMC socket applications.
+
+config SMC_DIAG
+ tristate "SMC: socket monitoring interface"
+ depends on SMC
+ help
+ Support for SMC socket monitoring interface used by tools such as
+ smcss.
+
+ If unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000..875efcd12
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# let sources in this directory include headers relative to $(src)
+ccflags-y += -I$(src)
+# smc.o is the composite object assembled from the smc-y / smc-$(...) lists
+obj-$(CONFIG_SMC) += smc.o
+obj-$(CONFIG_SMC_DIAG) += smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
+smc-y += smc_tracepoint.o
+# sysctl support is only linked in when CONFIG_SYSCTL is enabled
+smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000..ef5b5d498
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,3606 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ * - support for alternate links postponed
+ *
+ * Copyright IBM Corp. 2016, 2018
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ * based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <linux/sched/signal.h>
+#include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
+#include <linux/splice.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/smc.h>
+#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "smc_netns.h"
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+#include "smc_netlink.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
+#include "smc_sysctl.h"
+
+static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
+ * creation on server
+ */
+static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
+ * creation on client
+ */
+
+static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
+
+static void smc_tcp_listen_work(struct work_struct *); /* handler of smc->tcp_listen_work */
+static void smc_connect_work(struct work_struct *); /* handler of smc->connect_work */
+
+/* Netlink dump callback: emit one SMC_NETLINK_DUMP_HS_LIMITATION message
+ * carrying the limit_smc_hs setting of the requesting socket's netns.
+ * pos[0] of the dump context records that the message was already sent.
+ * Returns skb->len on success, -ENOMEM / -EMSGSIZE on failure.
+ */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct smc_nl_dmp_ctx *ctx = smc_nl_dmp_ctx(cb);
+	void *hdr;
+
+	if (!ctx->pos[0]) {
+		hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				  cb->nlh->nlmsg_seq, &smc_gen_nl_family,
+				  NLM_F_MULTI, SMC_NETLINK_DUMP_HS_LIMITATION);
+		if (!hdr)
+			return -ENOMEM;
+
+		if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+			       sock_net(skb->sk)->smc.limit_smc_hs)) {
+			genlmsg_cancel(skb, hdr);
+			return -EMSGSIZE;
+		}
+
+		genlmsg_end(skb, hdr);
+		ctx->pos[0] = 1;
+	}
+
+	return skb->len;
+}
+
+/* Netlink command: switch on limit_smc_hs for the sender's net namespace */
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+
+	net->smc.limit_smc_hs = true;
+	return 0;
+}
+
+/* Netlink command: switch off limit_smc_hs for the sender's net namespace */
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+
+	net->smc.limit_smc_hs = false;
+	return 0;
+}
+
+/* proto keepalive hook: hand the setting on to the internal clcsock */
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+	struct sock *clcsk = smc_sk(sk)->clcsock->sk;
+
+	clcsk->sk_prot->keepalive(clcsk, val);
+}
+
+/* Wrapper around the original syn_recv_sock handler saved in
+ * smc->ori_af_ops. Applies SMC specific backlog checks first and only
+ * then passes the request on.
+ *
+ * Returns the child sock created by the original handler, or NULL when
+ * the request is dropped (dst is released and the listen drop is counted).
+ */
+static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
+					  struct sk_buff *skb,
+					  struct request_sock *req,
+					  struct dst_entry *dst,
+					  struct request_sock *req_unhash,
+					  bool *own_req)
+{
+	struct smc_sock *smc;
+	struct sock *child;
+
+	smc = smc_clcsock_user_data(sk);
+	/* sk_user_data can already have been cleared; smc_hs_congested()
+	 * guards the same lookup. Without an smc socket there is no
+	 * original handler to pass the request to, so drop it.
+	 */
+	if (!smc)
+		goto drop;
+
+	/* established children plus handshakes still queued for SMC must
+	 * not exceed the listen backlog
+	 */
+	if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+				sk->sk_max_ack_backlog)
+		goto drop;
+
+	if (sk_acceptq_is_full(&smc->sk)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+		goto drop;
+	}
+
+	/* passthrough to original syn recv sock fct */
+	child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
+					       own_req);
+	/* child must not inherit smc or its ops */
+	if (child) {
+		rcu_assign_sk_user_data(child, NULL);
+
+		/* v4-mapped sockets don't inherit parent ops. Don't restore. */
+		if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
+			inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
+	}
+	return child;
+
+drop:
+	dst_release(dst);
+	tcp_listendrop(sk);
+	return NULL;
+}
+
+/* Report congestion when the clcsock has no smc socket attached or when
+ * the handshake workqueue is congested.
+ */
+static bool smc_hs_congested(const struct sock *sk)
+{
+	const struct smc_sock *smc = smc_clcsock_user_data(sk);
+
+	return !smc || workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq);
+}
+
+static struct smc_hashinfo smc_v4_hashinfo = { /* socket table referenced by smc_proto */
+ .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+static struct smc_hashinfo smc_v6_hashinfo = { /* socket table referenced by smc_proto6 */
+ .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
+};
+
+/* Add sk to the hash list of its proto (under the table's write lock)
+ * and bump the proto's in-use counter. Always returns 0.
+ */
+int smc_hash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *hinfo = sk->sk_prot->h.smc_hash;
+
+	write_lock_bh(&hinfo->lock);
+	sk_add_node(sk, &hinfo->ht);
+	write_unlock_bh(&hinfo->lock);
+
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+/* Remove sk from the hash list of its proto; the in-use counter is only
+ * decremented if sk was actually on the list.
+ */
+void smc_unhash_sk(struct sock *sk)
+{
+	struct smc_hashinfo *hinfo = sk->sk_prot->h.smc_hash;
+	bool was_hashed;
+
+	write_lock_bh(&hinfo->lock);
+	was_hashed = sk_del_node_init(sk);
+	if (was_hashed)
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&hinfo->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+/* Called before the user really releases the sock lock. Catch up on tx
+ * work that was deferred in BH context because the user held the lock.
+ */
+static void smc_release_cb(struct sock *sk)
+{
+	struct smc_connection *conn = &smc_sk(sk)->conn;
+
+	if (!conn->tx_in_release_sock)
+		return;
+
+	smc_tx_pending(conn);
+	conn->tx_in_release_sock = false;
+}
+
+struct proto smc_proto = { /* chosen by smc_sock_alloc() unless protocol is SMCPROTO_SMC6 */
+ .name = "SMC",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v4_hashinfo, /* list used by smc_hash_sk()/smc_unhash_sk() */
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto);
+
+struct proto smc_proto6 = { /* chosen by smc_sock_alloc() for SMCPROTO_SMC6 */
+ .name = "SMC6",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v6_hashinfo, /* list used by smc_hash_sk()/smc_unhash_sk() */
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto6);
+
+/* Undo smc_fback_replace_callbacks(): detach smc from the clcsock and put
+ * the saved clcsock callbacks back, all under sk_callback_lock.
+ */
+static void smc_fback_restore_callbacks(struct smc_sock *smc)
+{
+	struct sock *tcpsk = smc->clcsock->sk;
+
+	write_lock_bh(&tcpsk->sk_callback_lock);
+
+	tcpsk->sk_user_data = NULL;
+	smc_clcsock_restore_cb(&tcpsk->sk_state_change,
+			       &smc->clcsk_state_change);
+	smc_clcsock_restore_cb(&tcpsk->sk_data_ready,
+			       &smc->clcsk_data_ready);
+	smc_clcsock_restore_cb(&tcpsk->sk_write_space,
+			       &smc->clcsk_write_space);
+	smc_clcsock_restore_cb(&tcpsk->sk_error_report,
+			       &smc->clcsk_error_report);
+
+	write_unlock_bh(&tcpsk->sk_callback_lock);
+}
+
+/* Revert what smc_switch_to_fallback() did to the file and the clcsock
+ * callbacks. Nothing to do for non-accepted sockets, they have no file yet.
+ */
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+	struct socket *clcsock = smc->clcsock;
+
+	if (!clcsock->file)
+		return;
+
+	clcsock->file->private_data = smc->sk.sk_socket;
+	clcsock->file = NULL;
+	smc_fback_restore_callbacks(smc);
+}
+
+/* Close the smc socket and unhash it. Non-fallback sockets go through
+ * smc_close_active(); fallback sockets are set to SMC_CLOSED directly and
+ * their fallback changes are reverted. Once the socket is SMC_CLOSED the
+ * clcsock is released (sock lock dropped around the call) and, for
+ * non-fallback sockets, the connection is freed.
+ * Returns the rc of smc_close_active() or kernel_sock_shutdown(), else 0.
+ */
+static int __smc_release(struct smc_sock *smc)
+{
+	struct sock *sk = &smc->sk;
+	int rc = 0;
+
+	if (smc->use_fallback) {
+		if (sk->sk_state != SMC_CLOSED) {
+			if (sk->sk_state == SMC_LISTEN)
+				/* wake up clcsock accept */
+				rc = kernel_sock_shutdown(smc->clcsock,
+							  SHUT_RDWR);
+			else if (sk->sk_state != SMC_INIT)
+				sock_put(sk); /* passive closing */
+			sk->sk_state = SMC_CLOSED;
+			sk->sk_state_change(sk);
+		}
+		smc_restore_fallback_changes(smc);
+	} else {
+		rc = smc_close_active(smc);
+		smc_sock_set_flag(sk, SOCK_DEAD);
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+	}
+
+	sk->sk_prot->unhash(sk);
+
+	if (sk->sk_state != SMC_CLOSED)
+		return rc;
+
+	if (smc->clcsock) {
+		release_sock(sk);
+		smc_clcsock_release(smc);
+		lock_sock(sk);
+	}
+	if (!smc->use_fallback)
+		smc_conn_free(&smc->conn);
+
+	return rc;
+}
+
+/* Socket release: abort a dangling non-blocking connect, cancel pending
+ * connect work, close the socket via __smc_release() and finally detach
+ * it from the struct socket and drop the references.
+ */
+static int smc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int state_at_entry;
+	int rc;
+
+	if (!sk)
+		return 0;
+
+	sock_hold(sk); /* sock_put below */
+	smc = smc_sk(sk);
+	state_at_entry = sk->sk_state;
+
+	/* cleanup for a dangling non-blocking connect */
+	if (smc->connect_nonblock && state_at_entry == SMC_INIT)
+		tcp_abort(smc->clcsock->sk, ECONNABORTED);
+
+	/* sock_hold in smc_connect for passive closing */
+	if (cancel_work_sync(&smc->connect_work))
+		sock_put(&smc->sk);
+
+	if (sk->sk_state != SMC_LISTEN)
+		lock_sock(sk);
+	else
+		/* smc_close_non_accepted() is called and acquires
+		 * sock lock for child sockets again
+		 */
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+	/* socket became active while the release was already under way */
+	if (!smc->use_fallback && state_at_entry == SMC_INIT &&
+	    sk->sk_state == SMC_ACTIVE)
+		smc_close_active_abort(smc);
+
+	rc = __smc_release(smc);
+
+	/* detach socket */
+	sock_orphan(sk);
+	sock->sk = NULL;
+	release_sock(sk);
+
+	sock_put(sk); /* sock_hold above */
+	sock_put(sk); /* final sock_put */
+	return rc;
+}
+
+/* sk_destruct hook. The state checks below gate an empty tail, so this
+ * currently performs no teardown work.
+ */
+static void smc_destruct(struct sock *sk)
+{
+	if (sk->sk_state != SMC_CLOSED || !sock_flag(sk, SOCK_DEAD))
+		return;
+}
+
+/* Allocate and initialise an smc sock in SMC_INIT state for the given
+ * protocol (SMCPROTO_SMC6 selects smc_proto6, anything else smc_proto),
+ * set up its work items, accept queue and locks and hash it.
+ * Returns the new sock or NULL if sk_alloc() fails.
+ */
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
+				   int protocol)
+{
+	struct proto *prot = &smc_proto;
+	struct smc_sock *smc;
+	struct sock *sk;
+
+	if (protocol == SMCPROTO_SMC6)
+		prot = &smc_proto6;
+
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+	sk->sk_state = SMC_INIT;
+	sk->sk_destruct = smc_destruct;
+	sk->sk_protocol = protocol;
+	/* buffer sizes default to twice the per-netns SMC sysctl values */
+	WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
+	WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
+
+	smc = smc_sk(sk);
+	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_WORK(&smc->connect_work, smc_connect_work);
+	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
+	INIT_LIST_HEAD(&smc->accept_q);
+	spin_lock_init(&smc->accept_q_lock);
+	spin_lock_init(&smc->conn.send_lock);
+	sk->sk_prot->hash(sk);
+	mutex_init(&smc->clcsock_release_lock);
+	smc_init_saved_callbacks(smc);
+
+	return sk;
+}
+
+/* bind(): validate the address like inet_bind() does, then bind the
+ * internal clcsock after copying the reuse settings to it.
+ * Returns -EINVAL for a short address or a socket that is not in
+ * SMC_INIT (or has a non-blocking connect pending), -EAFNOSUPPORT for an
+ * unsupported family, else the result of kernel_bind().
+ */
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+		    int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc = smc_sk(sk);
+	int rc;
+
+	/* replicate tests from inet_bind(), to be safe wrt. future changes */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	switch (addr->sin_family) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	case AF_UNSPEC:
+		/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is
+		 * INADDR_ANY
+		 */
+		if (addr->sin_addr.s_addr != htonl(INADDR_ANY))
+			return -EAFNOSUPPORT;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	lock_sock(sk);
+
+	/* Check if socket is already active */
+	if (sk->sk_state != SMC_INIT || smc->connect_nonblock) {
+		rc = -EINVAL;
+	} else {
+		smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+		smc->clcsock->sk->sk_reuseport = sk->sk_reuseport;
+		rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+	}
+
+	release_sock(sk);
+	return rc;
+}
+
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ *
+ * SK_FLAGS_SMC_TO_CLC is the sk_flags bit mask handed to
+ * smc_copy_sock_settings() for that direction; smc_adjust_sock_bufsizes()
+ * also compares its mask argument against it to pick the TCP sysctl
+ * buffer defaults.
+ */
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED) | \
+ (1UL << SOCK_TSTAMP_NEW))
+
+/* Set the buffer sizes of nsk: a size locked via setsockopt() on osk is
+ * taken over as is; otherwise the IPv4 TCP sysctl default is used when
+ * copying towards the clcsock (mask == SK_FLAGS_SMC_TO_CLC) and twice the
+ * SMC sysctl value in the other direction.
+ */
+static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
+				     unsigned long mask)
+{
+	struct net *nnet = sock_net(nsk);
+	bool to_clc = (mask == SK_FLAGS_SMC_TO_CLC);
+
+	nsk->sk_userlocks = osk->sk_userlocks;
+
+	if (osk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		nsk->sk_sndbuf = osk->sk_sndbuf;
+	else if (to_clc)
+		WRITE_ONCE(nsk->sk_sndbuf,
+			   READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
+	else
+		WRITE_ONCE(nsk->sk_sndbuf,
+			   2 * READ_ONCE(nnet->smc.sysctl_wmem));
+
+	if (osk->sk_userlocks & SOCK_RCVBUF_LOCK)
+		nsk->sk_rcvbuf = osk->sk_rcvbuf;
+	else if (to_clc)
+		WRITE_ONCE(nsk->sk_rcvbuf,
+			   READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
+	else
+		WRITE_ONCE(nsk->sk_rcvbuf,
+			   2 * READ_ONCE(nnet->smc.sysctl_rmem));
+}
+
+/* Copy socket level settings from osk to nsk. Only the sk_flags bits in
+ * @mask are transferred; buffer sizes are then adjusted separately.
+ */
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+				   unsigned long mask)
+{
+	/* options we don't get control via setsockopt for */
+	nsk->sk_type = osk->sk_type;
+	nsk->sk_sndtimeo = osk->sk_sndtimeo;
+	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+	nsk->sk_mark = READ_ONCE(osk->sk_mark);
+	nsk->sk_priority = osk->sk_priority;
+	nsk->sk_rcvlowat = osk->sk_rcvlowat;
+	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+	nsk->sk_err = osk->sk_err;
+
+	/* replace the masked flag bits, keep all others of nsk */
+	nsk->sk_flags = (nsk->sk_flags & ~mask) | (osk->sk_flags & mask);
+
+	smc_adjust_sock_bufsizes(nsk, osk, mask);
+}
+
+/* copy settings and the SK_FLAGS_SMC_TO_CLC flags from smc to clc socket */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+	struct sock *clcsk = smc->clcsock->sk;
+
+	smc_copy_sock_settings(clcsk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_DBG)) /* sk_flags bits copied from clc to smc socket */
+/* copy settings and the SK_FLAGS_CLC_TO_SMC flags from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+	struct sock *clcsk = smc->clcsock->sk;
+
+	smc_copy_sock_settings(&smc->sk, clcsk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* Register a new vzalloced sndbuf on every active link of the link group.
+ * Returns -EINVAL if the buffer is not a vm buffer, otherwise 0 or the
+ * first error reported by smcr_link_reg_buf().
+ */
+static int smcr_lgr_reg_sndbufs(struct smc_link *link,
+				struct smc_buf_desc *snd_desc)
+{
+	struct smc_link_group *lgr = link->lgr;
+	int rc = 0;
+	int i;
+
+	if (!snd_desc->is_vm)
+		return -EINVAL;
+
+	/* protect against parallel smcr_link_reg_buf() */
+	down_write(&lgr->llc_conf_mutex);
+	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+		struct smc_link *lnk = &lgr->lnk[i];
+
+		if (smc_link_active(lnk)) {
+			rc = smcr_link_reg_buf(lnk, snd_desc);
+			if (rc)
+				break;
+		}
+	}
+	up_write(&lgr->llc_conf_mutex);
+
+	return rc;
+}
+
+/* Register a new rmb on all active links of the link group and confirm
+ * its rkey with the peer.
+ *
+ * Fast path: if the rmb is already registered on every active link, only
+ * the read side of llc_conf_mutex is held for the rkey confirmation.
+ * Slow path: otherwise the mutex is taken for writing and the rmb is
+ * registered on each active link first.
+ *
+ * Returns 0 on success, the error of smc_llc_flow_initiate() or
+ * smcr_link_reg_buf(), or -EFAULT if the rkey confirmation fails.
+ */
+static int smcr_lgr_reg_rmbs(struct smc_link *link,
+			     struct smc_buf_desc *rmb_desc)
+{
+	struct smc_link_group *lgr = link->lgr;
+	bool do_slow = false;
+	int i, rc = 0;
+
+	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+	if (rc)
+		return rc;
+
+	down_read(&lgr->llc_conf_mutex);
+	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+		if (!smc_link_active(&lgr->lnk[i]))
+			continue;
+		/* check the link being iterated, not only the caller's
+		 * link: every active link must have the rmb registered
+		 * for the fast path to be valid
+		 */
+		if (!rmb_desc->is_reg_mr[lgr->lnk[i].link_idx]) {
+			up_read(&lgr->llc_conf_mutex);
+			goto slow_path;
+		}
+	}
+	/* mr register already */
+	goto fast_path;
+slow_path:
+	do_slow = true;
+	/* protect against parallel smc_llc_cli_rkey_exchange() and
+	 * parallel smcr_link_reg_buf()
+	 */
+	down_write(&lgr->llc_conf_mutex);
+	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+		if (!smc_link_active(&lgr->lnk[i]))
+			continue;
+		rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
+		if (rc)
+			goto out;
+	}
+fast_path:
+	/* exchange confirm_rkey msg with peer */
+	rc = smc_llc_do_confirm_rkey(link, rmb_desc);
+	if (rc) {
+		rc = -EFAULT;
+		goto out;
+	}
+	rmb_desc->is_conf_rkey = true;
+out:
+	/* drop whichever side of the mutex the chosen path holds */
+	if (do_slow)
+		up_write(&lgr->llc_conf_mutex);
+	else
+		up_read(&lgr->llc_conf_mutex);
+	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+	return rc;
+}
+
+/* Client side of the first-link setup: wait for the server's CONFIRM LINK
+ * request, bring the QP to RTS, register the buffers, answer with a
+ * CONFIRM LINK response and, if more than one link is allowed, handle an
+ * optional ADD LINK request.
+ * Returns 0 on success, an SMC_CLC_DECL_* code, or the rc of
+ * smc_clc_wait_msg() when waiting for an LLC message timed out.
+ */
+static int smcr_clnt_conf_first_link(struct smc_sock *smc)
+{
+	struct smc_link *link = smc->conn.lnk;
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_clc_msg_decline dclc;
+	struct smc_llc_qentry *qentry;
+	int rc;
+
+	/* Receive CONFIRM LINK request from server over RoCE fabric.
+	 * Increasing the client's timeout by twice as much as the server's
+	 * timeout by default can temporarily avoid decline messages of
+	 * both sides crossing or colliding
+	 */
+	qentry = smc_llc_wait(lgr, NULL, 2 * SMC_LLC_WAIT_TIME,
+			      SMC_LLC_CONFIRM_LINK);
+	if (!qentry) {
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+		if (rc == -EAGAIN)
+			return SMC_CLC_DECL_TIMEOUT_CL;
+		return rc;
+	}
+	smc_llc_save_peer_uid(qentry);
+	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	if (rc)
+		return SMC_CLC_DECL_RMBE_EC;
+
+	if (smc_ib_modify_qp_rts(link))
+		return SMC_CLC_DECL_ERR_RDYLNK;
+
+	smc_wr_remember_qp_attr(link);
+
+	/* reg the sndbuf if it was vzalloced */
+	if (smc->conn.sndbuf_desc->is_vm &&
+	    smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
+		return SMC_CLC_DECL_ERR_REGBUF;
+
+	/* reg the rmb */
+	if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
+		return SMC_CLC_DECL_ERR_REGBUF;
+
+	/* confirm_rkey is implicit on 1st contact */
+	smc->conn.rmb_desc->is_conf_rkey = true;
+
+	/* send CONFIRM LINK response over RoCE fabric */
+	if (smc_llc_send_confirm_link(link, SMC_LLC_RESP) < 0)
+		return SMC_CLC_DECL_TIMEOUT_CL;
+
+	smc_llc_link_active(link);
+	smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+
+	if (lgr->max_links <= 1)
+		return 0;
+
+	/* optional 2nd link, receive ADD LINK request from server */
+	qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK);
+	if (!qentry) {
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+		/* no DECLINE received, go with one link */
+		return rc == -EAGAIN ? 0 : rc;
+	}
+	smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+	smc_llc_cli_add_link(link, qentry);
+	return 0;
+}
+
+/* true if all SMC_MAX_HOSTNAME_LEN bytes of hostname are ASCII */
+static bool smc_isascii(char *hostname)
+{
+	char *end = hostname + SMC_MAX_HOSTNAME_LEN;
+	char *p;
+
+	for (p = hostname; p < end; p++) {
+		if (!isascii(*p))
+			return false;
+	}
+	return true;
+}
+
+/* For a V2 first-contact accept/confirm message: store the negotiated EID
+ * and the peer's OS type, release and (if pure ASCII) hostname from the
+ * first contact extension in the link group. The extension is located
+ * right behind the d1 (SMC-D) or r1 (SMC-R) part of the V2 message.
+ */
+static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
+					struct smc_clc_msg_accept_confirm *clc)
+{
+	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+		(struct smc_clc_msg_accept_confirm_v2 *)clc;
+	struct smc_clc_first_contact_ext *fce;
+	struct smc_link_group *lgr;
+	int fce_off;
+
+	if (clc->hdr.version == SMC_V1)
+		return;
+	if (!(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
+		return;
+
+	lgr = smc->conn.lgr;
+	if (lgr->is_smcd) {
+		memcpy(lgr->negotiated_eid, clc_v2->d1.eid, SMC_MAX_EID_LEN);
+		fce_off = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+				      d1);
+	} else {
+		memcpy(lgr->negotiated_eid, clc_v2->r1.eid, SMC_MAX_EID_LEN);
+		fce_off = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+				      r1);
+	}
+
+	fce = (struct smc_clc_first_contact_ext *)((u8 *)clc_v2 + fce_off);
+	lgr->peer_os = fce->os_type;
+	lgr->peer_smc_release = fce->release;
+	if (smc_isascii(fce->hostname))
+		memcpy(lgr->peer_hostname, fce->hostname,
+		       SMC_MAX_HOSTNAME_LEN);
+}
+
+/* SMC-R: take over the peer's RMBE index, alert token and buffer size
+ * from the accept/confirm message and derive the tx offset from them
+ */
+static void smcr_conn_save_peer_info(struct smc_sock *smc,
+				     struct smc_clc_msg_accept_confirm *clc)
+{
+	struct smc_connection *conn = &smc->conn;
+	int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
+
+	conn->peer_rmbe_idx = clc->r0.rmbe_idx;
+	conn->local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
+	conn->peer_rmbe_size = bufsize;
+	atomic_set(&conn->peer_rmbe_space, conn->peer_rmbe_size);
+	conn->tx_off = bufsize * (conn->peer_rmbe_idx - 1);
+}
+
+/* SMC-D: take over the peer's DMBE index, token and buffer size from the
+ * accept/confirm message and derive the tx offset from them
+ */
+static void smcd_conn_save_peer_info(struct smc_sock *smc,
+				     struct smc_clc_msg_accept_confirm *clc)
+{
+	struct smc_connection *conn = &smc->conn;
+	int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
+
+	conn->peer_rmbe_idx = clc->d0.dmbe_idx;
+	conn->peer_token = ntohll(clc->d0.token);
+	/* msg header takes up space in the buffer */
+	conn->peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
+	atomic_set(&conn->peer_rmbe_space, conn->peer_rmbe_size);
+	conn->tx_off = bufsize * conn->peer_rmbe_idx;
+}
+
+/* store the peer's buffer info (SMC-D or SMC-R flavour) followed by the
+ * first contact extension data, if any
+ */
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+				    struct smc_clc_msg_accept_confirm *clc)
+{
+	if (!smc->conn.lgr->is_smcd)
+		smcr_conn_save_peer_info(smc, clc);
+	else
+		smcd_conn_save_peer_info(smc, clc);
+
+	smc_conn_save_peer_info_fce(smc, clc);
+}
+
+/* remember the peer's QP number, PSN and MTU from the accept/confirm
+ * message and its GID and MAC from the init info
+ */
+static void smc_link_save_peer_info(struct smc_link *link,
+				    struct smc_clc_msg_accept_confirm *clc,
+				    struct smc_init_info *ini)
+{
+	/* values carried in the CLC message */
+	link->peer_qpn = ntoh24(clc->r0.qpn);
+	link->peer_psn = ntoh24(clc->r0.psn);
+	link->peer_mtu = clc->r0.qp_mtu;
+
+	/* values collected in ini */
+	memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
+}
+
+/* Count smc->fallback_rsn in fback_arr: bump the entry already holding
+ * this code, or claim the first unused entry (fback_code == 0) for it.
+ * Nothing is counted if all SMC_MAX_FBACK_RSN_CNT entries hold other codes.
+ */
+static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
+				       struct smc_stats_fback *fback_arr)
+{
+	int idx;
+
+	for (idx = 0; idx < SMC_MAX_FBACK_RSN_CNT; idx++) {
+		struct smc_stats_fback *slot = &fback_arr[idx];
+
+		if (slot->fback_code != smc->fallback_rsn) {
+			if (slot->fback_code)
+				continue; /* taken by another reason */
+			slot->fback_code = smc->fallback_rsn;
+		}
+		slot->count++;
+		return;
+	}
+}
+
+/* account a fallback in the per-netns statistics, separately for server
+ * (listen_smc set) and client sockets, under mutex_fback_rsn
+ */
+static void smc_stat_fallback(struct smc_sock *smc)
+{
+	struct net *net = sock_net(&smc->sk);
+
+	mutex_lock(&net->smc.mutex_fback_rsn);
+	if (!smc->listen_smc) {
+		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
+		net->smc.fback_rsn->clnt_fback_cnt++;
+	} else {
+		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
+		net->smc.fback_rsn->srv_fback_cnt++;
+	}
+	mutex_unlock(&net->smc.mutex_fback_rsn);
+}
+
+/* Wake up sleepers on the smc socket's wait queue. A NULL key wakes all
+ * of them (state change); otherwise the poll flags taken from key select
+ * the kind of wakeup. Must be called under rcu read lock.
+ */
+static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
+{
+	struct socket_wq *wq = rcu_dereference(smc->sk.sk_wq);
+	__poll_t flags;
+
+	if (!skwq_has_sleeper(wq))
+		return;
+
+	/* wake up smc sk->sk_wq */
+	if (!key) {
+		/* sk_state_change */
+		wake_up_interruptible_all(&wq->wait);
+		return;
+	}
+
+	flags = key_to_poll(key);
+	if (flags & (EPOLLIN | EPOLLOUT)) {
+		/* sk_data_ready or sk_write_space */
+		wake_up_interruptible_sync_poll(&wq->wait, flags);
+	} else if (flags & EPOLLERR) {
+		/* sk_error_report */
+		wake_up_interruptible_poll(&wq->wait, flags);
+	}
+}
+
+/* wait queue function of the marker entry: only record that a wakeup
+ * happened and with which key
+ */
+static int smc_fback_mark_woken(wait_queue_entry_t *wait,
+				unsigned int mode, int sync, void *key)
+{
+	struct smc_mark_woken *marker;
+
+	marker = container_of(wait, struct smc_mark_woken, wait_entry);
+	marker->key = key;
+	marker->woken = true;
+	return 0;
+}
+
+/* Run the saved clcsock callback with a marker entry hooked into the
+ * clcsock's wait queue. If the callback woke the marker, forward the
+ * wakeup (with its key) to the smc socket's wait queue.
+ */
+static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
+				     void (*clcsock_callback)(struct sock *sk))
+{
+	struct smc_mark_woken marker = { .woken = false };
+	struct socket_wq *clc_wq;
+
+	init_waitqueue_func_entry(&marker.wait_entry, smc_fback_mark_woken);
+
+	rcu_read_lock();
+	clc_wq = rcu_dereference(clcsk->sk_wq);
+	if (clc_wq) {
+		add_wait_queue(sk_sleep(clcsk), &marker.wait_entry);
+		clcsock_callback(clcsk);
+		remove_wait_queue(sk_sleep(clcsk), &marker.wait_entry);
+
+		if (marker.woken)
+			smc_fback_wakeup_waitqueue(smc, marker.key);
+	}
+	rcu_read_unlock();
+}
+
+/* clcsock sk_state_change replacement used after fallback: forward the
+ * wakeup to the smc socket, if one is still attached
+ */
+static void smc_fback_state_change(struct sock *clcsk)
+{
+	struct smc_sock *owner;
+
+	read_lock_bh(&clcsk->sk_callback_lock);
+	owner = smc_clcsock_user_data(clcsk);
+	if (owner)
+		smc_fback_forward_wakeup(owner, clcsk,
+					 owner->clcsk_state_change);
+	read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+/* clcsock sk_data_ready replacement used after fallback: forward the
+ * wakeup to the smc socket, if one is still attached
+ */
+static void smc_fback_data_ready(struct sock *clcsk)
+{
+	struct smc_sock *owner;
+
+	read_lock_bh(&clcsk->sk_callback_lock);
+	owner = smc_clcsock_user_data(clcsk);
+	if (owner)
+		smc_fback_forward_wakeup(owner, clcsk,
+					 owner->clcsk_data_ready);
+	read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+/* clcsock sk_write_space replacement used after fallback: forward the
+ * wakeup to the smc socket, if one is still attached
+ */
+static void smc_fback_write_space(struct sock *clcsk)
+{
+	struct smc_sock *owner;
+
+	read_lock_bh(&clcsk->sk_callback_lock);
+	owner = smc_clcsock_user_data(clcsk);
+	if (owner)
+		smc_fback_forward_wakeup(owner, clcsk,
+					 owner->clcsk_write_space);
+	read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+/* clcsock sk_error_report replacement used after fallback: forward the
+ * wakeup to the smc socket, if one is still attached
+ */
+static void smc_fback_error_report(struct sock *clcsk)
+{
+	struct smc_sock *owner;
+
+	read_lock_bh(&clcsk->sk_callback_lock);
+	owner = smc_clcsock_user_data(clcsk);
+	if (owner)
+		smc_fback_forward_wakeup(owner, clcsk,
+					 owner->clcsk_error_report);
+	read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+/* Attach smc to the clcsock (sk_user_data, tagged SK_USER_DATA_NOCOPY)
+ * and swap in the smc_fback_* callbacks, saving the previous ones in smc.
+ * Done under the clcsock's sk_callback_lock.
+ */
+static void smc_fback_replace_callbacks(struct smc_sock *smc)
+{
+	struct sock *tcpsk = smc->clcsock->sk;
+
+	write_lock_bh(&tcpsk->sk_callback_lock);
+
+	tcpsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+	smc_clcsock_replace_cb(&tcpsk->sk_state_change,
+			       smc_fback_state_change,
+			       &smc->clcsk_state_change);
+	smc_clcsock_replace_cb(&tcpsk->sk_data_ready,
+			       smc_fback_data_ready,
+			       &smc->clcsk_data_ready);
+	smc_clcsock_replace_cb(&tcpsk->sk_write_space,
+			       smc_fback_write_space,
+			       &smc->clcsk_write_space);
+	smc_clcsock_replace_cb(&tcpsk->sk_error_report,
+			       smc_fback_error_report,
+			       &smc->clcsk_error_report);
+
+	write_unlock_bh(&tcpsk->sk_callback_lock);
+}
+
+/* Switch the socket to fallback mode: record the reason, update the
+ * statistics and, if the smc socket already has a file, let that file
+ * point to the clcsock and redirect the clcsock callbacks.
+ * Returns 0, or -EBADF if the clcsock is already gone.
+ */
+static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
+{
+	struct socket *smcsock;
+
+	mutex_lock(&smc->clcsock_release_lock);
+	if (!smc->clcsock) {
+		mutex_unlock(&smc->clcsock_release_lock);
+		return -EBADF;
+	}
+
+	smc->use_fallback = true;
+	smc->fallback_rsn = reason_code;
+	smc_stat_fallback(smc);
+	trace_smc_switch_to_fallback(smc, reason_code);
+
+	smcsock = smc->sk.sk_socket;
+	if (smcsock && smcsock->file) {
+		struct socket *clcsock = smc->clcsock;
+
+		clcsock->file = smcsock->file;
+		clcsock->file->private_data = clcsock;
+		clcsock->wq.fasync_list = smcsock->wq.fasync_list;
+
+		/* There might be some wait entries remaining
+		 * in smc sk->sk_wq and they should be woken up
+		 * as clcsock's wait queue is woken up.
+		 */
+		smc_fback_replace_callbacks(smc);
+	}
+
+	mutex_unlock(&smc->clcsock_release_lock);
+	return 0;
+}
+
+/* fall back during connect */
+static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
+{
+ struct net *net = sock_net(&smc->sk);
+ int rc = 0;
+
+ rc = smc_switch_to_fallback(smc, reason_code);
+ if (rc) { /* fallback fails */
+ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return rc;
+ }
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+ return 0;
+}
+
+/* Decline and fall back during connect. A negative reason_code is an
+ * error for which no fallback is possible. Unless the peer declined
+ * itself (SMC_CLC_DECL_PEERDECL), a decline is sent before falling back.
+ */
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+					u8 version)
+{
+	struct net *net = sock_net(&smc->sk);
+	int rc;
+
+	if (reason_code < 0) {
+		/* error, fallback is not possible */
+		rc = reason_code;
+		goto hshake_err;
+	}
+
+	if (reason_code != SMC_CLC_DECL_PEERDECL) {
+		rc = smc_clc_send_decline(smc, reason_code, version);
+		if (rc < 0)
+			goto hshake_err;
+	}
+
+	return smc_connect_fallback(smc, reason_code);
+
+hshake_err:
+	this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
+	if (smc->sk.sk_state == SMC_INIT)
+		sock_put(&smc->sk); /* passive closing */
+	return rc;
+}
+
+/* Free the connection; if this was the first contact and the link group
+ * was valid before the free, clean the link group up early as well.
+ */
+static void smc_conn_abort(struct smc_sock *smc, int local_first)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_link_group *lgr = conn->lgr;
+	/* must be sampled before smc_conn_free() */
+	bool lgr_was_valid = smc_conn_lgr_valid(conn) ? true : false;
+
+	smc_conn_free(conn);
+
+	if (lgr_was_valid && local_first)
+		smc_lgr_cleanup_early(lgr);
+}
+
+/* Check if there is a rdma device available for this connection; called
+ * for connect and listen. With ini->check_smcrv2 set the SMC-Rv2 device
+ * is looked at, otherwise the V1 device.
+ * Returns 0 if a device was found, else SMC_CLC_DECL_NOSMCRDEV.
+ */
+static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
+{
+	bool found;
+
+	/* PNET table look up: search active ib_device and port
+	 * within same PNETID that also contains the ethernet device
+	 * used for the internal TCP socket
+	 */
+	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
+
+	if (ini->check_smcrv2)
+		found = ini->smcrv2.ib_dev_v2 ? true : false;
+	else
+		found = ini->ib_dev ? true : false;
+
+	return found ? 0 : SMC_CLC_DECL_NOSMCRDEV;
+}
+
+/* Check if there is an ISM device available for this connection; called
+ * for connect and listen. On success the CHID of the device is stored in
+ * ini->ism_chid[0].
+ * Returns 0 if a device was found, else SMC_CLC_DECL_NOSMCDDEV.
+ */
+static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
+{
+	/* Find ISM device with same PNETID as connecting interface */
+	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
+
+	if (!ini->ism_dev[0])
+		return SMC_CLC_DECL_NOSMCDDEV;
+
+	ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
+	return 0;
+}
+
+/* Is chid unique among the ism devices determined so far (entries below
+ * cnt)? Entry 0 is only compared when a device is stored there.
+ */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+					   int cnt)
+{
+	int idx = ini->ism_dev[0] ? 0 : 1;
+
+	while (idx < cnt) {
+		if (ini->ism_chid[idx] == chid)
+			return false;
+		idx++;
+	}
+	return true;
+}
+
+/* Determine possible V2 ISM devices (either without PNETID or with PNETID
+ * plus PNETID matching net_device) and store them, with their CHIDs, in
+ * ini->ism_dev[1..] / ini->ism_chid[1..].
+ * Returns 0 if SMC-D is already indicated for V1 or at least one V2
+ * device was found, else SMC_CLC_DECL_NOSMCDDEV.
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+				       struct smc_init_info *ini)
+{
+	/* rc 0 up front: already initialized for V1 */
+	int rc = smcd_indicated(ini->smc_type_v1) ? 0 : SMC_CLC_DECL_NOSMCDDEV;
+	struct smcd_dev *smcd;
+	int next = 1;
+	u16 chid;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+		if (smcd->going_away || smcd == ini->ism_dev[0])
+			continue;
+		chid = smc_ism_get_chid(smcd);
+		if (!smc_find_ism_v2_is_unique_chid(chid, ini, next))
+			continue;
+		/* a device with PNETID must match a net_device's PNETID */
+		if (smc_pnet_is_pnetid_set(smcd->pnetid) &&
+		    !smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid))
+			continue;
+
+		ini->ism_dev[next] = smcd;
+		ini->ism_chid[next] = chid;
+		ini->is_smcd = true;
+		rc = 0;
+		next++;
+		if (next > SMC_MAX_ISM_DEVS)
+			break;
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	ini->ism_offered_cnt = next - 1;
+	if (!ini->ism_dev[0] && !ini->ism_dev[1])
+		ini->smcd_version = 0;
+
+	return rc;
+}
+
+/* Check for VLAN ID and register it on ISM device just for CLC handshake.
+ * Returns 0, or SMC_CLC_DECL_ISMVLANERR if the registration fails.
+ */
+static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
+				      struct smc_init_info *ini)
+{
+	if (!ini->vlan_id)
+		return 0;
+
+	if (smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
+		return SMC_CLC_DECL_ISMVLANERR;
+
+	return 0;
+}
+
+/* Determine which SMC variants (SMC-D/SMC-R, V1/V2) can be offered in the CLC
+ * proposal. Every check that fails clears the matching version bit in
+ * ini->smcd_version or ini->smcr_version; ini->smc_type_v1 and
+ * ini->smc_type_v2 are then derived from the bits that are left.
+ *
+ * Returns 0 if at least one variant remains, SMC_CLC_DECL_NOSMCDEV if both
+ * types end up as SMC_TYPE_N.
+ */
+static int smc_find_proposal_devices(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* check if there is an ism device available */
+ if (!(ini->smcd_version & SMC_V1) ||
+ smc_find_ism_device(smc, ini) ||
+ smc_connect_ism_vlan_setup(smc, ini))
+ ini->smcd_version &= ~SMC_V1;
+ /* else ISM V1 is supported for this connection */
+
+ /* check if there is an rdma device available */
+ if (!(ini->smcr_version & SMC_V1) ||
+ smc_find_rdma_device(smc, ini))
+ ini->smcr_version &= ~SMC_V1;
+ /* else RDMA is supported for this connection */
+
+ ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
+ ini->smcr_version & SMC_V1);
+
+ /* check if there is an ism v2 device available */
+ if (!(ini->smcd_version & SMC_V2) ||
+ !smc_ism_is_v2_capable() ||
+ smc_find_ism_v2_device_clnt(smc, ini))
+ ini->smcd_version &= ~SMC_V2;
+
+ /* check if there is an rdma v2 device available */
+ /* check_smcrv2 is set only for the duration of this V2 lookup */
+ ini->check_smcrv2 = true;
+ ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
+ /* SMC-R V2 is only offered for AF_INET clcsocks with a UEID defined */
+ if (!(ini->smcr_version & SMC_V2) ||
+ smc->clcsock->sk->sk_family != AF_INET ||
+ !smc_clc_ueid_count() ||
+ smc_find_rdma_device(smc, ini))
+ ini->smcr_version &= ~SMC_V2;
+ ini->check_smcrv2 = false;
+
+ ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
+ ini->smcr_version & SMC_V2);
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+ rc = SMC_CLC_DECL_NOSMCDEV;
+
+ return rc;
+}
+
+/* Drop the temporary VLAN ID registration that was made for the CLC
+ * handshake. If ISM ends up being used, the VLAN ID is registered again
+ * during connection setup.
+ *
+ * Returns SMC_CLC_DECL_CNFERR if smc_ism_put_vlan() fails, 0 in every other
+ * case (including SMC-D V1 not indicated or no VLAN ID set).
+ */
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
+					struct smc_init_info *ini)
+{
+	if (smcd_indicated(ini->smc_type_v1) && ini->vlan_id &&
+	    smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
+		return SMC_CLC_DECL_CNFERR;
+	return 0;
+}
+
+/* Size of the buffer that receives a CLC accept message: a V2 accept message
+ * plus a v2x first-contact extension plus the message trailer.
+ */
+#define SMC_CLC_MAX_ACCEPT_LEN \
+ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+ sizeof(struct smc_clc_first_contact_ext_v2x) + \
+ sizeof(struct smc_clc_msg_trail))
+
+/* Client side of the CLC handshake: send the proposal, then wait for the
+ * peer's accept message.
+ *
+ * @aclc2 must point to a buffer of at least SMC_CLC_MAX_ACCEPT_LEN bytes.
+ * Returns the non-zero result of smc_clc_send_proposal() if sending fails,
+ * otherwise the result of smc_clc_wait_msg().
+ */
+static int smc_connect_clc(struct smc_sock *smc,
+			   struct smc_clc_msg_accept_confirm_v2 *aclc2,
+			   struct smc_init_info *ini)
+{
+	/* do inband token exchange */
+	int rc = smc_clc_send_proposal(smc, ini);
+
+	if (rc)
+		return rc;
+
+	/* receive SMC Accept CLC message */
+	return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
+}
+
+/* Build the GID list for a link group: @known_gid always becomes the first
+ * entry; a second entry is added if smc_pnet_find_alt_roce() yields an
+ * alternate V2 RoCE device besides @known_dev.
+ *
+ * If the temporary lookup structure cannot be allocated, the list is left
+ * with the single known GID.
+ */
+void smc_fill_gid_list(struct smc_link_group *lgr,
+		       struct smc_gidlist *gidlist,
+		       struct smc_ib_device *known_dev, u8 *known_gid)
+{
+	struct smc_init_info *alt;
+
+	memset(gidlist, 0, sizeof(*gidlist));
+	memcpy(gidlist->list[gidlist->len], known_gid, SMC_GID_SIZE);
+	gidlist->len++;
+
+	alt = kzalloc(sizeof(*alt), GFP_KERNEL);
+	if (!alt)
+		return;
+
+	alt->vlan_id = lgr->vlan_id;
+	alt->check_smcrv2 = true;
+	alt->smcrv2.saddr = lgr->saddr;
+	smc_pnet_find_alt_roce(lgr, alt, known_dev);
+
+	if (alt->smcrv2.ib_dev_v2) {
+		memcpy(gidlist->list[gidlist->len], alt->smcrv2.ib_gid_v2,
+		       SMC_GID_SIZE);
+		gidlist->len++;
+	}
+
+	kfree(alt);
+}
+
+/* SMC-R V2 specific preparation on the client, done before the connection is
+ * created. Nothing is done unless the peer signalled first contact and the
+ * accept message is newer than V1.
+ *
+ * Determines the next-hop MAC (directly from the accept message, or through
+ * a route lookup), stores the peer's release number and validates the v2.x
+ * features of the first-contact extension.
+ *
+ * Returns 0 on success, SMC_CLC_DECL_NOROUTE / SMC_CLC_DECL_NOINDIRECT for
+ * routing problems, or the result of smc_clc_clnt_v2x_features_validate().
+ */
+static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
+				       struct smc_clc_msg_accept_confirm *aclc,
+				       struct smc_init_info *ini)
+{
+	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+		(struct smc_clc_msg_accept_confirm_v2 *)aclc;
+	struct smc_clc_first_contact_ext *fce =
+		smc_get_clc_first_contact_ext(clc_v2, false);
+	struct net *net = sock_net(&smc->sk);
+
+	if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
+		return 0;
+
+	if (fce->v2_direct) {
+		memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
+		ini->smcrv2.uses_gateway = false;
+	} else if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr,
+				     smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
+				     ini->smcrv2.nexthop_mac,
+				     &ini->smcrv2.uses_gateway)) {
+		return SMC_CLC_DECL_NOROUTE;
+	} else if (!ini->smcrv2.uses_gateway) {
+		/* mismatch: peer claims indirect, but it is direct */
+		return SMC_CLC_DECL_NOINDIRECT;
+	}
+
+	ini->release_nr = fce->release;
+	return smc_clc_clnt_v2x_features_validate(fce, ini);
+}
+
+/* setup for RDMA connection of client
+ *
+ * Takes the peer's parameters from the received CLC accept message @aclc,
+ * creates the connection, picks the link, creates and registers the buffers,
+ * sends the CLC confirm and, on local first contact, confirms the first link.
+ * Everything from smc_conn_create() on runs under smc_client_lgr_pending.
+ *
+ * Returns 0 on success, otherwise a non-zero reason/error code; failures
+ * after a successful smc_conn_create() go through smc_conn_abort().
+ */
+static int smc_connect_rdma(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_init_info *ini)
+{
+ int i, reason_code = 0;
+ struct smc_link *link;
+ u8 *eid = NULL;
+
+ /* copy the peer's identification out of the accept message */
+ ini->is_smcd = false;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+ memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
+ memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
+ ini->max_conns = SMC_CONN_PER_LGR_MAX;
+ ini->max_links = SMC_LINKS_ADD_LNK_MAX;
+
+ reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
+ if (reason_code)
+ return reason_code;
+
+ mutex_lock(&smc_client_lgr_pending);
+ reason_code = smc_conn_create(smc, ini);
+ if (reason_code) {
+ mutex_unlock(&smc_client_lgr_pending);
+ return reason_code;
+ }
+
+ smc_conn_save_peer_info(smc, aclc);
+
+ if (ini->first_contact_local) {
+ link = smc->conn.lnk;
+ } else {
+ /* set link that was assigned by server */
+ link = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *l = &smc->conn.lgr->lnk[i];
+
+ /* QP number and GID must match; the MAC is only compared for V1 */
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ (aclc->hdr.version > SMC_V1 ||
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac)))) {
+ link = l;
+ break;
+ }
+ }
+ if (!link) {
+ reason_code = SMC_CLC_DECL_NOSRVLINK;
+ goto connect_abort;
+ }
+ smc_switch_link_and_count(&smc->conn, link);
+ }
+
+ /* create send buffer and rmb */
+ if (smc_buf_create(smc, false)) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
+
+ if (ini->first_contact_local)
+ smc_link_save_peer_info(link, aclc, ini);
+
+ if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
+ goto connect_abort;
+ }
+
+ smc_close_init(smc);
+ smc_rx_init(smc);
+
+ if (ini->first_contact_local) {
+ if (smc_ib_ready_link(link)) {
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+ goto connect_abort;
+ }
+ } else {
+ /* reg sendbufs if they were vzalloced */
+ if (smc->conn.sndbuf_desc->is_vm) {
+ if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGBUF;
+ goto connect_abort;
+ }
+ }
+ if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGBUF;
+ goto connect_abort;
+ }
+ }
+
+ /* newer than V1: pass the EID from the accept message to the confirm */
+ if (aclc->hdr.version > SMC_V1) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ eid = clc_v2->r1.eid;
+ if (ini->first_contact_local)
+ smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
+ link->smcibdev, link->gid);
+ }
+
+ reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version, eid, ini);
+ if (reason_code)
+ goto connect_abort;
+
+ smc_tx_init(smc);
+
+ if (ini->first_contact_local) {
+ /* QP confirmation over RoCE fabric */
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_clnt_conf_first_link(smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
+ if (reason_code)
+ goto connect_abort;
+ }
+ mutex_unlock(&smc_client_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+connect_abort:
+ /* undo the connection setup while still holding the pending mutex */
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_client_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return reason_code;
+}
+
+/* The server has chosen one of the proposed ISM devices for the communication.
+ * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
+ *
+ * Slots of ini->ism_dev[] that hold no device are skipped. ini is
+ * zero-allocated, so an unused slot (e.g. slot 0 when no V1 ISM device was
+ * found) carries CHID 0 and would otherwise be selected by an accept message
+ * that announces CHID 0, leaving ini->ism_selected pointing at a NULL device.
+ *
+ * Returns 0 with ini->ism_selected set to the matching slot, or -EPROTO if
+ * the accepted CHID matches none of the offered devices.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+			       struct smc_init_info *ini)
+{
+	u16 accepted_chid = ntohs(aclc->d1.chid);
+	int i;
+
+	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+		if (!ini->ism_dev[i])
+			continue;	/* nothing was offered from this slot */
+		if (ini->ism_chid[i] == accepted_chid) {
+			ini->ism_selected = i;
+			return 0;
+		}
+	}
+
+	return -EPROTO;
+}
+
+/* setup for ISM connection of client
+ *
+ * For a V2 accept message the first-contact extension is validated (when the
+ * peer signalled first contact) and the ISM device chosen by the server is
+ * determined from the accepted CHID. Then the connection and its buffers are
+ * created under smc_server_lgr_pending and the CLC confirm is sent.
+ *
+ * Returns 0 on success, otherwise a non-zero reason/error code; failures
+ * after a successful smc_conn_create() go through smc_conn_abort().
+ */
+static int smc_connect_ism(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_init_info *ini)
+{
+ u8 *eid = NULL;
+ int rc = 0;
+
+ ini->is_smcd = true;
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ if (aclc->hdr.version == SMC_V2) {
+ struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ if (ini->first_contact_peer) {
+ struct smc_clc_first_contact_ext *fce =
+ smc_get_clc_first_contact_ext(aclc_v2, true);
+
+ ini->release_nr = fce->release;
+ rc = smc_clc_clnt_v2x_features_validate(fce, ini);
+ if (rc)
+ return rc;
+ }
+
+ /* sets ini->ism_selected to the slot the server picked */
+ rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+ if (rc)
+ return rc;
+ }
+ ini->ism_peer_gid[ini->ism_selected] = ntohll(aclc->d0.gid);
+
+ /* there is only one lgr role for SMC-D; use server lock */
+ mutex_lock(&smc_server_lgr_pending);
+ rc = smc_conn_create(smc, ini);
+ if (rc) {
+ mutex_unlock(&smc_server_lgr_pending);
+ return rc;
+ }
+
+ /* Create send and receive buffers */
+ rc = smc_buf_create(smc, true);
+ if (rc) {
+ /* map the errno onto a CLC decline reason */
+ rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
+
+ smc_conn_save_peer_info(smc, aclc);
+ smc_close_init(smc);
+ smc_rx_init(smc);
+ smc_tx_init(smc);
+
+ /* newer than V1: pass the EID from the accept message to the confirm */
+ if (aclc->hdr.version > SMC_V1) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ eid = clc_v2->d1.eid;
+ }
+
+ rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version, eid, ini);
+ if (rc)
+ goto connect_abort;
+ mutex_unlock(&smc_server_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+connect_abort:
+ /* undo the connection setup while still holding the pending mutex */
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_server_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return rc;
+}
+
+/* Verify that the type (SMC-R or SMC-D) and version of the received accept
+ * message correspond to something that was proposed: accepts with version
+ * SMC_V2 or higher are checked against ini->smc_type_v2, all others against
+ * ini->smc_type_v1.
+ *
+ * Returns 0 on a match, SMC_CLC_DECL_MODEUNSUPP otherwise.
+ */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+				  struct smc_clc_msg_accept_confirm *aclc)
+{
+	bool is_smcr = aclc->hdr.typev1 == SMC_TYPE_R;
+	bool is_smcd = aclc->hdr.typev1 == SMC_TYPE_D;
+	bool proposed;
+
+	if (!is_smcr && !is_smcd)
+		return SMC_CLC_DECL_MODEUNSUPP;
+
+	if (aclc->hdr.version >= SMC_V2) {
+		if (is_smcr)
+			proposed = smcr_indicated(ini->smc_type_v2);
+		else
+			proposed = smcd_indicated(ini->smc_type_v2);
+	} else {
+		if (is_smcr)
+			proposed = smcr_indicated(ini->smc_type_v1);
+		else
+			proposed = smcd_indicated(ini->smc_type_v1);
+	}
+
+	return proposed ? 0 : SMC_CLC_DECL_MODEUNSUPP;
+}
+
+/* perform steps before actually connecting
+ *
+ * Falls back to plain TCP right away if fallback was already decided, the
+ * peer did not signal SMC capability, or IPSec is in use. Otherwise the
+ * devices to propose are determined, the CLC handshake is performed and the
+ * connection is set up through smc_connect_rdma() or smc_connect_ism(),
+ * depending on the type in the accept message.
+ *
+ * Returns 0 on success; every failure path returns the result of
+ * smc_connect_fallback() or smc_connect_decline_fallback().
+ */
+static int __smc_connect(struct smc_sock *smc)
+{
+ u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
+ struct smc_clc_msg_accept_confirm_v2 *aclc2;
+ struct smc_clc_msg_accept_confirm *aclc;
+ struct smc_init_info *ini = NULL;
+ u8 *buf = NULL;
+ int rc = 0;
+
+ if (smc->use_fallback)
+ return smc_connect_fallback(smc, smc->fallback_rsn);
+
+ /* if peer has not signalled SMC-capability, fall back */
+ if (!tcp_sk(smc->clcsock->sk)->syn_smc)
+ return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
+
+ /* IPSec connections opt out of SMC optimizations */
+ if (using_ipsec(smc))
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+ version);
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+ version);
+
+ /* start with everything enabled; the checks below clear bits */
+ ini->smcd_version = SMC_V1 | SMC_V2;
+ ini->smcr_version = SMC_V1 | SMC_V2;
+ ini->smc_type_v1 = SMC_TYPE_B;
+ ini->smc_type_v2 = SMC_TYPE_B;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+ /* without a VLAN ID only SMC-D V2 remains a candidate */
+ ini->smcd_version &= ~SMC_V1;
+ ini->smcr_version = 0;
+ ini->smc_type_v1 = SMC_TYPE_N;
+ if (!ini->smcd_version) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto fallback;
+ }
+ }
+
+ rc = smc_find_proposal_devices(smc, ini);
+ if (rc)
+ goto fallback;
+
+ buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto fallback;
+ }
+ /* V1 and V2 views of the same receive buffer */
+ aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+ aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
+
+ /* perform CLC handshake */
+ rc = smc_connect_clc(smc, aclc2, ini);
+ if (rc) {
+ /* -EAGAIN on timeout, see tcp_recvmsg() */
+ if (rc == -EAGAIN) {
+ rc = -ETIMEDOUT;
+ smc->sk.sk_err = ETIMEDOUT;
+ }
+ goto vlan_cleanup;
+ }
+
+ /* check if smc modes and versions of CLC proposal and accept match */
+ rc = smc_connect_check_aclc(ini, aclc);
+ /* from here on use the version of the accept message, also for a decline */
+ version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+ if (rc)
+ goto vlan_cleanup;
+
+ /* depending on previous steps, connect using rdma or ism */
+ if (aclc->hdr.typev1 == SMC_TYPE_R) {
+ ini->smcr_version = version;
+ rc = smc_connect_rdma(smc, aclc, ini);
+ } else if (aclc->hdr.typev1 == SMC_TYPE_D) {
+ ini->smcd_version = version;
+ rc = smc_connect_ism(smc, aclc, ini);
+ }
+ if (rc)
+ goto vlan_cleanup;
+
+ SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+ kfree(ini);
+ return 0;
+
+vlan_cleanup:
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+fallback:
+ kfree(ini);
+ return smc_connect_decline_fallback(smc, rc, version);
+}
+
+/* Worker for a non-blocking connect: wait until the TCP connect on the
+ * clcsock has finished, then run __smc_connect() and notify waiters on the
+ * SMC socket.
+ */
+static void smc_connect_work(struct work_struct *work)
+{
+ struct smc_sock *smc = container_of(work, struct smc_sock,
+ connect_work);
+ long timeo = smc->sk.sk_sndtimeo;
+ int rc = 0;
+
+ /* a send timeout of 0 is turned into the maximum wait time */
+ if (!timeo)
+ timeo = MAX_SCHEDULE_TIMEOUT;
+ lock_sock(smc->clcsock->sk);
+ if (smc->clcsock->sk->sk_err) {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ } else if ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
+ /* -EPIPE is not an error if the clcsock is connected by now */
+ if ((rc == -EPIPE) &&
+ ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
+ rc = 0;
+ }
+ release_sock(smc->clcsock->sk);
+ lock_sock(&smc->sk);
+ if (rc != 0 || smc->sk.sk_err) {
+ /* TCP connect failed: close the SMC socket and record the error */
+ smc->sk.sk_state = SMC_CLOSED;
+ if (rc == -EPIPE || rc == -EAGAIN)
+ smc->sk.sk_err = EPIPE;
+ else if (rc == -ECONNREFUSED)
+ smc->sk.sk_err = ECONNREFUSED;
+ else if (signal_pending(current))
+ smc->sk.sk_err = -sock_intr_errno(timeo);
+ sock_put(&smc->sk); /* passive closing */
+ goto out;
+ }
+
+ rc = __smc_connect(smc);
+ if (rc < 0)
+ smc->sk.sk_err = -rc;
+
+out:
+ if (!sock_flag(&smc->sk, SOCK_DEAD)) {
+ if (smc->sk.sk_err) {
+ smc->sk.sk_state_change(&smc->sk);
+ } else { /* allow polling before and after fallback decision */
+ smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
+ smc->sk.sk_write_space(&smc->sk);
+ }
+ }
+ release_sock(&smc->sk);
+}
+
+/* Connect an SMC socket: start the TCP connect on the internal clcsock and
+ * then either run the SMC handshake directly (blocking case) or queue
+ * smc->connect_work on smc_hs_wq (O_NONBLOCK).
+ *
+ * Returns 0 when connected, -EINPROGRESS when the non-blocking handshake was
+ * handed to the worker, or a negative errno.
+ */
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ /* separate smc parameter checking to be safe */
+ if (alen < sizeof(addr->sa_family))
+ goto out_err;
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+ goto out_err;
+
+ lock_sock(sk);
+ /* first check the state of the struct socket ... */
+ switch (sock->state) {
+ default:
+ rc = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
+ goto out;
+ case SS_CONNECTING:
+ if (sk->sk_state == SMC_ACTIVE)
+ goto connected;
+ break;
+ case SS_UNCONNECTED:
+ sock->state = SS_CONNECTING;
+ break;
+ }
+
+ /* ... then the state of the SMC sock; only SMC_INIT may proceed */
+ switch (sk->sk_state) {
+ default:
+ goto out;
+ case SMC_CLOSED:
+ rc = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ goto out;
+ case SMC_ACTIVE:
+ rc = -EISCONN;
+ goto out;
+ case SMC_INIT:
+ break;
+ }
+
+ smc_copy_sock_settings_to_clc(smc);
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ /* a non-blocking connect has already been queued */
+ if (smc->connect_nonblock) {
+ rc = -EALREADY;
+ goto out;
+ }
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc && rc != -EINPROGRESS)
+ goto out;
+
+ if (smc->use_fallback) {
+ sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
+ goto out;
+ }
+ sock_hold(&smc->sk); /* sock put in passive closing */
+ if (flags & O_NONBLOCK) {
+ if (queue_work(smc_hs_wq, &smc->connect_work))
+ smc->connect_nonblock = 1;
+ rc = -EINPROGRESS;
+ goto out;
+ } else {
+ rc = __smc_connect(smc);
+ if (rc < 0)
+ goto out;
+ }
+
+connected:
+ rc = 0;
+ sock->state = SS_CONNECTED;
+out:
+ release_sock(sk);
+out_err:
+ return rc;
+}
+
+/* Accept a connection on the listen socket's clcsock and wrap it in a newly
+ * allocated SMC sock.
+ *
+ * The listen sock lock is dropped on entry and held again on return.
+ * On success *new_smc points to the new SMC sock with its clcsock set; on
+ * failure, or if the listen sock was closed meanwhile, *new_smc is NULL.
+ * Returns the result of kernel_accept(), -ENOMEM if no sock could be
+ * allocated, or -EINVAL if the listen socket has no clcsock any more.
+ */
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+ struct socket *new_clcsock = NULL;
+ struct sock *lsk = &lsmc->sk;
+ struct sock *new_sk;
+ int rc = -EINVAL;
+
+ release_sock(lsk);
+ new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsk->sk_err = ENOMEM;
+ *new_smc = NULL;
+ lock_sock(lsk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ /* clcsock_release_lock guards against the clcsock going away */
+ mutex_lock(&lsmc->clcsock_release_lock);
+ if (lsmc->clcsock)
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
+ mutex_unlock(&lsmc->clcsock_release_lock);
+ lock_sock(lsk);
+ if (rc < 0 && rc != -EAGAIN)
+ lsk->sk_err = -rc;
+ if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
+ /* tear down the sock that was just allocated */
+ new_sk->sk_prot->unhash(new_sk);
+ if (new_clcsock)
+ sock_release(new_clcsock);
+ new_sk->sk_state = SMC_CLOSED;
+ smc_sock_set_flag(new_sk, SOCK_DEAD);
+ sock_put(new_sk); /* final */
+ *new_smc = NULL;
+ goto out;
+ }
+
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
+
+ /* if new clcsock has also inherited the fallback-specific callback
+ * functions, switch them back to the original ones.
+ */
+ if (lsmc->use_fallback) {
+ if (lsmc->clcsk_state_change)
+ new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
+ if (lsmc->clcsk_write_space)
+ new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
+ if (lsmc->clcsk_error_report)
+ new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
+ }
+
+ (*new_smc)->clcsock = new_clcsock;
+out:
+ return rc;
+}
+
+/* Queue a just created sock on the accept queue of the listen sock, where a
+ * following accept call from user space can pick it up.
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	struct smc_sock *lsmc = smc_sk(parent);
+	struct smc_sock *child = smc_sk(sk);
+
+	sock_hold(sk); /* sock_put in smc_accept_unlink() */
+	spin_lock(&lsmc->accept_q_lock);
+	list_add_tail(&child->accept_q, &lsmc->accept_q);
+	spin_unlock(&lsmc->accept_q_lock);
+	sk_acceptq_added(parent);
+}
+
+/* Take a sock off the accept queue of the listen sock it belongs to and drop
+ * the reference that was taken when it was queued.
+ */
+static void smc_accept_unlink(struct sock *sk)
+{
+	struct smc_sock *child = smc_sk(sk);
+	struct smc_sock *lsmc = child->listen_smc;
+
+	spin_lock(&lsmc->accept_q_lock);
+	list_del_init(&child->accept_q);
+	spin_unlock(&lsmc->accept_q_lock);
+	sk_acceptq_removed(&lsmc->sk);
+	sock_put(sk); /* sock_hold in smc_accept_enqueue */
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ *
+ * Socks found in state SMC_CLOSED are released and skipped. If @new_sock is
+ * given, the first usable sock is grafted onto it.
+ * Returns the dequeued sock, or NULL if the queue holds no usable entry.
+ */
+struct sock *smc_accept_dequeue(struct sock *parent,
+ struct socket *new_sock)
+{
+ struct smc_sock *isk, *n;
+ struct sock *new_sk;
+
+ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+ new_sk = (struct sock *)isk;
+
+ smc_accept_unlink(new_sk);
+ if (new_sk->sk_state == SMC_CLOSED) {
+ /* closed before it was accepted: free it and try the next */
+ new_sk->sk_prot->unhash(new_sk);
+ if (isk->clcsock) {
+ sock_release(isk->clcsock);
+ isk->clcsock = NULL;
+ }
+ sock_put(new_sk); /* final */
+ continue;
+ }
+ if (new_sock) {
+ sock_graft(new_sk, new_sock);
+ new_sock->state = SS_CONNECTED;
+ /* in fallback mode the clcsock shares the file of new_sock */
+ if (isk->use_fallback) {
+ smc_sk(new_sk)->clcsock->file = new_sock->file;
+ isk->clcsock->file->private_data = isk->clcsock;
+ }
+ }
+ return new_sk;
+ }
+ return NULL;
+}
+
+/* clean up for a created but never accepted sock
+ *
+ * Releases the sock via __smc_release() under the sock lock and drops the
+ * final reference.
+ */
+void smc_close_non_accepted(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk); /* sock_put below */
+ lock_sock(sk);
+ /* no linger time set: use SMC_MAX_STREAM_WAIT_TIMEOUT */
+ if (!sk->sk_lingertime)
+ /* wait for peer closing */
+ WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT);
+ __smc_release(smc);
+ release_sock(sk);
+ sock_put(sk); /* sock_hold above */
+ sock_put(sk); /* final sock_put */
+}
+
+/* Server side confirmation of the first link of a link group: register the
+ * buffers on the link, exchange CONFIRM LINK with the client over the RoCE
+ * fabric and activate the link. If the link group allows more than one
+ * link, setup of a second link is attempted afterwards.
+ *
+ * Returns 0 on success, an SMC_CLC_DECL_* reason code, or the result of
+ * smc_clc_wait_msg() when no CONFIRM LINK response arrived.
+ */
+static int smcr_serv_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
+ int rc;
+
+ /* reg the sndbuf if it was vzalloced*/
+ if (smc->conn.sndbuf_desc->is_vm) {
+ if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+ }
+
+ /* reg the rmb */
+ if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+
+ /* send CONFIRM LINK request to client over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
+ if (rc < 0)
+ return SMC_CLC_DECL_TIMEOUT_CL;
+
+ /* receive CONFIRM LINK response from client over the RoCE fabric */
+ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
+ struct smc_clc_msg_decline dclc;
+
+ /* no LLC response: check whether the client sent a CLC decline */
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
+ }
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
+ return SMC_CLC_DECL_RMBE_EC;
+
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
+
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+ if (link->lgr->max_links > 1) {
+ down_write(&link->lgr->llc_conf_mutex);
+ /* initial contact - try to establish second link */
+ smc_llc_srv_add_link(link, NULL);
+ up_write(&link->lgr->llc_conf_mutex);
+ }
+ return 0;
+}
+
+/* listen worker: finish
+ *
+ * Hands the new sock to the listen sock's accept queue, or closes it if the
+ * listen sock is no longer in state SMC_LISTEN, then wakes up accept and
+ * drops the listen sock reference.
+ */
+static void smc_listen_out(struct smc_sock *new_smc)
+{
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct sock *newsmcsk = &new_smc->sk;
+
+ /* this handshake no longer counts as queued on the listen sock */
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_dec(&lsmc->queued_smc_hs);
+
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ release_sock(&lsmc->sk);
+ } else { /* no longer listening */
+ smc_close_non_accepted(newsmcsk);
+ }
+
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+}
+
+/* listen worker: finish successfully - a sock still in SMC_INIT becomes
+ * SMC_ACTIVE before it is handed over by smc_listen_out()
+ */
+static void smc_listen_out_connected(struct smc_sock *new_smc)
+{
+	if (new_smc->sk.sk_state == SMC_INIT)
+		new_smc->sk.sk_state = SMC_ACTIVE;
+
+	smc_listen_out(new_smc);
+}
+
+/* listen worker: finish in error state
+ *
+ * Counts the failed server handshake, marks the new sock SMC_CLOSED and
+ * passes it on to smc_listen_out().
+ */
+static void smc_listen_out_err(struct smc_sock *new_smc)
+{
+ struct sock *newsmcsk = &new_smc->sk;
+ struct net *net = sock_net(newsmcsk);
+
+ this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
+ if (newsmcsk->sk_state == SMC_INIT)
+ sock_put(&new_smc->sk); /* passive closing */
+ newsmcsk->sk_state = SMC_CLOSED;
+
+ smc_listen_out(new_smc);
+}
+
+/* listen worker: give up on SMC for this connection and fall back to TCP if
+ * that is still possible.
+ *
+ * A negative @reason_code, a failing switch to fallback, or a failure to
+ * send the decline message finishes the sock in error state. No decline is
+ * sent when @reason_code is 0 or SMC_CLC_DECL_PEERDECL.
+ */
+static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
+			       int local_first, u8 version)
+{
+	/* RDMA setup failed, switch back to TCP */
+	smc_conn_abort(new_smc, local_first);
+
+	/* error, no fallback possible */
+	if (reason_code < 0 || smc_switch_to_fallback(new_smc, reason_code))
+		goto out_err;
+
+	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL &&
+	    smc_clc_send_decline(new_smc, reason_code, version) < 0)
+		goto out_err;
+
+	smc_listen_out_connected(new_smc);
+	return;
+
+out_err:
+	smc_listen_out_err(new_smc);
+}
+
+/* listen worker: derive from the proposal which SMC-D/SMC-R versions are
+ * still candidates and store them in ini->smcd_version / ini->smcr_version.
+ *
+ * V2 bits are dropped again when the proposal lacks the required V2
+ * extensions or EIDs, or when ISM V2 is not supported locally.
+ * ini->release_nr is set to the peer's release, capped at SMC_RELEASE.
+ *
+ * Returns 0 while at least one version bit is left, otherwise the reason
+ * recorded last (SMC_CLC_DECL_PEERNOSMC if none was recorded).
+ */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+			       struct smc_clc_msg_proposal *pclc,
+			       struct smc_init_info *ini)
+{
+	struct smc_clc_smcd_v2_extension *smcd_ext;
+	struct smc_clc_v2_extension *v2_ext;
+	int reason = SMC_CLC_DECL_PEERNOSMC;
+
+	ini->smc_type_v1 = pclc->hdr.typev1;
+	ini->smc_type_v2 = pclc->hdr.typev2;
+	ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+	ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+	if (pclc->hdr.version > SMC_V1) {
+		if (smcd_indicated(ini->smc_type_v2))
+			ini->smcd_version |= SMC_V2;
+		if (smcr_indicated(ini->smc_type_v2))
+			ini->smcr_version |= SMC_V2;
+	}
+
+	/* no V2 proposed at all: nothing further to check */
+	if (!((ini->smcd_version | ini->smcr_version) & SMC_V2))
+		goto out;
+
+	v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!v2_ext) {
+		ini->smcd_version &= ~SMC_V2;
+		ini->smcr_version &= ~SMC_V2;
+		reason = SMC_CLC_DECL_NOV2EXT;
+		goto out;
+	}
+
+	smcd_ext = smc_get_clc_smcd_v2_ext(v2_ext);
+	if (ini->smcd_version & SMC_V2) {
+		int smcd_reason = 0;
+
+		if (!smc_ism_is_v2_capable())
+			smcd_reason = SMC_CLC_DECL_NOISM2SUPP;
+		else if (!smcd_ext)
+			smcd_reason = SMC_CLC_DECL_NOV2DEXT;
+		else if (!v2_ext->hdr.eid_cnt && !v2_ext->hdr.flag.seid)
+			smcd_reason = SMC_CLC_DECL_NOUEID;
+
+		if (smcd_reason) {
+			ini->smcd_version &= ~SMC_V2;
+			reason = smcd_reason;
+		}
+	}
+	if ((ini->smcr_version & SMC_V2) && !v2_ext->hdr.eid_cnt) {
+		ini->smcr_version &= ~SMC_V2;
+		reason = SMC_CLC_DECL_NOUEID;
+	}
+
+	/* never negotiate a release newer than the local one */
+	if (v2_ext->hdr.flag.release > SMC_RELEASE)
+		ini->release_nr = SMC_RELEASE;
+	else
+		ini->release_nr = v2_ext->hdr.flag.release;
+
+out:
+	if (ini->smcd_version || ini->smcr_version)
+		return 0;
+
+	return reason;
+}
+
+/* listen worker: compare the IP prefix carried in the proposal with the new
+ * clcsock. Proposals whose V1 type is SMC_TYPE_N are not checked.
+ *
+ * Returns 0 if the check passes or is skipped, SMC_CLC_DECL_DIFFPREFIX if
+ * smc_clc_prfx_match() reports a mismatch.
+ */
+static int smc_listen_prfx_check(struct smc_sock *new_smc,
+				 struct smc_clc_msg_proposal *pclc)
+{
+	struct smc_clc_msg_proposal_prefix *prfx;
+
+	if (pclc->hdr.typev1 == SMC_TYPE_N)
+		return 0;
+
+	prfx = smc_clc_proposal_get_prefix(pclc);
+	return smc_clc_prfx_match(new_smc->clcsock, prfx) ?
+	       SMC_CLC_DECL_DIFFPREFIX : 0;
+}
+
+/* listen worker: create the connection (and link group, if needed) and its
+ * send buffer and RMB for SMC-R.
+ *
+ * Returns 0 on success, the result of smc_conn_create() if that fails, or
+ * SMC_CLC_DECL_MEM (after aborting the connection) if buffer creation fails.
+ */
+static int smc_listen_rdma_init(struct smc_sock *new_smc,
+				struct smc_init_info *ini)
+{
+	/* allocate connection / link group */
+	int rc = smc_conn_create(new_smc, ini);
+
+	if (rc)
+		return rc;
+
+	/* create send buffer and rmb */
+	if (!smc_buf_create(new_smc, false))
+		return 0;
+
+	smc_conn_abort(new_smc, ini->first_contact_local);
+	return SMC_CLC_DECL_MEM;
+}
+
+/* listen worker: create the connection and its send and receive buffers for
+ * SMC-D.
+ *
+ * Returns 0 on success or the result of smc_conn_create() if that fails. If
+ * buffer creation fails the connection is aborted and SMC_CLC_DECL_MAX_DMB
+ * (for -ENOSPC) or SMC_CLC_DECL_MEM is returned.
+ */
+static int smc_listen_ism_init(struct smc_sock *new_smc,
+			       struct smc_init_info *ini)
+{
+	int rc = smc_conn_create(new_smc, ini);
+
+	if (rc)
+		return rc;
+
+	/* Create send and receive buffers */
+	rc = smc_buf_create(new_smc, true);
+	if (!rc)
+		return 0;
+
+	smc_conn_abort(new_smc, ini->first_contact_local);
+	if (rc == -ENOSPC)
+		return SMC_CLC_DECL_MAX_DMB;
+	return SMC_CLC_DECL_MEM;
+}
+
+/* Return true if @smcd is already stored in one of the first @matches slots
+ * of ini->ism_dev[].
+ */
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+				    struct smc_init_info *ini,
+				    int matches)
+{
+	int slot = 0;
+
+	while (slot < matches) {
+		if (ini->ism_dev[slot] == smcd)
+			return true;
+		slot++;
+	}
+
+	return false;
+}
+
+/* check for ISM devices matching proposed ISM devices
+ *
+ * Searches smcd_dev_list for the first device that is not going away, is not
+ * yet stored in ini->ism_dev[], has CHID @proposed_chid and for which
+ * smc_ism_cantalk() returns 0 for @proposed_gid. On a hit the device and the
+ * peer GID are stored at index *matches and *matches is incremented.
+ * The list is walked without taking a lock here; the caller visible in this
+ * file holds smcd_dev_list.mutex around the call.
+ */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+ u16 proposed_chid, u64 proposed_gid,
+ unsigned int *matches)
+{
+ struct smcd_dev *smcd;
+
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away)
+ continue;
+ if (smc_is_already_selected(smcd, ini, *matches))
+ continue;
+ if (smc_ism_get_chid(smcd) == proposed_chid &&
+ !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+ ini->ism_peer_gid[*matches] = proposed_gid;
+ ini->ism_dev[*matches] = smcd;
+ (*matches)++;
+ break; /* at most one device per proposed CHID/GID */
+ }
+ }
+}
+
+/* Remember @rc in ini->rc unless a non-zero code has been stored before, so
+ * that the first failure reason is the one that is kept.
+ */
+static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
+{
+	if (ini->rc)
+		return;
+	ini->rc = rc;
+}
+
+/* listen worker: find a local ISM device matching one of the devices the
+ * peer offered for SMC-D V2 and initialize the connection on it.
+ *
+ * On success ini->ism_selected names the chosen slot, ini->is_smcd is true
+ * and ini->smcd_version is SMC_V2. Otherwise the SMC_V2 bit is cleared from
+ * ini->smcd_version and ini->ism_dev[0] is reset to NULL; failure reasons
+ * are recorded through smc_find_ism_store_rc().
+ *
+ * The proposal comes from the peer, so the extension pointers are checked
+ * before use and the number of recorded matches is limited to the slots of
+ * ini->ism_dev[] (index SMC_MAX_ISM_DEVS is the highest one filled on the
+ * client path), whatever ism_gid_cnt the peer announced.
+ */
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+					struct smc_clc_msg_proposal *pclc,
+					struct smc_init_info *ini)
+{
+	struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+	struct smc_clc_v2_extension *smc_v2_ext;
+	struct smc_clc_msg_smcd *pclc_smcd;
+	unsigned int matches = 0;
+	u8 smcd_version;
+	u8 *eid = NULL;
+	int i, rc;
+
+	if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+		goto not_found;
+
+	pclc_smcd = smc_get_clc_msg_smcd(pclc);
+	smc_v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!pclc_smcd || !smc_v2_ext)
+		goto not_found;
+	smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+	if (!smcd_v2_ext)
+		goto not_found;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	if (pclc_smcd->ism.chid)
+		/* check for ISM device matching proposed native ISM device */
+		smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+				       ntohll(pclc_smcd->ism.gid), &matches);
+	for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt &&
+		    matches <= SMC_MAX_ISM_DEVS; i++) {
+		/* check for ISM devices matching proposed non-native ISM
+		 * devices
+		 */
+		smc_check_ism_v2_match(ini,
+				       ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+				       ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+				       &matches);
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	if (!ini->ism_dev[0]) {
+		smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
+		goto not_found;
+	}
+
+	smc_ism_get_system_eid(&eid);
+	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
+			       smcd_v2_ext->system_eid, eid))
+		goto not_found;
+
+	/* separate - outside the smcd_dev_list.lock */
+	smcd_version = ini->smcd_version;
+	for (i = 0; i < matches; i++) {
+		ini->smcd_version = SMC_V2;
+		ini->is_smcd = true;
+		ini->ism_selected = i;
+		rc = smc_listen_ism_init(new_smc, ini);
+		if (rc) {
+			smc_find_ism_store_rc(rc, ini);
+			/* try next active ISM device */
+			continue;
+		}
+		return; /* matching and usable V2 ISM device found */
+	}
+	/* no V2 ISM device could be initialized */
+	ini->smcd_version = smcd_version;	/* restore original value */
+	ini->negotiated_eid[0] = 0;
+
+not_found:
+	ini->smcd_version &= ~SMC_V2;
+	ini->ism_dev[0] = NULL;
+	ini->is_smcd = false;
+}
+
+/* listen worker: find the local ISM device for an SMC-D V1 proposal and
+ * initialize the connection on it.
+ *
+ * On success ini->ism_selected is 0 and ini->is_smcd is true. Otherwise the
+ * SMC_V1 bit is cleared from ini->smcd_version, ini->ism_dev[0] is reset to
+ * NULL and the failure reason is recorded through smc_find_ism_store_rc().
+ */
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ int rc = 0;
+
+ /* check if ISM V1 is available */
+ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+ goto not_found;
+ ini->is_smcd = true; /* prepare ISM check */
+ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
+ rc = smc_find_ism_device(new_smc, ini);
+ if (rc)
+ goto not_found;
+ ini->ism_selected = 0;
+ rc = smc_listen_ism_init(new_smc, ini);
+ if (!rc)
+ return; /* V1 ISM device found */
+
+not_found:
+ /* rc is still 0 if V1 was not proposed; storing 0 changes nothing */
+ smc_find_ism_store_rc(rc, ini);
+ ini->smcd_version &= ~SMC_V1;
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+/* listen worker: register the connection's buffers on its link. Nothing is
+ * done when @local_first is set.
+ *
+ * Returns 0 on success, SMC_CLC_DECL_ERR_REGBUF if registering the send
+ * buffer (only attempted when its is_vm flag is set) or the RMB fails.
+ */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
+{
+	struct smc_connection *conn = &new_smc->conn;
+
+	if (local_first)
+		return 0;
+
+	/* reg sendbufs if they were vzalloced */
+	if (conn->sndbuf_desc->is_vm &&
+	    smcr_lgr_reg_sndbufs(conn->lnk, conn->sndbuf_desc))
+		return SMC_CLC_DECL_ERR_REGBUF;
+
+	if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
+		return SMC_CLC_DECL_ERR_REGBUF;
+
+	return 0;
+}
+
+/* listen worker: find a local RDMA device for an SMC-R V2 proposal, create
+ * the connection on it and register its buffers.
+ *
+ * On success ini->smcr_version is SMC_V2 and ini->check_smcrv2 stays set.
+ * Otherwise the SMC_V2 bit is cleared from ini->smcr_version,
+ * ini->smcrv2.ib_dev_v2 is reset to NULL and ini->check_smcrv2 is cleared;
+ * failure reasons are recorded through smc_find_ism_store_rc().
+ *
+ * smc_get_clc_v2_ext() can return NULL, so its result is checked before the
+ * extension is dereferenced.
+ */
+static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
+					 struct smc_clc_msg_proposal *pclc,
+					 struct smc_init_info *ini)
+{
+	struct smc_clc_v2_extension *smc_v2_ext;
+	u8 smcr_version;
+	int rc;
+
+	if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
+		goto not_found;
+
+	smc_v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!smc_v2_ext ||
+	    !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
+		goto not_found;
+
+	/* prepare RDMA check */
+	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+	memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
+	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
+	ini->check_smcrv2 = true;
+	ini->smcrv2.clc_sk = new_smc->clcsock->sk;
+	ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
+	ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
+	rc = smc_find_rdma_device(new_smc, ini);
+	if (rc) {
+		smc_find_ism_store_rc(rc, ini);
+		goto not_found;
+	}
+	/* no gateway in between: the peer's MAC is the next hop */
+	if (!ini->smcrv2.uses_gateway)
+		memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
+
+	smcr_version = ini->smcr_version;
+	ini->smcr_version = SMC_V2;
+	rc = smc_listen_rdma_init(new_smc, ini);
+	if (!rc) {
+		rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+		if (rc)
+			smc_conn_abort(new_smc, ini->first_contact_local);
+	}
+	if (!rc)
+		return;
+	/* V2 setup failed: restore the proposed versions, keep the reason */
+	ini->smcr_version = smcr_version;
+	smc_find_ism_store_rc(rc, ini);
+
+not_found:
+	ini->smcr_version &= ~SMC_V2;
+	ini->smcrv2.ib_dev_v2 = NULL;
+	ini->check_smcrv2 = false;
+}
+
+/* Server side: try to set up an SMC-R V1 connection for the received proposal.
+ *
+ * Returns 0 on success, SMC_CLC_DECL_NOSMCDEV when V1 is not negotiable or no
+ * RDMA device is found, or the error code of smc_listen_rdma_init() /
+ * smc_listen_rdma_reg().
+ */
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+					struct smc_clc_msg_proposal *pclc,
+					struct smc_init_info *ini)
+{
+	bool v1_usable = (ini->smcr_version & SMC_V1) &&
+			 smcr_indicated(ini->smc_type_v1);
+	int err;
+
+	if (!v1_usable)
+		return SMC_CLC_DECL_NOSMCDEV;
+
+	/* prepare RDMA check: peer identity is taken from the proposal */
+	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+	memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
+	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
+
+	/* any lookup failure is reported as "no RDMA device" */
+	if (smc_find_rdma_device(new_smc, ini))
+		return SMC_CLC_DECL_NOSMCDEV;
+
+	err = smc_listen_rdma_init(new_smc, ini);
+	if (err)
+		return err;
+
+	return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching to proposal
+ *
+ * Candidates are tried in this order: ISM V2, ISM V1, RDMA V2, RDMA V1. The
+ * two V1 variants are only attempted when the IP prefix check succeeded.
+ *
+ * Returns 0 when a device was selected, otherwise a non-zero code (ini->rc,
+ * SMC_CLC_DECL_GETVLANERR, SMC_CLC_DECL_NOSMCDDEV or the prefix check result).
+ */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int prfx_rc;
+
+ /* check for ISM device matching V2 proposed device */
+ smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ /* check for matching IP prefix and subnet length (V1) */
+ /* a failure is only recorded here; it gates the V1 attempts below */
+ prfx_rc = smc_listen_prfx_check(new_smc, pclc);
+ if (prfx_rc)
+ smc_find_ism_store_rc(prfx_rc, ini);
+
+ /* get vlan id from IP device */
+ /* an already stored ini->rc takes precedence over GETVLANERR */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+ return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
+
+ /* check for ISM device matching V1 proposed device */
+ if (!prfx_rc)
+ smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (!smcr_indicated(pclc->hdr.typev1) &&
+ !smcr_indicated(pclc->hdr.typev2))
+ /* skip RDMA and decline */
+ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
+
+ /* check if RDMA V2 is available */
+ smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
+ if (ini->smcrv2.ib_dev_v2)
+ return 0;
+
+ /* check if RDMA V1 is available */
+ if (!prfx_rc) {
+ int rc;
+
+ rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+ smc_find_ism_store_rc(rc, ini);
+ /* on failure report the code kept in ini->rc, not the local rc */
+ return (!rc) ? 0 : ini->rc;
+ }
+ return prfx_rc;
+}
+
+/* listen worker: finish RDMA setup
+ *
+ * Processes the peer's CLC confirm message @cclc for the connection's link.
+ * The peer info save, the link ready transition and the first-link
+ * confirmation are only performed when @local_first is set.
+ *
+ * Returns 0 on success, SMC_CLC_DECL_ERR_RTOK, SMC_CLC_DECL_ERR_RDYLNK or the
+ * result of smcr_serv_conf_first_link().
+ */
+static int smc_listen_rdma_finish(struct smc_sock *new_smc,
+ struct smc_clc_msg_accept_confirm *cclc,
+ bool local_first,
+ struct smc_init_info *ini)
+{
+ struct smc_link *link = new_smc->conn.lnk;
+ int reason_code = 0;
+
+ if (local_first)
+ smc_link_save_peer_info(link, cclc, ini);
+
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK;
+
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
+ /* QP confirmation over RoCE fabric */
+ /* the confirmation runs inside an ADD_LINK LLC flow on the link group */
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_serv_conf_first_link(new_smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
+ }
+ return reason_code;
+}
+
+/* setup for connection of server
+ *
+ * Work item run for every accepted CLC connection. It receives the CLC
+ * proposal, selects a device, sends the CLC accept, waits for the CLC confirm
+ * and finishes the connection.
+ *
+ * Locking: smc_server_lgr_pending is taken before device selection. For SMC-D
+ * it is dropped right after the accept was sent; for SMC-R it is held until
+ * smc_listen_rdma_finish() succeeded. Error paths therefore jump to out_unlock
+ * while the mutex is still held and to out_decl once it has been released.
+ */
+static void smc_listen_work(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_clc_msg_accept_confirm *cclc;
+ struct smc_clc_msg_proposal_area *buf;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_init_info *ini = NULL;
+ u8 proposal_version = SMC_V1;
+ u8 accept_version;
+ int rc = 0;
+
+ /* the listening socket is no longer in SMC_LISTEN state: give up */
+ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
+ return smc_listen_out_err(new_smc);
+
+ /* fallback was inherited from the listen socket: no CLC handshake */
+ if (new_smc->use_fallback) {
+ smc_listen_out_connected(new_smc);
+ return;
+ }
+
+ /* check if peer is smc capable */
+ if (!tcp_sk(newclcsock->sk)->syn_smc) {
+ rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
+ if (rc)
+ smc_listen_out_err(new_smc);
+ else
+ smc_listen_out_connected(new_smc);
+ return;
+ }
+
+ /* do inband token exchange -
+ * wait for and receive SMC Proposal CLC message
+ */
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+ /* buf is used for the proposal first and reused for the confirm later */
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
+ SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
+ if (rc)
+ goto out_decl;
+
+ /* proposal_version is passed to smc_listen_decline() on failure */
+ if (pclc->hdr.version > SMC_V1)
+ proposal_version = SMC_V2;
+
+ /* IPSec connections opt out of SMC optimizations */
+ if (using_ipsec(new_smc)) {
+ rc = SMC_CLC_DECL_IPSEC;
+ goto out_decl;
+ }
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+
+ /* initial version checking */
+ rc = smc_listen_v2_check(new_smc, pclc, ini);
+ if (rc)
+ goto out_decl;
+
+ rc = smc_clc_srv_v2x_features_validate(pclc, ini);
+ if (rc)
+ goto out_decl;
+
+ mutex_lock(&smc_server_lgr_pending);
+ smc_close_init(new_smc);
+ smc_rx_init(new_smc);
+ smc_tx_init(new_smc);
+
+ /* determine ISM or RoCE device used for connection */
+ rc = smc_listen_find_device(new_smc, pclc, ini);
+ if (rc)
+ goto out_unlock;
+
+ /* send SMC Accept CLC message */
+ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
+ rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+ accept_version, ini->negotiated_eid, ini);
+ if (rc)
+ goto out_unlock;
+
+ /* SMC-D does not need this lock any more */
+ if (ini->is_smcd)
+ mutex_unlock(&smc_server_lgr_pending);
+
+ /* receive SMC Confirm CLC message */
+ memset(buf, 0, sizeof(*buf));
+ cclc = (struct smc_clc_msg_accept_confirm *)buf;
+ rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
+ SMC_CLC_CONFIRM, CLC_WAIT_TIME);
+ if (rc) {
+ /* only SMC-R still holds the mutex at this point */
+ if (!ini->is_smcd)
+ goto out_unlock;
+ goto out_decl;
+ }
+
+ rc = smc_clc_v2x_features_confirm_check(cclc, ini);
+ if (rc) {
+ if (!ini->is_smcd)
+ goto out_unlock;
+ goto out_decl;
+ }
+
+ /* fce smc release version is needed in smc_listen_rdma_finish,
+ * so save fce info here.
+ */
+ smc_conn_save_peer_info_fce(new_smc, cclc);
+
+ /* finish worker */
+ if (!ini->is_smcd) {
+ rc = smc_listen_rdma_finish(new_smc, cclc,
+ ini->first_contact_local, ini);
+ if (rc)
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
+ }
+ smc_conn_save_peer_info(new_smc, cclc);
+ smc_listen_out_connected(new_smc);
+ SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
+ goto out_free;
+
+out_unlock:
+ mutex_unlock(&smc_server_lgr_pending);
+out_decl:
+ /* ini may still be NULL when an early step failed */
+ smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+ proposal_version);
+out_free:
+ kfree(ini);
+ kfree(buf);
+}
+
+/* Worker of the listening SMC socket: accepts connections from the CLC
+ * socket for as long as the socket stays in SMC_LISTEN state and
+ * smc_clcsock_accept() succeeds. For each new socket smc_listen_work is
+ * queued on smc_hs_wq. Drops the reference on the listening socket that was
+ * taken in smc_clcsock_data_ready().
+ */
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+ struct smc_sock *lsmc = container_of(work, struct smc_sock,
+ tcp_listen_work);
+ struct sock *lsk = &lsmc->sk;
+ struct smc_sock *new_smc;
+ int rc = 0;
+
+ lock_sock(lsk);
+ while (lsk->sk_state == SMC_LISTEN) {
+ rc = smc_clcsock_accept(lsmc, &new_smc);
+ if (rc) /* clcsock accept queue empty or error */
+ goto out;
+ if (!new_smc)
+ continue;
+
+ /* count handshakes queued for peers that indicated SMC in their SYN */
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_inc(&lsmc->queued_smc_hs);
+
+ /* the new socket inherits the fallback state of the listen socket */
+ new_smc->listen_smc = lsmc;
+ new_smc->use_fallback = lsmc->use_fallback;
+ new_smc->fallback_rsn = lsmc->fallback_rsn;
+ sock_hold(lsk); /* sock_put in smc_listen_work */
+ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+ smc_copy_sock_settings_to_smc(new_smc);
+ sock_hold(&new_smc->sk); /* sock_put in passive closing */
+ /* the work was not queued: drop the reference taken just above */
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
+ sock_put(&new_smc->sk);
+ }
+
+out:
+ release_sock(lsk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+/* sk_data_ready replacement installed on the listening CLC socket.
+ *
+ * Calls the saved original callback and, while the SMC socket is in
+ * SMC_LISTEN state, schedules tcp_listen_work on smc_tcp_ls_wq. Runs under
+ * the read side of the CLC socket's sk_callback_lock.
+ */
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+	struct smc_sock *owner;
+
+	read_lock_bh(&listen_clcsock->sk_callback_lock);
+	owner = smc_clcsock_user_data(listen_clcsock);
+	if (owner) {
+		owner->clcsk_data_ready(listen_clcsock);
+		if (owner->sk.sk_state == SMC_LISTEN) {
+			/* sock_put in smc_tcp_listen_work() */
+			sock_hold(&owner->sk);
+			/* work not queued: give the reference back */
+			if (!queue_work(smc_tcp_ls_wq,
+					&owner->tcp_listen_work))
+				sock_put(&owner->sk);
+		}
+	}
+	read_unlock_bh(&listen_clcsock->sk_callback_lock);
+}
+
+/* listen() for SMC sockets
+ *
+ * Prepares the internal CLC (TCP) socket for listening: copies socket
+ * settings, requests SMC indication in the SYN handling unless fallback is
+ * active, redirects sk_data_ready to smc_clcsock_data_ready() and overrides
+ * syn_recv_sock in the connection socket's af_ops. A repeated call on a
+ * socket that is already in SMC_LISTEN state only updates the backlog.
+ *
+ * Returns 0 on success, -EINVAL for a socket in an unsuitable state, or the
+ * error returned by kernel_listen(). If kernel_listen() fails, the callback,
+ * sk_user_data and af_ops changes are undone so that a later listen() starts
+ * from the original CLC socket configuration.
+ */
+static int smc_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc;
+	int rc;
+
+	smc = smc_sk(sk);
+	lock_sock(sk);
+
+	rc = -EINVAL;
+	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
+	    smc->connect_nonblock || sock->state != SS_UNCONNECTED)
+		goto out;
+
+	rc = 0;
+	if (sk->sk_state == SMC_LISTEN) {
+		sk->sk_max_ack_backlog = backlog;
+		goto out;
+	}
+	/* some socket options are handled in core, so we could not apply
+	 * them to the clc socket -- copy smc socket options to clc socket
+	 */
+	smc_copy_sock_settings_to_clc(smc);
+	if (!smc->use_fallback)
+		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+
+	/* save original sk_data_ready function and establish
+	 * smc-specific sk_data_ready function
+	 */
+	write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+	smc->clcsock->sk->sk_user_data =
+		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+	smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
+			       smc_clcsock_data_ready, &smc->clcsk_data_ready);
+	write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
+
+	/* save original ops */
+	smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
+
+	smc->af_ops = *smc->ori_af_ops;
+	smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
+
+	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
+
+	if (smc->limit_smc_hs)
+		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+
+	rc = kernel_listen(smc->clcsock, backlog);
+	if (rc) {
+		write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+		smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
+				       &smc->clcsk_data_ready);
+		smc->clcsock->sk->sk_user_data = NULL;
+		write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
+		/* Put the original af_ops back. Otherwise a retried listen()
+		 * would save &smc->af_ops as ori_af_ops, making the "original"
+		 * syn_recv_sock the SMC override itself.
+		 */
+		inet_csk(smc->clcsock->sk)->icsk_af_ops = smc->ori_af_ops;
+		goto out;
+	}
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = SMC_LISTEN;
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/* accept() for SMC sockets
+ *
+ * Waits (subject to the receive timeout, or not at all with O_NONBLOCK) until
+ * smc_accept_dequeue() delivers a new socket for @new_sock. If the listening
+ * socket has sockopt_defer_accept set and the call is blocking, additionally
+ * waits for data on the new socket.
+ *
+ * Returns 0 on success, -EINVAL if the socket is not listening, -EAGAIN when
+ * the timeout is exhausted, the value of sock_intr_errno() when a signal is
+ * pending, or the result of sock_error() on the new socket.
+ */
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags, bool kern)
+{
+ struct sock *sk = sock->sk, *nsk;
+ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+ long timeo;
+ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ sock_hold(sk); /* sock_put below */
+ lock_sock(sk);
+
+ if (lsmc->sk.sk_state != SMC_LISTEN) {
+ rc = -EINVAL;
+ release_sock(sk);
+ goto out;
+ }
+
+ /* Wait for an incoming connection */
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ rc = -EAGAIN;
+ break;
+ }
+ /* the socket lock is dropped while sleeping and retaken afterwards */
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ /* wakeup by sk_data_ready in smc_listen_work() */
+ sched_annotate_sleep();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ /* nsk is only non-NULL when the loop ended without setting rc */
+ if (!rc)
+ rc = sock_error(nsk);
+ release_sock(sk);
+ if (rc)
+ goto out;
+
+ if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+ /* wait till data arrives on the socket */
+ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+ MSEC_PER_SEC);
+ if (smc_sk(nsk)->use_fallback) {
+ struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+ /* fallback: data arrives on the CLC socket's receive queue */
+ lock_sock(clcsk);
+ if (skb_queue_empty(&clcsk->sk_receive_queue))
+ sk_wait_data(clcsk, &timeo, NULL);
+ release_sock(clcsk);
+ } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+ lock_sock(nsk);
+ smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
+ release_sock(nsk);
+ }
+ }
+
+out:
+ sock_put(sk); /* sock_hold above */
+ return rc;
+}
+
+/* getname() for SMC sockets: forwarded to the internal CLC socket.
+ *
+ * A peer name request is rejected with -ENOTCONN unless the SMC socket is in
+ * SMC_ACTIVE or SMC_APPCLOSEWAIT1 state.
+ */
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+		       int peer)
+{
+	struct sock *sk = sock->sk;
+	struct socket *clc;
+
+	if (peer) {
+		switch (sk->sk_state) {
+		case SMC_ACTIVE:
+		case SMC_APPCLOSEWAIT1:
+			break;
+		default:
+			return -ENOTCONN;
+		}
+	}
+
+	clc = smc_sk(sk)->clcsock;
+	return clc->ops->getname(clc, addr, peer);
+}
+
+/* sendmsg() for SMC sockets
+ *
+ * Data is sent through the CLC socket when fallback is active, otherwise
+ * through smc_tx_sendmsg(). MSG_FASTOPEN is not supported by SMC: on a socket
+ * still in SMC_INIT state (and not in a nonblocking connect) it triggers a
+ * switch to fallback, in any other case -EINVAL is returned.
+ *
+ * Returns the result of the used send function, -EPIPE for a socket in an
+ * unsuitable state, -EINVAL as described above, or the error of
+ * smc_switch_to_fallback().
+ */
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ /* SMC does not support connect with fastopen */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ /* not connected yet, fallback */
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ if (rc)
+ goto out;
+ } else {
+ rc = -EINVAL;
+ goto out;
+ }
+ } else if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_INIT)) {
+ rc = -EPIPE;
+ goto out;
+ }
+
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ } else {
+ rc = smc_tx_sendmsg(smc, msg, len);
+ /* statistics are only updated on the native SMC path */
+ SMC_STAT_TX_PAYLOAD(smc, len, rc);
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* recvmsg() for SMC sockets
+ *
+ * Returns 0 for a closed socket with RCV_SHUTDOWN set and for a socket in
+ * SMC_PEERFINCLOSEWAIT state, -ENOTCONN for a socket in SMC_INIT, SMC_LISTEN
+ * or (otherwise) SMC_CLOSED state. In all other states the data is read from
+ * the CLC socket when fallback is active, else via smc_rx_recvmsg().
+ */
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
+ /* rc is still -ENOTCONN here */
+ if ((sk->sk_state == SMC_INIT) ||
+ (sk->sk_state == SMC_LISTEN) ||
+ (sk->sk_state == SMC_CLOSED))
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ } else {
+ /* no source address is reported on the native SMC path */
+ msg->msg_namelen = 0;
+ rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+ SMC_STAT_RX_PAYLOAD(smc, rc, rc);
+ }
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* poll helper for listening sockets: report the socket as readable
+ * (EPOLLIN | EPOLLRDNORM) when its accept queue is not empty, else 0.
+ * The queue is inspected under accept_q_lock.
+ */
+static __poll_t smc_accept_poll(struct sock *parent)
+{
+	struct smc_sock *lsmc = smc_sk(parent);
+	bool pending;
+
+	spin_lock(&lsmc->accept_q_lock);
+	pending = !list_empty(&lsmc->accept_q);
+	spin_unlock(&lsmc->accept_q_lock);
+
+	if (!pending)
+		return 0;
+	return EPOLLIN | EPOLLRDNORM;
+}
+
+/* poll() for SMC sockets
+ *
+ * With fallback active the request is delegated to the CLC socket and its
+ * sk_err is copied to the SMC socket. Otherwise the event mask is built from
+ * the SMC socket state, the accept queue (listening sockets) and the
+ * connection's send buffer space / received byte counters.
+ *
+ * Returns the event mask, or EPOLLNVAL if the socket has no sock attached.
+ */
+static __poll_t smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ __poll_t mask = 0;
+
+ if (!sk)
+ return EPOLLNVAL;
+
+ smc = smc_sk(sock->sk);
+ if (smc->use_fallback) {
+ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ } else {
+ if (sk->sk_state != SMC_CLOSED)
+ sock_poll_wait(file, sock, wait);
+ if (sk->sk_err)
+ mask |= EPOLLERR;
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ mask |= EPOLLHUP;
+ if (sk->sk_state == SMC_LISTEN) {
+ /* woken up by sk_data_ready in smc_listen_work() */
+ mask |= smc_accept_poll(sk);
+ /* use_fallback is tested again: it may have been set meanwhile */
+ } else if (smc->use_fallback) { /* as result of connect_work()*/
+ mask |= smc->clcsock->ops->poll(file, smc->clcsock,
+ wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ } else {
+ /* writable: send buffer space outside SMC_INIT, or send side shut down */
+ if ((sk->sk_state != SMC_INIT &&
+ atomic_read(&smc->conn.sndbuf_space)) ||
+ sk->sk_shutdown & SEND_SHUTDOWN) {
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ } else {
+ /* not writable: flag the socket as out of space */
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ if (atomic_read(&smc->conn.bytes_to_rcv))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ if (sk->sk_state == SMC_APPCLOSEWAIT1)
+ mask |= EPOLLIN;
+ if (smc->conn.urg_state == SMC_URG_VALID)
+ mask |= EPOLLPRI;
+ }
+ }
+
+ return mask;
+}
+
+/* shutdown() for SMC sockets
+ *
+ * @how must be within SHUT_RD..SHUT_RDWR, otherwise -EINVAL is returned.
+ * Sockets that are not in one of the connected / closing states yield
+ * -ENOTCONN. With fallback active the request is passed to the CLC socket;
+ * otherwise the SMC close handling is run and, unless suppressed, the CLC
+ * socket is shut down as well.
+ *
+ * Returns the SMC result if non-zero, else the result of the CLC socket
+ * shutdown.
+ */
+static int smc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ bool do_shutdown = true;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+ int old_state;
+ int rc1 = 0;
+
+ smc = smc_sk(sk);
+
+ if ((how < SHUT_RD) || (how > SHUT_RDWR))
+ return rc;
+
+ lock_sock(sk);
+
+ /* bring a stale SS_CONNECTING socket state in line with sk_state */
+ if (sock->state == SS_CONNECTING) {
+ if (sk->sk_state == SMC_ACTIVE)
+ sock->state = SS_CONNECTED;
+ else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
+ sk->sk_state == SMC_PEERCLOSEWAIT2 ||
+ sk->sk_state == SMC_APPCLOSEWAIT1 ||
+ sk->sk_state == SMC_APPCLOSEWAIT2 ||
+ sk->sk_state == SMC_APPFINCLOSEWAIT)
+ sock->state = SS_DISCONNECTING;
+ }
+
+ rc = -ENOTCONN;
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPFINCLOSEWAIT))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_socket->state = SS_UNCONNECTED;
+ /* NOTE(review): the matching sock_hold() is not visible here - confirm */
+ sock_put(sk);
+ }
+ goto out;
+ }
+ switch (how) {
+ case SHUT_RDWR: /* shutdown in both directions */
+ old_state = sk->sk_state;
+ rc = smc_close_active(smc);
+ /* skip the CLC socket shutdown for ACTIVE -> PEERCLOSEWAIT1 */
+ if (old_state == SMC_ACTIVE &&
+ sk->sk_state == SMC_PEERCLOSEWAIT1)
+ do_shutdown = false;
+ break;
+ case SHUT_WR:
+ rc = smc_close_shutdown_write(smc);
+ break;
+ case SHUT_RD:
+ rc = 0;
+ /* nothing more to do because peer is not involved */
+ break;
+ }
+ if (do_shutdown && smc->clcsock)
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ /* map sock_shutdown_cmd constants to sk_shutdown value range */
+ sk->sk_shutdown |= how + 1;
+
+ if (sk->sk_state == SMC_CLOSED)
+ sock->state = SS_UNCONNECTED;
+ else
+ sock->state = SS_DISCONNECTING;
+out:
+ release_sock(sk);
+ return rc ? rc : rc1;
+}
+
+/* getsockopt() handler for level SOL_SMC
+ *
+ * Only SMC_LIMIT_HS is supported. At most sizeof(int) bytes are copied out
+ * and the copied length is written back to @optlen.
+ *
+ * Returns 0 on success, -EFAULT on a failed user space access, -EINVAL for a
+ * negative length, -EOPNOTSUPP for any other option name.
+ */
+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+ int val, len;
+
+ smc = smc_sk(sock->sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ /* clamp to the size of the value; a negative length stays negative */
+ len = min_t(int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ val = smc->limit_smc_hs;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* setsockopt() handler for level SOL_SMC
+ *
+ * Only SMC_LIMIT_HS is supported; its int value is normalized to 0 / 1 and
+ * stored in smc->limit_smc_hs. Runs under the socket lock.
+ *
+ * Returns 0 on success, -EINVAL if @optlen is smaller than an int, -EFAULT
+ * if the value cannot be copied, -EOPNOTSUPP for any other option name.
+ */
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+			    sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct smc_sock *smc = smc_sk(sk);
+	int rc = -EOPNOTSUPP;
+	int val;
+
+	lock_sock(sk);
+	if (optname == SMC_LIMIT_HS) {
+		if (optlen < sizeof(int)) {
+			rc = -EINVAL;
+		} else if (copy_from_sockptr(&val, optval, sizeof(int))) {
+			rc = -EFAULT;
+		} else {
+			smc->limit_smc_hs = !!val;
+			rc = 0;
+		}
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/* setsockopt() for SMC sockets
+ *
+ * TCP_ULP is rejected with -EOPNOTSUPP and SOL_SMC options are handled by
+ * __smc_setsockopt(). Every other option is first applied to the CLC socket
+ * (under clcsock_release_lock) and afterwards, if that succeeded and no
+ * fallback is active, evaluated for its SMC specific effect.
+ *
+ * Returns 0 or a negative error code; -EBADF if the CLC socket is gone.
+ */
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ if (level == SOL_TCP && optname == TCP_ULP)
+ return -EOPNOTSUPP;
+ else if (level == SOL_SMC)
+ return __smc_setsockopt(sock, level, optname, optval, optlen);
+
+ smc = smc_sk(sk);
+
+ /* generic setsockopts reaching us here always apply to the
+ * CLC socket
+ */
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EBADF;
+ }
+ if (unlikely(!smc->clcsock->ops->setsockopt))
+ rc = -EOPNOTSUPP;
+ else
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ /* propagate an error pending on the CLC socket to the SMC socket */
+ if (smc->clcsock->sk->sk_err) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ sk_error_report(sk);
+ }
+ mutex_unlock(&smc->clcsock_release_lock);
+
+ /* NOTE(review): these checks run after the CLC socket call above, so
+ * their -EINVAL / -EFAULT replace whatever rc that call produced.
+ */
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (rc || smc->use_fallback)
+ goto out;
+ switch (optname) {
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ /* option not supported by SMC */
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ } else {
+ rc = -EINVAL;
+ }
+ break;
+ case TCP_NODELAY:
+ /* enabling NODELAY on a connected socket pushes out pending data */
+ if (sk->sk_state != SMC_INIT &&
+ sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_CLOSED) {
+ if (val) {
+ SMC_STAT_INC(smc, ndly_cnt);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
+ }
+ }
+ break;
+ case TCP_CORK:
+ /* clearing CORK on a connected socket pushes out pending data */
+ if (sk->sk_state != SMC_INIT &&
+ sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_CLOSED) {
+ if (!val) {
+ SMC_STAT_INC(smc, cork_cnt);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
+ }
+ }
+ break;
+ case TCP_DEFER_ACCEPT:
+ /* remembered for smc_accept(), which waits for data accordingly */
+ smc->sockopt_defer_accept = val;
+ break;
+ default:
+ break;
+ }
+out:
+ release_sock(sk);
+
+ return rc;
+}
+
+/* getsockopt() for SMC sockets
+ *
+ * SOL_SMC options are answered by __smc_getsockopt(); everything else is
+ * forwarded to the CLC socket while holding clcsock_release_lock.
+ *
+ * Returns the handler's result, -EBADF if the CLC socket is gone or
+ * -EOPNOTSUPP if the CLC socket has no getsockopt operation.
+ */
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct smc_sock *smc;
+	int rc;
+
+	if (level == SOL_SMC)
+		return __smc_getsockopt(sock, level, optname, optval, optlen);
+
+	smc = smc_sk(sock->sk);
+	mutex_lock(&smc->clcsock_release_lock);
+	if (!smc->clcsock)
+		rc = -EBADF;
+	else if (unlikely(!smc->clcsock->ops->getsockopt))
+		rc = -EOPNOTSUPP;
+	else
+		/* socket options apply to the CLC socket */
+		rc = smc->clcsock->ops->getsockopt(smc->clcsock, level,
+						   optname, optval, optlen);
+	mutex_unlock(&smc->clcsock_release_lock);
+
+	return rc;
+}
+
+/* ioctl() for SMC sockets
+ *
+ * With fallback active the request is passed to the CLC socket. Otherwise
+ * SIOCINQ, SIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are answered from the SMC
+ * connection state; each of them fails with -EINVAL on a listening socket
+ * and yields 0 for a socket in SMC_INIT or SMC_CLOSED state.
+ *
+ * Returns the result of put_user() for the computed answer, -EBADF if the
+ * CLC socket is gone in fallback mode, -ENOIOCTLCMD for unknown commands.
+ */
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ union smc_host_cursor cons, urg;
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ int answ;
+
+ smc = smc_sk(sock->sk);
+ conn = &smc->conn;
+ lock_sock(&smc->sk);
+ if (smc->use_fallback) {
+ if (!smc->clcsock) {
+ release_sock(&smc->sk);
+ return -EBADF;
+ }
+ answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+ release_sock(&smc->sk);
+ return answ;
+ }
+ switch (cmd) {
+ case SIOCINQ: /* same as FIONREAD */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = atomic_read(&smc->conn.bytes_to_rcv);
+ break;
+ case SIOCOUTQ:
+ /* output queue size (not send + not acked) */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ /* send buffer length minus the space still free in it */
+ answ = smc->conn.sndbuf_desc->len -
+ atomic_read(&smc->conn.sndbuf_space);
+ break;
+ case SIOCOUTQNSD:
+ /* output queue size (not send only) */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = smc_tx_prepared_sends(&smc->conn);
+ break;
+ case SIOCATMARK:
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED) {
+ answ = 0;
+ } else {
+ /* answer is 1 if consumer and urgent cursor differ by exactly 1 */
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&urg, &conn->urg_curs, conn);
+ answ = smc_curs_diff(conn->rmb_desc->len,
+ &cons, &urg) == 1;
+ }
+ break;
+ default:
+ release_sock(&smc->sk);
+ return -ENOIOCTLCMD;
+ }
+ release_sock(&smc->sk);
+
+ /* the answer is copied to user space after the socket lock is dropped */
+ return put_user(answ, (int __user *)arg);
+}
+
+/* Map the affected portions of the rmbe into an spd, note the number of bytes
+ * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
+ * updates till whenever a respective page has been fully processed.
+ * Note that subsequent recv() calls have to wait till all splice() processing
+ * completed.
+ *
+ * Returns 0 for a closed socket with RCV_SHUTDOWN set and for a socket in
+ * SMC_PEERFINCLOSEWAIT state, -ENOTCONN for a socket in SMC_INIT, SMC_LISTEN
+ * or (otherwise) SMC_CLOSED state, -ESPIPE for a non-zero *ppos on the native
+ * SMC path, else the result of the CLC socket's splice_read (fallback) or of
+ * smc_rx_recvmsg().
+ */
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
+ if (sk->sk_state == SMC_INIT ||
+ sk->sk_state == SMC_LISTEN ||
+ sk->sk_state == SMC_CLOSED)
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+ pipe, len, flags);
+ } else {
+ if (*ppos) {
+ rc = -ESPIPE;
+ goto out;
+ }
+ /* translate the splice flag into the recvmsg flag used below */
+ if (flags & SPLICE_F_NONBLOCK)
+ flags = MSG_DONTWAIT;
+ else
+ flags = 0;
+ SMC_STAT_INC(smc, splice_cnt);
+ rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
+ }
+out:
+ release_sock(sk);
+
+ return rc;
+}
+
+/* must look like tcp
+ *
+ * Socket operations of the PF_SMC family. socketpair and mmap are wired to
+ * the generic sock_no_*() stubs; all other entries are the smc_*() handlers
+ * of this file.
+ */
+static const struct proto_ops smc_sock_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .release = smc_release,
+ .bind = smc_bind,
+ .connect = smc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = smc_accept,
+ .getname = smc_getname,
+ .poll = smc_poll,
+ .ioctl = smc_ioctl,
+ .listen = smc_listen,
+ .shutdown = smc_shutdown,
+ .setsockopt = smc_setsockopt,
+ .getsockopt = smc_getsockopt,
+ .sendmsg = smc_sendmsg,
+ .recvmsg = smc_recvmsg,
+ .mmap = sock_no_mmap,
+ .splice_read = smc_splice_read,
+};
+
+/* Create an SMC socket for @sock.
+ *
+ * @protocol selects IPv4 (SMCPROTO_SMC) or IPv6 (SMCPROTO_SMC6). If @clcsock
+ * is NULL an internal kernel TCP socket is created as CLC socket, otherwise
+ * the passed socket is used.
+ *
+ * Returns 0 on success, -ESOCKTNOSUPPORT for a non-stream socket,
+ * -EPROTONOSUPPORT for an unknown protocol, -ENOBUFS if the sock cannot be
+ * allocated, or the error of sock_create_kern().
+ */
+static int __smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern, struct socket *clcsock)
+{
+ int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
+ struct smc_sock *smc;
+ struct sock *sk;
+ int rc;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_STREAM)
+ goto out;
+
+ rc = -EPROTONOSUPPORT;
+ if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
+ goto out;
+
+ rc = -ENOBUFS;
+ sock->ops = &smc_sock_ops;
+ sock->state = SS_UNCONNECTED;
+ sk = smc_sock_alloc(net, sock, protocol);
+ if (!sk)
+ goto out;
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
+ smc->use_fallback = false; /* assume rdma capability first */
+ smc->fallback_rsn = 0;
+
+ /* default behavior from limit_smc_hs in every net namespace */
+ smc->limit_smc_hs = net->smc.limit_smc_hs;
+
+ rc = 0;
+ if (!clcsock) {
+ rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
+ &smc->clcsock);
+ if (rc) {
+ sk_common_release(sk);
+ goto out;
+ }
+
+ /* smc_clcsock_release() does not wait smc->clcsock->sk's
+ * destruction; its sk_state might not be TCP_CLOSE after
+ * smc->sk is close()d, and TCP timers can be fired later,
+ * which need net ref.
+ */
+ /* from here on sk refers to the CLC socket's sock, not the SMC sock */
+ sk = smc->clcsock->sk;
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+ } else {
+ smc->clcsock = clcsock;
+ }
+
+out:
+ return rc;
+}
+
+/* .create handler of the PF_SMC family: creates an SMC socket together with
+ * a new internal CLC socket (no existing CLC socket is passed on).
+ */
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ return __smc_create(net, sock, protocol, kern, NULL);
+}
+
+/* protocol family descriptor for PF_SMC, passed to sock_register() in
+ * smc_init()
+ */
+static const struct net_proto_family smc_sock_family_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .create = smc_create,
+};
+
+/* .init handler of the "smc" TCP ULP: replaces the TCP socket behind a file
+ * by a newly allocated SMC socket that uses the TCP socket as its CLC socket.
+ *
+ * Returns 0 on success, -ESOCKTNOSUPPORT if @sk is not an IPv4/IPv6 TCP
+ * stream socket, -ENOTCONN if the socket is not unconnected, has no file or
+ * has a fasync list, -ENFILE if no socket can be allocated, or the error of
+ * __smc_create().
+ */
+static int smc_ulp_init(struct sock *sk)
+{
+ struct socket *tcp = sk->sk_socket;
+ struct net *net = sock_net(sk);
+ struct socket *smcsock;
+ int protocol, ret;
+
+ /* only TCP can be replaced */
+ if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
+ (sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
+ return -ESOCKTNOSUPPORT;
+ /* don't handle wq now */
+ if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
+ return -ENOTCONN;
+
+ if (sk->sk_family == AF_INET)
+ protocol = SMCPROTO_SMC;
+ else
+ protocol = SMCPROTO_SMC6;
+
+ smcsock = sock_alloc();
+ if (!smcsock)
+ return -ENFILE;
+
+ smcsock->type = SOCK_STREAM;
+ __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
+ /* the existing TCP socket is handed over as CLC socket */
+ ret = __smc_create(net, smcsock, protocol, 1, tcp);
+ if (ret) {
+ sock_release(smcsock); /* module_put() which ops won't be NULL */
+ return ret;
+ }
+
+ /* replace tcp socket to smc */
+ smcsock->file = tcp->file;
+ smcsock->file->private_data = smcsock;
+ smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
+ smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
+ tcp->file = NULL;
+
+ return ret;
+}
+
+/* .clone handler of the "smc" TCP ULP: clears icsk_ulp_ops on the new socket
+ * @newsk; @req and @priority are not used.
+ */
+static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(newsk);
+
+ /* don't inherit ulp ops to child when listen */
+ icsk->icsk_ulp_ops = NULL;
+}
+
+/* TCP ULP named "smc"; registered with tcp_register_ulp() in smc_init() */
+static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
+ .name = "smc",
+ .owner = THIS_MODULE,
+ .init = smc_ulp_init,
+ .clone = smc_ulp_clone,
+};
+
+/* per-net id of the SMC subsystem; referenced through .id of smc_net_ops */
+unsigned int smc_net_id;
+
+/* per-net init: set up the SMC sysctl and pnet state of namespace @net.
+ *
+ * Returns 0 on success or the error of the failing step. If the pnet setup
+ * fails, the already initialized sysctl state is released again so that a
+ * failed init leaves nothing behind.
+ */
+static __net_init int smc_net_init(struct net *net)
+{
+	int rc;
+
+	rc = smc_sysctl_net_init(net);
+	if (rc)
+		return rc;
+
+	rc = smc_pnet_net_init(net);
+	if (rc)
+		/* undo the sysctl setup; it would otherwise be leaked */
+		smc_sysctl_net_exit(net);
+
+	return rc;
+}
+
+/* per-net exit: release the sysctl and pnet state set up by smc_net_init() */
+static void __net_exit smc_net_exit(struct net *net)
+{
+ smc_sysctl_net_exit(net);
+ smc_pnet_net_exit(net);
+}
+
+/* per-net init of the SMC statistics; returns the result of smc_stats_init() */
+static __net_init int smc_net_stat_init(struct net *net)
+{
+ return smc_stats_init(net);
+}
+
+/* per-net exit of the SMC statistics */
+static void __net_exit smc_net_stat_exit(struct net *net)
+{
+ smc_stats_exit(net);
+}
+
+/* pernet operations for the sysctl / pnet state; .id and .size request a
+ * per-net area of sizeof(struct smc_net) identified by smc_net_id
+ */
+static struct pernet_operations smc_net_ops = {
+ .init = smc_net_init,
+ .exit = smc_net_exit,
+ .id = &smc_net_id,
+ .size = sizeof(struct smc_net),
+};
+
+/* pernet operations for the SMC statistics */
+static struct pernet_operations smc_net_stat_ops = {
+ .init = smc_net_stat_init,
+ .exit = smc_net_stat_exit,
+};
+
+/* Module init: registers the pernet subsystems, initializes the SMC
+ * sub-components, allocates the work queues and registers the protocols, the
+ * socket family, the IB client and the TCP ULP. Finally the tcp_have_smc
+ * static branch is enabled.
+ *
+ * Returns 0 on success. On failure the steps completed so far are undone
+ * through the chain of out_* labels (in reverse order of setup) and the error
+ * code of the failing step is returned.
+ */
+static int __init smc_init(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&smc_net_ops);
+ if (rc)
+ return rc;
+
+ rc = register_pernet_subsys(&smc_net_stat_ops);
+ if (rc)
+ goto out_pernet_subsys;
+
+ rc = smc_ism_init();
+ if (rc)
+ goto out_pernet_subsys_stat;
+ smc_clc_init();
+
+ rc = smc_nl_init();
+ if (rc)
+ goto out_ism;
+
+ rc = smc_pnet_init();
+ if (rc)
+ goto out_nl;
+
+ /* alloc_workqueue() reports failure only by returning NULL */
+ rc = -ENOMEM;
+
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ if (!smc_tcp_ls_wq)
+ goto out_pnet;
+
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_alloc_tcp_ls_wq;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
+ rc = smc_core_init();
+ if (rc) {
+ pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+ goto out_alloc_wqs;
+ }
+
+ /* NOTE(review): no separate llc/cdc teardown is visible in the error
+ * path; failures of the next steps all unwind from out_core.
+ */
+ rc = smc_llc_init();
+ if (rc) {
+ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = smc_cdc_init();
+ if (rc) {
+ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = proto_register(&smc_proto6, 1);
+ if (rc) {
+ pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
+ goto out_proto;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+ if (rc) {
+ pr_err("%s: sock_register fails with %d\n", __func__, rc);
+ goto out_proto6;
+ }
+ INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+ INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
+
+ rc = smc_ib_register_client();
+ if (rc) {
+ pr_err("%s: ib_register fails with %d\n", __func__, rc);
+ goto out_sock;
+ }
+
+ rc = tcp_register_ulp(&smc_ulp_ops);
+ if (rc) {
+ pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
+ goto out_ib;
+ }
+
+ static_branch_enable(&tcp_have_smc);
+ return 0;
+
+ /* error unwinding: each label undoes one setup step and falls through */
+out_ib:
+ smc_ib_unregister_client();
+out_sock:
+ sock_unregister(PF_SMC);
+out_proto6:
+ proto_unregister(&smc_proto6);
+out_proto:
+ proto_unregister(&smc_proto);
+out_core:
+ smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
+out_alloc_tcp_ls_wq:
+ destroy_workqueue(smc_tcp_ls_wq);
+out_pnet:
+ smc_pnet_exit();
+out_nl:
+ smc_nl_exit();
+out_ism:
+ smc_clc_exit();
+ smc_ism_exit();
+out_pernet_subsys_stat:
+ unregister_pernet_subsys(&smc_net_stat_ops);
+out_pernet_subsys:
+ unregister_pernet_subsys(&smc_net_ops);
+
+ return rc;
+}
+
+/* Module exit: disables the tcp_have_smc static branch, unregisters the TCP
+ * ULP and the socket family, shuts down the SMC sub-components, destroys the
+ * work queues, unregisters the protocols and the pernet subsystems, and ends
+ * with an rcu_barrier().
+ */
+static void __exit smc_exit(void)
+{
+ static_branch_disable(&tcp_have_smc);
+ tcp_unregister_ulp(&smc_ulp_ops);
+ sock_unregister(PF_SMC);
+ smc_core_exit();
+ smc_ib_unregister_client();
+ smc_ism_exit();
+ destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_tcp_ls_wq);
+ destroy_workqueue(smc_hs_wq);
+ proto_unregister(&smc_proto6);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+ smc_nl_exit();
+ smc_clc_exit();
+ unregister_pernet_subsys(&smc_net_stat_ops);
+ unregister_pernet_subsys(&smc_net_ops);
+ /* wait for RCU callbacks that are still pending before returning */
+ rcu_barrier();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
+MODULE_ALIAS_TCP_ULP("smc");
+MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000..e377980b8
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for the SMC module (socket related)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMC_V1 1 /* SMC version V1 */
+#define SMC_V2 2 /* SMC version V2 */
+
+#define SMC_RELEASE_0 0
+#define SMC_RELEASE_1 1
+#define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */
+
+#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
+#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
+
+#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
+ * devices
+ */
+#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
+
+extern struct proto smc_proto;
+extern struct proto smc_proto6;
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
+ /* normal close */
+ SMC_PEERCLOSEWAIT1 = 20,
+ SMC_PEERCLOSEWAIT2 = 21,
+ SMC_APPFINCLOSEWAIT = 24,
+ SMC_APPCLOSEWAIT1 = 22,
+ SMC_APPCLOSEWAIT2 = 23,
+ SMC_PEERFINCLOSEWAIT = 25,
+ /* abnormal close */
+ SMC_PEERABORTWAIT = 26,
+ SMC_PROCESSABORT = 27,
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
+ union {
+ u8 type;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ struct {
+ u8 llc_version:4,
+ llc_type:4;
+ };
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ struct {
+ u8 llc_type:4,
+ llc_version:4;
+ };
+#endif
+ };
+} __aligned(1);
+
+struct smc_cdc_conn_state_flags { /* one byte of connection state bits */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 peer_done_writing : 1; /* Sending done indicator */
+ u8 peer_conn_closed : 1; /* Peer connection closed indicator */
+ u8 peer_conn_abort : 1; /* Abnormal close indicator */
+ u8 reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 5; /* same fields as above, declared in reverse order */
+ u8 peer_conn_abort : 1;
+ u8 peer_conn_closed : 1;
+ u8 peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags { /* one byte of producer status bits */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
+ u8 urg_data_pending : 1; /* Urgent Data Pending */
+ u8 urg_data_present : 1; /* Urgent Data Present */
+ u8 cons_curs_upd_req : 1; /* cursor update requested */
+ u8 failover_validation : 1;/* message replay due to failover */
+ u8 reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 3; /* same fields as above, declared in reverse order */
+ u8 failover_validation : 1;
+ u8 cons_curs_upd_req : 1;
+ u8 urg_data_present : 1;
+ u8 urg_data_pending : 1;
+ u8 write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
+ struct {
+ u16 reserved;
+ u16 wrap; /* window wrap sequence number */
+ u32 count; /* cursor (= offset) part */
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg { /* Connection Data Control message */
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* length = 44 */
+ u16 seqno; /* connection seq # */
+ u32 token; /* alert_token */
+ union smc_host_cursor prod; /* producer cursor */
+ union smc_host_cursor cons; /* consumer cursor,
+ * piggy backed "ack"
+ */
+ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
+ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
+ u8 reserved[18];
+} __aligned(8);
+
+enum smc_urg_state {
+ SMC_URG_VALID = 1, /* data present */
+ SMC_URG_NOTYET = 2, /* data pending */
+ SMC_URG_READ = 3, /* data was already read */
+};
+
+struct smc_mark_woken {
+ bool woken;
+ void *key;
+ wait_queue_entry_t wait_entry;
+};
+
+struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+ struct smc_link *lnk; /* assigned SMC-R link */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_rmbe_idx; /* from tcp handshake */
+ int peer_rmbe_size; /* size of peer rx buffer */
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
+ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size_comp; /* compressed notation */
+ int rmbe_update_limit;
+ /* lower limit for consumer
+ * cursor update
+ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+ * .prod cf. TCP snd_nxt
+ * .cons cf. TCP sends ack
+ */
+ union smc_host_cursor local_tx_ctrl_fin;
+ /* prod crsr - confirmed by peer
+ */
+ union smc_host_cursor tx_curs_prep; /* tx - prepared data
+ * snd_max..wmem_alloc
+ */
+ union smc_host_cursor tx_curs_sent; /* tx - sent data
+ * snd_nxt ?
+ */
+ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
+ * snd-wnd-begin ?
+ */
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ u16 tx_cdc_seq_fin; /* sequence # - tx completed */
+ spinlock_t send_lock; /* protect wr_sends */
+ atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
+ * - inc when post wqe,
+ * - dec on polled tx cqe
+ */
+ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
+ atomic_t tx_pushing; /* nr_threads trying tx push */
+ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
+ u32 tx_off; /* base offset in peer rmb */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+ * .cons cf. TCP snd_una
+ */
+ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
+ * source of snd_una ?
+ */
+ union smc_host_cursor urg_curs; /* points at urgent byte */
+ enum smc_urg_state urg_state;
+ bool urg_tx_pend; /* urgent data staged */
+ bool urg_rx_skip_pend;
+ /* indicate urgent oob data
+ * read, but previous regular
+ * data still pending
+ */
+ char urg_rx_byte; /* urgent byte */
+ bool tx_in_release_sock;
+ /* flush pending tx data in
+ * sock release_cb()
+ */
+ atomic_t bytes_to_rcv; /* arrived data,
+ * not yet received
+ */
+ atomic_t splice_pending; /* number of spliced bytes
+ * pending processing
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+ spinlock_t acurs_lock; /* protect cursors */
+#endif
+ struct work_struct close_work; /* peer sent some closing */
+ struct work_struct abort_work; /* abort the connection */
+ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
+ u8 rx_off; /* receive offset:
+ * 0 for SMC-R, 32 for SMC-D
+ */
+ u64 peer_token; /* SMC-D token of peer */
+ u8 killed : 1; /* abnormal termination */
+ u8 freed : 1; /* normal termination */
+ u8 out_of_sync : 1; /* out of sync with peer */
+};
+
+struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_state_change)(struct sock *sk);
+ /* original state_change fct. */
+ void (*clcsk_data_ready)(struct sock *sk);
+ /* original data_ready fct. */
+ void (*clcsk_write_space)(struct sock *sk);
+ /* original write_space fct. */
+ void (*clcsk_error_report)(struct sock *sk);
+ /* original error_report fct. */
+ struct smc_connection conn; /* smc connection */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct connect_work; /* handle non-blocking connect*/
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
+ bool limit_smc_hs; /* put constraint on handshake */
+ bool use_fallback; /* fallback to tcp */
+ int fallback_rsn; /* reason for fallback */
+ u32 peer_diagnosis; /* decline reason from peer */
+ atomic_t queued_smc_hs; /* queued smc handshakes */
+ struct inet_connection_sock_af_ops af_ops;
+ const struct inet_connection_sock_af_ops *ori_af_ops;
+ /* original af ops */
+ int sockopt_defer_accept;
+ /* sockopt TCP_DEFER_ACCEPT
+ * value
+ */
+ u8 wait_close_tx_prepared : 1;
+ /* shutdown wr or close
+ * started, waiting for unsent
+ * data to be sent
+ */
+ u8 connect_nonblock : 1;
+ /* non-blocking connect in
+ * flight
+ */
+ struct mutex clcsock_release_lock;
+ /* protects clcsock of a listen
+ * socket
+ */
+};
+
+#define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk)
+
+static inline void smc_init_saved_callbacks(struct smc_sock *smc)
+{ /* clear all four saved clcsock callback slots */
+ smc->clcsk_state_change = NULL;
+ smc->clcsk_data_ready = NULL;
+ smc->clcsk_write_space = NULL;
+ smc->clcsk_error_report = NULL;
+}
+
+static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
+{ /* return clcsk->sk_user_data as smc_sock pointer, NOCOPY bit masked off */
+ return (struct smc_sock *)
+ ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
+}
+
+/* install new_cb in *target_cb; the previous value goes to *saved_cb first */
+static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *),
+ void (*new_cb)(struct sock *),
+ void (**saved_cb)(struct sock *))
+{
+ if (*saved_cb == NULL) /* an already saved callback is kept */
+ *saved_cb = *target_cb;
+
+ *target_cb = new_cb;
+}
+
+/* put a saved callback back into *target_cb and clear the save slot */
+static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *),
+ void (**saved_cb)(struct sock *))
+{
+ if (*saved_cb) { /* nothing saved means nothing to restore */
+ *target_cb = *saved_cb;
+ *saved_cb = NULL;
+ }
+}
+
+extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+extern struct workqueue_struct *smc_close_wq; /* wq for close work */
+
+#define SMC_SYSTEMID_LEN 8
+
+extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+#define ntohll(x) be64_to_cpu(x)
+#define htonll(x) cpu_to_be64(x)
+
+/* store the low-order 24 bits of host in net[0..2], network byte order */
+static inline void hton24(u8 *net, u32 host)
+{
+ __be32 be_val = cpu_to_be32(host);
+ const u8 *bytes = (const u8 *)&be_val;
+
+ memcpy(net, &bytes[1], 3); /* bytes[0], the top byte, is dropped */
+}
+
+/* read a 3 byte network byte order field and return it in host byte order */
+static inline u32 ntoh24(u8 *net)
+{
+ __be32 be_val = 0;
+
+ memcpy(&((u8 *)&be_val)[1], net, 3); /* top byte stays zero */
+ return be32_to_cpu(be_val);
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ /* true if either sk_policy slot of the internal tcp socket is set */
+ return smc->clcsock->sk->sk_policy[0] || smc->clcsock->sk->sk_policy[1];
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return false; /* built without CONFIG_XFRM */
+}
+#endif
+
+struct smc_gidlist;
+
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
+void smc_fill_gid_list(struct smc_link_group *lgr,
+ struct smc_gidlist *gidlist,
+ struct smc_ib_device *known_dev, u8 *known_gid);
+
+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+
+static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag)
+{ /* set bit 'flag' in sk->sk_flags via set_bit() */
+ set_bit(flag, &sk->sk_flags);
+}
+
+#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000..3c06625ce
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,493 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+/********************************** send *************************************/
+
+/* handler for send/transmission completion of a CDC msg; holds bh_lock_sock */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ struct smc_link *link, /* not used in this handler */
+ enum ib_wc_status wc_status)
+{
+ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_connection *conn = cdcpend->conn;
+ struct smc_buf_desc *sndbuf_desc;
+ struct smc_sock *smc;
+ int diff;
+
+ sndbuf_desc = conn->sndbuf_desc;
+ smc = container_of(conn, struct smc_sock, conn);
+ bh_lock_sock(&smc->sk);
+ if (!wc_status && sndbuf_desc) { /* zero status and sndbuf still there */
+ diff = smc_curs_diff(sndbuf_desc->len,
+ &cdcpend->conn->tx_curs_fin,
+ &cdcpend->cursor);
+ /* sndbuf_space is decreased in smc_sendmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff, &cdcpend->conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
+ smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor,
+ conn);
+ conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
+ }
+
+ if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { /* last pending CDC wr */
+ /* If user owns the sock_lock, mark the connection need sending.
+ * User context will later try to send when it release sock_lock
+ * in smc_release_cb()
+ */
+ if (sock_owned_by_user(&smc->sk))
+ conn->tx_in_release_sock = true;
+ else
+ smc_tx_pending(conn);
+
+ if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
+ wake_up(&conn->cdc_pend_tx_wq);
+ }
+ WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
+
+ smc_tx_sndbuf_nonfull(smc);
+ bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_cdc_tx_pend **pend)
+{ /* get a wr tx slot on link for a CDC msg; -EPIPE if conn is killed */
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+ wr_rdma_buf,
+ (struct smc_wr_tx_pend_priv **)pend);
+ if (conn->killed) {
+ /* abnormal termination: hand back a slot we did get */
+ if (!rc)
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)(*pend));
+ rc = -EPIPE;
+ }
+ return rc;
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend)
+{ /* snapshot conn state into pend for use by smc_cdc_tx_handler() */
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+ BUILD_BUG_ON_MSG(
+ offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+ pend->conn = conn;
+ pend->cursor = conn->tx_curs_sent; /* copied to tx_curs_fin on completion */
+ pend->p_cursor = conn->local_tx_ctrl.prod; /* -> local_tx_ctrl_fin */
+ pend->ctrl_seq = conn->tx_cdc_seq; /* -> tx_cdc_seq_fin */
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+ struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend)
+{ /* build a CDC msg from conn->local_tx_ctrl in wr_buf and post it */
+ struct smc_link *link = conn->lnk;
+ union smc_host_cursor cfed;
+ int rc;
+
+ smc_cdc_add_pending_send(conn, pend);
+
+ conn->tx_cdc_seq++;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (!rc) { /* posted: cfed holds the consumer cursor just sent */
+ smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ } else { /* post failed: undo the seqno and pending-count changes */
+ conn->tx_cdc_seq--;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ atomic_dec(&conn->cdc_pend_tx_wr);
+ }
+
+ return rc;
+}
+
+/* send a validation msg indicating the move of a conn to another QP link */
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
+ struct smc_link *link = conn->lnk;
+ struct smc_cdc_msg *peer;
+ int rc;
+
+ peer = (struct smc_cdc_msg *)wr_buf;
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */
+ peer->token = htonl(local->token);
+ peer->prod_flags.failover_validation = 1;
+
+ /* We need to set pend->conn here to make sure smc_cdc_tx_handler()
+ * can handle properly
+ */
+ smc_cdc_add_pending_send(conn, pend);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (unlikely(rc)) /* post failed: take the pending count back */
+ atomic_dec(&conn->cdc_pend_tx_wr);
+
+ return rc;
+}
+
+static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{ /* SMC-R: hold conn->lnk, get a slot and send a CDC msg under send_lock */
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ struct smc_link *link;
+ bool again = false;
+ int rc;
+
+again:
+ link = conn->lnk;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend);
+ if (rc)
+ goto put_out;
+
+ spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, try again one time */
+ spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ smc_wr_tx_link_put(link);
+ if (again) /* already retried once */
+ return -ENOLINK;
+ again = true;
+ goto again;
+ }
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ spin_unlock_bh(&conn->send_lock);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{ /* send a CDC msg for conn, dispatching on lgr->is_smcd */
+ int rc;
+
+ if (!smc_conn_lgr_valid(conn) ||
+ (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
+ return -EPIPE;
+
+ if (conn->lgr->is_smcd) { /* SMC-D: send directly under send_lock */
+ spin_lock_bh(&conn->send_lock);
+ rc = smcd_cdc_msg_send(conn);
+ spin_unlock_bh(&conn->send_lock);
+ } else { /* SMC-R: needs a wr tx slot first */
+ rc = smcr_cdc_get_slot_and_msg_send(conn);
+ }
+
+ return rc;
+}
+
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
+{ /* sleep on cdc_pend_tx_wq until conn->cdc_pend_tx_wr reads zero */
+ wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
+}
+
+/* Send an SMC-D CDC header.
+ * This increments the free space available in our send buffer.
+ * Also update the confirmed receive buffer with what was sent to the peer.
+ */
+int smcd_cdc_msg_send(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ union smc_host_cursor curs;
+ struct smcd_cdc_msg cdc;
+ int rc, diff;
+
+ memset(&cdc, 0, sizeof(cdc));
+ cdc.common.type = SMC_CDC_MSG_TYPE;
+ curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs);
+ cdc.prod.wrap = curs.wrap;
+ cdc.prod.count = curs.count;
+ curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs);
+ cdc.cons.wrap = curs.wrap;
+ cdc.cons.count = curs.count;
+ cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags;
+ cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
+ rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
+ if (rc)
+ return rc;
+ smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); /* curs == cons here */
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ /* Calculate transmitted data and increment free send buffer space */
+ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+ &conn->tx_curs_sent);
+ /* increased by confirmed number of bytes */
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
+
+ smc_tx_sndbuf_nonfull(smc);
+ return rc;
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{ /* true if seq1 precedes seq2; the s16 cast keeps this valid across wrap */
+ return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+ int *diff_prod)
+{
+ struct smc_connection *conn = &smc->conn;
+ char *base;
+
+ /* new data included urgent business: urg_curs = current producer cursor */
+ smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
+ conn->urg_state = SMC_URG_VALID;
+ if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+ /* we'll skip the urgent byte, so don't account for it */
+ (*diff_prod)--;
+ base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
+ if (conn->urg_curs.count) /* urgent byte is the one before the cursor */
+ conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+ else /* cursor at 0: take the last byte of the buffer */
+ conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+ sk_send_sigurg(&smc->sk);
+}
+
+static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
+ struct smc_link *link)
+{
+ struct smc_connection *conn = &smc->conn;
+ u16 recv_seq = ntohs(cdc->seqno);
+ s16 diff;
+
+ /* check that seqnum was seen before */
+ diff = conn->local_rx_ctrl.seqno - recv_seq;
+ if (diff < 0) { /* diff larger than 0x7fff */
+ /* peer's seqno is ahead of what we received: drop connection */
+ conn->out_of_sync = 1; /* prevent any further receives */
+ spin_lock_bh(&conn->send_lock);
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ conn->lnk = link;
+ spin_unlock_bh(&conn->send_lock);
+ sock_hold(&smc->sk); /* sock_put in abort_work */
+ if (!queue_work(smc_close_wq, &conn->abort_work))
+ sock_put(&smc->sk);
+ }
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+ struct smc_cdc_msg *cdc)
+{
+ union smc_host_cursor cons_old, prod_old;
+ struct smc_connection *conn = &smc->conn;
+ int diff_cons, diff_prod;
+
+ smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); /* before update */
+ smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
+ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+ diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+ &conn->local_rx_ctrl.cons);
+ if (diff_cons) {
+ /* peer_rmbe_space is decreased during data transfer with RDMA
+ * write
+ */
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ }
+
+ diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
+ &conn->local_rx_ctrl.prod);
+ if (diff_prod) {
+ if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+ smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
+ /* bytes_to_rcv is decreased in smc_recvmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff_prod, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
+ smp_mb__after_atomic();
+ smc->sk.sk_data_ready(&smc->sk);
+ } else { /* no new data: only react to flag changes */
+ if (conn->local_rx_ctrl.prod_flags.write_blocked)
+ smc->sk.sk_data_ready(&smc->sk);
+ if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+ conn->urg_state = SMC_URG_NOTYET;
+ }
+
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+ if ((diff_cons && smc_tx_prepared_sends(conn)) ||
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+ if (!sock_owned_by_user(&smc->sk))
+ smc_tx_pending(conn);
+ else /* defer to the sock release callback */
+ conn->tx_in_release_sock = true;
+ }
+
+ if (diff_cons && conn->urg_tx_pend &&
+ atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+ /* urg data confirmed by peer, indicate we're ready for more */
+ conn->urg_tx_pend = false;
+ smc->sk.sk_write_space(&smc->sk);
+ }
+
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ smc->sk.sk_err = ECONNRESET;
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ }
+ if (smc_cdc_rxed_any_close_or_senddone(conn)) {
+ smc->sk.sk_shutdown |= RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk)
+ smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+ smc_sock_set_flag(&smc->sk, SOCK_DONE);
+ sock_hold(&smc->sk); /* sock_put in close_work */
+ if (!queue_work(smc_close_wq, &conn->close_work))
+ sock_put(&smc->sk);
+ }
+}
+
+/* called under tasklet context; runs the recv action with the sock bh-locked */
+static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
+{
+ sock_hold(&smc->sk);
+ bh_lock_sock(&smc->sk);
+ smc_cdc_msg_recv_action(smc, cdc);
+ bh_unlock_sock(&smc->sk);
+ sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/* Receive tasklet body for this connection. Triggered from the ISM device IRQ
+ * handler to indicate update in the DMBE.
+ *
+ * Context:
+ * - tasklet context
+ */
+static void smcd_cdc_rx_tsklet(struct tasklet_struct *t)
+{
+ struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet);
+ struct smcd_cdc_msg *data_cdc;
+ struct smcd_cdc_msg cdc;
+ struct smc_sock *smc;
+
+ if (!conn || conn->killed)
+ return;
+
+ data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr;
+ smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn); /* only prod and cons */
+ smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn); /* of cdc get filled */
+ smc = container_of(conn, struct smc_sock, conn);
+ smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc);
+}
+
+/* Initialize receive tasklet. Called from ISM device IRQ handler to start
+ * receiver side.
+ */
+void smcd_cdc_rx_init(struct smc_connection *conn)
+{
+ tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet); /* handler for rx_tsklet */
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{ /* rx handler for CDC msgs: validate, find the connection, dispatch */
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_cdc_msg *cdc = buf;
+ struct smc_connection *conn;
+ struct smc_link_group *lgr;
+ struct smc_sock *smc;
+
+ if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+ return; /* short message */
+ if (cdc->len != SMC_WR_TX_SIZE)
+ return; /* invalid message */
+
+ /* lookup connection by token; NOTE(review): used after unlock - confirm */
+ lgr = smc_get_lgr(link);
+ read_lock_bh(&lgr->conns_lock);
+ conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conn || conn->out_of_sync)
+ return;
+ smc = container_of(conn, struct smc_sock, conn);
+
+ if (cdc->prod_flags.failover_validation) {
+ smc_cdc_msg_validate(smc, cdc, link);
+ return;
+ }
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+
+ smc_cdc_msg_recv(smc, cdc);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+ {
+ .handler = smc_cdc_rx_handler,
+ .type = SMC_CDC_MSG_TYPE
+ },
+ {
+ .handler = NULL, /* terminator: smc_cdc_init() stops here */
+ }
+};
+
+int __init smc_cdc_init(void)
+{ /* register every entry of smc_cdc_rx_handlers; returns first error or 0 */
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break; /* earlier registrations are not undone here */
+ }
+ return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000..696cc11f2
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define SMC_CDC_MSG_TYPE 0xFE
+
+/* in network byte order */
+union smc_cdc_cursor { /* SMC cursor */
+ struct {
+ __be16 reserved;
+ __be16 wrap;
+ __be32 count;
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* 44 */
+ __be16 seqno;
+ __be32 token;
+ union smc_cdc_cursor prod;
+ union smc_cdc_cursor cons; /* piggy backed "ack" */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 reserved[18];
+};
+
+/* SMC-D cursor format */
+union smcd_cdc_cursor {
+ struct {
+ u16 wrap;
+ u32 count;
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ } __packed;
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* CDC message for SMC-D */
+struct smcd_cdc_msg {
+ struct smc_wr_rx_hdr common; /* Type = 0xFE */
+ u8 res1[7];
+ union smcd_cdc_cursor prod;
+ union smcd_cdc_cursor cons;
+ u8 res3[8];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return true; /* abort flag received from peer */
+ return conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+ struct smc_connection *conn)
+{
+ return conn->local_rx_ctrl.conn_state_flags.peer_done_writing ||
+ smc_cdc_rxed_any_close(conn);
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+ int value)
+{
+ curs->count += value;
+ if (curs->count < size)
+ return; /* still below size: no wrap */
+ curs->wrap++;
+ curs->count -= size;
+}
+
+/* Copy cursor src into tgt as one 64-bit value (acurs) */
+static inline void smc_curs_copy(union smc_host_cursor *tgt,
+ union smc_host_cursor *src,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64 /* plain u64 acurs: copy under conn->acurs_lock */
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else /* atomic64_t acurs: conn is not used */
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
+ union smc_cdc_cursor *src,
+ struct smc_connection *conn)
+{ /* copy a network byte order cursor as one 64-bit value; no conversion */
+#ifndef KERNEL_HAS_ATOMIC64 /* plain u64 acurs: copy under conn->acurs_lock */
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else /* atomic64_t acurs: conn is not used */
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt,
+ union smcd_cdc_cursor *src,
+ struct smc_connection *conn)
+{ /* copy an SMC-D cursor, flag bytes included, as one 64-bit value */
+#ifndef KERNEL_HAS_ATOMIC64 /* plain u64 acurs: copy under conn->acurs_lock */
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else /* atomic64_t acurs: conn is not used */
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+/* distance from old to new for old <= new, at most size apart; a negative
+ * result is clamped to 0
+ */
+static inline int smc_curs_diff(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ int delta = new->count - old->count;
+
+ if (old->wrap != new->wrap)
+ delta += size; /* wraps differ: add one buffer length */
+ return max_t(int, 0, delta);
+}
+
+/* signed cursor distance from old to new; the result is negative when old
+ * is ahead of new
+ */
+static inline int smc_curs_comp(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap < new->wrap ||
+ (old->wrap == new->wrap && old->count <= new->count))
+ return smc_curs_diff(size, old, new);
+ return -smc_curs_diff(size, new, old);
+}
+
+/* calculate cursor difference between old and new, where old <= new and
+ * difference may exceed size; the result is capped at size if wraps differ
+ */
+static inline int smc_curs_diff_large(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap < new->wrap) /* rest of old lap + new count + full laps */
+ return min_t(int,
+ (size - old->count) + new->count +
+ (new->wrap - old->wrap - 1) * size,
+ size);
+
+ if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */
+ return min_t(int,
+ (size - old->count) + new->count +
+ (new->wrap + 0xffff - old->wrap) * size,
+ size);
+
+ return max_t(int, 0, (new->count - old->count)); /* same wrap */
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+ union smc_host_cursor *local,
+ union smc_host_cursor *save,
+ struct smc_connection *conn)
+{ /* snapshot local into save, then write it to peer in network byte order */
+ smc_curs_copy(save, local, conn);
+ peer->count = htonl(save->count);
+ peer->wrap = htons(save->wrap);
+ /* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+ struct smc_connection *conn,
+ union smc_host_cursor *save)
+{ /* convert conn->local_tx_ctrl into the network byte order msg peer */
+ struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
+
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(local->seqno);
+ peer->token = htonl(local->token);
+ smc_host_cursor_to_cdc(&peer->prod, &local->prod, save, conn);
+ smc_host_cursor_to_cdc(&peer->cons, &local->cons, save, conn); /* save = cons */
+ peer->prod_flags = local->prod_flags;
+ peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+ union smc_cdc_cursor *peer,
+ struct smc_connection *conn)
+{ /* store peer in host byte order into local unless it looks older */
+ union smc_host_cursor temp, old;
+ union smc_cdc_cursor net;
+
+ smc_curs_copy(&old, local, conn);
+ smc_curs_copy_net(&net, peer, conn);
+ temp.count = ntohl(net.count); /* NOTE(review): temp.reserved is never set */
+ temp.wrap = ntohs(net.wrap);
+ if ((old.wrap > temp.wrap) && temp.wrap) /* smaller wrap, but not zero */
+ return;
+ if ((old.wrap == temp.wrap) &&
+ (old.count > temp.count)) /* same wrap, smaller count */
+ return;
+ smc_curs_copy(local, &temp, conn);
+}
+
+static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{ /* convert a received SMC-R CDC msg into host byte order */
+ local->common.type = peer->common.type;
+ local->len = peer->len;
+ local->seqno = ntohs(peer->seqno);
+ local->token = ntohl(peer->token);
+ smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+ smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+ local->prod_flags = peer->prod_flags; /* flag bytes are copied as is */
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smcd_cdc_msg *peer,
+ struct smc_connection *conn)
+{ /* copy cursors and flags of an SMC-D CDC msg; no byte order conversion */
+ union smc_host_cursor temp;
+
+ temp.wrap = peer->prod.wrap; /* NOTE(review): temp.reserved is never set */
+ temp.count = peer->prod.count;
+ smc_curs_copy(&local->prod, &temp, conn);
+
+ temp.wrap = peer->cons.wrap;
+ temp.count = peer->cons.count;
+ smc_curs_copy(&local->cons, &temp, conn);
+ local->prod_flags = peer->cons.prod_flags; /* flags come from the cons cursor */
+ local->conn_state_flags = peer->cons.conn_state_flags;
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ if (!conn->lgr->is_smcd) /* SMC-R message layout */
+ smcr_cdc_msg_to_host(local, peer, conn);
+ else /* SMC-D: peer really is a struct smcd_cdc_msg */
+ smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer, conn);
+}
+
+struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+ union smc_host_cursor cursor; /* tx sndbuf cursor sent */
+ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
+ u16 ctrl_seq; /* conn. tx sequence # */
+};
+
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_cdc_tx_pend **pend);
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+int smcd_cdc_msg_send(struct smc_connection *conn);
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf);
+int smc_cdc_init(void) __init;
+void smcd_cdc_rx_init(struct smc_connection *conn);
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000..72f4d81a3
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,1278 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016, 2018
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <linux/inetdevice.h>
+#include <linux/if_ether.h>
+#include <linux/sched/signal.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
+
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+#include "smc_netlink.h"
+
+/* Lengths in bytes of CLC ACCEPT/CONFIRM messages including the trailer.
+ * The V1 values are the exact lengths expected on receive; the V2 values
+ * are the minimum, since further parts may be appended to a V2 message.
+ */
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108
+/* size of the scratch buffer used to read and discard message data that
+ * does not fit into the caller's receive buffer
+ */
+#define SMC_CLC_RECV_BUF_LEN 100
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+/* eye catcher "SMCD" EBCDIC for CLC messages */
+static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
+
+/* hostname copied into the first contact extension of V2 messages */
+static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN];
+
+/* Table of user-defined EIDs (UEIDs) plus the switch for the system EID */
+struct smc_clc_eid_table {
+ rwlock_t lock; /* protects list, ueid_cnt and seid_enabled */
+ struct list_head list; /* list of struct smc_clc_eid_entry */
+ u8 ueid_cnt; /* number of entries in list */
+ u8 seid_enabled; /* non-zero if the system EID may be used */
+};
+
+/* the one EID table instance used by this file */
+static struct smc_clc_eid_table smc_clc_eid_table;
+
+/* One user EID as stored in smc_clc_eid_table.list */
+struct smc_clc_eid_entry {
+ struct list_head list;
+ u8 eid[SMC_MAX_EID_LEN]; /* fixed length, not NUL-terminated */
+};
+
+/* Validate a user EID of SMC_MAX_EID_LEN characters.
+ * Accepted characters are upper-case letters, digits, '.' and '-'; the
+ * first character must be an upper-case letter or a digit. White space is
+ * accepted only as trailing padding, and an EID consisting of nothing but
+ * padding is rejected.
+ */
+static bool smc_clc_ueid_valid(char *ueid)
+{
+	char *last = ueid + SMC_MAX_EID_LEN - 1;
+	char *p;
+
+	/* skip the trailing padding */
+	while (last >= ueid && isspace(*last))
+		last--;
+	if (last < ueid)
+		return false;
+
+	if (!isalnum(*ueid) || islower(*ueid))
+		return false;
+
+	for (p = ueid; p <= last; p++) {
+		if (*p == '.' || *p == '-')
+			continue;
+		if (!isalnum(*p) || islower(*p))
+			return false;
+	}
+	return true;
+}
+
+/* Add @ueid to the user EID table.
+ * Returns 0 on success, -EINVAL for a malformed EID, -ENOMEM if the new
+ * entry cannot be allocated, -ERANGE if the table already holds
+ * SMC_MAX_UEID entries and -EEXIST if the EID is already in the table.
+ */
+static int smc_clc_ueid_add(char *ueid)
+{
+	struct smc_clc_eid_entry *entry, *cur;
+	int rc = 0;
+
+	if (!smc_clc_ueid_valid(ueid))
+		return -EINVAL;
+
+	/* allocate before taking the table lock */
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	memcpy(entry->eid, ueid, SMC_MAX_EID_LEN);
+
+	write_lock(&smc_clc_eid_table.lock);
+	if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) {
+		rc = -ERANGE;
+		goto unlock;
+	}
+	list_for_each_entry(cur, &smc_clc_eid_table.list, list) {
+		if (memcmp(cur->eid, ueid, SMC_MAX_EID_LEN) == 0) {
+			rc = -EEXIST;
+			goto unlock;
+		}
+	}
+	list_add_tail(&entry->list, &smc_clc_eid_table.list);
+	smc_clc_eid_table.ueid_cnt++;
+unlock:
+	write_unlock(&smc_clc_eid_table.lock);
+	if (rc)
+		kfree(entry);	/* entry was not linked into the table */
+	return rc;
+}
+
+/* Return the number of user EIDs currently in the table */
+int smc_clc_ueid_count(void)
+{
+	int cnt;
+
+	read_lock(&smc_clc_eid_table.lock);
+	cnt = smc_clc_eid_table.ueid_cnt;
+	read_unlock(&smc_clc_eid_table.lock);
+	return cnt;
+}
+
+/* netlink: add the user EID carried in SMC_NLA_EID_TABLE_ENTRY.
+ * The attribute must be exactly SMC_MAX_EID_LEN + 1 bytes long, otherwise
+ * -EINVAL is returned; else the result of smc_clc_ueid_add() is passed on.
+ */
+int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *attr = info->attrs[SMC_NLA_EID_TABLE_ENTRY];
+
+	if (!attr)
+		return -EINVAL;
+	if (nla_len(attr) != SMC_MAX_EID_LEN + 1)
+		return -EINVAL;
+
+	return smc_clc_ueid_add((char *)nla_data(attr));
+}
+
+/* Remove the entry matching @ueid from the user EID table, or all entries
+ * if @ueid is NULL.
+ * Returns -ENOENT if nothing was removed, -EAGAIN if something was removed
+ * and the table is empty afterwards (the SEID is enabled in that case),
+ * and 0 if something was removed and entries remain.
+ */
+static int smc_clc_ueid_remove(char *ueid)
+{
+	struct smc_clc_eid_entry *entry, *next;
+	bool removed = false;
+	int rc;
+
+	write_lock(&smc_clc_eid_table.lock);
+	list_for_each_entry_safe(entry, next, &smc_clc_eid_table.list, list) {
+		if (ueid && memcmp(entry->eid, ueid, SMC_MAX_EID_LEN))
+			continue;
+		list_del(&entry->list);
+		smc_clc_eid_table.ueid_cnt--;
+		kfree(entry);
+		removed = true;
+	}
+	if (!removed) {
+		rc = -ENOENT;
+	} else if (smc_clc_eid_table.ueid_cnt) {
+		rc = 0;
+	} else {
+		smc_clc_eid_table.seid_enabled = 1;
+		rc = -EAGAIN;	/* indicate success and enabling of seid */
+	}
+	write_unlock(&smc_clc_eid_table.lock);
+	return rc;
+}
+
+/* netlink: remove the user EID carried in SMC_NLA_EID_TABLE_ENTRY.
+ * The attribute must be exactly SMC_MAX_EID_LEN + 1 bytes long, otherwise
+ * -EINVAL is returned; else the result of smc_clc_ueid_remove() is passed
+ * on.
+ */
+int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *attr = info->attrs[SMC_NLA_EID_TABLE_ENTRY];
+
+	if (!attr)
+		return -EINVAL;
+	if (nla_len(attr) != SMC_MAX_EID_LEN + 1)
+		return -EINVAL;
+
+	return smc_clc_ueid_remove((char *)nla_data(attr));
+}
+
+/* netlink: remove all user EIDs. Always returns 0; the return code of
+ * smc_clc_ueid_remove() (e.g. -ENOENT for an empty table) is not reported.
+ */
+int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+ smc_clc_ueid_remove(NULL);
+ return 0;
+}
+
+/* Put one SMC_NETLINK_DUMP_UEID message carrying @ueid into @skb.
+ * Returns 0 on success, -ENOMEM if the message header does not fit and
+ * -EMSGSIZE if the EID attribute does not fit (the message is cancelled).
+ */
+static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq,
+				u32 flags, char *ueid)
+{
+	char eid_str[SMC_MAX_EID_LEN + 1];
+	void *nlh;
+
+	nlh = genlmsg_put(skb, portid, seq, &smc_gen_nl_family,
+			  flags, SMC_NETLINK_DUMP_UEID);
+	if (!nlh)
+		return -ENOMEM;
+
+	/* copy the fixed-length EID and terminate it for nla_put_string() */
+	memcpy(eid_str, ueid, SMC_MAX_EID_LEN);
+	eid_str[SMC_MAX_EID_LEN] = 0;
+	if (!nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, eid_str)) {
+		genlmsg_end(skb, nlh);
+		return 0;
+	}
+	genlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Dump user EID table entries into @skb, skipping the first @start_idx
+ * entries. Returns the index of the first entry that was not dumped, i.e.
+ * the position at which a following call has to continue.
+ */
+static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq,
+			     int start_idx)
+{
+	struct smc_clc_eid_entry *entry;
+	int pos = 0;
+
+	read_lock(&smc_clc_eid_table.lock);
+	list_for_each_entry(entry, &smc_clc_eid_table.list, list) {
+		if (pos < start_idx) {
+			/* already dumped by an earlier call */
+			pos++;
+			continue;
+		}
+		if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI,
+					 entry->eid))
+			break;	/* did not fit, retry this entry next time */
+		pos++;
+	}
+	read_unlock(&smc_clc_eid_table.lock);
+	return pos;
+}
+
+/* netlink dump callback for the user EID table; the continuation index is
+ * kept in pos[0] of the dump context. Returns skb->len.
+ */
+int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct smc_nl_dmp_ctx *ctx = smc_nl_dmp_ctx(cb);
+	int next;
+
+	next = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid,
+				 cb->nlh->nlmsg_seq, ctx->pos[0]);
+	ctx->pos[0] = next;
+
+	return skb->len;
+}
+
+/* netlink dump callback for the system EID (SEID).
+ * Emits a single SMC_NETLINK_DUMP_SEID message; if the system is V2
+ * capable it carries the SEID string and the seid_enabled flag, otherwise
+ * the message has no attributes.
+ * Returns skb->len, -ENOMEM if the message header does not fit, or
+ * -EMSGSIZE if an attribute does not fit.
+ */
+int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ char seid_str[SMC_MAX_EID_LEN + 1];
+ u8 seid_enabled;
+ void *hdr;
+ u8 *seid;
+
+ /* pos[0] is set once the message was emitted: nothing more to dump */
+ if (cb_ctx->pos[0])
+ return skb->len;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_SEID);
+ if (!hdr)
+ return -ENOMEM;
+ if (!smc_ism_is_v2_capable())
+ goto end;
+
+ /* NOTE(review): seid is used without a NULL check - presumably
+ * smc_ism_get_system_eid() always sets it when V2 capable; confirm
+ */
+ smc_ism_get_system_eid(&seid);
+ memcpy(seid_str, seid, SMC_MAX_EID_LEN);
+ seid_str[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str))
+ goto err;
+ read_lock(&smc_clc_eid_table.lock);
+ seid_enabled = smc_clc_eid_table.seid_enabled;
+ read_unlock(&smc_clc_eid_table.lock);
+ if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled))
+ goto err;
+end:
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0]++;
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+/* netlink: enable use of the system EID. Always returns 0. */
+int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info)
+{
+ write_lock(&smc_clc_eid_table.lock);
+ smc_clc_eid_table.seid_enabled = 1;
+ write_unlock(&smc_clc_eid_table.lock);
+ return 0;
+}
+
+/* netlink: disable use of the system EID.
+ * This is refused with -ENOENT while the user EID table is empty;
+ * otherwise the SEID is disabled and 0 is returned.
+ */
+int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = -ENOENT;
+
+	write_lock(&smc_clc_eid_table.lock);
+	if (smc_clc_eid_table.ueid_cnt) {
+		smc_clc_eid_table.seid_enabled = 0;
+		rc = 0;
+	}
+	write_unlock(&smc_clc_eid_table.lock);
+	return rc;
+}
+
+/* Check whether @peer_ueid is contained in the local user EID table.
+ * No locking is done here; smc_clc_match_eid() calls this function with
+ * the table's read lock held.
+ */
+static bool _smc_clc_match_ueid(u8 *peer_ueid)
+{
+	struct smc_clc_eid_entry *entry;
+	bool found = false;
+
+	list_for_each_entry(entry, &smc_clc_eid_table.list, list) {
+		if (memcmp(entry->eid, peer_ueid, SMC_MAX_EID_LEN) == 0) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+/* Find an EID that both sides agree on.
+ * The system EID is chosen if both @peer_eid and @local_eid are given and
+ * equal, the SEID is enabled locally and the peer set the seid flag in
+ * @smc_v2_ext. Otherwise the first user EID offered in @smc_v2_ext that
+ * is also in the local user EID table is chosen.
+ * On a match the EID is copied to @negotiated_eid and true is returned;
+ * without a match only negotiated_eid[0] is cleared and false is returned.
+ */
+bool smc_clc_match_eid(u8 *negotiated_eid,
+		       struct smc_clc_v2_extension *smc_v2_ext,
+		       u8 *peer_eid, u8 *local_eid)
+{
+	u8 *hit = NULL;
+	int i;
+
+	negotiated_eid[0] = 0;
+	read_lock(&smc_clc_eid_table.lock);
+	if (peer_eid && local_eid &&
+	    smc_clc_eid_table.seid_enabled &&
+	    smc_v2_ext->hdr.flag.seid &&
+	    !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) {
+		hit = peer_eid;
+	} else {
+		for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) {
+			if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) {
+				hit = smc_v2_ext->user_eids[i];
+				break;
+			}
+		}
+	}
+	if (hit)
+		memcpy(negotiated_eid, hit, SMC_MAX_EID_LEN);
+	read_unlock(&smc_clc_eid_table.lock);
+	return hit != NULL;
+}
+
+/* check arriving CLC proposal: the length announced in the header has to
+ * equal the length computed from the parts the message claims to contain
+ */
+static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc)
+{
+	struct smc_clc_msg_proposal_prefix *prfx;
+	struct smc_clc_msg_hdr *hdr = &pclc->hdr;
+	struct smc_clc_v2_extension *v2_ext;
+	size_t expected;
+
+	v2_ext = smc_get_clc_v2_ext(pclc);
+	prfx = smc_clc_proposal_get_prefix(pclc);
+
+	if (hdr->version == SMC_V1) {
+		/* a V1 proposal has to offer a V1 type */
+		if (hdr->typev1 == SMC_TYPE_N)
+			return false;
+		expected = sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+			   sizeof(*prfx) +
+			   prfx->ipv6_prefixes_cnt *
+				sizeof(struct smc_clc_ipv6_prefix) +
+			   sizeof(struct smc_clc_msg_trail);
+		return ntohs(hdr->length) == expected;
+	}
+
+	/* V2: base message, SMC-D area and trailer are always present */
+	expected = sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) +
+		   sizeof(struct smc_clc_msg_trail);
+	/* prefix area only if a V1 type is offered */
+	if (hdr->typev1 != SMC_TYPE_N)
+		expected += sizeof(*prfx) +
+			    prfx->ipv6_prefixes_cnt *
+				sizeof(struct smc_clc_ipv6_prefix);
+	/* V2 extension with its user EIDs only if a V2 type is offered */
+	if (hdr->typev2 != SMC_TYPE_N)
+		expected += sizeof(*v2_ext) +
+			    v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN;
+	/* SMC-D V2 extension with its GID/CHID list */
+	if (smcd_indicated(hdr->typev2))
+		expected += sizeof(struct smc_clc_smcd_v2_extension) +
+			    v2_ext->hdr.ism_gid_cnt *
+				sizeof(struct smc_clc_smcd_gid_chid);
+
+	return ntohs(hdr->length) == expected;
+}
+
+/* check arriving CLC accept or confirm: the type has to be SMC-R or SMC-D;
+ * a V1 message has to have exactly the fixed length for its type, any
+ * other version at least the V2 length for its type
+ */
+static bool
+smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
+{
+	struct smc_clc_msg_hdr *hdr = &clc_v2->hdr;
+	u16 msg_len = ntohs(hdr->length);
+
+	switch (hdr->typev1) {
+	case SMC_TYPE_R:
+		if (hdr->version == SMC_V1)
+			return msg_len == SMCR_CLC_ACCEPT_CONFIRM_LEN;
+		return msg_len >= SMCR_CLC_ACCEPT_CONFIRM_LEN_V2;
+	case SMC_TYPE_D:
+		if (hdr->version == SMC_V1)
+			return msg_len == SMCD_CLC_ACCEPT_CONFIRM_LEN;
+		return msg_len >= SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
+	default:
+		return false;
+	}
+}
+
+/* check arriving CLC decline: the type has to be SMC-R or SMC-D and the
+ * length has to be the size of the decline structure for the version
+ */
+static bool
+smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc)
+{
+	struct smc_clc_msg_hdr *hdr = &dclc->hdr;
+	size_t expected;
+
+	if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+		return false;
+
+	expected = hdr->version == SMC_V1 ?
+			sizeof(struct smc_clc_msg_decline) :
+			sizeof(struct smc_clc_msg_decline_v2);
+	return ntohs(hdr->length) == expected;
+}
+
+/* Fill the first contact extension @fce from @ini.
+ * Returns the number of bytes of @fce that are to be sent: only the base
+ * struct smc_clc_first_contact_ext for SMC-D with a release below
+ * SMC_RELEASE_1, the complete v2x extension otherwise. max_conns and
+ * max_links are filled in for SMC-R from SMC_RELEASE_1 on only.
+ */
+static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce,
+			    struct smc_init_info *ini)
+{
+	memset(fce, 0, sizeof(*fce));
+	fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX;
+	fce->fce_v2_base.release = ini->release_nr;
+	memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname));
+
+	if (ini->release_nr < SMC_RELEASE_1)
+		return ini->is_smcd ?
+			sizeof(struct smc_clc_first_contact_ext) :
+			sizeof(*fce);
+
+	if (!ini->is_smcd) {
+		fce->max_conns = ini->max_conns;
+		fce->max_links = ini->max_links;
+	}
+	return sizeof(*fce);
+}
+
+/* check if received message has a correct header length and contains valid
+ * heading and trailing eyecatchers
+ * @clcm: start of the received CLC message
+ * @check_trl: whether the trailing eyecatcher is to be checked; it is never
+ * checked for a DECLINE message
+ * Returns true if the message passed all checks, false otherwise (including
+ * an unknown message type).
+ */
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
+{
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_clc_msg_decline *dclc;
+ struct smc_clc_msg_trail *trl;
+
+ /* heading eyecatcher has to be either "SMCR" or "SMCD" */
+ if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
+ return false;
+ switch (clcm->type) {
+ case SMC_CLC_PROPOSAL:
+ pclc = (struct smc_clc_msg_proposal *)clcm;
+ if (!smc_clc_msg_prop_valid(pclc))
+ return false;
+ /* trailer is located at the end of the announced length */
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
+ break;
+ case SMC_CLC_ACCEPT:
+ case SMC_CLC_CONFIRM:
+ clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm;
+ if (!smc_clc_msg_acc_conf_valid(clc_v2))
+ return false;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) -
+ sizeof(*trl));
+ break;
+ case SMC_CLC_DECLINE:
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ if (!smc_clc_msg_decl_valid(dclc))
+ return false;
+ /* trl is left unset here, so the trailer check must be skipped */
+ check_trl = false;
+ break;
+ default:
+ return false;
+ }
+ if (check_trl &&
+ memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
+ return false;
+ return true;
+}
+
+/* Look up @ipv4 among the addresses of the device behind @dst and store
+ * prefix length and subnet of the first matching address in @prop.
+ * Returns 0 on success, -ENODEV if the device has no IPv4 configuration
+ * and -ENOENT if no address matches.
+ */
+static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
+				 struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+	const struct in_ifaddr *ifa;
+
+	if (!in_dev)
+		return -ENODEV;
+
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (inet_ifa_match(ipv4, ifa)) {
+			prop->prefix_len = inet_mask_len(ifa->ifa_mask);
+			prop->outgoing_subnet = ifa->ifa_address &
+						ifa->ifa_mask;
+			/* prop->ipv6_prefixes_cnt = 0; already done by
+			 * memset before
+			 */
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+/* fill CLC proposal msg with ipv6 prefixes from device
+ * Link-local addresses are skipped. The prefixes are stored in @ipv6_prfx
+ * and their number in @prop->ipv6_prefixes_cnt.
+ * Returns 0 if at least one prefix was stored, -ENODEV if the device has
+ * no IPv6 configuration, and -ENOENT if no prefix was found or IPv6 is not
+ * enabled in the kernel configuration.
+ */
+static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
+ struct smc_clc_msg_proposal_prefix *prop,
+ struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
+ struct inet6_ifaddr *ifa;
+ int cnt = 0;
+
+ if (!in6_dev)
+ return -ENODEV;
+ /* use a maximum of 8 IPv6 prefixes from device */
+ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
+ &ifa->addr, ifa->prefix_len);
+ ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
+ cnt++;
+ /* stop once the array is filled up to SMC_CLC_MAX_V6_PREFIX */
+ if (cnt == SMC_CLC_MAX_V6_PREFIX)
+ break;
+ }
+ prop->ipv6_prefixes_cnt = cnt;
+ if (cnt)
+ return 0;
+#endif
+ return -ENOENT;
+}
+
+/* retrieve and set prefixes in CLC proposal msg
+ * The prefix information is taken from the device of the route cached on
+ * @clcsock, selected by the local address the socket is bound to.
+ * Returns 0 on success, -ENOTCONN if the socket has no cached route,
+ * -ENODEV if the route has no device (or the device lacks the IP
+ * configuration), -ENOENT if the socket name cannot be obtained or no
+ * matching prefix exists.
+ */
+static int smc_clc_prfx_set(struct socket *clcsock,
+ struct smc_clc_msg_proposal_prefix *prop,
+ struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct sockaddr_storage addrs;
+ struct sockaddr_in6 *addr6;
+ struct sockaddr_in *addr;
+ int rc = -ENOENT;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+ /* get address to which the internal TCP socket is bound */
+ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0)
+ goto out_rel;
+ /* analyze IP specific data of net_device belonging to TCP socket */
+ addr6 = (struct sockaddr_in6 *)&addrs;
+ rcu_read_lock();
+ if (addrs.ss_family == PF_INET) {
+ /* IPv4 */
+ addr = (struct sockaddr_in *)&addrs;
+ rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+ } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
+ /* mapped IPv4 address - peer is IPv4 only */
+ rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
+ prop);
+ } else {
+ /* IPv6 */
+ rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
+ }
+ rcu_read_unlock();
+out_rel:
+ /* drop the reference taken by sk_dst_get() */
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* Match the IPv4 subnet proposed in @prop against the addresses of @dev.
+ * Returns 0 if an address with the same prefix length lies in the proposed
+ * subnet, -ENODEV if @dev has no IPv4 configuration, -ENOENT otherwise.
+ */
+static int smc_clc_prfx_match4_rcu(struct net_device *dev,
+				   struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	const struct in_ifaddr *ifa;
+
+	if (!in_dev)
+		return -ENODEV;
+
+	in_dev_for_each_ifa_rcu(ifa, in_dev) {
+		if (prop->prefix_len != inet_mask_len(ifa->ifa_mask))
+			continue;
+		if (inet_ifa_match(prop->outgoing_subnet, ifa))
+			return 0;
+	}
+	return -ENOENT;
+}
+
+/* match ipv6 addrs of dev against addrs in CLC proposal
+ * Returns 0 if a non-link-local address of @dev has the same prefix and
+ * prefix length as one of the proposed prefixes, -ENODEV if @dev has no
+ * IPv6 configuration, and -ENOENT if nothing matches or IPv6 is not
+ * enabled in the kernel configuration.
+ */
+static int smc_clc_prfx_match6_rcu(struct net_device *dev,
+ struct smc_clc_msg_proposal_prefix *prop)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct inet6_ifaddr *ifa;
+ int i, max;
+
+ if (!in6_dev)
+ return -ENODEV;
+ /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
+ ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
+ /* the count comes from the peer: never look at more than the maximum */
+ max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
+ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ for (i = 0; i < max; i++) {
+ if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
+ ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
+ ifa->prefix_len))
+ return 0;
+ }
+ }
+#endif
+ return -ENOENT;
+}
+
+/* Check if the prefixes proposed in @prop match one of the prefixes of the
+ * device behind the route cached on @clcsock. A proposal without IPv6
+ * prefixes is matched as IPv4.
+ * Returns 0 on a match, -ENOTCONN if the socket has no cached route,
+ * -ENODEV if the route has no device, or the result of the IPv4/IPv6
+ * match helper.
+ */
+int smc_clc_prfx_match(struct socket *clcsock,
+		       struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	int rc;
+
+	if (!dst)
+		return -ENOTCONN;
+
+	if (dst->dev) {
+		rcu_read_lock();
+		if (prop->ipv6_prefixes_cnt)
+			rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
+		else
+			rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+		rcu_read_unlock();
+	} else {
+		rc = -ENODEV;
+	}
+	/* drop the reference taken by sk_dst_get() */
+	dst_release(dst);
+	return rc;
+}
+
+/* Wait for data on the tcp-socket, analyze received data
+ * @smc: smc socket whose internal clcsock is read
+ * @buf: receive buffer; the caller must make sure that it holds at least
+ *	 sizeof(struct smc_clc_msg_hdr) bytes
+ * @buflen: size of @buf; of a longer message only the first @buflen bytes
+ *	    are stored, the rest is read from the socket and discarded
+ * @expected_type: CLC message type the caller is waiting for; a DECLINE
+ *		   is accepted in any case
+ * @timeout: receive timeout set on the clcsock for the duration of the
+ *	     call; the previous value is restored before returning
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_PEERDECL if a decline was received from the peer.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+		     u8 expected_type, unsigned long timeout)
+{
+	long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
+	struct sock *clc_sk = smc->clcsock->sk;
+	struct smc_clc_msg_hdr *clcm = buf;
+	struct msghdr msg = {NULL, 0};
+	int reason_code = 0;
+	struct kvec vec = {buf, buflen};
+	int len, datlen, recvlen;
+	bool check_trl = true;
+	int krflags;
+
+	/* peek the first few bytes to determine length of data to receive
+	 * so we don't consume any subsequent CLC message or payload data
+	 * in the TCP byte stream
+	 */
+	krflags = MSG_PEEK | MSG_WAITALL;
+	clc_sk->sk_rcvtimeo = timeout;
+	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1,
+		      sizeof(struct smc_clc_msg_hdr));
+	len = sock_recvmsg(smc->clcsock, &msg, krflags);
+	if (signal_pending(current)) {
+		reason_code = -EINTR;
+		clc_sk->sk_err = EINTR;
+		smc->sk.sk_err = EINTR;
+		goto out;
+	}
+	if (clc_sk->sk_err) {
+		reason_code = -clc_sk->sk_err;
+		if (clc_sk->sk_err == EAGAIN &&
+		    expected_type == SMC_CLC_DECLINE)
+			clc_sk->sk_err = 0; /* reset for fallback usage */
+		else
+			smc->sk.sk_err = clc_sk->sk_err;
+		goto out;
+	}
+	if (!len) { /* peer has performed orderly shutdown */
+		smc->sk.sk_err = ECONNRESET;
+		reason_code = -ECONNRESET;
+		goto out;
+	}
+	if (len < 0) {
+		/* a timeout while waiting for a DECLINE is not reported on
+		 * the smc socket
+		 */
+		if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE)
+			smc->sk.sk_err = -len;
+		reason_code = len;
+		goto out;
+	}
+	datlen = ntohs(clcm->length);
+	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+	    (clcm->version < SMC_V1) ||
+	    ((clcm->type != SMC_CLC_DECLINE) &&
+	     (clcm->type != expected_type))) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+
+	/* receive the complete CLC message */
+	memset(&msg, 0, sizeof(struct msghdr));
+	if (datlen > buflen) {
+		/* trailer will not be part of buf, so it cannot be checked */
+		check_trl = false;
+		recvlen = buflen;
+	} else {
+		recvlen = datlen;
+	}
+	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen);
+	krflags = MSG_WAITALL;
+	len = sock_recvmsg(smc->clcsock, &msg, krflags);
+	if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) {
+		smc->sk.sk_err = EPROTO;
+		reason_code = -EPROTO;
+		goto out;
+	}
+	datlen -= len;
+	while (datlen) {
+		u8 tmp[SMC_CLC_RECV_BUF_LEN];
+
+		vec.iov_base = &tmp;
+		vec.iov_len = SMC_CLC_RECV_BUF_LEN;
+		/* receive remaining proposal message */
+		recvlen = datlen > SMC_CLC_RECV_BUF_LEN ?
+						SMC_CLC_RECV_BUF_LEN : datlen;
+		iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen);
+		len = sock_recvmsg(smc->clcsock, &msg, krflags);
+		/* an error or EOF would never bring datlen down to zero,
+		 * so give up instead of looping forever
+		 */
+		if (len < recvlen) {
+			smc->sk.sk_err = EPROTO;
+			reason_code = -EPROTO;
+			goto out;
+		}
+		datlen -= len;
+	}
+	if (clcm->type == SMC_CLC_DECLINE) {
+		struct smc_clc_msg_decline *dclc;
+
+		dclc = (struct smc_clc_msg_decline *)clcm;
+		reason_code = SMC_CLC_DECL_PEERDECL;
+		smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
+		/* NOTE(review): smc->conn.lgr is dereferenced without a
+		 * NULL check - confirm that a link group always exists when
+		 * the peer sets the first contact bit in a decline
+		 */
+		if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
+						SMC_FIRST_CONTACT_MASK) {
+			smc->conn.lgr->sync_err = 1;
+			smc_lgr_terminate_sched(smc->conn.lgr);
+		}
+	}
+
+out:
+	clc_sk->sk_rcvtimeo = rcvtimeo;
+	return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket
+ * @smc: smc socket whose clcsock is used for sending
+ * @peer_diag_info: diagnosis code put into the message; for
+ * SMC_CLC_DECL_SYNCERR the first contact bit is set in the header as well
+ * @version: SMC version, selects the V1 or the V2 message layout
+ * Returns 0 if the complete message was sent, -EPROTO otherwise.
+ */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
+{
+ struct smc_clc_msg_decline *dclc_v1;
+ struct smc_clc_msg_decline_v2 dclc;
+ struct msghdr msg;
+ int len, send_len;
+ struct kvec vec;
+
+ /* the V1 message is built in the same buffer, viewed through dclc_v1 */
+ dclc_v1 = (struct smc_clc_msg_decline *)&dclc;
+ memset(&dclc, 0, sizeof(dclc));
+ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ dclc.hdr.type = SMC_CLC_DECLINE;
+ dclc.hdr.version = version;
+ dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
+ dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
+ SMC_FIRST_CONTACT_MASK : 0;
+ /* the local system id is added unless the link group is SMC-D */
+ if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) &&
+ smc_ib_is_valid_local_systemid())
+ memcpy(dclc.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ dclc.peer_diagnosis = htonl(peer_diag_info);
+ if (version == SMC_V1) {
+ memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ send_len = sizeof(*dclc_v1);
+ } else {
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ send_len = sizeof(dclc);
+ }
+ dclc.hdr.length = htons(send_len);
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &dclc;
+ vec.iov_len = send_len;
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len);
+ /* send errors and short sends are both reported as -EPROTO */
+ if (len < 0 || len < send_len)
+ len = -EPROTO;
+ return len > 0 ? 0 : len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket
+ * @smc: smc socket whose clcsock is used for sending
+ * @ini: initialization info holding the offered SMC types and devices
+ * The message is assembled from the separately filled parts of a
+ * struct smc_clc_msg_proposal_area and sent with one iovec entry per part.
+ * Returns 0 on success, -ENOMEM if the message area cannot be allocated,
+ * SMC_CLC_DECL_CNFERR if no IP prefix can be determined and no V2 type is
+ * offered, the negated socket error if sending fails, and -ENETUNREACH if
+ * the message was sent only partially.
+ */
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_proposal *pclc_base;
+ struct smc_clc_smcd_gid_chid *gidchids;
+ struct smc_clc_msg_proposal_area *pclc;
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct smc_clc_v2_extension *v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ struct smc_clc_msg_trail *trl;
+ struct smcd_dev *smcd;
+ int len, i, plen, rc;
+ int reason_code = 0;
+ struct kvec vec[8];
+ struct msghdr msg;
+
+ pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
+ if (!pclc)
+ return -ENOMEM;
+
+ /* shortcuts to the individual parts of the zeroed message area */
+ pclc_base = &pclc->pclc_base;
+ pclc_smcd = &pclc->pclc_smcd;
+ pclc_prfx = &pclc->pclc_prfx;
+ ipv6_prfx = pclc->pclc_prfx_ipv6;
+ v2_ext = &pclc->pclc_v2_ext;
+ smcd_v2_ext = &pclc->pclc_smcd_v2_ext;
+ gidchids = pclc->pclc_gidchids;
+ trl = &pclc->pclc_trl;
+
+ pclc_base->hdr.version = SMC_V2;
+ pclc_base->hdr.typev1 = ini->smc_type_v1;
+ pclc_base->hdr.typev2 = ini->smc_type_v2;
+ /* plen accumulates the total message length; these parts are always sent */
+ plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl);
+
+ /* retrieve ip prefixes for CLC proposal msg */
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
+ if (rc) {
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ kfree(pclc);
+ return SMC_CLC_DECL_CNFERR;
+ }
+ /* without prefix only the V2 type is offered */
+ pclc_base->hdr.typev1 = SMC_TYPE_N;
+ } else {
+ pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
+ plen += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
+
+ /* build SMC Proposal CLC message */
+ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ pclc_base->hdr.type = SMC_CLC_PROPOSAL;
+ if (smcr_indicated(ini->smc_type_v1)) {
+ /* add SMC-R specifics */
+ memcpy(pclc_base->lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ ETH_ALEN);
+ }
+ if (smcd_indicated(ini->smc_type_v1)) {
+ /* add SMC-D specifics */
+ if (ini->ism_dev[0]) {
+ smcd = ini->ism_dev[0];
+ pclc_smcd->ism.gid =
+ htonll(smcd->ops->get_local_gid(smcd));
+ pclc_smcd->ism.chid =
+ htons(smc_ism_get_chid(ini->ism_dev[0]));
+ }
+ }
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ pclc_smcd->v2_ext_offset = 0;
+ } else {
+ struct smc_clc_eid_entry *ueident;
+ u16 v2_ext_offset;
+
+ v2_ext->hdr.flag.release = SMC_RELEASE;
+ /* offset is counted from the end of the v2_ext_offset field */
+ v2_ext_offset = sizeof(*pclc_smcd) -
+ offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
+ if (ini->smc_type_v1 != SMC_TYPE_N)
+ v2_ext_offset += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
+ plen += sizeof(*v2_ext);
+
+ /* copy all user EIDs of the local table into the V2 extension */
+ read_lock(&smc_clc_eid_table.lock);
+ v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt;
+ plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN;
+ i = 0;
+ list_for_each_entry(ueident, &smc_clc_eid_table.list, list) {
+ memcpy(v2_ext->user_eids[i++], ueident->eid,
+ sizeof(ueident->eid));
+ }
+ read_unlock(&smc_clc_eid_table.lock);
+ }
+ if (smcd_indicated(ini->smc_type_v2)) {
+ u8 *eid = NULL;
+
+ /* NOTE(review): seid_enabled is read here without holding
+ * smc_clc_eid_table.lock - confirm this is intended
+ */
+ v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled;
+ v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
+ v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
+ offsetofend(struct smc_clnt_opts_area_hdr,
+ smcd_v2_ext_offset) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ smc_ism_get_system_eid(&eid);
+ if (eid && v2_ext->hdr.flag.seid)
+ memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
+ plen += sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ /* offered V2 ISM devices are stored in ism_dev[] from index 1 */
+ for (i = 1; i <= ini->ism_offered_cnt; i++) {
+ smcd = ini->ism_dev[i];
+ gidchids[i - 1].gid =
+ htonll(smcd->ops->get_local_gid(smcd));
+ gidchids[i - 1].chid =
+ htons(smc_ism_get_chid(ini->ism_dev[i]));
+ }
+ plen += ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ if (smcr_indicated(ini->smc_type_v2)) {
+ memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE);
+ v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER;
+ v2_ext->max_links = SMC_LINKS_PER_LGR_MAX_PREFER;
+ }
+
+ pclc_base->hdr.length = htons(plen);
+ memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ /* send SMC Proposal CLC message */
+ memset(&msg, 0, sizeof(msg));
+ i = 0;
+ vec[i].iov_base = pclc_base;
+ vec[i++].iov_len = sizeof(*pclc_base);
+ vec[i].iov_base = pclc_smcd;
+ vec[i++].iov_len = sizeof(*pclc_smcd);
+ /* NOTE(review): if smc_clc_prfx_set() failed above, hdr.typev1 was
+ * reset to SMC_TYPE_N and plen does not include the prefix area, but
+ * this test still looks at ini->smc_type_v1 - confirm that iovec and
+ * plen stay in sync in that case
+ */
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ vec[i].iov_base = pclc_prfx;
+ vec[i++].iov_len = sizeof(*pclc_prfx);
+ if (pclc_prfx->ipv6_prefixes_cnt > 0) {
+ vec[i].iov_base = ipv6_prfx;
+ vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
+ if (ini->smc_type_v2 != SMC_TYPE_N) {
+ vec[i].iov_base = v2_ext;
+ vec[i++].iov_len = sizeof(*v2_ext) +
+ (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ if (smcd_indicated(ini->smc_type_v2)) {
+ vec[i].iov_base = smcd_v2_ext;
+ vec[i++].iov_len = sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ vec[i].iov_base = gidchids;
+ vec[i++].iov_len = ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ }
+ vec[i].iov_base = trl;
+ vec[i++].iov_len = sizeof(*trl);
+ /* due to the few bytes needed for clc-handshake this cannot block */
+ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
+ if (len < 0) {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ } else if (len < ntohs(pclc_base->hdr.length)) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ }
+
+ kfree(pclc);
+ return reason_code;
+}
+
+/* build and send CLC CONFIRM / ACCEPT message
+ * @smc: smc socket; its connection needs a link group (and for SMC-R a
+ *	 link) since both are dereferenced here
+ * @clc_v2: zeroed message buffer with hdr.type already set by the caller
+ *	    to SMC_CLC_ACCEPT or SMC_CLC_CONFIRM; hdr.length is filled in
+ *	    here so that the caller can compare it with the return value
+ * @first_contact: non-zero for a first contact; sets the first contact bit
+ *		   and, for versions above V1, appends the first contact
+ *		   extension
+ * @version: SMC version of the message
+ * @eid: negotiated EID, copied into a V2 message if set and non-empty
+ * @ini: initialization info used for the first contact extension and the
+ *	 SMC-R V2 GID list
+ * Returns the result of kernel_sendmsg(), i.e. the number of bytes sent or
+ * a negative error code.
+ */
+static int smc_clc_send_confirm_accept(struct smc_sock *smc,
+				       struct smc_clc_msg_accept_confirm_v2 *clc_v2,
+				       int first_contact, u8 version,
+				       u8 *eid, struct smc_init_info *ini)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct smc_clc_first_contact_ext_v2x fce;
+	struct smcd_dev *smcd = conn->lgr->smcd;
+	struct smc_clc_msg_accept_confirm *clc;
+	struct smc_clc_fce_gid_ext gle;
+	struct smc_clc_msg_trail trl;
+	int i, len, fce_len;
+	struct kvec vec[5];
+	struct msghdr msg;
+
+	/* send SMC Confirm CLC msg */
+	clc = (struct smc_clc_msg_accept_confirm *)clc_v2;
+	clc->hdr.version = version;	/* SMC version */
+	if (first_contact)
+		clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK;
+	if (conn->lgr->is_smcd) {
+		/* SMC-D specific settings */
+		memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+		clc->hdr.typev1 = SMC_TYPE_D;
+		clc->d0.gid = htonll(smcd->ops->get_local_gid(smcd));
+		clc->d0.token = htonll(conn->rmb_desc->token);
+		clc->d0.dmbe_size = conn->rmbe_size_comp;
+		clc->d0.dmbe_idx = 0;
+		memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+		if (version == SMC_V1) {
+			clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+		} else {
+			clc_v2->d1.chid = htons(smc_ism_get_chid(smcd));
+			if (eid && eid[0])
+				memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN);
+			len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
+			if (first_contact) {
+				fce_len = smc_clc_fill_fce(&fce, ini);
+				len += fce_len;
+			}
+			clc_v2->hdr.length = htons(len);
+		}
+		memcpy(trl.eyecatcher, SMCD_EYECATCHER,
+		       sizeof(SMCD_EYECATCHER));
+	} else {
+		struct smc_link *link = conn->lnk;
+
+		/* SMC-R specific settings */
+		memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+		clc->hdr.typev1 = SMC_TYPE_R;
+		clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+		memcpy(clc->r0.lcl.id_for_peer, local_systemid,
+		       sizeof(local_systemid));
+		memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
+		memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+		       ETH_ALEN);
+		hton24(clc->r0.qpn, link->roce_qp->qp_num);
+		clc->r0.rmb_rkey =
+			htonl(conn->rmb_desc->mr[link->link_idx]->rkey);
+		clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+		clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
+		switch (clc->hdr.type) {
+		case SMC_CLC_ACCEPT:
+			clc->r0.qp_mtu = link->path_mtu;
+			break;
+		case SMC_CLC_CONFIRM:
+			clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
+			break;
+		}
+		clc->r0.rmbe_size = conn->rmbe_size_comp;
+		clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
+			cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
+			cpu_to_be64((u64)sg_dma_address
+				    (conn->rmb_desc->sgt[link->link_idx].sgl));
+		hton24(clc->r0.psn, link->psn_initial);
+		if (version == SMC_V1) {
+			clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+		} else {
+			if (eid && eid[0])
+				memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN);
+			len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2;
+			if (first_contact) {
+				fce_len = smc_clc_fill_fce(&fce, ini);
+				len += fce_len;
+				fce.fce_v2_base.v2_direct = !link->lgr->uses_gateway;
+				/* only a CONFIRM carries the GID list */
+				if (clc->hdr.type == SMC_CLC_CONFIRM) {
+					memset(&gle, 0, sizeof(gle));
+					gle.gid_cnt = ini->smcrv2.gidlist.len;
+					len += sizeof(gle);
+					len += gle.gid_cnt * sizeof(gle.gid[0]);
+				}
+			}
+			clc_v2->hdr.length = htons(len);
+		}
+		memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	}
+
+	/* assemble the iovec: base message without trailer, optional first
+	 * contact extension and GID list, trailer last
+	 */
+	memset(&msg, 0, sizeof(msg));
+	i = 0;
+	vec[i].iov_base = clc_v2;
+	if (version > SMC_V1)
+		vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+					SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 :
+					SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) -
+				   sizeof(trl);
+	else
+		vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+						SMCD_CLC_ACCEPT_CONFIRM_LEN :
+						SMCR_CLC_ACCEPT_CONFIRM_LEN) -
+				   sizeof(trl);
+	if (version > SMC_V1 && first_contact) {
+		vec[i].iov_base = &fce;
+		vec[i++].iov_len = fce_len;
+		if (!conn->lgr->is_smcd) {
+			if (clc->hdr.type == SMC_CLC_CONFIRM) {
+				vec[i].iov_base = &gle;
+				vec[i++].iov_len = sizeof(gle);
+				vec[i].iov_base = &ini->smcrv2.gidlist.list;
+				vec[i++].iov_len = gle.gid_cnt *
+						   sizeof(gle.gid[0]);
+			}
+		}
+	}
+	vec[i].iov_base = &trl;
+	vec[i++].iov_len = sizeof(trl);
+	/* pass the number of entries actually filled: the byte count spans
+	 * all of them, not only the first one
+	 */
+	return kernel_sendmsg(smc->clcsock, &msg, vec, i,
+			      ntohs(clc->hdr.length));
+}
+
+/* Send CLC CONFIRM message across internal TCP socket.
+ * Returns 0 if the complete message was sent, -ENETUNREACH after a partial
+ * send, or the negated clcsock error if sending failed; in both failure
+ * cases the error is also stored in smc->sk.sk_err.
+ */
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+			 u8 version, u8 *eid, struct smc_init_info *ini)
+{
+	struct smc_clc_msg_accept_confirm_v2 cclc_v2;
+	int sent;
+
+	memset(&cclc_v2, 0, sizeof(cclc_v2));
+	cclc_v2.hdr.type = SMC_CLC_CONFIRM;
+	sent = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
+					   version, eid, ini);
+	if (sent >= ntohs(cclc_v2.hdr.length))
+		return 0;
+
+	if (sent >= 0) {
+		/* message was sent only partially */
+		smc->sk.sk_err = ENETUNREACH;
+		return -ENETUNREACH;
+	}
+	smc->sk.sk_err = smc->clcsock->sk->sk_err;
+	return -smc->sk.sk_err;
+}
+
+/* Send a CLC ACCEPT message across the internal TCP socket.
+ * A short send is reported as -EPROTO, a failed send as the negated sk_err
+ * of the TCP socket; otherwise 0 is returned.
+ */
+int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
+			u8 version, u8 *negotiated_eid, struct smc_init_info *ini)
+{
+	struct smc_clc_msg_accept_confirm_v2 aclc_v2;
+	int rc;
+
+	memset(&aclc_v2, 0, sizeof(aclc_v2));
+	aclc_v2.hdr.type = SMC_CLC_ACCEPT;
+	rc = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
+					 version, negotiated_eid, ini);
+	if (rc < ntohs(aclc_v2.hdr.length)) {
+		if (rc < 0)
+			rc = -new_smc->clcsock->sk->sk_err;
+		else
+			rc = -EPROTO;
+	}
+
+	if (rc > 0)
+		return 0;
+	return rc;
+}
+
+/* Determine max_conns and max_links in ini from the v2 extension of the
+ * received proposal message.
+ * Returns 0 on success or an SMC_CLC_DECL_* reason code.
+ */
+int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc,
+				      struct smc_init_info *ini)
+{
+	struct smc_clc_v2_extension *v2_ext;
+	bool any_v2;
+
+	/* defaults, kept unless a value is negotiated below */
+	ini->max_conns = SMC_CONN_PER_LGR_MAX;
+	ini->max_links = SMC_LINKS_ADD_LNK_MAX;
+
+	any_v2 = (ini->smcd_version & SMC_V2) || (ini->smcr_version & SMC_V2);
+	if (!any_v2 || ini->release_nr < SMC_RELEASE_1)
+		return 0;
+
+	v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!v2_ext)
+		return SMC_CLC_DECL_NOV2EXT;
+
+	/* the values below are only evaluated for SMC-Rv2 */
+	if (!(ini->smcr_version & SMC_V2))
+		return 0;
+
+	ini->max_conns = min_t(u8, v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER);
+	if (ini->max_conns < SMC_CONN_PER_LGR_MIN)
+		return SMC_CLC_DECL_MAXCONNERR;
+
+	ini->max_links = min_t(u8, v2_ext->max_links,
+			       SMC_LINKS_PER_LGR_MAX_PREFER);
+	if (ini->max_links < SMC_LINKS_ADD_LNK_MIN)
+		return SMC_CLC_DECL_MAXLINKERR;
+
+	return 0;
+}
+
+/* Check max_conns and max_links found in the first contact extension and
+ * take them over into ini.
+ * Returns 0 on success or an SMC_CLC_DECL_* reason code.
+ */
+int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce,
+				       struct smc_init_info *ini)
+{
+	struct smc_clc_first_contact_ext_v2x *fce_v2x;
+
+	/* nothing to check below release 1 or for SMC-D */
+	if (ini->release_nr < SMC_RELEASE_1 || ini->is_smcd)
+		return 0;
+
+	fce_v2x = (struct smc_clc_first_contact_ext_v2x *)fce;
+
+	if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN)
+		return SMC_CLC_DECL_MAXCONNERR;
+	ini->max_conns = fce_v2x->max_conns;
+
+	if (fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN ||
+	    fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX)
+		return SMC_CLC_DECL_MAXLINKERR;
+	ini->max_links = fce_v2x->max_links;
+
+	return 0;
+}
+
+/* Compare release, max_conns and max_links carried in the first contact
+ * extension of a v2 confirm message with the values stored in ini.
+ * Returns 0 if they match (or there is nothing to compare), otherwise an
+ * SMC_CLC_DECL_* reason code.
+ */
+int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc,
+				       struct smc_init_info *ini)
+{
+	struct smc_clc_first_contact_ext_v2x *fce_v2x;
+	struct smc_clc_msg_accept_confirm_v2 *clc_v2;
+	struct smc_clc_first_contact_ext *fce;
+
+	/* without v2 first contact there is no extension to look at */
+	if (cclc->hdr.version == SMC_V1 ||
+	    !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
+		return 0;
+
+	clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)cclc;
+	fce = smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd);
+	if (fce->release != ini->release_nr)
+		return SMC_CLC_DECL_RELEASEERR;
+
+	if (fce->release < SMC_RELEASE_1 || ini->is_smcd)
+		return 0;
+
+	fce_v2x = (struct smc_clc_first_contact_ext_v2x *)fce;
+	if (fce_v2x->max_conns != ini->max_conns)
+		return SMC_CLC_DECL_MAXCONNERR;
+	if (fce_v2x->max_links != ini->max_links)
+		return SMC_CLC_DECL_MAXLINKERR;
+
+	return 0;
+}
+
+/* hand out a pointer to the start of the smc_hostname buffer */
+void smc_clc_get_hostname(u8 **host)
+{
+	*host = smc_hostname;
+}
+
+/* Set up CLC data: the blank padded hostname and the empty EID table */
+void __init smc_clc_init(void)
+{
+	struct new_utsname *uts = utsname();
+	size_t name_len;
+
+	/* fill with ASCII blanks, then copy in as much of the node name
+	 * as fits (no terminating NUL is stored)
+	 */
+	memset(smc_hostname, _S, sizeof(smc_hostname));
+	name_len = min_t(size_t, strlen(uts->nodename), sizeof(smc_hostname));
+	memcpy(smc_hostname, uts->nodename, name_len);
+
+	smc_clc_eid_table.ueid_cnt = 0;
+	smc_clc_eid_table.seid_enabled = 1;
+	INIT_LIST_HEAD(&smc_clc_eid_table.list);
+	rwlock_init(&smc_clc_eid_table.lock);
+}
+
+void smc_clc_exit(void)
+{
+ smc_clc_ueid_remove(NULL); /* NOTE(review): NULL presumably selects all UEIDs - confirm */
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000..08155a96a
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/smc.h>
+
+#include "smc.h"
+#include "smc_netlink.h"
+
+#define SMC_CLC_PROPOSAL 0x01
+#define SMC_CLC_ACCEPT 0x02
+#define SMC_CLC_CONFIRM 0x03
+#define SMC_CLC_DECLINE 0x04
+
+#define SMC_TYPE_R 0 /* SMC-R only */
+#define SMC_TYPE_D 1 /* SMC-D only */
+#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
+#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
+#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
+#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
+#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
+#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
+#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
+#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
+#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
+#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
+#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
+#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
+#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */
+#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */
+#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */
+#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */
+#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */
+#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */
+#define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */
+#define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */
+#define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */
+#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
+#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
+#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
+#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
+#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */
+#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */
+#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */
+#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */
+#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */
+#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. indirect mismatch*/
+#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
+#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
+#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */
+
+#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
+
+struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+ u8 type; /* proposal / accept / confirm / decline (SMC_CLC_*) */
+ __be16 length; /* length of the whole message, big endian */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 version : 4, /* compared against SMC_V1 by the accept/confirm code */
+ typev2 : 2, /* carries the SMC_FIRST_CONTACT_MASK bit */
+ typev1 : 2; /* one of SMC_TYPE_R / _D / _N / _B */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 typev1 : 2, /* same fields, declared in reverse order */
+ typev2 : 2,
+ version : 4;
+#endif
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_trail { /* trailer of clc messages */
+ u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local { /* header2 of clc messages */
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+ u8 gid[16]; /* gid of ib_device port */
+ u8 mac[6]; /* mac of ib_device port */
+};
+
+/* Struct would be 4 byte aligned, but it is used in an array that is sent
+ * to peers and must conform to RFC7609, hence we need to use packed here.
+ */
+struct smc_clc_ipv6_prefix {
+ struct in6_addr prefix;
+ u8 prefix_len;
+} __packed; /* format defined in RFC7609 */
+
+#if defined(__BIG_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 release : 4,
+ rsvd : 3,
+ seid : 1;
+};
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 seid : 1,
+ rsvd : 3,
+ release : 4;
+};
+#endif
+
+struct smc_clnt_opts_area_hdr {
+ u8 eid_cnt; /* number of user defined EIDs */
+ u8 ism_gid_cnt; /* number of ISMv2 GIDs */
+ u8 reserved1;
+ struct smc_clc_v2_flag flag;
+ u8 reserved2[2];
+ __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
+};
+
+struct smc_clc_smcd_gid_chid {
+ __be64 gid; /* ISM GID */
+ __be16 chid; /* ISMv2 CHID */
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+struct smc_clc_v2_extension { /* v2 extension of the proposal, see smc_get_clc_v2_ext() */
+ struct smc_clnt_opts_area_hdr hdr;
+ u8 roce[16]; /* RoCEv2 GID */
+ u8 max_conns; /* read by smc_clc_srv_v2x_features_validate() */
+ u8 max_links; /* read by smc_clc_srv_v2x_features_validate() */
+ u8 reserved[14];
+ u8 user_eids[][SMC_MAX_EID_LEN]; /* presumably hdr.eid_cnt entries - confirm */
+};
+
+struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
+ __be32 outgoing_subnet; /* subnet mask */
+ u8 prefix_len; /* number of significant bits in mask */
+ u8 reserved[2];
+ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
+} __aligned(4);
+
+struct smc_clc_msg_smcd { /* SMC-D GID information */
+ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
+ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
+ u8 vendor_oui[3]; /* vendor organizationally unique identifier */
+ u8 vendor_exp_options[5];
+ u8 reserved[20];
+};
+
+struct smc_clc_smcd_v2_extension {
+ u8 system_eid[SMC_MAX_EID_LEN];
+ u8 reserved[16];
+ struct smc_clc_smcd_gid_chid gidchid[];
+};
+
+struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+} __aligned(4);
+
+#define SMC_CLC_MAX_V6_PREFIX 8
+#define SMC_CLC_MAX_UEID 8
+
+struct smc_clc_msg_proposal_area { /* proposal with every area at its maximum size */
+ struct smc_clc_msg_proposal pclc_base;
+ struct smc_clc_msg_smcd pclc_smcd;
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
+ struct smc_clc_v2_extension pclc_v2_ext;
+ u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; /* room for pclc_v2_ext.user_eids[] */
+ struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
+ struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; /* room for pclc_smcd_v2_ext.gidchid[] */
+ struct smc_clc_msg_trail pclc_trl;
+};
+
+struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token; /* unique connection id */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+} __packed;
+
+struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
+ __be64 gid; /* Sender GID */
+ __be64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ dmbe_size : 4;
+#endif
+ u16 reserved4;
+ __be32 linkid; /* Link identifier */
+} __packed;
+
+#define SMC_CLC_OS_ZOS 1
+#define SMC_CLC_OS_LINUX 2
+#define SMC_CLC_OS_AIX 3
+
+struct smc_clc_first_contact_ext { /* follows a v2 accept/confirm on first contact */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 v2_direct : 1, /* set to !lgr->uses_gateway by the SMC-R sender */
+ reserved : 7;
+ u8 os_type : 4, /* presumably one of SMC_CLC_OS_* - confirm */
+ release : 4; /* compared with ini->release_nr and SMC_RELEASE_1 */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 7, /* same fields, declared in reverse order */
+ v2_direct : 1;
+ u8 release : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[2];
+ u8 hostname[SMC_MAX_HOSTNAME_LEN];
+};
+
+struct smc_clc_first_contact_ext_v2x {
+ struct smc_clc_first_contact_ext fce_v2_base;
+ u8 max_conns; /* for SMC-R only */
+ u8 max_links; /* for SMC-R only */
+ u8 reserved3[2];
+ __be32 vendor_exp_options;
+ u8 reserved4[8];
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2 (Third Edition)
+ * (https://www.ibm.com/support/pages/node/7009315)
+ */
+
+struct smc_clc_fce_gid_ext { /* GID list header, sent with SMC-R v2 first contact CONFIRM */
+ u8 gid_cnt; /* number of entries in gid[] */
+ u8 reserved2[3];
+ u8 gid[][SMC_GID_SIZE]; /* the sender transmits these from a separate buffer */
+};
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ u32 reserved5[3];
+ };
+ };
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message, v2 layout */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct { /* SMC-R */
+ struct smcr_clc_msg_accept_confirm r0;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved6[8];
+ } r1; /* a first contact extension, if any, starts right behind r1 */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ __be16 chid;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved5[8];
+ } d1; /* a first contact extension, if any, starts right behind d1 */
+ };
+};
+
+struct smc_clc_msg_decline { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
+} __aligned(4);
+
+#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */
+
+struct smc_clc_msg_decline_v2 { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
+} __aligned(4);
+
+/* Locate the prefix area of a proposal message: it starts iparea_offset
+ * bytes behind the fixed part of the message.
+ * NOTE(review): iparea_offset is not range-checked here.
+ */
+static inline struct smc_clc_msg_proposal_prefix *
+smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
+{
+	u8 *iparea = (u8 *)(pclc + 1) + ntohs(pclc->iparea_offset);
+
+	return (struct smc_clc_msg_proposal_prefix *)iparea;
+}
+
+/* true if smc_type includes SMC-R (SMC_TYPE_R or SMC_TYPE_B) */
+static inline bool smcr_indicated(int smc_type)
+{
+	switch (smc_type) {
+	case SMC_TYPE_R:
+	case SMC_TYPE_B:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* true if smc_type includes SMC-D (SMC_TYPE_D or SMC_TYPE_B) */
+static inline bool smcd_indicated(int smc_type)
+{
+	switch (smc_type) {
+	case SMC_TYPE_D:
+	case SMC_TYPE_B:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* map the two "is indicated" flags to the matching SMC_TYPE_* value */
+static inline u8 smc_indicated_type(int is_smcd, int is_smcr)
+{
+	if (is_smcd)
+		return is_smcr ? SMC_TYPE_B : SMC_TYPE_D;
+
+	return is_smcr ? SMC_TYPE_R : SMC_TYPE_N;
+}
+
+/* Get the SMC-D area of a proposal message; it directly follows the fixed
+ * part. Returns NULL if SMC-D is indicated in typev1 but iparea_offset does
+ * not equal the size of the SMC-D area.
+ */
+static inline struct smc_clc_msg_smcd *
+smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
+{
+	bool smcd = smcd_indicated(prop->hdr.typev1);
+
+	if (smcd &&
+	    ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+		return NULL;
+
+	return (struct smc_clc_msg_smcd *)(prop + 1);
+}
+
+/* Get the SMC v2 extension of a proposal message.
+ *
+ * The extension is located through v2_ext_offset, a value supplied by the
+ * peer and counted from the end of the v2_ext_offset field. The offset is
+ * range-checked against the layout of struct smc_clc_msg_proposal_area, so
+ * a bogus value cannot move the returned pointer behind the position the
+ * extension has in a maximum-size proposal.
+ * NOTE(review): assumes the proposal was received into a buffer of at least
+ * sizeof(struct smc_clc_msg_proposal_area) - confirm against the callers.
+ *
+ * Returns NULL if there is no usable SMC-D area, if no v2 extension is
+ * indicated (offset 0) or if the offset is out of range.
+ */
+static inline struct smc_clc_v2_extension *
+smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
+{
+	struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);
+	u16 max_offset, offset;
+
+	if (!prop_smcd)
+		return NULL;
+
+	/* largest valid distance from the offset field to the extension */
+	max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) -
+		     offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) -
+		     offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
+	offset = ntohs(prop_smcd->v2_ext_offset);
+	if (!offset || offset > max_offset)
+		return NULL;
+
+	return (struct smc_clc_v2_extension *)
+		((u8 *)prop_smcd +
+		 offsetofend(struct smc_clc_msg_smcd, v2_ext_offset) +
+		 offset);
+}
+
+/* Get the SMC-D v2 extension that follows the SMC v2 extension.
+ *
+ * The extension is located through hdr.smcd_v2_ext_offset, a value supplied
+ * by the peer and counted from the end of that field. The offset is
+ * range-checked against the layout of struct smc_clc_msg_proposal_area, so
+ * a bogus value cannot move the returned pointer behind the position the
+ * extension has in a maximum-size proposal.
+ * NOTE(review): assumes the proposal was received into a buffer of at least
+ * sizeof(struct smc_clc_msg_proposal_area) - confirm against the callers.
+ *
+ * Returns NULL if prop_v2ext is NULL, if no SMC-D v2 extension is indicated
+ * (offset 0) or if the offset is out of range.
+ */
+static inline struct smc_clc_smcd_v2_extension *
+smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
+{
+	u16 max_offset, offset;
+
+	if (!prop_v2ext)
+		return NULL;
+
+	/* largest valid distance from the offset field to the extension */
+	max_offset = offsetof(struct smc_clc_msg_proposal_area,
+			      pclc_smcd_v2_ext) -
+		     offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) -
+		     offsetof(struct smc_clc_v2_extension, hdr) -
+		     offsetofend(struct smc_clnt_opts_area_hdr,
+				 smcd_v2_ext_offset);
+	offset = ntohs(prop_v2ext->hdr.smcd_v2_ext_offset);
+	if (!offset || offset > max_offset)
+		return NULL;
+
+	return (struct smc_clc_smcd_v2_extension *)
+		((u8 *)prop_v2ext +
+		 offsetof(struct smc_clc_v2_extension, hdr) +
+		 offsetofend(struct smc_clnt_opts_area_hdr,
+			     smcd_v2_ext_offset) +
+		 offset);
+}
+
+/* Get the first contact extension of a v2 accept/confirm message. It starts
+ * right behind the d1 (SMC-D) resp. r1 (SMC-R) part of the message.
+ * Returns NULL for v1 messages and if the first contact bit is not set.
+ */
+static inline struct smc_clc_first_contact_ext *
+smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm_v2 *clc_v2,
+			      bool is_smcd)
+{
+	int fixed_len;
+
+	if (clc_v2->hdr.version == SMC_V1)
+		return NULL;
+	if (!(clc_v2->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
+		return NULL;
+
+	fixed_len = is_smcd ?
+		offsetofend(struct smc_clc_msg_accept_confirm_v2, d1) :
+		offsetofend(struct smc_clc_msg_accept_confirm_v2, r1);
+
+	return (struct smc_clc_first_contact_ext *)((u8 *)clc_v2 + fixed_len);
+}
+
+struct smcd_dev;
+struct smc_init_info;
+
+int smc_clc_prfx_match(struct socket *clcsock,
+ struct smc_clc_msg_proposal_prefix *prop);
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type, unsigned long timeout);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version, u8 *eid, struct smc_init_info *ini);
+int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
+ u8 version, u8 *negotiated_eid, struct smc_init_info *ini);
+int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini);
+int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce,
+ struct smc_init_info *ini);
+int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc,
+ struct smc_init_info *ini);
+void smc_clc_init(void) __init;
+void smc_clc_exit(void);
+void smc_clc_get_hostname(u8 **host);
+bool smc_clc_match_eid(u8 *negotiated_eid,
+ struct smc_clc_v2_extension *smc_v2_ext,
+ u8 *peer_eid, u8 *local_eid);
+int smc_clc_ueid_count(void);
+int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info);
+
+#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000..10219f55a
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing - normal and abnormal
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+/* Release the clcsock assigned to the smc_sock, if there is one.
+ * For a listen socket the listen worker is cancelled first, unless this
+ * function is executed by that worker itself.
+ */
+void smc_clcsock_release(struct smc_sock *smc)
+{
+	struct socket *clcsock;
+
+	if (smc->listen_smc && current_work() != &smc->smc_listen_work)
+		cancel_work_sync(&smc->smc_listen_work);
+
+	mutex_lock(&smc->clcsock_release_lock);
+	clcsock = smc->clcsock;
+	if (clcsock) {
+		/* clear the pointer before the socket goes away */
+		smc->clcsock = NULL;
+		sock_release(clcsock);
+	}
+	mutex_unlock(&smc->clcsock_release_lock);
+}
+
+/* close all connections of a listen socket that were never accepted */
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+	struct sock *child;
+
+	for (child = smc_accept_dequeue(parent, NULL); child;
+	     child = smc_accept_dequeue(parent, NULL))
+		smc_close_non_accepted(child);
+}
+
+/* wait for sndbuf data being transmitted
+ *
+ * Returns at once if timeout is 0 or no sends are prepared. Otherwise
+ * pending data is pushed out and the caller sleeps on the socket wait queue
+ * until the prepared sends are gone, sk_err is ECONNABORTED or ECONNRESET,
+ * the connection is killed, a signal is pending or the timeout has expired.
+ */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+
+ if (!timeout)
+ return;
+
+ if (!smc_tx_prepared_sends(&smc->conn))
+ return;
+
+ /* Send out corked data remaining in sndbuf */
+ smc_tx_pending(&smc->conn);
+
+ smc->wait_close_tx_prepared = 1; /* checked by smc_close_wake_tx_prepared() */
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn) ||
+ READ_ONCE(sk->sk_err) == ECONNABORTED ||
+ READ_ONCE(sk->sk_err) == ECONNRESET ||
+ smc->conn.killed,
+ &wait);
+ if (rc) /* one of the wake-up conditions is met */
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ smc->wait_close_tx_prepared = 0;
+}
+
+/* wake up a socket close that is flagged as waiting in
+ * smc_close_stream_wait()
+ */
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+	struct sock *sk = &smc->sk;
+
+	if (!smc->wait_close_tx_prepared)
+		return;
+
+	sk->sk_state_change(sk);
+}
+
+/* flag "done writing" in the local tx control data and send it to the peer */
+static int smc_close_wr(struct smc_connection *conn)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&conn->local_tx_ctrl.conn_state_flags;
+
+	txflags->peer_done_writing = 1;
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* Flag the final close state in the local tx control data - abort if there
+ * are still bytes to receive, closed otherwise - and send it to the peer.
+ * Returns -EPIPE without sending if the connection is killed.
+ */
+static int smc_close_final(struct smc_connection *conn)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&conn->local_tx_ctrl.conn_state_flags;
+
+	if (atomic_read(&conn->bytes_to_rcv))
+		txflags->peer_conn_abort = 1;
+	else
+		txflags->peer_conn_closed = 1;
+
+	return conn->killed ? -EPIPE : smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* flag "connection abort" in the local tx control data and send it to the
+ * peer
+ */
+int smc_close_abort(struct smc_connection *conn)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&conn->local_tx_ctrl.conn_state_flags;
+
+	txflags->peer_conn_abort = 1;
+	return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* Cancel the close worker and the tx worker of a connection.
+ * The sock lock is released while waiting for the workers and taken again
+ * before returning.
+ */
+static void smc_close_cancel_work(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool close_was_pending;
+
+	release_sock(sk);
+	close_was_pending = cancel_work_sync(&conn->close_work);
+	if (close_was_pending)
+		sock_put(sk);	/* the cancelled close_work held a reference */
+	cancel_delayed_work_sync(&conn->tx_work);
+	lock_sock(sk);
+}
+
+/* terminate smc socket abnormally - active abort
+ * link group is terminated, i.e. RDMA communication no longer possible
+ *
+ * Unless the socket is still in SMC_INIT, sk_err is set to ECONNABORTED and
+ * the TCP connection of the clcsock is aborted. The close and tx workers
+ * are cancelled via smc_close_cancel_work(), which drops and re-takes the
+ * sock lock; the state is therefore checked again afterwards and only moved
+ * on to SMC_CLOSED if it is still SMC_PEERABORTWAIT. Finally the socket is
+ * flagged SOCK_DEAD and, where required, the clcsock is released with the
+ * sock lock dropped.
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+ bool release_clcsock = false;
+
+ if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) {
+ sk->sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk) /* repeats the checks of the enclosing if */
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+ }
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT) /* changed while the lock was dropped */
+ break;
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* (postponed) passive closing */
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT) /* changed while the lock was dropped */
+ break;
+ sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PROCESSABORT:
+ case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT) /* changed while the lock was dropped */
+ break;
+ sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
+ break;
+ case SMC_INIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ break;
+ }
+
+ smc_sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_state_change(sk);
+
+ if (release_clcsock) {
+ release_sock(sk); /* clcsock is released without the sock lock */
+ smc_clcsock_release(smc);
+ lock_sock(sk);
+ }
+}
+
+/* true if an abort or a close indication is flagged in the local tx
+ * control data
+ */
+static inline bool smc_close_sent_any_close(struct smc_connection *conn)
+{
+	struct smc_cdc_conn_state_flags *txflags =
+		&conn->local_tx_ctrl.conn_state_flags;
+
+	if (txflags->peer_conn_abort)
+		return true;
+	return txflags->peer_conn_closed;
+}
+/* Close an smc socket on local request.
+ *
+ * Depending on sk_state this waits for prepared sends to be transmitted,
+ * sends a close indication to the peer, shuts down the clcsock and moves
+ * the socket to its next state. The sock lock is dropped and re-taken in
+ * several branches, so the state is re-evaluated afterwards ("again").
+ * NOTE(review): the caller presumably holds the sock lock - confirm.
+ *
+ * Returns 0 or the error of sending the close indication resp. shutting
+ * down the clcsock.
+ */
+int smc_close_active(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ long timeout;
+ int rc = 0;
+ int rc1 = 0;
+
+ timeout = current->flags & PF_EXITING ? /* no waiting in an exiting task */
+ 0 : sock_flag(sk, SOCK_LINGER) ?
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
+
+ old_state = sk->sk_state;
+again:
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk); /* wake up accept */
+ if (smc->clcsock && smc->clcsock->sk) {
+ write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+ smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
+ &smc->clcsk_data_ready);
+ smc->clcsock->sk->sk_user_data = NULL;
+ write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ }
+ smc_close_cleanup_listen(sk);
+ release_sock(sk); /* tcp_listen_work is flushed without the sock lock */
+ flush_work(&smc->tcp_listen_work);
+ lock_sock(sk);
+ break;
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk); /* tx_work is cancelled without the sock lock */
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_ACTIVE) {
+ /* send close request */
+ rc = smc_close_final(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+
+ /* actively shutdown clcsock before peer close it,
+ * prevent peer from entering TIME_WAIT state.
+ */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc1 = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ rc = rc ? rc : rc1; /* the first error wins */
+ }
+ } else {
+ /* peer event has changed the state */
+ goto again;
+ }
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ /* socket already shutdown wr or both (active close) */
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(conn)) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk); /* tx_work is cancelled without the sock lock */
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_APPCLOSEWAIT1 &&
+ sk->sk_state != SMC_APPCLOSEWAIT2)
+ goto again;
+ /* confirm close from peer */
+ rc = smc_close_final(conn);
+ if (smc_cdc_rxed_any_close(conn)) {
+ /* peer has closed the socket already */
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* postponed passive closing */
+ } else {
+ /* peer has just issued a shutdown write */
+ sk->sk_state = SMC_PEERFINCLOSEWAIT;
+ }
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(conn)) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PROCESSABORT:
+ rc = smc_close_abort(conn);
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state) /* notify waiters only on a real transition */
+ sk->sk_state_change(sk);
+ return rc;
+}
+/* State handling for a connection abort indicated by the peer: move sk_state
+ * to SMC_PROCESSABORT or SMC_CLOSED depending on the current state and drop
+ * the socket reference for passive closing in the states marked below.
+ */
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ case SMC_ACTIVE:
+ case SMC_APPCLOSEWAIT1:
+ sk->sk_state = SMC_PROCESSABORT;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PROCESSABORT; /* no sock_put() in this state */
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(&smc->conn))
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_PROCESSABORT;
+ else
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED; /* no sock_put() in this state */
+ break;
+ case SMC_PROCESSABORT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+}
+
+/* Either some kind of closing has been received: peer_conn_closed,
+ * peer_conn_abort, or peer_done_writing
+ * or the link group of the connection terminates abnormally.
+ *
+ * Worker function of conn->close_work (see smc_close_init()). It runs under
+ * the sock lock, evaluates the received connection state flags, moves
+ * sk_state accordingly and wakes up readers and writers. Once a socket that
+ * is dead or has no struct socket reaches SMC_CLOSED, the connection is
+ * freed and the clcsock is released after the sock lock has been dropped.
+ */
+static void smc_close_passive_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ close_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_cdc_conn_state_flags *rxflags;
+ bool release_clcsock = false;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ lock_sock(sk);
+ old_state = sk->sk_state;
+
+ rxflags = &conn->local_rx_ctrl.conn_state_flags;
+ if (rxflags->peer_conn_abort) {
+ /* peer has not received all data */
+ smc_close_passive_abort_received(smc);
+ release_sock(sk); /* tx_work is cancelled without the sock lock */
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ goto wakeup;
+ }
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ break;
+ case SMC_ACTIVE:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ /* postpone sock_put() for passive closing to cover
+ * received SEND_SHUTDOWN as well
+ */
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ if (rxflags->peer_done_writing)
+ sk->sk_state = SMC_PEERCLOSEWAIT2;
+ fallthrough;
+ /* to check for closing */
+ case SMC_PEERCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ break;
+ if (sock_flag(sk, SOCK_DEAD) &&
+ smc_close_sent_any_close(conn)) {
+ /* smc_release has already been called locally */
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_APPFINCLOSEWAIT;
+ }
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ if (smc_cdc_rxed_any_close(conn)) {
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ }
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ /* postpone sock_put() for passive closing to cover
+ * received SEND_SHUTDOWN as well
+ */
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+wakeup:
+ sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+ if (old_state != sk->sk_state) {
+ sk->sk_state_change(sk);
+ if ((sk->sk_state == SMC_CLOSED) &&
+ (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
+ smc_conn_free(conn);
+ if (smc->clcsock)
+ release_clcsock = true;
+ }
+ }
+ release_sock(sk);
+ if (release_clcsock) /* done after release_sock(), i.e. without the sock lock */
+ smc_clcsock_release(smc);
+ sock_put(sk); /* sock_hold done by schedulers of close_work */
+}
+/* Shut down the write direction of an smc socket.
+ *
+ * In SMC_ACTIVE and SMC_APPCLOSEWAIT1 this waits for prepared sends to be
+ * transmitted, cancels the tx worker with the sock lock dropped, sends the
+ * "done writing" indication via smc_close_wr() and moves the socket to its
+ * next state; all other handled states are left untouched.
+ * NOTE(review): the caller presumably holds the sock lock - confirm.
+ *
+ * Returns 0 or the error of smc_close_wr().
+ */
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ long timeout;
+ int rc = 0;
+
+ timeout = current->flags & PF_EXITING ? /* no waiting in an exiting task */
+ 0 : sock_flag(sk, SOCK_LINGER) ?
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
+
+ old_state = sk->sk_state;
+again:
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk); /* tx_work is cancelled without the sock lock */
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE) /* changed while the lock was dropped */
+ goto again;
+ /* send close wr request */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ /* passive close */
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk); /* tx_work is cancelled without the sock lock */
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_APPCLOSEWAIT1) /* changed while the lock was dropped */
+ goto again;
+ /* confirm close from peer */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_APPCLOSEWAIT2;
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_PEERABORTWAIT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state) /* notify waiters only on a real transition */
+ sk->sk_state_change(sk);
+ return rc;
+}
+
+/* Initialize close properties on connection establishment:
+ * set up conn.close_work with smc_close_passive_work() as its handler.
+ */
+void smc_close_init(struct smc_sock *smc)
+{
+ INIT_WORK(&smc->conn.close_work, smc_close_passive_work);
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000..634fea2b7
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+int smc_close_shutdown_write(struct smc_sock *smc);
+void smc_close_init(struct smc_sock *smc);
+void smc_clcsock_release(struct smc_sock *smc);
+int smc_close_abort(struct smc_connection *conn);
+void smc_close_active_abort(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000..d520ee62c
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,2633 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Basic Transport Functions exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/reboot.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/smc.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+#include "smc_ism.h"
+#include "smc_netlink.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
+
+/* step by which smc_lgr_list.num is advanced for every new link group */
+#define SMC_LGR_NUM_INCR 256
+/* delays used when scheduling the free work of a link group; the client
+ * delay is 10 * HZ longer than the server delay
+ */
+#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
+#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
+
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+ .num = 0,
+};
+
+static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
+static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
+
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc);
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
+
+static void smc_link_down_work(struct work_struct *work);
+
+/* Return the list head a link group is kept in and hand back the lock that
+ * protects this list via @lgr_lock: the global smc_lgr_list for SMC-R link
+ * groups, the list of the owning smcd device for SMC-D link groups.
+ */
+static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
+ spinlock_t **lgr_lock)
+{
+ if (!lgr->is_smcd) {
+ *lgr_lock = &smc_lgr_list.lock;
+ return &smc_lgr_list.list;
+ }
+
+ *lgr_lock = &lgr->smcd->lgr_lock;
+ return &lgr->smcd->lgr_list;
+}
+
+/* count one more link on the IB device port used by @lnk */
+static void smc_ibdev_cnt_inc(struct smc_link *lnk)
+{
+ atomic_t *port_cnt = &lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1];
+
+ atomic_inc(port_cnt);
+}
+
+/* count one link less on the IB device port used by @lnk */
+static void smc_ibdev_cnt_dec(struct smc_link *lnk)
+{
+ atomic_t *port_cnt = &lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1];
+
+ atomic_dec(port_cnt);
+}
+
+/* (Re)arm the delayed free work of a link group, unless the link group is
+ * already being freed.
+ */
+static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
+{
+ unsigned long delay = SMC_LGR_FREE_DELAY_SERV;
+
+ if (lgr->freeing)
+ return;
+
+ /* client link group creation always follows the server link group
+ * creation. For client use a somewhat higher removal delay time,
+ * otherwise there is a risk of out-of-sync link groups.
+ */
+ if (!lgr->is_smcd && lgr->role == SMC_CLNT)
+ delay = SMC_LGR_FREE_DELAY_CLNT;
+
+ mod_delayed_work(system_wq, &lgr->free_work, delay);
+}
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @conn connection to register; conn->alert_token_local is the sort key
+ * Does not return a value.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+ struct rb_node **link, *parent = NULL;
+ u32 token = conn->alert_token_local;
+
+ link = &conn->lgr->conns_all.rb_node;
+ while (*link) {
+ struct smc_connection *cur = rb_entry(*link,
+ struct smc_connection, alert_node);
+
+ parent = *link;
+ /* descend left if the current token is larger, right otherwise */
+ if (cur->alert_token_local > token)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ /* Put the new node there */
+ rb_link_node(&conn->alert_node, parent, link);
+ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* assign an SMC-R link to the connection
+ * For the first connection of a link group (@first) a link in state
+ * SMC_LNK_ACTIVATING is expected, otherwise one in state SMC_LNK_ACTIVE.
+ * Links flagged link_is_asym are never chosen.
+ * Returns 0 and increments the chosen link's conn_cnt on success, or
+ * SMC_CLC_DECL_NOACTLINK if no suitable link exists.
+ */
+static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first)
+{
+ enum smc_link_state expected = first ? SMC_LNK_ACTIVATING :
+ SMC_LNK_ACTIVE;
+ int i, j;
+
+ /* do link balancing */
+ conn->lnk = NULL; /* reset conn->lnk first */
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &conn->lgr->lnk[i];
+
+ if (lnk->state != expected || lnk->link_is_asym)
+ continue;
+ if (conn->lgr->role == SMC_CLNT) {
+ conn->lnk = lnk; /* temporary, SMC server assigns link*/
+ break;
+ }
+ /* odd number of connections: prefer a further suitable link
+ * behind the first one, if there is any
+ */
+ if (conn->lgr->conns_num % 2) {
+ for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ struct smc_link *lnk2;
+
+ lnk2 = &conn->lgr->lnk[j];
+ if (lnk2->state == expected &&
+ !lnk2->link_is_asym) {
+ conn->lnk = lnk2;
+ break;
+ }
+ }
+ }
+ /* otherwise stay with the first suitable link */
+ if (!conn->lnk)
+ conn->lnk = lnk;
+ break;
+ }
+ if (!conn->lnk)
+ return SMC_CLC_DECL_NOACTLINK;
+ atomic_inc(&conn->lnk->conn_cnt);
+ return 0;
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ * Returns 0 on success. For SMC-R, a failing link assignment is returned as
+ * is and conn->lgr is reset to NULL.
+ */
+static int smc_lgr_register_conn(struct smc_connection *conn, bool first)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ static atomic_t nexttoken = ATOMIC_INIT(0);
+ int rc;
+
+ if (!conn->lgr->is_smcd) {
+ rc = smcr_lgr_conn_assign_link(conn, first);
+ if (rc) {
+ conn->lgr = NULL;
+ return rc;
+ }
+ }
+ /* find a new alert_token_local value not yet used by some connection
+ * in this link group
+ */
+ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+ while (!conn->alert_token_local) {
+ conn->alert_token_local = atomic_inc_return(&nexttoken);
+ /* token already in use in this link group: try the next one */
+ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+ conn->alert_token_local = 0;
+ }
+ smc_lgr_add_alert_token(conn);
+ conn->lgr->conns_num++;
+ return 0;
+}
+
+/* Unregister connection and reset the alert token of the given connection.
+ * Removes the connection from the link group's token tree, drops its count
+ * on the assigned link (if any) and in the link group, and releases the
+ * socket reference taken in smc_lgr_register_conn().
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_link_group *lgr = conn->lgr;
+
+ rb_erase(&conn->alert_node, &lgr->conns_all);
+ if (conn->lnk)
+ atomic_dec(&conn->lnk->conn_cnt);
+ lgr->conns_num--;
+ conn->alert_token_local = 0;
+ sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection from lgr.
+ * Nothing is done if the connection has no valid link group; the actual
+ * unregistration runs under the link group's conns_lock.
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_link_group *link_group = conn->lgr;
+
+ if (!smc_conn_lgr_valid(conn))
+ return;
+
+ write_lock_bh(&link_group->conns_lock);
+ /* a zero token means the connection is not registered (any more) */
+ if (conn->alert_token_local)
+ __smc_lgr_unregister_conn(conn);
+ write_unlock_bh(&link_group->conns_lock);
+}
+
+/* Netlink dump callback for SMC_NETLINK_GET_SYS_INFO.
+ * Adds one message with SMC version and release, ISM v2 and SMC-Rv2
+ * capability, the local hostname (if one is set) and, if ISM v2 capable,
+ * the system EID. cb_ctx->pos[0] is set once the message has been added, so
+ * that a further call adds nothing. Returns skb->len in all cases.
+ */
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ char hostname[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_seid[SMC_MAX_EID_LEN + 1];
+ struct nlattr *attrs;
+ u8 *seid = NULL;
+ u8 *host = NULL;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_SYS_INFO);
+ if (!nlh)
+ goto errmsg;
+ /* message was already added by an earlier call of this dump */
+ if (cb_ctx->pos[0])
+ goto errout;
+ attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable()))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true))
+ goto errattr;
+ smc_clc_get_hostname(&host);
+ if (host) {
+ /* copy into a local buffer to guarantee NUL termination */
+ memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN);
+ hostname[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname))
+ goto errattr;
+ }
+ if (smc_ism_is_v2_capable()) {
+ smc_ism_get_system_eid(&seid);
+ /* copy into a local buffer to guarantee NUL termination */
+ memcpy(smc_seid, seid, SMC_MAX_EID_LEN);
+ smc_seid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid))
+ goto errattr;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ cb_ctx->pos[0] = 1;
+ return skb->len;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return skb->len;
+}
+
+/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes
+ * The nest @v2_attrs has been started by the caller; it is ended here on
+ * success and cancelled here on failure. @cb is not used.
+ * Returns 0 on success, -EMSGSIZE if an attribute could not be added.
+ */
+static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nlattr *v2_attrs)
+{
+ char smc_host[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_eid[SMC_MAX_EID_LEN + 1];
+
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os))
+ goto errv2attr;
+ /* copy into local buffers to guarantee NUL terminated strings */
+ memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN);
+ smc_host[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host))
+ goto errv2attr;
+ memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN);
+ smc_eid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid))
+ goto errv2attr;
+
+ nla_nest_end(skb, v2_attrs);
+ return 0;
+
+errv2attr:
+ nla_nest_cancel(skb, v2_attrs);
+ return -EMSGSIZE;
+}
+
+/* Add the nested SMC_NLA_LGR_R_V2 attribute of an SMC-R link group:
+ * whether the peer is reached directly (no gateway), max_conns and
+ * max_links. @cb is not used.
+ * Returns 0 on success, -EMSGSIZE if the nest or an attribute did not fit.
+ */
+static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *v2_attrs;
+
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2);
+ if (!v2_attrs)
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links))
+ goto errv2attr;
+
+ nla_nest_end(skb, v2_attrs);
+ return 0;
+
+errv2attr:
+ nla_nest_cancel(skb, v2_attrs);
+errattr:
+ return -EMSGSIZE;
+}
+
+/* Add the nested SMC_GEN_LGR_SMCR attribute describing SMC-R link group
+ * @lgr to @skb. For link groups with a version above SMC_V1 the common v2
+ * and the SMC-R v2 nests are added as well.
+ * Returns 0 on success; on any failure the whole nest is cancelled and
+ * -EMSGSIZE is returned.
+ */
+static int smc_nl_fill_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_target[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *attrs, *v2_attrs;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE,
+ lgr->net->net_cookie, SMC_NLA_LGR_R_PAD))
+ goto errattr;
+ /* copy into a local buffer to guarantee a NUL terminated string */
+ memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN);
+ smc_target[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
+ goto errattr;
+ if (lgr->smc_version > SMC_V1) {
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON);
+ if (!v2_attrs)
+ goto errattr;
+ /* ends or cancels the v2_attrs nest itself */
+ if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+ goto errattr;
+ if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb))
+ goto errattr;
+ }
+
+ nla_nest_end(skb, attrs);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+/* Add one SMC_NETLINK_GET_LINK_SMCR message describing @link to @skb.
+ * @lgr is not used. Returns 0 on success; on failure the message is
+ * cancelled and -EMSGSIZE is returned.
+ */
+static int smc_nl_fill_lgr_link(struct smc_link_group *lgr,
+ struct smc_link *link,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_ibname[IB_DEVICE_NAME_MAX];
+ /* NOTE(review): 41 presumably fits the textual GID written by
+ * smc_gid_be16_convert() plus NUL - confirm against that helper
+ */
+ u8 smc_gid_target[41];
+ struct nlattr *attrs;
+ u32 link_uid = 0;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LINK_SMCR);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT,
+ atomic_read(&link->conn_cnt)))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx))
+ goto errattr;
+ snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname);
+ if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname))
+ goto errattr;
+ /* the link uids are copied bytewise into a u32 for the attribute */
+ memcpy(&link_uid, link->link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid))
+ goto errattr;
+ memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid))
+ goto errattr;
+ /* buffer is zeroed before each conversion */
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target))
+ goto errattr;
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->peer_gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+/* Add one SMC_NETLINK_GET_LGR_SMCR message for @lgr to @skb and, if
+ * @list_links is set, one link message per usable link of the link group.
+ * Returns 0 on success, -EMSGSIZE otherwise.
+ */
+static int smc_nl_handle_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ void *nlh;
+ int i;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCR);
+ if (!nlh)
+ goto errmsg;
+ if (smc_nl_fill_lgr(lgr, skb, cb))
+ goto errout;
+
+ genlmsg_end(skb, nlh);
+ if (!list_links)
+ goto out;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_usable(&lgr->lnk[i]))
+ continue;
+ /* a failing link message also takes the errout path, where
+ * genlmsg_cancel() is called with the link group message header
+ */
+ if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb))
+ goto errout;
+ }
+out:
+ return 0;
+
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+/* Dump the link groups of @smc_lgr into @skb, continuing behind the number
+ * of entries remembered in cb_ctx->pos[0] from earlier calls. Stops at the
+ * first link group that cannot be added and stores the new position.
+ */
+static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int start = cb_ctx->pos[0];
+ struct smc_link_group *lgr;
+ int idx = 0;
+
+ spin_lock_bh(&smc_lgr->lock);
+ list_for_each_entry(lgr, &smc_lgr->list, list) {
+ /* entries in front of start were handled by earlier calls */
+ if (idx >= start &&
+ smc_nl_handle_lgr(lgr, skb, cb, list_links))
+ break;
+ idx++;
+ }
+ spin_unlock_bh(&smc_lgr->lock);
+ cb_ctx->pos[0] = idx;
+}
+
+/* Add one SMC_NETLINK_GET_LGR_SMCD message describing SMC-D link group
+ * @lgr to @skb; for link groups with a version above SMC_V1 the common v2
+ * nest is added as well.
+ * Returns 0 on success; on failure the message is cancelled and -EMSGSIZE
+ * is returned.
+ */
+static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct smcd_dev *smcd = lgr->smcd;
+ struct nlattr *attrs;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCD);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID,
+ smcd->ops->get_local_gid(smcd),
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid,
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd)))
+ goto errattr;
+ /* copy into a local buffer to guarantee a NUL terminated string */
+ memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet))
+ goto errattr;
+ if (lgr->smc_version > SMC_V1) {
+ struct nlattr *v2_attrs;
+
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON);
+ if (!v2_attrs)
+ goto errattr;
+ /* ends or cancels the v2_attrs nest itself */
+ if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+ goto errattr;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+/* Dump the SMC-D link groups of device @dev into @skb, continuing behind
+ * the number of entries remembered in cb_ctx->pos[1]. Stops at the first
+ * link group that cannot be added, stores the new position and returns the
+ * result of the last fill attempt (0 if nothing failed).
+ */
+static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int start = cb_ctx->pos[1];
+ struct smc_link_group *lgr;
+ int idx = 0;
+ int rc = 0;
+
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry(lgr, &dev->lgr_list, list) {
+ if (!lgr->is_smcd)
+ continue;
+ /* entries in front of start were handled by earlier calls */
+ if (idx >= start) {
+ rc = smc_nl_fill_smcd_lgr(lgr, skb, cb);
+ if (rc)
+ break;
+ }
+ idx++;
+ }
+ spin_unlock_bh(&dev->lgr_lock);
+ cb_ctx->pos[1] = idx;
+ return rc;
+}
+
+/* Walk the SMC-D devices of @dev_list and dump the link groups of every
+ * device that has some, continuing behind the number of devices remembered
+ * in cb_ctx->pos[0]. Stops at the first device whose dump fails, stores the
+ * new position and returns the result of the last dump (0 if none failed).
+ */
+static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int start = cb_ctx->pos[0];
+ struct smcd_dev *smcd_dev;
+ int idx = 0;
+ int rc = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcd_dev, &dev_list->list, list) {
+ /* devices without link groups are not counted */
+ if (list_empty(&smcd_dev->lgr_list))
+ continue;
+ if (idx >= start) {
+ rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb);
+ if (rc)
+ break;
+ }
+ idx++;
+ }
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = idx;
+ return rc;
+}
+
+/* netlink dump of the SMC-R link groups, without their links;
+ * returns skb->len
+ */
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, false);
+ return skb->len;
+}
+
+/* netlink dump of the SMC-R link groups including their usable links;
+ * returns skb->len
+ */
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, true);
+ return skb->len;
+}
+
+/* netlink dump of the SMC-D link groups of all devices in smcd_dev_list;
+ * the result of the fill function is ignored and skb->len is returned
+ */
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
+/* Take link group @lgr out of its link group list (if it is still in it)
+ * and terminate it via __smc_lgr_terminate(lgr, true).
+ * A NULL @lgr is accepted and ignored.
+ */
+void smc_lgr_cleanup_early(struct smc_link_group *lgr)
+{
+ spinlock_t *lgr_lock;
+
+ if (!lgr)
+ return;
+
+ /* only the lock is needed here, the returned list head is not used */
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ /* do not use this link group for new connections */
+ if (!list_empty(&lgr->list))
+ list_del_init(&lgr->list);
+ spin_unlock_bh(lgr_lock);
+ __smc_lgr_terminate(lgr, true);
+}
+
+/* Set every sendable link of @lgr to SMC_LNK_INACTIVE and wake up all
+ * waiters for LLC messages and LLC flows of the link group.
+ */
+static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
+{
+ int idx;
+
+ for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+ if (!smc_link_sendable(&lgr->lnk[idx]))
+ continue;
+ lgr->lnk[idx].state = SMC_LNK_INACTIVE;
+ }
+ wake_up_all(&lgr->llc_msg_waiter);
+ wake_up_all(&lgr->llc_flow_waiter);
+}
+
+static void smc_lgr_free(struct smc_link_group *lgr);
+
+/* Delayed work that frees a link group.
+ * Nothing is done if the link group is already being freed or if it has
+ * connections again. Otherwise the link group is removed from its list and
+ * marked as freeing under the list lock, the peer is informed (unless the
+ * link group is terminating), SMC-R links are deactivated and the link
+ * group is freed.
+ */
+static void smc_lgr_free_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(to_delayed_work(work),
+ struct smc_link_group,
+ free_work);
+ spinlock_t *lgr_lock;
+ bool conns;
+
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (lgr->freeing) {
+ spin_unlock_bh(lgr_lock);
+ return;
+ }
+ read_lock_bh(&lgr->conns_lock);
+ /* note: conns is true if NO connection is registered any more */
+ conns = RB_EMPTY_ROOT(&lgr->conns_all);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conns) { /* number of lgr connections is no longer zero */
+ spin_unlock_bh(lgr_lock);
+ return;
+ }
+ list_del_init(&lgr->list); /* remove from smc_lgr_list */
+ lgr->freeing = 1; /* this instance does the freeing, no new schedule */
+ spin_unlock_bh(lgr_lock);
+ cancel_delayed_work(&lgr->free_work);
+
+ /* SMC-R: send delete for all links; SMC-D: signal shutdown */
+ if (!lgr->is_smcd && !lgr->terminating)
+ smc_llc_send_link_delete_all(lgr, true,
+ SMC_LLC_DEL_PROG_INIT_TERM);
+ if (lgr->is_smcd && !lgr->terminating)
+ smc_ism_signal_shutdown(lgr);
+ if (!lgr->is_smcd)
+ smcr_lgr_link_deactivate_all(lgr);
+ smc_lgr_free(lgr);
+}
+
+/* work handler of lgr->terminate_work: terminate the link group that
+ * embeds @work via __smc_lgr_terminate(lgr, true)
+ */
+static void smc_lgr_terminate_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ terminate_work);
+
+ __smc_lgr_terminate(lgr, true);
+}
+
+/* Return the next link id for @lgr that is non-zero and not used by any
+ * usable link of the link group. lgr->next_link_id is advanced for every
+ * candidate that is tried.
+ */
+static u8 smcr_next_link_id(struct smc_link_group *lgr)
+{
+ bool in_use;
+ u8 candidate;
+ int idx;
+
+ do {
+ candidate = ++lgr->next_link_id;
+ if (!candidate) /* skip zero as link_id */
+ candidate = ++lgr->next_link_id;
+
+ in_use = false;
+ for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+ if (smc_link_usable(&lgr->lnk[idx]) &&
+ lgr->lnk[idx].link_id == candidate) {
+ in_use = true;
+ break;
+ }
+ }
+ } while (in_use);
+
+ return candidate;
+}
+
+/* Copy the IB device name and the net device ifindex of the link's port
+ * from the link's IB device into the link itself.
+ */
+static void smcr_copy_dev_info_to_link(struct smc_link *link)
+{
+ struct smc_ib_device *ibdev = link->smcibdev;
+ int port_idx = link->ibport - 1;
+
+ snprintf(link->ibname, sizeof(link->ibname), "%s", ibdev->ibdev->name);
+ link->ndev_ifidx = ibdev->ndev_ifidx[port_idx];
+}
+
+/* Initialize link @lnk at index @link_idx of link group @lgr.
+ * IB device and port are taken from @ini (the smcrv2 fields for SMC_V2 link
+ * groups). On success the link is left in state SMC_LNK_ACTIVATING and 0 is
+ * returned. On failure the steps done so far are undone in reverse order,
+ * the link is zeroed and set to SMC_LNK_UNUSED, and the error is returned.
+ */
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini)
+{
+ struct smc_ib_device *smcibdev;
+ u8 rndvec[3];
+ int rc;
+
+ if (lgr->smc_version == SMC_V2) {
+ lnk->smcibdev = ini->smcrv2.ib_dev_v2;
+ lnk->ibport = ini->smcrv2.ib_port_v2;
+ } else {
+ lnk->smcibdev = ini->ib_dev;
+ lnk->ibport = ini->ib_port;
+ }
+ /* device reference and link counters are dropped again at out: */
+ get_device(&lnk->smcibdev->ibdev->dev);
+ atomic_inc(&lnk->smcibdev->lnk_cnt);
+ refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
+ lnk->clearing = 0;
+ lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
+ lnk->link_id = smcr_next_link_id(lgr);
+ lnk->lgr = lgr;
+ smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
+ lnk->link_idx = link_idx;
+ lnk->wr_rx_id_compl = 0;
+ smc_ibdev_cnt_inc(lnk);
+ smcr_copy_dev_info_to_link(lnk);
+ atomic_set(&lnk->conn_cnt, 0);
+ smc_llc_link_set_uid(lnk);
+ INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
+ if (!lnk->smcibdev->initialized) {
+ rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
+ if (rc)
+ goto out;
+ }
+ /* initial packet sequence number: 24 random bits */
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+ (rndvec[2] << 16);
+ rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+ ini->vlan_id, lnk->gid, &lnk->sgid_index,
+ lgr->smc_version == SMC_V2 ?
+ &ini->smcrv2 : NULL);
+ if (rc)
+ goto out;
+ rc = smc_llc_link_init(lnk);
+ if (rc)
+ goto out;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ lnk->state = SMC_LNK_ACTIVATING;
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+clear_llc_lnk:
+ smc_llc_link_clear(lnk, false);
+out:
+ smc_ibdev_cnt_dec(lnk);
+ put_device(&lnk->smcibdev->ibdev->dev);
+ /* remember the device, the link is wiped by the memset() below */
+ smcibdev = lnk->smcibdev;
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ /* last link of the device is gone: wake up waiters for that */
+ if (!atomic_dec_return(&smcibdev->lnk_cnt))
+ wake_up(&smcibdev->lnks_deleted);
+ smc_lgr_put(lgr); /* lgr_hold above */
+ return rc;
+}
+
+/* create a new SMC link group
+ * Allocates and initializes a link group for @smc from the settings in
+ * @ini, stores it in smc->conn.lgr and adds it to the link group list of
+ * its SMC-D device or to the global SMC-R list.
+ * Returns 0 on success. Negative error codes are mapped to
+ * SMC_CLC_DECL_MEM (for -ENOMEM) or SMC_CLC_DECL_INTERR before returning;
+ * other non-zero codes are returned unchanged.
+ */
+static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_link_group *lgr;
+ struct list_head *lgr_list;
+ struct smcd_dev *smcd;
+ struct smc_link *lnk;
+ spinlock_t *lgr_lock;
+ u8 link_idx;
+ int rc = 0;
+ int i;
+
+ if (ini->is_smcd && ini->vlan_id) {
+ if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected],
+ ini->vlan_id)) {
+ rc = SMC_CLC_DECL_ISMVLANERR;
+ goto out;
+ }
+ }
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+ rc = SMC_CLC_DECL_MEM;
+ goto ism_put_vlan;
+ }
+ /* NOTE(review): lgr->id is still all zero here (kzalloc), it is only
+ * assigned further below - confirm the resulting workqueue name is
+ * intended
+ */
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ SMC_LGR_ID_SIZE, &lgr->id);
+ if (!lgr->tx_wq) {
+ rc = -ENOMEM;
+ goto free_lgr;
+ }
+ lgr->is_smcd = ini->is_smcd;
+ lgr->sync_err = 0;
+ lgr->terminating = 0;
+ lgr->freeing = 0;
+ lgr->vlan_id = ini->vlan_id;
+ refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */
+ init_rwsem(&lgr->sndbufs_lock);
+ init_rwsem(&lgr->rmbs_lock);
+ rwlock_init(&lgr->conns_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
+ lgr->next_link_id = 0;
+ /* the link group id is derived from the global counter */
+ smc_lgr_list.num += SMC_LGR_NUM_INCR;
+ memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
+ INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+ INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
+ lgr->conns_all = RB_ROOT;
+ if (ini->is_smcd) {
+ /* SMC-D specific settings */
+ smcd = ini->ism_dev[ini->ism_selected];
+ get_device(smcd->ops->get_dev(smcd));
+ lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected];
+ lgr->smcd = ini->ism_dev[ini->ism_selected];
+ lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list;
+ lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->smc_version = ini->smcd_version;
+ lgr->peer_shutdown = 0;
+ atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
+ } else {
+ /* SMC-R specific settings */
+ struct smc_ib_device *ibdev;
+ int ibport;
+
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->smc_version = ini->smcr_version;
+ memcpy(lgr->peer_systemid, ini->peer_systemid,
+ SMC_SYSTEMID_LEN);
+ if (lgr->smc_version == SMC_V2) {
+ ibdev = ini->smcrv2.ib_dev_v2;
+ ibport = ini->smcrv2.ib_port_v2;
+ lgr->saddr = ini->smcrv2.saddr;
+ lgr->uses_gateway = ini->smcrv2.uses_gateway;
+ memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac,
+ ETH_ALEN);
+ lgr->max_conns = ini->max_conns;
+ lgr->max_links = ini->max_links;
+ } else {
+ ibdev = ini->ib_dev;
+ ibport = ini->ib_port;
+ lgr->max_conns = SMC_CONN_PER_LGR_MAX;
+ lgr->max_links = SMC_LINKS_ADD_LNK_MAX;
+ }
+ memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1],
+ SMC_MAX_PNETID_LEN);
+ rc = smc_wr_alloc_lgr_mem(lgr);
+ if (rc)
+ goto free_wq;
+ smc_llc_lgr_init(lgr, smc);
+
+ /* set up the first link of the new link group */
+ link_idx = SMC_SINGLE_LINK;
+ lnk = &lgr->lnk[link_idx];
+ rc = smcr_link_init(lgr, lnk, link_idx, ini);
+ if (rc) {
+ smc_wr_free_lgr_mem(lgr);
+ goto free_wq;
+ }
+ lgr->net = smc_ib_net(lnk->smcibdev);
+ lgr_list = &smc_lgr_list.list;
+ lgr_lock = &smc_lgr_list.lock;
+ lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type;
+ atomic_inc(&lgr_cnt);
+ }
+ smc->conn.lgr = lgr;
+ spin_lock_bh(lgr_lock);
+ list_add_tail(&lgr->list, lgr_list);
+ spin_unlock_bh(lgr_lock);
+ return 0;
+
+free_wq:
+ destroy_workqueue(lgr->tx_wq);
+free_lgr:
+ kfree(lgr);
+ism_put_vlan:
+ if (ini->is_smcd && ini->vlan_id)
+ smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id);
+out:
+ /* map negative errnos to CLC decline codes */
+ if (rc < 0) {
+ if (rc == -ENOMEM)
+ rc = SMC_CLC_DECL_MEM;
+ else
+ rc = SMC_CLC_DECL_INTERR;
+ }
+ return rc;
+}
+
+/* Return the peer RMBE size minus the cursor difference between the peer's
+ * consumer cursor (local_rx_ctrl.cons) and our producer cursor
+ * (local_tx_ctrl.prod).
+ */
+static int smc_write_space(struct smc_connection *conn)
+{
+ union smc_host_cursor prod_curs, cons_curs;
+ int rmbe_size = conn->peer_rmbe_size;
+
+ smc_curs_copy(&prod_curs, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons_curs, &conn->local_rx_ctrl.cons, conn);
+
+ /* determine rx_buf space */
+ return rmbe_size - smc_curs_diff(rmbe_size, &cons_curs, &prod_curs);
+}
+
+/* Reset the tx cursors of a connection to their last finished values
+ * (tx_curs_fin, local_tx_ctrl_fin) and recalculate peer_rmbe_space.
+ * Unless the socket is in state SMC_INIT or SMC_CLOSED, a validation
+ * message is sent with the given slot (@pend, @wr_buf) and, if that
+ * succeeds, the tx worker is queued; otherwise the slot is given back.
+ * Returns 0 or the result of smcr_cdc_msg_send_validation().
+ */
+static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons, fin;
+ int rc = 0;
+ int diff;
+
+ smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn);
+ smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn);
+ /* set prod cursor to old state, enforce tx_rdma_writes() */
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+
+ if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) {
+ /* cons cursor advanced more than fin, and prod was set
+ * fin above, so now prod is smaller than cons. Fix that.
+ */
+ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons);
+ /* send buffer cursors wrap at the send buffer length */
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_sent, diff);
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_fin, diff);
+
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ smp_mb__after_atomic();
+
+ /* control cursors wrap at the peer RMBE size */
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl.prod, diff);
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl_fin, diff);
+ }
+ /* recalculate, value is used by tx_rdma_writes() */
+ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn));
+
+ if (smc->sk.sk_state != SMC_INIT &&
+ smc->sk.sk_state != SMC_CLOSED) {
+ rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
+ if (!rc) {
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
+ smc->sk.sk_data_ready(&smc->sk);
+ }
+ } else {
+ /* no message is sent in these states, give the slot back */
+ smc_wr_tx_put_slot(conn->lnk,
+ (struct smc_wr_tx_pend_priv *)pend);
+ }
+ return rc;
+}
+
+/* Move connection @conn from its current link to @to_lnk: the connection
+ * count and the link reference are dropped on the old link and taken on
+ * the new one.
+ */
+void smc_switch_link_and_count(struct smc_connection *conn,
+ struct smc_link *to_lnk)
+{
+ atomic_dec(&conn->lnk->conn_cnt);
+ /* link_hold in smc_conn_create() */
+ smcr_link_put(conn->lnk);
+ conn->lnk = to_lnk;
+ atomic_inc(&conn->lnk->conn_cnt);
+ /* link_put in smc_conn_free() */
+ smcr_link_hold(conn->lnk);
+}
+
+/* Move all connections of @lgr that use @from_lnk to another active link.
+ * With @is_dev_err set, links on the same IB device and port as @from_lnk
+ * are not considered as target. If no target link can be used, termination
+ * of the link group is scheduled.
+ * Returns the link the connections were switched to, or NULL if there is
+ * no usable target link or switching a connection failed.
+ */
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err)
+{
+ struct smc_link *to_lnk = NULL;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_connection *conn;
+ struct smc_wr_buf *wr_buf;
+ struct smc_sock *smc;
+ struct rb_node *node;
+ int i, rc = 0;
+
+ /* link is inactive, wake up tx waiters */
+ smc_wr_wakeup_tx_wait(from_lnk);
+
+ /* pick the first other active link as target */
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx)
+ continue;
+ if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
+ from_lnk->ibport == lgr->lnk[i].ibport) {
+ continue;
+ }
+ to_lnk = &lgr->lnk[i];
+ break;
+ }
+ if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) {
+ smc_lgr_terminate_sched(lgr);
+ return NULL;
+ }
+again:
+ read_lock_bh(&lgr->conns_lock);
+ for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) {
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ if (conn->lnk != from_lnk)
+ continue;
+ smc = container_of(conn, struct smc_sock, conn);
+ /* conn->lnk not yet set in SMC_INIT state */
+ if (smc->sk.sk_state == SMC_INIT)
+ continue;
+ /* in these states only the link assignment is switched */
+ if (smc->sk.sk_state == SMC_CLOSED ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_APPFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_PEERFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_PEERABORTWAIT ||
+ smc->sk.sk_state == SMC_PROCESSABORT) {
+ spin_lock_bh(&conn->send_lock);
+ smc_switch_link_and_count(conn, to_lnk);
+ spin_unlock_bh(&conn->send_lock);
+ continue;
+ }
+ /* hold the socket while conns_lock is dropped */
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ /* pre-fetch buffer outside of send_lock, might sleep */
+ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend);
+ if (rc) {
+ /* drop the reference taken by sock_hold() above */
+ sock_put(&smc->sk);
+ goto err_out;
+ }
+ /* avoid race with smcr_tx_sndbuf_nonempty() */
+ spin_lock_bh(&conn->send_lock);
+ smc_switch_link_and_count(conn, to_lnk);
+ rc = smc_switch_cursor(smc, pend, wr_buf);
+ spin_unlock_bh(&conn->send_lock);
+ sock_put(&smc->sk);
+ if (rc)
+ goto err_out;
+ /* conns_lock was dropped, so restart the walk over the tree */
+ goto again;
+ }
+ read_unlock_bh(&lgr->conns_lock);
+ smc_wr_tx_link_put(to_lnk);
+ return to_lnk;
+
+err_out:
+ smcr_link_down_cond_sched(to_lnk);
+ smc_wr_tx_link_put(to_lnk);
+ return NULL;
+}
+
+/* Release an SMC-R buffer that is no longer used by a connection.
+ * For an RMB with a confirmed rkey, and while the link group is still in
+ * its list, the rkey is deleted at the peer first. A buffer with a
+ * registration error is removed from its buffer list and freed; any other
+ * buffer is cleared and marked as unused.
+ */
+static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link_group *lgr)
+{
+ struct rw_semaphore *lock; /* lock buffer list */
+ int rc;
+
+ if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) {
+ /* unregister rmb with peer */
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (!rc) {
+ /* protect against smc_llc_cli_rkey_exchange() */
+ down_read(&lgr->llc_conf_mutex);
+ smc_llc_do_delete_rkey(lgr, buf_desc);
+ buf_desc->is_conf_rkey = false;
+ up_read(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ }
+ }
+
+ if (buf_desc->is_reg_err) {
+ /* buf registration failed, reuse not possible */
+ lock = is_rmb ? &lgr->rmbs_lock :
+ &lgr->sndbufs_lock;
+ down_write(lock);
+ list_del(&buf_desc->list);
+ up_write(lock);
+
+ smc_buf_free(lgr, is_rmb, buf_desc);
+ } else {
+ /* memzero_explicit provides potential memory barrier semantics */
+ memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
+ WRITE_ONCE(buf_desc->used, 0);
+ }
+}
+
+/* release both buffers of a connection so that they can be reused:
+ * SMC-R RMBs and is_vm SMC-R sndbufs go through smcr_buf_unuse(),
+ * every other buffer is zeroed and marked unused in place
+ */
+static void smc_buf_unuse(struct smc_connection *conn,
+			  struct smc_link_group *lgr)
+{
+	struct smc_buf_desc *desc;
+
+	desc = conn->sndbuf_desc;
+	if (desc) {
+		if (lgr->is_smcd || !desc->is_vm) {
+			memzero_explicit(desc->cpu_addr, desc->len);
+			WRITE_ONCE(desc->used, 0);
+		} else {
+			smcr_buf_unuse(desc, false, lgr);
+		}
+	}
+
+	desc = conn->rmb_desc;
+	if (desc) {
+		if (lgr->is_smcd) {
+			/* wipe len plus the size of a struct smcd_cdc_msg */
+			memzero_explicit(desc->cpu_addr,
+					 desc->len + sizeof(struct smcd_cdc_msg));
+			WRITE_ONCE(desc->used, 0);
+		} else {
+			smcr_buf_unuse(desc, true, lgr);
+		}
+	}
+}
+
+/* remove a finished connection from its link group.
+ * Does nothing when conn->lgr is NULL or conn->freed is already set.
+ * Otherwise the connection is marked freed, pending work is quiesced,
+ * its buffers are released for reuse and it is unregistered from the lgr
+ * (both only while the lgr is still on its list), and finally the link
+ * (SMC-R only) and lgr references taken in smc_conn_create() are dropped.
+ */
+void smc_conn_free(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr || conn->freed)
+ /* Connection has never been registered in a
+ * link group, or has already been freed.
+ */
+ return;
+
+ conn->freed = 1;
+ if (!smc_conn_lgr_valid(conn))
+ /* Connection has already unregistered from
+ * link group.
+ */
+ goto lgr_put;
+
+ if (lgr->is_smcd) {
+ if (!list_empty(&lgr->list))
+ smc_ism_unset_conn(conn);
+ tasklet_kill(&conn->rx_tsklet);
+ } else {
+ smc_cdc_wait_pend_tx_wr(conn);
+ /* skip the sync cancel when running inside abort_work itself */
+ if (current_work() != &conn->abort_work)
+ cancel_work_sync(&conn->abort_work);
+ }
+ if (!list_empty(&lgr->list)) {
+ smc_buf_unuse(conn, lgr); /* allow buffer reuse */
+ smc_lgr_unregister_conn(conn);
+ }
+
+ /* no connections left: schedule the lgr's free work */
+ if (!lgr->conns_num)
+ smc_lgr_schedule_free_work(lgr);
+lgr_put:
+ if (!lgr->is_smcd)
+ smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */
+ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */
+}
+
+/* unregister a link from a buf_desc: clear the per-link registration flag,
+ * and if the buffer is mapped on this link, release its memory region,
+ * undo the DMA mapping and free the scatterlist table
+ */
+static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+				struct smc_link *lnk)
+{
+	/* only RMBs and is_vm sndbufs carry a memory region */
+	const bool has_mr = is_rmb || buf_desc->is_vm;
+	const int idx = lnk->link_idx;
+
+	if (has_mr)
+		buf_desc->is_reg_mr[idx] = false;
+	if (!buf_desc->is_map_ib[idx])
+		return;	/* not mapped on this link */
+
+	if (has_mr && buf_desc->mr[idx]) {
+		smc_ib_put_memory_region(buf_desc->mr[idx]);
+		buf_desc->mr[idx] = NULL;
+	}
+	smc_ib_buf_unmap_sg(lnk, buf_desc,
+			    is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+
+	sg_free_table(&buf_desc->sgt[idx]);
+	buf_desc->is_map_ib[idx] = false;
+}
+
+/* unmap all buffers of lgr for a deleted link.
+ * Walks every RMB and sndbuf list of the link group; each list is walked
+ * with its own lock (rmbs_lock / sndbufs_lock) held for writing, and the
+ * lock is dropped again before the next list is visited.
+ */
+static void smcr_buf_unmap_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ down_write(&lgr->rmbs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
+ smcr_buf_unmap_link(buf_desc, true, lnk);
+ up_write(&lgr->rmbs_lock);
+
+ down_write(&lgr->sndbufs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
+ list)
+ smcr_buf_unmap_link(buf_desc, false, lnk);
+ up_write(&lgr->sndbufs_lock);
+ }
+}
+
+/* zero rkey and dma_addr of this link in every rtoken slot of the lgr */
+static void smcr_rtoken_clear_link(struct smc_link *lnk)
+{
+	struct smc_link_group *lgr = lnk->lgr;
+	int slot = 0;
+
+	while (slot < SMC_RMBS_PER_LGR_MAX) {
+		lgr->rtokens[slot][lnk->link_idx].rkey = 0;
+		lgr->rtokens[slot][lnk->link_idx].dma_addr = 0;
+		slot++;
+	}
+}
+
+/* final link teardown, called from smcr_link_put() when the link refcount
+ * drops to zero: frees the link's work request memory, drops the device
+ * references, resets the whole struct smc_link to SMC_LNK_UNUSED and puts
+ * the lgr reference taken in smcr_link_init()
+ */
+static void __smcr_link_clear(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_ib_device *smcibdev;
+
+ smc_wr_free_link_mem(lnk);
+ smc_ibdev_cnt_dec(lnk);
+ put_device(&lnk->smcibdev->ibdev->dev);
+ /* remember the device: the memset below wipes lnk->smcibdev */
+ smcibdev = lnk->smcibdev;
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ /* wake waiters on lnks_deleted once the device's last link is gone */
+ if (!atomic_dec_return(&smcibdev->lnk_cnt))
+ wake_up(&smcibdev->lnks_deleted);
+ smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */
+}
+
+/* tear down a link: LLC state, buffer mappings, rtokens, queue pair and
+ * protection domain, then drop a link reference.
+ * Returns early when the link has no lgr, is already being cleared or is
+ * SMC_LNK_UNUSED; lnk->clearing guards against a second teardown.
+ * @log is passed through to smc_llc_link_clear().
+ * must be called under lgr->llc_conf_mutex lock
+ */
+void smcr_link_clear(struct smc_link *lnk, bool log)
+{
+ if (!lnk->lgr || lnk->clearing ||
+ lnk->state == SMC_LNK_UNUSED)
+ return;
+ lnk->clearing = 1;
+ lnk->peer_qpn = 0;
+ smc_llc_link_clear(lnk, log);
+ smcr_buf_unmap_lgr(lnk);
+ smcr_rtoken_clear_link(lnk);
+ smc_ib_modify_qp_error(lnk);
+ smc_wr_free_link(lnk);
+ smc_ib_destroy_queue_pair(lnk);
+ smc_ib_dealloc_protection_domain(lnk);
+ smcr_link_put(lnk); /* theoretically last link_put */
+}
+
+/* take a reference on the link; released again by smcr_link_put() */
+void smcr_link_hold(struct smc_link *lnk)
+{
+ refcount_inc(&lnk->refcnt);
+}
+
+/* drop a link reference; the last put runs __smcr_link_clear() */
+void smcr_link_put(struct smc_link *lnk)
+{
+	if (!refcount_dec_and_test(&lnk->refcnt))
+		return;	/* other holders remain */
+
+	__smcr_link_clear(lnk);
+}
+
+/* free an SMC-R buffer: unmap it from every link slot of the lgr, release
+ * the backing memory (vfree for is_vm buffers, __free_pages otherwise) and
+ * free the descriptor itself
+ */
+static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			  struct smc_buf_desc *buf_desc)
+{
+	int idx;
+
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++)
+		smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[idx]);
+
+	if (buf_desc->is_vm) {
+		if (buf_desc->cpu_addr)
+			vfree(buf_desc->cpu_addr);
+	} else if (buf_desc->pages) {
+		__free_pages(buf_desc->pages, buf_desc->order);
+	}
+	kfree(buf_desc);
+}
+
+/* free an SMC-D buffer: a DMB is unregistered from the ISM device, a
+ * sndbuf is kfree'd; the descriptor is freed in both cases
+ */
+static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
+			  struct smc_buf_desc *buf_desc)
+{
+	if (!is_dmb) {
+		kfree(buf_desc->cpu_addr);
+	} else {
+		/* restore original buf len */
+		buf_desc->len += sizeof(struct smcd_cdc_msg);
+		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+	}
+	kfree(buf_desc);
+}
+
+/* free a buffer through the SMC-D or SMC-R specific helper */
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			 struct smc_buf_desc *buf_desc)
+{
+	if (!lgr->is_smcd) {
+		smcr_buf_free(lgr, is_rmb, buf_desc);
+		return;
+	}
+	smcd_buf_free(lgr, is_rmb, buf_desc);
+}
+
+/* unlink and free every buffer on all RMB lists (is_rmb) or all sndbuf
+ * lists (!is_rmb) of the link group
+ */
+static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
+{
+	struct smc_buf_desc *cur, *next;
+	struct list_head *head;
+	int idx;
+
+	for (idx = 0; idx < SMC_RMBE_SIZES; idx++) {
+		head = is_rmb ? &lgr->rmbs[idx] : &lgr->sndbufs[idx];
+		list_for_each_entry_safe(cur, next, head, list) {
+			list_del(&cur->list);
+			smc_buf_free(lgr, is_rmb, cur);
+		}
+	}
+}
+
+/* free all buffers of a link group, sndbufs first, then RMBs */
+static void smc_lgr_free_bufs(struct smc_link_group *lgr)
+{
+ /* free send buffers */
+ __smc_lgr_free_bufs(lgr, false);
+ /* free rmbs */
+ __smc_lgr_free_bufs(lgr, true);
+}
+
+/* final release of a link group, called from smc_lgr_put() with the last
+ * reference, so the lgr is not freed while anyone still accesses it.
+ * Frees all buffers, decrements the matching lgr counter (per SMC-D device,
+ * or the global lgr_cnt for SMC-R) and wakes the waiters on the respective
+ * lgrs_deleted queue when that counter reaches zero.
+ */
+static void __smc_lgr_free(struct smc_link_group *lgr)
+{
+ smc_lgr_free_bufs(lgr);
+ if (lgr->is_smcd) {
+ if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
+ wake_up(&lgr->smcd->lgrs_deleted);
+ } else {
+ smc_wr_free_lgr_mem(lgr);
+ if (!atomic_dec_return(&lgr_cnt))
+ wake_up(&lgrs_deleted);
+ }
+ kfree(lgr);
+}
+
+/* remove a link group.
+ * SMC-R: clear every link that is not SMC_LNK_UNUSED under llc_conf_mutex
+ * and clear the lgr's LLC state. Then destroy the tx workqueue, release the
+ * SMC-D vlan and device references, and drop a reference on the lgr.
+ */
+static void smc_lgr_free(struct smc_link_group *lgr)
+{
+ int i;
+
+ if (!lgr->is_smcd) {
+ down_write(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state != SMC_LNK_UNUSED)
+ smcr_link_clear(&lgr->lnk[i], false);
+ }
+ up_write(&lgr->llc_conf_mutex);
+ smc_llc_lgr_clear(lgr);
+ }
+
+ destroy_workqueue(lgr->tx_wq);
+ if (lgr->is_smcd) {
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(lgr->smcd->ops->get_dev(lgr->smcd));
+ }
+ smc_lgr_put(lgr); /* theoretically last lgr_put */
+}
+
+/* take a reference on the link group; released again by smc_lgr_put() */
+void smc_lgr_hold(struct smc_link_group *lgr)
+{
+ refcount_inc(&lgr->refcnt);
+}
+
+/* drop a link group reference; the last put runs __smc_lgr_free() */
+void smc_lgr_put(struct smc_link_group *lgr)
+{
+	if (!refcount_dec_and_test(&lgr->refcnt))
+		return;	/* other holders remain */
+
+	__smc_lgr_free(lgr);
+}
+
+/* invoke the socket's write_space, data_ready and state_change callbacks */
+static void smc_sk_wake_ups(struct smc_sock *smc)
+{
+	struct sock *sk = &smc->sk;
+
+	sk->sk_write_space(sk);
+	sk->sk_data_ready(sk);
+	sk->sk_state_change(sk);
+}
+
+/* kill a connection.
+ * Flags the abort (locally only, if the SMC-D peer already shut down),
+ * marks the connection killed, sets ECONNABORTED on the socket and runs
+ * its wake-up callbacks, quiesces rx/tx processing, unregisters the
+ * connection from its link group and actively aborts the close.
+ * @soft: for SMC-D, true uses tasklet_kill() on the rx tasklet, false only
+ *        tasklet_unlock_wait()
+ */
+static void smc_conn_kill(struct smc_connection *conn, bool soft)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ smc_close_abort(conn);
+ conn->killed = 1;
+ smc->sk.sk_err = ECONNABORTED;
+ smc_sk_wake_ups(smc);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ if (soft)
+ tasklet_kill(&conn->rx_tsklet);
+ else
+ tasklet_unlock_wait(&conn->rx_tsklet);
+ } else {
+ smc_cdc_wait_pend_tx_wr(conn);
+ }
+ smc_lgr_unregister_conn(conn);
+ smc_close_active_abort(smc);
+}
+
+/* notify the peer that the link group goes away: SMC-D signals a shutdown,
+ * SMC-R sends a delete-link-all with the stored termination reason
+ * (SMC_LLC_DEL_PROG_INIT_TERM if none is set) and deactivates all links
+ */
+static void smc_lgr_cleanup(struct smc_link_group *lgr)
+{
+	u32 rsn;
+
+	if (lgr->is_smcd) {
+		smc_ism_signal_shutdown(lgr);
+		return;
+	}
+
+	rsn = lgr->llc_termination_rsn;
+	if (rsn == 0)
+		rsn = SMC_LLC_DEL_PROG_INIT_TERM;
+	smc_llc_send_link_delete_all(lgr, false, rsn);
+	smcr_lgr_link_deactivate_all(lgr);
+}
+
+/* terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ *
+ * Returns immediately when lgr->terminating is already set. Otherwise all
+ * remaining connections are killed one by one, then the lgr is cleaned up
+ * and freed.
+ */
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
+{
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ struct rb_node *node;
+
+ if (lgr->terminating)
+ return; /* lgr already terminating */
+ /* cancel free_work sync, will terminate when lgr->freeing is set */
+ /* NOTE(review): cancel_delayed_work() is the non-waiting variant, so
+  * "sync" above does not describe this call - confirm the intent
+  */
+ cancel_delayed_work(&lgr->free_work);
+ lgr->terminating = 1;
+
+ /* kill remaining link group connections */
+ read_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ while (node) {
+ /* conns_lock is dropped while the connection is killed under the
+  * socket lock; smc_conn_kill() unregisters the connection, so the
+  * tree is re-read from rb_first() on every pass
+  */
+ read_unlock_bh(&lgr->conns_lock);
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ smc = container_of(conn, struct smc_sock, conn);
+ sock_hold(&smc->sk); /* sock_put below */
+ lock_sock(&smc->sk);
+ smc_conn_kill(conn, soft);
+ release_sock(&smc->sk);
+ sock_put(&smc->sk); /* sock_hold above */
+ read_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ }
+ read_unlock_bh(&lgr->conns_lock);
+ smc_lgr_cleanup(lgr);
+ smc_lgr_free(lgr);
+}
+
+/* unlink link group and schedule termination.
+ * Under the lgr list lock: does nothing if the lgr is already unlinked,
+ * terminating or freeing; otherwise removes it from its list and sets
+ * lgr->freeing. terminate_work is scheduled after the lock is dropped.
+ */
+void smc_lgr_terminate_sched(struct smc_link_group *lgr)
+{
+ spinlock_t *lgr_lock;
+
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
+ spin_unlock_bh(lgr_lock);
+ return; /* lgr already terminating */
+ }
+ list_del_init(&lgr->list);
+ lgr->freeing = 1;
+ spin_unlock_bh(lgr_lock);
+ schedule_work(&lgr->terminate_work);
+}
+
+/* Called when peer lgr shutdown (regularly or abnormally) is received.
+ * Selects the link groups of @dev that match @peer_gid (0 matches any
+ * peer) and @vlan (VLAN_VID_MASK matches any vlan), marks them freeing and
+ * schedules their termination.
+ */
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
+{
+ struct smc_link_group *lgr, *l;
+ LIST_HEAD(lgr_free_list);
+
+ /* run common cleanup function and build free list */
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
+ if ((!peer_gid || lgr->peer_gid == peer_gid) &&
+ (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
+ if (peer_gid) /* peer triggered termination */
+ lgr->peer_shutdown = 1;
+ list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
+ }
+ spin_unlock_bh(&dev->lgr_lock);
+
+ /* unlink each collected lgr and schedule its terminate_work */
+ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ schedule_work(&lgr->terminate_work);
+ }
+}
+
+/* Called when an SMCD device is removed or the smc module is unloaded.
+ * Moves all link groups of the device to a private list under lgr_lock,
+ * marks them freeing, terminates each one immediately (soft == false) and
+ * then waits until the device's lgr_cnt has dropped to zero.
+ */
+void smc_smcd_terminate_all(struct smcd_dev *smcd)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ spin_lock_bh(&smcd->lgr_lock);
+ list_splice_init(&smcd->lgr_list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ spin_unlock_bh(&smcd->lgr_lock);
+
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (atomic_read(&smcd->lgr_cnt))
+ wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
+}
+
+/* Called when an SMCR device is removed or the smc module is unloaded.
+ * If smcibdev is given, all SMCR link groups using this device are terminated.
+ * If smcibdev is NULL, all SMCR link groups are terminated.
+ *
+ * With a device given, no lgr is moved to the free list here; only the
+ * links on that device get their link-down work scheduled, and the function
+ * waits for the device's lnk_cnt to reach zero. Without a device it waits
+ * for the global lgr_cnt to reach zero.
+ */
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+ int i;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!smcibdev) {
+ list_splice_init(&smc_lgr_list.list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ } else {
+ list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].smcibdev == smcibdev)
+ smcr_link_down_cond_sched(&lgr->lnk[i]);
+ }
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ /* lgr_free_list is only populated in the !smcibdev case above */
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (smcibdev) {
+ if (atomic_read(&smcibdev->lnk_cnt))
+ wait_event(smcibdev->lnks_deleted,
+ !atomic_read(&smcibdev->lnk_cnt));
+ } else {
+ if (atomic_read(&lgr_cnt))
+ wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
+ }
+}
+
+/* set new lgr type and clear all asymmetric link tagging.
+ * The asym flag of every usable link is always reset; the type change
+ * itself is only applied and logged when new_type differs from the current
+ * type.
+ */
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type)
+{
+	char *type_name = "";
+	int idx;
+
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		struct smc_link *lnk = &lgr->lnk[idx];
+
+		if (!smc_link_usable(lnk))
+			continue;
+		lnk->link_is_asym = false;
+	}
+
+	if (new_type == lgr->type)
+		return;	/* unchanged, nothing to report */
+	lgr->type = new_type;
+
+	/* pick the name that goes into the log line */
+	switch (lgr->type) {
+	case SMC_LGR_NONE:
+		type_name = "NONE";
+		break;
+	case SMC_LGR_SINGLE:
+		type_name = "SINGLE";
+		break;
+	case SMC_LGR_SYMMETRIC:
+		type_name = "SYMMETRIC";
+		break;
+	case SMC_LGR_ASYMMETRIC_PEER:
+		type_name = "ASYMMETRIC_PEER";
+		break;
+	case SMC_LGR_ASYMMETRIC_LOCAL:
+		type_name = "ASYMMETRIC_LOCAL";
+		break;
+	}
+	pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: "
+			    "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id,
+			    lgr->net->net_cookie, type_name, lgr->pnet_id);
+}
+
+/* set new lgr type and tag a link as asymmetric.
+ * The tag is set after smcr_lgr_set_type(), which clears the asym flag of
+ * all usable links.
+ */
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx)
+{
+ smcr_lgr_set_type(lgr, new_type);
+ lgr->lnk[asym_lnk_idx].link_is_asym = true;
+}
+
+/* abort connection, abort_work scheduled from tasklet context.
+ * Kills the connection (soft) under the socket lock and drops the socket
+ * reference taken by whoever scheduled the work.
+ */
+static void smc_conn_abort_work(struct work_struct *work)
+{
+	struct smc_connection *conn;
+	struct smc_sock *smc;
+	struct sock *sk;
+
+	conn = container_of(work, struct smc_connection, abort_work);
+	smc = container_of(conn, struct smc_sock, conn);
+	sk = &smc->sk;
+
+	lock_sock(sk);
+	smc_conn_kill(conn, true);
+	release_sock(sk);
+	sock_put(sk); /* sock_hold done by schedulers of abort_work */
+}
+
+/* a port of an IB device was added: under smc_lgr_list.lock, trigger local
+ * add-link processing on every link group that has the port's pnetid, is
+ * not already SYMMETRIC or ASYMMETRIC_PEER, whose netns may access the
+ * device, and that is not a SINGLE lgr limited to one link.
+ * @ibport is 1-based (it indexes pnetid[] as ibport - 1).
+ */
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *n;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+ struct smc_link *link;
+
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN) ||
+ lgr->type == SMC_LGR_SYMMETRIC ||
+ lgr->type == SMC_LGR_ASYMMETRIC_PEER ||
+ !rdma_dev_access_netns(smcibdev->ibdev, lgr->net))
+ continue;
+
+ /* a SINGLE lgr with max_links <= 1 gets no further link */
+ if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1)
+ continue;
+
+ /* trigger local add link processing */
+ link = smc_llc_usable_link(lgr);
+ if (link)
+ smc_llc_add_link_local(link);
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
+/* link is down - switch connections to alternate link,
+ * must be called under lgr->llc_conf_mutex lock
+ *
+ * Without an alternate link the failed link is simply cleared. Otherwise
+ * the lgr becomes SMC_LGR_SINGLE and the link deletion is driven either
+ * locally (server role) or by sending a delete-link request over the
+ * remaining link (any other role).
+ */
+static void smcr_link_down(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_link *to_lnk;
+ int del_link_id;
+
+ if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
+ return;
+
+ to_lnk = smc_switch_conns(lgr, lnk, true);
+ if (!to_lnk) { /* no backup link available */
+ smcr_link_clear(lnk, true);
+ return;
+ }
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ del_link_id = lnk->link_id;
+
+ if (lgr->role == SMC_SERV) {
+ /* trigger local delete link processing */
+ smc_llc_srv_delete_link_local(to_lnk, del_link_id);
+ } else {
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* another llc task is ongoing */
+ /* llc_conf_mutex is released while waiting and re-acquired
+  * afterwards; the wait ends on timeout, when the lgr leaves
+  * its list, or when the local flow type is NONE again
+  */
+ up_write(&lgr->llc_conf_mutex);
+ wait_event_timeout(lgr->llc_flow_waiter,
+ (list_empty(&lgr->list) ||
+ lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
+ SMC_LLC_WAIT_TIME);
+ down_write(&lgr->llc_conf_mutex);
+ }
+ if (!list_empty(&lgr->list)) {
+ smc_llc_send_delete_link(to_lnk, del_link_id,
+ SMC_LLC_REQ, true,
+ SMC_LLC_DEL_LOST_PATH);
+ smcr_link_clear(lnk, true);
+ }
+ wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */
+ }
+}
+
+/* run the link-down handling directly, but only if smc_link_downing()
+ * reports that this call moved the link to the down state;
+ * must be called under lgr->llc_conf_mutex lock
+ */
+void smcr_link_down_cond(struct smc_link *lnk)
+{
+	if (!smc_link_downing(&lnk->state))
+		return;
+
+	trace_smcr_link_down(lnk, __builtin_return_address(0));
+	smcr_link_down(lnk);
+}
+
+/* defer the link-down handling to link_down_wrk, but only if
+ * smc_link_downing() reports that this call moved the link to the down
+ * state; the worker will get the lgr->llc_conf_mutex lock
+ */
+void smcr_link_down_cond_sched(struct smc_link *lnk)
+{
+	if (!smc_link_downing(&lnk->state))
+		return;
+
+	trace_smcr_link_down(lnk, __builtin_return_address(0));
+	schedule_work(&lnk->link_down_wrk);
+}
+
+/* a port of an IB device failed: schedule link-down processing for every
+ * usable link that runs over this device and port.
+ * @smcibdev: the affected IB device
+ * @ibport: the failed port, 1-based (it indexes pnetid[] as ibport - 1)
+ *
+ * The global link group list is walked under smc_lgr_list.lock, in the same
+ * way smcr_port_add() and smc_smcr_terminate_all() walk it, so that link
+ * groups cannot be unlinked from the list while it is being traversed.
+ * Only schedule_work() is called under the lock (through
+ * smcr_link_down_cond_sched()), which does not sleep.
+ */
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct smc_link_group *lgr, *n;
+	int i;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+			    SMC_MAX_PNETID_LEN))
+			continue; /* lgr is not affected */
+		if (list_empty(&lgr->list))
+			continue;
+		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+			struct smc_link *lnk = &lgr->lnk[i];
+
+			if (smc_link_usable(lnk) &&
+			    lnk->smcibdev == smcibdev && lnk->ibport == ibport)
+				smcr_link_down_cond_sched(lnk);
+		}
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
+/* worker behind link_down_wrk: unless the lgr has already left its list,
+ * wake all waiters on llc_msg_waiter and run smcr_link_down() with
+ * llc_conf_mutex held for writing
+ */
+static void smc_link_down_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(work, struct smc_link,
+ link_down_wrk);
+ struct smc_link_group *lgr = link->lgr;
+
+ if (list_empty(&lgr->list))
+ return;
+ wake_up_all(&lgr->llc_msg_waiter);
+ down_write(&lgr->llc_conf_mutex);
+ smcr_link_down(link);
+ up_write(&lgr->llc_conf_mutex);
+}
+
+/* callback for netdev_walk_all_lower_dev(): when lower_dev is a VLAN
+ * device, store its vlan id through priv->data and return 1; return 0
+ * otherwise
+ */
+static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
+				  struct netdev_nested_priv *priv)
+{
+	unsigned short *vlan_id;
+
+	if (!is_vlan_dev(lower_dev))
+		return 0;
+
+	vlan_id = (unsigned short *)priv->data;
+	*vlan_id = vlan_dev_vlan_id(lower_dev);
+	return 1;
+}
+
+/* Determine vlan of internal TCP socket.
+ * ini->vlan_id is reset to 0 and then set from the socket's dst device if
+ * that is a VLAN device, or else from a VLAN device found among its lower
+ * devices (walked under rtnl_lock).
+ * Returns 0 on success (vlan_id stays 0 when no VLAN device is found),
+ * -ENOTCONN when the socket has no dst entry, -ENODEV when the dst entry
+ * has no device.
+ */
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct netdev_nested_priv priv;
+ struct net_device *ndev;
+ int rc = 0;
+
+ ini->vlan_id = 0;
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ ndev = dst->dev;
+ if (is_vlan_dev(ndev)) {
+ ini->vlan_id = vlan_dev_vlan_id(ndev);
+ goto out_rel;
+ }
+
+ /* the walk callback writes the vlan id through priv.data */
+ priv.data = (void *)&ini->vlan_id;
+ rtnl_lock();
+ netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv);
+ rtnl_unlock();
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* check whether an existing SMC-R link group can serve a new connection.
+ * The lgr must have the same peer system id and role. Then an active link
+ * must match: the peer GID must be equal, the peer QP number must equal
+ * @clcqpn unless the lgr has the server role, and for anything but SMC_V2
+ * the peer MAC must equal @peer_mac_v1.
+ * Returns false as soon as an active link's device is not accessible from
+ * @net, even if later links would match.
+ */
+static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version,
+ u8 peer_systemid[],
+ u8 peer_gid[],
+ u8 peer_mac_v1[],
+ enum smc_lgr_role role, u32 clcqpn,
+ struct net *net)
+{
+ struct smc_link *lnk;
+ int i;
+
+ if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) ||
+ lgr->role != role)
+ return false;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ lnk = &lgr->lnk[i];
+
+ if (!smc_link_active(lnk))
+ continue;
+ /* use verbs API to check netns, instead of lgr->net */
+ if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net))
+ return false;
+ if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) &&
+ !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) &&
+ (smcr_version == SMC_V2 ||
+ !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN)))
+ return true;
+ }
+ return false;
+}
+
+/* an SMC-D link group matches if it uses the same ISM device and has the
+ * same peer GID
+ */
+static bool smcd_lgr_match(struct smc_link_group *lgr,
+			   struct smcd_dev *smcismdev, u64 peer_gid)
+{
+	if (lgr->peer_gid != peer_gid)
+		return false;
+
+	return lgr->smcd == smcismdev;
+}
+
+/* create a new SMC connection (and a new link group if necessary).
+ * Unless a client was told by the peer that this is a first contact, the
+ * lgr list (per ISM device for SMC-D, global for SMC-R) is searched for a
+ * reusable link group; ini->first_contact_local tells the caller whether a
+ * new one had to be created (1) or an existing one was reused (0).
+ * Returns 0 on success, the error of smc_lgr_register_conn() or
+ * smc_lgr_create(), or SMC_CLC_DECL_SYNCERR when a client finds no lgr
+ * although the peer expects one to be reused.
+ */
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct net *net = sock_net(&smc->sk);
+ struct list_head *lgr_list;
+ struct smc_link_group *lgr;
+ enum smc_lgr_role role;
+ spinlock_t *lgr_lock;
+ int rc = 0;
+
+ lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
+ &smc_lgr_list.list;
+ lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock :
+ &smc_lgr_list.lock;
+ /* assume first contact until a reusable lgr is found below */
+ ini->first_contact_local = 1;
+ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ if (role == SMC_CLNT && ini->first_contact_peer)
+ /* create new link group as well */
+ goto create;
+
+ /* determine if an existing link group can be reused */
+ spin_lock_bh(lgr_lock);
+ list_for_each_entry(lgr, lgr_list, list) {
+ write_lock_bh(&lgr->conns_lock);
+ /* reusable if: the lgr matches the peer, has no sync error, is on
+  * the same vlan (not checked for SMC_V2) and - checked only for an
+  * SMC-R server - still has room for a connection and a free rtoken
+  */
+ if ((ini->is_smcd ?
+ smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
+ ini->ism_peer_gid[ini->ism_selected]) :
+ smcr_lgr_match(lgr, ini->smcr_version,
+ ini->peer_systemid,
+ ini->peer_gid, ini->peer_mac, role,
+ ini->ib_clcqpn, net)) &&
+ !lgr->sync_err &&
+ (ini->smcd_version == SMC_V2 ||
+ lgr->vlan_id == ini->vlan_id) &&
+ (role == SMC_CLNT || ini->is_smcd ||
+ (lgr->conns_num < lgr->max_conns &&
+ !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) {
+ /* link group found */
+ ini->first_contact_local = 0;
+ conn->lgr = lgr;
+ rc = smc_lgr_register_conn(conn, false);
+ write_unlock_bh(&lgr->conns_lock);
+ /* the lgr is in use again: stop a pending free_work */
+ if (!rc && delayed_work_pending(&lgr->free_work))
+ cancel_delayed_work(&lgr->free_work);
+ break;
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ }
+ spin_unlock_bh(lgr_lock);
+ if (rc)
+ return rc;
+
+ if (role == SMC_CLNT && !ini->first_contact_peer &&
+ ini->first_contact_local) {
+ /* Server reuses a link group, but Client wants to start
+ * a new one
+ * send out_of_sync decline, reason synchr. error
+ */
+ return SMC_CLC_DECL_SYNCERR;
+ }
+
+create:
+ if (ini->first_contact_local) {
+ rc = smc_lgr_create(smc, ini);
+ if (rc)
+ goto out;
+ lgr = conn->lgr;
+ write_lock_bh(&lgr->conns_lock);
+ rc = smc_lgr_register_conn(conn, true);
+ write_unlock_bh(&lgr->conns_lock);
+ if (rc) {
+ smc_lgr_cleanup_early(lgr);
+ goto out;
+ }
+ }
+ smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */
+ if (!conn->lgr->is_smcd)
+ smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */
+ /* initialize the per-connection state */
+ conn->freed = 0;
+ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+ conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+ conn->urg_state = SMC_URG_READ;
+ init_waitqueue_head(&conn->cdc_pend_tx_wq);
+ INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
+ if (ini->is_smcd) {
+ /* SMC-D: rx data starts behind a struct smcd_cdc_msg */
+ conn->rx_off = sizeof(struct smcd_cdc_msg);
+ smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+ } else {
+ conn->rx_off = 0;
+ }
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&conn->acurs_lock);
+#endif
+
+out:
+ return rc;
+}
+
+#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */
+
+/* convert the RMB size into the compressed notation (minimum 16K, see
+ * SMCD_DMBE_SIZES / SMCR_RMBE_SIZES).
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb)
+{
+	const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE;
+	u8 size_limit, compressed;
+
+	if (size <= SMC_BUF_MIN_SIZE)
+		return 0;
+
+	size_limit = is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES;
+	size = (size - 1) >> 14; /* convert to 16K multiple */
+	compressed = min_t(u8, ilog2(size) + 1, size_limit);
+	if (is_smcd || !is_rmb)
+		return compressed;
+
+	/* RMBs are backed by & limited to max size of scatterlists */
+	return min_t(u8, compressed, ilog2(max_scat >> 14));
+}
+
+/* convert the RMB size from compressed notation into integer:
+ * the result is 2^(compressed + 14), i.e. 16K for a compressed value of 0
+ */
+int smc_uncompress_bufsize(u8 compressed)
+{
+	const int shift = ((int)compressed) + 14;
+	u32 size = 0x00000001 << shift;
+
+	return (int)size;
+}
+
+/* try to reuse a sndbuf or rmb description slot for a certain
+ * buffer size; if not available, return NULL.
+ * A slot is claimed by atomically switching its used flag from 0 to 1
+ * while the list lock is held for reading.
+ */
+static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
+					     struct rw_semaphore *lock,
+					     struct list_head *buf_list)
+{
+	struct smc_buf_desc *cur, *claimed = NULL;
+
+	down_read(lock);
+	list_for_each_entry(cur, buf_list, list) {
+		if (cmpxchg(&cur->used, 0, 1) != 0)
+			continue;	/* already in use */
+		claimed = cur;
+		break;
+	}
+	up_read(lock);
+
+	return claimed;
+}
+
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ * The limit never goes below half of SOCK_MIN_SNDBUF.
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+	int tenth = rmbe_size / 10;
+	int lower_bound = SOCK_MIN_SNDBUF / 2;
+
+	return max_t(int, tenth, lower_bound);
+}
+
+/* map a buf to a link.
+ * Builds the scatterlist table for the buffer (one entry per page for an
+ * is_vm buffer, a single entry otherwise), DMA-maps it on the link's device
+ * and, for RMBs and is_vm sndbufs, obtains a memory region.
+ * Returns 0 on success or if the buffer is already mapped on this link,
+ * the error of sg_alloc_table() or smc_ib_get_memory_region(), or -EAGAIN
+ * when the DMA mapping does not yield exactly nents entries.
+ */
+static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link *lnk)
+{
+ int rc, i, nents, offset, buf_size, size, access_flags;
+ struct scatterlist *sg;
+ void *buf;
+
+ if (buf_desc->is_map_ib[lnk->link_idx])
+ return 0;
+
+ /* buf, buf_size and offset are only set, and only used, for is_vm */
+ if (buf_desc->is_vm) {
+ buf = buf_desc->cpu_addr;
+ buf_size = buf_desc->len;
+ offset = offset_in_page(buf_desc->cpu_addr);
+ nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE;
+ } else {
+ nents = 1;
+ }
+
+ rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ if (buf_desc->is_vm) {
+ /* virtually contiguous buffer */
+ for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) {
+ size = min_t(int, PAGE_SIZE - offset, buf_size);
+ sg_set_page(sg, vmalloc_to_page(buf), size, offset);
+ buf += size / sizeof(*buf);
+ buf_size -= size;
+ /* only the first entry can start inside a page */
+ offset = 0;
+ }
+ } else {
+ /* physically contiguous buffer */
+ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
+ buf_desc->cpu_addr, buf_desc->len);
+ }
+
+ /* map sg table to DMA address */
+ rc = smc_ib_buf_map_sg(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ /* SMC protocol depends on mapping to one DMA address only */
+ if (rc != nents) {
+ rc = -EAGAIN;
+ goto free_table;
+ }
+
+ /* record per link (bit link_idx) whether DMA syncs are needed */
+ buf_desc->is_dma_need_sync |=
+ smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx;
+
+ if (is_rmb || buf_desc->is_vm) {
+ /* create a new memory region for the RMB or vzalloced sndbuf */
+ access_flags = is_rmb ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_LOCAL_WRITE;
+
+ rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags,
+ buf_desc, lnk->link_idx);
+ if (rc)
+ goto buf_unmap;
+ smc_ib_sync_sg_for_device(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+ buf_desc->is_map_ib[lnk->link_idx] = true;
+ return 0;
+
+buf_unmap:
+ smc_ib_buf_unmap_sg(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+free_table:
+ sg_free_table(&buf_desc->sgt[lnk->link_idx]);
+ return rc;
+}
+
+/* register a new buf on IB device, rmb or vzalloced sndbuf
+ * must be called under lgr->llc_conf_mutex lock
+ *
+ * Returns 0 on success or if the buffer is already registered on this
+ * link, -ENOLINK when the lgr has left its list, -EFAULT when the
+ * registration work request fails (the buffer is then flagged is_reg_err).
+ */
+int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc)
+{
+ if (list_empty(&link->lgr->list))
+ return -ENOLINK;
+ if (!buf_desc->is_reg_mr[link->link_idx]) {
+ /* register memory region for new buf */
+ /* is_vm buffers use their virtual address as the MR's iova */
+ if (buf_desc->is_vm)
+ buf_desc->mr[link->link_idx]->iova =
+ (uintptr_t)buf_desc->cpu_addr;
+ if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) {
+ buf_desc->is_reg_err = true;
+ return -EFAULT;
+ }
+ buf_desc->is_reg_mr[link->link_idx] = true;
+ }
+ return 0;
+}
+
+/* map every used buffer of one buffer list to the link, holding the list
+ * lock for writing; stops at and returns the first mapping error
+ */
+static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock,
+			     struct list_head *lst, bool is_rmb)
+{
+	struct smc_buf_desc *cur, *next;
+	int rc = 0;
+
+	down_write(lock);
+	list_for_each_entry_safe(cur, next, lst, list) {
+		if (!cur->used)
+			continue;
+		rc = smcr_buf_map_link(cur, is_rmb, lnk);
+		if (rc)
+			break;
+	}
+	up_write(lock);
+
+	return rc;
+}
+
+/* map all used buffers of lgr for a new link; for each buffer size the
+ * RMB list is handled before the sndbuf list, and the first error ends
+ * the walk
+ */
+int smcr_buf_map_lgr(struct smc_link *lnk)
+{
+	struct smc_link_group *lgr = lnk->lgr;
+	int idx, rc;
+
+	for (idx = 0; idx < SMC_RMBE_SIZES; idx++) {
+		rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock,
+				       &lgr->rmbs[idx], true);
+		if (!rc)
+			rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock,
+					       &lgr->sndbufs[idx], false);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+/* register all used buffers of lgr for a new link,
+ * must be called under lgr->llc_conf_mutex lock
+ *
+ * All used RMBs are registered first. Sndbufs are only handled when the
+ * lgr's buf_type is not SMCR_PHYS_CONT_BUFS, and then only the used is_vm
+ * ones. Returns 0 or the first error of smcr_link_reg_buf().
+ */
+int smcr_buf_reg_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i, rc = 0;
+
+ /* reg all RMBs for a new link */
+ down_write(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
+ if (!buf_desc->used)
+ continue;
+ rc = smcr_link_reg_buf(lnk, buf_desc);
+ if (rc) {
+ up_write(&lgr->rmbs_lock);
+ return rc;
+ }
+ }
+ }
+ up_write(&lgr->rmbs_lock);
+
+ /* rc is 0 here; no sndbuf registration for this buf_type */
+ if (lgr->buf_type == SMCR_PHYS_CONT_BUFS)
+ return rc;
+
+ /* reg all vzalloced sndbufs for a new link */
+ down_write(&lgr->sndbufs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) {
+ if (!buf_desc->used || !buf_desc->is_vm)
+ continue;
+ rc = smcr_link_reg_buf(lnk, buf_desc);
+ if (rc) {
+ up_write(&lgr->sndbufs_lock);
+ return rc;
+ }
+ }
+ }
+ up_write(&lgr->sndbufs_lock);
+ return rc;
+}
+
+/* allocate a new SMC-R buffer descriptor plus backing memory of bufsize.
+ * Depending on lgr->buf_type the memory is physically contiguous
+ * (alloc_pages), virtually contiguous (vzalloc), or - for SMCR_MIXED_BUFS -
+ * physically contiguous with a fallback to virtually contiguous.
+ * Returns the descriptor, ERR_PTR(-ENOMEM) when the descriptor itself
+ * cannot be allocated, or ERR_PTR(-EAGAIN) when the buffer memory cannot.
+ * is_rmb is not used in this function.
+ */
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+ bool is_rmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+
+ /* try to alloc a new buffer */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+
+ switch (lgr->buf_type) {
+ case SMCR_PHYS_CONT_BUFS:
+ case SMCR_MIXED_BUFS:
+ buf_desc->order = get_order(bufsize);
+ buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | __GFP_COMP |
+ __GFP_NORETRY | __GFP_ZERO,
+ buf_desc->order);
+ if (buf_desc->pages) {
+ buf_desc->cpu_addr =
+ (void *)page_address(buf_desc->pages);
+ buf_desc->len = bufsize;
+ buf_desc->is_vm = false;
+ break;
+ }
+ if (lgr->buf_type == SMCR_PHYS_CONT_BUFS)
+ goto out;
+ fallthrough; // try virtually contiguous buf
+ case SMCR_VIRT_CONT_BUFS:
+ buf_desc->order = get_order(bufsize);
+ buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order);
+ if (!buf_desc->cpu_addr)
+ goto out;
+ buf_desc->pages = NULL;
+ buf_desc->len = bufsize;
+ buf_desc->is_vm = true;
+ break;
+ }
+ return buf_desc;
+
+out:
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+}
+
+/* map buf_desc on all usable links,
+ * unused buffers stay mapped as long as the link is up.
+ * Returns 0 on success, -ENOMEM when mapping on a link fails, -EINVAL when
+ * the lgr has no usable link at all.
+ */
+static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
+				     struct smc_buf_desc *buf_desc, bool is_rmb)
+{
+	int idx, mapped = 0, rc = 0;
+
+	/* protect against parallel link reconfiguration */
+	down_read(&lgr->llc_conf_mutex);
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		struct smc_link *lnk = &lgr->lnk[idx];
+
+		if (!smc_link_usable(lnk))
+			continue;
+		if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
+			rc = -ENOMEM;
+			break;
+		}
+		mapped++;
+	}
+	up_read(&lgr->llc_conf_mutex);
+
+	if (!rc && !mapped)
+		rc = -EINVAL;
+	return rc;
+}
+
+/* allocate a new SMC-D buffer descriptor: a DMB registered with the ISM
+ * device (is_dmb) or a kzalloc'ed sndbuf.
+ * Returns the descriptor or an ERR_PTR: -ENOMEM when the descriptor itself
+ * cannot be allocated; for a DMB -EAGAIN (registration returned -ENOMEM),
+ * -ENOSPC, or -EIO for any other registration error; for a sndbuf -EAGAIN
+ * when the buffer memory cannot be allocated.
+ */
+static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
+ bool is_dmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+ int rc;
+
+ /* try to alloc a new DMB */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+ if (is_dmb) {
+ rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
+ if (rc) {
+ kfree(buf_desc);
+ if (rc == -ENOMEM)
+ return ERR_PTR(-EAGAIN);
+ if (rc == -ENOSPC)
+ return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-EIO);
+ }
+ buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
+ /* CDC header stored in buf. So, pretend it was smaller */
+ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
+ } else {
+ buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
+ __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC);
+ if (!buf_desc->cpu_addr) {
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+ }
+ buf_desc->len = bufsize;
+ }
+ return buf_desc;
+}
+
+/* get a sndbuf (!is_rmb) or RMB (is_rmb) for the connection of @smc.
+ * Starting from half of the socket's send/receive buffer size, each
+ * compressed size is tried from large to small: first a free slot of the
+ * link group is reused, otherwise a new buffer is allocated. A failed
+ * allocation moves on to the next smaller size, except for -ENOMEM which
+ * ends the search.
+ * On success the buffer is attached to the connection and the socket's
+ * buffer size is set to twice the size actually obtained.
+ * Returns 0, the error of the last allocation attempt, or -ENOMEM when the
+ * SMC-R buffer cannot be mapped on the usable links.
+ */
+static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
+{
+ struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ struct list_head *buf_list;
+ int bufsize, bufsize_comp;
+ struct rw_semaphore *lock; /* lock buffer list */
+ bool is_dgraded = false;
+
+ if (is_rmb)
+ /* use socket recv buffer size (w/o overhead) as start value */
+ bufsize = smc->sk.sk_rcvbuf / 2;
+ else
+ /* use socket send buffer size (w/o overhead) as start value */
+ bufsize = smc->sk.sk_sndbuf / 2;
+
+ for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
+ bufsize_comp >= 0; bufsize_comp--) {
+ if (is_rmb) {
+ lock = &lgr->rmbs_lock;
+ buf_list = &lgr->rmbs[bufsize_comp];
+ } else {
+ lock = &lgr->sndbufs_lock;
+ buf_list = &lgr->sndbufs[bufsize_comp];
+ }
+ bufsize = smc_uncompress_bufsize(bufsize_comp);
+
+ /* check for reusable slot in the link group */
+ buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
+ if (buf_desc) {
+ buf_desc->is_dma_need_sync = 0;
+ SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
+ SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb);
+ break; /* found reusable slot */
+ }
+
+ if (is_smcd)
+ buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
+ else
+ buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+
+ /* -ENOMEM ends the search, any other error tries a smaller size */
+ if (PTR_ERR(buf_desc) == -ENOMEM)
+ break;
+ if (IS_ERR(buf_desc)) {
+ /* count the downgrade only once per call */
+ if (!is_dgraded) {
+ is_dgraded = true;
+ SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb);
+ }
+ continue;
+ }
+
+ SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
+ SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
+ /* mark used before the buffer becomes visible on the list */
+ buf_desc->used = 1;
+ down_write(lock);
+ list_add(&buf_desc->list, buf_list);
+ up_write(lock);
+ break; /* found */
+ }
+
+ if (IS_ERR(buf_desc))
+ return PTR_ERR(buf_desc);
+
+ if (!is_smcd) {
+ if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
+ smcr_buf_unuse(buf_desc, is_rmb, lgr);
+ return -ENOMEM;
+ }
+ }
+
+ if (is_rmb) {
+ conn->rmb_desc = buf_desc;
+ conn->rmbe_size_comp = bufsize_comp;
+ smc->sk.sk_rcvbuf = bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
+ conn->rmbe_update_limit =
+ smc_rmb_wnd_update_limit(buf_desc->len);
+ if (is_smcd)
+ smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
+ } else {
+ conn->sndbuf_desc = buf_desc;
+ smc->sk.sk_sndbuf = bufsize * 2;
+ atomic_set(&conn->sndbuf_space, bufsize);
+ }
+ return 0;
+}
+
+/* Sync the send buffer of @conn for device access. This is done only if the
+ * buffer is flagged as needing a DMA sync, the connection has a valid
+ * non-SMC-D link group and the link of the connection is active.
+ */
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
+{
+	struct smc_buf_desc *sndbuf = conn->sndbuf_desc;
+
+	if (sndbuf->is_dma_need_sync &&
+	    smc_conn_lgr_valid(conn) && !conn->lgr->is_smcd &&
+	    smc_link_active(conn->lnk))
+		smc_ib_sync_sg_for_device(conn->lnk, sndbuf, DMA_TO_DEVICE);
+}
+
+/* Sync the RMB of @conn for CPU access on every active link of its link
+ * group. Nothing is done if the RMB does not need a DMA sync, or if the
+ * connection has no valid link group or uses SMC-D.
+ */
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
+{
+	struct smc_link *lnk;
+	int idx;
+
+	if (!conn->rmb_desc->is_dma_need_sync)
+		return;
+	if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd)
+		return;
+
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		lnk = &conn->lgr->lnk[idx];
+		if (smc_link_active(lnk))
+			smc_ib_sync_sg_for_cpu(lnk, conn->rmb_desc,
+					       DMA_FROM_DEVICE);
+	}
+}
+
+/* Create the send buffer and the receive buffer of an SMC socket.
+ * Receive buffers are called RMBs. The SMC protocol allows more than one
+ * RMB-element per RMB, but the Linux implementation uses just one RMB-element
+ * per RMB, i.e. an extra RMB for every connection in a link group.
+ * If the RMB cannot be created, the send buffer that was just assigned is
+ * removed from its list and freed again.
+ * Returns 0 on success or the error of the failing buffer creation.
+ */
+int smc_buf_create(struct smc_sock *smc, bool is_smcd)
+{
+	struct smc_connection *conn = &smc->conn;
+	int rc;
+
+	/* send buffer first */
+	rc = __smc_buf_create(smc, is_smcd, false);
+	if (rc)
+		return rc;
+
+	/* then the rmb */
+	rc = __smc_buf_create(smc, is_smcd, true);
+	if (!rc)
+		return 0;
+
+	/* rmb creation failed: give up the send buffer as well */
+	down_write(&conn->lgr->sndbufs_lock);
+	list_del(&conn->sndbuf_desc->list);
+	up_write(&conn->lgr->sndbufs_lock);
+	smc_buf_free(conn->lgr, false, conn->sndbuf_desc);
+	conn->sndbuf_desc = NULL;
+	return rc;
+}
+
+/* Reserve a free rtoken slot of @lgr by setting its bit in rtokens_used_mask.
+ * Returns the reserved index, or -ENOSPC if no slot could be reserved.
+ */
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+	int idx;
+
+	for_each_clear_bit(idx, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+		/* bit was already set when we tried to take it: try the next */
+		if (test_and_set_bit(idx, lgr->rtokens_used_mask))
+			continue;
+		return idx;
+	}
+	return -ENOSPC;
+}
+
+/* Look up the used rtoken slot whose entry for link index @lnk_idx carries
+ * @rkey. Returns the slot index or -ENOENT if there is no such slot.
+ */
+static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
+				   u32 rkey)
+{
+	int idx;
+
+	for (idx = 0; idx < SMC_RMBS_PER_LGR_MAX; idx++) {
+		if (!test_bit(idx, lgr->rtokens_used_mask))
+			continue;
+		if (lgr->rtokens[idx][lnk_idx].rkey == rkey)
+			return idx;
+	}
+	return -ENOENT;
+}
+
+/* Set the rtoken of a new link (@link_idx_new) for an existing rmb. The rmb
+ * is identified by the rkey @nw_rkey_known already stored for @link_idx;
+ * nothing is changed if that rkey is not found.
+ */
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+		    __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
+{
+	struct smc_rtoken *rtok;
+	int idx;
+
+	idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
+	if (idx == -ENOENT)
+		return;
+
+	rtok = &lgr->rtokens[idx][link_idx_new];
+	rtok->rkey = ntohl(nw_rkey);
+	rtok->dma_addr = be64_to_cpu(nw_vaddr);
+}
+
+/* Set the rtoken in slot @rtok_idx for the link whose link_id is @link_id.
+ * Nothing is changed if no link of @lgr has that link_id.
+ */
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+		     __be64 nw_vaddr, __be32 nw_rkey)
+{
+	int idx = 0;
+
+	/* find the first link carrying the requested link_id */
+	while (idx < SMC_LINKS_PER_LGR_MAX &&
+	       lgr->lnk[idx].link_id != link_id)
+		idx++;
+	if (idx == SMC_LINKS_PER_LGR_MAX)
+		return;
+
+	lgr->rtokens[rtok_idx][idx].rkey = ntohl(nw_rkey);
+	lgr->rtokens[rtok_idx][idx].dma_addr = be64_to_cpu(nw_vaddr);
+}
+
+/* Add an rtoken (rkey and address, both in network byte order) received
+ * from the peer for link @lnk. If a used slot already holds the same pair
+ * for this link, its index is returned; otherwise a new slot is reserved.
+ * Returns the slot index or a negative errno if no slot is available.
+ */
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
+{
+	struct smc_link_group *lgr = smc_get_lgr(lnk);
+	u64 dma_addr = be64_to_cpu(nw_vaddr);
+	u32 rkey = ntohl(nw_rkey);
+	struct smc_rtoken *rtok;
+	int idx;
+
+	for (idx = 0; idx < SMC_RMBS_PER_LGR_MAX; idx++) {
+		if (!test_bit(idx, lgr->rtokens_used_mask))
+			continue;
+		rtok = &lgr->rtokens[idx][lnk->link_idx];
+		if (rtok->rkey == rkey && rtok->dma_addr == dma_addr)
+			return idx;	/* already in list */
+	}
+
+	idx = smc_rmb_reserve_rtoken_idx(lgr);
+	if (idx < 0)
+		return idx;
+
+	rtok = &lgr->rtokens[idx][lnk->link_idx];
+	rtok->rkey = rkey;
+	rtok->dma_addr = dma_addr;
+	return idx;
+}
+
+/* Delete the rtoken identified by @nw_rkey on link @lnk from all links:
+ * the entries of every link in that slot are cleared and the slot is
+ * released. Returns 0 on success or -ENOENT if no used slot matches.
+ */
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
+{
+	struct smc_link_group *lgr = smc_get_lgr(lnk);
+	u32 rkey = ntohl(nw_rkey);
+	int slot, link;
+
+	for (slot = 0; slot < SMC_RMBS_PER_LGR_MAX; slot++) {
+		if (lgr->rtokens[slot][lnk->link_idx].rkey != rkey ||
+		    !test_bit(slot, lgr->rtokens_used_mask))
+			continue;
+
+		for (link = 0; link < SMC_LINKS_PER_LGR_MAX; link++) {
+			lgr->rtokens[slot][link].rkey = 0;
+			lgr->rtokens[slot][link].dma_addr = 0;
+		}
+		clear_bit(slot, lgr->rtokens_used_mask);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/* Save the rkey and dma_addr received from the peer during the clc
+ * handshake as rtoken of @conn. Returns 0 on success or the negative errno
+ * of smc_rtoken_add().
+ */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_link *lnk,
+			    struct smc_clc_msg_accept_confirm *clc)
+{
+	conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
+					  clc->r0.rmb_rkey);
+	return conn->rtoken_idx < 0 ? conn->rtoken_idx : 0;
+}
+
+/* Flag all ports of all registered IB devices and all SMC-D devices as
+ * going away; each device list is walked under its mutex.
+ */
+static void smc_core_going_away(void)
+{
+	struct smc_ib_device *ibdev;
+	struct smcd_dev *ismdev;
+	int port;
+
+	mutex_lock(&smc_ib_devices.mutex);
+	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+		for (port = 0; port < SMC_MAX_PORTS; port++)
+			set_bit(port, ibdev->ports_going_away);
+	}
+	mutex_unlock(&smc_ib_devices.mutex);
+
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(ismdev, &smcd_dev_list.list, list)
+		ismdev->going_away = 1;
+	mutex_unlock(&smcd_dev_list.mutex);
+}
+
+/* Clean up all SMC link groups: flag all devices as going away, then
+ * terminate the SMC-R link groups and the link groups of every SMC-D device.
+ */
+static void smc_lgrs_shutdown(void)
+{
+	struct smcd_dev *ismdev;
+
+	smc_core_going_away();
+
+	/* NULL is passed as the device argument of smc_smcr_terminate_all() */
+	smc_smcr_terminate_all(NULL);
+
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
+		smc_smcd_terminate_all(ismdev);
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+}
+
+/* Reboot notifier callback: shut down all link groups, unregister the IB
+ * client and leave the ISM layer.
+ * @this, @event and @ptr are not evaluated.
+ * Returns NOTIFY_DONE, the named notifier return code for the value 0.
+ */
+static int smc_core_reboot_event(struct notifier_block *this,
+				 unsigned long event, void *ptr)
+{
+	smc_lgrs_shutdown();
+	smc_ib_unregister_client();
+	smc_ism_exit();
+	return NOTIFY_DONE;
+}
+
+/* notifier block that calls smc_core_reboot_event(); it is registered in
+ * smc_core_init() and unregistered in smc_core_exit()
+ */
+static struct notifier_block smc_reboot_notifier = {
+ .notifier_call = smc_core_reboot_event,
+};
+
+/* Register the reboot notifier of the SMC core.
+ * Returns the result of register_reboot_notifier().
+ */
+int __init smc_core_init(void)
+{
+	int rc;
+
+	rc = register_reboot_notifier(&smc_reboot_notifier);
+	return rc;
+}
+
+/* Module removal (called from smc_exit): drop the reboot notifier first,
+ * then shut down all link groups.
+ */
+void smc_core_exit(void)
+{
+	unregister_reboot_notifier(&smc_reboot_notifier);
+
+	smc_lgrs_shutdown();
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000..120027d40
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,596 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Connections, Link Groups and Links
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <linux/atomic.h>
+#include <linux/smc.h>
+#include <linux/pci.h>
+#include <rdma/ib_verbs.h>
+#include <net/genetlink.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
+#define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */
+#define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group,
+ * also is the default value for SMC-R v1 and v2.0
+ */
+#define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for
+ * SMC-R v2.1 and later negotiation, vendors or
+ * distrubutions may modify it to a value between
+ * 16-255 as needed.
+ */
+
+struct smc_lgr_list { /* list of link group definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+ u32 num; /* unique link group number */
+};
+
+enum smc_lgr_role { /* possible roles of a link group */
+ SMC_CLNT, /* client */
+ SMC_SERV /* server */
+};
+
+enum smc_link_state { /* possible states of a link */
+ SMC_LNK_UNUSED, /* link is unused */
+ SMC_LNK_INACTIVE, /* link is inactive */
+ SMC_LNK_ACTIVATING, /* link is being activated */
+ SMC_LNK_ACTIVE, /* link is active */
+};
+
+#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */
+
+struct smc_wr_buf {
+ u8 raw[SMC_WR_BUF_SIZE];
+};
+
+struct smc_wr_v2_buf {
+ u8 raw[SMC_WR_BUF_V2_SIZE];
+};
+
+#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
+
+enum smc_wr_reg_state {
+ POSTED, /* ib_wr_reg_mr request posted */
+ CONFIRMED, /* ib_wr_reg_mr response: successful */
+ FAILED /* ib_wr_reg_mr response: failure */
+};
+
+struct smc_rdma_sge { /* sges for RDMA writes */
+ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
+};
+
+#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per
+ * message send
+ */
+
+struct smc_rdma_sges { /* sges per message send */
+ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES];
+};
+
+struct smc_rdma_wr { /* work requests per message
+ * send
+ */
+ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
+};
+
+#define SMC_LGR_ID_SIZE 4
+
+struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
+ struct ib_pd *roce_pd; /* IB protection domain,
+ * unique for every RoCE QP
+ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
+ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
+ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ struct completion *wr_tx_compl; /* WR send CQE completion */
+ /* above four vectors have wr_tx_cnt elements and use the same index */
+ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */
+ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/
+ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */
+ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/
+ atomic_long_t wr_tx_id; /* seq # of last sent WR */
+ unsigned long *wr_tx_mask; /* bit mask of used indexes */
+ u32 wr_tx_cnt; /* number of WR send buffers */
+ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+ struct {
+ struct percpu_ref wr_tx_refs;
+ } ____cacheline_aligned_in_smp;
+ struct completion tx_ref_comp;
+
+ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
+ /* above three vectors have wr_rx_cnt elements and use the same index */
+ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
+ u64 wr_rx_id; /* seq # of last recv WR */
+ u64 wr_rx_id_compl; /* seq # of last completed WR */
+ u32 wr_rx_cnt; /* number of WR recv buffers */
+ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
+ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */
+
+ struct ib_reg_wr wr_reg; /* WR register memory region */
+ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
+ struct {
+ struct percpu_ref wr_reg_refs;
+ } ____cacheline_aligned_in_smp;
+ struct completion reg_ref_comp;
+ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
+
+ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
+ u8 sgid_index; /* gid index for vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+ enum ib_mtu peer_mtu; /* mtu size of peer */
+ u32 psn_initial; /* QP tx initial packet seqno */
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
+ u8 link_id; /* unique # within link group */
+ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
+ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
+ u8 link_idx; /* index in lgr link array */
+ u8 link_is_asym; /* is link asymmetric? */
+ u8 clearing : 1; /* link is being cleared */
+ refcount_t refcnt; /* link reference count */
+ struct smc_link_group *lgr; /* parent link group */
+ struct work_struct link_down_wrk; /* wrk to bring link down */
+ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */
+ int ndev_ifidx; /* network device ifindex */
+
+ enum smc_link_state state; /* state of link */
+ struct delayed_work llc_testlink_wrk; /* testlink worker */
+ struct completion llc_testlink_resp; /* wait for rx of testlink */
+ int llc_testlink_time; /* testlink interval */
+ atomic_t conn_cnt; /* connections on this link */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX 3
+#define SMC_SINGLE_LINK 0
+#define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */
+#define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the
+ * default value for smc-r v1.0 and v2.0
+ */
+#define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for
+ * SMC-R v2.1 and later negotiation, vendors or
+ * distributions may modify it to a value between
+ * 1-2 as needed.
+ */
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+ struct list_head list;
+ void *cpu_addr; /* virtual address of buffer */
+ struct page *pages;
+ int len; /* length of buffer */
+ u32 used; /* currently used / unused */
+ union {
+ struct { /* SMC-R */
+ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
+ /* virtual buffer */
+ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX];
+ /* memory region: for rmb and
+ * vzalloced sndbuf
+ * incl. rkey provided to peer
+ * and lkey provided to local
+ */
+ u32 order; /* allocation order */
+
+ u8 is_conf_rkey;
+ /* confirm_rkey done */
+ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX];
+ /* mem region registered */
+ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX];
+ /* mem region mapped to lnk */
+ u8 is_dma_need_sync;
+ u8 is_reg_err;
+ /* buffer registration err */
+ u8 is_vm;
+ /* virtually contiguous */
+ };
+ struct { /* SMC-D */
+ unsigned short sba_idx;
+ /* SBA index number */
+ u64 token;
+ /* DMB token number */
+ dma_addr_t dma_addr;
+ /* DMA address */
+ };
+ };
+};
+
+struct smc_rtoken { /* address/key of remote RMB */
+ u64 dma_addr;
+ u32 rkey;
+};
+
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+struct smcd_dev;
+
+enum smc_lgr_type { /* redundancy state of lgr */
+ SMC_LGR_NONE, /* no active links, lgr to be deleted */
+ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */
+ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */
+ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */
+ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */
+};
+
+enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */
+ SMCR_PHYS_CONT_BUFS = 0,
+ SMCR_VIRT_CONT_BUFS = 1,
+ SMCR_MIXED_BUFS = 2,
+};
+
+enum smc_llc_flowtype {
+ SMC_LLC_FLOW_NONE = 0,
+ SMC_LLC_FLOW_ADD_LINK = 2,
+ SMC_LLC_FLOW_DEL_LINK = 4,
+ SMC_LLC_FLOW_REQ_ADD_LINK = 5,
+ SMC_LLC_FLOW_RKEY = 6,
+};
+
+struct smc_llc_qentry;
+
+struct smc_llc_flow {
+ enum smc_llc_flowtype type;
+ struct smc_llc_qentry *qentry;
+};
+
+struct smc_link_group {
+ struct list_head list;
+ struct rb_root conns_all; /* connection tree */
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
+
+ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ struct rw_semaphore sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ struct rw_semaphore rmbs_lock; /* protects rx buffers */
+
+ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
+ struct delayed_work free_work; /* delayed freeing of an lgr */
+ struct work_struct terminate_work; /* abnormal lgr termination */
+ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
+ u8 sync_err : 1; /* lgr no longer fits to peer */
+ u8 terminating : 1;/* lgr is terminating */
+ u8 freeing : 1; /* lgr is being freed */
+
+ refcount_t refcnt; /* lgr reference count */
+ bool is_smcd; /* SMC-R or SMC-D */
+ u8 smc_version;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
+ u8 peer_os; /* peer operating system */
+ u8 peer_smc_release;
+ u8 peer_hostname[SMC_MAX_HOSTNAME_LEN];
+ union {
+ struct { /* SMC-R */
+ enum smc_lgr_role role;
+ /* client or server */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
+ /* smc link */
+ struct smc_wr_v2_buf *wr_rx_buf_v2;
+ /* WR v2 recv payload buffer */
+ struct smc_wr_v2_buf *wr_tx_buf_v2;
+ /* WR v2 send payload buffer */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
+ /* used rtoken elements */
+ u8 next_link_id;
+ enum smc_lgr_type type;
+ enum smcr_buf_type buf_type;
+ /* redundancy state */
+ u8 pnet_id[SMC_MAX_PNETID_LEN + 1];
+ /* pnet id of this lgr */
+ struct list_head llc_event_q;
+ /* queue for llc events */
+ spinlock_t llc_event_q_lock;
+ /* protects llc_event_q */
+ struct rw_semaphore llc_conf_mutex;
+ /* protects lgr reconfig. */
+ struct work_struct llc_add_link_work;
+ struct work_struct llc_del_link_work;
+ struct work_struct llc_event_work;
+ /* llc event worker */
+ wait_queue_head_t llc_flow_waiter;
+ /* w4 next llc event */
+ wait_queue_head_t llc_msg_waiter;
+ /* w4 next llc msg */
+ struct smc_llc_flow llc_flow_lcl;
+ /* llc local control field */
+ struct smc_llc_flow llc_flow_rmt;
+ /* llc remote control field */
+ struct smc_llc_qentry *delayed_event;
+ /* arrived when flow active */
+ spinlock_t llc_flow_lock;
+ /* protects llc flow */
+ int llc_testlink_time;
+ /* link keep alive time */
+ u32 llc_termination_rsn;
+ /* rsn code for termination */
+ u8 nexthop_mac[ETH_ALEN];
+ u8 uses_gateway;
+ __be32 saddr;
+ /* net namespace */
+ struct net *net;
+ u8 max_conns;
+ /* max conn can be assigned to lgr */
+ u8 max_links;
+ /* max links can be added in lgr */
+ };
+ struct { /* SMC-D */
+ u64 peer_gid;
+ /* Peer GID (remote) */
+ struct smcd_dev *smcd;
+ /* ISM device for VLAN reg. */
+ u8 peer_shutdown : 1;
+ /* peer triggered shutdown */
+ };
+ };
+};
+
+struct smc_clc_msg_local;
+
+#define GID_LIST_SIZE 2
+
+struct smc_gidlist {
+ u8 len;
+ u8 list[GID_LIST_SIZE][SMC_GID_SIZE];
+};
+
+struct smc_init_info_smcrv2 {
+ /* Input fields */
+ __be32 saddr;
+ struct sock *clc_sk;
+ __be32 daddr;
+
+ /* Output fields when saddr is set */
+ struct smc_ib_device *ib_dev_v2;
+ u8 ib_port_v2;
+ u8 ib_gid_v2[SMC_GID_SIZE];
+
+ /* Additional output fields when clc_sk and daddr is set as well */
+ u8 uses_gateway;
+ u8 nexthop_mac[ETH_ALEN];
+
+ struct smc_gidlist gidlist;
+};
+
+struct smc_init_info {
+ u8 is_smcd;
+ u8 smc_type_v1;
+ u8 smc_type_v2;
+ u8 release_nr;
+ u8 max_conns;
+ u8 max_links;
+ u8 first_contact_peer;
+ u8 first_contact_local;
+ unsigned short vlan_id;
+ u32 rc;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
+ /* SMC-R */
+ u8 smcr_version;
+ u8 check_smcrv2;
+ u8 peer_gid[SMC_GID_SIZE];
+ u8 peer_mac[ETH_ALEN];
+ u8 peer_systemid[SMC_SYSTEMID_LEN];
+ struct smc_ib_device *ib_dev;
+ u8 ib_gid[SMC_GID_SIZE];
+ u8 ib_port;
+ u32 ib_clcqpn;
+ struct smc_init_info_smcrv2 smcrv2;
+ /* SMC-D */
+ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
+ struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1];
+ u16 ism_chid[SMC_MAX_ISM_DEVS + 1];
+ u8 ism_offered_cnt; /* # of ISM devices offered */
+ u8 ism_selected; /* index of selected ISM dev*/
+ u8 smcd_version;
+};
+
+/* Look up the connection that belongs to an alert token in a link group.
+ * The connection tree is an rbtree keyed by alert_token_local, so the search
+ * loop is open-coded here.
+ * Requires @conns_lock
+ * @token alert token to search for
+ * @lgr link group to search in
+ * Returns the connection associated with @token, or NULL if there is none.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+	u32 token, struct smc_link_group *lgr)
+{
+	struct rb_node *pos = lgr->conns_all.rb_node;
+	struct smc_connection *conn;
+
+	while (pos) {
+		conn = rb_entry(pos, struct smc_connection, alert_node);
+		if (conn->alert_token_local == token)
+			return conn;
+		/* smaller tokens are found to the left, larger to the right */
+		if (conn->alert_token_local > token)
+			pos = pos->rb_left;
+		else
+			pos = pos->rb_right;
+	}
+
+	return NULL;
+}
+
+/* Returns true if @conn has a link group and a non-zero local alert token */
+static inline bool smc_conn_lgr_valid(struct smc_connection *conn)
+{
+	if (!conn->lgr)
+		return false;
+	return conn->alert_token_local != 0;
+}
+
+/*
+ * Returns true if the specified link is usable, i.e. its state is neither
+ * SMC_LNK_UNUSED nor SMC_LNK_INACTIVE.
+ *
+ * A usable link is ready to receive RDMA messages, to have memory mapped
+ * on it, etc. This does not ensure that RDMA messages can be sent on the
+ * link; if sending is needed, use smc_link_sendable().
+ */
+static inline bool smc_link_usable(struct smc_link *lnk)
+{
+	return lnk->state != SMC_LNK_UNUSED &&
+	       lnk->state != SMC_LNK_INACTIVE;
+}
+
+/*
+ * Returns true if the specified link is ready to receive AND send RDMA
+ * messages.
+ *
+ * On the client side in first contact, the underlying QP may still be in
+ * RESET or RTR while the link state is ACTIVATING, so the check in
+ * smc_link_usable() is not strong enough. Places that need to send CDC or
+ * LLC messages use smc_link_sendable(); all others use smc_link_usable().
+ */
+static inline bool smc_link_sendable(struct smc_link *lnk)
+{
+	if (!smc_link_usable(lnk))
+		return false;
+	return lnk->qp_attr.cur_qp_state == IB_QPS_RTS;
+}
+
+/* Returns true if the state of @lnk is SMC_LNK_ACTIVE */
+static inline bool smc_link_active(struct smc_link *lnk)
+{
+	enum smc_link_state state = lnk->state;
+
+	return state == SMC_LNK_ACTIVE;
+}
+
+/* Format the raw GID @gid_raw into @buf as eight colon-separated groups of
+ * four hex digits, each group taken from one big-endian 16-bit word.
+ * @buf must have room for the 39 characters plus the terminating NUL.
+ */
+static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+	__be16 *grp = (__be16 *)gid_raw;
+
+	sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+		be16_to_cpu(grp[0]), be16_to_cpu(grp[1]),
+		be16_to_cpu(grp[2]), be16_to_cpu(grp[3]),
+		be16_to_cpu(grp[4]), be16_to_cpu(grp[5]),
+		be16_to_cpu(grp[6]), be16_to_cpu(grp[7]));
+}
+
+/* PCI identification of a device, filled by smc_set_pci_values() */
+struct smc_pci_dev {
+ __u32 pci_fid; /* zdev->fid, set on s390 only */
+ __u16 pci_pchid; /* zdev->pchid, set on s390 only */
+ __u16 pci_vendor; /* pci_dev->vendor */
+ __u16 pci_device; /* pci_dev->device */
+ __u8 pci_id[SMC_PCI_ID_STR_LEN]; /* pci_name() of the device */
+};
+
+/* Copy the PCI vendor id, device id and name of @pci_dev into @smc_dev;
+ * on s390 the function id and pchid of the zPCI device are copied as well.
+ */
+static inline void smc_set_pci_values(struct pci_dev *pci_dev,
+				      struct smc_pci_dev *smc_dev)
+{
+#if IS_ENABLED(CONFIG_S390)
+	struct zpci_dev *zdev = to_zpci(pci_dev);
+#endif
+
+	smc_dev->pci_vendor = pci_dev->vendor;
+	smc_dev->pci_device = pci_dev->device;
+	snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s",
+		 pci_name(pci_dev));
+#if IS_ENABLED(CONFIG_S390)
+	/* Set s390 specific PCI information */
+	smc_dev->pci_fid = zdev->fid;
+	smc_dev->pci_pchid = zdev->pchid;
+#endif
+}
+
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
+void smc_lgr_cleanup_early(struct smc_link_group *lgr);
+void smc_lgr_terminate_sched(struct smc_link_group *lgr);
+void smc_lgr_hold(struct smc_link_group *lgr);
+void smc_lgr_put(struct smc_link_group *lgr);
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
+ unsigned short vlan);
+void smc_smcd_terminate_all(struct smcd_dev *dev);
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
+int smc_buf_create(struct smc_sock *smc, bool is_smcd);
+int smc_uncompress_bufsize(u8 compressed);
+int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc);
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+ __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+ __be64 nw_vaddr, __be32 nw_rkey);
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
+
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
+int smc_core_init(void);
+void smc_core_exit(void);
+
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini);
+void smcr_link_clear(struct smc_link *lnk, bool log);
+void smcr_link_hold(struct smc_link *lnk);
+void smcr_link_put(struct smc_link *lnk);
+void smc_switch_link_and_count(struct smc_connection *conn,
+ struct smc_link *to_lnk);
+int smcr_buf_map_lgr(struct smc_link *lnk);
+int smcr_buf_reg_lgr(struct smc_link *lnk);
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx);
+int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc);
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err);
+void smcr_link_down_cond(struct smc_link *lnk);
+void smcr_link_down_cond_sched(struct smc_link *lnk);
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
+
+/* Returns the parent link group of @link */
+static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
+{
+	struct smc_link_group *lgr = link->lgr;
+
+	return lgr;
+}
+#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000..37833b96b
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+/* dump state overlaid on the ctx area of the netlink callback */
+struct smc_diag_dump_ctx {
+ int pos[2]; /* per p_type: socket count reached by smc_diag_dump_proto() */
+};
+
+/* Returns the ctx area of @cb interpreted as SMC diag dump context */
+static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
+{
+	void *raw = cb->ctx;
+
+	return raw;
+}
+
+/* Zero @r and fill in the family and cookie of @sk. If the SMC socket has a
+ * clcsock, ports, bound interface and addresses are taken from the sock of
+ * that clcsock.
+ */
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+	struct smc_sock *smc = smc_sk(sk);
+	struct sock *clcsk;
+
+	memset(r, 0, sizeof(*r));
+	r->diag_family = sk->sk_family;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+	if (!smc->clcsock)
+		return;
+
+	clcsk = smc->clcsock->sk;
+	r->id.idiag_sport = htons(clcsk->sk_num);
+	r->id.idiag_dport = clcsk->sk_dport;
+	r->id.idiag_if = clcsk->sk_bound_dev_if;
+
+	switch (sk->sk_protocol) {
+	case SMCPROTO_SMC:
+		r->id.idiag_src[0] = clcsk->sk_rcv_saddr;
+		r->id.idiag_dst[0] = clcsk->sk_daddr;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case SMCPROTO_SMC6:
+		memcpy(&r->id.idiag_src, &clcsk->sk_v6_rcv_saddr,
+		       sizeof(clcsk->sk_v6_rcv_saddr));
+		memcpy(&r->id.idiag_dst, &clcsk->sk_v6_daddr,
+		       sizeof(clcsk->sk_v6_daddr));
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+/* Add the SMC_DIAG_SHUTDOWN attribute to @skb and store uid and inode of
+ * @sk in @r. Returns 0 on success, 1 if the attribute could not be added.
+ */
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+				   struct smc_diag_msg *r,
+				   struct user_namespace *user_ns)
+{
+	int err = nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown);
+
+	if (err)
+		return 1;
+
+	r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->diag_inode = sock_i_ino(sk);
+	return 0;
+}
+
+/* Add one netlink message describing socket @sk to @skb.
+ * The message always carries the common smc_diag_msg data, the shutdown
+ * attribute and the fallback attribute. The CONNINFO, LGRINFO and DMBINFO
+ * attributes are added only if the matching bit is set in req->diag_ext and
+ * the connection state checked below allows it.
+ * @bc is not evaluated in this function.
+ * Returns 0 on success or -EMSGSIZE if the message does not fit; in that
+ * case a started message is cancelled again.
+ */
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct smc_diag_req *req,
+ struct nlattr *bc)
+{
+ struct smc_sock *smc = smc_sk(sk);
+ struct smc_diag_fallback fallback;
+ struct user_namespace *user_ns;
+ struct smc_diag_msg *r;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ smc_diag_msg_common_fill(r, sk);
+ r->diag_state = sk->sk_state;
+ /* mode: TCP fallback, SMC-D, or SMC-R for everything else */
+ if (smc->use_fallback)
+ r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
+ else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd)
+ r->diag_mode = SMC_DIAG_MODE_SMCD;
+ else
+ r->diag_mode = SMC_DIAG_MODE_SMCR;
+ user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+ if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+ goto errout;
+
+ fallback.reason = smc->fallback_rsn;
+ fallback.peer_diagnosis = smc->peer_diagnosis;
+ if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
+ goto errout;
+
+ /* connection info: requested and a local alert token is set */
+ if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
+ smc->conn.alert_token_local) {
+ struct smc_connection *conn = &smc->conn;
+ struct smc_diag_conninfo cinfo = {
+ .token = conn->alert_token_local,
+ .sndbuf_size = conn->sndbuf_desc ?
+ conn->sndbuf_desc->len : 0,
+ .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
+ .peer_rmbe_size = conn->peer_rmbe_size,
+
+ .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+ .rx_prod.count = conn->local_rx_ctrl.prod.count,
+ .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+ .rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+ .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+ .tx_prod.count = conn->local_tx_ctrl.prod.count,
+ .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+ .tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+ .tx_prod_flags =
+ *(u8 *)&conn->local_tx_ctrl.prod_flags,
+ .tx_conn_state_flags =
+ *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+ .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+ .rx_conn_state_flags =
+ *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+ .tx_prep.wrap = conn->tx_curs_prep.wrap,
+ .tx_prep.count = conn->tx_curs_prep.count,
+ .tx_sent.wrap = conn->tx_curs_sent.wrap,
+ .tx_sent.count = conn->tx_curs_sent.count,
+ .tx_fin.wrap = conn->tx_curs_fin.wrap,
+ .tx_fin.count = conn->tx_curs_fin.count,
+ };
+
+ if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+ goto errout;
+ }
+
+ /* link group info: SMC-R only, reports the link of the connection */
+ if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list)) {
+ struct smc_link *link = smc->conn.lnk;
+
+ struct smc_diag_lgrinfo linfo = {
+ .role = smc->conn.lgr->role,
+ .lnk[0].ibport = link->ibport,
+ .lnk[0].link_id = link->link_id,
+ };
+
+ memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name,
+ sizeof(link->smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid, link->gid);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid);
+
+ if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+ goto errout;
+ }
+ /* DMB info: SMC-D only, and only if the connection has an rmb_desc */
+ if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo;
+ struct smcd_dev *smcd = conn->lgr->smcd;
+
+ memset(&dinfo, 0, sizeof(dinfo));
+
+ /* the leading bytes of the lgr id are read as one u32 */
+ dinfo.linkid = *((u32 *)conn->lgr->id);
+ dinfo.peer_gid = conn->lgr->peer_gid;
+ dinfo.my_gid = smcd->ops->get_local_gid(smcd);
+ dinfo.token = conn->rmb_desc->token;
+ dinfo.peer_token = conn->peer_token;
+
+ if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
+ goto errout;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Dump the sockets in the hash table of @prot that belong to the net
+ * namespace of the requesting socket. Sockets below the position saved in
+ * the dump context for @p_type are counted but not dumped again. The
+ * position reached is stored back into the context.
+ * Returns 0, or the negative result of __smc_diag_dump() that stopped
+ * the walk.
+ */
+static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
+			       struct netlink_callback *cb, int p_type)
+{
+	struct smc_diag_dump_ctx *ctx = smc_dump_context(cb);
+	struct net *net = sock_net(skb->sk);
+	int start = ctx->pos[p_type];
+	struct nlattr *bc = NULL;
+	struct hlist_head *head;
+	int seen = 0, rc = 0;
+	struct sock *sk;
+
+	read_lock(&prot->h.smc_hash->lock);
+	head = &prot->h.smc_hash->ht;
+	if (!hlist_empty(head)) {
+		sk_for_each(sk, head) {
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (seen >= start) {
+				rc = __smc_diag_dump(sk, skb, cb,
+						     nlmsg_data(cb->nlh), bc);
+				if (rc < 0)
+					break;
+			}
+			seen++;
+		}
+	}
+	read_unlock(&prot->h.smc_hash->lock);
+
+	ctx->pos[p_type] = seen;
+	return rc;
+}
+
+/* Netlink dump callback: dump the SMC sockets, followed by the SMC6 sockets
+ * if the first dump returned 0. Always returns skb->len.
+ */
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	if (!smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC))
+		smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
+	return skb->len;
+}
+
+/* sock_diag dump handler: start a netlink dump using smc_diag_dump() for
+ * SOCK_DIAG_BY_FAMILY requests that carry NLM_F_DUMP. Any other request
+ * returns 0 without doing anything.
+ */
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+	struct netlink_dump_control c = {
+		.dump = smc_diag_dump,
+		.min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+	};
+	struct net *net = sock_net(skb->sk);
+
+	if (h->nlmsg_type != SOCK_DIAG_BY_FAMILY ||
+	    !(h->nlmsg_flags & NLM_F_DUMP))
+		return 0;
+
+	return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+}
+
+/* sock_diag handler for AF_SMC; registered in smc_diag_init() and
+ * unregistered in smc_diag_exit()
+ */
+static const struct sock_diag_handler smc_diag_handler = {
+ .family = AF_SMC,
+ .dump = smc_diag_handler_dump,
+};
+
+/* Module init: register the AF_SMC sock_diag handler */
+static int __init smc_diag_init(void)
+{
+	int rc;
+
+	rc = sock_diag_register(&smc_diag_handler);
+	return rc;
+}
+
+/* Module exit: unregister the AF_SMC sock_diag handler again */
+static void __exit smc_diag_exit(void)
+{
+	sock_diag_unregister(&smc_diag_handler);
+	return;
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
+MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000..89981dbe4
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,1018 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * IB infrastructure:
+ * Establish SMC-R as an Infiniband Client to be notified about added and
+ * removed IB devices of type RDMA.
+ * Determine device and port characteristics for these IB devices.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <linux/scatterlist.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+#include "smc_netlink.h"
+
+#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
+
+#define SMC_QP_MIN_RNR_TIMER 5
+#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+/* Move the link's RoCE queue pair to the INIT state, bound to the link's
+ * ib port with pkey index 0 and local/remote write access.
+ * Returns the result of ib_modify_qp().
+ */
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+	const int attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX |
+			      IB_QP_ACCESS_FLAGS | IB_QP_PORT;
+	struct ib_qp_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.qp_state = IB_QPS_INIT;
+	attr.pkey_index = 0;
+	attr.port_num = lnk->ibport;
+	attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
+
+	return ib_modify_qp(lnk->roce_qp, &attr, attr_mask);
+}
+
+/* Move the link's RoCE queue pair to the RTR (ready to receive) state.
+ * The address handle targets the peer gid; for SMC_V2 link groups that use
+ * a gateway the next hop MAC and the default IPv6 hop limit are used,
+ * otherwise the peer MAC and a hop limit of 1.
+ * Returns the result of ib_modify_qp().
+ */
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+	enum ib_qp_attr_mask attr_mask =
+		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+	bool via_gateway = lnk->lgr->smc_version == SMC_V2 &&
+			   lnk->lgr->uses_gateway;
+	struct ib_qp_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.qp_state = IB_QPS_RTR;
+	attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+
+	/* address handle towards the peer */
+	attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
+	rdma_ah_set_port_num(&attr.ah_attr, lnk->ibport);
+	rdma_ah_set_grh(&attr.ah_attr, NULL, 0, lnk->sgid_index,
+			via_gateway ? IPV6_DEFAULT_HOPLIMIT : 1, 0);
+	rdma_ah_set_dgid_raw(&attr.ah_attr, lnk->peer_gid);
+	if (via_gateway)
+		memcpy(&attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
+		       sizeof(lnk->lgr->nexthop_mac));
+	else
+		memcpy(&attr.ah_attr.roce.dmac, lnk->peer_mac,
+		       sizeof(lnk->peer_mac));
+
+	attr.dest_qp_num = lnk->peer_qpn;
+	/* starting receive packet seq # */
+	attr.rq_psn = lnk->peer_psn;
+	/* max # of resources for incoming requests */
+	attr.max_dest_rd_atomic = 1;
+	attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+	return ib_modify_qp(lnk->roce_qp, &attr, attr_mask);
+}
+
+/* Move the link's RoCE queue pair to the RTS (ready to send) state using
+ * the SMC_QP_* timeout/retry settings and the link's initial send PSN.
+ * Returns the result of ib_modify_qp().
+ */
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+	const int attr_mask = IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+			      IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+			      IB_QP_MAX_QP_RD_ATOMIC;
+	struct ib_qp_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.qp_state = IB_QPS_RTS;
+	/* local ack timeout */
+	attr.timeout = SMC_QP_TIMEOUT;
+	/* retry count */
+	attr.retry_cnt = SMC_QP_RETRY_CNT;
+	/* RNR retries, 7=infinite */
+	attr.rnr_retry = SMC_QP_RNR_RETRY;
+	/* starting send packet seq # */
+	attr.sq_psn = lnk->psn_initial;
+	/* # of outstanding RDMA reads and atomic ops allowed */
+	attr.max_rd_atomic = 1;
+
+	return ib_modify_qp(lnk->roce_qp, &attr, attr_mask);
+}
+
+int smc_ib_modify_qp_error(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+/* Bring a link's queue pair into service: INIT -> RTR, arm the receive
+ * completion queue, post the initial receive requests and, when the link
+ * group acts as server, continue to RTS. The queue pair attributes are
+ * remembered after each state change.
+ * Returns 0 on success or the first non-zero return code encountered.
+ */
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+	struct smc_link_group *lgr = smc_get_lgr(lnk);
+	int rc;
+
+	rc = smc_ib_modify_qp_init(lnk);
+	if (rc)
+		return rc;
+
+	rc = smc_ib_modify_qp_rtr(lnk);
+	if (rc)
+		return rc;
+	smc_wr_remember_qp_attr(lnk);
+
+	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+			      IB_CQ_SOLICITED_MASK);
+	if (rc)
+		return rc;
+
+	rc = smc_wr_rx_post_init(lnk);
+	if (rc)
+		return rc;
+	smc_wr_remember_qp_attr(lnk);
+
+	if (lgr->role != SMC_SERV)
+		return 0;
+
+	rc = smc_ib_modify_qp_rts(lnk);
+	if (rc)
+		return rc;
+	smc_wr_remember_qp_attr(lnk);
+
+	return 0;
+}
+
+/* Read the L2 (MAC) address belonging to gid index 0 of @ibport into
+ * smcibdev->mac[ibport - 1].
+ * Returns -ENODEV if the gid attribute cannot be obtained, otherwise the
+ * result of rdma_read_gid_l2_fields().
+ */
+static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	const struct ib_gid_attr *gid0;
+	int rc;
+
+	gid0 = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
+	if (IS_ERR(gid0))
+		return -ENODEV;
+
+	rc = rdma_read_gid_l2_fields(gid0, NULL, smcibdev->mac[ibport - 1]);
+	rdma_put_gid_attr(gid0);
+
+	return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The identifier consists of a random 2-byte number followed by the MAC
+ * address of the first active registered IB device; this function fills in
+ * the MAC part from port @ibport of @smcibdev.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+						u8 ibport)
+{
+	u8 *mac_part = &local_systemid[2];
+
+	memcpy(mac_part, &smcibdev->mac[ibport - 1],
+	       sizeof(smcibdev->mac[ibport - 1]));
+}
+
+/* The local system id counts as valid once its MAC part is non-zero */
+bool smc_ib_is_valid_local_systemid(void)
+{
+	const u8 *mac_part = &local_systemid[2];
+
+	return !is_zero_ether_addr(mac_part);
+}
+
+/* Fill the leading two bytes of the local system id with random data */
+static void smc_ib_init_local_systemid(void)
+{
+	get_random_bytes(local_systemid, 2);
+}
+
+/* Check the remembered port attributes: is port @ibport (1-based) active? */
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	const struct ib_port_attr *pattr = &smcibdev->pattr[ibport - 1];
+
+	return pattr->state == IB_PORT_ACTIVE;
+}
+
+/* Resolve the IPv4 route from @saddr to @daddr in @net and report the next
+ * hop.
+ * On success the next hop's hardware address is copied to @nexthop_mac
+ * (ETH_ALEN bytes) and *@uses_gateway is set from the route.
+ * Both the route and the neighbour entry are only needed for the lookup,
+ * so their references are dropped again before returning.
+ * Returns 0 on success, -ENOENT if @daddr is INADDR_NONE, no route exists,
+ * the gateway is not an IPv4 gateway, or no neighbour entry is found.
+ */
+int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr,
+		      u8 nexthop_mac[], u8 *uses_gateway)
+{
+	struct neighbour *neigh = NULL;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4 = {
+		.saddr = saddr,
+		.daddr = daddr
+	};
+
+	if (daddr == cpu_to_be32(INADDR_NONE))
+		goto out;
+	rt = ip_route_output_flow(net, &fl4, NULL);
+	if (IS_ERR(rt))
+		goto out;
+	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
+		goto out_rt;
+	/* dst_neigh_lookup() maps an ERR_PTR result to NULL */
+	neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr);
+	if (!neigh)
+		goto out_rt;
+	memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
+	*uses_gateway = rt->rt_uses_gateway;
+	neigh_release(neigh);
+	ip_rt_put(rt);
+	return 0;
+
+out_rt:
+	ip_rt_put(rt);
+out:
+	return -ENOENT;
+}
+
+/* Decide whether the gid described by @attr is usable and, if so, hand it
+ * out via @gid and @sgid_index (either may be NULL).
+ * Without @smcrv2 only gids of type IB_GID_TYPE_ROCE are accepted.
+ * With @smcrv2 the gid must be of type IB_GID_TYPE_ROCE_UDP_ENCAP and carry
+ * an IPv4 address, smcrv2->saddr must match one of the addresses configured
+ * on @ndev, and, if smcrv2->daddr is set, a route to it must be found.
+ * Returns 0 if the gid is usable, -ENODEV otherwise.
+ */
+static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
+				    const struct ib_gid_attr *attr,
+				    u8 gid[], u8 *sgid_index,
+				    struct smc_init_info_smcrv2 *smcrv2)
+{
+	if (!smcrv2) {
+		if (attr->gid_type != IB_GID_TYPE_ROCE)
+			return -ENODEV;
+	} else {
+		const struct in_ifaddr *ifa;
+		struct in_device *in_dev;
+		bool same_subnet = false;
+
+		if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP ||
+		    smc_ib_gid_to_ipv4((u8 *)&attr->gid) ==
+		    cpu_to_be32(INADDR_NONE))
+			return -ENODEV;
+
+		in_dev = __in_dev_get_rcu(ndev);
+		if (!in_dev)
+			return -ENODEV;
+		in_dev_for_each_ifa_rcu(ifa, in_dev) {
+			if (inet_ifa_match(smcrv2->saddr, ifa)) {
+				same_subnet = true;
+				break;
+			}
+		}
+		if (!same_subnet)
+			return -ENODEV;
+
+		if (smcrv2->daddr &&
+		    smc_ib_find_route(dev_net(ndev), smcrv2->saddr,
+				      smcrv2->daddr, smcrv2->nexthop_mac,
+				      &smcrv2->uses_gateway))
+			return -ENODEV;
+	}
+
+	if (gid)
+		memcpy(gid, &attr->gid, SMC_GID_SIZE);
+	if (sgid_index)
+		*sgid_index = attr->index;
+	return 0;
+}
+
+/* determine the gid for an ib-device port and vlan id
+ *
+ * Walks the gid table of @ibport and picks the first entry whose net device
+ * matches @vlan_id (a non-vlan device for vlan id 0, else a vlan device with
+ * that id) and which smc_ib_determine_gid_rcu() accepts.
+ * Returns 0 if a gid was found, -ENODEV otherwise.
+ */
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+			 struct smc_init_info_smcrv2 *smcrv2)
+{
+	const struct ib_gid_attr *attr;
+	const struct net_device *ndev;
+	int idx;
+
+	for (idx = 0; idx < smcibdev->pattr[ibport - 1].gid_tbl_len; idx++) {
+		bool found = false;
+
+		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, idx);
+		if (IS_ERR(attr))
+			continue;
+
+		rcu_read_lock();
+		ndev = rdma_read_gid_attr_ndev_rcu(attr);
+		if (!IS_ERR(ndev)) {
+			bool vlan_ok;
+
+			if (vlan_id)
+				vlan_ok = is_vlan_dev(ndev) &&
+					  vlan_dev_vlan_id(ndev) == vlan_id;
+			else
+				vlan_ok = !is_vlan_dev(ndev);
+
+			if (vlan_ok &&
+			    !smc_ib_determine_gid_rcu(ndev, attr, gid,
+						      sgid_index, smcrv2))
+				found = true;
+		}
+		rcu_read_unlock();
+		rdma_put_gid_attr(attr);
+
+		if (found)
+			return 0;
+	}
+	return -ENODEV;
+}
+
+/* check if gid is still defined on smcibdev
+ *
+ * Searches the gid table of @ibport for an entry equal to @gid. Only
+ * entries of type IB_GID_TYPE_ROCE are considered for SMC-Rv1, and only
+ * non-link-local IB_GID_TYPE_ROCE_UDP_ENCAP entries for SMC-Rv2.
+ */
+static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
+				  struct smc_ib_device *smcibdev, u8 ibport)
+{
+	const struct ib_gid_attr *attr;
+	int idx;
+
+	for (idx = 0; idx < smcibdev->pattr[ibport - 1].gid_tbl_len; idx++) {
+		bool type_ok, found;
+
+		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, idx);
+		if (IS_ERR(attr))
+			continue;
+
+		rcu_read_lock();
+		if (smcrv2)
+			type_ok = attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+				  !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
+				    & IPV6_ADDR_LINKLOCAL);
+		else
+			type_ok = attr->gid_type == IB_GID_TYPE_ROCE;
+		found = type_ok && !memcmp(gid, &attr->gid, SMC_GID_SIZE);
+		rcu_read_unlock();
+		rdma_put_gid_attr(attr);
+
+		if (found)
+			return true;
+	}
+	return false;
+}
+
+/* check all links if the gid is still defined on smcibdev
+ *
+ * For every link group on the same pnetid as @ibport, each used link that
+ * runs on @smcibdev is checked; if its gid is no longer present on the
+ * port, smcr_port_err() is called for the port.
+ */
+static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct smc_link_group *lgr;
+	int idx;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+		/* lgr is not affected */
+		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+			    SMC_MAX_PNETID_LEN))
+			continue;
+		if (list_empty(&lgr->list))
+			continue;
+
+		for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+			struct smc_link *lnk = &lgr->lnk[idx];
+
+			if (lnk->state == SMC_LNK_UNUSED ||
+			    lnk->smcibdev != smcibdev)
+				continue;
+			if (smc_ib_check_link_gid(lnk->gid,
+						  lgr->smc_version == SMC_V2,
+						  smcibdev, ibport))
+				continue;
+			smcr_port_err(smcibdev, ibport);
+		}
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
+/* Refresh the cached attributes and MAC address of port @ibport and, if no
+ * valid local system id exists yet and the port is active, derive the
+ * system id from this port.
+ * Returns 0 on success or the error of ib_query_port()/smc_ib_fill_mac().
+ */
+static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct ib_port_attr *pattr = &smcibdev->pattr[ibport - 1];
+	int rc;
+
+	memset(pattr, 0, sizeof(*pattr));
+	rc = ib_query_port(smcibdev->ibdev, ibport, pattr);
+	if (rc)
+		return rc;
+
+	/* the SMC protocol requires specification of the RoCE MAC address */
+	rc = smc_ib_fill_mac(smcibdev, ibport);
+	if (rc)
+		return rc;
+
+	/* create unique system identifier */
+	if (!smc_ib_is_valid_local_systemid() &&
+	    smc_ib_port_active(smcibdev, ibport))
+		smc_ib_define_local_systemid(smcibdev, ibport);
+
+	return 0;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr
+ *
+ * Handles every port flagged in port_event_mask: the port attributes are
+ * re-read, then the port is either reported as failed or (re-)added and its
+ * link gids are verified, depending on whether the port is active.
+ */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+	struct smc_ib_device *smcibdev = container_of(
+		work, struct smc_ib_device, port_event_work);
+	u8 port_idx;
+
+	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+		u8 ibport = port_idx + 1;
+
+		smc_ib_remember_port_attr(smcibdev, ibport);
+		clear_bit(port_idx, &smcibdev->port_event_mask);
+
+		if (smc_ib_port_active(smcibdev, ibport)) {
+			clear_bit(port_idx, smcibdev->ports_going_away);
+			smcr_port_add(smcibdev, ibport);
+			smc_ib_gid_check(smcibdev, ibport);
+		} else {
+			set_bit(port_idx, smcibdev->ports_going_away);
+			smcr_port_err(smcibdev, ibport);
+		}
+	}
+}
+
+/* can be called in IRQ context
+ *
+ * Records the affected port(s) in port_event_mask and defers the actual
+ * processing to port_event_work. ports_going_away is used to schedule the
+ * work only on state transitions for fatal, port-active and port-error
+ * events; a gid change always schedules it.
+ */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+					struct ib_event *ibevent)
+{
+	struct smc_ib_device *smcibdev;
+	bool schedule = false;
+	u8 port_idx;
+
+	smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+
+	switch (ibevent->event) {
+	case IB_EVENT_DEVICE_FATAL:
+		/* terminate all ports on device */
+		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
+			set_bit(port_idx, &smcibdev->port_event_mask);
+			if (!test_and_set_bit(port_idx,
+					      smcibdev->ports_going_away))
+				schedule = true;
+		}
+		if (schedule)
+			schedule_work(&smcibdev->port_event_work);
+		return;
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_GID_CHANGE:
+		break;
+	default:
+		return;
+	}
+
+	/* single-port events */
+	port_idx = ibevent->element.port_num - 1;
+	if (port_idx >= SMC_MAX_PORTS)
+		return;
+	set_bit(port_idx, &smcibdev->port_event_mask);
+
+	if (ibevent->event == IB_EVENT_PORT_ACTIVE)
+		schedule = test_and_clear_bit(port_idx,
+					      smcibdev->ports_going_away);
+	else if (ibevent->event == IB_EVENT_PORT_ERR)
+		schedule = !test_and_set_bit(port_idx,
+					     smcibdev->ports_going_away);
+	else
+		schedule = true;
+
+	if (schedule)
+		schedule_work(&smcibdev->port_event_work);
+}
+
+/* Release the link's protection domain, if any, and forget the pointer */
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+	struct ib_pd *pd = lnk->roce_pd;
+
+	if (pd)
+		ib_dealloc_pd(pd);
+	lnk->roce_pd = NULL;
+}
+
+/* Allocate a protection domain on the link's ib device.
+ * On failure lnk->roce_pd is left NULL and the error code is returned;
+ * returns 0 on success.
+ */
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+	struct ib_pd *pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+
+	if (IS_ERR(pd)) {
+		lnk->roce_pd = NULL;
+		return PTR_ERR(pd);
+	}
+	lnk->roce_pd = pd;
+	return 0;
+}
+
+/* A device is reported as critical if it carries a used link of an SMC-R
+ * link group whose type is SMC_LGR_SINGLE or SMC_LGR_ASYMMETRIC_LOCAL.
+ */
+static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
+				      struct smc_ib_device *smcibdev)
+{
+	struct smc_link_group *lgr;
+	bool critical = false;
+	int idx;
+
+	spin_lock_bh(&smc_lgr->lock);
+	list_for_each_entry(lgr, &smc_lgr->list, list) {
+		if (lgr->is_smcd)
+			continue;
+		for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+			struct smc_link *lnk = &lgr->lnk[idx];
+
+			if (lnk->state == SMC_LNK_UNUSED ||
+			    lnk->smcibdev != smcibdev)
+				continue;
+			if (lgr->type == SMC_LGR_SINGLE ||
+			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
+				critical = true;
+				break;
+			}
+		}
+		if (critical)
+			break;
+	}
+	spin_unlock_bh(&smc_lgr->lock);
+
+	return critical;
+}
+
+/* Add one nested attribute describing port @port (0-based) of @smcibdev to
+ * @skb: pnetid and its origin, net device ifindex, validity, port state and
+ * link count.
+ * Returns 0 on success; -EMSGSIZE if an attribute could not be added, in
+ * which case the partially built nest is cancelled.
+ */
+static int smc_nl_handle_dev_port(struct sk_buff *skb,
+				  struct ib_device *ibdev,
+				  struct smc_ib_device *smcibdev,
+				  int port)
+{
+	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+	struct nlattr *port_attrs;
+	unsigned char port_state;
+	int lnk_count;
+
+	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
+	if (!port_attrs)
+		return -EMSGSIZE;
+
+	/* the pnetid is not NUL terminated in the device structure */
+	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
+	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+	port_state = smc_ib_port_active(smcibdev, port + 1);
+	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
+
+	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
+		       smcibdev->pnetid_by_user[port]) ||
+	    nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet) ||
+	    nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
+			smcibdev->ndev_ifidx[port]) ||
+	    nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1) ||
+	    nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state) ||
+	    nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) {
+		nla_nest_cancel(skb, port_attrs);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, port_attrs);
+	return 0;
+}
+
+/* Add the PCI identification attributes of @smc_pci_dev to @skb.
+ * Returns true if all attributes were added, false as soon as one fails.
+ */
+static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
+				     struct sk_buff *skb)
+{
+	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR,
+			smc_pci_dev->pci_vendor) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE,
+			smc_pci_dev->pci_device) ||
+	    nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
+		return false;
+
+	return true;
+}
+
+/* Build one SMC_NETLINK_GET_DEV_SMCR message for @smcibdev in @skb,
+ * containing the criticality flag, the PCI values (if the ib device has a
+ * parent device), the ib device name and one nested attribute per valid
+ * port.
+ * Returns 0 on success; -EMSGSIZE if the message could not be completed,
+ * in which case everything added so far is cancelled.
+ */
+static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
+				  struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	struct ib_device *ibdev = smcibdev->ibdev;
+	char smc_ibname[IB_DEVICE_NAME_MAX];
+	struct smc_pci_dev smc_pci_dev;
+	struct nlattr *attrs;
+	void *nlh;
+	int port;
+
+	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &smc_gen_nl_family, NLM_F_MULTI,
+			  SMC_NETLINK_GET_DEV_SMCR);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
+	if (!attrs)
+		goto cancel_msg;
+
+	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT,
+		       smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev)))
+		goto cancel_nest;
+
+	if (ibdev->dev.parent) {
+		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
+		smc_set_pci_values(to_pci_dev(ibdev->dev.parent),
+				   &smc_pci_dev);
+		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
+			goto cancel_nest;
+	}
+
+	snprintf(smc_ibname, sizeof(smc_ibname), "%s", ibdev->name);
+	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
+		goto cancel_nest;
+
+	for (port = 0; port < SMC_MAX_PORTS; port++) {
+		/* ib port numbers are 1-based */
+		if (!rdma_is_port_valid(ibdev, port + 1))
+			continue;
+		if (smc_nl_handle_dev_port(skb, ibdev, smcibdev, port))
+			goto cancel_nest;
+	}
+
+	nla_nest_end(skb, attrs);
+	genlmsg_end(skb, nlh);
+	return 0;
+
+cancel_nest:
+	nla_nest_cancel(skb, attrs);
+cancel_msg:
+	genlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Dump the devices of @dev_list into @skb, resuming at the position stored
+ * in the dump context. The walk stops at the first device that cannot be
+ * added; the position reached is stored for the next call.
+ */
+static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
+				 struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct smc_nl_dmp_ctx *ctx = smc_nl_dmp_ctx(cb);
+	struct smc_ib_device *smcibdev;
+	int first = ctx->pos[0];
+	int seen = 0;
+
+	mutex_lock(&dev_list->mutex);
+	list_for_each_entry(smcibdev, &dev_list->list, list) {
+		if (seen >= first &&
+		    smc_nl_handle_smcr_dev(smcibdev, skb, cb))
+			break;
+		seen++;
+	}
+	mutex_unlock(&dev_list->mutex);
+
+	ctx->pos[0] = seen;
+}
+
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); /* dump the registered SMC-R ib devices */
+ return skb->len; /* report the current skb length to the caller */
+}
+
+/* Queue pair event handler; @priv is the link owning the queue pair.
+ * On a fatal or access error the port of the queue pair is flagged in
+ * port_event_mask, and the port event work is scheduled unless the port is
+ * already marked as going away. All other events are ignored.
+ */
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+	struct smc_link *lnk = (struct smc_link *)priv;
+	struct smc_ib_device *smcibdev = lnk->smcibdev;
+	u8 port_idx;
+
+	if (ibevent->event != IB_EVENT_QP_FATAL &&
+	    ibevent->event != IB_EVENT_QP_ACCESS_ERR)
+		return;
+
+	port_idx = ibevent->element.qp->port - 1;
+	if (port_idx >= SMC_MAX_PORTS)
+		return;
+
+	set_bit(port_idx, &smcibdev->port_event_mask);
+	if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
+		schedule_work(&smcibdev->port_event_work);
+}
+
+/* Destroy the link's queue pair, if any, and forget the pointer */
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+	struct ib_qp *qp = lnk->roce_qp;
+
+	if (qp)
+		ib_destroy_qp(qp);
+	lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link
+ *
+ * The RC queue pair uses the device-wide send/recv completion queues; the
+ * number of receive SGEs is 2 for SMC_V2 link groups and 1 otherwise.
+ * On success the queue pair attributes are remembered and 0 is returned;
+ * on failure lnk->roce_qp is left NULL and the error code is returned.
+ */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+	struct ib_qp_init_attr init_attr = {
+		.event_handler = smc_ib_qp_event_handler,
+		.qp_context = lnk,
+		.send_cq = lnk->smcibdev->roce_cq_send,
+		.recv_cq = lnk->smcibdev->roce_cq_recv,
+		.srq = NULL,
+		.cap = {
+			/* include unsolicited rdma_writes as well,
+			 * there are max. 2 RDMA_WRITE per 1 WR_SEND
+			 */
+			.max_send_wr = SMC_WR_BUF_CNT * 3,
+			.max_recv_wr = SMC_WR_BUF_CNT * 3,
+			.max_send_sge = SMC_IB_MAX_SEND_SGE,
+			.max_recv_sge =
+				(lnk->lgr->smc_version == SMC_V2) ? 2 : 1,
+			.max_inline_data = 0,
+		},
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
+		.qp_type = IB_QPT_RC,
+	};
+	struct ib_qp *qp;
+
+	qp = ib_create_qp(lnk->roce_pd, &init_attr);
+	if (IS_ERR(qp)) {
+		lnk->roce_qp = NULL;
+		return PTR_ERR(qp);
+	}
+
+	lnk->roce_qp = qp;
+	smc_wr_remember_qp_attr(lnk);
+	return 0;
+}
+
+void smc_ib_put_memory_region(struct ib_mr *mr)
+{
+ ib_dereg_mr(mr); /* deregister the MR; the result of ib_dereg_mr() is not checked */
+}
+
+/* Map the dma mapped SG list of @buf_slot for link @link_idx onto the
+ * link's memory region, using PAGE_SIZE pages and a start offset of 0.
+ * Returns the result of ib_map_mr_sg().
+ */
+static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
+{
+	unsigned int sg_offset = 0;
+
+	/* map the largest prefix of a dma mapped SG list */
+	return ib_map_mr_sg(buf_slot->mr[link_idx],
+			    buf_slot->sgt[link_idx].sgl,
+			    buf_slot->sgt[link_idx].orig_nents,
+			    &sg_offset, PAGE_SIZE);
+}
+
+/* Allocate a memory region and map the dma mapped SG list of buf_slot
+ *
+ * Nothing is done if the buffer already has a memory region for
+ * @link_idx. @access_flags is currently not evaluated here.
+ * Returns 0 on success, the ib_alloc_mr() error if the allocation fails,
+ * or -EINVAL if not all SG entries could be mapped. In both failure cases
+ * buf_slot->mr[link_idx] is NULL afterwards, so that a half set up memory
+ * region is neither leaked nor mistaken for a usable one by a later call.
+ */
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+			     struct smc_buf_desc *buf_slot, u8 link_idx)
+{
+	if (buf_slot->mr[link_idx])
+		return 0; /* already done */
+
+	buf_slot->mr[link_idx] =
+		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
+	if (IS_ERR(buf_slot->mr[link_idx])) {
+		int rc;
+
+		rc = PTR_ERR(buf_slot->mr[link_idx]);
+		buf_slot->mr[link_idx] = NULL;
+		return rc;
+	}
+
+	if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
+	    buf_slot->sgt[link_idx].orig_nents) {
+		/* undo the allocation, the region is unusable */
+		ib_dereg_mr(buf_slot->mr[link_idx]);
+		buf_slot->mr[link_idx] = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Check whether any DMA address of the buffer's SG list for this link
+ * requires explicit synchronization according to dma_need_sync().
+ */
+bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
+			    struct smc_buf_desc *buf_slot)
+{
+	struct sg_table *sgt = &buf_slot->sgt[lnk->link_idx];
+	struct scatterlist *sg;
+	unsigned int idx;
+
+	/* for now there is just one DMA address */
+	for_each_sg(sgt->sgl, sg, sgt->nents, idx) {
+		if (!sg_dma_len(sg))
+			break;
+		if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
+				  sg_dma_address(sg)))
+			return true;
+	}
+
+	return false;
+}
+
+/* synchronize buffer usage for cpu access
+ *
+ * Does nothing unless the buffer is flagged as needing DMA sync for this
+ * link; otherwise every SG entry with a non-zero DMA length is synced.
+ */
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
+			    struct smc_buf_desc *buf_slot,
+			    enum dma_data_direction data_direction)
+{
+	struct sg_table *sgt = &buf_slot->sgt[lnk->link_idx];
+	struct scatterlist *sg;
+	unsigned int idx;
+
+	if ((buf_slot->is_dma_need_sync & (1U << lnk->link_idx)) == 0)
+		return;
+
+	/* for now there is just one DMA address */
+	for_each_sg(sgt->sgl, sg, sgt->nents, idx) {
+		if (!sg_dma_len(sg))
+			break;
+		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
+					   sg_dma_address(sg),
+					   sg_dma_len(sg),
+					   data_direction);
+	}
+}
+
+/* synchronize buffer usage for device access
+ *
+ * Does nothing unless the buffer is flagged as needing DMA sync for this
+ * link; otherwise every SG entry with a non-zero DMA length is synced.
+ */
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
+			       struct smc_buf_desc *buf_slot,
+			       enum dma_data_direction data_direction)
+{
+	struct sg_table *sgt = &buf_slot->sgt[lnk->link_idx];
+	struct scatterlist *sg;
+	unsigned int idx;
+
+	if ((buf_slot->is_dma_need_sync & (1U << lnk->link_idx)) == 0)
+		return;
+
+	/* for now there is just one DMA address */
+	for_each_sg(sgt->sgl, sg, sgt->nents, idx) {
+		if (!sg_dma_len(sg))
+			break;
+		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
+					      sg_dma_address(sg),
+					      sg_dma_len(sg),
+					      data_direction);
+	}
+}
+
+/* Map a new TX or RX buffer SG-table to DMA
+ *
+ * Returns the number of mapped entries, or -ENOMEM if nothing was mapped.
+ */
+int smc_ib_buf_map_sg(struct smc_link *lnk,
+		      struct smc_buf_desc *buf_slot,
+		      enum dma_data_direction data_direction)
+{
+	struct sg_table *sgt = &buf_slot->sgt[lnk->link_idx];
+	int mapped;
+
+	mapped = ib_dma_map_sg(lnk->smcibdev->ibdev, sgt->sgl,
+			       sgt->orig_nents, data_direction);
+
+	return mapped ? mapped : -ENOMEM;
+}
+
+/* Undo smc_ib_buf_map_sg() for this link. A zero DMA address in the first
+ * SG entry marks the table as already unmapped; it is reset to zero after
+ * unmapping.
+ */
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
+			 struct smc_buf_desc *buf_slot,
+			 enum dma_data_direction data_direction)
+{
+	struct sg_table *sgt = &buf_slot->sgt[lnk->link_idx];
+
+	/* already unmapped */
+	if (!sgt->sgl->dma_address)
+		return;
+
+	ib_dma_unmap_sg(lnk->smcibdev->ibdev, sgt->sgl, sgt->orig_nents,
+			data_direction);
+	sgt->sgl->dma_address = 0;
+}
+
+/* Create the send and receive completion queues of an ib device and
+ * register the device with the work request layer. Serialized by the
+ * device mutex; does nothing if the device is already initialized.
+ * Returns 0 on success or the error of the failing ib_create_cq() call.
+ * On failure neither roce_cq_send nor roce_cq_recv is left pointing to a
+ * completion queue.
+ */
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	struct ib_cq_init_attr cqattr = {
+		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
+	int cqe_size_order, smc_order;
+	long rc;
+
+	mutex_lock(&smcibdev->mutex);
+	rc = 0;
+	if (smcibdev->initialized)
+		goto out;
+	/* the calculated number of cq entries fits to mlx5 cq allocation */
+	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
+	smc_order = MAX_ORDER - cqe_size_order;
+	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
+		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
+	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_tx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+	if (IS_ERR(smcibdev->roce_cq_send)) {
+		smcibdev->roce_cq_send = NULL;
+		goto out;
+	}
+	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+					      smc_wr_rx_cq_handler, NULL,
+					      smcibdev, &cqattr);
+	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+	if (IS_ERR(smcibdev->roce_cq_recv)) {
+		smcibdev->roce_cq_recv = NULL;
+		goto err;
+	}
+	smc_wr_add_dev(smcibdev);
+	smcibdev->initialized = 1;
+	goto out;
+
+err:
+	ib_destroy_cq(smcibdev->roce_cq_send);
+	/* do not keep a pointer to the destroyed completion queue */
+	smcibdev->roce_cq_send = NULL;
+out:
+	mutex_unlock(&smcibdev->mutex);
+	return rc;
+}
+
+/* Counterpart of smc_ib_setup_per_ibdev(): if the device is initialized,
+ * destroy both completion queues and remove the device from the work
+ * request layer. Serialized by the device mutex.
+ */
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+	mutex_lock(&smcibdev->mutex);
+	if (smcibdev->initialized) {
+		smcibdev->initialized = 0;
+		ib_destroy_cq(smcibdev->roce_cq_recv);
+		ib_destroy_cq(smcibdev->roce_cq_send);
+		smc_wr_remove_dev(smcibdev);
+	}
+	mutex_unlock(&smcibdev->mutex);
+}
+
+static struct ib_client smc_ib_client;
+
+/* Remember the ifindex of the net device behind port @port (0-based), if
+ * the ib device provides a get_netdev operation and it returns a device.
+ */
+static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
+{
+	struct ib_device *ibdev = smcibdev->ibdev;
+	struct net_device *ndev;
+
+	if (!ibdev->ops.get_netdev)
+		return;
+
+	ndev = ibdev->ops.get_netdev(ibdev, port + 1);
+	if (!ndev)
+		return;
+
+	smcibdev->ndev_ifidx[port] = ndev->ifindex;
+	dev_put(ndev);
+}
+
+/* Update the remembered net device ifindex of every ib device port whose
+ * net device is @ndev: set on NETDEV_REGISTER, cleared on
+ * NETDEV_UNREGISTER. Other events leave the ifindex untouched.
+ */
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
+{
+	struct smc_ib_device *smcibdev;
+	struct ib_device *libdev;
+	struct net_device *lndev;
+	u8 port_cnt;
+	int port;
+
+	mutex_lock(&smc_ib_devices.mutex);
+	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+		libdev = smcibdev->ibdev;
+		port_cnt = libdev->phys_port_cnt;
+		for (port = 0;
+		     port < min_t(size_t, port_cnt, SMC_MAX_PORTS);
+		     port++) {
+			if (!libdev->ops.get_netdev)
+				continue;
+			lndev = libdev->ops.get_netdev(libdev, port + 1);
+			/* only the pointer value is compared below */
+			dev_put(lndev);
+			if (lndev != ndev)
+				continue;
+
+			switch (event) {
+			case NETDEV_REGISTER:
+				smcibdev->ndev_ifidx[port] = ndev->ifindex;
+				break;
+			case NETDEV_UNREGISTER:
+				smcibdev->ndev_ifidx[port] = 0;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* callback function for ib_register_client()
+ *
+ * Only devices of node type RDMA_NODE_IB_CA are accepted. A new
+ * smc_ib_device is allocated, added to smc_ib_devices and attached to the
+ * ib device as client data; the global event handler is registered and,
+ * per port, the pnetid and the net device ifindex are determined. Reading
+ * of the port attributes is deferred to the port event work.
+ * Returns 0 on success, -EOPNOTSUPP for other node types, -ENOMEM if the
+ * allocation fails.
+ */
+static int smc_ib_add_dev(struct ib_device *ibdev)
+{
+	struct smc_ib_device *smcibdev;
+	u8 port_cnt;
+	int port;
+
+	if (ibdev->node_type != RDMA_NODE_IB_CA)
+		return -EOPNOTSUPP;
+
+	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+	if (!smcibdev)
+		return -ENOMEM;
+
+	smcibdev->ibdev = ibdev;
+	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+	atomic_set(&smcibdev->lnk_cnt, 0);
+	init_waitqueue_head(&smcibdev->lnks_deleted);
+	mutex_init(&smcibdev->mutex);
+
+	mutex_lock(&smc_ib_devices.mutex);
+	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+	mutex_unlock(&smc_ib_devices.mutex);
+
+	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+			      smc_ib_global_event_handler);
+	ib_register_event_handler(&smcibdev->event_handler);
+
+	/* trigger reading of the port attributes */
+	port_cnt = smcibdev->ibdev->phys_port_cnt;
+	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
+			    smcibdev->ibdev->name, port_cnt);
+	for (port = 0;
+	     port < min_t(size_t, port_cnt, SMC_MAX_PORTS);
+	     port++) {
+		set_bit(port, &smcibdev->port_event_mask);
+		/* determine pnetids of the port */
+		if (smc_pnetid_by_dev_port(ibdev->dev.parent, port,
+					   smcibdev->pnetid[port]))
+			smc_pnetid_by_table_ib(smcibdev, port + 1);
+		smc_copy_netdev_ifindex(smcibdev, port);
+		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
+				    "%.16s%s\n",
+				    smcibdev->ibdev->name, port + 1,
+				    smcibdev->pnetid[port],
+				    smcibdev->pnetid_by_user[port] ?
+				    " (user defined)" :
+				    "");
+	}
+	schedule_work(&smcibdev->port_event_work);
+	return 0;
+}
+
+/* callback function for ib_unregister_client()
+ *
+ * Tears down the per-device SMC state passed in as @client_data: the device
+ * is unlinked from smc_ib_devices, smc_smcr_terminate_all() is called for
+ * it, its completion queues are released via smc_ib_cleanup_per_ibdev(),
+ * the event handler is unregistered and pending port event work is
+ * cancelled before the structure is freed.
+ */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+ struct smc_ib_device *smcibdev = client_data;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ mutex_unlock(&smc_ib_devices.mutex);
+ pr_warn_ratelimited("smc: removing ib device %s\n",
+ smcibdev->ibdev->name);
+ smc_smcr_terminate_all(smcibdev);
+ smc_ib_cleanup_per_ibdev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler); /* no new port events from here on */
+ cancel_work_sync(&smcibdev->port_event_work); /* wait for already scheduled port event work */
+ kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+ .name = "smc_ib",
+ .add = smc_ib_add_dev,
+ .remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+ smc_ib_init_local_systemid(); /* fill the random part of the local system id */
+ return ib_register_client(&smc_ib_client); /* result is passed on to the caller */
+}
+
+void smc_ib_unregister_client(void)
+{
+ ib_unregister_client(&smc_ib_client); /* counterpart of smc_ib_register_client() */
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000..ef8ac2b75
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for IB environment
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <net/smc.h>
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+#define SMC_GID_SIZE sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE 2
+
+struct smc_ib_devices { /* anchor of the list of smc ib devices */
+ struct list_head list; /* smc_ib_device entries, linked via their list member */
+ struct mutex mutex; /* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */
+
+struct smc_ib_device { /* ib-device infos for smc */
+ struct list_head list; /* entry in smc_ib_devices.list */
+ struct ib_device *ibdev; /* the ib device this entry describes */
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
+ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][ETH_ALEN];
+ /* mac address per port */
+ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
+ /* pnetid per port */
+ bool pnetid_by_user[SMC_MAX_PORTS];
+ /* pnetid defined by user? */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ struct work_struct port_event_work; /* deferred port event handling */
+ unsigned long port_event_mask; /* one bit per port with a pending event */
+ DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); /* one bit per port flagged as failing */
+ atomic_t lnk_cnt; /* number of links on ibdev */
+ wait_queue_head_t lnks_deleted; /* wait for removal of all links */
+ struct mutex mutex; /* protect dev setup+cleanup */
+ atomic_t lnk_cnt_by_port[SMC_MAX_PORTS];
+ /* number of links per port */
+ int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */
+};
+
+/* Extract the IPv4 address carried in a gid: the last 32 bits are returned
+ * if the gid is a v4-mapped IPv6 address or if its first 96 bits are all
+ * zero; otherwise INADDR_NONE (in network byte order) is returned.
+ */
+static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE])
+{
+	struct in6_addr *addr6 = (struct in6_addr *)gid;
+
+	if (ipv6_addr_v4mapped(addr6))
+		return addr6->s6_addr32[3];
+	if ((addr6->s6_addr32[0] | addr6->s6_addr32[1] |
+	     addr6->s6_addr32[2]) == 0)
+		return addr6->s6_addr32[3];
+
+	return cpu_to_be32(INADDR_NONE);
+}
+
+/* Return the net namespace of the ib device behind @smcibdev, or NULL if
+ * @smcibdev or its ib device is not set.
+ */
+static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev)
+{
+	if (!smcibdev || !smcibdev->ibdev)
+		return NULL;
+
+	return read_pnet(&smcibdev->ibdev->coredev.rdma_net);
+}
+
+struct smc_init_info_smcrv2;
+struct smc_buf_desc;
+struct smc_link;
+
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event);
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_error(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct smc_buf_desc *buf_slot, u8 link_idx);
+void smc_ib_put_memory_region(struct ib_mr *mr);
+bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot);
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2);
+int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr,
+ u8 nexthop_mac[], u8 *uses_gateway);
+bool smc_ib_is_valid_local_systemid(void);
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
+#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000..fbee24930
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * Functions for ISM device.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#include <linux/if_vlan.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+#include "smc_netlink.h"
+#include "linux/ism.h"
+
+/* Global list of registered SMC-D devices; the embedded mutex protects the
+ * list.
+ */
+struct smcd_dev_list smcd_dev_list = {
+ .list = LIST_HEAD_INIT(smcd_dev_list.list),
+ .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex)
+};
+
+/* Set in smcd_register_dev() while the device list is still empty: if the
+ * device's ops->supports_v2() returns true, the flag is raised and the
+ * system EID is copied into smc_ism_v2_system_eid.
+ */
+static bool smc_ism_v2_capable;
+static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN];
+
+#if IS_ENABLED(CONFIG_ISM)
+/* Callbacks referenced by smc_ism_client; they are defined further down. */
+static void smcd_register_dev(struct ism_dev *ism);
+static void smcd_unregister_dev(struct ism_dev *ism);
+static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event);
+static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno,
+ u16 dmbemask);
+
+/* ISM client descriptor for SMC-D. It is registered in smc_ism_init() and
+ * unregistered in smc_ism_exit().
+ */
+static struct ism_client smc_ism_client = {
+ .name = "SMC-D",
+ .add = smcd_register_dev,
+ .remove = smcd_unregister_dev,
+ .handle_event = smcd_handle_event,
+ .handle_irq = smcd_handle_irq,
+};
+#endif
+
+/* Ask the ISM device whether communication with the peer @peer_gid is
+ * possible (same CPC). A non-zero @vlan_id is passed on flagged as valid.
+ * Returns whatever the device's query_remote_gid operation returns.
+ */
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
+{
+	int vid_valid = vlan_id ? 1 : 0;
+
+	return smcd->ops->query_remote_gid(smcd, peer_gid, vid_valid, vlan_id);
+}
+
+/* Return the system EID through @eid: a pointer to the stored EID when ISM
+ * V2 capability was detected, NULL otherwise.
+ */
+void smc_ism_get_system_eid(u8 **eid)
+{
+	*eid = smc_ism_v2_capable ? smc_ism_v2_system_eid : NULL;
+}
+
+/* Return the CHID reported by the device's get_chid operation. */
+u16 smc_ism_get_chid(struct smcd_dev *smcd)
+{
+ return smcd->ops->get_chid(smcd);
+}
+
+/* HW supports ISM V2 and thus System EID is defined.
+ * Returns the flag that smcd_register_dev() sets from ops->supports_v2().
+ */
+bool smc_ism_is_v2_capable(void)
+{
+ return smc_ism_v2_capable;
+}
+
+/* Record @conn in the device's connection table, in the slot selected by
+ * the sba_idx of the connection's RMB descriptor. The table is written
+ * under smcd->lock.
+ */
+void smc_ism_set_conn(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd = conn->lgr->smcd;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	smcd->conn[conn->rmb_desc->sba_idx] = conn;
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+
+/* Clear the connection table slot that belongs to @conn's RMB descriptor.
+ * Does nothing when the connection has no RMB descriptor.
+ */
+void smc_ism_unset_conn(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd;
+	unsigned long irq_flags;
+
+	if (!conn->rmb_desc)
+		return;
+
+	smcd = conn->lgr->smcd;
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	smcd->conn[conn->rmb_desc->sba_idx] = NULL;
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+
+/* Take a reference on VLAN @vlanid for device @smcd.
+ *
+ * If the VLAN id is already in the device's list, only its reference count
+ * is raised. Otherwise the id is added to the device and a new list entry
+ * holding one reference is appended.
+ *
+ * Returns 0 on success, -EINVAL for vlanid 0, -ENOMEM if the list entry
+ * cannot be allocated, -EIO if the device rejects the VLAN id (e.g. HW
+ * limit reached).
+ */
+int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+	struct smc_ism_vlanid *fresh, *cur;
+	unsigned long irq_flags;
+	int rc = 0;
+
+	if (!vlanid)			/* No valid vlan id */
+		return -EINVAL;
+
+	/* allocate up front, outside the spinlock; dropped again if unused */
+	fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
+	if (!fresh)
+		return -ENOMEM;
+	fresh->vlanid = vlanid;
+	refcount_set(&fresh->refcnt, 1);
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	list_for_each_entry(cur, &smcd->vlan, list) {
+		if (cur->vlanid != vlanid)
+			continue;
+		/* known VLAN: one more user, the new entry is not needed */
+		refcount_inc(&cur->refcnt);
+		goto drop_fresh;
+	}
+
+	/* unknown VLAN: register it with the device first */
+	if (!smcd->ops->add_vlan_id(smcd, vlanid)) {
+		list_add_tail(&fresh->list, &smcd->vlan);
+		goto unlock;
+	}
+	rc = -EIO;
+drop_fresh:
+	kfree(fresh);
+unlock:
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+	return rc;
+}
+
+/* Drop a reference on VLAN @vlanid for device @smcd.
+ *
+ * When the last reference goes away, the VLAN id is deleted from the device
+ * and its list entry is freed.
+ *
+ * Returns 0 on success, -EINVAL for vlanid 0, -ENOENT if the id is not in
+ * the device's list, -EIO if the device fails to delete the id (the list
+ * entry is removed and freed in that case as well).
+ */
+int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+	struct smc_ism_vlanid *cur, *last_ref = NULL;
+	unsigned long irq_flags;
+	int rc = 0;
+
+	if (!vlanid)			/* No valid vlan id */
+		return -EINVAL;
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	list_for_each_entry(cur, &smcd->vlan, list) {
+		if (cur->vlanid != vlanid)
+			continue;
+		if (!refcount_dec_and_test(&cur->refcnt))
+			goto unlock;	/* still in use elsewhere */
+		last_ref = cur;
+		break;
+	}
+	if (!last_ref) {
+		rc = -ENOENT;		/* VLAN id not in table */
+		goto unlock;
+	}
+
+	/* the final reference was just dropped */
+	if (smcd->ops->del_vlan_id(smcd, vlanid))
+		rc = -EIO;
+	list_del(&last_ref->list);
+	kfree(last_ref);
+unlock:
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+	return rc;
+}
+
+/* Unregister the DMB described by @dmb_desc from device @smcd.
+ *
+ * Returns 0 right away when the descriptor has no DMA address. Otherwise
+ * returns the result of the device's unregister_dmb operation; the
+ * descriptor's cpu_addr and dma_addr are cleared when that result is 0 or
+ * ISM_ERROR.
+ */
+int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
+{
+	struct smcd_dmb dmb;
+	int rc;
+
+	if (!dmb_desc->dma_addr)
+		return 0;
+
+	memset(&dmb, 0, sizeof(dmb));
+	dmb.dmb_tok = dmb_desc->token;
+	dmb.sba_idx = dmb_desc->sba_idx;
+	dmb.cpu_addr = dmb_desc->cpu_addr;
+	dmb.dma_addr = dmb_desc->dma_addr;
+	dmb.dmb_len = dmb_desc->len;
+
+	rc = smcd->ops->unregister_dmb(smcd, &dmb);
+	if (rc && rc != ISM_ERROR)
+		return rc;
+
+	dmb_desc->cpu_addr = NULL;
+	dmb_desc->dma_addr = 0;
+	return rc;
+}
+
+/* Register a DMB of @dmb_len bytes for link group @lgr.
+ *
+ * The request carries the descriptor's sba_idx plus the link group's VLAN
+ * id and peer GID. On success the values reported back by the device
+ * (sba_idx, token, cpu/dma address, length) are stored in @dmb_desc.
+ *
+ * Returns the result of the device's register_dmb operation, or 0 when
+ * CONFIG_ISM is not enabled.
+ */
+int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
+			 struct smc_buf_desc *dmb_desc)
+{
+#if IS_ENABLED(CONFIG_ISM)
+	struct smcd_dev *smcd = lgr->smcd;
+	struct smcd_dmb dmb;
+	int rc;
+
+	memset(&dmb, 0, sizeof(dmb));
+	dmb.dmb_len = dmb_len;
+	dmb.sba_idx = dmb_desc->sba_idx;
+	dmb.vlan_id = lgr->vlan_id;
+	dmb.rgid = lgr->peer_gid;
+
+	rc = smcd->ops->register_dmb(smcd, &dmb, &smc_ism_client);
+	if (rc)
+		return rc;
+
+	dmb_desc->sba_idx = dmb.sba_idx;
+	dmb_desc->token = dmb.dmb_tok;
+	dmb_desc->cpu_addr = dmb.cpu_addr;
+	dmb_desc->dma_addr = dmb.dma_addr;
+	dmb_desc->len = dmb.dmb_len;
+	return 0;
+#else
+	return 0;
+#endif
+}
+
+/* Append one SMC_NETLINK_GET_DEV_SMCD message describing @smcd to @skb.
+ *
+ * The message nests the device attributes (use count, "critical" flag set
+ * when the use count is above zero, PCI identification) and, inside those,
+ * a port nest with the pnetid and whether it was set by the user.
+ *
+ * Returns 0 on success. On any failure everything added so far is
+ * cancelled and -EMSGSIZE is returned.
+ */
+static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
+				  struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	char pnetid_str[SMC_MAX_PNETID_LEN + 1];
+	struct nlattr *dev_nest, *port_nest;
+	struct ism_dev *ism = smcd->priv;
+	struct smc_pci_dev pci_info;
+	void *msg_hdr;
+	int lgrs;
+
+	msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			      cb->nlh->nlmsg_seq, &smc_gen_nl_family,
+			      NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD);
+	if (!msg_hdr)
+		return -EMSGSIZE;
+	dev_nest = nla_nest_start(skb, SMC_GEN_DEV_SMCD);
+	if (!dev_nest)
+		goto cancel_msg;
+
+	lgrs = atomic_read(&smcd->lgr_cnt);
+	if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, lgrs) ||
+	    nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, lgrs > 0))
+		goto cancel_dev;
+
+	memset(&pci_info, 0, sizeof(pci_info));
+	smc_set_pci_values(to_pci_dev(ism->dev.parent), &pci_info);
+	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, pci_info.pci_fid) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, pci_info.pci_pchid) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, pci_info.pci_vendor) ||
+	    nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, pci_info.pci_device) ||
+	    nla_put_string(skb, SMC_NLA_DEV_PCI_ID, pci_info.pci_id))
+		goto cancel_dev;
+
+	port_nest = nla_nest_start(skb, SMC_NLA_DEV_PORT);
+	if (!port_nest)
+		goto cancel_dev;
+	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user))
+		goto cancel_port;
+	/* copy the pnetid into a buffer with room for a terminating 0 */
+	memcpy(pnetid_str, smcd->pnetid, SMC_MAX_PNETID_LEN);
+	pnetid_str[SMC_MAX_PNETID_LEN] = 0;
+	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, pnetid_str))
+		goto cancel_port;
+
+	nla_nest_end(skb, port_nest);
+	nla_nest_end(skb, dev_nest);
+	genlmsg_end(skb, msg_hdr);
+	return 0;
+
+cancel_port:
+	nla_nest_cancel(skb, port_nest);
+cancel_dev:
+	nla_nest_cancel(skb, dev_nest);
+cancel_msg:
+	nlmsg_cancel(skb, msg_hdr);
+	return -EMSGSIZE;
+}
+
+/* Dump the devices of @dev_list into @skb, resuming at the position saved
+ * in the dump context (pos[0]). Entries before that position are only
+ * counted. The walk stops at the first device that does not fit, and the
+ * index reached is stored back so a later call continues from there.
+ */
+static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list,
+				 struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct smc_nl_dmp_ctx *ctx = smc_nl_dmp_ctx(cb);
+	int start = ctx->pos[0];
+	struct smcd_dev *dev;
+	int idx = 0;
+
+	mutex_lock(&dev_list->mutex);
+	list_for_each_entry(dev, &dev_list->list, list) {
+		if (idx >= start && smc_nl_handle_smcd_dev(dev, skb, cb))
+			break;
+		idx++;
+	}
+	mutex_unlock(&dev_list->mutex);
+	ctx->pos[0] = idx;
+}
+
+/* Netlink dump handler: adds the registered SMC-D devices to @skb and
+ * returns the resulting skb length.
+ */
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
+#if IS_ENABLED(CONFIG_ISM)
+/* Work item carrying a copy of an ISM event from smcd_handle_event() to the
+ * smc_ism_event_work() worker, which frees it.
+ */
+struct smc_ism_event_work {
+ struct work_struct work;
+ struct smcd_dev *smcd;
+ struct ism_event event;
+};
+
+/* values of smcd_sw_event_info.code */
+#define ISM_EVENT_REQUEST 0x0001
+#define ISM_EVENT_RESPONSE 0x0002
+/* passed as the third argument of ops->signal_event() */
+#define ISM_EVENT_REQUEST_IR 0x00000001
+/* software event codes: matched against event.code on receive and passed
+ * as the fourth argument of ops->signal_event() on send
+ */
+#define ISM_EVENT_CODE_SHUTDOWN 0x80
+#define ISM_EVENT_CODE_TESTLINK 0x83
+
+/* View of the 64-bit info word of a software event: it is either accessed
+ * as a whole (info) or as link group id, VLAN id and request/response code.
+ */
+union smcd_sw_event_info {
+ u64 info;
+ struct {
+ u8 uid[SMC_LGR_ID_SIZE];
+ unsigned short vlan_id;
+ u16 code;
+ };
+};
+
+/* Handle a software defined ISM event.
+ *
+ * A SHUTDOWN event terminates via smc_smcd_terminate() using the event
+ * token and the VLAN id from the info word. A TESTLINK request is answered
+ * by signalling the same info word back with the code changed to RESPONSE.
+ * All other events are ignored.
+ */
+static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
+{
+	struct smcd_dev *smcd = wrk->smcd;
+	union smcd_sw_event_info sw_info;
+
+	sw_info.info = wrk->event.info;
+
+	if (wrk->event.code == ISM_EVENT_CODE_SHUTDOWN) {
+		/* Peer shut down DMBs */
+		smc_smcd_terminate(smcd, wrk->event.tok, sw_info.vlan_id);
+		return;
+	}
+	if (wrk->event.code != ISM_EVENT_CODE_TESTLINK)
+		return;
+
+	/* Activity timer: only requests get an answer */
+	if (sw_info.code != ISM_EVENT_REQUEST)
+		return;
+	sw_info.code = ISM_EVENT_RESPONSE;
+	smcd->ops->signal_event(smcd, wrk->event.tok, ISM_EVENT_REQUEST_IR,
+				ISM_EVENT_CODE_TESTLINK, sw_info.info);
+}
+
+/* Worker for SMC-D events queued by smcd_handle_event().
+ * Dispatches on the event type and frees the work item afterwards.
+ */
+static void smc_ism_event_work(struct work_struct *work)
+{
+	struct smc_ism_event_work *item =
+		container_of(work, struct smc_ism_event_work, work);
+
+	if (item->event.type == ISM_EVENT_GID) {
+		/* GID event, token is peer GID */
+		smc_smcd_terminate(item->smcd, item->event.tok, VLAN_VID_MASK);
+	} else if (item->event.type == ISM_EVENT_SWR) {
+		/* Software defined event */
+		smcd_handle_sw_event(item);
+	}
+	/* ISM_EVENT_DMB and any other type need no action */
+
+	kfree(item);
+}
+
+/* Allocate and initialise the SMC-D representation of an ISM device.
+ *
+ * @parent:   device the devres allocations are attached to
+ * @name:     device name, used in the event workqueue name
+ * @ops:      SMC-D operations of the device
+ * @max_dmbs: number of slots in the connection table
+ *
+ * Returns the new smcd_dev, or NULL on allocation failure. On failure the
+ * devres allocations made so far are released again, so nothing stays
+ * attached to @parent for a device that was never set up.
+ */
+static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
+				       const struct smcd_ops *ops, int max_dmbs)
+{
+	struct smcd_dev *smcd;
+
+	smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL);
+	if (!smcd)
+		return NULL;
+	smcd->conn = devm_kcalloc(parent, max_dmbs,
+				  sizeof(struct smc_connection *), GFP_KERNEL);
+	if (!smcd->conn)
+		goto free_smcd;
+
+	/* ordered: events of one device are handled one at a time */
+	smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s",
+						 WQ_MEM_RECLAIM, name);
+	if (!smcd->event_wq)
+		goto free_conn;
+
+	smcd->ops = ops;
+
+	spin_lock_init(&smcd->lock);
+	spin_lock_init(&smcd->lgr_lock);
+	INIT_LIST_HEAD(&smcd->vlan);
+	INIT_LIST_HEAD(&smcd->lgr_list);
+	init_waitqueue_head(&smcd->lgrs_deleted);
+	return smcd;
+
+free_conn:
+	devm_kfree(parent, smcd->conn);
+free_smcd:
+	devm_kfree(parent, smcd);
+	return NULL;
+}
+
+/* ISM client "add" callback: create the SMC-D device for @ism and put it
+ * on the global device list.
+ *
+ * Returns silently when no SMC-D ops are available or the device cannot be
+ * allocated. While the list is still empty, the device is also used to
+ * detect ISM V2 support and to record the system EID.
+ */
+static void smcd_register_dev(struct ism_dev *ism)
+{
+	const struct smcd_ops *ops = ism_get_smcd_ops();
+	struct device *pdev_dev;
+	struct smcd_dev *smcd;
+
+	if (!ops)
+		return;
+
+	pdev_dev = &ism->pdev->dev;
+	smcd = smcd_alloc_dev(pdev_dev, dev_name(pdev_dev), ops, ISM_NR_DMBS);
+	if (!smcd)
+		return;
+	smcd->priv = ism;
+	ism_set_priv(ism, &smc_ism_client, smcd);
+	/* fall back to the pnet table when the device reports no pnetid */
+	if (smc_pnetid_by_dev_port(pdev_dev, 0, smcd->pnetid))
+		smc_pnetid_by_table_smcd(smcd);
+
+	mutex_lock(&smcd_dev_list.mutex);
+	if (list_empty(&smcd_dev_list.list)) {
+		u8 *system_eid = smcd->ops->get_system_eid();
+
+		if (smcd->ops->supports_v2()) {
+			smc_ism_v2_capable = true;
+			memcpy(smc_ism_v2_system_eid, system_eid,
+			       SMC_MAX_EID_LEN);
+		}
+	}
+	/* sort list: devices without pnetid before devices with pnetid */
+	if (!smcd->pnetid[0])
+		list_add(&smcd->list, &smcd_dev_list.list);
+	else
+		list_add_tail(&smcd->list, &smcd_dev_list.list);
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
+			    dev_name(&ism->dev), smcd->pnetid,
+			    smcd->pnetid_by_user ? " (user defined)" : "");
+}
+
+/* ISM client "remove" callback: tear down the SMC-D device of @ism. */
+static void smcd_unregister_dev(struct ism_dev *ism)
+{
+ struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+
+ pr_warn_ratelimited("smc: removing smcd device %s\n",
+ dev_name(&ism->dev));
+ /* from here on smcd_handle_event() ignores new events */
+ smcd->going_away = 1;
+ smc_smcd_terminate_all(smcd);
+ mutex_lock(&smcd_dev_list.mutex);
+ list_del_init(&smcd->list);
+ mutex_unlock(&smcd_dev_list.mutex);
+ destroy_workqueue(smcd->event_wq);
+}
+
+/* SMCD device event handler, called from the ISM device interrupt handler.
+ *
+ * @ism:   ISM device the event belongs to
+ * @event: event description:
+ *         - type (0 --> DMB, 1 --> GID)
+ *         - code (event code)
+ *         - tok  (DMB token for type 0, GID for type 1)
+ *         - time (time of day)
+ *         - info (debug info)
+ *
+ * The event is copied into a work item and queued on the device's event
+ * workqueue. It is dropped when the device is going away or when no memory
+ * is available for the work item.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver event handler.
+ */
+static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event)
+{
+	struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+	struct smc_ism_event_work *item;
+
+	if (smcd->going_away)
+		return;
+
+	/* IRQ context: must not sleep while allocating */
+	item = kmalloc(sizeof(*item), GFP_ATOMIC);
+	if (!item)
+		return;
+
+	item->event = *event;
+	item->smcd = smcd;
+	INIT_WORK(&item->work, smc_ism_event_work);
+	queue_work(smcd->event_wq, &item->work);
+}
+
+/* SMCD device interrupt handler, called from the ISM device interrupt
+ * handler.
+ *
+ * @ism:      ISM device that raised the interrupt
+ * @dmbno:    DMB number, used as index into the connection table
+ * @dmbemask: DMBE bitmask (not evaluated here)
+ *
+ * Looks up the connection registered for @dmbno and schedules its receive
+ * tasklet unless the connection is marked as killed.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver IRQ handler.
+ */
+static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno,
+			    u16 dmbemask)
+{
+	struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+	struct smc_connection *conn;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	conn = smcd->conn[dmbno];
+	if (conn && !conn->killed)
+		tasklet_schedule(&conn->rx_tsklet);
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+#endif
+
+/* Signal a SHUTDOWN software event for link group @lgr to its peer.
+ *
+ * The info word carries the link group id, its VLAN id and the REQUEST
+ * code. Nothing is sent when the peer has already shut down.
+ *
+ * Returns the result of the device's signal_event operation, or 0 when
+ * nothing was sent or CONFIG_ISM is not enabled.
+ */
+int smc_ism_signal_shutdown(struct smc_link_group *lgr)
+{
+#if IS_ENABLED(CONFIG_ISM)
+	union smcd_sw_event_info ev_info;
+
+	if (lgr->peer_shutdown)
+		return 0;
+
+	memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
+	ev_info.vlan_id = lgr->vlan_id;
+	ev_info.code = ISM_EVENT_REQUEST;
+	return lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid,
+					    ISM_EVENT_REQUEST_IR,
+					    ISM_EVENT_CODE_SHUTDOWN,
+					    ev_info.info);
+#else
+	return 0;
+#endif
+}
+
+/* Reset the ISM V2 state and register SMC-D as an ISM client.
+ * Returns the result of ism_register_client(), or 0 when CONFIG_ISM is not
+ * enabled.
+ */
+int smc_ism_init(void)
+{
+#if IS_ENABLED(CONFIG_ISM)
+	smc_ism_v2_capable = false;
+	memset(smc_ism_v2_system_eid, 0, sizeof(smc_ism_v2_system_eid));
+
+	return ism_register_client(&smc_ism_client);
+#else
+	return 0;
+#endif
+}
+
+/* Unregister the SMC-D ISM client; does nothing when CONFIG_ISM is not
+ * enabled.
+ */
+void smc_ism_exit(void)
+{
+#if IS_ENABLED(CONFIG_ISM)
+ ism_unregister_client(&smc_ism_client);
+#endif
+}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000..832b2f42d
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * SMC-D ISM device structure definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMCD_ISM_H
+#define SMCD_ISM_H
+
+#include <linux/uio.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+#include "smc.h"
+
+/* Container for the list of SMCD devices together with its lock. */
+struct smcd_dev_list { /* List of SMCD devices */
+ struct list_head list;
+ struct mutex mutex; /* Protects list of devices */
+};
+
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+
+/* One entry per VLAN id registered with an ISM device. Entries live on the
+ * device's vlan list and are managed by smc_ism_get_vlan() and
+ * smc_ism_put_vlan().
+ */
+struct smc_ism_vlanid { /* VLAN id set on ISM device */
+ struct list_head list;
+ unsigned short vlanid; /* Vlan id */
+ refcount_t refcnt; /* Reference count */
+};
+
+struct smcd_dev;
+
+/* reachability check and connection table maintenance */
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
+void smc_ism_set_conn(struct smc_connection *conn);
+void smc_ism_unset_conn(struct smc_connection *conn);
+/* reference counted VLAN id registration; vlan_id 0 yields -EINVAL */
+int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+/* DMB registration */
+int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
+ struct smc_buf_desc *dmb_desc);
+int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+int smc_ism_signal_shutdown(struct smc_link_group *lgr);
+/* *eid is set to NULL when ISM V2 is not supported */
+void smc_ism_get_system_eid(u8 **eid);
+u16 smc_ism_get_chid(struct smcd_dev *dev);
+bool smc_ism_is_v2_capable(void);
+int smc_ism_init(void);
+void smc_ism_exit(void);
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
+
+/* Move @len bytes from @data through the device's move_data operation.
+ * Negative results of move_data are passed on; any other result is
+ * reported as 0.
+ */
+static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok,
+				unsigned int idx, bool sf, unsigned int offset,
+				void *data, size_t len)
+{
+	int rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len);
+
+	if (rc < 0)
+		return rc;
+	return 0;
+}
+
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000..018ce8133
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,2365 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Link Layer Control (LLC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_pnet.h"
+
+/* size of the raw payload that follows the header in union smc_llc_msg */
+#define SMC_LLC_DATA_LEN 40
+
+/* Header at the start of every LLC message. The union holds either a
+ * one-byte length together with the ADD LINK reject reason, or a 16-bit
+ * length; smc_llc_init_msg_hdr() fills length_v2 for SMC_V2 link groups
+ * and length otherwise.
+ */
+struct smc_llc_hdr {
+ struct smc_wr_rx_hdr common;
+ union {
+ struct {
+ u8 length; /* 44 */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved:4,
+ add_link_rej_rsn:4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 add_link_rej_rsn:4,
+ reserved:4;
+#endif
+ };
+ u16 length_v2; /* 44 - 8192 */
+ };
+ u8 flags;
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+/* set in hd.flags of every CONFIRM LINK message sent */
+#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03
+
+/* CONFIRM LINK message; filled in by smc_llc_send_confirm_link() */
+struct smc_llc_msg_confirm_link { /* type 0x01 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+ u8 link_uid[SMC_LGR_ID_SIZE];
+ u8 max_links;
+ u8 max_conns;
+ u8 reserved[8];
+};
+
+/* NOTE(review): presumably a hd.flags bit and an add_link_rej_rsn value for
+ * rejected ADD LINK requests; their users are not in this part of the file.
+ */
+#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40
+#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1
+
+/* ADD LINK message. The qp_mtu/reserved3 nibbles are declared in the order
+ * selected by the bitfield endianness macros.
+ */
+struct smc_llc_msg_add_link { /* type 0x02 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 reserved2[2];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ qp_mtu : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ reserved3 : 4;
+#endif
+ u8 initial_psn[3];
+ u8 reserved[8];
+};
+
+/* One rkey entry as embedded in ADD LINK CONTINUATION messages and in the
+ * ADD LINK v2 extension; all fields are big endian.
+ */
+struct smc_llc_msg_add_link_cont_rt {
+ __be32 rmb_key;
+ __be32 rmb_key_new;
+ __be64 rmb_vaddr_new;
+};
+
+/* Extension of the ADD LINK message for SMC v2. It ends in a flexible array
+ * of rkey entries whose count is given by num_rkeys.
+ */
+struct smc_llc_msg_add_link_v2_ext {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 v2_direct : 1,
+ reserved : 7;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 7,
+ v2_direct : 1;
+#endif
+ u8 reserved2;
+ u8 client_target_gid[SMC_GID_SIZE];
+ u8 reserved3[8];
+ u16 num_rkeys;
+ struct smc_llc_msg_add_link_cont_rt rt[];
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+/* REQUEST ADD LINK message (v2); ends in a flexible array of GIDs whose
+ * count is given by gid_cnt.
+ */
+struct smc_llc_msg_req_add_link_v2 {
+ struct smc_llc_hdr hd;
+ u8 reserved[20];
+ u8 gid_cnt;
+ u8 reserved2[3];
+ u8 gid[][SMC_GID_SIZE];
+};
+
+/* number of rkey entries carried by one ADD LINK CONTINUATION message */
+#define SMC_LLC_RKEYS_PER_CONT_MSG 2
+
+/* ADD LINK CONTINUATION message */
+struct smc_llc_msg_add_link_cont { /* type 0x03 */
+ struct smc_llc_hdr hd;
+ u8 link_num;
+ u8 num_rkeys;
+ u8 reserved2[2];
+ struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG];
+ u8 reserved[4];
+} __packed; /* format defined in RFC7609 */
+
+/* NOTE(review): presumably hd.flags bits of DELETE LINK messages; their
+ * users are not in this part of the file.
+ */
+#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40
+#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20
+
+/* DELETE LINK message; reason is big endian */
+struct smc_llc_msg_del_link { /* type 0x04 */
+ struct smc_llc_hdr hd;
+ u8 link_num;
+ __be32 reason;
+ u8 reserved[35];
+} __packed; /* format defined in RFC7609 */
+
+/* TEST LINK message carrying 16 bytes of user data */
+struct smc_llc_msg_test_link { /* type 0x07 */
+ struct smc_llc_hdr hd;
+ u8 user_data[16];
+ u8 reserved[24];
+};
+
+/* One rtoken entry of a CONFIRM RKEY message. smc_llc_send_confirm_rkey()
+ * stores the number of additional rtokens in the first byte of entry 0 and
+ * the link id in the first byte of every further entry.
+ */
+struct smc_rmb_rtoken {
+ union {
+ u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */
+ /* is actually the num of rtokens, first */
+ /* rtoken is always for the current link */
+ u8 link_id; /* link id of the rtoken */
+ };
+ __be32 rmb_key;
+ __be64 rmb_vaddr;
+} __packed; /* format defined in RFC7609 */
+
+/* number of rtoken entries in one CONFIRM RKEY message */
+#define SMC_LLC_RKEYS_PER_MSG 3
+#define SMC_LLC_RKEYS_PER_MSG_V2 255
+
+/* CONFIRM RKEY message; rtoken[0] describes the link the message is sent
+ * on (see smc_llc_send_confirm_rkey())
+ */
+struct smc_llc_msg_confirm_rkey { /* type 0x06 */
+ struct smc_llc_hdr hd;
+ struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
+ u8 reserved;
+};
+
+/* maximum number of rkeys carried by one DELETE RKEY message */
+#define SMC_LLC_DEL_RKEY_MAX		8
+#define SMC_LLC_FLAG_RKEY_RETRY		0x10
+#define SMC_LLC_FLAG_RKEY_NEG		0x20
+
+/* DELETE RKEY message. smc_llc_send_delete_rkey() sends exactly one rkey
+ * (num_rkeys = 1, rkey[0]); the rkeys are big endian.
+ */
+struct smc_llc_msg_delete_rkey {	/* type 0x09 */
+	struct smc_llc_hdr hd;
+	u8 num_rkeys;
+	u8 err_mask;
+	u8 reserved[2];
+	__be32 rkey[SMC_LLC_DEL_RKEY_MAX];
+	u8 reserved2[4];
+};
+
+/* DELETE RKEY message (v2); ends in a flexible array of big endian rkeys */
+struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */
+ struct smc_llc_hdr hd;
+ u8 num_rkeys;
+ u8 num_inval_rkeys;
+ u8 reserved[2];
+ __be32 rkey[];
+};
+
+/* Overlay of the fixed-size LLC message formats. The raw view gives access
+ * to the header alone, e.g. to read the message type before the concrete
+ * format is known. smc_llc_add_pending_send() asserts at build time that
+ * the union fits SMC_WR_BUF_SIZE and equals SMC_WR_TX_SIZE.
+ */
+union smc_llc_msg {
+ struct smc_llc_msg_confirm_link confirm_link;
+ struct smc_llc_msg_add_link add_link;
+ struct smc_llc_msg_req_add_link_v2 req_add_link;
+ struct smc_llc_msg_add_link_cont add_link_cont;
+ struct smc_llc_msg_del_link delete_link;
+
+ struct smc_llc_msg_confirm_rkey confirm_rkey;
+ struct smc_llc_msg_delete_rkey delete_rkey;
+
+ struct smc_llc_msg_test_link test_link;
+ struct {
+ struct smc_llc_hdr hdr;
+ u8 data[SMC_LLC_DATA_LEN];
+ } raw;
+};
+
+/* hd.flags bit marking a message as a response */
+#define SMC_LLC_FLAG_RESP 0x80
+
+/* Queue entry holding one LLC message together with the link it belongs
+ * to. Entries are attached to an LLC flow or parked as the link group's
+ * delayed event, and are released with kfree().
+ */
+struct smc_llc_qentry {
+ struct list_head list;
+ struct smc_link *link;
+ union smc_llc_msg msg;
+};
+
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc);
+
+/* Detach the queue entry from @flow and hand it to the caller, who becomes
+ * responsible for it. Returns NULL when the flow has no entry.
+ */
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow)
+{
+	struct smc_llc_qentry *detached;
+
+	detached = flow->qentry;
+	flow->qentry = NULL;
+	return detached;
+}
+
+/* Detach and free the queue entry of @flow, if it has one. */
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow)
+{
+	struct smc_llc_qentry *stale = flow->qentry;
+
+	if (!stale)
+		return;
+	flow->qentry = NULL;
+	kfree(stale);
+}
+
+/* Attach @qentry to @flow; an entry already attached is overwritten, not
+ * freed.
+ */
+static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow,
+ struct smc_llc_qentry *qentry)
+{
+ flow->qentry = qentry;
+}
+
+/* Deal with an LLC message that arrives while a flow of type @flow_type is
+ * already active.
+ *
+ * An ADD LINK or DELETE LINK message that differs from the active flow is
+ * parked as the link group's delayed event, provided no other event is
+ * parked yet. Every other message is freed, with a one-time warning when
+ * it differs from the active flow.
+ */
+static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
+				  struct smc_llc_qentry *qentry)
+{
+	u8 msg_type = qentry->msg.raw.hdr.common.llc_type;
+	bool link_msg = msg_type == SMC_LLC_ADD_LINK ||
+			msg_type == SMC_LLC_DELETE_LINK;
+	bool other_flow = flow_type != msg_type;
+
+	if (link_msg && other_flow && !lgr->delayed_event) {
+		lgr->delayed_event = qentry;
+		return;
+	}
+	/* drop parallel or already-in-progress llc requests */
+	if (other_flow)
+		pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel "
+			     "LLC msg: msg %d flow %d role %d\n",
+			     SMC_LGR_ID_SIZE, &lgr->id,
+			     lgr->net->net_cookie,
+			     qentry->msg.raw.hdr.common.type,
+			     flow_type, lgr->role);
+	kfree(qentry);
+}
+
+/* Try to start a new llc flow that is initiated by an incoming llc msg.
+ *
+ * Returns true when @flow was idle: its type is derived from the message
+ * type and @qentry is attached to it. Returns false when the flow is
+ * already active; @qentry is then handed to smc_llc_flow_parallel(), which
+ * parks or frees it.
+ */
+static bool smc_llc_flow_start(struct smc_llc_flow *flow,
+			       struct smc_llc_qentry *qentry)
+{
+	struct smc_link_group *lgr = qentry->link->lgr;
+	bool started = false;
+	u8 msg_type;
+
+	spin_lock_bh(&lgr->llc_flow_lock);
+	if (flow->type) {
+		/* a flow is already active */
+		smc_llc_flow_parallel(lgr, flow->type, qentry);
+		goto unlock;
+	}
+
+	msg_type = qentry->msg.raw.hdr.common.llc_type;
+	if (msg_type == SMC_LLC_ADD_LINK)
+		flow->type = SMC_LLC_FLOW_ADD_LINK;
+	else if (msg_type == SMC_LLC_DELETE_LINK)
+		flow->type = SMC_LLC_FLOW_DEL_LINK;
+	else if (msg_type == SMC_LLC_CONFIRM_RKEY ||
+		 msg_type == SMC_LLC_DELETE_RKEY)
+		flow->type = SMC_LLC_FLOW_RKEY;
+	else
+		flow->type = SMC_LLC_FLOW_NONE;
+	smc_llc_flow_qentry_set(flow, qentry);
+	started = true;
+unlock:
+	spin_unlock_bh(&lgr->llc_flow_lock);
+	return started;
+}
+
+/* Start a new local llc flow of @type, waiting until the current flow has
+ * finished.
+ *
+ * All flows except confirm_rkey and delete_rkey are exclusive; a local rkey
+ * flow may run while a remote rkey flow is active.
+ *
+ * Returns 0 once the local flow is claimed, -ENODEV when the link group is
+ * no longer on a list, -ETIMEDOUT when waiting timed out.
+ */
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+			  enum smc_llc_flowtype type)
+{
+	enum smc_llc_flowtype allowed_remote;
+	int rc;
+
+	allowed_remote = type == SMC_LLC_FLOW_RKEY ? SMC_LLC_FLOW_RKEY :
+						     SMC_LLC_FLOW_NONE;
+	for (;;) {
+		if (list_empty(&lgr->list))
+			return -ENODEV;
+
+		spin_lock_bh(&lgr->llc_flow_lock);
+		if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+		    (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+		     lgr->llc_flow_rmt.type == allowed_remote)) {
+			lgr->llc_flow_lcl.type = type;
+			spin_unlock_bh(&lgr->llc_flow_lock);
+			return 0;
+		}
+		spin_unlock_bh(&lgr->llc_flow_lock);
+
+		/* re-check under the lock after every wakeup */
+		rc = wait_event_timeout(lgr->llc_flow_waiter,
+				(list_empty(&lgr->list) ||
+				 (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+				  (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+				   lgr->llc_flow_rmt.type == allowed_remote))),
+				SMC_LLC_WAIT_TIME * 10);
+		if (!rc)
+			return -ETIMEDOUT;
+	}
+}
+
+/* Finish the current llc flow: reset @flow and either wake up waiters for
+ * the next flow or, when the local flow ended with an event still parked
+ * on a listed link group, schedule the llc event worker.
+ */
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow)
+{
+	spin_lock_bh(&lgr->llc_flow_lock);
+	memset(flow, 0, sizeof(*flow));
+	flow->type = SMC_LLC_FLOW_NONE;
+	spin_unlock_bh(&lgr->llc_flow_lock);
+
+	if (list_empty(&lgr->list) || !lgr->delayed_event ||
+	    flow != &lgr->llc_flow_lcl) {
+		wake_up(&lgr->llc_flow_waiter);
+		return;
+	}
+	schedule_work(&lgr->llc_event_work);
+}
+
+/* Wait up to @time_out for an LLC message on the local flow of @lgr.
+ *
+ * @lnk is optional and used for early wakeup when the link goes down,
+ * useful in cases where we wait for a response on the link after we sent a
+ * request. @exp_msg is the expected message type; 0 accepts any type.
+ *
+ * Returns the flow's queue entry, which stays attached to the flow, or
+ * NULL when nothing usable arrived. An unexpected DELETE LINK received
+ * while waiting for ADD LINK is passed to smc_llc_flow_start() to be
+ * delayed; other unexpected messages are dropped with a one-time warning.
+ */
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+				    struct smc_link *lnk,
+				    int time_out, u8 exp_msg)
+{
+	struct smc_llc_flow *flow = &lgr->llc_flow_lcl;
+	u8 got_msg;
+
+	wait_event_timeout(lgr->llc_msg_waiter,
+			   (flow->qentry ||
+			    (lnk && !smc_link_usable(lnk)) ||
+			    list_empty(&lgr->list)),
+			   time_out);
+	if (!flow->qentry ||
+	    (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) {
+		/* timeout, unusable link or link group gone */
+		smc_llc_flow_qentry_del(flow);
+		return flow->qentry;
+	}
+
+	got_msg = flow->qentry->msg.raw.hdr.common.llc_type;
+	if (!exp_msg || got_msg == exp_msg)
+		return flow->qentry;
+
+	if (exp_msg == SMC_LLC_ADD_LINK && got_msg == SMC_LLC_DELETE_LINK) {
+		/* flow_start will delay the unexpected msg */
+		smc_llc_flow_start(&lgr->llc_flow_lcl,
+				   smc_llc_flow_qentry_clr(flow));
+		return NULL;
+	}
+	pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: "
+		     "msg %d exp %d flow %d role %d flags %x\n",
+		     SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie,
+		     got_msg, exp_msg,
+		     flow->type, lgr->role,
+		     flow->qentry->msg.raw.hdr.flags);
+	smc_llc_flow_qentry_del(flow);
+	return flow->qentry;
+}
+
+/********************************** send *************************************/
+
+/* Private data of a pending LLC send; currently empty. Its size is checked
+ * against SMC_WR_TX_PEND_PRIV_SIZE in smc_llc_add_pending_send().
+ */
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg; it is passed as
+ * the completion handler when a tx slot is reserved and currently does
+ * nothing
+ */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ /* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - reserve a tx slot for an LLC control message
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ *	  It's the context the transmit complete handler will get.
+ *
+ * Obtains a free send slot from smc_wr_tx_get_free_slot() with
+ * smc_llc_tx_handler() as completion handler. The size relations between
+ * the LLC message formats and the work request buffers are verified at
+ * build time.
+ * NOTE(review): presumably this can sleep while waiting for a free slot
+ * when not called in softirq context; confirm against
+ * smc_wr_tx_get_free_slot().
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+				    struct smc_wr_buf **wr_buf,
+				    struct smc_wr_tx_pend_priv **pend)
+{
+	int rc;
+
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+	BUILD_BUG_ON_MSG(
+		sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+	BUILD_BUG_ON_MSG(
+		sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+		"must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+
+	rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL,
+				     pend);
+	return rc < 0 ? rc : 0;
+}
+
+/* Reserve the v2 send slot of @link for an LLC message, with
+ * smc_llc_tx_handler() as completion handler.
+ * Returns 0 on success, otherwise the negative value reported by
+ * smc_wr_tx_get_v2_slot().
+ */
+static int smc_llc_add_pending_send_v2(struct smc_link *link,
+				       struct smc_wr_v2_buf **wr_buf,
+				       struct smc_wr_tx_pend_priv **pend)
+{
+	int rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend);
+
+	return rc < 0 ? rc : 0;
+}
+
+/* Fill version and length of an LLC header according to the SMC version of
+ * @lgr: SMC_V2 link groups get llc_version SMC_V2 and the 16-bit length_v2,
+ * all others get llc_version 0 and the one-byte length.
+ */
+static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr,
+				 struct smc_link_group *lgr, size_t len)
+{
+	bool is_v2 = lgr->smc_version == SMC_V2;
+
+	hdr->common.llc_version = is_v2 ? SMC_V2 : 0;
+	if (is_v2)
+		hdr->length_v2 = len;
+	else
+		hdr->length = len;
+}
+
+/* High-level API to send an LLC CONFIRM LINK message on @link, as request
+ * or as response depending on @reqresp.
+ *
+ * Returns -ENOLINK when the link cannot be held, the error from reserving
+ * a send slot, or the result of smc_wr_tx_send().
+ */
+int smc_llc_send_confirm_link(struct smc_link *link,
+			      enum smc_llc_reqresp reqresp)
+{
+	struct smc_llc_msg_confirm_link *msg;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	struct smc_link_group *lgr;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		goto put_out;
+
+	lgr = link->lgr;
+	msg = (struct smc_llc_msg_confirm_link *)wr_buf;
+	memset(msg, 0, sizeof(*msg));
+	msg->hd.common.llc_type = SMC_LLC_CONFIRM_LINK;
+	smc_llc_init_msg_hdr(&msg->hd, lgr, sizeof(*msg));
+	msg->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
+	if (reqresp == SMC_LLC_RESP)
+		msg->hd.flags |= SMC_LLC_FLAG_RESP;
+
+	/* identify the sending side of the link */
+	memcpy(msg->sender_mac, link->smcibdev->mac[link->ibport - 1],
+	       ETH_ALEN);
+	memcpy(msg->sender_gid, link->gid, SMC_GID_SIZE);
+	hton24(msg->sender_qp_num, link->roce_qp->qp_num);
+	msg->link_num = link->link_id;
+	memcpy(msg->link_uid, link->link_uid, SMC_LGR_ID_SIZE);
+
+	/* the link group's own limits are only sent to v2 peers of
+	 * release 1 or later
+	 */
+	if (lgr->smc_version == SMC_V2 &&
+	    lgr->peer_smc_release >= SMC_RELEASE_1) {
+		msg->max_conns = lgr->max_conns;
+		msg->max_links = lgr->max_links;
+	} else {
+		msg->max_links = SMC_LINKS_ADD_LNK_MAX;
+	}
+
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Send an LLC CONFIRM RKEY request for @rmb_desc on @send_link.
+ *
+ * rtoken[0] describes the buffer on @send_link and carries the number of
+ * further rtokens; one more rtoken is added for every other active link of
+ * the link group.
+ *
+ * Returns -ENOLINK when the link cannot be held, the error from reserving
+ * a send slot, or the result of smc_wr_tx_send().
+ */
+static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
+				     struct smc_buf_desc *rmb_desc)
+{
+	struct smc_llc_msg_confirm_rkey *msg;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_rmb_rtoken *rtok;
+	struct smc_wr_buf *wr_buf;
+	struct smc_link *link;
+	int idx, rc, others = 0;
+
+	if (!smc_wr_tx_link_hold(send_link))
+		return -ENOLINK;
+	rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend);
+	if (rc)
+		goto put_out;
+	msg = (struct smc_llc_msg_confirm_rkey *)wr_buf;
+	memset(msg, 0, sizeof(*msg));
+	msg->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY;
+	smc_llc_init_msg_hdr(&msg->hd, send_link->lgr, sizeof(*msg));
+
+	/* rtokens of the other active links start at index 1 */
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		link = &send_link->lgr->lnk[idx];
+		if (!smc_link_active(link) || link == send_link)
+			continue;
+		rtok = &msg->rtoken[1 + others];
+		rtok->link_id = link->link_id;
+		rtok->rmb_key = htonl(rmb_desc->mr[link->link_idx]->rkey);
+		rtok->rmb_vaddr = rmb_desc->is_vm ?
+			cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) :
+			cpu_to_be64((u64)sg_dma_address
+				    (rmb_desc->sgt[link->link_idx].sgl));
+		others++;
+	}
+
+	/* rkey of send_link is in rtoken[0] */
+	rtok = &msg->rtoken[0];
+	rtok->num_rkeys = others;
+	rtok->rmb_key = htonl(rmb_desc->mr[send_link->link_idx]->rkey);
+	rtok->rmb_vaddr = rmb_desc->is_vm ?
+		cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) :
+		cpu_to_be64((u64)sg_dma_address
+			    (rmb_desc->sgt[send_link->link_idx].sgl));
+
+	/* send llc message */
+	rc = smc_wr_tx_send(send_link, pend);
+put_out:
+	smc_wr_tx_link_put(send_link);
+	return rc;
+}
+
+/* Send an LLC DELETE RKEY request on @link carrying the single rkey that
+ * @rmb_desc has on this link.
+ *
+ * Returns -ENOLINK when the link cannot be held, the error from reserving
+ * a send slot, or the result of smc_wr_tx_send().
+ */
+static int smc_llc_send_delete_rkey(struct smc_link *link,
+				    struct smc_buf_desc *rmb_desc)
+{
+	struct smc_llc_msg_delete_rkey *msg;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (!rc) {
+		msg = (struct smc_llc_msg_delete_rkey *)wr_buf;
+		memset(msg, 0, sizeof(*msg));
+		msg->hd.common.llc_type = SMC_LLC_DELETE_RKEY;
+		smc_llc_init_msg_hdr(&msg->hd, link->lgr, sizeof(*msg));
+		msg->num_rkeys = 1;
+		msg->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey);
+		/* send llc message */
+		rc = smc_wr_tx_send(link, pend);
+	}
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Starting at list index *@buf_lst, return the head entry of the first
+ * non-empty lgr->rmbs[] list. *@buf_lst is advanced past every empty list
+ * and is left pointing at the list the returned entry belongs to.
+ * Returns NULL once all SMC_RMBE_SIZES lists have been visited.
+ */
+static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
+						  int *buf_lst)
+{
+	struct smc_buf_desc *head;
+
+	for (; *buf_lst < SMC_RMBE_SIZES; (*buf_lst)++) {
+		head = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
+						struct smc_buf_desc, list);
+		if (head)
+			return head;
+	}
+	return NULL;
+}
+
+/* Return the RMB following @buf_pos in the lgr->rmbs[] lists.
+ * With @buf_pos == NULL the search starts at the head of list *@buf_lst.
+ * When @buf_pos is the last entry of its list, *@buf_lst is incremented and
+ * the head of the next non-empty list is returned (NULL if there is none).
+ */
+static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
+						 int *buf_lst,
+						 struct smc_buf_desc *buf_pos)
+{
+	if (buf_pos) {
+		/* still entries left in the current list? */
+		if (!list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst]))
+			return list_next_entry(buf_pos, list);
+		/* current list exhausted, continue with the next one */
+		(*buf_lst)++;
+	}
+	return _smc_llc_get_next_rmb(lgr, buf_lst);
+}
+
+/* Reset *@buf_lst to the first list and return the first RMB found in any
+ * of the lgr->rmbs[] lists, or NULL when all lists are empty.
+ */
+static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
+						  int *buf_lst)
+{
+	*buf_lst = 0;
+	/* no current position: take the head of the first non-empty list */
+	return _smc_llc_get_next_rmb(lgr, buf_lst);
+}
+
+/* Fill the v2 extension of an ADD LINK message.
+ * Stores the v2_direct indication and the gid of @link_new and then, under
+ * lgr->rmbs_lock, writes one rt[] entry per used RMB: the rkey on the
+ * existing @link plus rkey and address on @link_new.
+ * ext->num_rkeys is set to the number of entries actually written, so it
+ * always matches the returned length (smc_llc_add_link_cont() corrects its
+ * num_rkeys the same way when it runs out of used buffers).
+ * Returns sizeof(*ext) plus the size of the written rt[] entries.
+ */
+static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext,
+			       struct smc_link *link, struct smc_link *link_new)
+{
+	struct smc_link_group *lgr = link->lgr;
+	int prim_lnk_idx = link->link_idx;
+	int lnk_idx = link_new->link_idx;
+	struct smc_buf_desc *buf_pos;
+	int len = sizeof(*ext);
+	int buf_lst, i;
+
+	ext->v2_direct = !lgr->uses_gateway;
+	memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE);
+
+	down_write(&lgr->rmbs_lock);
+	ext->num_rkeys = lgr->conns_num;
+	if (!ext->num_rkeys)
+		goto out;
+	buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+	for (i = 0; i < ext->num_rkeys; i++) {
+		/* skip buffers that are currently not in use */
+		while (buf_pos && !buf_pos->used)
+			buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+		if (!buf_pos)
+			break;
+		ext->rt[i].rmb_key = htonl(buf_pos->mr[prim_lnk_idx]->rkey);
+		ext->rt[i].rmb_key_new = htonl(buf_pos->mr[lnk_idx]->rkey);
+		ext->rt[i].rmb_vaddr_new = buf_pos->is_vm ?
+			cpu_to_be64((uintptr_t)buf_pos->cpu_addr) :
+			cpu_to_be64((u64)sg_dma_address
+				    (buf_pos->sgt[lnk_idx].sgl));
+		buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+	}
+	/* fewer used RMBs than connections: announce only what was filled,
+	 * the peer must not parse rt[] entries beyond the sent length
+	 */
+	ext->num_rkeys = i;
+	len += i * sizeof(ext->rt[0]);
+out:
+	up_write(&lgr->rmbs_lock);
+	return len;
+}
+
+/* Send an ADD LINK request or response on @link.
+ * @mac and @gid are copied into the sender fields of the message. When
+ * @link_new is given, its link id, QP number, initial PSN and MTU are
+ * included; for a link group of version SMC_V2 the v2 extension is filled
+ * as well and the message is sent with its real length.
+ * Returns -ENOLINK when the link cannot be held, otherwise the result of
+ * slot reservation or of the send call.
+ */
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+			  struct smc_link *link_new,
+			  enum smc_llc_reqresp reqresp)
+{
+	struct smc_llc_msg_add_link_v2_ext *v2_ext = NULL;
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_llc_msg_add_link *msg;
+	int msg_len = sizeof(*msg);
+	bool is_v2;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	is_v2 = link->lgr->smc_version == SMC_V2;
+	if (is_v2) {
+		struct smc_wr_v2_buf *v2_buf;
+
+		rc = smc_llc_add_pending_send_v2(link, &v2_buf, &tx_pend);
+		if (rc)
+			goto put_out;
+		msg = (struct smc_llc_msg_add_link *)v2_buf;
+		/* the extension directly follows the base message */
+		v2_ext = (struct smc_llc_msg_add_link_v2_ext *)
+				&v2_buf->raw[sizeof(*msg)];
+		memset(v2_ext, 0, SMC_WR_TX_SIZE);
+	} else {
+		struct smc_wr_buf *v1_buf;
+
+		rc = smc_llc_add_pending_send(link, &v1_buf, &tx_pend);
+		if (rc)
+			goto put_out;
+		msg = (struct smc_llc_msg_add_link *)v1_buf;
+	}
+
+	memset(msg, 0, sizeof(*msg));
+	msg->hd.common.llc_type = SMC_LLC_ADD_LINK;
+	if (reqresp == SMC_LLC_RESP)
+		msg->hd.flags |= SMC_LLC_FLAG_RESP;
+	memcpy(msg->sender_mac, mac, ETH_ALEN);
+	memcpy(msg->sender_gid, gid, SMC_GID_SIZE);
+	if (link_new) {
+		msg->link_num = link_new->link_id;
+		hton24(msg->sender_qp_num, link_new->roce_qp->qp_num);
+		hton24(msg->initial_psn, link_new->psn_initial);
+		/* a response carries the smaller of local and peer MTU */
+		msg->qp_mtu = reqresp == SMC_LLC_REQ ?
+			link_new->path_mtu :
+			min(link_new->path_mtu, link_new->peer_mtu);
+		if (v2_ext)
+			msg_len += smc_llc_fill_ext_v2(v2_ext, link, link_new);
+	}
+	smc_llc_init_msg_hdr(&msg->hd, link->lgr, msg_len);
+	/* send llc message */
+	rc = is_v2 ? smc_wr_tx_v2_send(link, tx_pend, msg_len) :
+		     smc_wr_tx_send(link, tx_pend);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Send a DELETE LINK request or response on @link.
+ * @link_del_id names the link to delete; 0 requests deletion of all links
+ * (SMC_LLC_FLAG_DEL_LINK_ALL). @orderly sets the orderly flag and @reason
+ * is transmitted in network byte order.
+ * Returns -ENOLINK when the link cannot be held, otherwise the result of
+ * slot reservation or of smc_wr_tx_send().
+ */
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+			     enum smc_llc_reqresp reqresp, bool orderly,
+			     u32 reason)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_llc_msg_del_link *msg;
+	struct smc_wr_buf *tx_buf;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	rc = smc_llc_add_pending_send(link, &tx_buf, &tx_pend);
+	if (!rc) {
+		msg = (struct smc_llc_msg_del_link *)tx_buf;
+		memset(msg, 0, sizeof(*msg));
+		msg->hd.common.llc_type = SMC_LLC_DELETE_LINK;
+		smc_llc_init_msg_hdr(&msg->hd, link->lgr, sizeof(*msg));
+		if (reqresp == SMC_LLC_RESP)
+			msg->hd.flags |= SMC_LLC_FLAG_RESP;
+		if (orderly)
+			msg->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+		/* without a link id the whole link group is meant */
+		if (!link_del_id)
+			msg->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+		else
+			msg->link_num = link_del_id;
+		msg->reason = htonl(reason);
+		/* send llc message */
+		rc = smc_wr_tx_send(link, tx_pend);
+	}
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Send an LLC TEST LINK request on @link carrying @user_data.
+ * Returns -ENOLINK when the link cannot be held, otherwise the result of
+ * slot reservation or of smc_wr_tx_send().
+ */
+static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_llc_msg_test_link *msg;
+	struct smc_wr_buf *tx_buf;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	rc = smc_llc_add_pending_send(link, &tx_buf, &tx_pend);
+	if (!rc) {
+		msg = (struct smc_llc_msg_test_link *)tx_buf;
+		memset(msg, 0, sizeof(*msg));
+		msg->hd.common.llc_type = SMC_LLC_TEST_LINK;
+		smc_llc_init_msg_hdr(&msg->hd, link->lgr, sizeof(*msg));
+		memcpy(msg->user_data, user_data, sizeof(msg->user_data));
+		/* send llc message */
+		rc = smc_wr_tx_send(link, tx_pend);
+	}
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Schedule an llc send on @link, may wait for buffers.
+ * sizeof(union smc_llc_msg) bytes are copied from @llcbuf into the send
+ * buffer, so @llcbuf must provide at least that many bytes.
+ * Returns -ENOLINK when the link cannot be held, otherwise the result of
+ * slot reservation or of smc_wr_tx_send().
+ */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_buf *tx_buf;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	rc = smc_llc_add_pending_send(link, &tx_buf, &tx_pend);
+	if (!rc) {
+		memcpy(tx_buf, llcbuf, sizeof(union smc_llc_msg));
+		rc = smc_wr_tx_send(link, tx_pend);
+	}
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Schedule an llc send on @link, may wait for buffers, and wait up to
+ * SMC_LLC_WAIT_TIME for the send completion notification.
+ * sizeof(union smc_llc_msg) bytes are copied from @llcbuf.
+ * @return 0 on success, -ENOLINK when the link cannot be held, otherwise
+ * the error of slot reservation or of smc_wr_tx_send_wait()
+ */
+static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf)
+{
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_buf *tx_buf;
+	int rc;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+
+	rc = smc_llc_add_pending_send(link, &tx_buf, &tx_pend);
+	if (!rc) {
+		memcpy(tx_buf, llcbuf, sizeof(union smc_llc_msg));
+		rc = smc_wr_tx_send_wait(link, tx_pend, SMC_LLC_WAIT_TIME);
+	}
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/********************************* receive ***********************************/
+
+/* Pick an unused slot in lgr->lnk[] for an additional link.
+ * A symmetric link group takes no further link, and an asymmetric link is
+ * only added to a link group of type SMC_LGR_SINGLE. For an asymmetric
+ * target type the slots are searched from the highest index downwards,
+ * otherwise from index 0 upwards.
+ * Returns the slot index, or -EMLINK when no link may or can be added.
+ */
+static int smc_llc_alloc_alt_link(struct smc_link_group *lgr,
+				  enum smc_lgr_type lgr_new_t)
+{
+	bool want_asym = lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+			 lgr_new_t == SMC_LGR_ASYMMETRIC_PEER;
+	int idx;
+
+	if (lgr->type == SMC_LGR_SYMMETRIC)
+		return -EMLINK;
+	if (want_asym && lgr->type != SMC_LGR_SINGLE)
+		return -EMLINK;
+
+	if (want_asym) {
+		for (idx = SMC_LINKS_PER_LGR_MAX - 1; idx >= 0; idx--) {
+			if (lgr->lnk[idx].state == SMC_LNK_UNUSED)
+				return idx;
+		}
+	} else {
+		for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+			if (lgr->lnk[idx].state == SMC_LNK_UNUSED)
+				return idx;
+		}
+	}
+	return -EMLINK;
+}
+
+/* Send one ADD LINK CONTINUE message on @link.
+ * Up to SMC_LLC_RKEYS_PER_CONT_MSG rtokens of used RMBs are packed into the
+ * message, resuming the buffer walk at *@buf_lst / *@buf_pos and leaving
+ * both at the position to resume from. *@num_rkeys_todo is decremented per
+ * packed rtoken; if the used buffers run out first it is set to 0 and the
+ * message's num_rkeys is reduced to the number packed into this message.
+ * The response flag is set when the link group role is SMC_CLNT.
+ * Returns -ENOLINK when the link cannot be held, otherwise the result of
+ * slot reservation or of smc_wr_tx_send().
+ */
+static int smc_llc_add_link_cont(struct smc_link *link,
+				 struct smc_link *link_new, u8 *num_rkeys_todo,
+				 int *buf_lst, struct smc_buf_desc **buf_pos)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_msg_add_link_cont *msg;
+	struct smc_wr_tx_pend_priv *tx_pend;
+	int old_idx, new_idx, slot, rc;
+	struct smc_wr_buf *tx_buf;
+	struct smc_buf_desc *cur;
+	u8 todo_at_entry, slots;
+
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
+	rc = smc_llc_add_pending_send(link, &tx_buf, &tx_pend);
+	if (rc)
+		goto put_out;
+	msg = (struct smc_llc_msg_add_link_cont *)tx_buf;
+	memset(msg, 0, sizeof(*msg));
+
+	old_idx = link->link_idx;
+	new_idx = link_new->link_idx;
+	msg->link_num = link_new->link_id;
+	todo_at_entry = *num_rkeys_todo;
+	msg->num_rkeys = todo_at_entry;
+	slots = min_t(u8, todo_at_entry, SMC_LLC_RKEYS_PER_CONT_MSG);
+	for (slot = 0; slot < slots; slot++) {
+		/* skip buffers that are currently not in use */
+		while (*buf_pos && !(*buf_pos)->used)
+			*buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
+		cur = *buf_pos;
+		if (!cur) {
+			/* no used buffer left: nothing more will follow */
+			msg->num_rkeys = msg->num_rkeys - *num_rkeys_todo;
+			*num_rkeys_todo = 0;
+			break;
+		}
+
+		msg->rt[slot].rmb_key = htonl(cur->mr[old_idx]->rkey);
+		msg->rt[slot].rmb_key_new = htonl(cur->mr[new_idx]->rkey);
+		msg->rt[slot].rmb_vaddr_new = cur->is_vm ?
+			cpu_to_be64((uintptr_t)cur->cpu_addr) :
+			cpu_to_be64((u64)sg_dma_address(cur->sgt[new_idx].sgl));
+
+		(*num_rkeys_todo)--;
+		*buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, cur);
+	}
+	msg->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT;
+	msg->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
+	if (lgr->role == SMC_CLNT)
+		msg->hd.flags |= SMC_LLC_FLAG_RESP;
+	rc = smc_wr_tx_send(link, tx_pend);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
+}
+
+/* Client part of the rkey exchange for a new link.
+ * Under lgr->rmbs_lock, repeatedly waits for an ADD LINK CONTINUE message,
+ * stores the received rtokens for @link_new and answers with an
+ * ADD LINK CONTINUE message carrying the local rtokens, until neither side
+ * has rkeys left.
+ * Returns 0 on success, -ETIMEDOUT when no message arrives within
+ * SMC_LLC_WAIT_TIME, or the error of smc_llc_add_link_cont().
+ */
+static int smc_llc_cli_rkey_exchange(struct smc_link *link,
+				     struct smc_link *link_new)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_msg_add_link_cont *cont;
+	u8 in_msg, todo_send, todo_recv;
+	struct smc_llc_qentry *qentry;
+	struct smc_buf_desc *rmb_pos;
+	int rmb_lst;
+	int rc = 0;
+	int n;
+
+	down_write(&lgr->rmbs_lock);
+	todo_send = lgr->conns_num;
+	rmb_pos = smc_llc_get_first_rmb(lgr, &rmb_lst);
+	do {
+		qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME,
+				      SMC_LLC_ADD_LINK_CONT);
+		if (!qentry) {
+			rc = -ETIMEDOUT;
+			break;
+		}
+		cont = &qentry->msg.add_link_cont;
+		todo_recv = cont->num_rkeys;
+		/* one message holds at most SMC_LLC_RKEYS_PER_CONT_MSG */
+		in_msg = min_t(u8, todo_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+		for (n = 0; n < in_msg; n++) {
+			smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+				       cont->rt[n].rmb_key,
+				       cont->rt[n].rmb_vaddr_new,
+				       cont->rt[n].rmb_key_new);
+			todo_recv--;
+		}
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+		rc = smc_llc_add_link_cont(link, link_new, &todo_send,
+					   &rmb_lst, &rmb_pos);
+		if (rc)
+			break;
+	} while (todo_send || todo_recv);
+
+	up_write(&lgr->rmbs_lock);
+	return rc;
+}
+
+/* Turn the received message in @qentry into an add link reject response
+ * (reason SMC_LLC_REJ_RSN_NO_ALT_PATH) and send it back on the link the
+ * message is associated with. Returns the result of smc_llc_send_message().
+ */
+static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry)
+{
+	struct smc_link *lnk;
+
+	qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP |
+				     SMC_LLC_FLAG_ADD_LNK_REJ;
+	qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+	lnk = qentry->link;
+	smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, lnk->lgr,
+			     sizeof(qentry->msg));
+	return smc_llc_send_message(lnk, &qentry->msg);
+}
+
+/* Client part of the CONFIRM LINK handshake for @link_new.
+ * Waits for the CONFIRM LINK request, moves the new QP to RTS, registers
+ * the link group buffers on the new link, answers with a CONFIRM LINK
+ * response and finally activates the link and sets the link group type.
+ * On every failure except an unexpected message a DELETE LINK request for
+ * @link_new is sent on @link. @ini is not used here.
+ * Returns 0 on success and -ENOLINK otherwise.
+ */
+static int smc_llc_cli_conf_link(struct smc_link *link,
+				 struct smc_init_info *ini,
+				 struct smc_link *link_new,
+				 enum smc_lgr_type lgr_new_t)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_qentry *req;
+
+	/* receive CONFIRM LINK request over RoCE fabric */
+	req = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0);
+	if (!req)
+		goto del_link;
+	if (req->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
+		/* received DELETE_LINK instead */
+		req->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+		smc_llc_send_message(link, &req->msg);
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+		return -ENOLINK;
+	}
+	smc_llc_save_peer_uid(req);
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+	if (smc_ib_modify_qp_rts(link_new))
+		goto del_link;
+	smc_wr_remember_qp_attr(link_new);
+
+	if (smcr_buf_reg_lgr(link_new))
+		goto del_link;
+
+	/* send CONFIRM LINK response over RoCE fabric */
+	if (smc_llc_send_confirm_link(link_new, SMC_LLC_RESP))
+		goto del_link;
+
+	smc_llc_link_active(link_new);
+	switch (lgr_new_t) {
+	case SMC_LGR_ASYMMETRIC_LOCAL:
+	case SMC_LGR_ASYMMETRIC_PEER:
+		smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+		break;
+	default:
+		smcr_lgr_set_type(lgr, lgr_new_t);
+		break;
+	}
+	return 0;
+
+del_link:
+	/* tell the peer to drop the new link again, result is not checked */
+	smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+				 false, SMC_LLC_DEL_LOST_PATH);
+	return -ENOLINK;
+}
+
+/* Store the rtokens from the v2 extension of a received ADD LINK message.
+ * The extension is read from lgr->wr_rx_buf_v2 at offset SMC_WR_TX_SIZE;
+ * at most SMC_LLC_RKEYS_PER_MSG_V2 entries are saved for @link_new while
+ * holding lgr->rmbs_lock.
+ */
+static void smc_llc_save_add_link_rkeys(struct smc_link *link,
+					struct smc_link *link_new)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_msg_add_link_v2_ext *v2_ext;
+	int count, n;
+
+	v2_ext = (struct smc_llc_msg_add_link_v2_ext *)
+			((u8 *)lgr->wr_rx_buf_v2 + SMC_WR_TX_SIZE);
+	count = min_t(u8, v2_ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
+
+	down_write(&lgr->rmbs_lock);
+	for (n = 0; n < count; n++)
+		smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+			       v2_ext->rt[n].rmb_key,
+			       v2_ext->rt[n].rmb_vaddr_new,
+			       v2_ext->rt[n].rmb_key_new);
+	up_write(&lgr->rmbs_lock);
+}
+
+/* Copy the peer's QP number, initial PSN, MTU, gid and MAC from the
+ * ADD LINK message @add_llc into @link.
+ */
+static void smc_llc_save_add_link_info(struct smc_link *link,
+				       struct smc_llc_msg_add_link *add_llc)
+{
+	/* 24-bit wire fields are converted to host order */
+	link->peer_qpn = ntoh24(add_llc->sender_qp_num);
+	link->peer_psn = ntoh24(add_llc->initial_psn);
+	link->peer_mtu = add_llc->qp_mtu;
+	memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN);
+}
+
+/* As an SMC client, process an add link request.
+ * Selects a local device for the new link, initializes the link in a free
+ * lgr->lnk[] slot, answers with an ADD LINK response, exchanges rkeys and
+ * runs the CONFIRM LINK handshake. If anything fails, a reject response is
+ * sent on the link @qentry arrived on and a link that was already set up
+ * is cleared again.
+ * @qentry is always freed here. Returns 0 or a negative error code; some
+ * failures (no alternate path, failed rkey exchange) still return 0.
+ */
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
+{
+	struct smc_llc_msg_add_link *req = &qentry->msg.add_link;
+	struct smc_link_group *lgr = smc_get_lgr(link);
+	enum smc_lgr_type new_type = SMC_LGR_SYMMETRIC;
+	struct smc_link *new_lnk = NULL;
+	struct smc_init_info *ini = NULL;
+	int idx, rc = 0;
+
+	if (!req->qp_mtu)
+		goto out_reject;
+
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini) {
+		rc = -ENOMEM;
+		goto out_reject;
+	}
+
+	/* link group is limited to a single link; rc is still 0 here */
+	if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1)
+		goto out_reject;
+
+	ini->vlan_id = lgr->vlan_id;
+	if (lgr->smc_version == SMC_V2) {
+		ini->check_smcrv2 = true;
+		ini->smcrv2.saddr = lgr->saddr;
+		ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req->sender_gid);
+	}
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+	/* peer offers the gid (and for v1 the mac) already used by @link */
+	if (!memcmp(req->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+	    (lgr->smc_version == SMC_V2 ||
+	     !memcmp(req->sender_mac, link->peer_mac, ETH_ALEN))) {
+		if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2)
+			goto out_reject;
+		new_type = SMC_LGR_ASYMMETRIC_PEER;
+	}
+	/* no alternate local device found: reuse the device of @link */
+	if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+		new_type = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->smcrv2.ib_dev_v2 = link->smcibdev;
+		ini->smcrv2.ib_port_v2 = link->ibport;
+	} else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
+		new_type = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->ib_dev = link->smcibdev;
+		ini->ib_port = link->ibport;
+	}
+	idx = smc_llc_alloc_alt_link(lgr, new_type);
+	if (idx < 0)
+		goto out_reject;
+	new_lnk = &lgr->lnk[idx];
+	rc = smcr_link_init(lgr, new_lnk, idx, ini);
+	if (rc)
+		goto out_reject;
+	smc_llc_save_add_link_info(new_lnk, req);
+	new_lnk->link_id = req->link_num; /* SMC server assigns link id */
+	smc_llc_link_set_uid(new_lnk);
+
+	rc = smc_ib_ready_link(new_lnk);
+	if (rc)
+		goto out_clear_lnk;
+
+	rc = smcr_buf_map_lgr(new_lnk);
+	if (rc)
+		goto out_clear_lnk;
+
+	rc = smc_llc_send_add_link(link,
+				   new_lnk->smcibdev->mac[new_lnk->ibport - 1],
+				   new_lnk->gid, new_lnk, SMC_LLC_RESP);
+	if (rc)
+		goto out_clear_lnk;
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_save_add_link_rkeys(link, new_lnk);
+	} else if (smc_llc_cli_rkey_exchange(link, new_lnk)) {
+		/* failed exchange is not reported to the caller: rc stays 0 */
+		goto out_clear_lnk;
+	}
+	rc = smc_llc_cli_conf_link(link, ini, new_lnk, new_type);
+	if (!rc)
+		goto out;
+out_clear_lnk:
+	new_lnk->state = SMC_LNK_INACTIVE;
+	smcr_link_clear(new_lnk, false);
+out_reject:
+	smc_llc_cli_add_link_reject(qentry);
+out:
+	kfree(ini);
+	kfree(qentry);
+	return rc;
+}
+
+/* Send a v2 REQUEST ADD LINK message with the local gid list on @link.
+ * Nothing is sent when the link group is already symmetric or
+ * asymmetric-peer, or when the gid list holds at most one entry.
+ * After a successful send the local flow type is switched to
+ * SMC_LLC_FLOW_REQ_ADD_LINK to wait for the peer's response.
+ */
+static void smc_llc_send_request_add_link(struct smc_link *link)
+{
+	struct smc_llc_msg_req_add_link_v2 *req;
+	struct smc_wr_tx_pend_priv *tx_pend;
+	struct smc_wr_v2_buf *v2_buf;
+	struct smc_gidlist gids;
+	int rc, msg_len, n;
+
+	if (!smc_wr_tx_link_hold(link))
+		return;
+	if (link->lgr->type == SMC_LGR_SYMMETRIC ||
+	    link->lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+		goto put_out;
+
+	smc_fill_gid_list(link->lgr, &gids, link->smcibdev, link->gid);
+	if (gids.len <= 1)
+		goto put_out;
+
+	rc = smc_llc_add_pending_send_v2(link, &v2_buf, &tx_pend);
+	if (rc)
+		goto put_out;
+	req = (struct smc_llc_msg_req_add_link_v2 *)v2_buf;
+	memset(req, 0, SMC_WR_TX_SIZE);
+
+	req->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK;
+	for (n = 0; n < gids.len; n++)
+		memcpy(req->gid[n], gids.list[n], sizeof(gids.list[0]));
+	req->gid_cnt = gids.len;
+	msg_len = sizeof(*req) + (gids.len * sizeof(gids.list[0]));
+	smc_llc_init_msg_hdr(&req->hd, link->lgr, msg_len);
+	if (!smc_wr_tx_v2_send(link, tx_pend, msg_len)) {
+		/* set REQ_ADD_LINK flow and wait for response from peer */
+		link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK;
+	}
+put_out:
+	smc_wr_tx_link_put(link);
+}
+
+/* As an SMC client, invite the server to start the add_link processing.
+ * For SMC_V2 a REQUEST ADD LINK message is sent. Otherwise, unless the
+ * link group is symmetric, asymmetric-peer or limited to one link, an
+ * alternate RoCE device is looked up and, if one exists, announced with an
+ * ADD LINK request. @qentry is always freed.
+ */
+static void smc_llc_cli_add_link_invite(struct smc_link *link,
+					struct smc_llc_qentry *qentry)
+{
+	struct smc_link_group *lgr = smc_get_lgr(link);
+	struct smc_init_info *ini = NULL;
+
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_send_request_add_link(link);
+		goto out;
+	}
+
+	/* no further link wanted for these link group types */
+	if (lgr->type == SMC_LGR_SYMMETRIC ||
+	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+		goto out;
+
+	if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1)
+		goto out;
+
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini)
+		goto out;
+
+	ini->vlan_id = lgr->vlan_id;
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+	/* only invite when an alternate device was found */
+	if (ini->ib_dev)
+		smc_llc_send_add_link(link,
+				      ini->ib_dev->mac[ini->ib_port - 1],
+				      ini->ib_gid, NULL, SMC_LLC_REQ);
+out:
+	kfree(ini);
+	kfree(qentry);
+}
+
+/* Return true when every element of llc->raw.data is zero. */
+static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
+{
+	int pos = 0;
+
+	while (pos < ARRAY_SIZE(llc->raw.data)) {
+		if (llc->raw.data[pos++])
+			return false;
+	}
+	return true;
+}
+
+/* Return true for an ADD LINK message whose raw data part is all zero,
+ * i.e. the form smc_llc_add_link_local() enqueues.
+ */
+static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
+{
+	return llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK &&
+	       smc_llc_is_empty_llc_message(llc);
+}
+
+/* Client side add link processing: take the queued entry from the local
+ * flow and, under lgr->llc_conf_mutex, either invite the server (locally
+ * enqueued request) or process the ADD LINK request received from the
+ * peer. Both handlers free the entry.
+ */
+static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
+{
+	struct smc_llc_qentry *entry;
+
+	entry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+
+	down_write(&lgr->llc_conf_mutex);
+	if (!smc_llc_is_local_add_link(&entry->msg))
+		smc_llc_cli_add_link(entry->link, entry);
+	else
+		smc_llc_cli_add_link_invite(entry->link, entry);
+	up_write(&lgr->llc_conf_mutex);
+}
+
+/* Count the links of @lgr for which smc_link_active() is true. */
+static int smc_llc_active_link_count(struct smc_link_group *lgr)
+{
+	int idx, active = 0;
+
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		if (smc_link_active(&lgr->lnk[idx]))
+			active++;
+	}
+	return active;
+}
+
+/* Find the asymmetric link when 3 links are established.
+ * First the first pair of usable links sharing the same local gid is
+ * located; of these two, the one whose peer gid is also used by another
+ * usable link is the asymmetric link.
+ * Returns NULL when no such pair or no such link exists.
+ */
+static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr)
+{
+	int a, b, k;
+
+	/* determine asymmetric link: it is one of a pair with equal gid */
+	for (a = 0; a < SMC_LINKS_PER_LGR_MAX; a++) {
+		for (b = a + 1; b < SMC_LINKS_PER_LGR_MAX; b++) {
+			if (smc_link_usable(&lgr->lnk[a]) &&
+			    smc_link_usable(&lgr->lnk[b]) &&
+			    !memcmp(lgr->lnk[a].gid, lgr->lnk[b].gid,
+				    SMC_GID_SIZE))
+				goto pair_found; /* asym_lnk is a or b */
+		}
+	}
+	return NULL; /* no asymmetric link */
+
+pair_found:
+	for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) {
+		if (!smc_link_usable(&lgr->lnk[k]))
+			continue;
+		if (k != a &&
+		    !memcmp(lgr->lnk[a].peer_gid, lgr->lnk[k].peer_gid,
+			    SMC_GID_SIZE))
+			return &lgr->lnk[a];
+		if (k != b &&
+		    !memcmp(lgr->lnk[b].peer_gid, lgr->lnk[k].peer_gid,
+			    SMC_GID_SIZE))
+			return &lgr->lnk[b];
+	}
+	return NULL;
+}
+
+/* Delete the asymmetric link of @lgr, if there is one.
+ * The connections are switched away from the asymmetric link, then a
+ * DELETE LINK request is sent on the link they were switched to and the
+ * response is awaited. If the send fails or no response arrives,
+ * smcr_link_down_cond() is called for that link. The asymmetric link is
+ * cleared in any case once it has been set to going down.
+ */
+static void smc_llc_delete_asym_link(struct smc_link_group *lgr)
+{
+	struct smc_link *asym, *target;
+
+	asym = smc_llc_find_asym_link(lgr);
+	if (!asym)
+		return; /* no asymmetric link */
+	if (!smc_link_downing(&asym->state))
+		return;
+	target = smc_switch_conns(lgr, asym, false);
+	smc_wr_tx_wait_no_pending_sends(asym);
+	if (!target)
+		goto out_free;
+	/* change flow type from ADD_LINK into DEL_LINK */
+	lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK;
+	if (smc_llc_send_delete_link(target, asym->link_id, SMC_LLC_REQ,
+				     true, SMC_LLC_DEL_NO_ASYM_NEEDED) ||
+	    !smc_llc_wait(lgr, target, SMC_LLC_WAIT_TIME,
+			  SMC_LLC_DELETE_LINK)) {
+		/* request not sent or not answered */
+		smcr_link_down_cond(target);
+		goto out_free;
+	}
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+out_free:
+	smcr_link_clear(asym, true);
+}
+
+/* Server part of the rkey exchange for a new link.
+ * Under lgr->rmbs_lock, repeatedly sends an ADD LINK CONTINUE message with
+ * the local rtokens, waits for the peer's ADD LINK CONTINUE message and
+ * stores the received rtokens for @link_new, until neither side has rkeys
+ * left.
+ * A failed send ends the exchange immediately, as in
+ * smc_llc_cli_rkey_exchange(), instead of waiting for the LLC timeout with
+ * the lock held.
+ * Returns 0 on success, the error of smc_llc_add_link_cont(), or
+ * -ETIMEDOUT when no message arrives within SMC_LLC_WAIT_TIME.
+ */
+static int smc_llc_srv_rkey_exchange(struct smc_link *link,
+				     struct smc_link *link_new)
+{
+	struct smc_llc_msg_add_link_cont *addc_llc;
+	struct smc_link_group *lgr = link->lgr;
+	u8 max, num_rkeys_send, num_rkeys_recv;
+	struct smc_llc_qentry *qentry = NULL;
+	struct smc_buf_desc *buf_pos;
+	int buf_lst;
+	int rc = 0;
+	int i;
+
+	down_write(&lgr->rmbs_lock);
+	num_rkeys_send = lgr->conns_num;
+	buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+	do {
+		rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
+					   &buf_lst, &buf_pos);
+		if (rc)
+			goto out; /* nothing was sent, no answer will come */
+		qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME,
+				      SMC_LLC_ADD_LINK_CONT);
+		if (!qentry) {
+			rc = -ETIMEDOUT;
+			goto out;
+		}
+		addc_llc = &qentry->msg.add_link_cont;
+		num_rkeys_recv = addc_llc->num_rkeys;
+		/* one message holds at most SMC_LLC_RKEYS_PER_CONT_MSG */
+		max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+		for (i = 0; i < max; i++) {
+			smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+				       addc_llc->rt[i].rmb_key,
+				       addc_llc->rt[i].rmb_vaddr_new,
+				       addc_llc->rt[i].rmb_key_new);
+			num_rkeys_recv--;
+		}
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	} while (num_rkeys_send || num_rkeys_recv);
+out:
+	up_write(&lgr->rmbs_lock);
+	return rc;
+}
+
+/* Server part of the CONFIRM LINK handshake for @link_new.
+ * Sends the CONFIRM LINK request on the new link and waits on @link for
+ * the response. On a missing or unexpected answer a DELETE LINK request
+ * for the new link is sent on @link. On success the new link is activated
+ * and the link group type is set.
+ * Returns 0 on success and -ENOLINK otherwise.
+ */
+static int smc_llc_srv_conf_link(struct smc_link *link,
+				 struct smc_link *link_new,
+				 enum smc_lgr_type lgr_new_t)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_qentry *rsp;
+
+	/* send CONFIRM LINK request over the RoCE fabric */
+	if (smc_llc_send_confirm_link(link_new, SMC_LLC_REQ))
+		return -ENOLINK;
+
+	/* receive CONFIRM LINK response over the RoCE fabric */
+	rsp = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
+	if (!rsp ||
+	    rsp->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
+		/* send DELETE LINK */
+		smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+					 false, SMC_LLC_DEL_LOST_PATH);
+		if (rsp)
+			smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+		return -ENOLINK;
+	}
+
+	smc_llc_save_peer_uid(rsp);
+	smc_llc_link_active(link_new);
+	switch (lgr_new_t) {
+	case SMC_LGR_ASYMMETRIC_LOCAL:
+	case SMC_LGR_ASYMMETRIC_PEER:
+		smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+		break;
+	default:
+		smcr_lgr_set_type(lgr, lgr_new_t);
+		break;
+	}
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	return 0;
+}
+
+/* Answer a REQUEST ADD LINK message: reuse the received message in
+ * @qentry, mark it as response, clear its data part and send it back on
+ * the link it is associated with. The send result is not evaluated.
+ */
+static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry)
+{
+	union smc_llc_msg *rsp = &qentry->msg;
+
+	rsp->raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+	smc_llc_init_msg_hdr(&rsp->raw.hdr, qentry->link->lgr, sizeof(*rsp));
+	memset(&rsp->raw.data, 0, sizeof(rsp->raw.data));
+	smc_llc_send_message(qentry->link, rsp);
+}
+
+/* As an SMC server, run the add link flow on @link.
+ * Selects a local device, initializes a new link in a free lgr->lnk[]
+ * slot, sends the ADD LINK request, evaluates the response, exchanges
+ * rkeys and runs the CONFIRM LINK handshake.
+ * @req_qentry may be NULL. If it holds a REQUEST ADD LINK message and the
+ * flow ends before the ADD LINK request was sent, a response to that
+ * message is sent instead. @req_qentry is not freed here.
+ * Returns 0 on success or when no further link can be added, otherwise a
+ * negative error code.
+ */
+int smc_llc_srv_add_link(struct smc_link *link,
+			 struct smc_llc_qentry *req_qentry)
+{
+	enum smc_lgr_type new_type = SMC_LGR_SYMMETRIC;
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_qentry *rsp_qentry = NULL;
+	struct smc_llc_msg_add_link *rsp;
+	struct smc_link *new_lnk = NULL;
+	struct smc_init_info *ini = NULL;
+	bool answer_req_add_link;
+	int idx, rc = 0;
+
+	answer_req_add_link = req_qentry &&
+		req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK;
+
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* link group is limited to a single link; rc is still 0 here */
+	if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1)
+		goto out;
+
+	/* ignore client add link recommendation, start new flow */
+	ini->vlan_id = lgr->vlan_id;
+	if (lgr->smc_version == SMC_V2) {
+		ini->check_smcrv2 = true;
+		ini->smcrv2.saddr = lgr->saddr;
+		if (answer_req_add_link)
+			ini->smcrv2.daddr = smc_ib_gid_to_ipv4(
+				req_qentry->msg.req_add_link.gid[0]);
+	}
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+	/* no alternate local device found: reuse the device of @link */
+	if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+		new_type = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->smcrv2.ib_dev_v2 = link->smcibdev;
+		ini->smcrv2.ib_port_v2 = link->ibport;
+	} else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
+		new_type = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->ib_dev = link->smcibdev;
+		ini->ib_port = link->ibport;
+	}
+	idx = smc_llc_alloc_alt_link(lgr, new_type);
+	if (idx < 0)
+		goto out;	/* no free slot is not an error, rc is 0 */
+
+	rc = smcr_link_init(lgr, &lgr->lnk[idx], idx, ini);
+	if (rc)
+		goto out;
+	new_lnk = &lgr->lnk[idx];
+
+	rc = smcr_buf_map_lgr(new_lnk);
+	if (rc)
+		goto out_err;
+
+	rc = smc_llc_send_add_link(link,
+				   new_lnk->smcibdev->mac[new_lnk->ibport - 1],
+				   new_lnk->gid, new_lnk, SMC_LLC_REQ);
+	if (rc)
+		goto out_err;
+	/* the ADD LINK request is out, no REQ_ADD_LINK response anymore */
+	answer_req_add_link = false;
+	/* receive ADD LINK response over the RoCE fabric */
+	rsp_qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME,
+				  SMC_LLC_ADD_LINK);
+	if (!rsp_qentry) {
+		rc = -ETIMEDOUT;
+		goto out_err;
+	}
+	rsp = &rsp_qentry->msg.add_link;
+	if (rsp->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) {
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+		rc = -ENOLINK;
+		goto out_err;
+	}
+	/* peer answers with the gid (and for v1 the mac) used by @link */
+	if (lgr->type == SMC_LGR_SINGLE &&
+	    (!memcmp(rsp->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+	     (lgr->smc_version == SMC_V2 ||
+	      !memcmp(rsp->sender_mac, link->peer_mac, ETH_ALEN)))) {
+		new_type = SMC_LGR_ASYMMETRIC_PEER;
+	}
+	smc_llc_save_add_link_info(new_lnk, rsp);
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+	rc = smc_ib_ready_link(new_lnk);
+	if (rc)
+		goto out_err;
+	rc = smcr_buf_reg_lgr(new_lnk);
+	if (rc)
+		goto out_err;
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_save_add_link_rkeys(link, new_lnk);
+	} else {
+		rc = smc_llc_srv_rkey_exchange(link, new_lnk);
+		if (rc)
+			goto out_err;
+	}
+	rc = smc_llc_srv_conf_link(link, new_lnk, new_type);
+	if (rc)
+		goto out_err;
+	kfree(ini);
+	return 0;
+out_err:
+	if (new_lnk) {
+		new_lnk->state = SMC_LNK_INACTIVE;
+		smcr_link_clear(new_lnk, false);
+	}
+out:
+	kfree(ini);
+	if (answer_req_add_link)
+		smc_llc_send_req_add_link_response(req_qentry);
+	return rc;
+}
+
+/* Server side add link processing: take the queued entry from the local
+ * flow and run the add link flow on its link under lgr->llc_conf_mutex.
+ * If the link group became symmetric, an asymmetric link is deleted.
+ * The queue entry is freed here.
+ */
+static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
+{
+	struct smc_link *lnk = lgr->llc_flow_lcl.qentry->link;
+	struct smc_llc_qentry *entry;
+
+	entry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+
+	down_write(&lgr->llc_conf_mutex);
+	if (!smc_llc_srv_add_link(lnk, entry) &&
+	    lgr->type == SMC_LGR_SYMMETRIC) {
+		/* delete any asymmetric link */
+		smc_llc_delete_asym_link(lgr);
+	}
+	up_write(&lgr->llc_conf_mutex);
+	kfree(entry);
+}
+
+/* Enqueue a local add_link request on @link to trigger a new add_link
+ * flow. Only the message header is set up, the rest of the message stays
+ * zero.
+ */
+void smc_llc_add_link_local(struct smc_link *link)
+{
+	struct smc_llc_msg_add_link local_req = {};
+
+	local_req.hd.common.llc_type = SMC_LLC_ADD_LINK;
+	smc_llc_init_msg_hdr(&local_req.hd, link->lgr, sizeof(local_req));
+	/* no dev and port needed */
+	smc_llc_enqueue(link, (union smc_llc_msg *)&local_req);
+}
+
+/* Worker to process an add link message.
+ * If the link group is no longer on a list (it is terminating) the queued
+ * entry is just deleted; otherwise the client or server processing runs
+ * depending on the link group role. The local flow is stopped afterwards
+ * unless its type is SMC_LLC_FLOW_REQ_ADD_LINK.
+ */
+static void smc_llc_add_link_work(struct work_struct *work)
+{
+	struct smc_link_group *lgr;
+
+	lgr = container_of(work, struct smc_link_group, llc_add_link_work);
+
+	if (list_empty(&lgr->list)) {
+		/* link group is terminating */
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	} else if (lgr->role == SMC_CLNT) {
+		smc_llc_process_cli_add_link(lgr);
+	} else {
+		smc_llc_process_srv_add_link(lgr);
+	}
+
+	if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK)
+		smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+}
+
+/* Enqueue a local del_link message for link id @del_link_id on @link to
+ * trigger a new del_link flow; the message is marked orderly and carries
+ * the reason SMC_LLC_DEL_LOST_PATH.
+ * Called only for role SMC_SERV.
+ */
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
+{
+	struct smc_llc_msg_del_link local_req = {};
+
+	local_req.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+	smc_llc_init_msg_hdr(&local_req.hd, link->lgr, sizeof(local_req));
+	local_req.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+	local_req.link_num = del_link_id;
+	local_req.reason = htonl(SMC_LLC_DEL_LOST_PATH);
+	smc_llc_enqueue(link, (union smc_llc_msg *)&local_req);
+}
+
+/* Client side processing of a DELETE LINK message.
+ * A request for all links schedules termination of the link group.
+ * Otherwise the link with the given id is looked up, the message is sent
+ * back as response (reason SMC_LLC_DEL_NOLNK if the link is unknown), the
+ * connections are switched away and the link is cleared. The link group
+ * type is adjusted unless the deleted link was the asymmetric one.
+ * The queue entry is freed here.
+ */
+static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
+{
+	struct smc_link *victim = NULL, *asym, *rx_lnk;
+	struct smc_llc_msg_del_link *del;
+	struct smc_llc_qentry *entry;
+	int remaining;
+	int idx;
+
+	entry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+	rx_lnk = entry->link;
+	del = &entry->msg.delete_link;
+
+	if (del->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+		smc_lgr_terminate_sched(lgr);
+		goto out;
+	}
+	down_write(&lgr->llc_conf_mutex);
+	/* delete single link */
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		if (lgr->lnk[idx].link_id == del->link_num) {
+			victim = &lgr->lnk[idx];
+			break;
+		}
+	}
+	del->hd.flags |= SMC_LLC_FLAG_RESP;
+	if (!victim) {
+		/* link was not found */
+		del->reason = htonl(SMC_LLC_DEL_NOLNK);
+		smc_llc_send_message(rx_lnk, &entry->msg);
+		goto out_unlock;
+	}
+	/* determined before the link is cleared */
+	asym = smc_llc_find_asym_link(lgr);
+
+	del->reason = 0;
+	smc_llc_send_message(rx_lnk, &entry->msg); /* response */
+
+	if (smc_link_downing(&victim->state))
+		smc_switch_conns(lgr, victim, false);
+	smcr_link_clear(victim, true);
+
+	remaining = smc_llc_active_link_count(lgr);
+	/* expected deletion of asym link doesn't change lgr state */
+	if (victim != asym) {
+		if (remaining == 1) {
+			smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+		} else if (!remaining) {
+			smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+			smc_lgr_terminate_sched(lgr);
+		}
+	}
+out_unlock:
+	up_write(&lgr->llc_conf_mutex);
+out:
+	kfree(entry);
+}
+
+/* Try to send a DELETE LINK ALL request on any sendable link, waiting for
+ * send completion. The links are tried in index order until one send
+ * succeeds. @ord sets the orderly flag, @rsn is the reason code.
+ */
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
+{
+	struct smc_llc_msg_del_link del_all = {};
+	struct smc_link *lnk;
+	int idx;
+
+	del_all.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+	smc_llc_init_msg_hdr(&del_all.hd, lgr, sizeof(del_all));
+	if (ord)
+		del_all.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+	del_all.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+	del_all.reason = htonl(rsn);
+
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		lnk = &lgr->lnk[idx];
+		/* stop after the first successful send */
+		if (smc_link_sendable(lnk) &&
+		    !smc_llc_send_message_wait(lnk, &del_all))
+			break;
+	}
+}
+
+/* Server side processing of a DELETE LINK message, under
+ * lgr->llc_conf_mutex.
+ * A request for all links is forwarded as DELETE LINK ALL and termination
+ * of the link group is scheduled. Otherwise the named link is taken down,
+ * the message is sent on the link it was queued for and the peer's
+ * DELETE LINK message is awaited, then the link is cleared and the link
+ * group type adjusted. If a single link remains, a local add link request
+ * is enqueued. The queue entry is freed here.
+ */
+static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
+{
+	struct smc_link *rx_lnk, *victim = NULL;
+	struct smc_llc_msg_del_link *del;
+	struct smc_llc_qentry *entry;
+	int remaining;
+	int idx;
+
+	down_write(&lgr->llc_conf_mutex);
+	entry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+	rx_lnk = entry->link;
+	del = &entry->msg.delete_link;
+
+	if (del->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+		/* delete entire lgr */
+		smc_llc_send_link_delete_all(lgr, true, ntohl(del->reason));
+		smc_lgr_terminate_sched(lgr);
+		goto out;
+	}
+	/* delete single link */
+	for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
+		if (lgr->lnk[idx].link_id != del->link_num)
+			continue;
+		victim = &lgr->lnk[idx];
+		break;
+	}
+	if (!victim)
+		goto out; /* asymmetric link already deleted */
+
+	if (smc_link_downing(&victim->state) &&
+	    smc_switch_conns(lgr, victim, false))
+		smc_wr_tx_wait_no_pending_sends(victim);
+
+	/* qentry is either a request from peer (send it back to
+	 * initiate the DELETE_LINK processing), or a locally
+	 * enqueued DELETE_LINK request (forward it)
+	 */
+	if (!list_empty(&lgr->list) &&
+	    !smc_llc_send_message(rx_lnk, &entry->msg) &&
+	    smc_llc_wait(lgr, rx_lnk, SMC_LLC_WAIT_TIME, SMC_LLC_DELETE_LINK))
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+	smcr_link_clear(victim, true);
+
+	remaining = smc_llc_active_link_count(lgr);
+	if (remaining == 1) {
+		smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+	} else if (!remaining) {
+		smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+		smc_lgr_terminate_sched(lgr);
+	}
+
+	if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) {
+		/* trigger setup of asymm alt link */
+		smc_llc_add_link_local(rx_lnk);
+	}
+out:
+	up_write(&lgr->llc_conf_mutex);
+	kfree(entry);
+}
+
+/* Worker behind lgr->llc_del_link_work.
+ * If the link group is no longer on a list (it is terminating), the qentry
+ * of the local flow is simply deleted; otherwise the request is processed
+ * by the client or the server variant, depending on lgr->role.
+ * The local flow is stopped in every case.
+ */
+static void smc_llc_delete_link_work(struct work_struct *work)
+{
+	struct smc_link_group *lgr;
+
+	lgr = container_of(work, struct smc_link_group, llc_del_link_work);
+
+	if (list_empty(&lgr->list))
+		smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	else if (lgr->role == SMC_CLNT)
+		smc_llc_process_cli_delete_link(lgr);
+	else
+		smc_llc_process_srv_delete_link(lgr);
+
+	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+}
+
+/* process a confirm_rkey request from peer, remote flow
+ *
+ * The request stored in lgr->llc_flow_rmt.qentry is evaluated, then the
+ * same message buffer is turned into the response, sent back on the link
+ * it arrived on, and the qentry is deleted from the remote flow.
+ */
+static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_confirm_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
+ int num_entries;
+ int rk_idx;
+ int i;
+
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.confirm_rkey;
+ link = qentry->link;
+
+ /* reject requests announcing more than SMC_LLC_RKEYS_PER_MSG rkeys */
+ num_entries = llc->rtoken[0].num_rkeys;
+ if (num_entries > SMC_LLC_RKEYS_PER_MSG)
+ goto out_err;
+ /* first rkey entry is for receiving link */
+ rk_idx = smc_rtoken_add(link,
+ llc->rtoken[0].rmb_vaddr,
+ llc->rtoken[0].rmb_key);
+ if (rk_idx < 0)
+ goto out_err;
+
+ /* remaining entries name their link by link_id; the index never
+ * exceeds SMC_LLC_RKEYS_PER_MSG - 1
+ */
+ for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++)
+ smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id,
+ llc->rtoken[i].rmb_vaddr,
+ llc->rtoken[i].rmb_key);
+ /* max links is 3 so there is no need to support conf_rkey_cont msgs */
+ goto out;
+out_err:
+ /* failure: the response carries both the NEG and the RETRY flag */
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY;
+out:
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
+}
+
+/* process a delete_rkey request from peer, remote flow
+ *
+ * Every rkey listed in the request is passed to smc_rtoken_delete(); a
+ * non-zero return marks that rkey as failed. The request buffer is reused
+ * as the response, which is sent back on the receiving link before the
+ * qentry is deleted from the remote flow.
+ */
+static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_delete_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
+ u8 err_mask = 0;
+ int i, max;
+
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.delete_rkey;
+ link = qentry->link;
+
+ if (lgr->smc_version == SMC_V2) {
+ struct smc_llc_msg_delete_rkey_v2 *llcv2;
+
+ /* copy the message into lgr->wr_rx_buf_v2 and read it through
+ * the V2 layout from there
+ */
+ memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
+ llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
+ llcv2->num_inval_rkeys = 0;
+
+ max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
+ for (i = 0; i < max; i++) {
+ if (smc_rtoken_delete(link, llcv2->rkey[i]))
+ llcv2->num_inval_rkeys++;
+ }
+ /* the response is built in the original message: clear the
+ * rkey and reserved2 fields and re-initialize the header
+ */
+ memset(&llc->rkey[0], 0, sizeof(llc->rkey));
+ memset(&llc->reserved2, 0, sizeof(llc->reserved2));
+ smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
+ if (llcv2->num_inval_rkeys) {
+ /* here err_mask carries the number of failed rkeys */
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->err_mask = llcv2->num_inval_rkeys;
+ }
+ goto finish;
+ }
+
+ max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
+ for (i = 0; i < max; i++) {
+ /* failed entry i sets bit (SMC_LLC_DEL_RKEY_MAX - 1 - i) */
+ if (smc_rtoken_delete(link, llc->rkey[i]))
+ err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
+ }
+ if (err_mask) {
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->err_mask = err_mask;
+ }
+finish:
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
+}
+
+/* React to an LLC message of unexpected @type: emit a rate-limited warning
+ * naming the link group id, the net cookie and the type, record
+ * SMC_LLC_DEL_PROT_VIOL as termination reason (kept only if no reason was
+ * recorded before) and schedule termination of the link group.
+ */
+static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type)
+{
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: "
+ "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id,
+ lgr->net->net_cookie, type);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL);
+ smc_lgr_terminate_sched(lgr);
+}
+
+/* Drop everything that is still queued on the llc event queue:
+ * each entry is unlinked and freed while llc_event_q_lock is held.
+ */
+static void smc_llc_event_flush(struct smc_link_group *lgr)
+{
+	struct smc_llc_qentry *entry;
+
+	spin_lock_bh(&lgr->llc_event_q_lock);
+	while (!list_empty(&lgr->llc_event_q)) {
+		entry = list_first_entry(&lgr->llc_event_q,
+					 struct smc_llc_qentry, list);
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+	spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+/* Dispatch one LLC message that was taken from the event queue.
+ *
+ * Ownership of @qentry: paths ending in "return" do not free it here - it
+ * was passed on to smc_llc_flow_start() or smc_llc_flow_qentry_set(), or
+ * consumed by one of the remote-flow helpers. Paths ending in "break" or
+ * "goto out" free it at the end of the function.
+ */
+static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
+{
+ union smc_llc_msg *llc = &qentry->msg;
+ struct smc_link *link = qentry->link;
+ struct smc_link_group *lgr = link->lgr;
+
+ /* drop messages for links that are not usable */
+ if (!smc_link_usable(link))
+ goto out;
+
+ switch (llc->raw.hdr.common.llc_type) {
+ case SMC_LLC_TEST_LINK:
+ /* send the request back with the response flag set */
+ llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, llc);
+ break;
+ case SMC_LLC_ADD_LINK:
+ if (list_empty(&lgr->list))
+ goto out; /* lgr is terminating */
+ if (lgr->role == SMC_CLNT) {
+ if (smc_llc_is_local_add_link(llc)) {
+ if (lgr->llc_flow_lcl.type ==
+ SMC_LLC_FLOW_ADD_LINK)
+ break; /* add_link in progress */
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl,
+ qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ }
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+ qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ return;
+ }
+ if (lgr->llc_flow_lcl.type ==
+ SMC_LLC_FLOW_REQ_ADD_LINK) {
+ /* server started add_link processing */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+ qentry);
+ schedule_work(&lgr->llc_add_link_work);
+ return;
+ }
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ /* as smc server, handle client suggestion */
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ case SMC_LLC_CONFIRM_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ return;
+ }
+ break;
+ case SMC_LLC_DELETE_LINK:
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* DEL LINK REQ during ADD LINK SEQ */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ schedule_work(&lgr->llc_del_link_work);
+ }
+ return;
+ case SMC_LLC_CONFIRM_RKEY:
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_conf_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
+ case SMC_LLC_CONFIRM_RKEY_CONT:
+ /* not used because max links is 3, and 3 rkeys fit into
+ * one CONFIRM_RKEY message
+ */
+ break;
+ case SMC_LLC_DELETE_RKEY:
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_delete_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
+ case SMC_LLC_REQ_ADD_LINK:
+ /* handle response here, smc_llc_flow_stop() cannot be called
+ * in tasklet context
+ */
+ if (lgr->role == SMC_CLNT &&
+ lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK &&
+ (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) {
+ smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl);
+ } else if (lgr->role == SMC_SERV) {
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ /* as smc server, handle client suggestion */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ }
+ break;
+ default:
+ /* unknown type: warn and schedule termination of the lgr */
+ smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type);
+ break;
+ }
+out:
+ kfree(qentry);
+}
+
+/* worker to process llc messages on the event queue
+ *
+ * A message parked in lgr->delayed_event is handled first, then the event
+ * queue is drained entry by entry.
+ */
+static void smc_llc_event_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_event_work);
+ struct smc_llc_qentry *qentry;
+
+ /* local flow type is zero (presumably SMC_LLC_FLOW_NONE - TODO
+ * confirm) and an event was parked: handle it now, or free it if
+ * its link is no longer usable
+ */
+ if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
+ qentry = lgr->delayed_event;
+ lgr->delayed_event = NULL;
+ if (smc_link_usable(qentry->link))
+ smc_llc_event_handler(qentry);
+ else
+ kfree(qentry);
+ }
+
+again:
+ /* unlink one entry under the lock, release the lock, then call the
+ * handler and look at the queue again
+ */
+ spin_lock_bh(&lgr->llc_event_q_lock);
+ if (!list_empty(&lgr->llc_event_q)) {
+ qentry = list_first_entry(&lgr->llc_event_q,
+ struct smc_llc_qentry, list);
+ list_del_init(&qentry->list);
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+ smc_llc_event_handler(qentry);
+ goto again;
+ }
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+/* process llc responses in tasklet context
+ *
+ * A response is handed to the local flow (and llc_msg_waiter is woken)
+ * only when the local flow type matches the message type and the flow
+ * holds no qentry yet. In all other cases the qentry is freed here.
+ */
+static void smc_llc_rx_response(struct smc_link *link,
+ struct smc_llc_qentry *qentry)
+{
+ enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
+ struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
+ u8 llc_type = qentry->msg.raw.hdr.common.llc_type;
+
+ switch (llc_type) {
+ case SMC_LLC_TEST_LINK:
+ /* signal the completion smc_llc_testlink_work() waits on */
+ if (smc_link_active(link))
+ complete(&link->llc_testlink_resp);
+ break;
+ case SMC_LLC_ADD_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ case SMC_LLC_CONFIRM_LINK:
+ if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_DELETE_LINK:
+ if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY:
+ case SMC_LLC_DELETE_RKEY:
+ if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY_CONT:
+ /* not used because max links is 3 */
+ break;
+ default:
+ /* unknown type: warn and schedule termination of the lgr */
+ smc_llc_protocol_violation(link->lgr,
+ qentry->msg.raw.hdr.common.type);
+ break;
+ }
+ /* every "break" above ends here: the response is dropped */
+ kfree(qentry);
+ return;
+assign:
+ /* assign responses to the local flow, we requested them */
+ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
+ wake_up(&link->lgr->llc_msg_waiter);
+}
+
+/* Copy a received LLC message into a freshly allocated queue entry.
+ * Responses (except REQ_ADD_LINK responses) are handed to
+ * smc_llc_rx_response() right away; everything else is appended to the
+ * event queue of the link group and llc_event_work is queued.
+ * The message is silently dropped when the atomic allocation fails.
+ */
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_llc_qentry *entry;
+	unsigned long irq_flags;
+	bool is_resp;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return;
+	entry->link = link;
+	INIT_LIST_HEAD(&entry->list);
+	memcpy(&entry->msg, llc, sizeof(union smc_llc_msg));
+
+	is_resp = llc->raw.hdr.flags & SMC_LLC_FLAG_RESP;
+	if (is_resp && llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) {
+		/* responses are processed immediately */
+		smc_llc_rx_response(link, entry);
+		return;
+	}
+
+	/* requests go to the event queue and are handled by the worker */
+	spin_lock_irqsave(&lgr->llc_event_q_lock, irq_flags);
+	list_add_tail(&entry->list, &lgr->llc_event_q);
+	spin_unlock_irqrestore(&lgr->llc_event_q_lock, irq_flags);
+	queue_work(system_highpri_wq, &lgr->llc_event_work);
+}
+
+/* Receive handler registered for all LLC message types.
+ * Messages shorter than union smc_llc_msg are ignored. With llc_version 0
+ * the header length must equal the union size, otherwise length_v2 must be
+ * at least that large. Valid messages are passed to smc_llc_enqueue().
+ */
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	union smc_llc_msg *llc = buf;
+	bool len_ok;
+
+	if (wc->byte_len < sizeof(*llc))
+		return; /* short message */
+
+	if (llc->raw.hdr.common.llc_version)
+		len_ok = llc->raw.hdr.length_v2 >= sizeof(*llc);
+	else
+		len_ok = llc->raw.hdr.length == sizeof(*llc);
+	if (!len_ok)
+		return; /* invalid message */
+
+	smc_llc_enqueue(link, llc);
+}
+
+/***************************** worker, utils *********************************/
+
+/* Delayed work behind link->llc_testlink_wrk.
+ * While link->wr_rx_tstamp + link->llc_testlink_time lies in the future,
+ * the work only re-arms itself for the remaining time. Otherwise a TEST
+ * LINK message is sent and the response is awaited for at most
+ * SMC_LLC_WAIT_TIME; without a response smcr_link_down_cond_sched() is
+ * called and the work is not re-armed.
+ */
+static void smc_llc_testlink_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(to_delayed_work(work),
+ struct smc_link, llc_testlink_wrk);
+ unsigned long next_interval;
+ unsigned long expire_time;
+ u8 user_data[16] = { 0 };
+ int rc;
+
+ if (!smc_link_active(link))
+ return; /* don't reschedule worker */
+ expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
+ if (time_is_after_jiffies(expire_time)) {
+ /* not expired yet, come back when it would be */
+ next_interval = expire_time - jiffies;
+ goto out;
+ }
+ /* reset the completion before sending, it is signalled by
+ * smc_llc_rx_response() and smc_llc_link_clear()
+ */
+ reinit_completion(&link->llc_testlink_resp);
+ smc_llc_send_test_link(link, user_data);
+ /* receive TEST LINK response over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
+ SMC_LLC_WAIT_TIME);
+ if (!smc_link_active(link))
+ return; /* link state changed */
+ /* rc <= 0: timed out or interrupted */
+ if (rc <= 0) {
+ smcr_link_down_cond_sched(link);
+ return;
+ }
+ next_interval = link->llc_testlink_time;
+out:
+ schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
+}
+
+/* Initialize the LLC related members of a link group: the three work
+ * items, the event queue with its lock, the flow lock, both wait queues
+ * and the configuration rwsem. The testlink time is read once from the
+ * sysctl value of the net namespace of the CLC socket of @smc.
+ */
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
+{
+	struct net *net = sock_net(smc->clcsock->sk);
+
+	/* event queue and its lock */
+	INIT_LIST_HEAD(&lgr->llc_event_q);
+	spin_lock_init(&lgr->llc_event_q_lock);
+
+	/* workers */
+	INIT_WORK(&lgr->llc_event_work, smc_llc_event_work);
+	INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work);
+	INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work);
+
+	/* flow control and configuration serialization */
+	spin_lock_init(&lgr->llc_flow_lock);
+	init_waitqueue_head(&lgr->llc_flow_waiter);
+	init_waitqueue_head(&lgr->llc_msg_waiter);
+	init_rwsem(&lgr->llc_conf_mutex);
+
+	lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time);
+}
+
+/* LLC cleanup of a link group; called after lgr was removed from lgr_list.
+ * Queued events are flushed, all waiters are woken, the three workers are
+ * cancelled synchronously and a parked delayed event is freed.
+ */
+void smc_llc_lgr_clear(struct smc_link_group *lgr)
+{
+	smc_llc_event_flush(lgr);
+
+	wake_up_all(&lgr->llc_flow_waiter);
+	wake_up_all(&lgr->llc_msg_waiter);
+
+	cancel_work_sync(&lgr->llc_event_work);
+	cancel_work_sync(&lgr->llc_add_link_work);
+	cancel_work_sync(&lgr->llc_del_link_work);
+
+	/* kfree() accepts NULL, so no separate check is needed */
+	kfree(lgr->delayed_event);
+	lgr->delayed_event = NULL;
+}
+
+/* Prepare the testlink machinery of a link: the delayed work and the
+ * completion it waits on. Always returns 0.
+ */
+int smc_llc_link_init(struct smc_link *link)
+{
+	INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+	init_completion(&link->llc_testlink_resp);
+
+	return 0;
+}
+
+/* Mark a link as active: log the new link, set its state to
+ * SMC_LNK_ACTIVE and, if the link group has a non-zero testlink time,
+ * copy that time to the link and schedule the testlink worker.
+ */
+void smc_llc_link_active(struct smc_link *link)
+{
+	struct smc_link_group *lgr = link->lgr;
+
+	pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, "
+			    "peerid %*phN, ibdev %s, ibport %d\n",
+			    SMC_LGR_ID_SIZE, &lgr->id,
+			    lgr->net->net_cookie,
+			    SMC_LGR_ID_SIZE, &link->link_uid,
+			    SMC_LGR_ID_SIZE, &link->peer_link_uid,
+			    link->smcibdev->ibdev->name, link->ibport);
+	link->state = SMC_LNK_ACTIVE;
+
+	if (!lgr->llc_testlink_time)
+		return;	/* testlink disabled for this link group */
+
+	link->llc_testlink_time = lgr->llc_testlink_time;
+	schedule_delayed_work(&link->llc_testlink_wrk,
+			      link->llc_testlink_time);
+}
+
+/* called in worker context
+ *
+ * Stop the testlink worker of a link; with @log set, the removal of the
+ * link is logged first.
+ */
+void smc_llc_link_clear(struct smc_link *link, bool log)
+{
+ if (log)
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN"
+ ", peerid %*phN, ibdev %s, ibport %d\n",
+ SMC_LGR_ID_SIZE, &link->lgr->id,
+ link->lgr->net->net_cookie,
+ SMC_LGR_ID_SIZE, &link->link_uid,
+ SMC_LGR_ID_SIZE, &link->peer_link_uid,
+ link->smcibdev->ibdev->name, link->ibport);
+ /* signal the completion smc_llc_testlink_work() may be waiting on,
+ * then cancel the worker and wait for it to finish
+ */
+ complete(&link->llc_testlink_resp);
+ cancel_delayed_work_sync(&link->llc_testlink_wrk);
+}
+
+/* Register a new rtoken at the remote peer (for all links).
+ * Sends a CONFIRM RKEY request on @send_link and waits up to
+ * SMC_LLC_WAIT_TIME for the response.
+ * Returns the send error if sending failed, -EFAULT if no response
+ * arrived or the response carries SMC_LLC_FLAG_RKEY_NEG, 0 otherwise.
+ */
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
+			    struct smc_buf_desc *rmb_desc)
+{
+	struct smc_link_group *lgr = send_link->lgr;
+	struct smc_llc_qentry *resp;
+	int rc;
+
+	rc = smc_llc_send_confirm_rkey(send_link, rmb_desc);
+	if (rc)
+		return rc;
+
+	/* receive CONFIRM RKEY response from server over RoCE fabric */
+	resp = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+			    SMC_LLC_CONFIRM_RKEY);
+	if (!resp)
+		return -EFAULT;
+
+	if (resp->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)
+		rc = -EFAULT;
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	return rc;
+}
+
+/* Unregister an rtoken at the remote peer.
+ * Picks a usable link of @lgr, sends a DELETE RKEY request on it and waits
+ * up to SMC_LLC_WAIT_TIME for the response.
+ * Returns -ENOLINK without a usable link, the send error if sending
+ * failed, -EFAULT if no response arrived or the response carries
+ * SMC_LLC_FLAG_RKEY_NEG, 0 otherwise.
+ */
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
+			   struct smc_buf_desc *rmb_desc)
+{
+	struct smc_llc_qentry *resp;
+	struct smc_link *send_link;
+	int rc;
+
+	send_link = smc_llc_usable_link(lgr);
+	if (!send_link)
+		return -ENOLINK;
+
+	/* protected by llc_flow control */
+	rc = smc_llc_send_delete_rkey(send_link, rmb_desc);
+	if (rc)
+		return rc;
+
+	/* receive DELETE RKEY response from server over RoCE fabric */
+	resp = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+			    SMC_LLC_DELETE_RKEY);
+	if (!resp)
+		return -EFAULT;
+
+	if (resp->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)
+		rc = -EFAULT;
+	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	return rc;
+}
+
+/* Derive the link user id: the first 32 bits of the link group id plus
+ * the link id, stored in network byte order in link->link_uid.
+ */
+void smc_llc_link_set_uid(struct smc_link *link)
+{
+	u32 lgr_id = *((u32 *)link->lgr->id);
+	__be32 uid_be;
+
+	uid_be = htonl(lgr_id + link->link_id);
+	memcpy(link->link_uid, &uid_be, SMC_LGR_ID_SIZE);
+}
+
+/* Remember the peer's link user id from a confirm link message in the
+ * link the message belongs to; used for debug purposes.
+ */
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry)
+{
+	struct smc_link *lnk = qentry->link;
+
+	memcpy(lnk->peer_link_uid, qentry->msg.confirm_link.link_uid,
+	       SMC_LGR_ID_SIZE);
+}
+
+/* Evaluate a confirm link request or response.
+ * For a request the link id chosen by the SMC server is taken over from
+ * the message and the link user id is recomputed.
+ * Returns 0 if the message has SMC_LLC_FLAG_NO_RMBE_EYEC set,
+ * -ENOTSUPP otherwise.
+ */
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+			   enum smc_llc_reqresp type)
+{
+	if (type == SMC_LLC_REQ) {
+		struct smc_link *lnk = qentry->link;
+
+		/* SMC server assigns link_id */
+		lnk->link_id = qentry->msg.confirm_link.link_num;
+		smc_llc_link_set_uid(lnk);
+	}
+
+	return (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) ?
+		0 : -ENOTSUPP;
+}
+
+/***************************** init, exit, misc ******************************/
+
+/* All LLC message types are received by smc_llc_rx_handler(); the entry
+ * with a NULL handler terminates the table for smc_llc_init().
+ */
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_LINK },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_TEST_LINK },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_ADD_LINK },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_ADD_LINK_CONT },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_RKEY },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_RKEY_CONT },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY },
+	/* V2 types */
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_LINK_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_TEST_LINK_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_ADD_LINK_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_REQ_ADD_LINK_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_RKEY_V2 },
+	{ .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY_V2 },
+	{ .handler = NULL, }
+};
+
+/* Register every entry of smc_llc_rx_handlers up to the NULL-handler
+ * terminator. Stops at the first failing registration and returns its
+ * error code; returns 0 when all registrations succeeded.
+ */
+int __init smc_llc_init(void)
+{
+	struct smc_wr_rx_handler *entry = smc_llc_rx_handlers;
+	int rc;
+
+	while (entry->handler) {
+		INIT_HLIST_NODE(&entry->list);
+		rc = smc_wr_rx_register_handler(entry);
+		if (rc)
+			return rc;
+		entry++;
+	}
+	return 0;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000..7e7a3162c
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for LLC (link layer control) message handling
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
+#define SMC_LLC_WAIT_TIME (2 * HZ)
+#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ)
+
+/* tells whether an LLC message is handled as request or as response */
+enum smc_llc_reqresp {
+ SMC_LLC_REQ,
+ SMC_LLC_RESP
+};
+
+/* LLC message type codes; each V2 code is the corresponding V1 code
+ * plus 0x20
+ */
+enum smc_llc_msg_type {
+ SMC_LLC_CONFIRM_LINK = 0x01,
+ SMC_LLC_ADD_LINK = 0x02,
+ SMC_LLC_ADD_LINK_CONT = 0x03,
+ SMC_LLC_DELETE_LINK = 0x04,
+ SMC_LLC_REQ_ADD_LINK = 0x05,
+ SMC_LLC_CONFIRM_RKEY = 0x06,
+ SMC_LLC_TEST_LINK = 0x07,
+ SMC_LLC_CONFIRM_RKEY_CONT = 0x08,
+ SMC_LLC_DELETE_RKEY = 0x09,
+ /* V2 types */
+ SMC_LLC_CONFIRM_LINK_V2 = 0x21,
+ SMC_LLC_ADD_LINK_V2 = 0x22,
+ SMC_LLC_DELETE_LINK_V2 = 0x24,
+ SMC_LLC_REQ_ADD_LINK_V2 = 0x25,
+ SMC_LLC_CONFIRM_RKEY_V2 = 0x26,
+ SMC_LLC_TEST_LINK_V2 = 0x27,
+ SMC_LLC_DELETE_RKEY_V2 = 0x29,
+};
+
+/* Atomically move a link state from SMC_LNK_ACTIVE to SMC_LNK_INACTIVE.
+ * Evaluates to true only for the caller whose cmpxchg() performed the
+ * transition, i.e. found the state at SMC_LNK_ACTIVE.
+ */
+#define smc_link_downing(state) \
+ (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE)
+
+/* LLC DELETE LINK Request Reason Codes */
+#define SMC_LLC_DEL_LOST_PATH 0x00010000
+#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000
+#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000
+#define SMC_LLC_DEL_PROT_VIOL 0x00040000
+#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000
+/* LLC DELETE LINK Response Reason Codes */
+#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */
+#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */
+
+/* Walk the links of the link group in array order and return the first
+ * one for which smc_link_usable() holds, or NULL if there is none.
+ */
+static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr)
+{
+	struct smc_link *lnk = lgr->lnk;
+	struct smc_link *end = lnk + SMC_LINKS_PER_LGR_MAX;
+
+	for (; lnk < end; lnk++) {
+		if (smc_link_usable(lnk))
+			return lnk;
+	}
+	return NULL;
+}
+
+/* Record the termination reason code of the link group; a reason that
+ * was recorded earlier (non-zero) is never overwritten.
+ */
+static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr,
+					       u32 rsn)
+{
+	if (lgr->llc_termination_rsn)
+		return;
+	lgr->llc_termination_rsn = rsn;
+}
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+ struct smc_link *link_new,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+ enum smc_llc_reqresp reqresp, bool orderly,
+ u32 reason);
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id);
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc);
+void smc_llc_lgr_clear(struct smc_link_group *lgr);
+int smc_llc_link_init(struct smc_link *link);
+void smc_llc_link_active(struct smc_link *link);
+void smc_llc_link_clear(struct smc_link *link, bool log);
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
+ struct smc_buf_desc *rmb_desc);
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
+ struct smc_buf_desc *rmb_desc);
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+ enum smc_llc_flowtype type);
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow);
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+ enum smc_llc_reqresp type);
+void smc_llc_link_set_uid(struct smc_link *link);
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry);
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+ struct smc_link *lnk,
+ int time_out, u8 exp_msg);
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow);
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow);
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
+ u32 rsn);
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
+int smc_llc_srv_add_link(struct smc_link *link,
+ struct smc_llc_qentry *req_qentry);
+void smc_llc_add_link_local(struct smc_link *link);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
new file mode 100644
index 000000000..621c46c70
--- /dev/null
+++ b/net/smc/smc_netlink.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to interact with SMC module
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/if.h>
+#include <linux/smc.h>
+
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_ib.h"
+#include "smc_clc.h"
+#include "smc_stats.h"
+#include "smc_netlink.h"
+
+/* Attribute policy of the UEID add/remove commands (referenced by
+ * smc_gen_nl_ops): the entry attribute is a string of at most
+ * SMC_MAX_EID_LEN bytes.
+ */
+const struct nla_policy
+smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = {
+ [SMC_NLA_EID_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
+ [SMC_NLA_EID_TABLE_ENTRY] = { .type = NLA_STRING,
+ .len = SMC_MAX_EID_LEN,
+ },
+};
+
+#define SMC_CMD_MAX_ATTR 1
+/* SMC_GENL generic netlink operation definition
+ *
+ * Query commands are implemented as .dumpit callbacks and carry no flags.
+ * Commands that change settings are .doit callbacks flagged with
+ * GENL_ADMIN_PERM; only the UEID add/remove commands carry their own
+ * attribute policy.
+ */
+static const struct genl_ops smc_gen_nl_ops[] = {
+ {
+ .cmd = SMC_NETLINK_GET_SYS_INFO,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_sys_info,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LINK_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_link,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_device,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_device,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_STATS,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_stats,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_FBACK_STATS,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_fback_stats,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_UEID,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_ueid,
+ },
+ {
+ .cmd = SMC_NETLINK_ADD_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_add_ueid,
+ .policy = smc_gen_ueid_policy,
+ },
+ {
+ .cmd = SMC_NETLINK_REMOVE_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_remove_ueid,
+ .policy = smc_gen_ueid_policy,
+ },
+ {
+ .cmd = SMC_NETLINK_FLUSH_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_flush_ueid,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_SEID,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_SEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_SEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_hs_limitation,
+ },
+};
+
+/* family-wide default policy: attribute SMC_CMD_MAX_ATTR is rejected */
+static const struct nla_policy smc_gen_nl_policy[2] = {
+ [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, },
+};
+
+/* SMC_GENL family definition
+ *
+ * Ties the operations in smc_gen_nl_ops and the default policy
+ * smc_gen_nl_policy to the family name and version; registered by
+ * smc_nl_init() and unregistered by smc_nl_exit().
+ */
+struct genl_family smc_gen_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SMC_GENL_FAMILY_NAME,
+ .version = SMC_GENL_FAMILY_VERSION,
+ .maxattr = SMC_CMD_MAX_ATTR,
+ .policy = smc_gen_nl_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_gen_nl_ops,
+ .n_ops = ARRAY_SIZE(smc_gen_nl_ops),
+ /* first command number after the last one defined in smc_gen_nl_ops */
+ .resv_start_op = SMC_NETLINK_DISABLE_HS_LIMITATION + 1,
+};
+
+/* register the SMC generic netlink family; returns the result of
+ * genl_register_family()
+ */
+int __init smc_nl_init(void)
+{
+ return genl_register_family(&smc_gen_nl_family);
+}
+
+/* unregister the SMC generic netlink family again */
+void smc_nl_exit(void)
+{
+ genl_unregister_family(&smc_gen_nl_family);
+}
diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h
new file mode 100644
index 000000000..e8c6c3f0e
--- /dev/null
+++ b/net/smc/smc_netlink.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC Generic netlink operations
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#ifndef _SMC_NETLINK_H
+#define _SMC_NETLINK_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+extern struct genl_family smc_gen_nl_family;
+
+extern const struct nla_policy smc_gen_ueid_policy[];
+
+/* Per-dump context that smc_nl_dmp_ctx() overlays on the ctx area of a
+ * netlink callback; presumably pos[] holds the resume positions of a
+ * multi-part dump - TODO confirm against the dumpit callbacks.
+ */
+struct smc_nl_dmp_ctx {
+ int pos[3];
+};
+
+/* interpret the ctx area of a netlink callback as SMC dump context */
+static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c)
+{
+	struct smc_nl_dmp_ctx *dmp_ctx = (struct smc_nl_dmp_ctx *)c->ctx;
+
+	return dmp_ctx;
+}
+
+int smc_nl_init(void) __init;
+void smc_nl_exit(void);
+
+#endif
diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h
new file mode 100644
index 000000000..0f4f35aa4
--- /dev/null
+++ b/net/smc/smc_netns.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications
+ *
+ * Network namespace definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMC_NETNS_H
+#define SMC_NETNS_H
+
+#include "smc_pnet.h"
+
+extern unsigned int smc_net_id;
+
+/* per-network namespace private data
+ *
+ * Both member types come from smc_pnet.h; presumably the pnet table and
+ * the pnetids of net devices of this namespace - TODO confirm there.
+ */
+struct smc_net {
+ struct smc_pnettable pnettable;
+ struct smc_pnetids_ndev pnetids_ndev;
+};
+#endif
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000..11775401d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to configure an SMC-R PNET table
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include <net/netns/generic.h>
+#include "smc_netns.h"
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+#include "smc_core.h"
+
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev);
+
+/* Attribute policy of the SMC_PNETID netlink family (smc_pnet_nl_family):
+ * three NUL-terminated strings with a maximum length each, plus the IB port
+ * number as u8.
+ */
+static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+ [SMC_PNETID_NAME] = {
+ .type = NLA_NUL_STRING,
+ .len = SMC_MAX_PNETID_LEN
+ },
+ [SMC_PNETID_ETHNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1
+ },
+ [SMC_PNETID_IBNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX - 1
+ },
+ [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+/* Kind of device a pnet table entry refers to; tells which member of the
+ * union in struct smc_pnetentry is in use.
+ */
+enum smc_pnet_nametype {
+ SMC_PNET_ETH = 1, /* entry names a net device (eth_name/ndev) */
+ SMC_PNET_IB = 2, /* entry names an ib or smcd device (ib_name/ib_port) */
+};
+
+/* pnet entry stored in pnet table */
+struct smc_pnetentry {
+ struct list_head list; /* link in smc_pnettable.pnetlist */
+ char pnet_name[SMC_MAX_PNETID_LEN + 1]; /* pnetid of this entry */
+ enum smc_pnet_nametype type; /* selects the valid union member */
+ union {
+ struct { /* type == SMC_PNET_ETH */
+ char eth_name[IFNAMSIZ + 1];
+ struct net_device *ndev; /* bound device or NULL */
+ netdevice_tracker dev_tracker; /* tracks the ndev reference */
+ };
+ struct { /* type == SMC_PNET_IB */
+ char ib_name[IB_DEVICE_NAME_MAX + 1];
+ u8 ib_port;
+ };
+ };
+};
+
+/* A pnetid is considered set unless its first byte is 0 or equals _S. */
+bool smc_pnet_is_pnetid_set(u8 *pnetid)
+{
+	u8 first = pnetid[0];
+
+	return first != 0 && first != _S;
+}
+
+/* Compare two pnetids byte by byte over at most SMC_MAX_PNETID_LEN bytes.
+ * A byte that is 0 or equals _S ends a pnetid; the ids match when both end
+ * at the same position (or neither ends) and all bytes before are equal.
+ */
+static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
+{
+	int pos = 0;
+
+	while (pos < SMC_MAX_PNETID_LEN) {
+		u8 c1 = pnetid1[pos];
+		u8 c2 = pnetid2[pos];
+		bool end1 = !c1 || c1 == _S;
+		bool end2 = !c2 || c2 == _S;
+
+		if (end1 && end2)
+			return true;
+		if (c1 != c2)
+			return false;
+		pos++;
+	}
+	return true;
+}
+
+/* Remove a pnetid from the pnet table.
+ *
+ * @net: namespace whose pnet table is searched
+ * @pnet_name: pnetid to remove; NULL matches every entry (flush)
+ *
+ * Table entries are always processed. User defined pnetids on ib and smcd
+ * devices are only reset when @net is the initial namespace.
+ *
+ * Returns 0 if at least one table entry or device pnetid was removed,
+ * -ENOENT otherwise.
+ */
+static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct smc_ib_device *ibdev;
+ struct smcd_dev *smcd;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+ int ibport;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ /* remove table entry */
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist,
+ list) {
+ if (!pnet_name ||
+ smc_pnet_match(pnetelem->pnet_name, pnet_name)) {
+ list_del(&pnetelem->list);
+ /* drop the device reference held by a bound eth entry */
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) {
+ netdev_put(pnetelem->ndev,
+ &pnetelem->dev_tracker);
+ pr_warn_ratelimited("smc: net device %s "
+ "erased user defined "
+ "pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ }
+ kfree(pnetelem);
+ rc = 0;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ /* if this is not the initial namespace, stop here */
+ if (net != &init_net)
+ return rc;
+
+ /* remove ib devices */
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) {
+ /* only pnetids flagged as user defined are cleared */
+ if (ibdev->pnetid_by_user[ibport] &&
+ (!pnet_name ||
+ smc_pnet_match(pnet_name,
+ ibdev->pnetid[ibport]))) {
+ pr_warn_ratelimited("smc: ib device %s ibport "
+ "%d erased user defined "
+ "pnetid %.16s\n",
+ ibdev->ibdev->name,
+ ibport + 1,
+ ibdev->pnetid[ibport]);
+ memset(ibdev->pnetid[ibport], 0,
+ SMC_MAX_PNETID_LEN);
+ ibdev->pnetid_by_user[ibport] = false;
+ rc = 0;
+ }
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+ /* remove smcd devices */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->pnetid_by_user &&
+ (!pnet_name ||
+ smc_pnet_match(pnet_name, smcd->pnetid))) {
+ pr_warn_ratelimited("smc: smcd device %s "
+ "erased user defined pnetid "
+ "%.16s\n",
+ dev_name(smcd->ops->get_dev(smcd)),
+ smcd->pnetid);
+ memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN);
+ smcd->pnetid_by_user = false;
+ rc = 0;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ return rc;
+}
+
+/* Add the reference to a given network device to the pnet table.
+ *
+ * Searches the pnet table of the device's namespace for the first
+ * SMC_PNET_ETH entry that carries the device name but has no device bound
+ * yet, takes a tracked reference on @ndev and stores it in that entry.
+ *
+ * Returns 0 if an entry was bound, -ENOENT if no such entry exists.
+ */
+static int smc_pnet_add_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
+ !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
+ /* reference is released again via netdev_put() */
+ netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC);
+ pnetelem->ndev = ndev;
+ rc = 0;
+ pr_warn_ratelimited("smc: adding net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
+
+/* Remove the reference to a given network device from the pnet table.
+ *
+ * The table entry itself is kept; only its device pointer is cleared and
+ * the tracked device reference is dropped.
+ *
+ * Returns 0 if an entry was bound to @ndev, -ENOENT otherwise.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
+ netdev_put(pnetelem->ndev, &pnetelem->dev_tracker);
+ pnetelem->ndev = NULL;
+ rc = 0;
+ pr_warn_ratelimited("smc: removing net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
+
+/* Store @pnet_name as user defined pnetid of port @ib_port (1-based) of
+ * @ib_dev, unless that port already has a pnetid. Runs under
+ * smc_ib_devices.mutex. Returns true if the pnetid was stored.
+ */
+static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
+			      char *pnet_name)
+{
+	int idx = ib_port - 1;
+	bool applied;
+
+	mutex_lock(&smc_ib_devices.mutex);
+	applied = !smc_pnet_is_pnetid_set(ib_dev->pnetid[idx]);
+	if (applied) {
+		memcpy(ib_dev->pnetid[idx], pnet_name, SMC_MAX_PNETID_LEN);
+		ib_dev->pnetid_by_user[idx] = true;
+	}
+	mutex_unlock(&smc_ib_devices.mutex);
+
+	return applied;
+}
+
+/* Store @pnet_name as user defined pnetid of @smcd_dev, unless the device
+ * already has a pnetid. Runs under smcd_dev_list.mutex. Returns true if the
+ * pnetid was stored.
+ */
+static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
+{
+	bool applied;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	applied = !smc_pnet_is_pnetid_set(smcd_dev->pnetid);
+	if (applied) {
+		memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
+		smcd_dev->pnetid_by_user = true;
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	return applied;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ *
+ * @pnet_name: NUL-terminated input; leading and trailing white space is
+ * ignored
+ * @pnetid: output buffer for the converted pnetid plus terminating NUL
+ *
+ * Returns true if @pnet_name is a valid pnetid. Returns false for an empty
+ * name, a name longer than SMC_MAX_PNETID_LEN or a name containing a
+ * non-alphanumeric character; @pnetid may be partially written in the last
+ * case.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+ char *bf = skip_spaces(pnet_name);
+ size_t len = strlen(bf);
+ char *end = bf + len;
+
+ if (!len)
+ return false;
+ /* move end back to the last non-blank character */
+ while (--end >= bf && isspace(*end))
+ ;
+ /* end - bf is the trimmed length minus one */
+ if (end - bf >= SMC_MAX_PNETID_LEN)
+ return false;
+ while (bf <= end) {
+ if (!isalnum(*bf))
+ return false;
+ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+ bf++;
+ }
+ *pnetid = '\0';
+ return true;
+}
+
+/* Look up an infiniband device on smc_ib_devices.list. @ib_name may be the
+ * name of the ib device itself or the name of its parent device. Returns
+ * the first matching device or NULL if there is none.
+ */
+static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+	struct smc_ib_device *cur, *match = NULL;
+
+	mutex_lock(&smc_ib_devices.mutex);
+	list_for_each_entry(cur, &smc_ib_devices.list, list) {
+		if (!strncmp(cur->ibdev->name, ib_name,
+			     sizeof(cur->ibdev->name))) {
+			match = cur;
+			break;
+		}
+		if (!cur->ibdev->dev.parent)
+			continue;
+		if (!strncmp(dev_name(cur->ibdev->dev.parent), ib_name,
+			     IB_DEVICE_NAME_MAX - 1)) {
+			match = cur;
+			break;
+		}
+	}
+	mutex_unlock(&smc_ib_devices.mutex);
+
+	return match;
+}
+
+/* Look up an smcd device on smcd_dev_list.list by the name of the device
+ * returned by its get_dev() operation. Returns the first match or NULL.
+ */
+static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
+{
+	struct smcd_dev *cur, *match = NULL;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(cur, &smcd_dev_list.list, list) {
+		const char *cur_name = dev_name(cur->ops->get_dev(cur));
+
+		if (strncmp(cur_name, smcd_name, IB_DEVICE_NAME_MAX - 1))
+			continue;
+		match = cur;
+		break;
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	return match;
+}
+
+/* Add a net device entry (@eth_name -> @pnet_name) to @pnettable.
+ *
+ * The device does not have to exist: without a device the entry is stored
+ * unbound. If it exists, the reference obtained by dev_get_by_name() is
+ * handed over to the new entry, or dropped on every failure path.
+ *
+ * Returns 0 on success, -EEXIST if smc_pnetid_by_dev_port() already yields a
+ * pnetid for the base device or the table already holds an entry for
+ * @eth_name, -ENOMEM if the entry cannot be allocated.
+ */
+static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
+ char *eth_name, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct net_device *ndev, *base_ndev;
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ bool new_netdev;
+ int rc;
+
+ /* check if (base) netdev already has a pnetid. If there is one, we do
+ * not want to add a pnet table entry
+ */
+ rc = -EEXIST;
+ ndev = dev_get_by_name(net, eth_name); /* dev_hold() */
+ if (ndev) {
+ base_ndev = pnet_find_base_ndev(ndev);
+ if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
+ base_ndev->dev_port, ndev_pnetid))
+ goto out_put;
+ }
+
+ /* add a new netdev entry to the pnet table if there isn't one */
+ rc = -ENOMEM;
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ goto out_put;
+ new_pe->type = SMC_PNET_ETH;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ /* eth_name[] has IFNAMSIZ + 1 zeroed bytes, so it stays terminated */
+ strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
+ rc = -EEXIST;
+ new_netdev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_ETH &&
+ !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
+ new_netdev = false;
+ break;
+ }
+ }
+ if (new_netdev) {
+ if (ndev) {
+ /* the entry takes over the dev_get_by_name() reference */
+ new_pe->ndev = ndev;
+ netdev_tracker_alloc(ndev, &new_pe->dev_tracker,
+ GFP_ATOMIC);
+ }
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ goto out_put;
+ }
+ if (ndev)
+ pr_warn_ratelimited("smc: net device %s "
+ "applied user defined pnetid %.16s\n",
+ new_pe->eth_name, new_pe->pnet_name);
+ return 0;
+
+out_put:
+ /* NOTE(review): ndev can be NULL here; relies on dev_put() accepting
+ * NULL - confirm
+ */
+ dev_put(ndev);
+ return rc;
+}
+
+/* Add an ib/smcd device entry (@ib_name, @ib_port -> @pnet_name) to
+ * @pnettable.
+ *
+ * If an ib device and/or an smcd device named @ib_name currently exists,
+ * the pnetid is applied to it first. A table entry is only added when no
+ * existing device rejected the pnetid.
+ *
+ * Returns 0 on success, -EEXIST if a device already had a pnetid or the
+ * table already holds an entry for @ib_name, -ENOMEM if the entry cannot be
+ * allocated.
+ */
+static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
+ u8 ib_port, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct smc_ib_device *ib_dev;
+ bool smcddev_applied = true;
+ bool ibdev_applied = true;
+ struct smcd_dev *smcd;
+ struct device *dev;
+ bool new_ibdev;
+
+ /* try to apply the pnetid to active devices */
+ ib_dev = smc_pnet_find_ib(ib_name);
+ if (ib_dev) {
+ ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name);
+ if (ibdev_applied)
+ pr_warn_ratelimited("smc: ib device %s ibport %d "
+ "applied user defined pnetid "
+ "%.16s\n", ib_dev->ibdev->name,
+ ib_port,
+ ib_dev->pnetid[ib_port - 1]);
+ }
+ smcd = smc_pnet_find_smcd(ib_name);
+ if (smcd) {
+ smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name);
+ if (smcddev_applied) {
+ dev = smcd->ops->get_dev(smcd);
+ pr_warn_ratelimited("smc: smcd device %s "
+ "applied user defined pnetid "
+ "%.16s\n", dev_name(dev),
+ smcd->pnetid);
+ }
+ }
+ /* Apply fails when a device has a hardware-defined pnetid set, do not
+ * add a pnet table entry in that case.
+ */
+ if (!ibdev_applied || !smcddev_applied)
+ return -EEXIST;
+
+ /* add a new ib entry to the pnet table if there isn't one */
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ return -ENOMEM;
+ new_pe->type = SMC_PNET_IB;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ /* ib_name[] has IB_DEVICE_NAME_MAX + 1 zeroed bytes, stays terminated */
+ strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
+ new_pe->ib_port = ib_port;
+
+ new_ibdev = true;
+ mutex_lock(&pnettable->lock);
+ /* duplicates are detected by device name only, the port is ignored */
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ new_ibdev = false;
+ break;
+ }
+ }
+ if (new_ibdev) {
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ }
+ return (new_ibdev) ? 0 : -EEXIST;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ *
+ * @net: namespace of the request
+ * @tb: parsed SMC_PNETID_* attributes; SMC_PNETID_NAME is mandatory,
+ * SMC_PNETID_ETHNAME and SMC_PNETID_IBNAME (with optional
+ * SMC_PNETID_IBPORT, default 1) select what to add
+ *
+ * ib/smcd entries are only handled in the initial namespace.
+ *
+ * Returns 0 if at least one entry was added, -EEXIST if nothing was added,
+ * -EINVAL for a missing/invalid pnetid or an ib port outside
+ * 1..SMC_MAX_PORTS, or another error passed up from the add helpers.
+ */
+static int smc_pnet_enter(struct net *net, struct nlattr *tb[])
+{
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
+ struct smc_pnettable *pnettable;
+ bool new_netdev = false;
+ bool new_ibdev = false;
+ struct smc_net *sn;
+ u8 ibport = 1;
+ char *string;
+ int rc;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ rc = -EINVAL;
+ if (!tb[SMC_PNETID_NAME])
+ goto error;
+ string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+ /* pnet_name receives the trimmed, upper-cased pnetid */
+ if (!smc_pnetid_valid(string, pnet_name))
+ goto error;
+
+ if (tb[SMC_PNETID_ETHNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+ rc = smc_pnet_add_eth(pnettable, net, string, pnet_name);
+ /* -EEXIST is not fatal yet, the ib part may still succeed */
+ if (!rc)
+ new_netdev = true;
+ else if (rc != -EEXIST)
+ goto error;
+ }
+
+ /* if this is not the initial namespace, stop here */
+ if (net != &init_net)
+ return new_netdev ? 0 : -EEXIST;
+
+ rc = -EINVAL;
+ if (tb[SMC_PNETID_IBNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+ string = strim(string);
+ if (tb[SMC_PNETID_IBPORT]) {
+ ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+ if (ibport < 1 || ibport > SMC_MAX_PORTS)
+ goto error;
+ }
+ rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name);
+ if (!rc)
+ new_ibdev = true;
+ else if (rc != -EEXIST)
+ goto error;
+ }
+ return (new_netdev || new_ibdev) ? 0 : -EEXIST;
+
+error:
+ return rc;
+}
+
+/* Emit @pnetelem as SMC_PNETID_NAME, _ETHNAME, _IBNAME and _IBPORT
+ * attributes into @msg. The attributes that do not apply to the entry type
+ * are filled with "n/a" respectively port 0xff.
+ * Returns 0 on success, -1 as soon as one attribute does not fit.
+ */
+static int smc_pnet_set_nla(struct sk_buff *msg,
+			    struct smc_pnetentry *pnetelem)
+{
+	const char *eth_name = "n/a";
+	const char *ib_name = "n/a";
+	u8 ib_port = 0xff;
+
+	if (pnetelem->type == SMC_PNET_ETH)
+		eth_name = pnetelem->eth_name;
+	if (pnetelem->type == SMC_PNET_IB) {
+		ib_name = pnetelem->ib_name;
+		ib_port = pnetelem->ib_port;
+	}
+
+	if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
+	    nla_put_string(msg, SMC_PNETID_ETHNAME, eth_name) ||
+	    nla_put_string(msg, SMC_PNETID_IBNAME, ib_name) ||
+	    nla_put_u8(msg, SMC_PNETID_IBPORT, ib_port))
+		return -1;
+
+	return 0;
+}
+
+/* SMC_PNETID_ADD handler: forwards the request attributes to
+ * smc_pnet_enter() for the namespace of the request.
+ */
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+	return smc_pnet_enter(genl_info_net(info), info->attrs);
+}
+
+/* SMC_PNETID_DEL handler: removes everything matching the pnetid passed in
+ * SMC_PNETID_NAME; -EINVAL if that attribute is missing.
+ */
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[SMC_PNETID_NAME];
+	struct net *net = genl_info_net(info);
+
+	if (name_attr)
+		return smc_pnet_remove_by_pnetid(net,
+						 (char *)nla_data(name_attr));
+	return -EINVAL;
+}
+
+/* Dump start callback: reset the resume index that smc_pnet_dump() keeps
+ * in cb->args[0].
+ */
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+ cb->args[0] = 0;
+ return 0;
+}
+
+/* Append one SMC_PNETID_GET message describing @pnetelem to @skb.
+ * Returns 0 on success, -ENOMEM if the message header cannot be added and
+ * -EMSGSIZE if the attributes do not fit (the message is cancelled then).
+ */
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+			     u32 portid, u32 seq, u32 flags,
+			     struct smc_pnetentry *pnetelem)
+{
+	void *msg_hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+				    flags, SMC_PNETID_GET);
+
+	if (!msg_hdr)
+		return -ENOMEM;
+
+	if (smc_pnet_set_nla(skb, pnetelem) >= 0) {
+		genlmsg_end(skb, msg_hdr);
+		return 0;
+	}
+
+	genlmsg_cancel(skb, msg_hdr);
+	return -EMSGSIZE;
+}
+
+/* Dump pnet table entries of @net into @skb.
+ *
+ * @pnetid: only dump entries matching this pnetid; NULL dumps all
+ * @start_idx: number of (matching) entries to skip before dumping
+ *
+ * Outside the initial namespace only SMC_PNET_ETH entries are dumped.
+ *
+ * Returns the index to resume from: the number of matching entries
+ * consumed so far, not counting an entry that did not fit into @skb.
+ */
+static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
+ u32 seq, u8 *pnetid, int start_idx)
+{
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *pnetelem;
+ struct smc_net *sn;
+ int idx = 0;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ /* dump pnettable entries */
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
+ if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid))
+ continue;
+ if (idx++ < start_idx)
+ continue;
+ /* if this is not the initial namespace, dump only netdev */
+ if (net != &init_net && pnetelem->type != SMC_PNET_ETH)
+ continue;
+ if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI,
+ pnetelem)) {
+ /* entry did not fit, retry it on the next call */
+ --idx;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return idx;
+}
+
+/* Dump callback of SMC_PNETID_GET: continues the unfiltered table dump at
+ * the index stored in cb->args[0], stores the new resume index there and
+ * returns the length of @skb.
+ */
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+
+	cb->args[0] = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid,
+				     cb->nlh->nlmsg_seq, NULL, cb->args[0]);
+	return skb->len;
+}
+
+/* Retrieve one PNETID entry
+ *
+ * doit handler of SMC_PNETID_GET: replies with a multi-part message holding
+ * every table entry that matches the pnetid given in SMC_PNETID_NAME,
+ * terminated by NLMSG_DONE. The reply holds only NLMSG_DONE if nothing
+ * matches.
+ *
+ * Returns -EINVAL without SMC_PNETID_NAME, -ENOMEM if the reply cannot be
+ * allocated, -EMSGSIZE if NLMSG_DONE does not fit, otherwise the result of
+ * genlmsg_reply().
+ */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct sk_buff *msg;
+ void *hdr;
+
+ if (!info->attrs[SMC_PNETID_NAME])
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ /* the returned resume index is not needed for a single reply */
+ _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq,
+ nla_data(info->attrs[SMC_PNETID_NAME]), 0);
+
+ /* finish multi part message and send it */
+ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0,
+ NLM_F_MULTI);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+ }
+ return genlmsg_reply(msg, info);
+}
+
+/* SMC_PNETID_FLUSH handler: removes every pnetid of the request's
+ * namespace. Always reports success, also when nothing was removed.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+	smc_pnet_remove_by_pnetid(genl_info_net(info), NULL);
+
+	return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition
+ *
+ * GET is the only command without GENL_ADMIN_PERM and the only one that
+ * supports dumps.
+ */
+static const struct genl_ops smc_pnet_ops[] = {
+ {
+ .cmd = SMC_PNETID_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = smc_pnet_get,
+ .dumpit = smc_pnet_dump,
+ .start = smc_pnet_dump_start
+ },
+ {
+ .cmd = SMC_PNETID_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_add
+ },
+ {
+ .cmd = SMC_PNETID_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_del
+ },
+ {
+ .cmd = SMC_PNETID_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_flush
+ }
+};
+
+/* SMC_PNETID family definition
+ *
+ * Registered by smc_pnet_init() and unregistered by smc_pnet_exit().
+ */
+static struct genl_family smc_pnet_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SMCR_GENL_FAMILY_NAME,
+ .version = SMCR_GENL_FAMILY_VERSION,
+ .maxattr = SMC_PNETID_MAX,
+ .policy = smc_pnet_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_pnet_ops,
+ .n_ops = ARRAY_SIZE(smc_pnet_ops),
+ .resv_start_op = SMC_PNETID_FLUSH + 1,
+};
+
+/* Check whether @pnetid is on the pnetids_ndev list of namespace @net.
+ * The list is walked under its read lock.
+ */
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
+{
+	struct smc_net *sn = net_generic(net, smc_net_id);
+	struct smc_pnetids_ndev_entry *entry;
+	bool found = false;
+
+	read_lock(&sn->pnetids_ndev.lock);
+	list_for_each_entry(entry, &sn->pnetids_ndev.list, list) {
+		found = smc_pnet_match(pnetid, entry->pnetid);
+		if (found)
+			break;
+	}
+	read_unlock(&sn->pnetids_ndev.lock);
+
+	return found;
+}
+
+/* Account for one more user of @pnetid on the pnetids_ndev list of @net.
+ *
+ * If the pnetid is already listed, the reference count of the existing
+ * entry is incremented. Otherwise a new entry with reference count 1 is
+ * appended. smc_pnet_remove_pnetid() is the counterpart.
+ *
+ * Returns 0 on success, -ENOMEM if no entry could be allocated.
+ */
+static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
+{
+	struct smc_net *sn = net_generic(net, smc_net_id);
+	struct smc_pnetids_ndev_entry *pe, *pi;
+
+	/* allocate the candidate entry before taking the write lock */
+	pe = kzalloc(sizeof(*pe), GFP_KERNEL);
+	if (!pe)
+		return -ENOMEM;
+
+	write_lock(&sn->pnetids_ndev.lock);
+	list_for_each_entry(pi, &sn->pnetids_ndev.list, list) {
+		/* compare with the listed entry pi; the zeroed candidate pe
+		 * can never match a set pnetid
+		 */
+		if (smc_pnet_match(pnetid, pi->pnetid)) {
+			refcount_inc(&pi->refcnt);
+			kfree(pe);
+			goto unlock;
+		}
+	}
+	refcount_set(&pe->refcnt, 1);
+	memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN);
+	list_add_tail(&pe->list, &sn->pnetids_ndev.list);
+
+unlock:
+	write_unlock(&sn->pnetids_ndev.lock);
+	return 0;
+}
+
+/* Drop one reference on the first pnetids_ndev entry of @net matching
+ * @pnetid; the entry is unlinked and freed when its count reaches zero.
+ * Nothing happens if the pnetid is not listed.
+ */
+static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pe2;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ if (refcount_dec_and_test(&pe->refcnt)) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ break;
+ }
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+/* If the base device of @dev is up and smc_pnetid_by_dev_port() yields a
+ * pnetid for it, add that pnetid to the pnetids_ndev list of @net.
+ * @ndev_pnetid is caller provided scratch space for the pnetid.
+ */
+static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
+				     u8 *ndev_pnetid)
+{
+	struct net_device *base_dev = __pnet_find_base_ndev(dev);
+
+	if (!(base_dev->flags & IFF_UP))
+		return;
+	if (smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
+				   ndev_pnetid))
+		return;
+
+	/* add to PNETIDs list */
+	smc_pnet_add_pnetid(net, ndev_pnetid);
+}
+
+/* Build the initial pnetids_ndev list of @net by visiting every net device
+ * of the namespace while holding RTNL.
+ */
+static void smc_pnet_create_pnetids_list(struct net *net)
+{
+	u8 pnetid_buf[SMC_MAX_PNETID_LEN];
+	struct net_device *ndev;
+
+	rtnl_lock();
+	for_each_netdev(net, ndev) {
+		smc_pnet_add_base_pnetid(net, ndev, pnetid_buf);
+	}
+	rtnl_unlock();
+}
+
+/* clean up list of netdevice pnetids
+ *
+ * Every entry is unlinked and freed regardless of its reference count.
+ */
+static void smc_pnet_destroy_pnetids_list(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *temp_pe;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+/* Netdevice notifier callback keeping the pnet table and the pnetids_ndev
+ * list in sync with net device state:
+ * - REBOOT/UNREGISTER: unbind the device from its table entry
+ * - REGISTER: bind the device to a waiting table entry
+ * - UP/DOWN: add/remove the pnetid of the base device to/from the list
+ * REGISTER, UNREGISTER and REBOOT are also forwarded to
+ * smc_ib_ndev_change(). Returns NOTIFY_OK for handled events, NOTIFY_DONE
+ * for all others.
+ */
+static int smc_pnet_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(event_dev);
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ smc_pnet_remove_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
+ return NOTIFY_OK;
+ case NETDEV_REGISTER:
+ smc_pnet_add_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
+ return NOTIFY_OK;
+ case NETDEV_UP:
+ smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
+ return NOTIFY_OK;
+ case NETDEV_DOWN:
+ /* net was taken from the event device before switching to base */
+ event_dev = __pnet_find_base_ndev(event_dev);
+ if (!smc_pnetid_by_dev_port(event_dev->dev.parent,
+ event_dev->dev_port, ndev_pnetid)) {
+ /* remove from PNETIDs list */
+ smc_pnet_remove_pnetid(net, ndev_pnetid);
+ }
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+/* netdevice notifier, registered in smc_pnet_init() */
+static struct notifier_block smc_netdev_notifier = {
+ .notifier_call = smc_pnet_netdev_event
+};
+
+/* init network namespace
+ *
+ * Sets up the empty pnet table and pnetids_ndev list of @net, fills the
+ * list from the net devices of the namespace and clears
+ * net->smc.limit_smc_hs. Always returns 0.
+ */
+int smc_pnet_net_init(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnettable *pnettable = &sn->pnettable;
+ struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
+
+ INIT_LIST_HEAD(&pnettable->pnetlist);
+ mutex_init(&pnettable->lock);
+ INIT_LIST_HEAD(&pnetids_ndev->list);
+ rwlock_init(&pnetids_ndev->lock);
+
+ smc_pnet_create_pnetids_list(net);
+
+ /* disable handshake limitation by default */
+ net->smc.limit_smc_hs = 0;
+
+ return 0;
+}
+
+/* Register the SMC_PNETID netlink family and the netdevice notifier. The
+ * family is unregistered again if the notifier cannot be registered.
+ * Returns 0 on success or the error of the failing registration.
+ */
+int __init smc_pnet_init(void)
+{
+	int rc = genl_register_family(&smc_pnet_nl_family);
+
+	if (rc)
+		return rc;
+
+	rc = register_netdevice_notifier(&smc_netdev_notifier);
+	if (!rc)
+		return 0;
+
+	genl_unregister_family(&smc_pnet_nl_family);
+	return rc;
+}
+
+/* exit network namespace
+ *
+ * Removes all pnet table entries of @net (and, for the initial namespace,
+ * all user defined device pnetids) and frees the pnetids_ndev list.
+ */
+void smc_pnet_net_exit(struct net *net)
+{
+ /* flush pnet table */
+ smc_pnet_remove_by_pnetid(net, NULL);
+ smc_pnet_destroy_pnetids_list(net);
+}
+
+/* Undo smc_pnet_init(), in reverse order of registration. */
+void smc_pnet_exit(void)
+{
+ unregister_netdevice_notifier(&smc_netdev_notifier);
+ genl_unregister_family(&smc_pnet_nl_family);
+}
+
+/* Descend from @ndev through its lower devices, always following the first
+ * entry of the lower adjacency list, for at most ndev->lower_level steps or
+ * until a device has no lower device. Caller must hold RTNL.
+ * Returns the device reached, which is @ndev itself if it is not stacked.
+ */
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
+{
+ int i, nest_lvl;
+
+ ASSERT_RTNL();
+ /* loop bound is taken from the starting device only */
+ nest_lvl = ndev->lower_level;
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next;
+ ndev = netdev_lower_get_next(ndev, &lower);
+ }
+ return ndev;
+}
+
+/* RTNL taking wrapper around __pnet_find_base_ndev().
+ * For stacked net devices one base device is determined. If a lower level
+ * holds several devices (for instance bonding slaves), only the first one
+ * is followed.
+ */
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+{
+	struct net_device *base_ndev;
+
+	rtnl_lock();
+	base_ndev = __pnet_find_base_ndev(ndev);
+	rtnl_unlock();
+
+	return base_ndev;
+}
+
+/* Look up the user defined pnetid of @ndev in the pnet table of its
+ * namespace. On a hit SMC_MAX_PNETID_LEN bytes are copied to @pnetid.
+ * Returns 0 if an entry bound to @ndev was found, -ENOENT otherwise.
+ */
+static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
+					      u8 *pnetid)
+{
+	struct smc_net *sn = net_generic(dev_net(ndev), smc_net_id);
+	struct smc_pnettable *pnettable = &sn->pnettable;
+	struct smc_pnetentry *entry;
+	int rc = -ENOENT;
+
+	mutex_lock(&pnettable->lock);
+	list_for_each_entry(entry, &pnettable->pnetlist, list) {
+		if (entry->type != SMC_PNET_ETH || entry->ndev != ndev)
+			continue;
+		/* get pnetid of netdev device */
+		memcpy(pnetid, entry->pnet_name, SMC_MAX_PNETID_LEN);
+		rc = 0;
+		break;
+	}
+	mutex_unlock(&pnettable->lock);
+
+	return rc;
+}
+
+/* Try to determine a gid for port @i of @ibdev and record device and port
+ * in @ini. With ini->check_smcrv2 set the v2 fields (smcrv2.ib_gid_v2,
+ * smcrv2.ib_dev_v2, smcrv2.ib_port_v2) are used, otherwise ib_gid, ib_dev
+ * and ib_port.
+ * Returns 0 on success, -ENODEV if no gid could be determined.
+ */
+static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i,
+				  struct smc_init_info *ini)
+{
+	if (ini->check_smcrv2) {
+		if (smc_ib_determine_gid(ibdev, i, ini->vlan_id,
+					 ini->smcrv2.ib_gid_v2, NULL,
+					 &ini->smcrv2))
+			return -ENODEV;
+		ini->smcrv2.ib_dev_v2 = ibdev;
+		ini->smcrv2.ib_port_v2 = i;
+		return 0;
+	}
+
+	if (smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL,
+				 NULL))
+		return -ENODEV;
+	ini->ib_dev = ibdev;
+	ini->ib_port = i;
+	return 0;
+}
+
+/* find a roce device for the given pnetid
+ *
+ * Scans all ib devices accessible from @net, except @known_dev, and stops
+ * at the first valid, active port that is not going away, has pnetid
+ * @pnet_id and for which smc_pnet_determine_gid() succeeds. The result is
+ * stored in @ini by smc_pnet_determine_gid(); @ini is left untouched if
+ * nothing is found.
+ */
+static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev,
+ struct net *net)
+{
+ struct smc_ib_device *ibdev;
+ int i;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (ibdev == known_dev ||
+ !rdma_dev_access_netns(ibdev->ibdev, net))
+ continue;
+ /* ports are numbered from 1, the per-port arrays from 0 */
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) &&
+ smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away)) {
+ if (!smc_pnet_determine_gid(ibdev, i, ini))
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* Search an alternate roce device for link group @lgr: same pnet_id and
+ * net namespace as the link group, vlan_id taken from @ini, and different
+ * from @known_dev. The result is stored in @ini.
+ */
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+			    struct smc_init_info *ini,
+			    struct smc_ib_device *known_dev)
+{
+	_smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, lgr->net);
+}
+
+/* if handshake network device belongs to a roce device, return its
+ * IB device and port
+ *
+ * Compares @netdev with the net device reported by get_netdev() for every
+ * valid port of every ib device accessible from the namespace of @netdev.
+ * The result is stored in @ini by smc_pnet_determine_gid().
+ */
+static void smc_pnet_find_rdma_dev(struct net_device *netdev,
+ struct smc_init_info *ini)
+{
+ struct net *net = dev_net(netdev);
+ struct smc_ib_device *ibdev;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ struct net_device *ndev;
+ int i;
+
+ /* check rdma net namespace */
+ if (!rdma_dev_access_netns(ibdev->ibdev, net))
+ continue;
+
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (!ibdev->ibdev->ops.get_netdev)
+ continue;
+ ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i);
+ if (!ndev)
+ continue;
+ /* ndev is only compared by address below */
+ dev_put(ndev);
+ if (netdev == ndev &&
+ smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away)) {
+ /* NOTE(review): break only leaves the port loop,
+ * the scan continues with the next ib device
+ */
+ if (!smc_pnet_determine_gid(ibdev, i, ini))
+ break;
+ }
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* Determine the corresponding IB device port based on the hardware PNETID.
+ * Searching stops at the first matching active IB device port with vlan_id
+ * configured.
+ * If nothing found, check pnetid table.
+ * If nothing found, try to use handshake device
+ *
+ * All lookups are done for the base device of @ndev. The result is stored
+ * in @ini.
+ */
+static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
+ struct smc_init_info *ini)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct net *net;
+
+ ndev = pnet_find_base_ndev(ndev);
+ net = dev_net(ndev);
+ /* both lookups return non-zero when they find no pnetid */
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid) &&
+ smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
+ smc_pnet_find_rdma_dev(ndev, ini);
+ return; /* pnetid could not be determined */
+ }
+ _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net);
+}
+
+/* Find an smcd device with the pnetid of the base device of @ndev and
+ * store it in ini->ism_dev[0].
+ *
+ * The pnetid is taken from smc_pnetid_by_dev_port() or, failing that, from
+ * the pnet table. Devices that are going away are skipped. If
+ * ini->ism_peer_gid[0] is set, only a device for which smc_ism_cantalk()
+ * returns 0 is accepted. ini->ism_dev[0] is left unchanged when no pnetid
+ * or no device is found.
+ */
+static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
+ struct smc_init_info *ini)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct smcd_dev *ismdev;
+
+ ndev = pnet_find_base_ndev(ndev);
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid) &&
+ smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid))
+ return; /* pnetid could not be determined */
+
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
+ if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
+ !ismdev->going_away &&
+ (!ini->ism_peer_gid[0] ||
+ !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
+ ismdev))) {
+ ini->ism_dev[0] = ismdev;
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+}
+
+/* PNET table analysis for a given sock:
+ * the net device of the sock's dst entry is used to determine the ib device
+ * and port, which are stored in @ini. Nothing happens if the sock has no
+ * dst entry or the dst entry has no device.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+
+	if (!dst)
+		return;
+
+	if (dst->dev)
+		smc_pnet_find_roce_by_pnetid(dst->dev, ini);
+
+	dst_release(dst);
+}
+
+/* Determine the smcd device for the net device of the sock's dst entry.
+ * ini->ism_dev[0] is reset to NULL first and only set again if a matching
+ * device is found.
+ */
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+
+	ini->ism_dev[0] = NULL;
+	if (!dst)
+		return;
+
+	if (dst->dev)
+		smc_pnet_find_ism_by_pnetid(dst->dev, ini);
+
+	dst_release(dst);
+}
+
+/* Search the pnet table of the initial namespace for an ib entry with the
+ * name of @smcibdev and port @ib_port and apply its pnetid to the device.
+ * Returns 0 if such an entry exists (even if the device kept a pnetid it
+ * already had), -ENOENT otherwise.
+ */
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
+{
+	struct smc_net *sn = net_generic(&init_net, smc_net_id);
+	struct smc_pnettable *pnettable = &sn->pnettable;
+	char *ib_name = smcibdev->ibdev->name;
+	struct smc_pnetentry *entry;
+	int rc = -ENOENT;
+
+	mutex_lock(&pnettable->lock);
+	list_for_each_entry(entry, &pnettable->pnetlist, list) {
+		if (entry->type != SMC_PNET_IB)
+			continue;
+		if (strncmp(entry->ib_name, ib_name, IB_DEVICE_NAME_MAX))
+			continue;
+		if (entry->ib_port != ib_port)
+			continue;
+		smc_pnet_apply_ib(smcibdev, ib_port, entry->pnet_name);
+		rc = 0;
+		break;
+	}
+	mutex_unlock(&pnettable->lock);
+
+	return rc;
+}
+
+/* Search the pnet table of the initial namespace for an ib entry carrying
+ * the device name of @smcddev and apply its pnetid to the device.
+ * Returns 0 if such an entry exists (even if the device kept a pnetid it
+ * already had), -ENOENT otherwise.
+ */
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
+{
+	struct smc_net *sn = net_generic(&init_net, smc_net_id);
+	struct smc_pnettable *pnettable = &sn->pnettable;
+	const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev));
+	struct smc_pnetentry *entry;
+	int rc = -ENOENT;
+
+	mutex_lock(&pnettable->lock);
+	list_for_each_entry(entry, &pnettable->pnetlist, list) {
+		if (entry->type != SMC_PNET_IB)
+			continue;
+		if (strncmp(entry->ib_name, ib_name, IB_DEVICE_NAME_MAX))
+			continue;
+		smc_pnet_apply_smcd(smcddev, entry->pnet_name);
+		rc = 0;
+		break;
+	}
+	mutex_unlock(&pnettable->lock);
+
+	return rc;
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000..80a88eea4
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * PNET table queries
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+#include <net/smc.h>
+
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+#include <asm/pnet.h>
+#endif
+
+struct smc_ib_device;
+struct smcd_dev;
+struct smc_init_info;
+struct smc_link_group;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Mutex held while @pnetlist is walked or modified
+ * @pnetlist: List head of the PNETID entries (struct smc_pnetentry)
+ *
+ * One instance is embedded in the per-namespace struct smc_net.
+ */
+struct smc_pnettable {
+ struct mutex lock;
+ struct list_head pnetlist;
+};
+
+/* Anchor of the list of pnetids for net devices in UP state. */
+struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state */
+ struct list_head list; /* head of the pnetid entries */
+ rwlock_t lock; /* presumably guards @list - confirm against the users */
+};
+
+/* One pnetid element; carries a list_head for linking, the pnetid itself
+ * and a reference count.
+ */
+struct smc_pnetids_ndev_entry {
+ struct list_head list; /* list linkage */
+ u8 pnetid[SMC_MAX_PNETID_LEN]; /* the pnetid, SMC_MAX_PNETID_LEN bytes */
+ refcount_t refcnt; /* presumably counts net devices using it - confirm */
+};
+
+/* Query the architecture for the pnetid of @dev / @port.
+ * Without CONFIG_HAVE_PNETID there is no such facility and the lookup
+ * always fails with -ENOENT; otherwise the result of pnet_id_by_dev_port()
+ * is returned and @pnetid is passed through to it.
+ */
+static inline int smc_pnetid_by_dev_port(struct device *dev,
+					 unsigned short port, u8 *pnetid)
+{
+#if !IS_ENABLED(CONFIG_HAVE_PNETID)
+	return -ENOENT;
+#else
+	return pnet_id_by_dev_port(dev, port, pnetid);
+#endif
+}
+
+int smc_pnet_init(void) __init;
+int smc_pnet_net_init(struct net *net);
+void smc_pnet_exit(void);
+void smc_pnet_net_exit(struct net *net);
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini);
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini);
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port);
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcd);
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev);
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid);
+bool smc_pnet_is_pnetid_set(u8 *pnetid);
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000..9a2f3638d
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/signal.h>
+#include <linux/splice.h>
+
+#include <net/sock.h>
+#include <trace/events/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
+
+/* callback implementation to wakeup consumers blocked with smc_rx_wait().
+ * indirectly called by smc_cdc_msg_recv_action().
+ * Installed as sk->sk_data_ready by smc_rx_init().
+ */
+static void smc_rx_wake_up(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ trace_sk_data_ready(sk);
+
+ /* derived from sock_def_readable() */
+ /* called already in smc_listen_work() */
+ rcu_read_lock();
+ /* sk_wq is read under RCU protection */
+ wq = rcu_dereference(sk->sk_wq);
+ /* only issue the wakeup if somebody is sleeping on the wait queue */
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+ EPOLLRDNORM | EPOLLRDBAND);
+ /* async notification: data is readable */
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ /* additionally signal hang-up once both directions are shut down or
+ * the socket is closed
+ */
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ rcu_read_unlock();
+}
+
+/* Update consumer cursor
+ * @smc smc socket whose connection (smc->conn) is updated
+ * @cons consumer cursor to start from (passed by value)
+ * @len number of Bytes consumed
+ * Advances @cons by @len, applies the urgent data handling below, stores
+ * the result in conn->local_tx_ctrl.cons and calls smc_tx_consumer_update().
+ * Returns:
+ * 1 if we should end our receive, 0 otherwise
+ */
+static int smc_rx_update_consumer(struct smc_sock *smc,
+ union smc_host_cursor cons, size_t len)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ bool force = false;
+ int diff, rc = 0;
+
+ /* advance the local copy of the cursor within the RMB of size
+ * rmb_desc->len
+ */
+ smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+ /* did we process urgent data? */
+ if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+ /* position of the advanced cursor relative to the urgent cursor;
+ * NOTE(review): sign convention is defined by smc_curs_comp(),
+ * which is not visible here
+ */
+ diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+ &conn->urg_curs);
+ if (sock_flag(sk, SOCK_URGINLINE)) {
+ /* inline mode: reaching the urgent cursor ends this receive
+ * (rc = 1) and forces a consumer update
+ */
+ if (diff == 0) {
+ force = true;
+ rc = 1;
+ conn->urg_state = SMC_URG_READ;
+ }
+ } else {
+ if (diff == 1) {
+ /* skip urgent byte */
+ force = true;
+ smc_curs_add(conn->rmb_desc->len, &cons, 1);
+ conn->urg_rx_skip_pend = false;
+ } else if (diff < -1)
+ /* we read past urgent byte */
+ conn->urg_state = SMC_URG_READ;
+ }
+ }
+
+ /* publish the new consumer cursor for the connection */
+ smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
+
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn, force);
+
+ return rc;
+}
+
+/* Advance the connection's current consumer cursor by @len bytes: take a
+ * snapshot of conn->local_tx_ctrl.cons and let smc_rx_update_consumer() do
+ * the update; its return value is not needed here.
+ */
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+	union smc_host_cursor cur;
+
+	smc_curs_copy(&cur, &smc->conn.local_tx_ctrl.cons, &smc->conn);
+	smc_rx_update_consumer(smc, cur, len);
+}
+
+/* Private data attached to each pipe buffer created by smc_rx_splice();
+ * read and freed again in smc_rx_pipe_buf_release().
+ */
+struct smc_spd_priv {
+ struct smc_sock *smc; /* socket the spliced data belongs to */
+ size_t len; /* number of bytes carried by this pipe buffer */
+};
+
+/* pipe_buf_operations.release callback for buffers queued by
+ * smc_rx_splice().
+ * Unless the socket is already in one of the closed/closing states checked
+ * below, the consumer cursor is advanced by the buffer's length under the
+ * socket lock, splice_pending is reduced accordingly and, once it drops to
+ * zero, waiting readers are woken up. In every case the private data, the
+ * page reference and the socket reference of this buffer are released.
+ */
+static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	struct smc_spd_priv *spd_priv = (struct smc_spd_priv *)buf->private;
+	struct smc_sock *smc = spd_priv->smc;
+	struct sock *sk = &smc->sk;
+	bool closing;
+
+	closing = sk->sk_state == SMC_CLOSED ||
+		  sk->sk_state == SMC_PEERFINCLOSEWAIT ||
+		  sk->sk_state == SMC_APPFINCLOSEWAIT;
+	if (!closing) {
+		lock_sock(sk);
+		smc_rx_update_cons(smc, spd_priv->len);
+		release_sock(sk);
+		if (atomic_sub_and_test(spd_priv->len,
+					&smc->conn.splice_pending))
+			smc_rx_wake_up(sk);
+	}
+
+	kfree(spd_priv);
+	put_page(buf->page);
+	sock_put(sk);
+}
+
+/* Pipe buffer operations for buffers queued by smc_rx_splice(): SMC specific
+ * release handling, generic reference taking.
+ */
+static const struct pipe_buf_operations smc_pipe_ops = {
+ .release = smc_rx_pipe_buf_release,
+ .get = generic_pipe_buf_get
+};
+
+/* splice_pipe_desc.spd_release callback: drop the page reference of
+ * descriptor entry @i.
+ */
+static void smc_rx_spd_release(struct splice_pipe_desc *spd,
+			       unsigned int i)
+{
+	struct page *pg = spd->pages[i];
+
+	put_page(pg);
+}
+
+/* Queue @len bytes of receive buffer data starting at @src into @pipe by
+ * reference (no copy).
+ * @pipe destination pipe
+ * @src  start of the data inside the RMB
+ * @len  number of bytes to queue
+ * @smc  smc socket owning the RMB
+ *
+ * For SMC-D and for SMC-R with a physically contiguous RMB a single
+ * descriptor covering the whole range is built; for SMC-R with a virtually
+ * contiguous RMB one descriptor per touched page is built.
+ *
+ * Reference accounting: smc_rx_pipe_buf_release() drops one page reference,
+ * one socket reference and one smc_spd_priv per pipe buffer, and
+ * splice_to_pipe() calls smc_rx_spd_release() (put_page()) for every
+ * descriptor it could not queue. Therefore a page reference is taken for
+ * every descriptor before splice_to_pipe() is called, and afterwards one
+ * socket reference is taken per queued descriptor while the private data of
+ * the descriptors that were not queued is freed here.
+ *
+ * Returns the value of splice_to_pipe() (bytes queued, or a negative error),
+ * or -ENOMEM if the bookkeeping data cannot be allocated.
+ */
+static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
+			 struct smc_sock *smc)
+{
+	struct smc_link_group *lgr = smc->conn.lgr;
+	int offset = offset_in_page(src);
+	struct partial_page *partial;
+	struct splice_pipe_desc spd;
+	struct smc_spd_priv **priv;
+	struct page **pages;
+	int bytes, nr_pages;
+	int queued;
+	bool is_vm;
+	int i;
+
+	/* only SMC-R with a virtually contiguous RMB needs per-page entries */
+	is_vm = !lgr->is_smcd && smc->conn.rmb_desc->is_vm;
+	nr_pages = is_vm ? PAGE_ALIGN(len + offset) / PAGE_SIZE : 1;
+
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		goto out;
+	partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL);
+	if (!partial)
+		goto out_page;
+	priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto out_part;
+	for (i = 0; i < nr_pages; i++) {
+		priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL);
+		if (!priv[i])
+			goto out_priv;
+	}
+
+	if (!is_vm) {
+		/* smcd or smcr that uses physically contiguous RMBs */
+		priv[0]->len = len;
+		priv[0]->smc = smc;
+		partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+		partial[0].len = len;
+		partial[0].private = (unsigned long)priv[0];
+		pages[0] = smc->conn.rmb_desc->pages;
+	} else {
+		int size, left = len;
+		char *buf = src;
+
+		/* smcr that uses virtually contiguous RMBs */
+		for (i = 0; i < nr_pages; i++) {
+			size = min_t(int, PAGE_SIZE - offset, left);
+			priv[i]->len = size;
+			priv[i]->smc = smc;
+			pages[i] = vmalloc_to_page(buf);
+			partial[i].offset = offset;
+			partial[i].len = size;
+			partial[i].private = (unsigned long)priv[i];
+			buf += size;
+			left -= size;
+			/* only the first page starts at an offset */
+			offset = 0;
+		}
+	}
+
+	/* one page reference per descriptor; dropped again either by
+	 * smc_rx_spd_release() (not queued) or by smc_rx_pipe_buf_release()
+	 */
+	for (i = 0; i < nr_pages; i++)
+		get_page(pages[i]);
+
+	spd.nr_pages_max = nr_pages;
+	spd.nr_pages = nr_pages;
+	spd.pages = pages;
+	spd.partial = partial;
+	spd.ops = &smc_pipe_ops;
+	spd.spd_release = smc_rx_spd_release;
+
+	bytes = splice_to_pipe(pipe, &spd);
+
+	/* splice_to_pipe() queues the descriptors in order and decrements
+	 * spd.nr_pages once per queued descriptor, so the difference is the
+	 * number of pipe buffers that now reference our private data
+	 */
+	queued = nr_pages - spd.nr_pages;
+
+	/* one socket reference per pipe buffer, matching the sock_put() in
+	 * smc_rx_pipe_buf_release()
+	 */
+	for (i = 0; i < queued; i++)
+		sock_hold(&smc->sk);
+	/* private data of descriptors that never reached the pipe */
+	for (i = queued; i < nr_pages; i++)
+		kfree(priv[i]);
+
+	if (bytes > 0)
+		atomic_add(bytes, &smc->conn.splice_pending);
+
+	kfree(priv);
+	kfree(partial);
+	kfree(pages);
+
+	return bytes;
+
+out_priv:
+	/* priv[i] failed to allocate; free entries 0..i-1 */
+	while (--i >= 0)
+		kfree(priv[i]);
+	kfree(priv);
+out_part:
+	kfree(partial);
+out_page:
+	kfree(pages);
+out:
+	return -ENOMEM;
+}
+
+/* Wait criterion for smc_rx_wait(): non-zero only if the receive buffer
+ * holds data and no spliced bytes are pending.
+ */
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+{
+	if (!atomic_read(&conn->bytes_to_rcv))
+		return 0;
+
+	return !atomic_read(&conn->splice_pending);
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ * @smc smc socket
+ * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * (NOTE(review): callers in this file pass the value obtained from
+ * sock_rcvtimeo(), so the unit is presumably jiffies - confirm)
+ * @fcrit add'l criterion to evaluate as function pointer
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn))
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct smc_cdc_conn_state_flags *cflags =
+ &conn->local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+ int rc;
+
+ /* fast path: criterion already fulfilled, no need to sleep */
+ if (fcrit(conn))
+ return 1;
+ /* flag the socket as waiting for data while we sleep */
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ add_wait_queue(sk_sleep(sk), &wait);
+ /* wait ends on socket error, peer abort, receive shutdown, a killed
+ * connection or when the caller's criterion becomes true
+ */
+ rc = sk_wait_event(sk, timeo,
+ READ_ONCE(sk->sk_err) ||
+ cflags->peer_conn_abort ||
+ READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN ||
+ conn->killed ||
+ fcrit(conn),
+ &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ return rc;
+}
+
+/* Receive the urgent byte for a recvmsg() call with MSG_OOB set.
+ * @smc smc socket
+ * @msg message to copy the urgent byte into
+ * @len length requested by the caller
+ * @flags recvmsg flags (MSG_PEEK and MSG_TRUNC are evaluated)
+ * Returns:
+ * -EINVAL if urgent data is delivered inline (SOCK_URGINLINE) or no valid
+ * urgent byte is pending; -EFAULT if copying to @msg failed; otherwise 1
+ * if @len > 0, or @len itself (with MSG_TRUNC set in msg->msg_flags) if not.
+ */
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+ int flags)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+ struct sock *sk = &smc->sk;
+ int rc = 0;
+
+ /* only proceed when an urgent byte is pending out-of-band */
+ if (sock_flag(sk, SOCK_URGINLINE) ||
+ !(conn->urg_state == SMC_URG_VALID) ||
+ conn->urg_state == SMC_URG_READ)
+ return -EINVAL;
+
+ SMC_STAT_INC(smc, urg_data_cnt);
+ if (conn->urg_state == SMC_URG_VALID) {
+ /* a peek leaves the urgent byte pending */
+ if (!(flags & MSG_PEEK))
+ smc->conn.urg_state = SMC_URG_READ;
+ msg->msg_flags |= MSG_OOB;
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+ len = 1;
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ /* urgent byte is still ahead of the consumer cursor: remember
+ * that it has to be skipped by the normal receive path
+ */
+ if (smc_curs_diff(conn->rmb_desc->len, &cons,
+ &conn->urg_curs) > 1)
+ conn->urg_rx_skip_pend = true;
+ /* Urgent Byte was already accounted for, but trigger
+ * skipping the urgent byte in non-inline case
+ */
+ if (!(flags & MSG_PEEK))
+ smc_rx_update_consumer(smc, cons, 0);
+ } else {
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ return rc ? -EFAULT : len;
+ }
+
+ /* NOTE(review): the guard at the top already returned unless
+ * urg_state == SMC_URG_VALID, so the statements below look
+ * unreachable - confirm before relying on them
+ */
+ if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+
+ return -EAGAIN;
+}
+
+/* Tell smc_rx_recvmsg() whether there is data to copy.
+ * If the receive buffer is empty but an urgent byte is still marked valid,
+ * only that single urgent byte was received; it is skipped by triggering a
+ * consumer update of length 0.
+ */
+static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+
+	if (smc_rx_data_available(conn))
+		return true;
+
+	if (conn->urg_state == SMC_URG_VALID) {
+		/* we received a single urgent Byte - skip */
+		smc_rx_update_cons(smc, 0);
+	}
+
+	return false;
+}
+
+/* smc_rx_recvmsg - receive data from RMBE
+ * @smc: smc socket to receive from
+ * @msg: copy data to receive buffer
+ * @pipe: copy data to pipe if set - indicates splice() call
+ * @len: maximum number of bytes to receive
+ * @flags: recvmsg flags; MSG_ERRQUEUE, MSG_OOB, MSG_DONTWAIT, MSG_WAITALL,
+ * MSG_TRUNC and MSG_PEEK are evaluated here
+ *
+ * rcvbuf consumer: main API called by socket layer.
+ * Called under sk lock.
+ *
+ * Returns the number of bytes received, or a negative error code if nothing
+ * was received.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags)
+{
+ size_t copylen, read_done = 0, read_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ int (*func)(struct smc_connection *conn);
+ union smc_host_cursor cons;
+ int readable, chunk;
+ char *rcvbuf_base;
+ struct sock *sk;
+ int splbytes;
+ long timeo;
+ int target; /* Read at least these many bytes */
+ int rc;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+
+ sk = &smc->sk;
+ if (sk->sk_state == SMC_LISTEN)
+ return -ENOTCONN;
+ /* out-of-band requests are handled separately */
+ if (flags & MSG_OOB)
+ return smc_rx_recv_urg(smc, msg, len, flags);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ /* statistics only: RMB completely filled / request smaller than what
+ * is readable
+ */
+ readable = atomic_read(&conn->bytes_to_rcv);
+ if (readable >= conn->rmb_desc->len)
+ SMC_STAT_RMB_RX_FULL(smc, !conn->lnk);
+
+ if (len < readable)
+ SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk);
+ /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
+
+ do { /* while (read_remaining) */
+ /* enough data delivered; a splice() call ends after its first
+ * successful transfer
+ */
+ if (read_done >= target || (pipe && read_done))
+ break;
+
+ if (conn->killed)
+ break;
+
+ if (smc_rx_recvmsg_data_available(smc))
+ goto copy;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* smc_cdc_msg_recv_action() could have run after
+ * above smc_rx_recvmsg_data_available()
+ */
+ if (smc_rx_recvmsg_data_available(smc))
+ goto copy;
+ break;
+ }
+
+ if (read_done) {
+ /* something was already read: return it instead of an error */
+ if (sk->sk_err ||
+ sk->sk_state == SMC_CLOSED ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ /* nothing read yet: report the reason as return value */
+ if (sk->sk_err) {
+ read_done = sock_error(sk);
+ break;
+ }
+ if (sk->sk_state == SMC_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ read_done = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (!timeo)
+ return -EAGAIN;
+ if (signal_pending(current)) {
+ read_done = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ if (!smc_rx_data_available(conn)) {
+ smc_rx_wait(smc, &timeo, smc_rx_data_available);
+ continue;
+ }
+
+copy:
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after waiting on data above */
+ readable = atomic_read(&conn->bytes_to_rcv);
+ splbytes = atomic_read(&conn->splice_pending);
+ /* a recvmsg() has to wait until previously spliced bytes have
+ * been released from the pipe
+ */
+ if (!readable || (msg && splbytes)) {
+ if (splbytes)
+ func = smc_rx_data_available_and_no_splice_pend;
+ else
+ func = smc_rx_data_available;
+ smc_rx_wait(smc, &timeo, func);
+ continue;
+ }
+
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ /* subsequent splice() calls pick up where previous left */
+ if (splbytes)
+ smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+ if (conn->urg_state == SMC_URG_VALID &&
+ sock_flag(&smc->sk, SOCK_URGINLINE) &&
+ readable > 1)
+ readable--; /* always stop at urgent Byte */
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, read_remaining, readable);
+ /* determine chunks where to read from rcvbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
+ cons.count);
+ chunk_len_sum = chunk_len;
+ chunk_off = cons.count;
+ smc_rmb_sync_sg_for_cpu(conn);
+ /* at most two chunks: up to the end of the ring buffer, then the
+ * wrapped remainder from its start
+ */
+ for (chunk = 0; chunk < 2; chunk++) {
+ if (!(flags & MSG_TRUNC)) {
+ if (msg) {
+ rc = memcpy_to_msg(msg, rcvbuf_base +
+ chunk_off,
+ chunk_len);
+ } else {
+ /* NOTE(review): smc_rx_splice() returns the number
+ * of bytes actually queued, but chunk_len is what
+ * gets accounted below - confirm that a short
+ * splice cannot happen here
+ */
+ rc = smc_rx_splice(pipe, rcvbuf_base +
+ chunk_off, chunk_len,
+ smc);
+ }
+ if (rc < 0) {
+ /* any copy/splice failure is reported as -EFAULT
+ * unless data was already delivered
+ */
+ if (!read_done)
+ read_done = -EFAULT;
+ goto out;
+ }
+ }
+ read_remaining -= chunk_len;
+ read_done += chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in recv ring buffer */
+ }
+
+ /* update cursors */
+ if (!(flags & MSG_PEEK)) {
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
+ smp_mb__after_atomic();
+ /* for splice() the consumer cursor is advanced later, in
+ * smc_rx_pipe_buf_release()
+ */
+ if (msg && smc_rx_update_consumer(smc, cons, copylen))
+ goto out;
+ }
+
+ trace_smc_rx_recvmsg(smc, copylen);
+ } while (read_remaining);
+out:
+ return read_done;
+}
+
+/* Set up the receive side of @smc on connection establishment: install the
+ * data-ready callback, clear the count of spliced-but-unreleased bytes and
+ * mark urgent data as already read. NB: not __init!
+ */
+void smc_rx_init(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+
+	sk->sk_data_ready = smc_rx_wake_up;
+	atomic_set(&conn->splice_pending, 0);
+	conn->urg_state = SMC_URG_READ;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000..db823c97d
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags);
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn));
+/* Number of bytes currently readable in the connection's receive buffer
+ * (conn->bytes_to_rcv); also usable as wait criterion for smc_rx_wait().
+ */
+static inline int smc_rx_data_available(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv);
+}
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c
new file mode 100644
index 000000000..ca14c0f3a
--- /dev/null
+++ b/net/smc/smc_stats.c
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC statistics netlink routines
+ *
+ * Copyright IBM Corp. 2021
+ *
+ * Author(s): Guvenc Gulce
+ */
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/smc.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include "smc_netlink.h"
+#include "smc_stats.h"
+
+/* Allocate the statistics storage of @net: the fallback reason table, the
+ * per-CPU counters and the mutex protecting the fallback reasons.
+ * Returns 0 on success or -ENOMEM; on failure nothing stays allocated.
+ */
+int smc_stats_init(struct net *net)
+{
+	net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL);
+	if (!net->smc.fback_rsn)
+		return -ENOMEM;
+
+	net->smc.smc_stats = alloc_percpu(struct smc_stats);
+	if (!net->smc.smc_stats) {
+		/* undo the first allocation */
+		kfree(net->smc.fback_rsn);
+		return -ENOMEM;
+	}
+
+	mutex_init(&net->smc.mutex_fback_rsn);
+	return 0;
+}
+
+/* Release the statistics storage of @net set up by smc_stats_init(). */
+void smc_stats_exit(struct net *net)
+{
+	struct smc_stats __percpu *pcpu_stats = net->smc.smc_stats;
+
+	if (pcpu_stats)
+		free_percpu(pcpu_stats);
+	kfree(net->smc.fback_rsn);
+}
+
+/* Emit the RMB counters of one direction as nested attribute @type.
+ * @skb   netlink message under construction
+ * @stats summed-up statistics
+ * @tech  index into stats->smc[]
+ * @type  SMC_NLA_STATS_T_TX_RMB_STATS selects the tx counters, any other
+ *        value the rx counters
+ * Returns 0 on success, -EMSGSIZE if the attributes do not fit; in that
+ * case the partially written nest is cancelled.
+ */
+static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb,
+				      struct smc_stats *stats, int tech,
+				      int type)
+{
+	struct smc_stats_rmbcnt *cnt =
+		type == SMC_NLA_STATS_T_TX_RMB_STATS ?
+		&stats->smc[tech].rmb_tx : &stats->smc[tech].rmb_rx;
+	/* attribute/value pairs in the order they are put into the message */
+	const struct {
+		int attr;
+		u64 val;
+	} counters[] = {
+		{ SMC_NLA_STATS_RMB_REUSE_CNT, cnt->reuse_cnt },
+		{ SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT,
+		  cnt->buf_size_small_peer_cnt },
+		{ SMC_NLA_STATS_RMB_SIZE_SM_CNT, cnt->buf_size_small_cnt },
+		{ SMC_NLA_STATS_RMB_FULL_PEER_CNT, cnt->buf_full_peer_cnt },
+		{ SMC_NLA_STATS_RMB_FULL_CNT, cnt->buf_full_cnt },
+		{ SMC_NLA_STATS_RMB_ALLOC_CNT, cnt->alloc_cnt },
+		{ SMC_NLA_STATS_RMB_DGRADE_CNT, cnt->dgrade_cnt },
+	};
+	struct nlattr *nest;
+	unsigned int i;
+
+	nest = nla_nest_start(skb, type);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (i = 0; i < ARRAY_SIZE(counters); i++) {
+		if (nla_put_u64_64bit(skb, counters[i].attr, counters[i].val,
+				      SMC_NLA_STATS_RMB_PAD)) {
+			nla_nest_cancel(skb, nest);
+			return -EMSGSIZE;
+		}
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* Emit one size histogram as nested attribute @type.
+ * @skb   netlink message under construction
+ * @stats summed-up statistics
+ * @tech  index into stats->smc[]
+ * @type  selects the histogram: tx/rx payload size or tx/rx RMB size
+ * Returns 0 on success, -EMSGSIZE if @type is unknown or the attributes do
+ * not fit; a partially written nest is cancelled.
+ */
+static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb,
+					  struct smc_stats *stats, int tech,
+					  int type)
+{
+	/* attribute per histogram bucket, in the order they are emitted */
+	static const struct {
+		int attr;
+		int bucket;
+	} slots[] = {
+		{ SMC_NLA_STATS_PLOAD_8K, SMC_BUF_8K },
+		{ SMC_NLA_STATS_PLOAD_16K, SMC_BUF_16K },
+		{ SMC_NLA_STATS_PLOAD_32K, SMC_BUF_32K },
+		{ SMC_NLA_STATS_PLOAD_64K, SMC_BUF_64K },
+		{ SMC_NLA_STATS_PLOAD_128K, SMC_BUF_128K },
+		{ SMC_NLA_STATS_PLOAD_256K, SMC_BUF_256K },
+		{ SMC_NLA_STATS_PLOAD_512K, SMC_BUF_512K },
+		{ SMC_NLA_STATS_PLOAD_1024K, SMC_BUF_1024K },
+		{ SMC_NLA_STATS_PLOAD_G_1024K, SMC_BUF_G_1024K },
+	};
+	struct smc_stats_memsize *hist;
+	struct nlattr *nest;
+	unsigned int i;
+
+	switch (type) {
+	case SMC_NLA_STATS_T_TXPLOAD_SIZE:
+		hist = &stats->smc[tech].tx_pd;
+		break;
+	case SMC_NLA_STATS_T_RXPLOAD_SIZE:
+		hist = &stats->smc[tech].rx_pd;
+		break;
+	case SMC_NLA_STATS_T_TX_RMB_SIZE:
+		hist = &stats->smc[tech].tx_rmbsize;
+		break;
+	case SMC_NLA_STATS_T_RX_RMB_SIZE:
+		hist = &stats->smc[tech].rx_rmbsize;
+		break;
+	default:
+		return -EMSGSIZE;
+	}
+
+	nest = nla_nest_start(skb, type);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (i = 0; i < ARRAY_SIZE(slots); i++) {
+		if (nla_put_u64_64bit(skb, slots[i].attr,
+				      hist->buf[slots[i].bucket],
+				      SMC_NLA_STATS_PLOAD_PAD)) {
+			nla_nest_cancel(skb, nest);
+			return -EMSGSIZE;
+		}
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* Emit all statistics of one technology as a nested attribute.
+ * @skb   netlink message under construction
+ * @stats summed-up statistics
+ * @tech  SMC_TYPE_D selects the SMC-D nest, any other value the SMC-R nest;
+ *        also used as index into stats->smc[]
+ * The nest contains the tx/rx RMB counters, the four size histograms and
+ * the scalar counters, in that order. The sendpage counter is always
+ * reported as 0.
+ * Returns 0 on success, -EMSGSIZE otherwise; a partially written nest is
+ * cancelled.
+ */
+static int smc_nl_fill_stats_tech_data(struct sk_buff *skb,
+				       struct smc_stats *stats, int tech)
+{
+	struct smc_stats_tech *smc_tech = &stats->smc[tech];
+	/* scalar counters in the order they are put into the message */
+	const struct {
+		int attr;
+		u64 val;
+	} counters[] = {
+		{ SMC_NLA_STATS_T_CLNT_V1_SUCC, smc_tech->clnt_v1_succ_cnt },
+		{ SMC_NLA_STATS_T_CLNT_V2_SUCC, smc_tech->clnt_v2_succ_cnt },
+		{ SMC_NLA_STATS_T_SRV_V1_SUCC, smc_tech->srv_v1_succ_cnt },
+		{ SMC_NLA_STATS_T_SRV_V2_SUCC, smc_tech->srv_v2_succ_cnt },
+		{ SMC_NLA_STATS_T_RX_BYTES, smc_tech->rx_bytes },
+		{ SMC_NLA_STATS_T_TX_BYTES, smc_tech->tx_bytes },
+		{ SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt },
+		{ SMC_NLA_STATS_T_TX_CNT, smc_tech->tx_cnt },
+		{ SMC_NLA_STATS_T_SENDPAGE_CNT, 0 },
+		{ SMC_NLA_STATS_T_CORK_CNT, smc_tech->cork_cnt },
+		{ SMC_NLA_STATS_T_NDLY_CNT, smc_tech->ndly_cnt },
+		{ SMC_NLA_STATS_T_SPLICE_CNT, smc_tech->splice_cnt },
+		{ SMC_NLA_STATS_T_URG_DATA_CNT, smc_tech->urg_data_cnt },
+	};
+	struct nlattr *nest;
+	unsigned int i;
+
+	nest = nla_nest_start(skb, tech == SMC_TYPE_D ?
+				   SMC_NLA_STATS_SMCD_TECH :
+				   SMC_NLA_STATS_SMCR_TECH);
+	if (!nest)
+		return -EMSGSIZE;
+
+	/* nested groups first; evaluation stops at the first failure */
+	if (smc_nl_fill_stats_rmb_data(skb, stats, tech,
+				       SMC_NLA_STATS_T_TX_RMB_STATS) ||
+	    smc_nl_fill_stats_rmb_data(skb, stats, tech,
+				       SMC_NLA_STATS_T_RX_RMB_STATS) ||
+	    smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+					   SMC_NLA_STATS_T_TXPLOAD_SIZE) ||
+	    smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+					   SMC_NLA_STATS_T_RXPLOAD_SIZE) ||
+	    smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+					   SMC_NLA_STATS_T_TX_RMB_SIZE) ||
+	    smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+					   SMC_NLA_STATS_T_RX_RMB_SIZE))
+		goto cancel;
+
+	for (i = 0; i < ARRAY_SIZE(counters); i++) {
+		if (nla_put_u64_64bit(skb, counters[i].attr, counters[i].val,
+				      SMC_NLA_STATS_PAD))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* Netlink dump callback for SMC_NETLINK_GET_STATS.
+ * Sums the per-CPU statistics of the requesting socket's network namespace
+ * into a temporary struct smc_stats and emits them in a single message.
+ * cb_ctx->pos[0] marks the dump as done, so a follow-up invocation adds
+ * nothing.
+ * Returns skb->len in all cases; on any failure the partially built message
+ * is cancelled first.
+ */
+int smc_nl_get_stats(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct smc_stats *stats;
+ struct nlattr *attrs;
+ int cpu, i, size;
+ void *nlh;
+ u64 *src;
+ u64 *sum;
+
+ /* statistics were already dumped in a previous invocation */
+ if (cb_ctx->pos[0])
+ goto errmsg;
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_STATS);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_STATS);
+ if (!attrs)
+ goto errnest;
+ stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+ if (!stats)
+ goto erralloc;
+ /* struct smc_stats is treated as a flat array of u64 counters and
+ * summed element-wise over all possible CPUs
+ */
+ size = sizeof(*stats) / sizeof(u64);
+ for_each_possible_cpu(cpu) {
+ src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu);
+ sum = (u64 *)stats;
+ for (i = 0; i < size; i++)
+ *(sum++) += *(src++);
+ }
+ if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D))
+ goto errattr;
+ if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT,
+ stats->clnt_hshake_err_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT,
+ stats->srv_hshake_err_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ /* remember that the dump is complete */
+ cb_ctx->pos[0] = 1;
+ kfree(stats);
+ return skb->len;
+
+errattr:
+ kfree(stats);
+erralloc:
+ nla_nest_cancel(skb, attrs);
+errnest:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return skb->len;
+}
+
+/* Emit one fallback reason entry as a SMC_NETLINK_GET_FBACK_STATS message.
+ * @skb    netlink message buffer
+ * @cb     dump callback state; cb_ctx->pos[2] records whether the overall
+ *         server/client fallback counters were already reported
+ * @pos    index of the entry in the server or client reason array
+ * @is_srv true: use the server array, false: use the client array
+ * Returns 0 on success, -ENODATA if the entry at @pos has no fallback code,
+ * -EMSGSIZE if the message does not fit (the partial message is cancelled).
+ */
+static int smc_nl_get_fback_details(struct sk_buff *skb,
+ struct netlink_callback *cb, int pos,
+ bool is_srv)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int cnt_reported = cb_ctx->pos[2];
+ struct smc_stats_fback *trgt_arr;
+ struct nlattr *attrs;
+ int rc = 0;
+ void *nlh;
+
+ if (is_srv)
+ trgt_arr = &net->smc.fback_rsn->srv[0];
+ else
+ trgt_arr = &net->smc.fback_rsn->clnt[0];
+ /* a fallback code of 0 marks an unused entry */
+ if (!trgt_arr[pos].fback_code)
+ return -ENODATA;
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_FBACK_STATS);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv))
+ goto errattr;
+ /* the overall counters are included only once per dump */
+ if (!cnt_reported) {
+ if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT,
+ net->smc.fback_rsn->srv_fback_cnt,
+ SMC_NLA_FBACK_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT,
+ net->smc.fback_rsn->clnt_fback_cnt,
+ SMC_NLA_FBACK_STATS_PAD))
+ goto errattr;
+ cnt_reported = 1;
+ }
+
+ if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE,
+ trgt_arr[pos].fback_code))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT,
+ trgt_arr[pos].count))
+ goto errattr;
+
+ /* only committed once the whole message was built successfully */
+ cb_ctx->pos[2] = cnt_reported;
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return rc;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+/* Netlink dump callback for the fallback reason statistics.
+ * Walks the server and client reason arrays index by index under
+ * mutex_fback_rsn and emits one message per used entry.
+ * Resume state kept in the dump context:
+ * pos[0] - array index to continue with
+ * pos[1] - set if the server entry of that index was already emitted and
+ *          only the client entry is still outstanding
+ * Returns skb->len.
+ */
+int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int rc_srv = 0, rc_clnt = 0, k;
+ int skip_serv = cb_ctx->pos[1];
+ int snum = cb_ctx->pos[0];
+ bool is_srv = true;
+
+ mutex_lock(&net->smc.mutex_fback_rsn);
+ for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) {
+ /* skip the indices handled by previous invocations */
+ if (k < snum)
+ continue;
+ if (!skip_serv) {
+ rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv);
+ /* message buffer full: retry this index on the next call */
+ if (rc_srv && rc_srv != -ENODATA)
+ break;
+ } else {
+ skip_serv = 0;
+ }
+ rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv);
+ if (rc_clnt && rc_clnt != -ENODATA) {
+ /* server entry of this index is done, client entry is not */
+ skip_serv = 1;
+ break;
+ }
+ /* neither array has an entry at this index: end of data */
+ if (rc_clnt == -ENODATA && rc_srv == -ENODATA)
+ break;
+ }
+ mutex_unlock(&net->smc.mutex_fback_rsn);
+ cb_ctx->pos[1] = skip_serv;
+ cb_ctx->pos[0] = k;
+ return skb->len;
+}
diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h
new file mode 100644
index 000000000..9d32058db
--- /dev/null
+++ b/net/smc/smc_stats.h
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Macros for SMC statistics
+ *
+ * Copyright IBM Corp. 2021
+ *
+ * Author(s): Guvenc Gulce
+ */
+
+#ifndef NET_SMC_SMC_STATS_H_
+#define NET_SMC_SMC_STATS_H_
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/smc.h>
+
+#include "smc_clc.h"
+
+#define SMC_MAX_FBACK_RSN_CNT 30
+
+/* Bucket indices into struct smc_stats_memsize.buf[].
+ * The SMC_STAT_*_SUB macros pick the bucket as the power-of-two size class
+ * starting at 8K and clamp everything above 1024K to SMC_BUF_G_1024K.
+ */
+enum {
+ SMC_BUF_8K,
+ SMC_BUF_16K,
+ SMC_BUF_32K,
+ SMC_BUF_64K,
+ SMC_BUF_128K,
+ SMC_BUF_256K,
+ SMC_BUF_512K,
+ SMC_BUF_1024K,
+ SMC_BUF_G_1024K,
+ SMC_BUF_MAX, /* number of buckets, not a bucket itself */
+};
+
+/* One fallback reason with its occurrence count; a fback_code of 0 is
+ * treated as "entry unused" by smc_nl_get_fback_details().
+ */
+struct smc_stats_fback {
+ int fback_code; /* fallback reason code, reported as u32 via netlink */
+ u16 count; /* number of occurrences, reported as u16 via netlink */
+};
+
+/* Fallback reason statistics of one network namespace (net->smc.fback_rsn),
+ * with separate reason arrays and overall counters for server and client.
+ */
+struct smc_stats_rsn {
+ struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT];
+ struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT];
+ u64 srv_fback_cnt; /* reported as SMC_NLA_FBACK_STATS_SRV_CNT */
+ u64 clnt_fback_cnt; /* reported as SMC_NLA_FBACK_STATS_CLNT_CNT */
+};
+
+/* RMB related event counters for one direction (tx or rx); incremented via
+ * the SMC_STAT_RMB() based macros. All members have to stay u64 because
+ * smc_nl_get_stats() sums struct smc_stats as a flat array of u64.
+ */
+struct smc_stats_rmbcnt {
+ u64 buf_size_small_peer_cnt;
+ u64 buf_size_small_cnt;
+ u64 buf_full_peer_cnt;
+ u64 buf_full_cnt;
+ u64 reuse_cnt;
+ u64 alloc_cnt;
+ u64 dgrade_cnt;
+};
+
+/* Size histogram: one counter per SMC_BUF_* bucket. */
+struct smc_stats_memsize {
+ u64 buf[SMC_BUF_MAX];
+};
+
+/* Statistics of one technology; struct smc_stats holds one instance each,
+ * indexed by SMC_TYPE_R / SMC_TYPE_D. All members have to stay u64 based
+ * because smc_nl_get_stats() sums struct smc_stats as a flat array of u64.
+ */
+struct smc_stats_tech {
+ struct smc_stats_memsize tx_rmbsize; /* histogram of tx buffer sizes */
+ struct smc_stats_memsize rx_rmbsize; /* histogram of rx buffer sizes */
+ struct smc_stats_memsize tx_pd; /* histogram of tx payload sizes */
+ struct smc_stats_memsize rx_pd; /* histogram of rx payload sizes */
+ struct smc_stats_rmbcnt rmb_tx; /* tx buffer event counters */
+ struct smc_stats_rmbcnt rmb_rx; /* rx buffer event counters */
+ u64 clnt_v1_succ_cnt;
+ u64 clnt_v2_succ_cnt;
+ u64 srv_v1_succ_cnt;
+ u64 srv_v2_succ_cnt;
+ u64 urg_data_cnt;
+ u64 splice_cnt;
+ u64 cork_cnt;
+ u64 ndly_cnt;
+ u64 rx_bytes;
+ u64 tx_bytes;
+ u64 rx_cnt;
+ u64 tx_cnt;
+};
+
+/* Per-CPU statistics of one network namespace (net->smc.smc_stats).
+ * smc_nl_get_stats() adds the per-CPU copies up by treating this structure
+ * as a flat array of u64, so it must consist of u64 counters only.
+ */
+struct smc_stats {
+ struct smc_stats_tech smc[2]; /* indexed by SMC_TYPE_R / SMC_TYPE_D */
+ u64 clnt_hshake_err_cnt;
+ u64 srv_hshake_err_cnt;
+};
+
+/* Account one tx or rx request of technology @_tech.
+ * @_smc_stats per-CPU statistics pointer
+ * @_tech      index into smc[] (SMC_TYPE_D or SMC_TYPE_R)
+ * @key        token "tx" or "rx"; pasted into <key>_cnt, <key>_pd and
+ *             <key>_bytes
+ * @_len       requested length, selects the size bucket
+ * @_rc        result of the request; added to <key>_bytes
+ * <key>_cnt is incremented unconditionally. Only if both @_rc and @_len are
+ * positive, the bucket for @_len (size classes of 8K << n, clamped to the
+ * last bucket) is incremented and @_rc is added to the byte counter.
+ */
+#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \
+do { \
+ typeof(_smc_stats) stats = (_smc_stats); \
+ typeof(_tech) t = (_tech); \
+ typeof(_len) l = (_len); \
+ int _pos; \
+ typeof(_rc) r = (_rc); \
+ int m = SMC_BUF_MAX - 1; \
+ this_cpu_inc((*stats).smc[t].key ## _cnt); \
+ if (r <= 0 || l <= 0) \
+ break; \
+ _pos = fls64((l - 1) >> 13); \
+ _pos = (_pos <= m) ? _pos : m; \
+ this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \
+ this_cpu_add((*stats).smc[t].key ## _bytes, r); \
+} \
+while (0)
+
+/* Account a send request of @length bytes with result @rcode for socket
+ * @_smc. A connection without link (conn.lnk == NULL) is counted as SMC-D,
+ * otherwise as SMC-R.
+ */
+#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ struct net *_net = sock_net(&__smc->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(length) _len = (length); \
+ typeof(rcode) _rc = (rcode); \
+ bool is_smcd = !__smc->conn.lnk; \
+ if (is_smcd) \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \
+ else \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \
+} \
+while (0)
+
+/* Account a receive request of @length bytes with result @rcode for socket
+ * @_smc. A connection without link (conn.lnk == NULL) is counted as SMC-D,
+ * otherwise as SMC-R.
+ */
+#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ struct net *_net = sock_net(&__smc->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(length) _len = (length); \
+ typeof(rcode) _rc = (rcode); \
+ bool is_smcd = !__smc->conn.lnk; \
+ if (is_smcd) \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \
+ else \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \
+} \
+while (0)
+
+/* Increment the buffer size histogram <k>_rmbsize of technology @_tech.
+ * @k is the token "tx" or "rx". The bucket is chosen from @_len in size
+ * classes of 8K << n and clamped to the last bucket; a @_len <= 0 is
+ * ignored.
+ */
+#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \
+do { \
+ typeof(_len) _l = (_len); \
+ typeof(_tech) t = (_tech); \
+ int _pos; \
+ int m = SMC_BUF_MAX - 1; \
+ if (_l <= 0) \
+ break; \
+ _pos = fls((_l - 1) >> 13); \
+ _pos = (_pos <= m) ? _pos : m; \
+ this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \
+} \
+while (0)
+
+/* Increment counter <type>_cnt in rmb_<key> (key: token "tx" or "rx") of
+ * technology @t.
+ */
+#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \
+ this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt)
+
+/* Account a buffer of size @_len for socket @_smc in the size histogram
+ * selected by @_is_smcd (SMC-D vs. SMC-R) and @_is_rx (rx vs. tx). The token
+ * pasting in SMC_STAT_RMB_SIZE_SUB() needs literal "rx"/"tx" arguments,
+ * hence the four explicit cases.
+ */
+#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \
+do { \
+ struct net *_net = sock_net(&(_smc)->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(_is_smcd) is_d = (_is_smcd); \
+ typeof(_is_rx) is_r = (_is_rx); \
+ typeof(_len) l = (_len); \
+ if ((is_d) && (is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \
+ if ((is_d) && !(is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \
+ if (!(is_d) && (is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \
+ if (!(is_d) && !(is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \
+} \
+while (0)
+
+/* Increment the RMB event counter <type>_cnt for socket @_smc in the
+ * counter set selected by @_is_smcd (SMC-D vs. SMC-R) and @_is_rx (rx vs.
+ * tx). The token pasting in SMC_STAT_RMB_SUB() needs literal "rx"/"tx"
+ * arguments, hence the four explicit cases.
+ */
+#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \
+do { \
+ struct net *net = sock_net(&(_smc)->sk); \
+ struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \
+ typeof(_is_smcd) is_d = (_is_smcd); \
+ typeof(_is_rx) is_r = (_is_rx); \
+ if ((is_d) && (is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \
+ if ((is_d) && !(is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \
+ if (!(is_d) && (is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \
+ if (!(is_d) && !(is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \
+} \
+while (0)
+
+/* Convenience wrappers around SMC_STAT_RMB(), one per counter of
+ * struct smc_stats_rmbcnt. The *_TX_* and *_RX_* variants fix the direction,
+ * the others take it as @is_rx.
+ */
+
+/* buffer was reused: reuse_cnt */
+#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, reuse, is_smcd, is_rx)
+
+/* buffer was allocated: alloc_cnt */
+#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, alloc, is_smcd, is_rx)
+
+/* buffer was downgraded: dgrade_cnt */
+#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx)
+
+/* tx: buf_full_peer_cnt */
+#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false)
+
+/* tx: buf_full_cnt */
+#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full, is_smcd, false)
+
+/* tx: buf_size_small_peer_cnt */
+#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false)
+
+/* tx: buf_size_small_cnt */
+#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small, is_smcd, false)
+
+/* rx: buf_size_small_cnt */
+#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small, is_smcd, true)
+
+/* rx: buf_full_cnt */
+#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full, is_smcd, true)
+
+/* Increment member @type of struct smc_stats_tech for socket @_smc.
+ * A connection without link (conn.lnk == NULL) is counted as SMC-D,
+ * otherwise as SMC-R.
+ */
+#define SMC_STAT_INC(_smc, type) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ bool is_smcd = !(__smc)->conn.lnk; \
+ struct net *net = sock_net(&(__smc)->sk); \
+ struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \
+ if ((is_smcd)) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \
+ else \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \
+} \
+while (0)
+
+/* Count a successful client side connection setup in namespace @net.
+ * Version (v1/v2) and technology (SMC-D/SMC-R) are taken from the header of
+ * the CLC message @_aclc (hdr.version and hdr.typev1).
+ */
+#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \
+do { \
+ typeof(_aclc) acl = (_aclc); \
+ bool is_v2 = (acl->hdr.version == SMC_V2); \
+ bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \
+ struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \
+ if (is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \
+ else if (is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \
+ else if (!is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \
+ else if (!is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \
+} \
+while (0)
+
+/* Count a successful server side connection setup in namespace @net.
+ * Technology (SMC-D/SMC-R) is taken from @_ini->is_smcd; the version is
+ * v2 if SMC_V2 is set in the smcd_version or smcr_version field belonging to
+ * that technology, v1 otherwise.
+ * The statistics pointer is declared with its explicit type and @net is
+ * parenthesized, as in SMC_STAT_CLNT_SUCC_INC(), so that any expression can
+ * be passed as @net.
+ */
+#define SMC_STAT_SERV_SUCC_INC(net, _ini) \
+do { \
+	typeof(_ini) i = (_ini); \
+	bool is_smcd = (i->is_smcd); \
+	u8 version = is_smcd ? i->smcd_version : i->smcr_version; \
+	bool is_v2 = (version & SMC_V2); \
+	struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \
+	if (is_v2 && is_smcd) \
+		this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \
+	else if (is_v2 && !is_smcd) \
+		this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \
+	else if (!is_v2 && is_smcd) \
+		this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \
+	else if (!is_v2 && !is_smcd) \
+		this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \
+} \
+while (0)
+
+int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_stats_init(struct net *net);
+void smc_stats_exit(struct net *net);
+
+#endif /* NET_SMC_SMC_STATS_H_ */
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
new file mode 100644
index 000000000..5cbc18c6e
--- /dev/null
+++ b/net/smc/smc_sysctl.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_llc.h"
+#include "smc_sysctl.h"
+
+static int min_sndbuf = SMC_BUF_MIN_SIZE; /* lower bound accepted for "wmem" */
+static int min_rcvbuf = SMC_BUF_MIN_SIZE; /* lower bound accepted for "rmem" */
+static int max_sndbuf = INT_MAX / 2; /* upper bound accepted for "wmem" */
+static int max_rcvbuf = INT_MAX / 2; /* upper bound accepted for "rmem" */
+static const int net_smc_wmem_init = (64 * 1024); /* initial sysctl_wmem */
+static const int net_smc_rmem_init = (64 * 1024); /* initial sysctl_rmem */
+
+static struct ctl_table smc_table[] = { /* template whose .data pointers refer
+ * to init_net; smc_sysctl_net_init() duplicates it and rebases .data for
+ * every other network namespace
+ */
+ {
+ .procname = "autocorking_size",
+ .data = &init_net.smc.sysctl_autocorking_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec, /* unsigned value, no range check */
+ },
+ {
+ .procname = "smcr_buf_type",
+ .data = &init_net.smc.sysctl_smcr_buf_type,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO, /* accepted range is 0..2 */
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "smcr_testlink_time",
+ .data = &init_net.smc.sysctl_smcr_testlink_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "wmem",
+ .data = &init_net.smc.sysctl_wmem,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf, /* SMC_BUF_MIN_SIZE .. INT_MAX / 2 */
+ .extra2 = &max_sndbuf,
+ },
+ {
+ .procname = "rmem",
+ .data = &init_net.smc.sysctl_rmem,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf, /* SMC_BUF_MIN_SIZE .. INT_MAX / 2 */
+ .extra2 = &max_rcvbuf,
+ },
+ { } /* empty last entry; the .data rebasing loop stops before it */
+};
+
+/* Register the "net/smc" sysctl table for @net and set the sysctl defaults.
+ *
+ * init_net uses smc_table directly. Every other namespace gets a kmemdup()ed
+ * copy whose .data pointers are shifted by the distance between @net and
+ * init_net, so each entry addresses the same member inside @net.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated or the
+ * registration fails (the copy is freed again in the latter case).
+ */
+int __net_init smc_sysctl_net_init(struct net *net)
+{
+	const bool is_init_net = net_eq(net, &init_net);
+	struct ctl_table *tbl = smc_table;
+
+	if (!is_init_net) {
+		ptrdiff_t shift = (void *)net - (void *)&init_net;
+		size_t idx;
+
+		tbl = kmemdup(smc_table, sizeof(smc_table), GFP_KERNEL);
+		if (!tbl)
+			return -ENOMEM;
+
+		/* the trailing empty entry has no .data to rebase */
+		for (idx = 0; idx + 1 < ARRAY_SIZE(smc_table); idx++)
+			tbl[idx].data += shift;
+	}
+
+	net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", tbl,
+						  ARRAY_SIZE(smc_table));
+	if (!net->smc.smc_hdr) {
+		if (!is_init_net)
+			kfree(tbl);
+		return -ENOMEM;
+	}
+
+	net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+	net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
+	net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
+	WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
+	WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
+
+	return 0;
+}
+
+/* Unregister the sysctl table of @net. The table pointer is fetched from the
+ * header before unregistering; for namespaces other than init_net it is the
+ * private copy made by smc_sysctl_net_init() and is freed here.
+ */
+void __net_exit smc_sysctl_net_exit(struct net *net)
+{
+	struct ctl_table *tbl = net->smc.smc_hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(net->smc.smc_hdr);
+	if (net_eq(net, &init_net))
+		return;
+	kfree(tbl);
+}
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h
new file mode 100644
index 000000000..0becc11bd
--- /dev/null
+++ b/net/smc/smc_sysctl.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.h: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_SYSCTL_H
+#define _SMC_SYSCTL_H
+
+#ifdef CONFIG_SYSCTL
+
+int __net_init smc_sysctl_net_init(struct net *net);
+void __net_exit smc_sysctl_net_exit(struct net *net);
+
+#else
+
+static inline int smc_sysctl_net_init(struct net *net)
+{ /* !CONFIG_SYSCTL stub: no table is registered, one default is set and
+ * 0 is always returned.
+ * NOTE(review): the CONFIG_SYSCTL variant in smc_sysctl.c also initializes
+ * sysctl_smcr_buf_type, sysctl_smcr_testlink_time, sysctl_wmem and
+ * sysctl_rmem; this stub leaves them untouched - confirm that is intended.
+ */
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+ return 0;
+}
+
+static inline void smc_sysctl_net_exit(struct net *net) { } /* !CONFIG_SYSCTL stub: nothing to undo */
+
+#endif /* CONFIG_SYSCTL */
+
+#endif /* _SMC_SYSCTL_H */
diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c
new file mode 100644
index 000000000..8d47ced5a
--- /dev/null
+++ b/net/smc/smc_tracepoint.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define CREATE_TRACE_POINTS
+#include "smc_tracepoint.h"
+
+EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback);
+EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg);
+EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg);
+EXPORT_TRACEPOINT_SYMBOL(smcr_link_down);
diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h
new file mode 100644
index 000000000..9fc5e586d
--- /dev/null
+++ b/net/smc/smc_tracepoint.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM smc
+
+#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SMC_H
+
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/tracepoint.h>
+#include <net/ipv6.h>
+#include "smc.h"
+#include "smc_core.h"
+
+TRACE_EVENT(smc_switch_to_fallback, /* records both socks, the namespace
+ * cookie and the fallback reason code
+ */
+
+ TP_PROTO(const struct smc_sock *smc, int fallback_rsn),
+
+ TP_ARGS(smc, fallback_rsn),
+
+ TP_STRUCT__entry(
+ __field(const void *, sk) /* address of the SMC sock */
+ __field(const void *, clcsk) /* address of the sock behind smc->clcsock */
+ __field(u64, net_cookie) /* net_cookie of the SMC sock's namespace */
+ __field(int, fallback_rsn)
+ ),
+
+ TP_fast_assign(
+ const struct sock *sk = &smc->sk;
+ const struct sock *clcsk = smc->clcsock->sk; /* smc->clcsock is dereferenced unconditionally */
+
+ __entry->sk = sk;
+ __entry->clcsk = clcsk;
+ __entry->net_cookie = sock_net(sk)->net_cookie;
+ __entry->fallback_rsn = fallback_rsn;
+ ),
+
+ TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d",
+ __entry->sk, __entry->clcsk,
+ __entry->net_cookie, __entry->fallback_rsn)
+);
+
+/* Event class shared by the smc_tx_sendmsg and smc_rx_recvmsg tracepoints:
+ * records the smc_sock address, the net_cookie of its namespace, the
+ * message length and the IB device name of the connection's link.
+ *
+ * conn.lnk is NULL for connections that have no SMC-R link (the statistics
+ * and tx code test !conn->lnk to detect SMC-D), and the tx path fires this
+ * event for every connection. The link is therefore only dereferenced when
+ * it is set; otherwise an empty device name is recorded.
+ */
+DECLARE_EVENT_CLASS(smc_msg_event,
+
+	TP_PROTO(const struct smc_sock *smc, size_t len),
+
+	TP_ARGS(smc, len),
+
+	TP_STRUCT__entry(
+		__field(const void *, smc)
+		__field(u64, net_cookie)
+		__field(size_t, len)
+		__string(name, smc->conn.lnk ?
+			       (const char *)smc->conn.lnk->ibname : "")
+	),
+
+	TP_fast_assign(
+		const struct sock *sk = &smc->sk;
+
+		__entry->smc = smc;
+		__entry->net_cookie = sock_net(sk)->net_cookie;
+		__entry->len = len;
+		__assign_str(name, smc->conn.lnk ?
+				   (const char *)smc->conn.lnk->ibname : "");
+	),
+
+	TP_printk("smc=%p net=%llu len=%zu dev=%s",
+		  __entry->smc, __entry->net_cookie,
+		  __entry->len, __get_str(name))
+);
+
+DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, /* tx instance of smc_msg_event */
+
+ TP_PROTO(const struct smc_sock *smc, size_t len),
+
+ TP_ARGS(smc, len)
+);
+
+DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, /* rx instance of smc_msg_event */
+
+ TP_PROTO(const struct smc_sock *smc, size_t len),
+
+ TP_ARGS(smc, len)
+);
+
+TRACE_EVENT(smcr_link_down, /* records a link, its link group, the link state
+ * and a caller-supplied code location
+ */
+
+ TP_PROTO(const struct smc_link *lnk, void *location),
+
+ TP_ARGS(lnk, location),
+
+ TP_STRUCT__entry(
+ __field(const void *, lnk) /* address of the link */
+ __field(const void *, lgr) /* address of lnk->lgr */
+ __field(u64, net_cookie) /* net_cookie of lgr->net */
+ __field(int, state) /* lnk->state when the event fires */
+ __string(name, lnk->ibname)
+ __field(void *, location) /* printed with %pS, i.e. as a symbol */
+ ),
+
+ TP_fast_assign(
+ const struct smc_link_group *lgr = lnk->lgr;
+
+ __entry->lnk = lnk;
+ __entry->lgr = lgr;
+ __entry->net_cookie = lgr->net->net_cookie;
+ __entry->state = lnk->state;
+ __assign_str(name, lnk->ibname);
+ __entry->location = location;
+ ),
+
+ TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS",
+ __entry->lnk, __entry->lgr, __entry->net_cookie,
+ __entry->state, __get_str(name),
+ __entry->location)
+);
+
+#endif /* _TRACE_SMC_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE smc_tracepoint
+
+#include <trace/define_trace.h>
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000..3b0ff3b58
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+#include "smc_ism.h"
+#include "smc_tx.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
+
+#define SMC_TX_WORK_DELAY 0
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
+ * called under sk_socket lock.
+ *
+ * Does nothing unless sndbuf_space is non-zero and the sock still has a
+ * struct socket attached. Otherwise SOCK_NOSPACE is cleared (after
+ * SMC_STAT_RMB_TX_FULL() if the bit was set), sleepers on the socket wait
+ * queue are woken with the EPOLLOUT family of events and, unless
+ * SEND_SHUTDOWN is set, fasync listeners are notified.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct smc_sock *smc = smc_sk(sk);
+ struct socket_wq *wq;
+
+ /* similar to sk_stream_write_space */
+ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ if (test_bit(SOCK_NOSPACE, &sock->flags))
+ SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk);
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait,
+ EPOLLOUT | EPOLLWRNORM |
+ EPOLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/* Wake sndbuf producers blocked in smc_tx_wait() by invoking the sock's
+ * sk_write_space() callback, but only if the sock has a struct socket whose
+ * SOCK_NOSPACE flag is set.
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+	if (!smc->sk.sk_socket)
+		return;
+	if (!test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+		return;
+
+	smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ *
+ * @flags: sendmsg flags; MSG_DONTWAIT is passed on to sock_sndtimeo().
+ *
+ * Returns 0 once sndbuf_space is non-zero and no urgent data is pending,
+ * -EPIPE if the sock has an error, SEND_SHUTDOWN is set, the connection was
+ * killed or peer_done_writing is set in local_tx_ctrl,
+ * -ECONNRESET if smc_cdc_rxed_any_close() reports a close,
+ * -EAGAIN if the timeout is (or has become) zero, and
+ * sock_intr_errno() if a signal is pending.
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ long timeo;
+ int rc = 0;
+
+ /* similar to sk_stream_wait_memory */
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->killed ||
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+ rc = -EPIPE;
+ break;
+ }
+ if (smc_cdc_rxed_any_close(conn)) {
+ rc = -ECONNRESET;
+ break;
+ }
+ if (!timeo) {
+ /* ensure EPOLLOUT is subsequently generated */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ rc = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+ break; /* at least 1 byte of free & no urgent data */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk_wait_event(sk, &timeo,
+ READ_ONCE(sk->sk_err) ||
+ (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
+ smc_cdc_rxed_any_close(conn) ||
+ (atomic_read(&conn->sndbuf_space) &&
+ !conn->urg_tx_pend),
+ &wait);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+/* Report whether TCP_NAGLE_CORK is set in the nonagle field of the TCP sock
+ * behind smc->clcsock.
+ */
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+	struct tcp_sock *clc_tp = tcp_sk(smc->clcsock->sk);
+
+	return !!(clc_tp->nonagle & TCP_NAGLE_CORK);
+}
+
+/* Autocorking: while CDC messages are still pending, hold back the send.
+ * The CQE of a pending CDC message will arrive shortly, which gives later
+ * sendmsg() payload the chance to be coalesced into one RDMA Write, without
+ * the need for a timer and with no latency trade-off.
+ * Rules:
+ * 1. The first message never corks.
+ * 2. With Tx CDC messages pending, wait for the first CDC message's
+ *    completion.
+ * 3. Don't cork too much data into a single RDMA Write, to prevent burst
+ *    traffic: the corked total must not exceed sendbuf/2, nor the
+ *    autocorking_size sysctl.
+ */
+static bool smc_should_autocork(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	int limit = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
+			  sock_net(&smc->sk)->smc.sysctl_autocorking_size);
+
+	/* cork only with CDC messages in flight and the cap not yet exceeded */
+	return atomic_read(&conn->cdc_pend_tx_wr) != 0 &&
+	       smc_tx_prepared_sends(conn) <= limit;
+}
+
+/* Decide whether the data just copied into the sndbuf should stay corked
+ * instead of being pushed right away.
+ */
+static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
+{
+	struct smc_connection *conn = &smc->conn;
+
+	if (smc_should_autocork(smc))
+		return true;
+
+	/* neither MSG_MORE nor a corked socket: nothing asks for deferral */
+	if (!(msg->msg_flags & MSG_MORE) && !smc_tx_is_corked(smc))
+		return false;
+
+	/* for a corked socket defer the RDMA writes as long as sndbuf_space
+	 * is still available. The application should know how/when to
+	 * uncork it.
+	 */
+	return atomic_read(&conn->sndbuf_space) != 0;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ *
+ * Copies up to @len bytes from @msg into the send ring buffer, in at most
+ * two chunks per loop iteration when the free space wraps around the end of
+ * the buffer, advances tx_curs_prep and then pushes the data unless
+ * smc_tx_should_cork() asks to defer.
+ *
+ * Returns the number of bytes copied, which may be less than @len if the
+ * buffer fills up or the copy fails after some data was taken. With nothing
+ * copied a negative errno is returned: -ENOTCONN in state SMC_INIT, -EPIPE
+ * or -ECONNRESET from the checks inside the loop, or the result of
+ * sk_stream_error() for failures routed through out_err.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+ size_t copylen, send_done = 0, send_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor prep;
+ struct sock *sk = &smc->sk;
+ char *sndbuf_base;
+ int tx_cnt_prep;
+ int writespace;
+ int rc, chunk;
+
+ /* This should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ rc = -EPIPE;
+ goto out_err;
+ }
+
+ if (sk->sk_state == SMC_INIT)
+ return -ENOTCONN;
+
+ if (len > conn->sndbuf_desc->len)
+ SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk);
+
+ if (len > conn->peer_rmbe_size)
+ SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk);
+
+ if (msg->msg_flags & MSG_OOB)
+ SMC_STAT_INC(smc, urg_data_cnt);
+
+ while (msg_data_left(msg)) {
+ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+ (smc->sk.sk_err == ECONNABORTED) ||
+ conn->killed)
+ return -EPIPE;
+ if (smc_cdc_rxed_any_close(conn))
+ return send_done ?: -ECONNRESET;
+
+ if (msg->msg_flags & MSG_OOB)
+ conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+ if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+ if (send_done)
+ return send_done;
+ rc = smc_tx_wait(smc, msg->msg_flags);
+ if (rc)
+ goto out_err;
+ continue;
+ }
+
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_tx_wait above */
+ writespace = atomic_read(&conn->sndbuf_space);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, send_remaining, writespace);
+ /* determine start of sndbuf */
+ sndbuf_base = conn->sndbuf_desc->cpu_addr;
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
+ tx_cnt_prep = prep.count;
+ /* determine chunks where to write into sndbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
+ tx_cnt_prep);
+ chunk_len_sum = chunk_len;
+ chunk_off = tx_cnt_prep;
+ for (chunk = 0; chunk < 2; chunk++) {
+ rc = memcpy_from_msg(sndbuf_base + chunk_off,
+ msg, chunk_len);
+ if (rc) {
+ smc_sndbuf_sync_sg_for_device(conn);
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ send_done += chunk_len;
+ send_remaining -= chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in send ring buffer */
+ }
+ smc_sndbuf_sync_sg_for_device(conn);
+ /* update cursors */
+ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
+ smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
+ /* increased in send tasklet smc_cdc_tx_handler() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ /* since we just produced more new data into sndbuf,
+ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+ */
+ if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+ conn->urg_tx_pend = true;
+ /* If we need to cork, do nothing and wait for the next
+ * sendmsg() call or push on tx completion
+ */
+ if (!smc_tx_should_cork(smc, msg))
+ smc_tx_sndbuf_nonempty(conn);
+
+ trace_smc_tx_sendmsg(smc, copylen);
+ } /* while (msg_data_left(msg)) */
+
+ return send_done;
+
+out_err:
+ rc = sk_stream_error(sk, msg->msg_flags, rc);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(rc == -EAGAIN))
+ sk->sk_write_space(sk);
+ return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: transfer one target chunk with an ISM write.
+ * @offset is relative to the connection's tx_off. If smc_ism_write() fails,
+ * peer_conn_abort is set in local_tx_ctrl. Returns the smc_ism_write()
+ * result.
+ */
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+		      u32 offset, int signal)
+{
+	int err = smc_ism_write(conn->lgr->smcd, conn->peer_token,
+				conn->peer_rmbe_idx, signal,
+				conn->tx_off + offset, data, len);
+
+	if (!err)
+		return 0;
+
+	conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	return err;
+}
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write
+ *
+ * Completes @rdma_wr (new wr_id, @num_sges, remote address and rkey taken
+ * from the rtoken of the connection's link) and posts it on the link's QP.
+ * The remote address is the rtoken's dma_addr plus the connection's tx_off
+ * plus @peer_rmbe_offset. If posting fails, smcr_link_down_cond_sched() is
+ * called for the link. Returns the ib_post_send() result.
+ */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+ int num_sges, struct ib_rdma_wr *rdma_wr)
+{
+ struct smc_link_group *lgr = conn->lgr;
+ struct smc_link *link = conn->lnk;
+ int rc;
+
+ rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+ rdma_wr->wr.num_sge = num_sges;
+ rdma_wr->remote_addr =
+ lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
+ /* RMBE within RMB */
+ conn->tx_off +
+ /* offset within RMBE */
+ peer_rmbe_offset;
+ rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
+ rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
+ if (rc)
+ smcr_link_down_cond_sched(link);
+ return rc;
+}
+
+/* sndbuf consumer
+ *
+ * Account for @len bytes that were just written to the peer: advance @prod
+ * within peer_rmbe_size, subtract @len from peer_rmbe_space and advance
+ * @sent within the local sndbuf length. Only the caller's cursor copies are
+ * modified here.
+ */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+ union smc_host_cursor *prod,
+ union smc_host_cursor *sent,
+ size_t len)
+{
+ smc_curs_add(conn->peer_rmbe_size, prod, len);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ /* data in flight reduces usable snd_wnd */
+ atomic_sub(len, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_add(conn->sndbuf_desc->len, sent, len);
+}
+
+/* SMC-R helper for smc_tx_rdma_writes()
+ *
+ * @len:     total number of bytes to transfer
+ * @src_off: start offset in the sndbuf; @src_len: length of the 1st source
+ *           chunk
+ * @dst_off: start offset in the peer RMBE; @dst_len: length of the 1st
+ *           destination chunk
+ *
+ * Posts one RDMA write per destination chunk (at most two, the second one
+ * starting at RMBE offset 0), each built from at most two SGEs when the
+ * source wraps around the end of the sndbuf. Chunks shorter than the QP's
+ * max_inline_data are sent with IB_SEND_INLINE and addressed through the
+ * buffer's virtual address. Returns 0, or the first smc_tx_rdma_write()
+ * error.
+ *
+ * NOTE(review): the source length of the 2nd destination chunk is limited
+ * by sndbuf len - sent_count, i.e. by the initial @src_off, whereas
+ * smcd_tx_rdma_writes() uses the advanced src_off - confirm this is
+ * intended.
+ */
+static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len,
+ struct smc_rdma_wr *wr_rdma_buf)
+{
+ struct smc_link *link = conn->lnk;
+
+ dma_addr_t dma_addr =
+ sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
+ u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int sent_count = src_off;
+ int srcchunk, dstchunk;
+ int num_sges;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
+ struct ib_sge *sge = wr->wr.sg_list;
+ u64 base_addr = dma_addr;
+
+ if (dst_len < link->qp_attr.cap.max_inline_data) {
+ base_addr = virt_addr;
+ wr->wr.send_flags |= IB_SEND_INLINE;
+ } else {
+ wr->wr.send_flags &= ~IB_SEND_INLINE;
+ }
+
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sge[srcchunk].addr = conn->sndbuf_desc->is_vm ?
+ (virt_addr + src_off) : (base_addr + src_off);
+ sge[srcchunk].length = src_len;
+ if (conn->sndbuf_desc->is_vm)
+ sge[srcchunk].lkey =
+ conn->sndbuf_desc->mr[link->link_idx]->lkey;
+ num_sges++;
+
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
+ sent_count);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* SMC-D helper for smc_tx_rdma_writes()
+ *
+ * Same chunking as the SMC-R helper: at most two destination chunks, each
+ * fed from at most two source chunks, but every source chunk is written
+ * directly with smcd_tx_ism_write(). The destination offset passed to it is
+ * shifted by sizeof(struct smcd_cdc_msg). Returns 0, or the first
+ * smcd_tx_ism_write() error.
+ */
+static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int srcchunk, dstchunk;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ void *data = conn->sndbuf_desc->cpu_addr + src_off;
+
+ rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
+ sizeof(struct smcd_cdc_msg), 0);
+ if (rc)
+ return rc;
+ dst_off += src_len;
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ *
+ * Transfers min(bytes prepared but not yet sent, peer_rmbe_space) bytes via
+ * the SMC-D or SMC-R helper and, on success, advances the producer cursor in
+ * local_tx_ctrl and tx_curs_sent. Sets write_blocked when the data to send
+ * is not smaller than the peer's free space, and urg_data_present when
+ * urgent data is pending and everything could be sent.
+ *
+ * @wr_rdma_buf is only used for SMC-R; the SMC-D caller passes NULL.
+ * Returns 0 (also when there is nothing to send or the peer has no space),
+ * or the error of the helper.
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn,
+ struct smc_rdma_wr *wr_rdma_buf)
+{
+ size_t len, src_len, dst_off, dst_len; /* current chunk values */
+ union smc_host_cursor sent, prep, prod, cons;
+ struct smc_cdc_producer_flags *pflags;
+ int to_send, rmbespace;
+ int rc;
+
+ /* source: sndbuf */
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
+ /* cf. wmem_alloc - (snd_max - snd_una) */
+ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
+ if (to_send <= 0)
+ return 0;
+
+ /* destination: RMBE */
+ /* cf. snd_wnd */
+ rmbespace = atomic_read(&conn->peer_rmbe_space);
+ if (rmbespace <= 0) {
+ struct smc_sock *smc = container_of(conn, struct smc_sock,
+ conn);
+ SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+ return 0;
+ }
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+
+ /* if usable snd_wnd closes ask peer to advertise once it opens again */
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ pflags->write_blocked = (to_send >= rmbespace);
+ /* cf. usable snd_wnd */
+ len = min(to_send, rmbespace);
+
+ /* initialize variables for first iteration of subsequent nested loop */
+ dst_off = prod.count;
+ if (prod.wrap == cons.wrap) {
+ /* the filled destination area is unwrapped,
+ * hence the available free destination space is wrapped
+ * and we need 2 destination chunks of sum len; start with 1st
+ * which is limited by what's available in sndbuf
+ */
+ dst_len = min_t(size_t,
+ conn->peer_rmbe_size - prod.count, len);
+ } else {
+ /* the filled destination area is wrapped,
+ * hence the available free destination space is unwrapped
+ * and we need a single destination chunk of entire len
+ */
+ dst_len = len;
+ }
+ /* dst_len determines the maximum src_len */
+ if (sent.count + dst_len <= conn->sndbuf_desc->len) {
+ /* unwrapped src case: single chunk of entire dst_len */
+ src_len = dst_len;
+ } else {
+ /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+ src_len = conn->sndbuf_desc->len - sent.count;
+ }
+
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ else
+ rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len, wr_rdma_buf);
+ if (rc)
+ return rc;
+
+ if (conn->urg_tx_pend && len == to_send)
+ pflags->urg_data_present = 1;
+ smc_tx_advance_cursors(conn, &prod, &sent, len);
+ /* update connection's cursors with advanced local cursors */
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
+ /* dst: peer RMBE */
+ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
+
+ return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ *
+ * SMC-R variant: holds the link, obtains a free CDC slot, then under
+ * send_lock performs the RDMA writes (skipped while urg_data_present is
+ * set) and sends the CDC message.
+ *
+ * Returns 0 on success and also when no slot was free (-EBUSY), in which
+ * case tx_work is (re)scheduled; -ENOLINK if there is no usable link or the
+ * connection's link changed meanwhile; sock_error() if the sock error is
+ * ECONNABORTED; -EPIPE if the connection was killed; otherwise the error of
+ * the failing step.
+ */
+static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+ struct smc_link *link = conn->lnk;
+ struct smc_rdma_wr *wr_rdma_buf;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!link || !smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
+ if (rc < 0) {
+ smc_wr_tx_link_put(link);
+ if (rc == -EBUSY) {
+ struct smc_sock *smc =
+ container_of(conn, struct smc_sock, conn);
+
+ if (smc->sk.sk_err == ECONNABORTED)
+ return sock_error(&smc->sk);
+ if (conn->killed)
+ return -EPIPE;
+ rc = 0;
+ mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
+ }
+ return rc;
+ }
+
+ spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, tx_work will restart */
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ rc = -ENOLINK;
+ goto out_unlock;
+ }
+ if (!pflags->urg_data_present) {
+ rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
+ if (rc) {
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
+ }
+
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
+
+out_unlock:
+ spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* SMC-D variant of the sndbuf push: under send_lock, write the data (skipped
+ * while urg_data_present is set), then send the CDC message. After a
+ * successful send with urg_data_present set, both urgent flags are cleared.
+ * Returns 0 or the error of the first failing step.
+ */
+static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+	int rc = 0;
+
+	spin_lock_bh(&conn->send_lock);
+	if (!pflags->urg_data_present) {
+		rc = smc_tx_rdma_writes(conn, NULL);
+		if (rc)
+			goto unlock;
+	}
+
+	rc = smcd_cdc_msg_send(conn);
+	if (rc)
+		goto unlock;
+
+	if (pflags->urg_data_present) {
+		pflags->urg_data_pending = 0;
+		pflags->urg_data_present = 0;
+	}
+
+unlock:
+	spin_unlock_bh(&conn->send_lock);
+	return rc;
+}
+
+/* One push attempt: dispatch to the SMC-D or SMC-R sender unless there is
+ * nothing to send, the peer has no RMBE space, or the connection is being
+ * aborted. Returns 0 when nothing was done or the push succeeded, -EPIPE
+ * for a killed/aborted connection, otherwise the sender's error.
+ */
+static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	int rc;
+
+	/* No data in the send queue */
+	if (unlikely(smc_tx_prepared_sends(conn) <= 0))
+		return 0;
+
+	/* Peer doesn't have RMBE space */
+	if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
+		SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+		return 0;
+	}
+
+	/* connection being aborted */
+	if (conn->killed ||
+	    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+		return -EPIPE;
+
+	rc = conn->lgr->is_smcd ? smcd_tx_sndbuf_nonempty(conn) :
+				  smcr_tx_sndbuf_nonempty(conn);
+	if (rc)
+		return rc;
+
+	/* trigger socket release if connection is closing */
+	smc_close_wake_tx_prepared(smc);
+	return 0;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ /* This makes sure only one can send simultaneously to prevent wasting
+ * of CPU and CDC slot.
+ * Record whether someone has tried to push while we are pushing.
+ * A caller that finds a push already in progress returns 0 here and
+ * leaves the work to the current pusher.
+ */
+ if (atomic_inc_return(&conn->tx_pushing) > 1)
+ return 0;
+
+again:
+ atomic_set(&conn->tx_pushing, 1);
+ smp_wmb(); /* Make sure tx_pushing is 1 before real send */
+ rc = __smc_tx_sndbuf_nonempty(conn);
+
+ /* We need to check whether someone else has added some data into
+ * the send queue and tried to push but failed after the atomic_set()
+ * when we are pushing.
+ * If so, we need to push again to prevent that data from hanging in
+ * the send queue.
+ */
+ if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
+ goto again;
+
+ return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit. The caller
+ * must hold sock lock.
+ *
+ * Does nothing if the sock has an error. After a successful push the
+ * write_blocked flag in local_rx_ctrl is cleared once bytes_to_rcv is zero.
+ */
+void smc_tx_pending(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ int rc;
+
+ if (smc->sk.sk_err)
+ return;
+
+ rc = smc_tx_sndbuf_nonempty(conn);
+ if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit in locked
+ * sock.
+ *
+ * Work function of the connection's tx_work delayed work: it takes the sock
+ * lock itself around smc_tx_pending().
+ */
+void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_pending(conn);
+ release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
+{ /* Send a CDC message via smc_cdc_get_slot_and_msg_send() if the peer
+ * requested a consumer cursor update, if @force is set, or if more than
+ * rmbe_update_limit bytes are unconfirmed while sender_free is at most
+ * half of the RMB or write_blocked is set. Nothing is sent for a killed
+ * or aborted connection. If sending fails and the connection is not
+ * killed, tx_work is queued instead.
+ */
+ union smc_host_cursor cfed, cons, prod;
+ int sender_free = conn->rmb_desc->len;
+ int to_confirm;
+
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
+ to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); /* consumed but not yet confirmed */
+ if (to_confirm > conn->rmbe_update_limit) {
+ smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
+ sender_free = conn->rmb_desc->len -
+ smc_curs_diff_large(conn->rmb_desc->len,
+ &cfed, &prod);
+ }
+
+ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ force ||
+ ((to_confirm > conn->rmbe_update_limit) &&
+ ((sender_free <= (conn->rmb_desc->len / 2)) ||
+ conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ if (conn->killed ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return;
+ if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
+ !conn->killed) {
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
+ return;
+ }
+ }
+ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0; /* everything received: drop the blocked mark */
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init!
+ * Installs smc_tx_write_space() as the sock's sk_write_space callback.
+ */
+void smc_tx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space = smc_tx_write_space;
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000..a59f370b8
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+/* Number of bytes prepared in the send buffer but not yet sent: the distance
+ * from tx_curs_sent to tx_curs_prep within a ring of sndbuf_desc->len bytes.
+ */
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+	union smc_host_cursor curs_prep, curs_sent;
+
+	smc_curs_copy(&curs_sent, &conn->tx_curs_sent, conn);
+	smc_curs_copy(&curs_prep, &conn->tx_curs_prep, conn);
+
+	return smc_curs_diff(conn->sndbuf_desc->len, &curs_sent, &curs_prep);
+}
+
+void smc_tx_pending(struct smc_connection *conn);
+void smc_tx_work(struct work_struct *work);
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000..0021065a6
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,939 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend { /* control data for a pending send request */
+ u64 wr_id; /* work request id sent */
+ smc_wr_tx_handler handler; /* completion callback, may be NULL */
+ enum ib_wc_status wc_status; /* CQE status */
+ struct smc_link *link; /* link the slot was handed out for */
+ u32 idx; /* slot index; link->wr_tx_cnt denotes the v2 slot */
+ struct smc_wr_tx_pend_priv priv; /* context passed to handler */
+ u8 compl_requested; /* set by smc_wr_tx_send_wait(): complete wr_tx_compl[idx] */
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+/* Tell whether any slot bit is still set in the tx bitmap of @link, i.e.
+ * whether at least one tx work request slot is currently in use.
+ */
+static inline bool smc_wr_is_tx_pend(struct smc_link *link)
+{
+	if (bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt))
+		return false;
+	return true;
+}
+
+/* wait till all pending tx work requests on the given link are completed
+ * Sleeps (uninterruptibly, no timeout) on link->wr_tx_wait until the tx slot
+ * bitmap is empty; the wait queue is woken by smc_wr_tx_process_cqe() and
+ * smc_wr_tx_put_slot().
+ */
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
+{
+ wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
+}
+
+/* Linear search of the pending-send table of @link for @wr_id.
+ * Returns the matching slot index, or link->wr_tx_cnt if there is no match.
+ */
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+	u32 slot = 0;
+
+	while (slot < link->wr_tx_cnt &&
+	       link->wr_tx_pends[slot].wr_id != wr_id)
+		slot++;
+	return slot;
+}
+
+/* Process one send completion queue entry.
+ * Memory registration completions only update link->wr_reg_state and wake the
+ * registration waiter. For all other completions the matching pending entry
+ * (regular slot or the single v2 slot) is copied to the stack, the slot is
+ * cleared and released, and finally the stored handler (if any) is called
+ * with the copy. A non-zero completion status schedules link termination.
+ */
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+ struct smc_wr_tx_pend pnd_snd;
+ struct smc_link *link;
+ u32 pnd_snd_idx;
+
+ link = wc->qp->qp_context;
+
+ if (wc->opcode == IB_WC_REG_MR) {
+ if (wc->status)
+ link->wr_reg_state = FAILED;
+ else
+ link->wr_reg_state = CONFIRMED;
+ smc_wr_wakeup_reg_wait(link);
+ return;
+ }
+
+ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+ if (pnd_snd_idx == link->wr_tx_cnt) {
+ /* not a regular slot: only accept it if it is the v2 slot */
+ if (link->lgr->smc_version != SMC_V2 ||
+ link->wr_tx_v2_pend->wr_id != wc->wr_id)
+ return;
+ link->wr_tx_v2_pend->wc_status = wc->status;
+ /* keep a private copy, the slot itself is wiped right below */
+ memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(link->wr_tx_v2_pend, 0,
+ sizeof(*link->wr_tx_v2_pend));
+ memset(link->lgr->wr_tx_buf_v2, 0,
+ sizeof(*link->lgr->wr_tx_buf_v2));
+ } else {
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ /* wake a sender blocked in smc_wr_tx_send_wait() */
+ if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+ complete(&link->wr_tx_compl[pnd_snd_idx]);
+ /* keep a private copy, the slot itself is wiped right below */
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
+ sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_pends[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ /* slot bit was not set: nothing to hand to the handler */
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ }
+
+ if (wc->status) {
+ if (link->lgr->smc_version == SMC_V2) {
+ memset(link->wr_tx_v2_pend, 0,
+ sizeof(*link->wr_tx_v2_pend));
+ memset(link->lgr->wr_tx_buf_v2, 0,
+ sizeof(*link->lgr->wr_tx_buf_v2));
+ }
+ /* terminate link */
+ smcr_link_down_cond_sched(link);
+ }
+ if (pnd_snd.handler)
+ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+ /* a slot became free: wake senders waiting on wr_tx_wait */
+ wake_up(&link->wr_tx_wait);
+}
+
+/* Bottom half for send completions: drain the send CQ in two passes.
+ * During the first pass the CQ notification is re-armed after every poll;
+ * the second pass polls once more without re-arming.
+ */
+static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
+{
+	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int pass, cnt, k;
+
+	for (pass = 1; pass <= 2; pass++) {
+		for (;;) {
+			memset(&wc, 0, sizeof(wc));
+			cnt = ib_poll_cq(dev->roce_cq_send,
+					 SMC_WR_MAX_POLL_CQE, wc);
+			if (pass == 1)
+				ib_req_notify_cq(dev->roce_cq_send,
+						 IB_CQ_NEXT_COMP |
+						 IB_CQ_REPORT_MISSED_EVENTS);
+			if (cnt <= 0)
+				break;
+			for (k = 0; k < cnt; k++)
+				smc_wr_tx_process_cqe(&wc[k]);
+		}
+	}
+}
+
+/* Send CQ event callback: defer the actual work to the device's send tasklet.
+ * @cq_context is the struct smc_ib_device the CQ belongs to.
+ */
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *smcibdev = cq_context;
+
+	tasklet_schedule(&smcibdev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+/* Try to claim a free tx slot of @link without sleeping.
+ * On success returns 0 and stores the claimed slot index in *idx.
+ * Returns -ENOLINK if the link is not sendable and -EBUSY if every slot is
+ * taken; in both cases *idx is left at link->wr_tx_cnt (an invalid index).
+ */
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+ *idx = link->wr_tx_cnt;
+ if (!smc_link_sendable(link))
+ return -ENOLINK;
+ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+ /* the bit may have been taken meanwhile, so claim it atomically */
+ if (!test_and_set_bit(*idx, link->wr_tx_mask))
+ return 0;
+ }
+ *idx = link->wr_tx_cnt;
+ return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ * and sets info for pending transmit tracking
+ * @link: Pointer to smc_link used to later send the message.
+ * @handler: Send completion handler function pointer.
+ * @wr_buf: Out value returns pointer to message buffer.
+ * @wr_rdma_buf: Out value returns pointer to rdma work request.
+ * May be NULL if the caller does not need it.
+ * @wr_pend_priv: Out value returns pointer serving as handler context.
+ *
+ * In softirq context or while the link group is terminating the slot search
+ * does not sleep; otherwise the caller may sleep up to
+ * SMC_WR_TX_WAIT_FREE_SLOT_TIME for a slot to become free.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_wr_tx_pend *wr_pend;
+ u32 idx = link->wr_tx_cnt;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+ int rc;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ if (in_softirq() || lgr->terminating) {
+ /* must not sleep here: single non-blocking attempt */
+ rc = smc_wr_tx_get_free_slot_index(link, &idx);
+ if (rc)
+ return rc;
+ } else {
+ /* the slot is claimed as a side effect of the wait condition */
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ !smc_link_sendable(link) ||
+ lgr->terminating ||
+ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+ if (!rc) {
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
+ return -EPIPE;
+ }
+ /* woken up without having obtained a slot */
+ if (idx == link->wr_tx_cnt)
+ return -EPIPE;
+ }
+ /* record tracking data so the completion can be matched by wr_id */
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = &link->wr_tx_pends[idx];
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = idx;
+ wr_ib = &link->wr_tx_ibs[idx];
+ wr_ib->wr_id = wr_id;
+ *wr_buf = &link->wr_tx_bufs[idx];
+ if (wr_rdma_buf)
+ *wr_rdma_buf = &link->wr_tx_rdmas[idx];
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
+/* Hand out the single large (v2) send slot of @link.
+ * The slot counts as busy while its stored index equals link->wr_tx_cnt, in
+ * which case -EBUSY is returned and the out parameters are left untouched.
+ * Otherwise the tracking data is filled in, @wr_buf receives the link group's
+ * v2 send buffer, @wr_pend_priv the handler context, and 0 is returned.
+ */
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+			  smc_wr_tx_handler handler,
+			  struct smc_wr_v2_buf **wr_buf,
+			  struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+	struct smc_wr_tx_pend *v2_pend;
+	u64 id;
+
+	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
+		return -EBUSY;
+
+	*wr_buf = NULL;
+	*wr_pend_priv = NULL;
+
+	id = smc_wr_tx_get_next_wr_id(link);
+	v2_pend = link->wr_tx_v2_pend;
+	v2_pend->wr_id = id;
+	v2_pend->handler = handler;
+	v2_pend->link = link;
+	v2_pend->idx = link->wr_tx_cnt;
+	link->wr_tx_v2_ib->wr_id = id;
+
+	*wr_buf = link->lgr->wr_tx_buf_v2;
+	*wr_pend_priv = &v2_pend->priv;
+	return 0;
+}
+
+/* Give back a send slot that was obtained but will not be (or could not be)
+ * sent.
+ * @link: link the slot belongs to
+ * @wr_pend_priv: handler context identifying the slot
+ *
+ * Regular slots are wiped, their bit in the tx bitmap is cleared and waiters
+ * on wr_tx_wait are woken. For the large v2 slot the pending entry and the
+ * link group's v2 send buffer are wiped.
+ *
+ * Return: 1 if a slot was released, 0 otherwise.
+ */
+int smc_wr_tx_put_slot(struct smc_link *link,
+		       struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+	struct smc_wr_tx_pend *pend;
+
+	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+	if (pend->idx < link->wr_tx_cnt) {
+		u32 idx = pend->idx;
+
+		/* clear the full struct smc_wr_tx_pend including .priv */
+		memset(&link->wr_tx_pends[idx], 0,
+		       sizeof(link->wr_tx_pends[idx]));
+		memset(&link->wr_tx_bufs[idx], 0,
+		       sizeof(link->wr_tx_bufs[idx]));
+		test_and_clear_bit(idx, link->wr_tx_mask);
+		wake_up(&link->wr_tx_wait);
+		return 1;
+	} else if (link->lgr->smc_version == SMC_V2 &&
+		   pend->idx == link->wr_tx_cnt) {
+		/* Large v2 buffer: wr_tx_v2_pend and wr_tx_buf_v2 are
+		 * pointers, so wipe the objects they point to and not the
+		 * pointer variables themselves.
+		 */
+		memset(link->wr_tx_v2_pend, 0,
+		       sizeof(*link->wr_tx_v2_pend));
+		memset(link->lgr->wr_tx_buf_v2, 0,
+		       sizeof(*link->lgr->wr_tx_buf_v2));
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Post a previously prepared regular send slot with ib_post_send().
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ * The send CQ notification is armed first. If posting fails, the slot is
+ * given back and link termination is scheduled; the ib_post_send() result is
+ * returned either way.
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+	struct smc_wr_tx_pend *pend;
+	int rc;
+
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	pend = container_of(priv, struct smc_wr_tx_pend, priv);
+	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
+	if (!rc)
+		return 0;
+
+	smc_wr_tx_put_slot(link, priv);
+	smcr_link_down_cond_sched(link);
+	return rc;
+}
+
+/* Post the prepared large (v2) send slot with ib_post_send().
+ * @len is stored as length of the first scatter/gather element before the
+ * request is posted. On failure the slot is given back and link termination
+ * is scheduled; the ib_post_send() result is returned either way.
+ */
+int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+		      int len)
+{
+	int rc;
+
+	link->wr_tx_v2_ib->sg_list[0].length = len;
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
+	if (!rc)
+		return 0;
+
+	smc_wr_tx_put_slot(link, priv);
+	smcr_link_down_cond_sched(link);
+	return rc;
+}
+
+/* Post a prepared send slot and block until its send completion arrives.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ * @timeout: maximum time to wait for the completion
+ * Returns the smc_wr_tx_send() error if posting failed, 0 if the completion
+ * was signalled in time, and -ENODATA on timeout or interruption.
+ */
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+			unsigned long timeout)
+{
+	struct smc_wr_tx_pend *pend;
+	u32 slot;
+	int rc;
+
+	pend = container_of(priv, struct smc_wr_tx_pend, priv);
+	pend->compl_requested = 1;
+	slot = pend->idx;
+	init_completion(&link->wr_tx_compl[slot]);
+
+	rc = smc_wr_tx_send(link, priv);
+	if (rc)
+		return rc;
+
+	/* completion is signalled by smc_wr_tx_process_cqe() */
+	rc = wait_for_completion_interruptible_timeout(
+			&link->wr_tx_compl[slot], timeout);
+	return rc > 0 ? 0 : -ENODATA;
+}
+
+/* Register a memory region and wait for result.
+ * Posts the link's memory registration work request for @mr and sleeps until
+ * smc_wr_tx_process_cqe() moved link->wr_reg_state away from POSTED or
+ * SMC_WR_REG_MR_WAIT_TIME elapsed.
+ * Returns 0 if the registration was confirmed, -EIO if it failed, -EPIPE on
+ * timeout (link termination is scheduled) or if the state is still POSTED,
+ * -EINTR if the wait was interrupted, or the ib_post_send() error.
+ */
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
+{
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ /* state must be POSTED before the request can complete */
+ link->wr_reg_state = POSTED;
+ link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
+ link->wr_reg.mr = mr;
+ link->wr_reg.key = mr->rkey;
+ rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
+ if (rc)
+ return rc;
+
+ /* hold a wr_reg_refs reference for the duration of the wait */
+ percpu_ref_get(&link->wr_reg_refs);
+ rc = wait_event_interruptible_timeout(link->wr_reg_wait,
+ (link->wr_reg_state != POSTED),
+ SMC_WR_REG_MR_WAIT_TIME);
+ percpu_ref_put(&link->wr_reg_refs);
+ if (!rc) {
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
+ return -EPIPE;
+ }
+ if (rc == -ERESTARTSYS)
+ return -EINTR;
+ /* translate the final registration state into an errno */
+ switch (link->wr_reg_state) {
+ case CONFIRMED:
+ rc = 0;
+ break;
+ case FAILED:
+ rc = -EIO;
+ break;
+ case POSTED:
+ rc = -EPIPE;
+ break;
+ }
+ return rc;
+}
+
+/****************************** receive queue ********************************/
+
+/* Add @handler to the receive handler hash table, keyed by its message type.
+ * Returns -EEXIST if a handler for the same type is already registered,
+ * 0 otherwise. The table is protected by smc_wr_rx_hash_lock.
+ */
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+	struct smc_wr_rx_handler *cur;
+	int rc = 0;
+
+	spin_lock(&smc_wr_rx_hash_lock);
+	hash_for_each_possible(smc_wr_rx_hash, cur, list, handler->type) {
+		if (cur->type == handler->type) {
+			rc = -EEXIST;
+			break;
+		}
+	}
+	if (!rc)
+		hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+	spin_unlock(&smc_wr_rx_hash_lock);
+	return rc;
+}
+
+/* Dispatch a received message to every registered handler of its type.
+ * The receive buffer is located via wr_id modulo the number of rx slots.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+	struct smc_wr_rx_handler *cur;
+	struct smc_wr_rx_hdr *hdr;
+	u64 quotient;
+	u32 slot;
+
+	if (wc->byte_len < sizeof(*hdr))
+		return; /* short message */
+
+	quotient = wc->wr_id;
+	slot = do_div(quotient, link->wr_rx_cnt);
+	hdr = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[slot];
+	hash_for_each_possible(smc_wr_rx_hash, cur, list, hdr->type) {
+		if (cur->type != hdr->type)
+			continue;
+		cur->handler(wc, hdr);
+	}
+}
+
+/* Handle @num receive completions from @wc.
+ * Successful completions are demultiplexed and their receive WR is reposted.
+ * Retry-exceeded and flush errors schedule link termination and wake the CQ
+ * drain waiter once the last posted receive WR has completed; any other
+ * error status just reposts the receive WR.
+ */
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+	int k;
+
+	for (k = 0; k < num; k++) {
+		struct ib_wc *cqe = &wc[k];
+		struct smc_link *link = cqe->qp->qp_context;
+
+		link->wr_rx_id_compl = cqe->wr_id;
+		switch (cqe->status) {
+		case IB_WC_SUCCESS:
+			link->wr_rx_tstamp = jiffies;
+			smc_wr_rx_demultiplex(cqe);
+			smc_wr_rx_post(link); /* refill WR RX */
+			break;
+		case IB_WC_RETRY_EXC_ERR:
+		case IB_WC_RNR_RETRY_EXC_ERR:
+		case IB_WC_WR_FLUSH_ERR:
+			smcr_link_down_cond_sched(link);
+			if (link->wr_rx_id_compl == link->wr_rx_id)
+				wake_up(&link->wr_rx_empty_wait);
+			break;
+		default:
+			smc_wr_rx_post(link); /* refill WR RX */
+			break;
+		}
+	}
+}
+
+/* Bottom half for receive completions: drain the receive CQ in two passes.
+ * During the first pass the CQ notification is re-armed after every poll;
+ * the second pass polls once more without re-arming.
+ */
+static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
+{
+	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
+	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+	int pass, cnt;
+
+	for (pass = 1; pass <= 2; pass++) {
+		for (;;) {
+			memset(&wc, 0, sizeof(wc));
+			cnt = ib_poll_cq(dev->roce_cq_recv,
+					 SMC_WR_MAX_POLL_CQE, wc);
+			if (pass == 1)
+				ib_req_notify_cq(dev->roce_cq_recv,
+						 IB_CQ_SOLICITED_MASK |
+						 IB_CQ_REPORT_MISSED_EVENTS);
+			if (cnt <= 0)
+				break;
+			smc_wr_rx_process_cqes(&wc[0], cnt);
+		}
+	}
+}
+
+/* Receive CQ event callback: defer the actual work to the device's receive
+ * tasklet. @cq_context is the struct smc_ib_device the CQ belongs to.
+ */
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+	struct smc_ib_device *smcibdev = cq_context;
+
+	tasklet_schedule(&smcibdev->recv_tasklet);
+}
+
+/* Post the initial set of link->wr_rx_cnt receive work requests.
+ * All receive WRs are posted regardless of individual failures, but the
+ * first error is remembered and returned so that an early failure cannot be
+ * hidden by a later successful post.
+ *
+ * Return: 0 if every post succeeded, otherwise the first ib_post_recv() error.
+ */
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+	int first_err = 0;
+	u32 i;
+
+	for (i = 0; i < link->wr_rx_cnt; i++) {
+		int rc = smc_wr_rx_post(link);
+
+		if (rc && !first_err)
+			first_err = rc;
+	}
+	return first_err;
+}
+
+/***************************** init, exit, misc ******************************/
+
+/* Query the current queue pair attributes into lnk->qp_attr and derive the
+ * number of usable send and receive work request slots from the reported
+ * capabilities, capped at SMC_WR_BUF_CNT and SMC_WR_BUF_CNT * 3 respectively.
+ * The result of ib_query_qp() is not checked.
+ */
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+	const int query_mask = IB_QP_STATE |
+			       IB_QP_CUR_STATE |
+			       IB_QP_PKEY_INDEX |
+			       IB_QP_PORT |
+			       IB_QP_QKEY |
+			       IB_QP_AV |
+			       IB_QP_PATH_MTU |
+			       IB_QP_TIMEOUT |
+			       IB_QP_RETRY_CNT |
+			       IB_QP_RNR_RETRY |
+			       IB_QP_RQ_PSN |
+			       IB_QP_ALT_PATH |
+			       IB_QP_MIN_RNR_TIMER |
+			       IB_QP_SQ_PSN |
+			       IB_QP_PATH_MIG_STATE |
+			       IB_QP_CAP |
+			       IB_QP_DEST_QPN;
+	struct ib_qp_init_attr init_attr;
+	struct ib_qp_attr *attr = &lnk->qp_attr;
+
+	memset(attr, 0, sizeof(*attr));
+	memset(&init_attr, 0, sizeof(init_attr));
+	ib_query_qp(lnk->roce_qp, attr, query_mask, &init_attr);
+
+	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+			       attr->cap.max_send_wr);
+	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+			       attr->cap.max_recv_wr);
+}
+
+/* Pre-initialize the scatter/gather elements and work requests of @lnk:
+ * one send WR and two RDMA write WRs per tx slot, the single large v2 send
+ * WR (SMC_V2 only), one receive WR per rx slot, and the memory registration
+ * WR. Relies on the DMA addresses in @lnk having been set up already.
+ */
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+ int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+ /* send inline only if the QP can carry a whole control message inline */
+ bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
+ u32 i;
+
+ for (i = 0; i < lnk->wr_tx_cnt; i++) {
+ /* inline sends reference the buffer by its kernel address,
+ * all others by its DMA address
+ */
+ lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+ lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+ lnk->wr_tx_ibs[i].send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ if (send_inline)
+ lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
+ }
+
+ if (lnk->lgr->smc_version == SMC_V2) {
+ /* the single large v2 send WR */
+ lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
+ lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
+ lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
+
+ lnk->wr_tx_v2_ib->next = NULL;
+ lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
+ lnk->wr_tx_v2_ib->num_sge = 1;
+ lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
+ lnk->wr_tx_v2_ib->send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
+ * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
+ * and the same buffer for all sges. When a larger message arrived then
+ * the content of the first small sge is copied to the beginning of
+ * the larger spillover buffer, allowing easy data mapping.
+ */
+ for (i = 0; i < lnk->wr_rx_cnt; i++) {
+ /* index of the first sge belonging to receive WR i */
+ int x = i * sges_per_buf;
+
+ lnk->wr_rx_sges[x].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
+ if (lnk->lgr->smc_version == SMC_V2) {
+ lnk->wr_rx_sges[x + 1].addr =
+ lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x + 1].length =
+ SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x + 1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ }
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
+ lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
+ }
+ /* memory registration WR, completed via smc_wr_tx_process_cqe() */
+ lnk->wr_reg.wr.next = NULL;
+ lnk->wr_reg.wr.num_sge = 0;
+ lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
+ lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
+ lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
+}
+
+/* Quiesce the work request machinery of @lnk and release its DMA mappings.
+ * Waits until the receive CQ is drained and no send is pending, wakes all
+ * waiters, then kills both percpu references, waits for their release
+ * callbacks and finally frees their per-cpu resources with
+ * percpu_ref_exit(). Does nothing if the link has no IB device assigned.
+ */
+void smc_wr_free_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev;
+
+	if (!lnk->smcibdev)
+		return;
+	ibdev = lnk->smcibdev->ibdev;
+
+	smc_wr_drain_cq(lnk);
+	smc_wr_wakeup_reg_wait(lnk);
+	smc_wr_wakeup_tx_wait(lnk);
+
+	smc_wr_tx_wait_no_pending_sends(lnk);
+	percpu_ref_kill(&lnk->wr_reg_refs);
+	wait_for_completion(&lnk->reg_ref_comp);
+	/* no users left: release what percpu_ref_init() allocated */
+	percpu_ref_exit(&lnk->wr_reg_refs);
+	percpu_ref_kill(&lnk->wr_tx_refs);
+	wait_for_completion(&lnk->tx_ref_comp);
+	percpu_ref_exit(&lnk->wr_tx_refs);
+
+	if (lnk->wr_rx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_dma_addr = 0;
+	}
+	if (lnk->wr_rx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_v2_dma_addr = 0;
+	}
+	if (lnk->wr_tx_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_dma_addr = 0;
+	}
+	if (lnk->wr_tx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_v2_dma_addr = 0;
+	}
+}
+
+/* Free the large v2 receive and send buffers of @lgr and reset the pointers.
+ * Link groups with a version below SMC_V2 are left untouched.
+ */
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
+{
+	if (lgr->smc_version >= SMC_V2) {
+		kfree(lgr->wr_rx_buf_v2);
+		lgr->wr_rx_buf_v2 = NULL;
+		kfree(lgr->wr_tx_buf_v2);
+		lgr->wr_tx_buf_v2 = NULL;
+	}
+}
+
+/* Free all work request related memory of @lnk and reset every pointer to
+ * NULL, so that a repeated call is harmless.
+ */
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+	/* large v2 send slot */
+	kfree(lnk->wr_tx_v2_ib);
+	lnk->wr_tx_v2_ib = NULL;
+	kfree(lnk->wr_tx_v2_sge);
+	lnk->wr_tx_v2_sge = NULL;
+	kfree(lnk->wr_tx_v2_pend);
+	lnk->wr_tx_v2_pend = NULL;
+
+	/* send slot tracking */
+	kfree(lnk->wr_tx_compl);
+	lnk->wr_tx_compl = NULL;
+	kfree(lnk->wr_tx_pends);
+	lnk->wr_tx_pends = NULL;
+	bitmap_free(lnk->wr_tx_mask);
+	lnk->wr_tx_mask = NULL;
+
+	/* scatter/gather elements */
+	kfree(lnk->wr_tx_sges);
+	lnk->wr_tx_sges = NULL;
+	kfree(lnk->wr_tx_rdma_sges);
+	lnk->wr_tx_rdma_sges = NULL;
+	kfree(lnk->wr_rx_sges);
+	lnk->wr_rx_sges = NULL;
+
+	/* work requests */
+	kfree(lnk->wr_tx_rdmas);
+	lnk->wr_tx_rdmas = NULL;
+	kfree(lnk->wr_rx_ibs);
+	lnk->wr_rx_ibs = NULL;
+	kfree(lnk->wr_tx_ibs);
+	lnk->wr_tx_ibs = NULL;
+
+	/* message buffers */
+	kfree(lnk->wr_tx_bufs);
+	lnk->wr_tx_bufs = NULL;
+	kfree(lnk->wr_rx_bufs);
+	lnk->wr_rx_bufs = NULL;
+}
+
+/* Allocate the large v2 receive and send buffers of @lgr.
+ * Nothing is allocated for link groups with a version below SMC_V2.
+ * On failure no buffer stays allocated and no stale pointer is left behind,
+ * so a subsequent smc_wr_free_lgr_mem() is safe.
+ *
+ * Return: 0 on success, -ENOMEM if an allocation failed.
+ */
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
+{
+	if (lgr->smc_version < SMC_V2)
+		return 0;
+
+	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+	if (!lgr->wr_rx_buf_v2)
+		return -ENOMEM;
+	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+	if (!lgr->wr_tx_buf_v2) {
+		kfree(lgr->wr_rx_buf_v2);
+		/* avoid a dangling pointer / later double free */
+		lgr->wr_rx_buf_v2 = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Allocate all work request related memory of @link: message buffers, work
+ * requests, scatter/gather elements, the tx slot bitmap and the pending-send
+ * tracking arrays, plus the v2 send slot for SMC_V2 link groups.
+ * Receive side arrays hold SMC_WR_BUF_CNT * 3 entries, send side arrays
+ * SMC_WR_BUF_CNT entries.
+ * On failure everything allocated so far is released again.
+ *
+ * Return: 0 on success, -ENOMEM if an allocation failed.
+ */
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
+
+	/* allocate link related memory */
+	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+	if (!link->wr_tx_bufs)
+		goto no_mem;
+	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+				   GFP_KERNEL);
+	if (!link->wr_rx_bufs)
+		goto no_mem_wr_tx_bufs;
+	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_tx_ibs)
+		goto no_mem_wr_rx_bufs;
+	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+				  sizeof(link->wr_rx_ibs[0]),
+				  GFP_KERNEL);
+	if (!link->wr_rx_ibs)
+		goto no_mem_wr_tx_ibs;
+	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_rdmas[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_rdmas)
+		goto no_mem_wr_rx_ibs;
+	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
+					sizeof(link->wr_tx_rdma_sges[0]),
+					GFP_KERNEL);
+	if (!link->wr_tx_rdma_sges)
+		goto no_mem_wr_tx_rdmas;
+	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+				   GFP_KERNEL);
+	if (!link->wr_tx_sges)
+		goto no_mem_wr_tx_rdma_sges;
+	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
+				   GFP_KERNEL);
+	if (!link->wr_rx_sges)
+		goto no_mem_wr_tx_sges;
+	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
+	if (!link->wr_tx_mask)
+		goto no_mem_wr_rx_sges;
+	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_pends[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_pends)
+		goto no_mem_wr_tx_mask;
+	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_compl[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_compl)
+		goto no_mem_wr_tx_pends;
+
+	if (link->lgr->smc_version == SMC_V2) {
+		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
+					    GFP_KERNEL);
+		if (!link->wr_tx_v2_ib)
+			goto no_mem_tx_compl;
+		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
+					     GFP_KERNEL);
+		if (!link->wr_tx_v2_sge)
+			goto no_mem_v2_ib;
+		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
+					      GFP_KERNEL);
+		if (!link->wr_tx_v2_pend)
+			goto no_mem_v2_sge;
+	}
+	return 0;
+
+	/* unwind in reverse order of allocation */
+no_mem_v2_sge:
+	kfree(link->wr_tx_v2_sge);
+no_mem_v2_ib:
+	kfree(link->wr_tx_v2_ib);
+no_mem_tx_compl:
+	kfree(link->wr_tx_compl);
+no_mem_wr_tx_pends:
+	kfree(link->wr_tx_pends);
+no_mem_wr_tx_mask:
+	/* allocated with bitmap_zalloc(), so release with bitmap_free() */
+	bitmap_free(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+	kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+	kfree(link->wr_tx_sges);
+no_mem_wr_tx_rdma_sges:
+	kfree(link->wr_tx_rdma_sges);
+no_mem_wr_tx_rdmas:
+	kfree(link->wr_tx_rdmas);
+no_mem_wr_rx_ibs:
+	kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+	kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+	kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+	kfree(link->wr_tx_bufs);
+no_mem:
+	return -ENOMEM;
+}
+
+/* Stop the receive and send completion tasklets of @smcibdev. */
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_kill(&smcibdev->recv_tasklet);
+ tasklet_kill(&smcibdev->send_tasklet);
+}
+
+/* Set up the receive and send completion tasklets of @smcibdev with
+ * smc_wr_rx_tasklet_fn() and smc_wr_tx_tasklet_fn() respectively.
+ */
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
+ tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
+}
+
+/* Release callback of lnk->wr_tx_refs: signal tx_ref_comp of the link that
+ * embeds @ref so that smc_wr_free_link() can proceed.
+ */
+static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
+{
+	complete(&container_of(ref, struct smc_link, wr_tx_refs)->tx_ref_comp);
+}
+
+/* Release callback of lnk->wr_reg_refs: signal reg_ref_comp of the link that
+ * embeds @ref so that smc_wr_free_link() can proceed.
+ */
+static void smcr_wr_reg_refs_free(struct percpu_ref *ref)
+{
+	complete(&container_of(ref, struct smc_link, wr_reg_refs)->reg_ref_comp);
+}
+
+/* Set up the work request machinery of @lnk: reset the wr id counters, DMA
+ * map the receive and send buffers (plus the large v2 buffers for SMC_V2
+ * link groups), pre-initialize all work requests and initialize the wait
+ * queues, completions and percpu references.
+ * On failure everything set up so far is undone: DMA mappings are removed
+ * (and their addresses reset to 0) and an already initialized percpu
+ * reference is released again.
+ *
+ * Return: 0 on success, -EIO if a DMA mapping failed, or the
+ * percpu_ref_init() error.
+ */
+int smc_wr_create_link(struct smc_link *lnk)
+{
+	struct ib_device *ibdev = lnk->smcibdev->ibdev;
+	int rc = 0;
+
+	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+	lnk->wr_rx_id = 0;
+	lnk->wr_rx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+		DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+		lnk->wr_rx_dma_addr = 0;
+		rc = -EIO;
+		goto out;
+	}
+	if (lnk->lgr->smc_version == SMC_V2) {
+		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
+			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
+			DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
+			lnk->wr_rx_v2_dma_addr = 0;
+			rc = -EIO;
+			goto dma_unmap;
+		}
+		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
+			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
+			DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
+			lnk->wr_tx_v2_dma_addr = 0;
+			rc = -EIO;
+			goto dma_unmap;
+		}
+	}
+	lnk->wr_tx_dma_addr = ib_dma_map_single(
+		ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+		DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+		/* do not leave an invalid address behind for later unmaps */
+		lnk->wr_tx_dma_addr = 0;
+		rc = -EIO;
+		goto dma_unmap;
+	}
+	smc_wr_init_sge(lnk);
+	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
+	init_waitqueue_head(&lnk->wr_tx_wait);
+	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
+	if (rc)
+		goto unmap_tx;
+	init_completion(&lnk->tx_ref_comp);
+	init_waitqueue_head(&lnk->wr_reg_wait);
+	rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL);
+	if (rc)
+		goto exit_tx_refs;
+	init_completion(&lnk->reg_ref_comp);
+	init_waitqueue_head(&lnk->wr_rx_empty_wait);
+	return rc;
+
+exit_tx_refs:
+	/* wr_tx_refs was initialized but never used: release it again */
+	percpu_ref_exit(&lnk->wr_tx_refs);
+unmap_tx:
+	ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+			    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+			    DMA_TO_DEVICE);
+	lnk->wr_tx_dma_addr = 0;
+dma_unmap:
+	if (lnk->wr_rx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_v2_dma_addr = 0;
+	}
+	if (lnk->wr_tx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_v2_dma_addr = 0;
+	}
+	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+			    DMA_FROM_DEVICE);
+	lnk->wr_rx_dma_addr = 0;
+out:
+	return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000..f3008dda2
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+/* Opaque per-request context of SMC_WR_TX_PEND_PRIV_SIZE bytes that is kept
+ * with a pending send and handed to the smc_wr_tx_handler on completion.
+ */
+struct smc_wr_tx_pend_priv {
+ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+ struct smc_link *,
+ enum ib_wc_status);
+
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+ unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
+/* Receive handler registration, see smc_wr_rx_register_handler(). */
+struct smc_wr_rx_handler {
+ struct hlist_node list; /* hash table collision resolution */
+ void (*handler)(struct ib_wc *, void *); /* called with the CQE and the received message */
+ u8 type; /* message type this handler is registered for (hash key) */
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ *
+ * Atomically increments link->wr_tx_id and returns the new value.
+ * NOTE(review): smc_wr_tx_get_free_slot() and smc_wr_tx_get_v2_slot() in
+ * smc_wr.c also obtain their work request ids here, so the statement above
+ * looks outdated - confirm.
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+ return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+/* Atomically set the tx work request id counter @wr_tx_id to @val. */
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+ atomic_long_set(wr_tx_id, val);
+}
+
+/* Take a wr_tx_refs reference on @link, but only if the link is sendable.
+ * Returns true if the reference was taken; it must then be dropped with
+ * smc_wr_tx_link_put().
+ */
+static inline bool smc_wr_tx_link_hold(struct smc_link *link)
+{
+	if (smc_link_sendable(link)) {
+		percpu_ref_get(&link->wr_tx_refs);
+		return true;
+	}
+	return false;
+}
+
+/* Drop a wr_tx_refs reference on @link taken by smc_wr_tx_link_hold(). */
+static inline void smc_wr_tx_link_put(struct smc_link *link)
+{
+ percpu_ref_put(&link->wr_tx_refs);
+}
+
+/* Sleep on wr_rx_empty_wait until the id of the most recently completed
+ * receive work request equals the id of the most recently posted one.
+ */
+static inline void smc_wr_drain_cq(struct smc_link *lnk)
+{
+ wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
+}
+
+/* Wake up all waiters on the link's wr_tx_wait queue. */
+static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
+{
+ wake_up_all(&lnk->wr_tx_wait);
+}
+
+/* Wake up a waiter on the link's wr_reg_wait queue (memory registration). */
+static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
+{
+ wake_up(&lnk->wr_reg_wait);
+}
+
+/* Post the next receive work request of @link.
+ * The receive WR id counter is advanced and the slot to (re)use is the new
+ * id modulo the number of receive slots. Returns the ib_post_recv() result.
+ */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+	struct ib_recv_wr *recv_wr;
+	u64 new_id, quotient;
+	u32 slot;
+
+	new_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+	quotient = new_id;
+	slot = do_div(quotient, link->wr_rx_cnt);
+
+	recv_wr = &link->wr_rx_ibs[slot];
+	recv_wr->wr_id = new_id;
+	return ib_post_recv(link->roce_qp, recv_wr, NULL);
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wrs,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_v2_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_v2_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *priv, int len);
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ unsigned long timeout);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
+
+#endif /* SMC_WR_H */