summaryrefslogtreecommitdiffstats
path: root/net/smc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/smc/Kconfig21
-rw-r--r--net/smc/Makefile5
-rw-r--r--net/smc/af_smc.c2631
-rw-r--r--net/smc/smc.h305
-rw-r--r--net/smc/smc_cdc.c478
-rw-r--r--net/smc/smc_cdc.h305
-rw-r--r--net/smc/smc_clc.c784
-rw-r--r--net/smc/smc_clc.h333
-rw-r--r--net/smc/smc_close.c500
-rw-r--r--net/smc/smc_close.h30
-rw-r--r--net/smc/smc_core.c1975
-rw-r--r--net/smc/smc_core.h425
-rw-r--r--net/smc/smc_diag.c283
-rw-r--r--net/smc/smc_ib.c643
-rw-r--r--net/smc/smc_ib.h91
-rw-r--r--net/smc/smc_ism.c439
-rw-r--r--net/smc/smc_ism.h56
-rw-r--r--net/smc/smc_llc.c1974
-rw-r--r--net/smc/smc_llc.h109
-rw-r--r--net/smc/smc_netns.h21
-rw-r--r--net/smc/smc_pnet.c1174
-rw-r--r--net/smc/smc_pnet.h70
-rw-r--r--net/smc/smc_rx.c444
-rw-r--r--net/smc/smc_rx.h31
-rw-r--r--net/smc/smc_tx.c646
-rw-r--r--net/smc/smc_tx.h39
-rw-r--r--net/smc/smc_wr.c720
-rw-r--r--net/smc/smc_wr.h131
28 files changed, 14663 insertions, 0 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000..1ab3c5a2c
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config SMC
+ tristate "SMC socket protocol family"
+ depends on INET && INFINIBAND
+ help
+ SMC-R provides a "sockets over RDMA" solution making use of
+ RDMA over Converged Ethernet (RoCE) technology to upgrade
+ AF_INET TCP connections transparently.
+ The Linux implementation of the SMC-R solution is designed as
+ a separate socket family SMC.
+
+ Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+ tristate "SMC: socket monitoring interface"
+ depends on SMC
+ help
+ Support for SMC socket monitoring interface used by tools such as
+ smcss.
+
+ if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000..cb1254541
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_SMC) += smc.o
+obj-$(CONFIG_SMC_DIAG) += smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000..664ddf564
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,2631 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ * - support for alternate links postponed
+ *
+ * Copyright IBM Corp. 2016, 2018
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ * based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <linux/sched/signal.h>
+#include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/smc.h>
+#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "smc_netns.h"
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
+ * creation on server
+ */
+static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
+ * creation on client
+ */
+
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
+
+static void smc_tcp_listen_work(struct work_struct *);
+static void smc_connect_work(struct work_struct *);
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct smc_hashinfo smc_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+static struct smc_hashinfo smc_v6_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+ struct hlist_head *head;
+
+ head = &h->ht;
+
+ write_lock_bh(&h->lock);
+ sk_add_node(sk, head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+ write_lock_bh(&h->lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
+ .name = "SMC",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v4_hashinfo,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto);
+
+struct proto smc_proto6 = {
+ .name = "SMC6",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v6_hashinfo,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto6);
+
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+ if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
+ smc->clcsock->file->private_data = smc->sk.sk_socket;
+ smc->clcsock->file = NULL;
+ }
+}
+
+static int __smc_release(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+ int rc = 0;
+
+ if (!smc->use_fallback) {
+ rc = smc_close_active(smc);
+ smc_sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ } else {
+ if (sk->sk_state != SMC_CLOSED) {
+ if (sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_INIT)
+ sock_put(sk); /* passive closing */
+ if (sk->sk_state == SMC_LISTEN) {
+ /* wake up clcsock accept */
+ rc = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ }
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ }
+ smc_restore_fallback_changes(smc);
+ }
+
+ sk->sk_prot->unhash(sk);
+
+ if (sk->sk_state == SMC_CLOSED) {
+ if (smc->clcsock) {
+ release_sock(sk);
+ smc_clcsock_release(smc);
+ lock_sock(sk);
+ }
+ if (!smc->use_fallback)
+ smc_conn_free(&smc->conn);
+ }
+
+ return rc;
+}
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int old_state, rc = 0;
+
+ if (!sk)
+ goto out;
+
+ sock_hold(sk); /* sock_put below */
+ smc = smc_sk(sk);
+
+ old_state = sk->sk_state;
+
+ /* cleanup for a dangling non-blocking connect */
+ if (smc->connect_nonblock && old_state == SMC_INIT)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+
+ if (cancel_work_sync(&smc->connect_work))
+ sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
+
+ if (sk->sk_state == SMC_LISTEN)
+ /* smc_close_non_accepted() is called and acquires
+ * sock lock for child sockets again
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
+ !smc->use_fallback)
+ smc_close_active_abort(smc);
+
+ rc = __smc_release(smc);
+
+ /* detach socket */
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+
+ sock_put(sk); /* sock_hold above */
+ sock_put(sk); /* final sock_put */
+out:
+ return rc;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+ if (sk->sk_state != SMC_CLOSED)
+ return;
+ if (!sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
+ int protocol)
+{
+ struct smc_sock *smc;
+ struct proto *prot;
+ struct sock *sk;
+
+ prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
+ sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+ sk->sk_state = SMC_INIT;
+ sk->sk_destruct = smc_destruct;
+ sk->sk_protocol = protocol;
+ smc = smc_sk(sk);
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ INIT_WORK(&smc->connect_work, smc_connect_work);
+ INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
+ INIT_LIST_HEAD(&smc->accept_q);
+ spin_lock_init(&smc->accept_q_lock);
+ spin_lock_init(&smc->conn.send_lock);
+ sk->sk_prot->hash(sk);
+ sk_refcnt_debug_inc(sk);
+ mutex_init(&smc->clcsock_release_lock);
+
+ return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+
+ /* replicate tests from inet_bind(), to be safe wrt. future changes */
+ rc = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ rc = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_INET &&
+ addr->sin_family != AF_INET6 &&
+ addr->sin_family != AF_UNSPEC)
+ goto out;
+ /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+ if (addr->sin_family == AF_UNSPEC &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+
+ lock_sock(sk);
+
+ /* Check if socket is already active */
+ rc = -EINVAL;
+ if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
+ goto out_rel;
+
+ smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+ rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+ release_sock(sk);
+out:
+ return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ /* options we don't get control via setsockopt for */
+ nsk->sk_type = osk->sk_type;
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ nsk->sk_sndtimeo = osk->sk_sndtimeo;
+ nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+ nsk->sk_mark = osk->sk_mark;
+ nsk->sk_priority = osk->sk_priority;
+ nsk->sk_rcvlowat = osk->sk_rcvlowat;
+ nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+ nsk->sk_err = osk->sk_err;
+
+ nsk->sk_flags &= ~mask;
+ nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED) | \
+ (1UL << SOCK_TSTAMP_NEW))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* register the new rmb on all links */
+static int smcr_lgr_reg_rmbs(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_link_group *lgr = link->lgr;
+ int i, rc = 0;
+
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (rc)
+ return rc;
+ /* protect against parallel smc_llc_cli_rkey_exchange() and
+ * parallel smcr_link_reg_rmb()
+ */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
+ if (rc)
+ goto out;
+ }
+
+ /* exchange confirm_rkey msg with peer */
+ rc = smc_llc_do_confirm_rkey(link, rmb_desc);
+ if (rc) {
+ rc = -EFAULT;
+ goto out;
+ }
+ rmb_desc->is_conf_rkey = true;
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ return rc;
+}
+
+static int smcr_clnt_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
+ int rc;
+
+ /* Receive CONFIRM LINK request from server over RoCE fabric.
+ * Increasing the client's timeout by twice as much as the server's
+ * timeout by default can temporarily avoid decline messages of
+ * both sides crossing or colliding
+ */
+ qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
+ }
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
+ return SMC_CLC_DECL_RMBE_EC;
+
+ rc = smc_ib_modify_qp_rts(link);
+ if (rc)
+ return SMC_CLC_DECL_ERR_RDYLNK;
+
+ smc_wr_remember_qp_attr(link);
+
+ if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
+ return SMC_CLC_DECL_ERR_REGRMB;
+
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
+
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
+ if (rc < 0)
+ return SMC_CLC_DECL_TIMEOUT_CL;
+
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+ /* optional 2nd link, receive ADD LINK request from server */
+ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK);
+ if (!qentry) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ if (rc == -EAGAIN)
+ rc = 0; /* no DECLINE received, go with one link */
+ return rc;
+ }
+ smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
+ smc_llc_cli_add_link(link, qentry);
+ return 0;
+}
+
+static void smcr_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
+
+ smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
+ smc->conn.peer_rmbe_size = bufsize;
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+ smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
+}
+
+static bool smc_isascii(char *hostname)
+{
+ int i;
+
+ for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+ if (!isascii(hostname[i]))
+ return false;
+ return true;
+}
+
+static void smcd_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
+
+ smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+ smc->conn.peer_token = clc->d0.token;
+ /* msg header takes up space in the buffer */
+ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+ smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
+ if (clc->hdr.version > SMC_V1 &&
+ (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)clc;
+ struct smc_clc_first_contact_ext *fce =
+ (struct smc_clc_first_contact_ext *)
+ (((u8 *)clc_v2) + sizeof(*clc_v2));
+
+ memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
+ SMC_MAX_EID_LEN);
+ smc->conn.lgr->peer_os = fce->os_type;
+ smc->conn.lgr->peer_smc_release = fce->release;
+ if (smc_isascii(fce->hostname))
+ memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+ SMC_MAX_HOSTNAME_LEN);
+ }
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ if (smc->conn.lgr->is_smcd)
+ smcd_conn_save_peer_info(smc, clc);
+ else
+ smcr_conn_save_peer_info(smc, clc);
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ link->peer_qpn = ntoh24(clc->r0.qpn);
+ memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->r0.psn);
+ link->peer_mtu = clc->r0.qp_mtu;
+}
+
+static void smc_switch_to_fallback(struct smc_sock *smc)
+{
+ wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
+ wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
+ unsigned long flags;
+
+ smc->use_fallback = true;
+ if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
+ smc->clcsock->file = smc->sk.sk_socket->file;
+ smc->clcsock->file->private_data = smc->clcsock;
+ smc->clcsock->wq.fasync_list =
+ smc->sk.sk_socket->wq.fasync_list;
+
+ /* There may be some entries remaining in
+ * smc socket->wq, which should be removed
+ * to clcsocket->wq during the fallback.
+ */
+ spin_lock_irqsave(&smc_wait->lock, flags);
+ spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
+ list_splice_init(&smc_wait->head, &clc_wait->head);
+ spin_unlock(&clc_wait->lock);
+ spin_unlock_irqrestore(&smc_wait->lock, flags);
+ }
+}
+
+/* fall back during connect */
+static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
+{
+ smc_switch_to_fallback(smc);
+ smc->fallback_rsn = reason_code;
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+ return 0;
+}
+
+/* decline and fall back during connect */
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+ u8 version)
+{
+ int rc;
+
+ if (reason_code < 0) { /* error, fallback is not possible */
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return reason_code;
+ }
+ if (reason_code != SMC_CLC_DECL_PEERDECL) {
+ rc = smc_clc_send_decline(smc, reason_code, version);
+ if (rc < 0) {
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return rc;
+ }
+ }
+ return smc_connect_fallback(smc, reason_code);
+}
+
+/* abort connecting */
+static void smc_connect_abort(struct smc_sock *smc, int local_first)
+{
+ if (local_first)
+ smc_lgr_cleanup_early(&smc->conn);
+ else
+ smc_conn_free(&smc->conn);
+}
+
+/* check if there is a rdma device available for this connection. */
+/* called for connect and listen */
+static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
+ if (!ini->ib_dev)
+ return SMC_CLC_DECL_NOSMCRDEV;
+ return 0;
+}
+
+/* check if there is an ISM device available for this connection. */
+/* called for connect and listen */
+static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ /* Find ISM device with same PNETID as connecting interface */
+ smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
+ if (!ini->ism_dev[0])
+ return SMC_CLC_DECL_NOSMCDDEV;
+ else
+ ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
+ return 0;
+}
+
+/* is chid unique for the ism devices that are already determined? */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+ int cnt)
+{
+ int i = (!ini->ism_dev[0]) ? 1 : 0;
+
+ for (; i < cnt; i++)
+ if (ini->ism_chid[i] == chid)
+ return false;
+ return true;
+}
+
+/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
+ * PNETID matching net_device)
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = SMC_CLC_DECL_NOSMCDDEV;
+ struct smcd_dev *smcd;
+ int i = 1;
+ u16 chid;
+
+ if (smcd_indicated(ini->smc_type_v1))
+ rc = 0; /* already initialized for V1 */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away || smcd == ini->ism_dev[0])
+ continue;
+ chid = smc_ism_get_chid(smcd);
+ if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
+ continue;
+ if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
+ smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
+ ini->ism_dev[i] = smcd;
+ ini->ism_chid[i] = chid;
+ ini->is_smcd = true;
+ rc = 0;
+ i++;
+ if (i > SMC_MAX_ISM_DEVS)
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ ini->ism_offered_cnt = i - 1;
+ if (!ini->ism_dev[0] && !ini->ism_dev[1])
+ ini->smcd_version = 0;
+
+ return rc;
+}
+
+/* Check for VLAN ID and register it on ISM device just for CLC handshake */
+static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
+ return SMC_CLC_DECL_ISMVLANERR;
+ return 0;
+}
+
+static int smc_find_proposal_devices(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* check if there is an ism device available */
+ if (ini->smcd_version & SMC_V1) {
+ if (smc_find_ism_device(smc, ini) ||
+ smc_connect_ism_vlan_setup(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_R;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else ISM V1 is supported for this connection */
+ if (smc_find_rdma_device(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_D;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else RDMA is supported for this connection */
+ }
+ if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini))
+ ini->smc_type_v2 = SMC_TYPE_N;
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (!smcr_indicated(ini->smc_type_v1) &&
+ ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+ rc = SMC_CLC_DECL_NOSMCDEV;
+
+ return rc;
+}
+
+/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
+ * used, the VLAN ID will be registered again during the connection setup.
+ */
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ if (!smcd_indicated(ini->smc_type_v1))
+ return 0;
+ if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
+ return SMC_CLC_DECL_CNFERR;
+ return 0;
+}
+
+#define SMC_CLC_MAX_ACCEPT_LEN \
+ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+ sizeof(struct smc_clc_first_contact_ext) + \
+ sizeof(struct smc_clc_msg_trail))
+
+/* CLC handshake during connect */
+static int smc_connect_clc(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *aclc2,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* do inband token exchange */
+ rc = smc_clc_send_proposal(smc, ini);
+ if (rc)
+ return rc;
+ /* receive SMC Accept CLC message */
+ return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+ SMC_CLC_ACCEPT, CLC_WAIT_TIME);
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_init_info *ini)
+{
+ int i, reason_code = 0;
+ struct smc_link *link;
+
+ ini->is_smcd = false;
+ ini->ib_lcl = &aclc->r0.lcl;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ mutex_lock(&smc_client_lgr_pending);
+ reason_code = smc_conn_create(smc, ini);
+ if (reason_code) {
+ mutex_unlock(&smc_client_lgr_pending);
+ return reason_code;
+ }
+
+ smc_conn_save_peer_info(smc, aclc);
+
+ if (ini->first_contact_local) {
+ link = smc->conn.lnk;
+ } else {
+ /* set link that was assigned by server */
+ link = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *l = &smc->conn.lgr->lnk[i];
+
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac))) {
+ link = l;
+ break;
+ }
+ }
+ if (!link) {
+ reason_code = SMC_CLC_DECL_NOSRVLINK;
+ goto connect_abort;
+ }
+ smc->conn.lnk = link;
+ }
+
+ /* create send buffer and rmb */
+ if (smc_buf_create(smc, false)) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
+
+ if (ini->first_contact_local)
+ smc_link_save_peer_info(link, aclc);
+
+ if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
+ goto connect_abort;
+ }
+
+ smc_close_init(smc);
+ smc_rx_init(smc);
+
+ if (ini->first_contact_local) {
+ if (smc_ib_ready_link(link)) {
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+ goto connect_abort;
+ }
+ } else {
+ if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGRMB;
+ goto connect_abort;
+ }
+ }
+ smc_rmb_sync_sg_for_device(&smc->conn);
+
+ reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+ SMC_V1);
+ if (reason_code)
+ goto connect_abort;
+
+ smc_tx_init(smc);
+
+ if (ini->first_contact_local) {
+ /* QP confirmation over RoCE fabric */
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_clnt_conf_first_link(smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
+ if (reason_code)
+ goto connect_abort;
+ }
+ mutex_unlock(&smc_client_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+connect_abort:
+ smc_connect_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_client_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return reason_code;
+}
+
+/* The server has chosen one of the proposed ISM devices for the communication.
+ * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+ struct smc_init_info *ini)
+{
+ int i;
+
+ for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+ if (ini->ism_chid[i] == ntohs(aclc->chid)) {
+ ini->ism_selected = i;
+ return 0;
+ }
+ }
+
+ return -EPROTO;
+}
+
+/* setup for ISM connection of client */
+static int smc_connect_ism(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ ini->is_smcd = true;
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ if (aclc->hdr.version == SMC_V2) {
+ struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+ if (rc)
+ return rc;
+ }
+ ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
+
+ /* there is only one lgr role for SMC-D; use server lock */
+ mutex_lock(&smc_server_lgr_pending);
+ rc = smc_conn_create(smc, ini);
+ if (rc) {
+ mutex_unlock(&smc_server_lgr_pending);
+ return rc;
+ }
+
+ /* Create send and receive buffers */
+ rc = smc_buf_create(smc, true);
+ if (rc) {
+ rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
+
+ smc_conn_save_peer_info(smc, aclc);
+ smc_close_init(smc);
+ smc_rx_init(smc);
+ smc_tx_init(smc);
+
+ rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version);
+ if (rc)
+ goto connect_abort;
+ mutex_unlock(&smc_server_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+connect_abort:
+ smc_connect_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_server_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return rc;
+}
+
+/* check if received accept type and version matches a proposed one */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+ struct smc_clc_msg_accept_confirm *aclc)
+{
+ if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+ !smcr_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.typev1 == SMC_TYPE_D &&
+ ((!smcd_indicated(ini->smc_type_v1) &&
+ !smcd_indicated(ini->smc_type_v2)) ||
+ (aclc->hdr.version == SMC_V1 &&
+ !smcd_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.version == SMC_V2 &&
+ !smcd_indicated(ini->smc_type_v2)))))
+ return SMC_CLC_DECL_MODEUNSUPP;
+
+ return 0;
+}
+
+/* perform steps before actually connecting */
+static int __smc_connect(struct smc_sock *smc)
+{
+ u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
+ struct smc_clc_msg_accept_confirm_v2 *aclc2;
+ struct smc_clc_msg_accept_confirm *aclc;
+ struct smc_init_info *ini = NULL;
+ u8 *buf = NULL;
+ int rc = 0;
+
+ if (smc->use_fallback)
+ return smc_connect_fallback(smc, smc->fallback_rsn);
+
+ /* if peer has not signalled SMC-capability, fall back */
+ if (!tcp_sk(smc->clcsock->sk)->syn_smc)
+ return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
+
+ /* IPSec connections opt out of SMC optimizations */
+ if (using_ipsec(smc))
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+ version);
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+ version);
+
+ ini->smcd_version = SMC_V1;
+ ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0;
+ ini->smc_type_v1 = SMC_TYPE_B;
+ ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+ ini->smcd_version &= ~SMC_V1;
+ ini->smc_type_v1 = SMC_TYPE_N;
+ if (!ini->smcd_version) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto fallback;
+ }
+ }
+
+ rc = smc_find_proposal_devices(smc, ini);
+ if (rc)
+ goto fallback;
+
+ buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto fallback;
+ }
+ aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+ aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
+
+ /* perform CLC handshake */
+ rc = smc_connect_clc(smc, aclc2, ini);
+ if (rc)
+ goto vlan_cleanup;
+
+ /* check if smc modes and versions of CLC proposal and accept match */
+ rc = smc_connect_check_aclc(ini, aclc);
+ version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+ ini->smcd_version = version;
+ if (rc)
+ goto vlan_cleanup;
+
+ /* depending on previous steps, connect using rdma or ism */
+ if (aclc->hdr.typev1 == SMC_TYPE_R)
+ rc = smc_connect_rdma(smc, aclc, ini);
+ else if (aclc->hdr.typev1 == SMC_TYPE_D)
+ rc = smc_connect_ism(smc, aclc, ini);
+ if (rc)
+ goto vlan_cleanup;
+
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+ kfree(ini);
+ return 0;
+
+vlan_cleanup:
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+fallback:
+ kfree(ini);
+ return smc_connect_decline_fallback(smc, rc, version);
+}
+
+static void smc_connect_work(struct work_struct *work)
+{
+ struct smc_sock *smc = container_of(work, struct smc_sock,
+ connect_work);
+ long timeo = smc->sk.sk_sndtimeo;
+ int rc = 0;
+
+ if (!timeo)
+ timeo = MAX_SCHEDULE_TIMEOUT;
+ lock_sock(smc->clcsock->sk);
+ if (smc->clcsock->sk->sk_err) {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ } else if ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
+ if ((rc == -EPIPE) &&
+ ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
+ rc = 0;
+ }
+ release_sock(smc->clcsock->sk);
+ lock_sock(&smc->sk);
+ if (rc != 0 || smc->sk.sk_err) {
+ smc->sk.sk_state = SMC_CLOSED;
+ if (rc == -EPIPE || rc == -EAGAIN)
+ smc->sk.sk_err = EPIPE;
+ else if (rc == -ECONNREFUSED)
+ smc->sk.sk_err = ECONNREFUSED;
+ else if (signal_pending(current))
+ smc->sk.sk_err = -sock_intr_errno(timeo);
+ sock_put(&smc->sk); /* passive closing */
+ goto out;
+ }
+
+ rc = __smc_connect(smc);
+ if (rc < 0)
+ smc->sk.sk_err = -rc;
+
+out:
+ if (!sock_flag(&smc->sk, SOCK_DEAD)) {
+ if (smc->sk.sk_err) {
+ smc->sk.sk_state_change(&smc->sk);
+ } else { /* allow polling before and after fallback decision */
+ smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
+ smc->sk.sk_write_space(&smc->sk);
+ }
+ }
+ release_sock(&smc->sk);
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ /* separate smc parameter checking to be safe */
+ if (alen < sizeof(addr->sa_family))
+ goto out_err;
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+ goto out_err;
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+ default:
+ goto out;
+ case SMC_ACTIVE:
+ rc = -EISCONN;
+ goto out;
+ case SMC_INIT:
+ rc = 0;
+ break;
+ }
+
+ smc_copy_sock_settings_to_clc(smc);
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ if (smc->connect_nonblock) {
+ rc = -EALREADY;
+ goto out;
+ }
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc && rc != -EINPROGRESS)
+ goto out;
+
+ if (smc->use_fallback)
+ goto out;
+ sock_hold(&smc->sk); /* sock put in passive closing */
+ if (flags & O_NONBLOCK) {
+ if (queue_work(smc_hs_wq, &smc->connect_work))
+ smc->connect_nonblock = 1;
+ rc = -EINPROGRESS;
+ } else {
+ rc = __smc_connect(smc);
+ if (rc < 0)
+ goto out;
+ else
+ rc = 0; /* success cases including fallback */
+ }
+
+out:
+ release_sock(sk);
+out_err:
+ return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+ struct socket *new_clcsock = NULL;
+ struct sock *lsk = &lsmc->sk;
+ struct sock *new_sk;
+ int rc = -EINVAL;
+
+ release_sock(lsk);
+ new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsk->sk_err = ENOMEM;
+ *new_smc = NULL;
+ lock_sock(lsk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ mutex_lock(&lsmc->clcsock_release_lock);
+ if (lsmc->clcsock)
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
+ mutex_unlock(&lsmc->clcsock_release_lock);
+ lock_sock(lsk);
+ if (rc < 0 && rc != -EAGAIN)
+ lsk->sk_err = -rc;
+ if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
+ new_sk->sk_prot->unhash(new_sk);
+ if (new_clcsock)
+ sock_release(new_clcsock);
+ new_sk->sk_state = SMC_CLOSED;
+ smc_sock_set_flag(new_sk, SOCK_DEAD);
+ sock_put(new_sk); /* final */
+ *new_smc = NULL;
+ goto out;
+ }
+
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
+ (*new_smc)->clcsock = new_clcsock;
+out:
+ return rc;
+}
+
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(parent);
+
+ sock_hold(sk); /* sock_put in smc_accept_unlink () */
+ spin_lock(&par->accept_q_lock);
+ list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+ spin_lock(&par->accept_q_lock);
+ list_del_init(&smc_sk(sk)->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+ sock_put(sk); /* sock_hold in smc_accept_enqueue */
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+struct sock *smc_accept_dequeue(struct sock *parent,
+ struct socket *new_sock)
+{
+ struct smc_sock *isk, *n;
+ struct sock *new_sk;
+
+ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+ new_sk = (struct sock *)isk;
+
+ smc_accept_unlink(new_sk);
+ if (new_sk->sk_state == SMC_CLOSED) {
+ new_sk->sk_prot->unhash(new_sk);
+ if (isk->clcsock) {
+ sock_release(isk->clcsock);
+ isk->clcsock = NULL;
+ }
+ sock_put(new_sk); /* final */
+ continue;
+ }
+ if (new_sock) {
+ sock_graft(new_sk, new_sock);
+ if (isk->use_fallback) {
+ smc_sk(new_sk)->clcsock->file = new_sock->file;
+ isk->clcsock->file->private_data = isk->clcsock;
+ }
+ }
+ return new_sk;
+ }
+ return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+void smc_close_non_accepted(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk); /* sock_put below */
+ lock_sock(sk);
+ if (!sk->sk_lingertime)
+ /* wait for peer closing */
+ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ __smc_release(smc);
+ release_sock(sk);
+ sock_put(sk); /* sock_hold above */
+ sock_put(sk); /* final sock_put */
+}
+
+static int smcr_serv_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
+ int rc;
+
+ if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
+ return SMC_CLC_DECL_ERR_REGRMB;
+
+ /* send CONFIRM LINK request to client over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
+ if (rc < 0)
+ return SMC_CLC_DECL_TIMEOUT_CL;
+
+ /* receive CONFIRM LINK response from client over the RoCE fabric */
+ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
+ }
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
+ return SMC_CLC_DECL_RMBE_EC;
+
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
+
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+ /* initial contact - try to establish second link */
+ smc_llc_srv_add_link(link);
+ return 0;
+}
+
+/* listen worker: finish */
+static void smc_listen_out(struct smc_sock *new_smc)
+{
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct sock *newsmcsk = &new_smc->sk;
+
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ release_sock(&lsmc->sk);
+ } else { /* no longer listening */
+ smc_close_non_accepted(newsmcsk);
+ }
+
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+}
+
+/* listen worker: finish in state connected */
+static void smc_listen_out_connected(struct smc_sock *new_smc)
+{
+ struct sock *newsmcsk = &new_smc->sk;
+
+ if (newsmcsk->sk_state == SMC_INIT)
+ newsmcsk->sk_state = SMC_ACTIVE;
+
+ smc_listen_out(new_smc);
+}
+
+/* listen worker: finish in error state */
+static void smc_listen_out_err(struct smc_sock *new_smc)
+{
+ struct sock *newsmcsk = &new_smc->sk;
+
+ if (newsmcsk->sk_state == SMC_INIT)
+ sock_put(&new_smc->sk); /* passive closing */
+ newsmcsk->sk_state = SMC_CLOSED;
+
+ smc_listen_out(new_smc);
+}
+
+/* listen worker: decline and fall back if possible */
+static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
+ int local_first, u8 version)
+{
+ /* RDMA setup failed, switch back to TCP */
+ if (local_first)
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
+ if (reason_code < 0) { /* error, no fallback possible */
+ smc_listen_out_err(new_smc);
+ return;
+ }
+ smc_switch_to_fallback(new_smc);
+ new_smc->fallback_rsn = reason_code;
+ if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
+ if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
+ smc_listen_out_err(new_smc);
+ return;
+ }
+ }
+ smc_listen_out_connected(new_smc);
+}
+
+/* listen worker: version checking */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
+ struct smc_clc_v2_extension *pclc_v2_ext;
+
+ ini->smc_type_v1 = pclc->hdr.typev1;
+ ini->smc_type_v2 = pclc->hdr.typev2;
+ ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
+ if (pclc->hdr.version > SMC_V1)
+ ini->smcd_version |=
+ ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
+ if (!smc_ism_v2_capable) {
+ ini->smcd_version &= ~SMC_V2;
+ goto out;
+ }
+ pclc_v2_ext = smc_get_clc_v2_ext(pclc);
+ if (!pclc_v2_ext) {
+ ini->smcd_version &= ~SMC_V2;
+ goto out;
+ }
+ pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
+ if (!pclc_smcd_v2_ext)
+ ini->smcd_version &= ~SMC_V2;
+
+out:
+ if (!ini->smcd_version) {
+ if (pclc->hdr.typev1 == SMC_TYPE_B ||
+ pclc->hdr.typev2 == SMC_TYPE_B)
+ return SMC_CLC_DECL_NOSMCDEV;
+ if (pclc->hdr.typev1 == SMC_TYPE_D ||
+ pclc->hdr.typev2 == SMC_TYPE_D)
+ return SMC_CLC_DECL_NOSMCDDEV;
+ return SMC_CLC_DECL_NOSMCRDEV;
+ }
+
+ return 0;
+}
+
+/* listen worker: check prefixes */
+static int smc_listen_prfx_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct socket *newclcsock = new_smc->clcsock;
+
+ if (pclc->hdr.typev1 == SMC_TYPE_N)
+ return 0;
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (smc_clc_prfx_match(newclcsock, pclc_prfx))
+ return SMC_CLC_DECL_DIFFPREFIX;
+
+ return 0;
+}
+
+/* listen worker: initialize connection and buffers */
+static int smc_listen_rdma_init(struct smc_sock *new_smc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ /* allocate connection / link group */
+ rc = smc_conn_create(new_smc, ini);
+ if (rc)
+ return rc;
+
+ /* create send buffer and rmb */
+ if (smc_buf_create(new_smc, false))
+ return SMC_CLC_DECL_MEM;
+
+ return 0;
+}
+
+/* listen worker: initialize connection and buffers for SMC-D */
+static int smc_listen_ism_init(struct smc_sock *new_smc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ rc = smc_conn_create(new_smc, ini);
+ if (rc)
+ return rc;
+
+ /* Create send and receive buffers */
+ rc = smc_buf_create(new_smc, true);
+ if (rc) {
+ if (ini->first_contact_local)
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
+ return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
+ SMC_CLC_DECL_MEM;
+ }
+
+ return 0;
+}
+
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+ struct smc_init_info *ini,
+ int matches)
+{
+ int i;
+
+ for (i = 0; i < matches; i++)
+ if (smcd == ini->ism_dev[i])
+ return true;
+
+ return false;
+}
+
+/* check for ISM devices matching proposed ISM devices */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+ u16 proposed_chid, u64 proposed_gid,
+ unsigned int *matches)
+{
+ struct smcd_dev *smcd;
+
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away)
+ continue;
+ if (smc_is_already_selected(smcd, ini, *matches))
+ continue;
+ if (smc_ism_get_chid(smcd) == proposed_chid &&
+ !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+ ini->ism_peer_gid[*matches] = proposed_gid;
+ ini->ism_dev[*matches] = smcd;
+ (*matches)++;
+ break;
+ }
+ }
+}
+
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_v2_extension *smc_v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ unsigned int matches = 0;
+ u8 smcd_version;
+ u8 *eid = NULL;
+ int i;
+
+ if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+ goto not_found;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ smc_v2_ext = smc_get_clc_v2_ext(pclc);
+ smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+ if (!smcd_v2_ext ||
+ !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */
+ goto not_found;
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (pclc_smcd->ism.chid)
+ /* check for ISM device matching proposed native ISM device */
+ smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+ ntohll(pclc_smcd->ism.gid), &matches);
+ for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
+ /* check for ISM devices matching proposed non-native ISM
+ * devices
+ */
+ smc_check_ism_v2_match(ini,
+ ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+ ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+ &matches);
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ if (ini->ism_dev[0]) {
+ smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+ if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN))
+ goto not_found;
+ } else {
+ goto not_found;
+ }
+
+ /* separate - outside the smcd_dev_list.lock */
+ smcd_version = ini->smcd_version;
+ for (i = 0; i < matches; i++) {
+ ini->smcd_version = SMC_V2;
+ ini->is_smcd = true;
+ ini->ism_selected = i;
+ if (smc_listen_ism_init(new_smc, ini))
+ /* try next active ISM device */
+ continue;
+ return; /* matching and usable V2 ISM device found */
+ }
+ /* no V2 ISM device could be initialized */
+ ini->smcd_version = smcd_version; /* restore original value */
+
+not_found:
+ ini->smcd_version &= ~SMC_V2;
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+
+ /* check if ISM V1 is available */
+ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+ goto not_found;
+ ini->is_smcd = true; /* prepare ISM check */
+ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
+ if (smc_find_ism_device(new_smc, ini))
+ goto not_found;
+ ini->ism_selected = 0;
+ if (!smc_listen_ism_init(new_smc, ini))
+ return; /* V1 ISM device found */
+
+not_found:
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+/* listen worker: register buffers */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
+{
+ struct smc_connection *conn = &new_smc->conn;
+
+ if (!local_first) {
+ if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
+ return SMC_CLC_DECL_ERR_REGRMB;
+ }
+ smc_rmb_sync_sg_for_device(&new_smc->conn);
+
+ return 0;
+}
+
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ if (!smcr_indicated(ini->smc_type_v1))
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* prepare RDMA check */
+ ini->ib_lcl = &pclc->lcl;
+ rc = smc_find_rdma_device(new_smc, ini);
+ if (rc) {
+ /* no RDMA device found */
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ /* neither ISM nor RDMA device found */
+ rc = SMC_CLC_DECL_NOSMCDEV;
+ return rc;
+ }
+ rc = smc_listen_rdma_init(new_smc, ini);
+ if (rc)
+ return rc;
+ return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching to proposal */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ /* check for ISM device matching V2 proposed device */
+ smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (!(ini->smcd_version & SMC_V1))
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* check for matching IP prefix and subnet length */
+ rc = smc_listen_prfx_check(new_smc, pclc);
+ if (rc)
+ return rc;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+ return SMC_CLC_DECL_GETVLANERR;
+
+ /* check for ISM device matching V1 proposed device */
+ smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (pclc->hdr.typev1 == SMC_TYPE_D)
+ return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */
+
+ /* check if RDMA is available */
+ return smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+}
+
+/* listen worker: finish RDMA setup */
+static int smc_listen_rdma_finish(struct smc_sock *new_smc,
+ struct smc_clc_msg_accept_confirm *cclc,
+ bool local_first)
+{
+ struct smc_link *link = new_smc->conn.lnk;
+ int reason_code = 0;
+
+ if (local_first)
+ smc_link_save_peer_info(link, cclc);
+
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK;
+
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
+ /* QP confirmation over RoCE fabric */
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_serv_conf_first_link(new_smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
+ }
+ return reason_code;
+}
+
+/* setup for connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_clc_msg_accept_confirm *cclc;
+ struct smc_clc_msg_proposal_area *buf;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_init_info *ini = NULL;
+ int rc = 0;
+
+ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
+ return smc_listen_out_err(new_smc);
+
+ if (new_smc->use_fallback) {
+ smc_listen_out_connected(new_smc);
+ return;
+ }
+
+ /* check if peer is smc capable */
+ if (!tcp_sk(newclcsock->sk)->syn_smc) {
+ smc_switch_to_fallback(new_smc);
+ new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
+ smc_listen_out_connected(new_smc);
+ return;
+ }
+
+ /* do inband token exchange -
+ * wait for and receive SMC Proposal CLC message
+ */
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
+ SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
+ if (rc)
+ goto out_decl;
+ version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
+
+ /* IPSec connections opt out of SMC optimizations */
+ if (using_ipsec(new_smc)) {
+ rc = SMC_CLC_DECL_IPSEC;
+ goto out_decl;
+ }
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+
+ /* initial version checking */
+ rc = smc_listen_v2_check(new_smc, pclc, ini);
+ if (rc)
+ goto out_decl;
+
+ mutex_lock(&smc_server_lgr_pending);
+ smc_close_init(new_smc);
+ smc_rx_init(new_smc);
+ smc_tx_init(new_smc);
+
+ /* determine ISM or RoCE device used for connection */
+ rc = smc_listen_find_device(new_smc, pclc, ini);
+ if (rc)
+ goto out_unlock;
+
+ /* send SMC Accept CLC message */
+ rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+ ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1);
+ if (rc)
+ goto out_unlock;
+
+ /* SMC-D does not need this lock any more */
+ if (ini->is_smcd)
+ mutex_unlock(&smc_server_lgr_pending);
+
+ /* receive SMC Confirm CLC message */
+ memset(buf, 0, sizeof(*buf));
+ cclc = (struct smc_clc_msg_accept_confirm *)buf;
+ rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
+ SMC_CLC_CONFIRM, CLC_WAIT_TIME);
+ if (rc) {
+ if (!ini->is_smcd)
+ goto out_unlock;
+ goto out_decl;
+ }
+
+ /* finish worker */
+ if (!ini->is_smcd) {
+ rc = smc_listen_rdma_finish(new_smc, cclc,
+ ini->first_contact_local);
+ if (rc)
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
+ }
+ smc_conn_save_peer_info(new_smc, cclc);
+ smc_listen_out_connected(new_smc);
+ goto out_free;
+
+out_unlock:
+ mutex_unlock(&smc_server_lgr_pending);
+out_decl:
+ smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+ version);
+out_free:
+ kfree(ini);
+ kfree(buf);
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+ struct smc_sock *lsmc = container_of(work, struct smc_sock,
+ tcp_listen_work);
+ struct sock *lsk = &lsmc->sk;
+ struct smc_sock *new_smc;
+ int rc = 0;
+
+ lock_sock(lsk);
+ while (lsk->sk_state == SMC_LISTEN) {
+ rc = smc_clcsock_accept(lsmc, &new_smc);
+ if (rc) /* clcsock accept queue empty or error */
+ goto out;
+ if (!new_smc)
+ continue;
+
+ new_smc->listen_smc = lsmc;
+ new_smc->use_fallback = lsmc->use_fallback;
+ new_smc->fallback_rsn = lsmc->fallback_rsn;
+ sock_hold(lsk); /* sock_put in smc_listen_work */
+ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+ smc_copy_sock_settings_to_smc(new_smc);
+ new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
+ new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
+ sock_hold(&new_smc->sk); /* sock_put in passive closing */
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
+ sock_put(&new_smc->sk);
+ }
+
+out:
+ release_sock(lsk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+ struct smc_sock *lsmc;
+
+ lsmc = (struct smc_sock *)
+ ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
+ if (!lsmc)
+ return;
+ lsmc->clcsk_data_ready(listen_clcsock);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk);
+ }
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ rc = -EINVAL;
+ if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
+ smc->connect_nonblock)
+ goto out;
+
+ rc = 0;
+ if (sk->sk_state == SMC_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ goto out;
+ }
+ /* some socket options are handled in core, so we could not apply
+ * them to the clc socket -- copy smc socket options to clc socket
+ */
+ smc_copy_sock_settings_to_clc(smc);
+ if (!smc->use_fallback)
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+
+ /* save original sk_data_ready function and establish
+ * smc-specific sk_data_ready function
+ */
+ smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
+ smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+ rc = kernel_listen(smc->clcsock, backlog);
+ if (rc) {
+ smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = SMC_LISTEN;
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags, bool kern)
+{
+ struct sock *sk = sock->sk, *nsk;
+ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+ long timeo;
+ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ sock_hold(sk); /* sock_put below */
+ lock_sock(sk);
+
+ if (lsmc->sk.sk_state != SMC_LISTEN) {
+ rc = -EINVAL;
+ release_sock(sk);
+ goto out;
+ }
+
+ /* Wait for an incoming connection */
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ rc = -EAGAIN;
+ break;
+ }
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ /* wakeup by sk_data_ready in smc_listen_work() */
+ sched_annotate_sleep();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (!rc)
+ rc = sock_error(nsk);
+ release_sock(sk);
+ if (rc)
+ goto out;
+
+ if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+ /* wait till data arrives on the socket */
+ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+ MSEC_PER_SEC);
+ if (smc_sk(nsk)->use_fallback) {
+ struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+ lock_sock(clcsk);
+ if (skb_queue_empty(&clcsk->sk_receive_queue))
+ sk_wait_data(clcsk, &timeo, NULL);
+ release_sock(clcsk);
+ } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+ lock_sock(nsk);
+ smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
+ release_sock(nsk);
+ }
+ }
+
+out:
+ sock_put(sk); /* sock_hold above */
+ return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+ int peer)
+{
+ struct smc_sock *smc;
+
+ if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+ (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
+ return -ENOTCONN;
+
+ smc = smc_sk(sock->sk);
+
+ return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ /* SMC does not support connect with fastopen */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ /* not connected yet, fallback */
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ smc_switch_to_fallback(smc);
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
+ } else {
+ rc = -EINVAL;
+ goto out;
+ }
+ } else if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_INIT)) {
+ rc = -EPIPE;
+ goto out;
+ }
+
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ else
+ rc = smc_tx_sendmsg(smc, msg, len);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
+ if ((sk->sk_state == SMC_INIT) ||
+ (sk->sk_state == SMC_LISTEN) ||
+ (sk->sk_state == SMC_CLOSED))
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ } else {
+ msg->msg_namelen = 0;
+ rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+ }
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static __poll_t smc_accept_poll(struct sock *parent)
+{
+ struct smc_sock *isk = smc_sk(parent);
+ __poll_t mask = 0;
+
+ spin_lock(&isk->accept_q_lock);
+ if (!list_empty(&isk->accept_q))
+ mask = EPOLLIN | EPOLLRDNORM;
+ spin_unlock(&isk->accept_q_lock);
+
+ return mask;
+}
+
+static __poll_t smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ __poll_t mask = 0;
+
+ if (!sk)
+ return EPOLLNVAL;
+
+ smc = smc_sk(sock->sk);
+ if (smc->use_fallback) {
+ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ } else {
+ if (sk->sk_state != SMC_CLOSED)
+ sock_poll_wait(file, sock, wait);
+ if (sk->sk_err)
+ mask |= EPOLLERR;
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ mask |= EPOLLHUP;
+ if (sk->sk_state == SMC_LISTEN) {
+ /* woken up by sk_data_ready in smc_listen_work() */
+ mask |= smc_accept_poll(sk);
+ } else if (smc->use_fallback) { /* as result of connect_work()*/
+ mask |= smc->clcsock->ops->poll(file, smc->clcsock,
+ wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ } else {
+ if ((sk->sk_state != SMC_INIT &&
+ atomic_read(&smc->conn.sndbuf_space)) ||
+ sk->sk_shutdown & SEND_SHUTDOWN) {
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ } else {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ if (atomic_read(&smc->conn.bytes_to_rcv))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ if (sk->sk_state == SMC_APPCLOSEWAIT1)
+ mask |= EPOLLIN;
+ if (smc->conn.urg_state == SMC_URG_VALID)
+ mask |= EPOLLPRI;
+ }
+ }
+
+ return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ bool do_shutdown = true;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+ int old_state;
+ int rc1 = 0;
+
+ smc = smc_sk(sk);
+
+ if ((how < SHUT_RD) || (how > SHUT_RDWR))
+ return rc;
+
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPFINCLOSEWAIT))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk);
+ }
+ goto out;
+ }
+ switch (how) {
+ case SHUT_RDWR: /* shutdown in both directions */
+ old_state = sk->sk_state;
+ rc = smc_close_active(smc);
+ if (old_state == SMC_ACTIVE &&
+ sk->sk_state == SMC_PEERCLOSEWAIT1)
+ do_shutdown = false;
+ break;
+ case SHUT_WR:
+ rc = smc_close_shutdown_write(smc);
+ break;
+ case SHUT_RD:
+ rc = 0;
+ /* nothing more to do because peer is not involved */
+ break;
+ }
+ if (do_shutdown && smc->clcsock)
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ /* map sock_shutdown_cmd constants to sk_shutdown value range */
+ sk->sk_shutdown |= how + 1;
+
+out:
+ release_sock(sk);
+ return rc ? rc : rc1;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ if (level == SOL_TCP && optname == TCP_ULP)
+ return -EOPNOTSUPP;
+
+ smc = smc_sk(sk);
+
+ /* generic setsockopts reaching us here always apply to the
+ * CLC socket
+ */
+ if (unlikely(!smc->clcsock->ops->setsockopt))
+ rc = -EOPNOTSUPP;
+ else
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ if (smc->clcsock->sk->sk_err) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ sk->sk_error_report(sk);
+ }
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (rc || smc->use_fallback)
+ goto out;
+ switch (optname) {
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ /* option not supported by SMC */
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ smc_switch_to_fallback(smc);
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
+ } else {
+ rc = -EINVAL;
+ }
+ break;
+ case TCP_NODELAY:
+ if (sk->sk_state != SMC_INIT &&
+ sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_CLOSED) {
+ if (val)
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
+ }
+ break;
+ case TCP_CORK:
+ if (sk->sk_state != SMC_INIT &&
+ sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_CLOSED) {
+ if (!val)
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
+ }
+ break;
+ case TCP_DEFER_ACCEPT:
+ smc->sockopt_defer_accept = val;
+ break;
+ default:
+ break;
+ }
+out:
+ release_sock(sk);
+
+ return rc;
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ /* socket options apply to the CLC socket */
+ if (unlikely(!smc->clcsock->ops->getsockopt))
+ return -EOPNOTSUPP;
+ return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ union smc_host_cursor cons, urg;
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ int answ;
+
+ smc = smc_sk(sock->sk);
+ conn = &smc->conn;
+ lock_sock(&smc->sk);
+ if (smc->use_fallback) {
+ if (!smc->clcsock) {
+ release_sock(&smc->sk);
+ return -EBADF;
+ }
+ answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+ release_sock(&smc->sk);
+ return answ;
+ }
+ switch (cmd) {
+ case SIOCINQ: /* same as FIONREAD */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = atomic_read(&smc->conn.bytes_to_rcv);
+ break;
+ case SIOCOUTQ:
+ /* output queue size (not send + not acked) */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = smc->conn.sndbuf_desc->len -
+ atomic_read(&smc->conn.sndbuf_space);
+ break;
+ case SIOCOUTQNSD:
+ /* output queue size (not send only) */
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED)
+ answ = 0;
+ else
+ answ = smc_tx_prepared_sends(&smc->conn);
+ break;
+ case SIOCATMARK:
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
+ return -EINVAL;
+ }
+ if (smc->sk.sk_state == SMC_INIT ||
+ smc->sk.sk_state == SMC_CLOSED) {
+ answ = 0;
+ } else {
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&urg, &conn->urg_curs, conn);
+ answ = smc_curs_diff(conn->rmb_desc->len,
+ &cons, &urg) == 1;
+ }
+ break;
+ default:
+ release_sock(&smc->sk);
+ return -ENOIOCTLCMD;
+ }
+ release_sock(&smc->sk);
+
+ return put_user(answ, (int __user *)arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE) {
+ release_sock(sk);
+ goto out;
+ }
+ release_sock(sk);
+ if (smc->use_fallback)
+ rc = kernel_sendpage(smc->clcsock, page, offset,
+ size, flags);
+ else
+ rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+ return rc;
+}
+
+/* Map the affected portions of the rmbe into an spd, note the number of bytes
+ * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
+ * updates till whenever a respective page has been fully processed.
+ * Note that subsequent recv() calls have to wait till all splice() processing
+ * completed.
+ */
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
+ if (sk->sk_state == SMC_INIT ||
+ sk->sk_state == SMC_LISTEN ||
+ sk->sk_state == SMC_CLOSED)
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+ pipe, len, flags);
+ } else {
+ if (*ppos) {
+ rc = -ESPIPE;
+ goto out;
+ }
+ if (flags & SPLICE_F_NONBLOCK)
+ flags = MSG_DONTWAIT;
+ else
+ flags = 0;
+ rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
+ }
+out:
+ release_sock(sk);
+
+ return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .release = smc_release,
+ .bind = smc_bind,
+ .connect = smc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = smc_accept,
+ .getname = smc_getname,
+ .poll = smc_poll,
+ .ioctl = smc_ioctl,
+ .listen = smc_listen,
+ .shutdown = smc_shutdown,
+ .setsockopt = smc_setsockopt,
+ .getsockopt = smc_getsockopt,
+ .sendmsg = smc_sendmsg,
+ .recvmsg = smc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = smc_sendpage,
+ .splice_read = smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
+ struct smc_sock *smc;
+ struct sock *sk;
+ int rc;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_STREAM)
+ goto out;
+
+ rc = -EPROTONOSUPPORT;
+ if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
+ goto out;
+
+ rc = -ENOBUFS;
+ sock->ops = &smc_sock_ops;
+ sk = smc_sock_alloc(net, sock, protocol);
+ if (!sk)
+ goto out;
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
+ smc->use_fallback = false; /* assume rdma capability first */
+ smc->fallback_rsn = 0;
+ rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
+ &smc->clcsock);
+ if (rc) {
+ sk_common_release(sk);
+ goto out;
+ }
+ smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+ smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+out:
+ return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .create = smc_create,
+};
+
+unsigned int smc_net_id;
+
+static __net_init int smc_net_init(struct net *net)
+{
+ return smc_pnet_net_init(net);
+}
+
+static void __net_exit smc_net_exit(struct net *net)
+{
+ smc_pnet_net_exit(net);
+}
+
+static struct pernet_operations smc_net_ops = {
+ .init = smc_net_init,
+ .exit = smc_net_exit,
+ .id = &smc_net_id,
+ .size = sizeof(struct smc_net),
+};
+
+static int __init smc_init(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&smc_net_ops);
+ if (rc)
+ return rc;
+
+ smc_ism_init();
+ smc_clc_init();
+
+ rc = smc_pnet_init();
+ if (rc)
+ goto out_pernet_subsys;
+
+ rc = -ENOMEM;
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_pnet;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
+ rc = smc_core_init();
+ if (rc) {
+ pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+ goto out_alloc_wqs;
+ }
+
+ rc = smc_llc_init();
+ if (rc) {
+ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = smc_cdc_init();
+ if (rc) {
+ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
+ goto out_core;
+ }
+
+ rc = proto_register(&smc_proto6, 1);
+ if (rc) {
+ pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
+ goto out_proto;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+ if (rc) {
+ pr_err("%s: sock_register fails with %d\n", __func__, rc);
+ goto out_proto6;
+ }
+ INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+ INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
+
+ rc = smc_ib_register_client();
+ if (rc) {
+ pr_err("%s: ib_register fails with %d\n", __func__, rc);
+ goto out_sock;
+ }
+
+ static_branch_enable(&tcp_have_smc);
+ return 0;
+
+out_sock:
+ sock_unregister(PF_SMC);
+out_proto6:
+ proto_unregister(&smc_proto6);
+out_proto:
+ proto_unregister(&smc_proto);
+out_core:
+ smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
+out_pnet:
+ smc_pnet_exit();
+out_pernet_subsys:
+ unregister_pernet_subsys(&smc_net_ops);
+
+ return rc;
+}
+
+static void __exit smc_exit(void)
+{
+ static_branch_disable(&tcp_have_smc);
+ sock_unregister(PF_SMC);
+ smc_core_exit();
+ smc_ib_unregister_client();
+ destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_hs_wq);
+ proto_unregister(&smc_proto6);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+ unregister_pernet_subsys(&smc_net_ops);
+ rcu_barrier();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000..1eee40ec1
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for the SMC module (socket related)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMC_V1 1 /* SMC version V1 */
+#define SMC_V2 2 /* SMC version V2 */
+#define SMC_RELEASE 0
+
+#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
+#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
+
+#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
+ * devices
+ */
+
+#define SMC_MAX_HOSTNAME_LEN 32
+#define SMC_MAX_EID_LEN 32
+
+extern struct proto smc_proto;
+extern struct proto smc_proto6;
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
+ /* normal close */
+ SMC_PEERCLOSEWAIT1 = 20,
+ SMC_PEERCLOSEWAIT2 = 21,
+ SMC_APPFINCLOSEWAIT = 24,
+ SMC_APPCLOSEWAIT1 = 22,
+ SMC_APPCLOSEWAIT2 = 23,
+ SMC_PEERFINCLOSEWAIT = 25,
+ /* abnormal close */
+ SMC_PEERABORTWAIT = 26,
+ SMC_PROCESSABORT = 27,
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
+ u8 type;
+} __aligned(1);
+
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 peer_done_writing : 1; /* Sending done indicator */
+ u8 peer_conn_closed : 1; /* Peer connection closed indicator */
+ u8 peer_conn_abort : 1; /* Abnormal close indicator */
+ u8 reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 5;
+ u8 peer_conn_abort : 1;
+ u8 peer_conn_closed : 1;
+ u8 peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
+ u8 urg_data_pending : 1; /* Urgent Data Pending */
+ u8 urg_data_present : 1; /* Urgent Data Present */
+ u8 cons_curs_upd_req : 1; /* cursor update requested */
+ u8 failover_validation : 1;/* message replay due to failover */
+ u8 reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 3;
+ u8 failover_validation : 1;
+ u8 cons_curs_upd_req : 1;
+ u8 urg_data_present : 1;
+ u8 urg_data_pending : 1;
+ u8 write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
+ struct {
+ u16 reserved;
+ u16 wrap; /* window wrap sequence number */
+ u32 count; /* cursor (= offset) part */
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg { /* Connection Data Control message */
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* length = 44 */
+ u16 seqno; /* connection seq # */
+ u32 token; /* alert_token */
+ union smc_host_cursor prod; /* producer cursor */
+ union smc_host_cursor cons; /* consumer cursor,
+ * piggy backed "ack"
+ */
+ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
+ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
+ u8 reserved[18];
+} __aligned(8);
+
+enum smc_urg_state {
+ SMC_URG_VALID = 1, /* data present */
+ SMC_URG_NOTYET = 2, /* data pending */
+ SMC_URG_READ = 3, /* data was already read */
+};
+
+struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+ struct smc_link *lnk; /* assigned SMC-R link */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_rmbe_idx; /* from tcp handshake */
+ int peer_rmbe_size; /* size of peer rx buffer */
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
+ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size_short;/* compressed notation */
+ int rmbe_update_limit;
+ /* lower limit for consumer
+ * cursor update
+ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+ * .prod cf. TCP snd_nxt
+ * .cons cf. TCP sends ack
+ */
+ union smc_host_cursor local_tx_ctrl_fin;
+ /* prod crsr - confirmed by peer
+ */
+ union smc_host_cursor tx_curs_prep; /* tx - prepared data
+ * snd_max..wmem_alloc
+ */
+ union smc_host_cursor tx_curs_sent; /* tx - sent data
+ * snd_nxt ?
+ */
+ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
+ * snd-wnd-begin ?
+ */
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ u16 tx_cdc_seq_fin; /* sequence # - tx completed */
+ spinlock_t send_lock; /* protect wr_sends */
+ atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
+ * - inc when post wqe,
+ * - dec on polled tx cqe
+ */
+ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
+ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
+ u32 tx_off; /* base offset in peer rmb */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+ * .cons cf. TCP snd_una
+ */
+ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
+ * source of snd_una ?
+ */
+ union smc_host_cursor urg_curs; /* points at urgent byte */
+ enum smc_urg_state urg_state;
+ bool urg_tx_pend; /* urgent data staged */
+ bool urg_rx_skip_pend;
+ /* indicate urgent oob data
+ * read, but previous regular
+ * data still pending
+ */
+ char urg_rx_byte; /* urgent byte */
+ atomic_t bytes_to_rcv; /* arrived data,
+ * not yet received
+ */
+ atomic_t splice_pending; /* number of spliced bytes
+ * pending processing
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+ spinlock_t acurs_lock; /* protect cursors */
+#endif
+ struct work_struct close_work; /* peer sent some closing */
+ struct work_struct abort_work; /* abort the connection */
+ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
+ u8 rx_off; /* receive offset:
+ * 0 for SMC-R, 32 for SMC-D
+ */
+ u64 peer_token; /* SMC-D token of peer */
+ u8 killed : 1; /* abnormal termination */
+ u8 out_of_sync : 1; /* out of sync with peer */
+};
+
+struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_data_ready)(struct sock *sk);
+ /* original data_ready fct. **/
+ struct smc_connection conn; /* smc connection */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct connect_work; /* handle non-blocking connect*/
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
+ bool use_fallback; /* fallback to tcp */
+ int fallback_rsn; /* reason for fallback */
+ u32 peer_diagnosis; /* decline reason from peer */
+ int sockopt_defer_accept;
+ /* sockopt TCP_DEFER_ACCEPT
+ * value
+ */
+ u8 wait_close_tx_prepared : 1;
+ /* shutdown wr or close
+ * started, waiting for unsent
+ * data to be sent
+ */
+ u8 connect_nonblock : 1;
+ /* non-blocking connect in
+ * flight
+ */
+ struct mutex clcsock_release_lock;
+ /* protects clcsock of a listen
+ * socket
+ * */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+ return (struct smc_sock *)sk;
+}
+
+extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+extern struct workqueue_struct *smc_close_wq; /* wq for close work */
+
+#define SMC_SYSTEMID_LEN 8
+
+extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+#define ntohll(x) be64_to_cpu(x)
+#define htonll(x) cpu_to_be64(x)
+
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+ __be32 t;
+
+ t = cpu_to_be32(host);
+ memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+ __be32 t = 0;
+
+ memcpy(((u8 *)&t) + 1, net, 3);
+ return be32_to_cpu(t);
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return (smc->clcsock->sk->sk_policy[0] ||
+ smc->clcsock->sk->sk_policy[1]) ? true : false;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return false;
+}
+#endif
+
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
+
+static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag)
+{
+ set_bit(flag, &sk->sk_flags);
+}
+
+#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000..b92b2cee2
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+/********************************** send *************************************/
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_connection *conn = cdcpend->conn;
+ struct smc_buf_desc *sndbuf_desc;
+ struct smc_sock *smc;
+ int diff;
+
+ sndbuf_desc = conn->sndbuf_desc;
+ smc = container_of(conn, struct smc_sock, conn);
+ bh_lock_sock(&smc->sk);
+ if (!wc_status && sndbuf_desc) {
+ diff = smc_curs_diff(sndbuf_desc->len,
+ &cdcpend->conn->tx_curs_fin,
+ &cdcpend->cursor);
+ /* sndbuf_space is decreased in smc_sendmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff, &cdcpend->conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
+ smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor,
+ conn);
+ conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
+ }
+
+ if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
+ unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
+ wake_up(&conn->cdc_pend_tx_wq);
+ WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
+
+ smc_tx_sndbuf_nonfull(smc);
+ bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_cdc_tx_pend **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+ wr_rdma_buf,
+ (struct smc_wr_tx_pend_priv **)pend);
+ if (conn->killed) {
+ /* abnormal termination */
+ if (!rc)
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)(*pend));
+ rc = -EPIPE;
+ }
+ return rc;
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend)
+{
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+ BUILD_BUG_ON_MSG(
+ offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+ pend->conn = conn;
+ pend->cursor = conn->tx_curs_sent;
+ pend->p_cursor = conn->local_tx_ctrl.prod;
+ pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+ struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend)
+{
+ struct smc_link *link = conn->lnk;
+ union smc_host_cursor cfed;
+ int rc;
+
+ smc_cdc_add_pending_send(conn, pend);
+
+ conn->tx_cdc_seq++;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (!rc) {
+ smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ } else {
+ conn->tx_cdc_seq--;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ atomic_dec(&conn->cdc_pend_tx_wr);
+ }
+
+ return rc;
+}
+
+/* send a validation msg indicating the move of a conn to an other QP link */
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
+ struct smc_link *link = conn->lnk;
+ struct smc_cdc_msg *peer;
+ int rc;
+
+ peer = (struct smc_cdc_msg *)wr_buf;
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */
+ peer->token = htonl(local->token);
+ peer->prod_flags.failover_validation = 1;
+
+ /* We need to set pend->conn here to make sure smc_cdc_tx_handler()
+ * can handle properly
+ */
+ smc_cdc_add_pending_send(conn, pend);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (unlikely(rc))
+ atomic_dec(&conn->cdc_pend_tx_wr);
+
+ return rc;
+}
+
+static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ struct smc_link *link;
+ bool again = false;
+ int rc;
+
+again:
+ link = conn->lnk;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend);
+ if (rc)
+ goto put_out;
+
+ spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, try again one time*/
+ spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ smc_wr_tx_link_put(link);
+ if (again)
+ return -ENOLINK;
+ again = true;
+ goto again;
+ }
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ spin_unlock_bh(&conn->send_lock);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+ int rc;
+
+ if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
+ return -EPIPE;
+
+ if (conn->lgr->is_smcd) {
+ spin_lock_bh(&conn->send_lock);
+ rc = smcd_cdc_msg_send(conn);
+ spin_unlock_bh(&conn->send_lock);
+ } else {
+ rc = smcr_cdc_get_slot_and_msg_send(conn);
+ }
+
+ return rc;
+}
+
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
+{
+ wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
+}
+
+/* Send a SMC-D CDC header.
+ * This increments the free space available in our send buffer.
+ * Also update the confirmed receive buffer with what was sent to the peer.
+ */
+int smcd_cdc_msg_send(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ union smc_host_cursor curs;
+ struct smcd_cdc_msg cdc;
+ int rc, diff;
+
+ memset(&cdc, 0, sizeof(cdc));
+ cdc.common.type = SMC_CDC_MSG_TYPE;
+ curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs);
+ cdc.prod.wrap = curs.wrap;
+ cdc.prod.count = curs.count;
+ curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs);
+ cdc.cons.wrap = curs.wrap;
+ cdc.cons.count = curs.count;
+ cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags;
+ cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
+ rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
+ if (rc)
+ return rc;
+ smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn);
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ /* Calculate transmitted data and increment free send buffer space */
+ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+ &conn->tx_curs_sent);
+ /* increased by confirmed number of bytes */
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
+
+ smc_tx_sndbuf_nonfull(smc);
+ return rc;
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+ return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+ int *diff_prod)
+{
+ struct smc_connection *conn = &smc->conn;
+ char *base;
+
+ /* new data included urgent business */
+ smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
+ conn->urg_state = SMC_URG_VALID;
+ if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+ /* we'll skip the urgent byte, so don't account for it */
+ (*diff_prod)--;
+ base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
+ if (conn->urg_curs.count)
+ conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+ else
+ conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+ sk_send_sigurg(&smc->sk);
+}
+
+static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
+ struct smc_link *link)
+{
+ struct smc_connection *conn = &smc->conn;
+ u16 recv_seq = ntohs(cdc->seqno);
+ s16 diff;
+
+ /* check that seqnum was seen before */
+ diff = conn->local_rx_ctrl.seqno - recv_seq;
+ if (diff < 0) { /* diff larger than 0x7fff */
+ /* drop connection */
+ conn->out_of_sync = 1; /* prevent any further receives */
+ spin_lock_bh(&conn->send_lock);
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ conn->lnk = link;
+ spin_unlock_bh(&conn->send_lock);
+ sock_hold(&smc->sk); /* sock_put in abort_work */
+ if (!queue_work(smc_close_wq, &conn->abort_work))
+ sock_put(&smc->sk);
+ }
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+ struct smc_cdc_msg *cdc)
+{
+ union smc_host_cursor cons_old, prod_old;
+ struct smc_connection *conn = &smc->conn;
+ int diff_cons, diff_prod;
+
+ smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
+ smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
+ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+ diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+ &conn->local_rx_ctrl.cons);
+ if (diff_cons) {
+ /* peer_rmbe_space is decreased during data transfer with RDMA
+ * write
+ */
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ }
+
+ diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
+ &conn->local_rx_ctrl.prod);
+ if (diff_prod) {
+ if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+ smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
+ /* bytes_to_rcv is decreased in smc_recvmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff_prod, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
+ smp_mb__after_atomic();
+ smc->sk.sk_data_ready(&smc->sk);
+ } else {
+ if (conn->local_rx_ctrl.prod_flags.write_blocked)
+ smc->sk.sk_data_ready(&smc->sk);
+ if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+ conn->urg_state = SMC_URG_NOTYET;
+ }
+
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+ if ((diff_cons && smc_tx_prepared_sends(conn)) ||
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ conn->local_rx_ctrl.prod_flags.urg_data_pending)
+ smc_tx_sndbuf_nonempty(conn);
+
+ if (diff_cons && conn->urg_tx_pend &&
+ atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+ /* urg data confirmed by peer, indicate we're ready for more */
+ conn->urg_tx_pend = false;
+ smc->sk.sk_write_space(&smc->sk);
+ }
+
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ smc->sk.sk_err = ECONNRESET;
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ }
+ if (smc_cdc_rxed_any_close_or_senddone(conn)) {
+ smc->sk.sk_shutdown |= RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk)
+ smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+ smc_sock_set_flag(&smc->sk, SOCK_DONE);
+ sock_hold(&smc->sk); /* sock_put in close_work */
+ if (!queue_work(smc_close_wq, &conn->close_work))
+ sock_put(&smc->sk);
+ }
+}
+
+/* called under tasklet context */
+static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
+{
+ sock_hold(&smc->sk);
+ bh_lock_sock(&smc->sk);
+ smc_cdc_msg_recv_action(smc, cdc);
+ bh_unlock_sock(&smc->sk);
+ sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ
+ * handler to indicate update in the DMBE.
+ *
+ * Context:
+ * - tasklet context
+ */
+static void smcd_cdc_rx_tsklet(unsigned long data)
+{
+ struct smc_connection *conn = (struct smc_connection *)data;
+ struct smcd_cdc_msg *data_cdc;
+ struct smcd_cdc_msg cdc;
+ struct smc_sock *smc;
+
+ if (!conn || conn->killed)
+ return;
+
+ data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr;
+ smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn);
+ smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn);
+ smc = container_of(conn, struct smc_sock, conn);
+ smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc);
+}
+
+/* Initialize receive tasklet. Called from ISM device IRQ handler to start
+ * receiver side.
+ */
+void smcd_cdc_rx_init(struct smc_connection *conn)
+{
+ tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_cdc_msg *cdc = buf;
+ struct smc_connection *conn;
+ struct smc_link_group *lgr;
+ struct smc_sock *smc;
+
+ if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+ return; /* short message */
+ if (cdc->len != SMC_WR_TX_SIZE)
+ return; /* invalid message */
+
+ /* lookup connection */
+ lgr = smc_get_lgr(link);
+ read_lock_bh(&lgr->conns_lock);
+ conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conn || conn->out_of_sync)
+ return;
+ smc = container_of(conn, struct smc_sock, conn);
+
+ if (cdc->prod_flags.failover_validation) {
+ smc_cdc_msg_validate(smc, cdc, link);
+ return;
+ }
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+
+ smc_cdc_msg_recv(smc, cdc);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+ {
+ .handler = smc_cdc_rx_handler,
+ .type = SMC_CDC_MSG_TYPE
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_cdc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000..696cc11f2
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define SMC_CDC_MSG_TYPE 0xFE
+
+/* in network byte order */
+union smc_cdc_cursor { /* SMC cursor */
+ struct {
+ __be16 reserved;
+ __be16 wrap;
+ __be32 count;
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* 44 */
+ __be16 seqno;
+ __be32 token;
+ union smc_cdc_cursor prod;
+ union smc_cdc_cursor cons; /* piggy backed "ack" */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 reserved[18];
+};
+
+/* SMC-D cursor format */
+union smcd_cdc_cursor {
+ struct {
+ u16 wrap;
+ u32 count;
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ } __packed;
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* CDC message for SMC-D */
+struct smcd_cdc_msg {
+ struct smc_wr_rx_hdr common; /* Type = 0xFE */
+ u8 res1[7];
+ union smcd_cdc_cursor prod;
+ union smcd_cdc_cursor cons;
+ u8 res3[8];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+ return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+ struct smc_connection *conn)
+{
+ return smc_cdc_rxed_any_close(conn) ||
+ conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+ int value)
+{
+ curs->count += value;
+ if (curs->count >= size) {
+ curs->wrap++;
+ curs->count -= size;
+ }
+}
+
+/* Copy cursor src into tgt */
+static inline void smc_curs_copy(union smc_host_cursor *tgt,
+ union smc_host_cursor *src,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
+ union smc_cdc_cursor *src,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt,
+ union smcd_cdc_cursor *src,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ tgt->acurs = src->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new and
+ * difference cannot exceed size
+ */
+static inline int smc_curs_diff(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap != new->wrap)
+ return max_t(int, 0,
+ ((size - old->count) + new->count));
+
+ return max_t(int, 0, (new->count - old->count));
+}
+
+/* calculate cursor difference between old and new - returns negative
+ * value in case old > new
+ */
+static inline int smc_curs_comp(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap > new->wrap ||
+ (old->wrap == new->wrap && old->count > new->count))
+ return -smc_curs_diff(size, new, old);
+ return smc_curs_diff(size, old, new);
+}
+
+/* calculate cursor difference between old and new, where old <= new and
+ * difference may exceed size
+ */
+static inline int smc_curs_diff_large(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap < new->wrap)
+ return min_t(int,
+ (size - old->count) + new->count +
+ (new->wrap - old->wrap - 1) * size,
+ size);
+
+ if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */
+ return min_t(int,
+ (size - old->count) + new->count +
+ (new->wrap + 0xffff - old->wrap) * size,
+ size);
+
+ return max_t(int, 0, (new->count - old->count));
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+ union smc_host_cursor *local,
+ union smc_host_cursor *save,
+ struct smc_connection *conn)
+{
+ smc_curs_copy(save, local, conn);
+ peer->count = htonl(save->count);
+ peer->wrap = htons(save->wrap);
+ /* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+ struct smc_connection *conn,
+ union smc_host_cursor *save)
+{
+ struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
+
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(local->seqno);
+ peer->token = htonl(local->token);
+ smc_host_cursor_to_cdc(&peer->prod, &local->prod, save, conn);
+ smc_host_cursor_to_cdc(&peer->cons, &local->cons, save, conn);
+ peer->prod_flags = local->prod_flags;
+ peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+ union smc_cdc_cursor *peer,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp, old;
+ union smc_cdc_cursor net;
+
+ smc_curs_copy(&old, local, conn);
+ smc_curs_copy_net(&net, peer, conn);
+ temp.count = ntohl(net.count);
+ temp.wrap = ntohs(net.wrap);
+ if ((old.wrap > temp.wrap) && temp.wrap)
+ return;
+ if ((old.wrap == temp.wrap) &&
+ (old.count > temp.count))
+ return;
+ smc_curs_copy(local, &temp, conn);
+}
+
+static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ local->common.type = peer->common.type;
+ local->len = peer->len;
+ local->seqno = ntohs(peer->seqno);
+ local->token = ntohl(peer->token);
+ smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+ smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smcd_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp;
+
+ temp.wrap = peer->prod.wrap;
+ temp.count = peer->prod.count;
+ smc_curs_copy(&local->prod, &temp, conn);
+
+ temp.wrap = peer->cons.wrap;
+ temp.count = peer->cons.count;
+ smc_curs_copy(&local->cons, &temp, conn);
+ local->prod_flags = peer->cons.prod_flags;
+ local->conn_state_flags = peer->cons.conn_state_flags;
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ if (conn->lgr->is_smcd)
+ smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer, conn);
+ else
+ smcr_cdc_msg_to_host(local, peer, conn);
+}
+
+struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+ union smc_host_cursor cursor; /* tx sndbuf cursor sent */
+ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
+ u16 ctrl_seq; /* conn. tx sequence # */
+};
+
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_cdc_tx_pend **pend);
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+int smcd_cdc_msg_send(struct smc_connection *conn);
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf);
+int smc_cdc_init(void) __init;
+void smcd_cdc_rx_init(struct smc_connection *conn);
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000..5ee5b2ce2
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,784 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016, 2018
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <linux/inetdevice.h>
+#include <linux/if_ether.h>
+#include <linux/sched/signal.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
+
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
+#define SMC_CLC_RECV_BUF_LEN 100
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+/* eye catcher "SMCD" EBCDIC for CLC messages */
+static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
+
+static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN];
+
+/* check arriving CLC proposal */
+static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_hdr *hdr = &pclc->hdr;
+ struct smc_clc_v2_extension *v2_ext;
+
+ v2_ext = smc_get_clc_v2_ext(pclc);
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (hdr->version == SMC_V1) {
+ if (hdr->typev1 == SMC_TYPE_N)
+ return false;
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ } else {
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) +
+ sizeof(struct smc_clc_msg_smcd) +
+ (hdr->typev1 != SMC_TYPE_N ?
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) : 0) +
+ (hdr->typev2 != SMC_TYPE_N ?
+ sizeof(*v2_ext) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) +
+ (smcd_indicated(hdr->typev2) ?
+ sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid) :
+ 0) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ }
+ return true;
+}
+
+/* check arriving CLC accept or confirm */
+static bool
+smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
+{
+ struct smc_clc_msg_hdr *hdr = &clc_v2->hdr;
+
+ if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+ return false;
+ if (hdr->version == SMC_V1) {
+ if ((hdr->typev1 == SMC_TYPE_R &&
+ ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+ (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
+ return false;
+ } else {
+ if (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 &&
+ (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 +
+ sizeof(struct smc_clc_first_contact_ext)))
+ return false;
+ }
+ return true;
+}
+
+static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len)
+{
+ memset(fce, 0, sizeof(*fce));
+ fce->os_type = SMC_CLC_OS_LINUX;
+ fce->release = SMC_RELEASE;
+ memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname));
+ (*len) += sizeof(*fce);
+}
+
+/* check if received message has a correct header length and contains valid
+ * heading and trailing eyecatchers
+ */
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
+{
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2;
+ struct smc_clc_msg_proposal *pclc;
+ struct smc_clc_msg_decline *dclc;
+ struct smc_clc_msg_trail *trl;
+
+ if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
+ return false;
+ switch (clcm->type) {
+ case SMC_CLC_PROPOSAL:
+ pclc = (struct smc_clc_msg_proposal *)clcm;
+ if (!smc_clc_msg_prop_valid(pclc))
+ return false;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
+ break;
+ case SMC_CLC_ACCEPT:
+ case SMC_CLC_CONFIRM:
+ clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm;
+ if (!smc_clc_msg_acc_conf_valid(clc_v2))
+ return false;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) -
+ sizeof(*trl));
+ break;
+ case SMC_CLC_DECLINE:
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+ return false;
+ trl = &dclc->trl;
+ break;
+ default:
+ return false;
+ }
+ if (check_trl &&
+ memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
+ return false;
+ return true;
+}
+
+/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
+static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
+ struct smc_clc_msg_proposal_prefix *prop)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+ const struct in_ifaddr *ifa;
+
+ if (!in_dev)
+ return -ENODEV;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (!inet_ifa_match(ipv4, ifa))
+ continue;
+ prop->prefix_len = inet_mask_len(ifa->ifa_mask);
+ prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
+ /* prop->ipv6_prefixes_cnt = 0; already done by memset before */
+ return 0;
+ }
+ return -ENOENT;
+}
+
+/* fill CLC proposal msg with ipv6 prefixes from device */
+static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
+ struct smc_clc_msg_proposal_prefix *prop,
+ struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
+ struct inet6_ifaddr *ifa;
+ int cnt = 0;
+
+ if (!in6_dev)
+ return -ENODEV;
+ /* use a maximum of 8 IPv6 prefixes from device */
+ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
+ &ifa->addr, ifa->prefix_len);
+ ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
+ cnt++;
+ if (cnt == SMC_CLC_MAX_V6_PREFIX)
+ break;
+ }
+ prop->ipv6_prefixes_cnt = cnt;
+ if (cnt)
+ return 0;
+#endif
+ return -ENOENT;
+}
+
+/* retrieve and set prefixes in CLC proposal msg */
+static int smc_clc_prfx_set(struct socket *clcsock,
+ struct smc_clc_msg_proposal_prefix *prop,
+ struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct sockaddr_storage addrs;
+ struct sockaddr_in6 *addr6;
+ struct sockaddr_in *addr;
+ int rc = -ENOENT;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+ /* get address to which the internal TCP socket is bound */
+ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0)
+ goto out_rel;
+ /* analyze IP specific data of net_device belonging to TCP socket */
+ addr6 = (struct sockaddr_in6 *)&addrs;
+ rcu_read_lock();
+ if (addrs.ss_family == PF_INET) {
+ /* IPv4 */
+ addr = (struct sockaddr_in *)&addrs;
+ rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+ } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
+ /* mapped IPv4 address - peer is IPv4 only */
+ rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
+ prop);
+ } else {
+ /* IPv6 */
+ rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
+ }
+ rcu_read_unlock();
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* match ipv4 addrs of dev against addr in CLC proposal */
+static int smc_clc_prfx_match4_rcu(struct net_device *dev,
+ struct smc_clc_msg_proposal_prefix *prop)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
+
+ if (!in_dev)
+ return -ENODEV;
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
+ inet_ifa_match(prop->outgoing_subnet, ifa))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* match ipv6 addrs of dev against addrs in CLC proposal */
+static int smc_clc_prfx_match6_rcu(struct net_device *dev,
+ struct smc_clc_msg_proposal_prefix *prop)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct inet6_ifaddr *ifa;
+ int i, max;
+
+ if (!in6_dev)
+ return -ENODEV;
+ /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
+ ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
+ max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
+ list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ for (i = 0; i < max; i++) {
+ if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
+ ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
+ ifa->prefix_len))
+ return 0;
+ }
+ }
+#endif
+ return -ENOENT;
+}
+
+/* check if proposed prefixes match one of our device prefixes */
+int smc_clc_prfx_match(struct socket *clcsock,
+ struct smc_clc_msg_proposal_prefix *prop)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ int rc;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+ rcu_read_lock();
+ if (!prop->ipv6_prefixes_cnt)
+ rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+ else
+ rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
+ rcu_read_unlock();
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type, unsigned long timeout)
+{
+ long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
+ struct sock *clc_sk = smc->clcsock->sk;
+ struct smc_clc_msg_hdr *clcm = buf;
+ struct msghdr msg = {NULL, 0};
+ int reason_code = 0;
+ struct kvec vec = {buf, buflen};
+ int len, datlen, recvlen;
+ bool check_trl = true;
+ int krflags;
+
+ /* peek the first few bytes to determine length of data to receive
+ * so we don't consume any subsequent CLC message or payload data
+ * in the TCP byte stream
+ */
+ /*
+ * Caller must make sure that buflen is no less than
+ * sizeof(struct smc_clc_msg_hdr)
+ */
+ krflags = MSG_PEEK | MSG_WAITALL;
+ clc_sk->sk_rcvtimeo = timeout;
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1,
+ sizeof(struct smc_clc_msg_hdr));
+ len = sock_recvmsg(smc->clcsock, &msg, krflags);
+ if (signal_pending(current)) {
+ reason_code = -EINTR;
+ clc_sk->sk_err = EINTR;
+ smc->sk.sk_err = EINTR;
+ goto out;
+ }
+ if (clc_sk->sk_err) {
+ reason_code = -clc_sk->sk_err;
+ if (clc_sk->sk_err == EAGAIN &&
+ expected_type == SMC_CLC_DECLINE)
+ clc_sk->sk_err = 0; /* reset for fallback usage */
+ else
+ smc->sk.sk_err = clc_sk->sk_err;
+ goto out;
+ }
+ if (!len) { /* peer has performed orderly shutdown */
+ smc->sk.sk_err = ECONNRESET;
+ reason_code = -ECONNRESET;
+ goto out;
+ }
+ if (len < 0) {
+ if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE)
+ smc->sk.sk_err = -len;
+ reason_code = len;
+ goto out;
+ }
+ datlen = ntohs(clcm->length);
+ if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+ (clcm->version < SMC_V1) ||
+ ((clcm->type != SMC_CLC_DECLINE) &&
+ (clcm->type != expected_type))) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+
+ /* receive the complete CLC message */
+ memset(&msg, 0, sizeof(struct msghdr));
+ if (datlen > buflen) {
+ check_trl = false;
+ recvlen = buflen;
+ } else {
+ recvlen = datlen;
+ }
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
+ krflags = MSG_WAITALL;
+ len = sock_recvmsg(smc->clcsock, &msg, krflags);
+ if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+ datlen -= len;
+ while (datlen) {
+ u8 tmp[SMC_CLC_RECV_BUF_LEN];
+
+ vec.iov_base = &tmp;
+ vec.iov_len = SMC_CLC_RECV_BUF_LEN;
+ /* receive remaining proposal message */
+ recvlen = datlen > SMC_CLC_RECV_BUF_LEN ?
+ SMC_CLC_RECV_BUF_LEN : datlen;
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
+ len = sock_recvmsg(smc->clcsock, &msg, krflags);
+ datlen -= len;
+ }
+ if (clcm->type == SMC_CLC_DECLINE) {
+ struct smc_clc_msg_decline *dclc;
+
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ reason_code = SMC_CLC_DECL_PEERDECL;
+ smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
+ if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
+ SMC_FIRST_CONTACT_MASK) {
+ smc->conn.lgr->sync_err = 1;
+ smc_lgr_terminate_sched(smc->conn.lgr);
+ }
+ }
+
+out:
+ clc_sk->sk_rcvtimeo = rcvtimeo;
+ return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
+{
+ struct smc_clc_msg_decline dclc;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ memset(&dclc, 0, sizeof(dclc));
+ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ dclc.hdr.type = SMC_CLC_DECLINE;
+ dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+ dclc.hdr.version = version;
+ dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
+ dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
+ SMC_FIRST_CONTACT_MASK : 0;
+ if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) &&
+ smc_ib_is_valid_local_systemid())
+ memcpy(dclc.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ dclc.peer_diagnosis = htonl(peer_diag_info);
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &dclc;
+ vec.iov_len = sizeof(struct smc_clc_msg_decline);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_decline));
+ if (len < 0 || len < sizeof(struct smc_clc_msg_decline))
+ len = -EPROTO;
+ return len > 0 ? 0 : len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_proposal *pclc_base;
+ struct smc_clc_smcd_gid_chid *gidchids;
+ struct smc_clc_msg_proposal_area *pclc;
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct smc_clc_v2_extension *v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ struct smc_clc_msg_trail *trl;
+ int len, i, plen, rc;
+ int reason_code = 0;
+ struct kvec vec[8];
+ struct msghdr msg;
+
+ pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
+ if (!pclc)
+ return -ENOMEM;
+
+ pclc_base = &pclc->pclc_base;
+ pclc_smcd = &pclc->pclc_smcd;
+ pclc_prfx = &pclc->pclc_prfx;
+ ipv6_prfx = pclc->pclc_prfx_ipv6;
+ v2_ext = &pclc->pclc_v2_ext;
+ smcd_v2_ext = &pclc->pclc_smcd_v2_ext;
+ gidchids = pclc->pclc_gidchids;
+ trl = &pclc->pclc_trl;
+
+ pclc_base->hdr.version = SMC_V2;
+ pclc_base->hdr.typev1 = ini->smc_type_v1;
+ pclc_base->hdr.typev2 = ini->smc_type_v2;
+ plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl);
+
+ /* retrieve ip prefixes for CLC proposal msg */
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
+ if (rc) {
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ kfree(pclc);
+ return SMC_CLC_DECL_CNFERR;
+ }
+ pclc_base->hdr.typev1 = SMC_TYPE_N;
+ } else {
+ pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
+ plen += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
+
+ /* build SMC Proposal CLC message */
+ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ pclc_base->hdr.type = SMC_CLC_PROPOSAL;
+ if (smcr_indicated(ini->smc_type_v1)) {
+ /* add SMC-R specifics */
+ memcpy(pclc_base->lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ ETH_ALEN);
+ }
+ if (smcd_indicated(ini->smc_type_v1)) {
+ /* add SMC-D specifics */
+ if (ini->ism_dev[0]) {
+ pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid);
+ pclc_smcd->ism.chid =
+ htons(smc_ism_get_chid(ini->ism_dev[0]));
+ }
+ }
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ pclc_smcd->v2_ext_offset = 0;
+ } else {
+ u16 v2_ext_offset;
+ u8 *eid = NULL;
+
+ v2_ext_offset = sizeof(*pclc_smcd) -
+ offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
+ if (ini->smc_type_v1 != SMC_TYPE_N)
+ v2_ext_offset += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
+ v2_ext->hdr.eid_cnt = 0;
+ v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
+ v2_ext->hdr.flag.release = SMC_RELEASE;
+ v2_ext->hdr.flag.seid = 1;
+ v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
+ offsetofend(struct smc_clnt_opts_area_hdr,
+ smcd_v2_ext_offset) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ if (ini->ism_dev[0])
+ smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+ else
+ smc_ism_get_system_eid(ini->ism_dev[1], &eid);
+ if (eid)
+ memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
+ plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ for (i = 1; i <= ini->ism_offered_cnt; i++) {
+ gidchids[i - 1].gid =
+ htonll(ini->ism_dev[i]->local_gid);
+ gidchids[i - 1].chid =
+ htons(smc_ism_get_chid(ini->ism_dev[i]));
+ }
+ plen += ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ pclc_base->hdr.length = htons(plen);
+ memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ /* send SMC Proposal CLC message */
+ memset(&msg, 0, sizeof(msg));
+ i = 0;
+ vec[i].iov_base = pclc_base;
+ vec[i++].iov_len = sizeof(*pclc_base);
+ vec[i].iov_base = pclc_smcd;
+ vec[i++].iov_len = sizeof(*pclc_smcd);
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ vec[i].iov_base = pclc_prfx;
+ vec[i++].iov_len = sizeof(*pclc_prfx);
+ if (pclc_prfx->ipv6_prefixes_cnt > 0) {
+ vec[i].iov_base = ipv6_prfx;
+ vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
+ if (ini->smc_type_v2 != SMC_TYPE_N) {
+ vec[i].iov_base = v2_ext;
+ vec[i++].iov_len = sizeof(*v2_ext);
+ vec[i].iov_base = smcd_v2_ext;
+ vec[i++].iov_len = sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ vec[i].iov_base = gidchids;
+ vec[i++].iov_len = ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ vec[i].iov_base = trl;
+ vec[i++].iov_len = sizeof(*trl);
+ /* due to the few bytes needed for clc-handshake this cannot block */
+ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
+ if (len < 0) {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ } else if (len < ntohs(pclc_base->hdr.length)) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ }
+
+ kfree(pclc);
+ return reason_code;
+}
+
+/* build and send CLC CONFIRM / ACCEPT message */
+static int smc_clc_send_confirm_accept(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2,
+ int first_contact, u8 version)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_first_contact_ext fce;
+ struct smc_clc_msg_trail trl;
+ struct kvec vec[3];
+ struct msghdr msg;
+ int i, len;
+
+ /* send SMC Confirm CLC msg */
+ clc = (struct smc_clc_msg_accept_confirm *)clc_v2;
+ clc->hdr.version = version; /* SMC version */
+ if (first_contact)
+ clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK;
+ if (conn->lgr->is_smcd) {
+ /* SMC-D specific settings */
+ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ clc->hdr.typev1 = SMC_TYPE_D;
+ clc->d0.gid = conn->lgr->smcd->local_gid;
+ clc->d0.token = conn->rmb_desc->token;
+ clc->d0.dmbe_size = conn->rmbe_size_short;
+ clc->d0.dmbe_idx = 0;
+ memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ if (version == SMC_V1) {
+ clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ } else {
+ u8 *eid = NULL;
+
+ clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd));
+ smc_ism_get_system_eid(conn->lgr->smcd, &eid);
+ if (eid)
+ memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN);
+ len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
+ if (first_contact)
+ smc_clc_fill_fce(&fce, &len);
+ clc_v2->hdr.length = htons(len);
+ }
+ memcpy(trl.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ } else {
+ struct smc_link *link = conn->lnk;
+
+ /* SMC-R specific settings */
+ link = conn->lnk;
+ memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ clc->hdr.typev1 = SMC_TYPE_R;
+ clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(clc->r0.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ hton24(clc->r0.qpn, link->roce_qp->qp_num);
+ clc->r0.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
+ clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
+ switch (clc->hdr.type) {
+ case SMC_CLC_ACCEPT:
+ clc->r0.qp_mtu = link->path_mtu;
+ break;
+ case SMC_CLC_CONFIRM:
+ clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ break;
+ }
+ clc->r0.rmbe_size = conn->rmbe_size_short;
+ clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[link->link_idx].sgl));
+ hton24(clc->r0.psn, link->psn_initial);
+ memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ }
+
+ memset(&msg, 0, sizeof(msg));
+ i = 0;
+ vec[i].iov_base = clc_v2;
+ if (version > SMC_V1)
+ vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl);
+ else
+ vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+ SMCD_CLC_ACCEPT_CONFIRM_LEN :
+ SMCR_CLC_ACCEPT_CONFIRM_LEN) -
+ sizeof(trl);
+ if (version > SMC_V1 && first_contact) {
+ vec[i].iov_base = &fce;
+ vec[i++].iov_len = sizeof(fce);
+ }
+ vec[i].iov_base = &trl;
+ vec[i++].iov_len = sizeof(trl);
+ return kernel_sendmsg(smc->clcsock, &msg, vec, 1,
+ ntohs(clc->hdr.length));
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version)
+{
+ struct smc_clc_msg_accept_confirm_v2 cclc_v2;
+ int reason_code = 0;
+ int len;
+
+ /* send SMC Confirm CLC msg */
+ memset(&cclc_v2, 0, sizeof(cclc_v2));
+ cclc_v2.hdr.type = SMC_CLC_CONFIRM;
+ len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
+ version);
+ if (len < ntohs(cclc_v2.hdr.length)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+ return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
+ u8 version)
+{
+ struct smc_clc_msg_accept_confirm_v2 aclc_v2;
+ int len;
+
+ memset(&aclc_v2, 0, sizeof(aclc_v2));
+ aclc_v2.hdr.type = SMC_CLC_ACCEPT;
+ len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
+ version);
+ if (len < ntohs(aclc_v2.hdr.length))
+ len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
+
+ return len > 0 ? 0 : len;
+}
+
+void __init smc_clc_init(void)
+{
+ struct new_utsname *u;
+
+ memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */
+ u = utsname();
+ memcpy(smc_hostname, u->nodename,
+ min_t(size_t, strlen(u->nodename), sizeof(smc_hostname)));
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000..c579d1d59
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL 0x01
+#define SMC_CLC_ACCEPT 0x02
+#define SMC_CLC_CONFIRM 0x03
+#define SMC_CLC_DECLINE 0x04
+
+#define SMC_TYPE_R 0 /* SMC-R only */
+#define SMC_TYPE_D 1 /* SMC-D only */
+#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
+#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
+#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
+#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
+#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
+#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
+#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
+#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
+#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
+#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
+#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
+#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
+#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
+#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
+#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
+#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
+#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */
+#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */
+#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */
+#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */
+#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
+#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
+#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */
+
+#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
+
+struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+ u8 type; /* proposal / accept / confirm / decline */
+ __be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 version : 4,
+ typev2 : 2,
+ typev1 : 2;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 typev1 : 2,
+ typev2 : 2,
+ version : 4;
+#endif
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_trail { /* trailer of clc messages */
+ u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local { /* header2 of clc messages */
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+ u8 gid[16]; /* gid of ib_device port */
+ u8 mac[6]; /* mac of ib_device port */
+};
+
+/* Struct would be 4 byte aligned, but it is used in an array that is sent
+ * to peers and must conform to RFC7609, hence we need to use packed here.
+ */
+struct smc_clc_ipv6_prefix {
+ struct in6_addr prefix;
+ u8 prefix_len;
+} __packed; /* format defined in RFC7609 */
+
+#if defined(__BIG_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 release : 4,
+ rsvd : 3,
+ seid : 1;
+};
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 seid : 1,
+ rsvd : 3,
+ release : 4;
+};
+#endif
+
+struct smc_clnt_opts_area_hdr {
+ u8 eid_cnt; /* number of user defined EIDs */
+ u8 ism_gid_cnt; /* number of ISMv2 GIDs */
+ u8 reserved1;
+ struct smc_clc_v2_flag flag;
+ u8 reserved2[2];
+ __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
+};
+
+struct smc_clc_smcd_gid_chid {
+ __be64 gid; /* ISM GID */
+ __be16 chid; /* ISMv2 CHID */
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+struct smc_clc_v2_extension {
+ struct smc_clnt_opts_area_hdr hdr;
+ u8 roce[16]; /* RoCEv2 GID */
+ u8 reserved[16];
+ u8 user_eids[][SMC_MAX_EID_LEN];
+};
+
+struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
+ __be32 outgoing_subnet; /* subnet mask */
+ u8 prefix_len; /* number of significant bits in mask */
+ u8 reserved[2];
+ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
+} __aligned(4);
+
+struct smc_clc_msg_smcd { /* SMC-D GID information */
+ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
+ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
+ u8 reserved[28];
+};
+
+struct smc_clc_smcd_v2_extension {
+ u8 system_eid[SMC_MAX_EID_LEN];
+ u8 reserved[16];
+ struct smc_clc_smcd_gid_chid gidchid[];
+};
+
+struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+} __aligned(4);
+
+#define SMC_CLC_MAX_V6_PREFIX 8
+
+struct smc_clc_msg_proposal_area {
+ struct smc_clc_msg_proposal pclc_base;
+ struct smc_clc_msg_smcd pclc_smcd;
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
+ struct smc_clc_v2_extension pclc_v2_ext;
+ struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
+ struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS];
+ struct smc_clc_msg_trail pclc_trl;
+};
+
+struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token; /* unique connection id */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+} __packed;
+
+struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ dmbe_size : 4;
+#endif
+ u16 reserved4;
+ __be32 linkid; /* Link identifier */
+} __packed;
+
+#define SMC_CLC_OS_ZOS 1
+#define SMC_CLC_OS_LINUX 2
+#define SMC_CLC_OS_AIX 3
+
+struct smc_clc_first_contact_ext {
+ u8 reserved1;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ release : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 release : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[2];
+ u8 hostname[SMC_MAX_HOSTNAME_LEN];
+};
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ u32 reserved5[3];
+ };
+ };
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ __be16 chid;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved5[8];
+ };
+ };
+};
+
+struct smc_clc_msg_decline { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
+} __aligned(4);
+
+/* determine start of the prefix area within the proposal message */
+static inline struct smc_clc_msg_proposal_prefix *
+smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
+{
+ return (struct smc_clc_msg_proposal_prefix *)
+ ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
+}
+
+static inline bool smcr_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B;
+}
+
+static inline bool smcd_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
+}
+
+/* get SMC-D info from proposal message */
+static inline struct smc_clc_msg_smcd *
+smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
+{
+ if (smcd_indicated(prop->hdr.typev1) &&
+ ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+ return NULL;
+
+ return (struct smc_clc_msg_smcd *)(prop + 1);
+}
+
+static inline struct smc_clc_v2_extension *
+smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
+{
+ struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);
+
+ if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_v2_extension *)
+ ((u8 *)prop_smcd +
+ offsetof(struct smc_clc_msg_smcd, v2_ext_offset) +
+ sizeof(prop_smcd->v2_ext_offset) +
+ ntohs(prop_smcd->v2_ext_offset));
+}
+
+static inline struct smc_clc_smcd_v2_extension *
+smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
+{
+ if (!prop_v2ext)
+ return NULL;
+ if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_smcd_v2_extension *)
+ ((u8 *)prop_v2ext +
+ offsetof(struct smc_clc_v2_extension, hdr) +
+ offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) +
+ sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) +
+ ntohs(prop_v2ext->hdr.smcd_v2_ext_offset));
+}
+
+struct smcd_dev;
+struct smc_init_info;
+
+int smc_clc_prfx_match(struct socket *clcsock,
+ struct smc_clc_msg_proposal_prefix *prop);
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type, unsigned long timeout);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version);
+int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
+ u8 version);
+void smc_clc_init(void) __init;
+
+#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000..bcd3ea894
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,500 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing - normal and abnormal
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+/* release the clcsock that is assigned to the smc_sock */
+void smc_clcsock_release(struct smc_sock *smc)
+{
+ struct socket *tcp;
+
+ if (smc->listen_smc && current_work() != &smc->smc_listen_work)
+ cancel_work_sync(&smc->smc_listen_work);
+ mutex_lock(&smc->clcsock_release_lock);
+ if (smc->clcsock) {
+ tcp = smc->clcsock;
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+ mutex_unlock(&smc->clcsock_release_lock);
+}
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = smc_accept_dequeue(parent, NULL)))
+ smc_close_non_accepted(sk);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+
+ if (!timeout)
+ return;
+
+ if (!smc_tx_prepared_sends(&smc->conn))
+ return;
+
+ smc->wait_close_tx_prepared = 1;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn) ||
+ READ_ONCE(sk->sk_err) == ECONNABORTED ||
+ READ_ONCE(sk->sk_err) == ECONNRESET ||
+ smc->conn.killed,
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+ if (smc->wait_close_tx_prepared)
+ /* wake up socket closing */
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+ if (atomic_read(&conn->bytes_to_rcv))
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+ if (conn->killed)
+ return -EPIPE;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+int smc_close_abort(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static void smc_close_cancel_work(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+
+ release_sock(sk);
+ if (cancel_work_sync(&smc->conn.close_work))
+ sock_put(sk);
+ cancel_delayed_work_sync(&smc->conn.tx_work);
+ lock_sock(sk);
+}
+
+/* terminate smc socket abnormally - active abort
+ * link group is terminated, i.e. RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+ bool release_clcsock = false;
+
+ if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) {
+ sk->sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+ }
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* (postponed) passive closing */
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
+ sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PROCESSABORT:
+ case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
+ sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
+ break;
+ case SMC_INIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ break;
+ }
+
+ smc_sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_state_change(sk);
+
+ if (release_clcsock) {
+ release_sock(sk);
+ smc_clcsock_release(smc);
+ lock_sock(sk);
+ }
+}
+
+static inline bool smc_close_sent_any_close(struct smc_connection *conn)
+{
+ return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort ||
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ long timeout;
+ int rc = 0;
+ int rc1 = 0;
+
+ timeout = current->flags & PF_EXITING ?
+ 0 : sock_flag(sk, SOCK_LINGER) ?
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
+
+ old_state = sk->sk_state;
+again:
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk); /* wake up accept */
+ if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
+ smc->clcsock->sk->sk_user_data = NULL;
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ }
+ smc_close_cleanup_listen(sk);
+ release_sock(sk);
+ flush_work(&smc->tcp_listen_work);
+ lock_sock(sk);
+ break;
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_ACTIVE) {
+ /* send close request */
+ rc = smc_close_final(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+
+ /* actively shutdown clcsock before peer close it,
+ * prevent peer from entering TIME_WAIT state.
+ */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc1 = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ rc = rc ? rc : rc1;
+ }
+ } else {
+ /* peer event has changed the state */
+ goto again;
+ }
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ /* socket already shutdown wr or both (active close) */
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(conn)) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_APPCLOSEWAIT1 &&
+ sk->sk_state != SMC_APPCLOSEWAIT2)
+ goto again;
+ /* confirm close from peer */
+ rc = smc_close_final(conn);
+ if (smc_cdc_rxed_any_close(conn)) {
+ /* peer has closed the socket already */
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* postponed passive closing */
+ } else {
+ /* peer has just issued a shutdown write */
+ sk->sk_state = SMC_PEERFINCLOSEWAIT;
+ }
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(conn)) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PROCESSABORT:
+ rc = smc_close_abort(conn);
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(sk);
+ return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ case SMC_ACTIVE:
+ case SMC_APPCLOSEWAIT1:
+ sk->sk_state = SMC_PROCESSABORT;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PROCESSABORT;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !smc_close_sent_any_close(&smc->conn))
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_PROCESSABORT;
+ else
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_PROCESSABORT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+}
+
+/* Either some kind of closing has been received: peer_conn_closed,
+ * peer_conn_abort, or peer_done_writing
+ * or the link group of the connection terminates abnormally.
+ */
+static void smc_close_passive_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ close_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_cdc_conn_state_flags *rxflags;
+ bool release_clcsock = false;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ lock_sock(sk);
+ old_state = sk->sk_state;
+
+ rxflags = &conn->local_rx_ctrl.conn_state_flags;
+ if (rxflags->peer_conn_abort) {
+ /* peer has not received all data */
+ smc_close_passive_abort_received(smc);
+ release_sock(&smc->sk);
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(&smc->sk);
+ goto wakeup;
+ }
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ break;
+ case SMC_ACTIVE:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ /* postpone sock_put() for passive closing to cover
+ * received SEND_SHUTDOWN as well
+ */
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ if (rxflags->peer_done_writing)
+ sk->sk_state = SMC_PEERCLOSEWAIT2;
+ fallthrough;
+ /* to check for closing */
+ case SMC_PEERCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ break;
+ if (sock_flag(sk, SOCK_DEAD) &&
+ smc_close_sent_any_close(conn)) {
+ /* smc_release has already been called locally */
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_APPFINCLOSEWAIT;
+ }
+ sock_put(sk); /* passive closing */
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ if (smc_cdc_rxed_any_close(conn)) {
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* passive closing */
+ }
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ /* postpone sock_put() for passive closing to cover
+ * received SEND_SHUTDOWN as well
+ */
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+wakeup:
+ sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+ if (old_state != sk->sk_state) {
+ sk->sk_state_change(sk);
+ if ((sk->sk_state == SMC_CLOSED) &&
+ (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
+ smc_conn_free(conn);
+ if (smc->clcsock)
+ release_clcsock = true;
+ }
+ }
+ release_sock(sk);
+ if (release_clcsock)
+ smc_clcsock_release(smc);
+ sock_put(sk); /* sock_hold done by schedulers of close_work */
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ long timeout;
+ int rc = 0;
+
+ timeout = current->flags & PF_EXITING ?
+ 0 : sock_flag(sk, SOCK_LINGER) ?
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
+
+ old_state = sk->sk_state;
+again:
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE)
+ goto again;
+ /* send close wr request */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ /* passive close */
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_delayed_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_APPCLOSEWAIT1)
+ goto again;
+ /* confirm close from peer */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_APPCLOSEWAIT2;
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_PEERABORTWAIT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(sk);
+ return rc;
+}
+
+/* Initialize close properties on connection establishment. */
+void smc_close_init(struct smc_sock *smc)
+{
+ INIT_WORK(&smc->conn.close_work, smc_close_passive_work);
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000..634fea2b7
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+int smc_close_shutdown_write(struct smc_sock *smc);
+void smc_close_init(struct smc_sock *smc);
+void smc_clcsock_release(struct smc_sock *smc);
+int smc_close_abort(struct smc_connection *conn);
+void smc_close_active_abort(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000..ab9ecdd1a
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,1975 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Basic Transport Functions exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/reboot.h>
+#include <linux/mutex.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+#include "smc_ism.h"
+
+#define SMC_LGR_NUM_INCR 256
+#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
+#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
+
+static struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+ .num = 0,
+};
+
+static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
+static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
+
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc);
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
+
+static void smc_link_down_work(struct work_struct *work);
+
+/* return head of link group list and its lock for a given link group */
+static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
+ spinlock_t **lgr_lock)
+{
+ if (lgr->is_smcd) {
+ *lgr_lock = &lgr->smcd->lgr_lock;
+ return &lgr->smcd->lgr_list;
+ }
+
+ *lgr_lock = &smc_lgr_list.lock;
+ return &smc_lgr_list.list;
+}
+
+static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
+{
+ /* client link group creation always follows the server link group
+ * creation. For client use a somewhat higher removal delay time,
+ * otherwise there is a risk of out-of-sync link groups.
+ */
+ if (!lgr->freeing) {
+ mod_delayed_work(system_wq, &lgr->free_work,
+ (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
+ SMC_LGR_FREE_DELAY_CLNT :
+ SMC_LGR_FREE_DELAY_SERV);
+ }
+}
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @smc connection to register
+ * Returns 0 on success, != otherwise.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+ struct rb_node **link, *parent = NULL;
+ u32 token = conn->alert_token_local;
+
+ link = &conn->lgr->conns_all.rb_node;
+ while (*link) {
+ struct smc_connection *cur = rb_entry(*link,
+ struct smc_connection, alert_node);
+
+ parent = *link;
+ if (cur->alert_token_local > token)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ /* Put the new node there */
+ rb_link_node(&conn->alert_node, parent, link);
+ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* assign an SMC-R link to the connection */
+static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first)
+{
+ enum smc_link_state expected = first ? SMC_LNK_ACTIVATING :
+ SMC_LNK_ACTIVE;
+ int i, j;
+
+ /* do link balancing */
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &conn->lgr->lnk[i];
+
+ if (lnk->state != expected || lnk->link_is_asym)
+ continue;
+ if (conn->lgr->role == SMC_CLNT) {
+ conn->lnk = lnk; /* temporary, SMC server assigns link*/
+ break;
+ }
+ if (conn->lgr->conns_num % 2) {
+ for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ struct smc_link *lnk2;
+
+ lnk2 = &conn->lgr->lnk[j];
+ if (lnk2->state == expected &&
+ !lnk2->link_is_asym) {
+ conn->lnk = lnk2;
+ break;
+ }
+ }
+ }
+ if (!conn->lnk)
+ conn->lnk = lnk;
+ break;
+ }
+ if (!conn->lnk)
+ return SMC_CLC_DECL_NOACTLINK;
+ return 0;
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static int smc_lgr_register_conn(struct smc_connection *conn, bool first)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ static atomic_t nexttoken = ATOMIC_INIT(0);
+ int rc;
+
+ if (!conn->lgr->is_smcd) {
+ rc = smcr_lgr_conn_assign_link(conn, first);
+ if (rc)
+ return rc;
+ }
+ /* find a new alert_token_local value not yet used by some connection
+ * in this link group
+ */
+ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+ while (!conn->alert_token_local) {
+ conn->alert_token_local = atomic_inc_return(&nexttoken);
+ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+ conn->alert_token_local = 0;
+ }
+ smc_lgr_add_alert_token(conn);
+ conn->lgr->conns_num++;
+ return 0;
+}
+
+/* Unregister connection and reset the alert token of the given connection<
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_link_group *lgr = conn->lgr;
+
+ rb_erase(&conn->alert_node, &lgr->conns_all);
+ lgr->conns_num--;
+ conn->alert_token_local = 0;
+ sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection from lgr
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+ write_lock_bh(&lgr->conns_lock);
+ if (conn->alert_token_local) {
+ __smc_lgr_unregister_conn(conn);
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ conn->lgr = NULL;
+}
+
+void smc_lgr_cleanup_early(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+ spinlock_t *lgr_lock;
+
+ if (!lgr)
+ return;
+
+ smc_conn_free(conn);
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ /* do not use this link group for new connections */
+ if (!list_empty(&lgr->list))
+ list_del_init(&lgr->list);
+ spin_unlock_bh(lgr_lock);
+ __smc_lgr_terminate(lgr, true);
+}
+
+static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
+{
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (smc_link_sendable(lnk))
+ lnk->state = SMC_LNK_INACTIVE;
+ }
+ wake_up_all(&lgr->llc_msg_waiter);
+ wake_up_all(&lgr->llc_flow_waiter);
+}
+
+static void smc_lgr_free(struct smc_link_group *lgr);
+
+static void smc_lgr_free_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(to_delayed_work(work),
+ struct smc_link_group,
+ free_work);
+ spinlock_t *lgr_lock;
+ bool conns;
+
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (lgr->freeing) {
+ spin_unlock_bh(lgr_lock);
+ return;
+ }
+ read_lock_bh(&lgr->conns_lock);
+ conns = RB_EMPTY_ROOT(&lgr->conns_all);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conns) { /* number of lgr connections is no longer zero */
+ spin_unlock_bh(lgr_lock);
+ return;
+ }
+ list_del_init(&lgr->list); /* remove from smc_lgr_list */
+ lgr->freeing = 1; /* this instance does the freeing, no new schedule */
+ spin_unlock_bh(lgr_lock);
+ cancel_delayed_work(&lgr->free_work);
+
+ if (!lgr->is_smcd && !lgr->terminating)
+ smc_llc_send_link_delete_all(lgr, true,
+ SMC_LLC_DEL_PROG_INIT_TERM);
+ if (lgr->is_smcd && !lgr->terminating)
+ smc_ism_signal_shutdown(lgr);
+ if (!lgr->is_smcd)
+ smcr_lgr_link_deactivate_all(lgr);
+ smc_lgr_free(lgr);
+}
+
+static void smc_lgr_terminate_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ terminate_work);
+
+ __smc_lgr_terminate(lgr, true);
+}
+
+/* return next unique link id for the lgr */
+static u8 smcr_next_link_id(struct smc_link_group *lgr)
+{
+ u8 link_id;
+ int i;
+
+ while (1) {
+again:
+ link_id = ++lgr->next_link_id;
+ if (!link_id) /* skip zero as link_id */
+ link_id = ++lgr->next_link_id;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (smc_link_usable(&lgr->lnk[i]) &&
+ lgr->lnk[i].link_id == link_id)
+ goto again;
+ }
+ break;
+ }
+ return link_id;
+}
+
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini)
+{
+ u8 rndvec[3];
+ int rc;
+
+ get_device(&ini->ib_dev->ibdev->dev);
+ atomic_inc(&ini->ib_dev->lnk_cnt);
+ lnk->link_id = smcr_next_link_id(lgr);
+ lnk->lgr = lgr;
+ lnk->link_idx = link_idx;
+ lnk->smcibdev = ini->ib_dev;
+ lnk->ibport = ini->ib_port;
+ lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
+ smc_llc_link_set_uid(lnk);
+ INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
+ if (!ini->ib_dev->initialized) {
+ rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev);
+ if (rc)
+ goto out;
+ }
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+ (rndvec[2] << 16);
+ rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+ ini->vlan_id, lnk->gid, &lnk->sgid_index);
+ if (rc)
+ goto out;
+ rc = smc_llc_link_init(lnk);
+ if (rc)
+ goto out;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ lnk->state = SMC_LNK_ACTIVATING;
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+clear_llc_lnk:
+ smc_llc_link_clear(lnk, false);
+out:
+ put_device(&ini->ib_dev->ibdev->dev);
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ if (!atomic_dec_return(&ini->ib_dev->lnk_cnt))
+ wake_up(&ini->ib_dev->lnks_deleted);
+ return rc;
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_link_group *lgr;
+ struct list_head *lgr_list;
+ struct smc_link *lnk;
+ spinlock_t *lgr_lock;
+ u8 link_idx;
+ int rc = 0;
+ int i;
+
+ if (ini->is_smcd && ini->vlan_id) {
+ if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected],
+ ini->vlan_id)) {
+ rc = SMC_CLC_DECL_ISMVLANERR;
+ goto out;
+ }
+ }
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+ rc = SMC_CLC_DECL_MEM;
+ goto ism_put_vlan;
+ }
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ SMC_LGR_ID_SIZE, &lgr->id);
+ if (!lgr->tx_wq) {
+ rc = -ENOMEM;
+ goto free_lgr;
+ }
+ lgr->is_smcd = ini->is_smcd;
+ lgr->sync_err = 0;
+ lgr->terminating = 0;
+ lgr->freeing = 0;
+ lgr->vlan_id = ini->vlan_id;
+ mutex_init(&lgr->sndbufs_lock);
+ mutex_init(&lgr->rmbs_lock);
+ rwlock_init(&lgr->conns_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
+ lgr->next_link_id = 0;
+ smc_lgr_list.num += SMC_LGR_NUM_INCR;
+ memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
+ INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+ INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
+ lgr->conns_all = RB_ROOT;
+ if (ini->is_smcd) {
+ /* SMC-D specific settings */
+ get_device(&ini->ism_dev[ini->ism_selected]->dev);
+ lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected];
+ lgr->smcd = ini->ism_dev[ini->ism_selected];
+ lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list;
+ lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->smc_version = ini->smcd_version;
+ lgr->peer_shutdown = 0;
+ atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
+ } else {
+ /* SMC-R specific settings */
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
+ SMC_SYSTEMID_LEN);
+ memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1],
+ SMC_MAX_PNETID_LEN);
+ smc_llc_lgr_init(lgr, smc);
+
+ link_idx = SMC_SINGLE_LINK;
+ lnk = &lgr->lnk[link_idx];
+ rc = smcr_link_init(lgr, lnk, link_idx, ini);
+ if (rc)
+ goto free_wq;
+ lgr_list = &smc_lgr_list.list;
+ lgr_lock = &smc_lgr_list.lock;
+ atomic_inc(&lgr_cnt);
+ }
+ smc->conn.lgr = lgr;
+ spin_lock_bh(lgr_lock);
+ list_add_tail(&lgr->list, lgr_list);
+ spin_unlock_bh(lgr_lock);
+ return 0;
+
+free_wq:
+ destroy_workqueue(lgr->tx_wq);
+free_lgr:
+ kfree(lgr);
+ism_put_vlan:
+ if (ini->is_smcd && ini->vlan_id)
+ smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id);
+out:
+ if (rc < 0) {
+ if (rc == -ENOMEM)
+ rc = SMC_CLC_DECL_MEM;
+ else
+ rc = SMC_CLC_DECL_INTERR;
+ }
+ return rc;
+}
+
+static int smc_write_space(struct smc_connection *conn)
+{
+ int buffer_len = conn->peer_rmbe_size;
+ union smc_host_cursor prod;
+ union smc_host_cursor cons;
+ int space;
+
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+ /* determine rx_buf space */
+ space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod);
+ return space;
+}
+
+static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons, fin;
+ int rc = 0;
+ int diff;
+
+ smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn);
+ smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn);
+ /* set prod cursor to old state, enforce tx_rdma_writes() */
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+
+ if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) {
+ /* cons cursor advanced more than fin, and prod was set
+ * fin above, so now prod is smaller than cons. Fix that.
+ */
+ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons);
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_sent, diff);
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_fin, diff);
+
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ smp_mb__after_atomic();
+
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl.prod, diff);
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl_fin, diff);
+ }
+ /* recalculate, value is used by tx_rdma_writes() */
+ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn));
+
+ if (smc->sk.sk_state != SMC_INIT &&
+ smc->sk.sk_state != SMC_CLOSED) {
+ rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
+ if (!rc) {
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
+ smc->sk.sk_data_ready(&smc->sk);
+ }
+ } else {
+ smc_wr_tx_put_slot(conn->lnk,
+ (struct smc_wr_tx_pend_priv *)pend);
+ }
+ return rc;
+}
+
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err)
+{
+ struct smc_link *to_lnk = NULL;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_connection *conn;
+ struct smc_wr_buf *wr_buf;
+ struct smc_sock *smc;
+ struct rb_node *node;
+ int i, rc = 0;
+
+ /* link is inactive, wake up tx waiters */
+ smc_wr_wakeup_tx_wait(from_lnk);
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx)
+ continue;
+ if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
+ from_lnk->ibport == lgr->lnk[i].ibport) {
+ continue;
+ }
+ to_lnk = &lgr->lnk[i];
+ break;
+ }
+ if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) {
+ smc_lgr_terminate_sched(lgr);
+ return NULL;
+ }
+again:
+ read_lock_bh(&lgr->conns_lock);
+ for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) {
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ if (conn->lnk != from_lnk)
+ continue;
+ smc = container_of(conn, struct smc_sock, conn);
+ /* conn->lnk not yet set in SMC_INIT state */
+ if (smc->sk.sk_state == SMC_INIT)
+ continue;
+ if (smc->sk.sk_state == SMC_CLOSED ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_APPFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_PEERFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_PEERABORTWAIT ||
+ smc->sk.sk_state == SMC_PROCESSABORT) {
+ spin_lock_bh(&conn->send_lock);
+ conn->lnk = to_lnk;
+ spin_unlock_bh(&conn->send_lock);
+ continue;
+ }
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ /* pre-fetch buffer outside of send_lock, might sleep */
+ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend);
+ if (rc)
+ goto err_out;
+ /* avoid race with smcr_tx_sndbuf_nonempty() */
+ spin_lock_bh(&conn->send_lock);
+ conn->lnk = to_lnk;
+ rc = smc_switch_cursor(smc, pend, wr_buf);
+ spin_unlock_bh(&conn->send_lock);
+ sock_put(&smc->sk);
+ if (rc)
+ goto err_out;
+ goto again;
+ }
+ read_unlock_bh(&lgr->conns_lock);
+ smc_wr_tx_link_put(to_lnk);
+ return to_lnk;
+
+err_out:
+ smcr_link_down_cond_sched(to_lnk);
+ smc_wr_tx_link_put(to_lnk);
+ return NULL;
+}
+
+static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc,
+ struct smc_link_group *lgr)
+{
+ int rc;
+
+ if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) {
+ /* unregister rmb with peer */
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (!rc) {
+ /* protect against smc_llc_cli_rkey_exchange() */
+ mutex_lock(&lgr->llc_conf_mutex);
+ smc_llc_do_delete_rkey(lgr, rmb_desc);
+ rmb_desc->is_conf_rkey = false;
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ }
+ }
+
+ if (rmb_desc->is_reg_err) {
+ /* buf registration failed, reuse not possible */
+ mutex_lock(&lgr->rmbs_lock);
+ list_del(&rmb_desc->list);
+ mutex_unlock(&lgr->rmbs_lock);
+
+ smc_buf_free(lgr, true, rmb_desc);
+ } else {
+ rmb_desc->used = 0;
+ }
+}
+
+static void smc_buf_unuse(struct smc_connection *conn,
+ struct smc_link_group *lgr)
+{
+ if (conn->sndbuf_desc)
+ conn->sndbuf_desc->used = 0;
+ if (conn->rmb_desc && lgr->is_smcd)
+ conn->rmb_desc->used = 0;
+ else if (conn->rmb_desc)
+ smcr_buf_unuse(conn->rmb_desc, lgr);
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+ if (lgr->is_smcd) {
+ if (!list_empty(&lgr->list))
+ smc_ism_unset_conn(conn);
+ tasklet_kill(&conn->rx_tsklet);
+ } else {
+ smc_cdc_wait_pend_tx_wr(conn);
+ if (current_work() != &conn->abort_work)
+ cancel_work_sync(&conn->abort_work);
+ }
+ if (!list_empty(&lgr->list)) {
+ smc_buf_unuse(conn, lgr); /* allow buffer reuse */
+ smc_lgr_unregister_conn(conn);
+ }
+
+ if (!lgr->conns_num)
+ smc_lgr_schedule_free_work(lgr);
+}
+
+/* unregister a link from a buf_desc */
+static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link *lnk)
+{
+ if (is_rmb)
+ buf_desc->is_reg_mr[lnk->link_idx] = false;
+ if (!buf_desc->is_map_ib[lnk->link_idx])
+ return;
+ if (is_rmb) {
+ if (buf_desc->mr_rx[lnk->link_idx]) {
+ smc_ib_put_memory_region(
+ buf_desc->mr_rx[lnk->link_idx]);
+ buf_desc->mr_rx[lnk->link_idx] = NULL;
+ }
+ smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE);
+ } else {
+ smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE);
+ }
+ sg_free_table(&buf_desc->sgt[lnk->link_idx]);
+ buf_desc->is_map_ib[lnk->link_idx] = false;
+}
+
+/* unmap all buffers of lgr for a deleted link */
+static void smcr_buf_unmap_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ mutex_lock(&lgr->rmbs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
+ smcr_buf_unmap_link(buf_desc, true, lnk);
+ mutex_unlock(&lgr->rmbs_lock);
+ mutex_lock(&lgr->sndbufs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
+ list)
+ smcr_buf_unmap_link(buf_desc, false, lnk);
+ mutex_unlock(&lgr->sndbufs_lock);
+ }
+}
+
+static void smcr_rtoken_clear_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ lgr->rtokens[i][lnk->link_idx].rkey = 0;
+ lgr->rtokens[i][lnk->link_idx].dma_addr = 0;
+ }
+}
+
+/* must be called under lgr->llc_conf_mutex lock */
+void smcr_link_clear(struct smc_link *lnk, bool log)
+{
+ struct smc_ib_device *smcibdev;
+
+ if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED)
+ return;
+ lnk->peer_qpn = 0;
+ smc_llc_link_clear(lnk, log);
+ smcr_buf_unmap_lgr(lnk);
+ smcr_rtoken_clear_link(lnk);
+ smc_ib_modify_qp_error(lnk);
+ smc_wr_free_link(lnk);
+ smc_ib_destroy_queue_pair(lnk);
+ smc_ib_dealloc_protection_domain(lnk);
+ smc_wr_free_link_mem(lnk);
+ put_device(&lnk->smcibdev->ibdev->dev);
+ smcibdev = lnk->smcibdev;
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ if (!atomic_dec_return(&smcibdev->lnk_cnt))
+ wake_up(&smcibdev->lnks_deleted);
+}
+
+static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
+{
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]);
+
+ if (buf_desc->pages)
+ __free_pages(buf_desc->pages, buf_desc->order);
+ kfree(buf_desc);
+}
+
+static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (is_dmb) {
+ /* restore original buf len */
+ buf_desc->len += sizeof(struct smcd_cdc_msg);
+ smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+ } else {
+ kfree(buf_desc->cpu_addr);
+ }
+ kfree(buf_desc);
+}
+
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (lgr->is_smcd)
+ smcd_buf_free(lgr, is_rmb, buf_desc);
+ else
+ smcr_buf_free(lgr, is_rmb, buf_desc);
+}
+
+static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
+{
+ struct smc_buf_desc *buf_desc, *bf_desc;
+ struct list_head *buf_list;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ if (is_rmb)
+ buf_list = &lgr->rmbs[i];
+ else
+ buf_list = &lgr->sndbufs[i];
+ list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
+ list) {
+ list_del(&buf_desc->list);
+ smc_buf_free(lgr, is_rmb, buf_desc);
+ }
+ }
+}
+
+static void smc_lgr_free_bufs(struct smc_link_group *lgr)
+{
+ /* free send buffers */
+ __smc_lgr_free_bufs(lgr, false);
+ /* free rmbs */
+ __smc_lgr_free_bufs(lgr, true);
+}
+
+/* remove a link group */
+static void smc_lgr_free(struct smc_link_group *lgr)
+{
+ int i;
+
+ if (!lgr->is_smcd) {
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state != SMC_LNK_UNUSED)
+ smcr_link_clear(&lgr->lnk[i], false);
+ }
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_lgr_clear(lgr);
+ }
+
+ smc_lgr_free_bufs(lgr);
+ destroy_workqueue(lgr->tx_wq);
+ if (lgr->is_smcd) {
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
+ if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
+ wake_up(&lgr->smcd->lgrs_deleted);
+ } else {
+ if (!atomic_dec_return(&lgr_cnt))
+ wake_up(&lgrs_deleted);
+ }
+ kfree(lgr);
+}
+
+static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
+{
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ struct smc_buf_desc *buf_desc;
+
+ list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
+ buf_desc->len += sizeof(struct smcd_cdc_msg);
+ smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+ }
+ }
+}
+
+static void smc_sk_wake_ups(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space(&smc->sk);
+ smc->sk.sk_data_ready(&smc->sk);
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+/* kill a connection */
+static void smc_conn_kill(struct smc_connection *conn, bool soft)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ smc_close_abort(conn);
+ conn->killed = 1;
+ smc->sk.sk_err = ECONNABORTED;
+ smc_sk_wake_ups(smc);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ if (soft)
+ tasklet_kill(&conn->rx_tsklet);
+ else
+ tasklet_unlock_wait(&conn->rx_tsklet);
+ } else {
+ smc_cdc_wait_pend_tx_wr(conn);
+ }
+ smc_lgr_unregister_conn(conn);
+ smc_close_active_abort(smc);
+}
+
+static void smc_lgr_cleanup(struct smc_link_group *lgr)
+{
+ if (lgr->is_smcd) {
+ smc_ism_signal_shutdown(lgr);
+ smcd_unregister_all_dmbs(lgr);
+ } else {
+ u32 rsn = lgr->llc_termination_rsn;
+
+ if (!rsn)
+ rsn = SMC_LLC_DEL_PROG_INIT_TERM;
+ smc_llc_send_link_delete_all(lgr, false, rsn);
+ smcr_lgr_link_deactivate_all(lgr);
+ }
+}
+
+/* terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ */
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
+{
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ struct rb_node *node;
+
+ if (lgr->terminating)
+ return; /* lgr already terminating */
+ /* cancel free_work sync, will terminate when lgr->freeing is set */
+ cancel_delayed_work(&lgr->free_work);
+ lgr->terminating = 1;
+
+ /* kill remaining link group connections */
+ read_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ while (node) {
+ read_unlock_bh(&lgr->conns_lock);
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ smc = container_of(conn, struct smc_sock, conn);
+ sock_hold(&smc->sk); /* sock_put below */
+ lock_sock(&smc->sk);
+ smc_conn_kill(conn, soft);
+ release_sock(&smc->sk);
+ sock_put(&smc->sk); /* sock_hold above */
+ read_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ }
+ read_unlock_bh(&lgr->conns_lock);
+ smc_lgr_cleanup(lgr);
+ smc_lgr_free(lgr);
+}
+
+/* unlink link group and schedule termination */
+void smc_lgr_terminate_sched(struct smc_link_group *lgr)
+{
+ spinlock_t *lgr_lock;
+
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
+ spin_unlock_bh(lgr_lock);
+ return; /* lgr already terminating */
+ }
+ list_del_init(&lgr->list);
+ lgr->freeing = 1;
+ spin_unlock_bh(lgr_lock);
+ schedule_work(&lgr->terminate_work);
+}
+
+/* Called when peer lgr shutdown (regularly or abnormally) is received */
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
+{
+ struct smc_link_group *lgr, *l;
+ LIST_HEAD(lgr_free_list);
+
+ /* run common cleanup function and build free list */
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
+ if ((!peer_gid || lgr->peer_gid == peer_gid) &&
+ (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
+ if (peer_gid) /* peer triggered termination */
+ lgr->peer_shutdown = 1;
+ list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
+ }
+ spin_unlock_bh(&dev->lgr_lock);
+
+ /* cancel the regular free workers and actually free lgrs */
+ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ schedule_work(&lgr->terminate_work);
+ }
+}
+
+/* Called when an SMCD device is removed or the smc module is unloaded */
+void smc_smcd_terminate_all(struct smcd_dev *smcd)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ spin_lock_bh(&smcd->lgr_lock);
+ list_splice_init(&smcd->lgr_list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ spin_unlock_bh(&smcd->lgr_lock);
+
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (atomic_read(&smcd->lgr_cnt))
+ wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
+}
+
+/* Called when an SMCR device is removed or the smc module is unloaded.
+ * If smcibdev is given, all SMCR link groups using this device are terminated.
+ * If smcibdev is NULL, all SMCR link groups are terminated.
+ */
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+ int i;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!smcibdev) {
+ list_splice_init(&smc_lgr_list.list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ } else {
+ list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].smcibdev == smcibdev)
+ smcr_link_down_cond_sched(&lgr->lnk[i]);
+ }
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (smcibdev) {
+ if (atomic_read(&smcibdev->lnk_cnt))
+ wait_event(smcibdev->lnks_deleted,
+ !atomic_read(&smcibdev->lnk_cnt));
+ } else {
+ if (atomic_read(&lgr_cnt))
+ wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
+ }
+}
+
+/* set new lgr type and clear all asymmetric link tagging */
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type)
+{
+ char *lgr_type = "";
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (smc_link_usable(&lgr->lnk[i]))
+ lgr->lnk[i].link_is_asym = false;
+ if (lgr->type == new_type)
+ return;
+ lgr->type = new_type;
+
+ switch (lgr->type) {
+ case SMC_LGR_NONE:
+ lgr_type = "NONE";
+ break;
+ case SMC_LGR_SINGLE:
+ lgr_type = "SINGLE";
+ break;
+ case SMC_LGR_SYMMETRIC:
+ lgr_type = "SYMMETRIC";
+ break;
+ case SMC_LGR_ASYMMETRIC_PEER:
+ lgr_type = "ASYMMETRIC_PEER";
+ break;
+ case SMC_LGR_ASYMMETRIC_LOCAL:
+ lgr_type = "ASYMMETRIC_LOCAL";
+ break;
+ }
+ pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: "
+ "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id,
+ lgr_type, lgr->pnet_id);
+}
+
+/* set new lgr type and tag a link as asymmetric */
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx)
+{
+ smcr_lgr_set_type(lgr, new_type);
+ lgr->lnk[asym_lnk_idx].link_is_asym = true;
+}
+
+/* abort connection, abort_work scheduled from tasklet context */
+static void smc_conn_abort_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ abort_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_conn_kill(conn, true);
+ release_sock(&smc->sk);
+ sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */
+}
+
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *n;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+ struct smc_link *link;
+
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN) ||
+ lgr->type == SMC_LGR_SYMMETRIC ||
+ lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+ continue;
+
+ /* trigger local add link processing */
+ link = smc_llc_usable_link(lgr);
+ if (link)
+ smc_llc_add_link_local(link);
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
+/* link is down - switch connections to alternate link,
+ * must be called under lgr->llc_conf_mutex lock
+ */
+static void smcr_link_down(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_link *to_lnk;
+ int del_link_id;
+
+ if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
+ return;
+
+ to_lnk = smc_switch_conns(lgr, lnk, true);
+ if (!to_lnk) { /* no backup link available */
+ smcr_link_clear(lnk, true);
+ return;
+ }
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ del_link_id = lnk->link_id;
+
+ if (lgr->role == SMC_SERV) {
+ /* trigger local delete link processing */
+ smc_llc_srv_delete_link_local(to_lnk, del_link_id);
+ } else {
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* another llc task is ongoing */
+ mutex_unlock(&lgr->llc_conf_mutex);
+ wait_event_timeout(lgr->llc_flow_waiter,
+ (list_empty(&lgr->list) ||
+ lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
+ SMC_LLC_WAIT_TIME);
+ mutex_lock(&lgr->llc_conf_mutex);
+ }
+ if (!list_empty(&lgr->list)) {
+ smc_llc_send_delete_link(to_lnk, del_link_id,
+ SMC_LLC_REQ, true,
+ SMC_LLC_DEL_LOST_PATH);
+ smcr_link_clear(lnk, true);
+ }
+ wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */
+ }
+}
+
+/* must be called under lgr->llc_conf_mutex lock */
+void smcr_link_down_cond(struct smc_link *lnk)
+{
+ if (smc_link_downing(&lnk->state))
+ smcr_link_down(lnk);
+}
+
+/* will get the lgr->llc_conf_mutex lock */
+void smcr_link_down_cond_sched(struct smc_link *lnk)
+{
+ if (smc_link_downing(&lnk->state))
+ schedule_work(&lnk->link_down_wrk);
+}
+
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *n;
+ int i;
+
+ list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN))
+ continue; /* lgr is not affected */
+ if (list_empty(&lgr->list))
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (smc_link_usable(lnk) &&
+ lnk->smcibdev == smcibdev && lnk->ibport == ibport)
+ smcr_link_down_cond_sched(lnk);
+ }
+ }
+}
+
+static void smc_link_down_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(work, struct smc_link,
+ link_down_wrk);
+ struct smc_link_group *lgr = link->lgr;
+
+ if (list_empty(&lgr->list))
+ return;
+ wake_up_all(&lgr->llc_msg_waiter);
+ mutex_lock(&lgr->llc_conf_mutex);
+ smcr_link_down(link);
+ mutex_unlock(&lgr->llc_conf_mutex);
+}
+
+static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
+ struct netdev_nested_priv *priv)
+{
+ unsigned short *vlan_id = (unsigned short *)priv->data;
+
+ if (is_vlan_dev(lower_dev)) {
+ *vlan_id = vlan_dev_vlan_id(lower_dev);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Determine vlan of internal TCP socket. */
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct netdev_nested_priv priv;
+ struct net_device *ndev;
+ int rc = 0;
+
+ ini->vlan_id = 0;
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ ndev = dst->dev;
+ if (is_vlan_dev(ndev)) {
+ ini->vlan_id = vlan_dev_vlan_id(ndev);
+ goto out_rel;
+ }
+
+ priv.data = (void *)&ini->vlan_id;
+ rtnl_lock();
+ netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv);
+ rtnl_unlock();
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+static bool smcr_lgr_match(struct smc_link_group *lgr,
+ struct smc_clc_msg_local *lcl,
+ enum smc_lgr_role role, u32 clcqpn)
+{
+ int i;
+
+ if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) ||
+ lgr->role != role)
+ return false;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
+ !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
+ !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac)))
+ return true;
+ }
+ return false;
+}
+
+static bool smcd_lgr_match(struct smc_link_group *lgr,
+ struct smcd_dev *smcismdev, u64 peer_gid)
+{
+ return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct list_head *lgr_list;
+ struct smc_link_group *lgr;
+ enum smc_lgr_role role;
+ spinlock_t *lgr_lock;
+ int rc = 0;
+
+ lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
+ &smc_lgr_list.list;
+ lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock :
+ &smc_lgr_list.lock;
+ ini->first_contact_local = 1;
+ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ if (role == SMC_CLNT && ini->first_contact_peer)
+ /* create new link group as well */
+ goto create;
+
+ /* determine if an existing link group can be reused */
+ spin_lock_bh(lgr_lock);
+ list_for_each_entry(lgr, lgr_list, list) {
+ write_lock_bh(&lgr->conns_lock);
+ if ((ini->is_smcd ?
+ smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
+ ini->ism_peer_gid[ini->ism_selected]) :
+ smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
+ !lgr->sync_err &&
+ (ini->smcd_version == SMC_V2 ||
+ lgr->vlan_id == ini->vlan_id) &&
+ (role == SMC_CLNT || ini->is_smcd ||
+ (lgr->conns_num < SMC_RMBS_PER_LGR_MAX &&
+ !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) {
+ /* link group found */
+ ini->first_contact_local = 0;
+ conn->lgr = lgr;
+ rc = smc_lgr_register_conn(conn, false);
+ write_unlock_bh(&lgr->conns_lock);
+ if (!rc && delayed_work_pending(&lgr->free_work))
+ cancel_delayed_work(&lgr->free_work);
+ break;
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ }
+ spin_unlock_bh(lgr_lock);
+ if (rc)
+ return rc;
+
+ if (role == SMC_CLNT && !ini->first_contact_peer &&
+ ini->first_contact_local) {
+ /* Server reuses a link group, but Client wants to start
+ * a new one
+ * send out_of_sync decline, reason synchr. error
+ */
+ return SMC_CLC_DECL_SYNCERR;
+ }
+
+create:
+ if (ini->first_contact_local) {
+ rc = smc_lgr_create(smc, ini);
+ if (rc)
+ goto out;
+ lgr = conn->lgr;
+ write_lock_bh(&lgr->conns_lock);
+ rc = smc_lgr_register_conn(conn, true);
+ write_unlock_bh(&lgr->conns_lock);
+ if (rc)
+ goto out;
+ }
+ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+ conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+ conn->urg_state = SMC_URG_READ;
+ init_waitqueue_head(&conn->cdc_pend_tx_wq);
+ INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
+ if (ini->is_smcd) {
+ conn->rx_off = sizeof(struct smcd_cdc_msg);
+ smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+ } else {
+ conn->rx_off = 0;
+ }
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&conn->acurs_lock);
+#endif
+
+out:
+ return rc;
+}
+
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static u8 smc_compress_bufsize(int size)
+{
+ u8 compressed;
+
+ if (size <= SMC_BUF_MIN_SIZE)
+ return 0;
+
+ size = (size - 1) >> 14;
+ compressed = ilog2(size) + 1;
+ if (compressed >= SMC_RMBE_SIZES)
+ compressed = SMC_RMBE_SIZES - 1;
+ return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+int smc_uncompress_bufsize(u8 compressed)
+{
+ u32 size;
+
+ size = 0x00000001 << (((int)compressed) + 14);
+ return (int)size;
+}
+
+/* try to reuse a sndbuf or rmb description slot for a certain
+ * buffer size; if not available, return NULL
+ */
+static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
+ struct mutex *lock,
+ struct list_head *buf_list)
+{
+ struct smc_buf_desc *buf_slot;
+
+ mutex_lock(lock);
+ list_for_each_entry(buf_slot, buf_list, list) {
+ if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
+ mutex_unlock(lock);
+ return buf_slot;
+ }
+ }
+ mutex_unlock(lock);
+ return NULL;
+}
+
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+ return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
+/* map an rmb buf to a link */
+static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link *lnk)
+{
+ int rc;
+
+ if (buf_desc->is_map_ib[lnk->link_idx])
+ return 0;
+
+ rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL);
+ if (rc)
+ return rc;
+ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
+ buf_desc->cpu_addr, buf_desc->len);
+
+ /* map sg table to DMA address */
+ rc = smc_ib_buf_map_sg(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ /* SMC protocol depends on mapping to one DMA address only */
+ if (rc != 1) {
+ rc = -EAGAIN;
+ goto free_table;
+ }
+
+ /* create a new memory region for the RMB */
+ if (is_rmb) {
+ rc = smc_ib_get_memory_region(lnk->roce_pd,
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE,
+ buf_desc, lnk->link_idx);
+ if (rc)
+ goto buf_unmap;
+ smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE);
+ }
+ buf_desc->is_map_ib[lnk->link_idx] = true;
+ return 0;
+
+buf_unmap:
+ smc_ib_buf_unmap_sg(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+free_table:
+ sg_free_table(&buf_desc->sgt[lnk->link_idx]);
+ return rc;
+}
+
+/* register a new rmb on IB device,
+ * must be called under lgr->llc_conf_mutex lock
+ */
+int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
+{
+ if (list_empty(&link->lgr->list))
+ return -ENOLINK;
+ if (!rmb_desc->is_reg_mr[link->link_idx]) {
+ /* register memory region for new rmb */
+ if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) {
+ rmb_desc->is_reg_err = true;
+ return -EFAULT;
+ }
+ rmb_desc->is_reg_mr[link->link_idx] = true;
+ }
+ return 0;
+}
+
+static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
+ struct list_head *lst, bool is_rmb)
+{
+ struct smc_buf_desc *buf_desc, *bf;
+ int rc = 0;
+
+ mutex_lock(lock);
+ list_for_each_entry_safe(buf_desc, bf, lst, list) {
+ if (!buf_desc->used)
+ continue;
+ rc = smcr_buf_map_link(buf_desc, is_rmb, lnk);
+ if (rc)
+ goto out;
+ }
+out:
+ mutex_unlock(lock);
+ return rc;
+}
+
+/* map all used buffers of lgr for a new link */
+int smcr_buf_map_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ int i, rc = 0;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock,
+ &lgr->rmbs[i], true);
+ if (rc)
+ return rc;
+ rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock,
+ &lgr->sndbufs[i], false);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+/* register all used buffers of lgr for a new link,
+ * must be called under lgr->llc_conf_mutex lock
+ */
+int smcr_buf_reg_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i, rc = 0;
+
+ mutex_lock(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
+ if (!buf_desc->used)
+ continue;
+ rc = smcr_link_reg_rmb(lnk, buf_desc);
+ if (rc)
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+}
+
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+ bool is_rmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+
+ /* try to alloc a new buffer */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+
+ buf_desc->order = get_order(bufsize);
+ buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | __GFP_COMP |
+ __GFP_NORETRY | __GFP_ZERO,
+ buf_desc->order);
+ if (!buf_desc->pages) {
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+ }
+ buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
+ buf_desc->len = bufsize;
+ return buf_desc;
+}
+
+/* map buf_desc on all usable links,
+ * unused buffers stay mapped as long as the link is up
+ */
+static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
+ struct smc_buf_desc *buf_desc, bool is_rmb)
+{
+ int i, rc = 0, cnt = 0;
+
+ /* protect against parallel link reconfiguration */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (!smc_link_usable(lnk))
+ continue;
+ if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ cnt++;
+ }
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ if (!rc && !cnt)
+ rc = -EINVAL;
+ return rc;
+}
+
+#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+
+static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
+ bool is_dmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+ int rc;
+
+ if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
+ return ERR_PTR(-EAGAIN);
+
+ /* try to alloc a new DMB */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+ if (is_dmb) {
+ rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
+ if (rc) {
+ kfree(buf_desc);
+ if (rc == -ENOMEM)
+ return ERR_PTR(-EAGAIN);
+ if (rc == -ENOSPC)
+ return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-EIO);
+ }
+ buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
+ /* CDC header stored in buf. So, pretend it was smaller */
+ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
+ } else {
+ buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
+ __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC);
+ if (!buf_desc->cpu_addr) {
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+ }
+ buf_desc->len = bufsize;
+ }
+ return buf_desc;
+}
+
+static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
+{
+ struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ struct list_head *buf_list;
+ int bufsize, bufsize_short;
+ struct mutex *lock; /* lock buffer list */
+ int sk_buf_size;
+
+ if (is_rmb)
+ /* use socket recv buffer size (w/o overhead) as start value */
+ sk_buf_size = smc->sk.sk_rcvbuf / 2;
+ else
+ /* use socket send buffer size (w/o overhead) as start value */
+ sk_buf_size = smc->sk.sk_sndbuf / 2;
+
+ for (bufsize_short = smc_compress_bufsize(sk_buf_size);
+ bufsize_short >= 0; bufsize_short--) {
+
+ if (is_rmb) {
+ lock = &lgr->rmbs_lock;
+ buf_list = &lgr->rmbs[bufsize_short];
+ } else {
+ lock = &lgr->sndbufs_lock;
+ buf_list = &lgr->sndbufs[bufsize_short];
+ }
+ bufsize = smc_uncompress_bufsize(bufsize_short);
+ if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
+ continue;
+
+ /* check for reusable slot in the link group */
+ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
+ if (buf_desc) {
+ memset(buf_desc->cpu_addr, 0, bufsize);
+ break; /* found reusable slot */
+ }
+
+ if (is_smcd)
+ buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
+ else
+ buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+
+ if (PTR_ERR(buf_desc) == -ENOMEM)
+ break;
+ if (IS_ERR(buf_desc))
+ continue;
+
+ buf_desc->used = 1;
+ mutex_lock(lock);
+ list_add(&buf_desc->list, buf_list);
+ mutex_unlock(lock);
+ break; /* found */
+ }
+
+ if (IS_ERR(buf_desc))
+ return PTR_ERR(buf_desc);
+
+ if (!is_smcd) {
+ if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
+ smcr_buf_unuse(buf_desc, lgr);
+ return -ENOMEM;
+ }
+ }
+
+ if (is_rmb) {
+ conn->rmb_desc = buf_desc;
+ conn->rmbe_size_short = bufsize_short;
+ smc->sk.sk_rcvbuf = bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
+ conn->rmbe_update_limit =
+ smc_rmb_wnd_update_limit(buf_desc->len);
+ if (is_smcd)
+ smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
+ } else {
+ conn->sndbuf_desc = buf_desc;
+ smc->sk.sk_sndbuf = bufsize * 2;
+ atomic_set(&conn->sndbuf_space, bufsize);
+ }
+ return 0;
+}
+
+void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
+{
+ if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
+ return;
+ smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
+}
+
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
+{
+ if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
+ return;
+ smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
+}
+
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
+{
+ int i;
+
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&conn->lgr->lnk[i]))
+ continue;
+ smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
+ DMA_FROM_DEVICE);
+ }
+}
+
+void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
+{
+ int i;
+
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&conn->lgr->lnk[i]))
+ continue;
+ smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc,
+ DMA_FROM_DEVICE);
+ }
+}
+
+/* create the send and receive buffer for an SMC socket;
+ * receive buffers are called RMBs;
+ * (even though the SMC protocol allows more than one RMB-element per RMB,
+ * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
+ * extra RMB for every connection in a link group
+ */
+int smc_buf_create(struct smc_sock *smc, bool is_smcd)
+{
+ int rc;
+
+ /* create send buffer */
+ rc = __smc_buf_create(smc, is_smcd, false);
+ if (rc)
+ return rc;
+ /* create rmb */
+ rc = __smc_buf_create(smc, is_smcd, true);
+ if (rc) {
+ mutex_lock(&smc->conn.lgr->sndbufs_lock);
+ list_del(&smc->conn.sndbuf_desc->list);
+ mutex_unlock(&smc->conn.lgr->sndbufs_lock);
+ smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
+ smc->conn.sndbuf_desc = NULL;
+ }
+ return rc;
+}
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+ int i;
+
+ for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+ if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+ return i;
+ }
+ return -ENOSPC;
+}
+
+static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
+ u32 rkey)
+{
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if (test_bit(i, lgr->rtokens_used_mask) &&
+ lgr->rtokens[i][lnk_idx].rkey == rkey)
+ return i;
+ }
+ return -ENOENT;
+}
+
+/* set rtoken for a new link to an existing rmb */
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+ __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
+{
+ int rtok_idx;
+
+ rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
+ if (rtok_idx == -ENOENT)
+ return;
+ lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey);
+ lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr);
+}
+
+/* set rtoken for a new link whose link_id is given */
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+ __be64 nw_vaddr, __be32 nw_rkey)
+{
+ u64 dma_addr = be64_to_cpu(nw_vaddr);
+ u32 rkey = ntohl(nw_rkey);
+ bool found = false;
+ int link_idx;
+
+ for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) {
+ if (lgr->lnk[link_idx].link_id == link_id) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ lgr->rtokens[rtok_idx][link_idx].rkey = rkey;
+ lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr;
+}
+
+/* add a new rtoken from peer */
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
+{
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
+ u64 dma_addr = be64_to_cpu(nw_vaddr);
+ u32 rkey = ntohl(nw_rkey);
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
+ lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
+ test_bit(i, lgr->rtokens_used_mask)) {
+ /* already in list */
+ return i;
+ }
+ }
+ i = smc_rmb_reserve_rtoken_idx(lgr);
+ if (i < 0)
+ return i;
+ lgr->rtokens[i][lnk->link_idx].rkey = rkey;
+ lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
+ return i;
+}
+
+/* delete an rtoken from all links */
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
+{
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
+ u32 rkey = ntohl(nw_rkey);
+ int i, j;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
+ test_bit(i, lgr->rtokens_used_mask)) {
+ for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ lgr->rtokens[i][j].rkey = 0;
+ lgr->rtokens[i][j].dma_addr = 0;
+ }
+ clear_bit(i, lgr->rtokens_used_mask);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_link *lnk,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
+ clc->r0.rmb_rkey);
+ if (conn->rtoken_idx < 0)
+ return conn->rtoken_idx;
+ return 0;
+}
+
+static void smc_core_going_away(void)
+{
+ struct smc_ib_device *smcibdev;
+ struct smcd_dev *smcd;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+ int i;
+
+ for (i = 0; i < SMC_MAX_PORTS; i++)
+ set_bit(i, smcibdev->ports_going_away);
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ smcd->going_away = 1;
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+}
+
+/* Clean up all SMC link groups */
+static void smc_lgrs_shutdown(void)
+{
+ struct smcd_dev *smcd;
+
+ smc_core_going_away();
+
+ smc_smcr_terminate_all(NULL);
+
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list)
+ smc_smcd_terminate_all(smcd);
+ mutex_unlock(&smcd_dev_list.mutex);
+}
+
+static int smc_core_reboot_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ smc_lgrs_shutdown();
+ smc_ib_unregister_client();
+ return 0;
+}
+
+static struct notifier_block smc_reboot_notifier = {
+ .notifier_call = smc_core_reboot_event,
+};
+
+int __init smc_core_init(void)
+{
+ return register_reboot_notifier(&smc_reboot_notifier);
+}
+
+/* Called (from smc_exit) when module is removed */
+void smc_core_exit(void)
+{
+ unregister_reboot_notifier(&smc_reboot_notifier);
+ smc_lgrs_shutdown();
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000..9364d0f35
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Connections, Link Groups and Links
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
+
+struct smc_lgr_list { /* list of link group definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+ u32 num; /* unique link group number */
+};
+
+enum smc_lgr_role { /* possible roles of a link group */
+ SMC_CLNT, /* client */
+ SMC_SERV /* server */
+};
+
+enum smc_link_state { /* possible states of a link */
+ SMC_LNK_UNUSED, /* link is unused */
+ SMC_LNK_INACTIVE, /* link is inactive */
+ SMC_LNK_ACTIVATING, /* link is being activated */
+ SMC_LNK_ACTIVE, /* link is active */
+};
+
+#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+
+struct smc_wr_buf {
+ u8 raw[SMC_WR_BUF_SIZE];
+};
+
+#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
+
+enum smc_wr_reg_state {
+ POSTED, /* ib_wr_reg_mr request posted */
+ CONFIRMED, /* ib_wr_reg_mr response: successful */
+ FAILED /* ib_wr_reg_mr response: failure */
+};
+
+struct smc_rdma_sge { /* sges for RDMA writes */
+ struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
+};
+
+#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per
+ * message send
+ */
+
+struct smc_rdma_sges { /* sges per message send */
+ struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES];
+};
+
+struct smc_rdma_wr { /* work requests per message
+ * send
+ */
+ struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
+};
+
+#define SMC_LGR_ID_SIZE 4
+
+struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
+ struct ib_pd *roce_pd; /* IB protection domain,
+ * unique for every RoCE QP
+ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+ struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
+ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
+ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ struct completion *wr_tx_compl; /* WR send CQE completion */
+ /* above four vectors have wr_tx_cnt elements and use the same index */
+ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ atomic_long_t wr_tx_id; /* seq # of last sent WR */
+ unsigned long *wr_tx_mask; /* bit mask of used indexes */
+ u32 wr_tx_cnt; /* number of WR send buffers */
+ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+ atomic_t wr_tx_refcnt; /* tx refs to link */
+
+ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
+ /* above three vectors have wr_rx_cnt elements and use the same index */
+ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ u64 wr_rx_id; /* seq # of last recv WR */
+ u32 wr_rx_cnt; /* number of WR recv buffers */
+ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
+
+ struct ib_reg_wr wr_reg; /* WR register memory region */
+ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
+ atomic_t wr_reg_refcnt; /* reg refs to link */
+ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
+
+ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
+ u8 sgid_index; /* gid index for vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+ enum ib_mtu peer_mtu; /* mtu size of peer */
+ u32 psn_initial; /* QP tx initial packet seqno */
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
+ u8 link_id; /* unique # within link group */
+ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
+ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
+ u8 link_idx; /* index in lgr link array */
+ u8 link_is_asym; /* is link asymmetric? */
+ struct smc_link_group *lgr; /* parent link group */
+ struct work_struct link_down_wrk; /* wrk to bring link down */
+
+ enum smc_link_state state; /* state of link */
+ struct delayed_work llc_testlink_wrk; /* testlink worker */
+ struct completion llc_testlink_resp; /* wait for rx of testlink */
+ int llc_testlink_time; /* testlink interval */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX 3
+#define SMC_SINGLE_LINK 0
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+ struct list_head list;
+ void *cpu_addr; /* virtual address of buffer */
+ struct page *pages;
+ int len; /* length of buffer */
+ u32 used; /* currently used / unused */
+ union {
+ struct { /* SMC-R */
+ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
+ /* virtual buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ /* for rmb only: memory region
+ * incl. rkey provided to peer
+ */
+ u32 order; /* allocation order */
+
+ u8 is_conf_rkey;
+ /* confirm_rkey done */
+ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX];
+ /* mem region registered */
+ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX];
+ /* mem region mapped to lnk */
+ u8 is_reg_err;
+ /* buffer registration err */
+ };
+ struct { /* SMC-D */
+ unsigned short sba_idx;
+ /* SBA index number */
+ u64 token;
+ /* DMB token number */
+ dma_addr_t dma_addr;
+ /* DMA address */
+ };
+ };
+};
+
+struct smc_rtoken { /* address/key of remote RMB */
+ u64 dma_addr;
+ u32 rkey;
+};
+
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+struct smcd_dev;
+
+enum smc_lgr_type { /* redundancy state of lgr */
+ SMC_LGR_NONE, /* no active links, lgr to be deleted */
+ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */
+ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */
+ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */
+ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */
+};
+
+enum smc_llc_flowtype {
+ SMC_LLC_FLOW_NONE = 0,
+ SMC_LLC_FLOW_ADD_LINK = 2,
+ SMC_LLC_FLOW_DEL_LINK = 4,
+ SMC_LLC_FLOW_RKEY = 6,
+};
+
+struct smc_llc_qentry;
+
+struct smc_llc_flow {
+ enum smc_llc_flowtype type;
+ struct smc_llc_qentry *qentry;
+};
+
+struct smc_link_group {
+ struct list_head list;
+ struct rb_root conns_all; /* connection tree */
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
+
+ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ struct mutex sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ struct mutex rmbs_lock; /* protects rx buffers */
+
+ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
+ struct delayed_work free_work; /* delayed freeing of an lgr */
+ struct work_struct terminate_work; /* abnormal lgr termination */
+ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
+ u8 sync_err : 1; /* lgr no longer fits to peer */
+ u8 terminating : 1;/* lgr is terminating */
+ u8 freeing : 1; /* lgr is being freed */
+
+ bool is_smcd; /* SMC-R or SMC-D */
+ u8 smc_version;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
+ u8 peer_os; /* peer operating system */
+ u8 peer_smc_release;
+ u8 peer_hostname[SMC_MAX_HOSTNAME_LEN];
+ union {
+ struct { /* SMC-R */
+ enum smc_lgr_role role;
+ /* client or server */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
+ /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
+ /* used rtoken elements */
+ u8 next_link_id;
+ enum smc_lgr_type type;
+ /* redundancy state */
+ u8 pnet_id[SMC_MAX_PNETID_LEN + 1];
+ /* pnet id of this lgr */
+ struct list_head llc_event_q;
+ /* queue for llc events */
+ spinlock_t llc_event_q_lock;
+ /* protects llc_event_q */
+ struct mutex llc_conf_mutex;
+ /* protects lgr reconfig. */
+ struct work_struct llc_add_link_work;
+ struct work_struct llc_del_link_work;
+ struct work_struct llc_event_work;
+ /* llc event worker */
+ wait_queue_head_t llc_flow_waiter;
+ /* w4 next llc event */
+ wait_queue_head_t llc_msg_waiter;
+ /* w4 next llc msg */
+ struct smc_llc_flow llc_flow_lcl;
+ /* llc local control field */
+ struct smc_llc_flow llc_flow_rmt;
+ /* llc remote control field */
+ struct smc_llc_qentry *delayed_event;
+ /* arrived when flow active */
+ spinlock_t llc_flow_lock;
+ /* protects llc flow */
+ int llc_testlink_time;
+ /* link keep alive time */
+ u32 llc_termination_rsn;
+ /* rsn code for termination */
+ };
+ struct { /* SMC-D */
+ u64 peer_gid;
+ /* Peer GID (remote) */
+ struct smcd_dev *smcd;
+ /* ISM device for VLAN reg. */
+ u8 peer_shutdown : 1;
+ /* peer triggered shutdownn */
+ };
+ };
+};
+
+struct smc_clc_msg_local;
+
+struct smc_init_info {
+ u8 is_smcd;
+ u8 smc_type_v1;
+ u8 smc_type_v2;
+ u8 first_contact_peer;
+ u8 first_contact_local;
+ unsigned short vlan_id;
+ /* SMC-R */
+ struct smc_clc_msg_local *ib_lcl;
+ struct smc_ib_device *ib_dev;
+ u8 ib_gid[SMC_GID_SIZE];
+ u8 ib_port;
+ u32 ib_clcqpn;
+ /* SMC-D */
+ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
+ struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1];
+ u16 ism_chid[SMC_MAX_ISM_DEVS + 1];
+ u8 ism_offered_cnt; /* # of ISM devices offered */
+ u8 ism_selected; /* index of selected ISM dev*/
+ u8 smcd_version;
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token alert token to search for
+ * @lgr link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+ u32 token, struct smc_link_group *lgr)
+{
+ struct smc_connection *res = NULL;
+ struct rb_node *node;
+
+ node = lgr->conns_all.rb_node;
+ while (node) {
+ struct smc_connection *cur = rb_entry(node,
+ struct smc_connection, alert_node);
+
+ if (cur->alert_token_local > token) {
+ node = node->rb_left;
+ } else {
+ if (cur->alert_token_local < token) {
+ node = node->rb_right;
+ } else {
+ res = cur;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+/* returns true if the specified link is usable */
+static inline bool smc_link_usable(struct smc_link *lnk)
+{
+ if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE)
+ return false;
+ return true;
+}
+
+static inline bool smc_link_sendable(struct smc_link *lnk)
+{
+ return smc_link_usable(lnk) &&
+ lnk->qp_attr.cur_qp_state == IB_QPS_RTS;
+}
+
+static inline bool smc_link_active(struct smc_link *lnk)
+{
+ return lnk->state == SMC_LNK_ACTIVE;
+}
+
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+struct smc_clc_msg_local;
+
+void smc_lgr_cleanup_early(struct smc_connection *conn);
+void smc_lgr_terminate_sched(struct smc_link_group *lgr);
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
+ unsigned short vlan);
+void smc_smcd_terminate_all(struct smcd_dev *dev);
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
+int smc_buf_create(struct smc_sock *smc, bool is_smcd);
+int smc_uncompress_bufsize(u8 compressed);
+int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc);
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+ __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+ __be64 nw_vaddr, __be32 nw_rkey);
+void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
+void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
+void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
+void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
+
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
+int smc_core_init(void);
+void smc_core_exit(void);
+
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini);
+void smcr_link_clear(struct smc_link *lnk, bool log);
+int smcr_buf_map_lgr(struct smc_link *lnk);
+int smcr_buf_reg_lgr(struct smc_link *lnk);
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx);
+int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc);
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err);
+void smcr_link_down_cond(struct smc_link *lnk);
+void smcr_link_down_cond_sched(struct smc_link *lnk);
+
+static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
+{
+ return link->lgr;
+}
+#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000..f15fca59b
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+struct smc_diag_dump_ctx {
+ int pos[2];
+};
+
+static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
+{
+ return (struct smc_diag_dump_ctx *)cb->ctx;
+}
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+ sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ be16_to_cpu(((__be16 *)gid_raw)[0]),
+ be16_to_cpu(((__be16 *)gid_raw)[1]),
+ be16_to_cpu(((__be16 *)gid_raw)[2]),
+ be16_to_cpu(((__be16 *)gid_raw)[3]),
+ be16_to_cpu(((__be16 *)gid_raw)[4]),
+ be16_to_cpu(((__be16 *)gid_raw)[5]),
+ be16_to_cpu(((__be16 *)gid_raw)[6]),
+ be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ memset(r, 0, sizeof(*r));
+ r->diag_family = sk->sk_family;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+ if (!smc->clcsock)
+ return;
+ r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+ r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+ r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+ if (sk->sk_protocol == SMCPROTO_SMC) {
+ r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_protocol == SMCPROTO_SMC6) {
+ memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
+ sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
+ memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
+ sizeof(smc->clcsock->sk->sk_v6_daddr));
+#endif
+ }
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct smc_diag_msg *r,
+ struct user_namespace *user_ns)
+{
+ if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+ return 1;
+
+ r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->diag_inode = sock_i_ino(sk);
+ return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct smc_diag_req *req,
+ struct nlattr *bc)
+{
+ struct smc_sock *smc = smc_sk(sk);
+ struct smc_diag_fallback fallback;
+ struct user_namespace *user_ns;
+ struct smc_diag_msg *r;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ smc_diag_msg_common_fill(r, sk);
+ r->diag_state = sk->sk_state;
+ if (smc->use_fallback)
+ r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
+ else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
+ r->diag_mode = SMC_DIAG_MODE_SMCD;
+ else
+ r->diag_mode = SMC_DIAG_MODE_SMCR;
+ user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+ if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+ goto errout;
+
+ fallback.reason = smc->fallback_rsn;
+ fallback.peer_diagnosis = smc->peer_diagnosis;
+ if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
+ goto errout;
+
+ if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
+ smc->conn.alert_token_local) {
+ struct smc_connection *conn = &smc->conn;
+ struct smc_diag_conninfo cinfo = {
+ .token = conn->alert_token_local,
+ .sndbuf_size = conn->sndbuf_desc ?
+ conn->sndbuf_desc->len : 0,
+ .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
+ .peer_rmbe_size = conn->peer_rmbe_size,
+
+ .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+ .rx_prod.count = conn->local_rx_ctrl.prod.count,
+ .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+ .rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+ .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+ .tx_prod.count = conn->local_tx_ctrl.prod.count,
+ .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+ .tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+ .tx_prod_flags =
+ *(u8 *)&conn->local_tx_ctrl.prod_flags,
+ .tx_conn_state_flags =
+ *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+ .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+ .rx_conn_state_flags =
+ *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+ .tx_prep.wrap = conn->tx_curs_prep.wrap,
+ .tx_prep.count = conn->tx_curs_prep.count,
+ .tx_sent.wrap = conn->tx_curs_sent.wrap,
+ .tx_sent.count = conn->tx_curs_sent.count,
+ .tx_fin.wrap = conn->tx_curs_fin.wrap,
+ .tx_fin.count = conn->tx_curs_fin.count,
+ };
+
+ if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+ goto errout;
+ }
+
+ if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list)) {
+ struct smc_diag_lgrinfo linfo = {
+ .role = smc->conn.lgr->role,
+ .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+ .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+ };
+
+ memcpy(linfo.lnk[0].ibname,
+ smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+ sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid,
+ smc->conn.lgr->lnk[0].gid);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+ smc->conn.lgr->lnk[0].peer_gid);
+
+ if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+ goto errout;
+ }
+ if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list)) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo;
+
+ memset(&dinfo, 0, sizeof(dinfo));
+
+ dinfo.linkid = *((u32 *)conn->lgr->id);
+ dinfo.peer_gid = conn->lgr->peer_gid;
+ dinfo.my_gid = conn->lgr->smcd->local_gid;
+ dinfo.token = conn->rmb_desc->token;
+ dinfo.peer_token = conn->peer_token;
+
+ if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
+ goto errout;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
+ struct netlink_callback *cb, int p_type)
+{
+ struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb);
+ struct net *net = sock_net(skb->sk);
+ int snum = cb_ctx->pos[p_type];
+ struct nlattr *bc = NULL;
+ struct hlist_head *head;
+ int rc = 0, num = 0;
+ struct sock *sk;
+
+ read_lock(&prot->h.smc_hash->lock);
+ head = &prot->h.smc_hash->ht;
+ if (hlist_empty(head))
+ goto out;
+
+ sk_for_each(sk, head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < snum)
+ goto next;
+ rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+ if (rc < 0)
+ goto out;
+next:
+ num++;
+ }
+
+out:
+ read_unlock(&prot->h.smc_hash->lock);
+ cb_ctx->pos[p_type] = num;
+ return rc;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int rc = 0;
+
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC);
+ if (!rc)
+ smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
+ return skb->len;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ struct net *net = sock_net(skb->sk);
+
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
+ {
+ struct netlink_dump_control c = {
+ .dump = smc_diag_dump,
+ .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+ return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+ .family = AF_SMC,
+ .dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+ return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+ sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000..f1ffbd414
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,643 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * IB infrastructure:
+ * Establish SMC-R as an Infiniband Client to be notified about added and
+ * removed IB devices of type RDMA.
+ * Determine device and port characteristics for these IB devices.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <linux/scatterlist.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+
+#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
+
+#define SMC_QP_MIN_RNR_TIMER 5
+#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.pkey_index = 0;
+ qp_attr.port_num = lnk->ibport;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+ | IB_ACCESS_REMOTE_WRITE;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_ACCESS_FLAGS | IB_QP_PORT);
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+ enum ib_qp_attr_mask qp_attr_mask =
+ IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+ qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
+ rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
+ rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
+ rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
+ memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
+ qp_attr.dest_qp_num = lnk->peer_qpn;
+ qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+ * requests
+ */
+ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
+ qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
+ qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
+ qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
+ qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
+ * atomic ops allowed
+ */
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC);
+}
+
+int smc_ib_modify_qp_error(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
+ int rc = 0;
+
+ rc = smc_ib_modify_qp_init(lnk);
+ if (rc)
+ goto out;
+
+ rc = smc_ib_modify_qp_rtr(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK);
+ if (rc)
+ goto out;
+ rc = smc_wr_rx_post_init(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+
+ if (lgr->role == SMC_SERV) {
+ rc = smc_ib_modify_qp_rts(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ }
+out:
+ return rc;
+}
+
+static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ const struct ib_gid_attr *attr;
+ int rc;
+
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
+ if (IS_ERR(attr))
+ return -ENODEV;
+
+ rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
+ rdma_put_gid_attr(attr);
+ return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+}
+
+bool smc_ib_is_valid_local_systemid(void)
+{
+ return !is_zero_ether_addr(&local_systemid[2]);
+}
+
+static void smc_ib_init_local_systemid(void)
+{
+ get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+/* determine the gid for an ib-device port and vlan id */
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+{
+ const struct ib_gid_attr *attr;
+ const struct net_device *ndev;
+ int i;
+
+ for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+ if (IS_ERR(attr))
+ continue;
+
+ rcu_read_lock();
+ ndev = rdma_read_gid_attr_ndev_rcu(attr);
+ if (!IS_ERR(ndev) &&
+ ((!vlan_id && !is_vlan_dev(ndev)) ||
+ (vlan_id && is_vlan_dev(ndev) &&
+ vlan_dev_vlan_id(ndev) == vlan_id)) &&
+ attr->gid_type == IB_GID_TYPE_ROCE) {
+ rcu_read_unlock();
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ rdma_put_gid_attr(attr);
+ return 0;
+ }
+ rcu_read_unlock();
+ rdma_put_gid_attr(attr);
+ }
+ return -ENODEV;
+}
+
+static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ /* the SMC protocol requires specification of the RoCE MAC address */
+ rc = smc_ib_fill_mac(smcibdev, ibport);
+ if (rc)
+ goto out;
+ if (!smc_ib_is_valid_local_systemid() &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+ struct smc_ib_device *smcibdev = container_of(
+ work, struct smc_ib_device, port_event_work);
+ u8 port_idx;
+
+ for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+ smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+ clear_bit(port_idx, &smcibdev->port_event_mask);
+ if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
+ set_bit(port_idx, smcibdev->ports_going_away);
+ smcr_port_err(smcibdev, port_idx + 1);
+ } else {
+ clear_bit(port_idx, smcibdev->ports_going_away);
+ smcr_port_add(smcibdev, port_idx + 1);
+ }
+ }
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+ struct ib_event *ibevent)
+{
+ struct smc_ib_device *smcibdev;
+ bool schedule = false;
+ u8 port_idx;
+
+ smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+
+ switch (ibevent->event) {
+ case IB_EVENT_DEVICE_FATAL:
+ /* terminate all ports on device */
+ for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx,
+ smcibdev->ports_going_away))
+ schedule = true;
+ }
+ if (schedule)
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ case IB_EVENT_PORT_ACTIVE:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ case IB_EVENT_GID_CHANGE:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+ if (lnk->roce_pd)
+ ib_dealloc_pd(lnk->roce_pd);
+ lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+ int rc;
+
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
+ if (IS_ERR(lnk->roce_pd))
+ lnk->roce_pd = NULL;
+ return rc;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+ struct smc_link *lnk = (struct smc_link *)priv;
+ struct smc_ib_device *smcibdev = lnk->smcibdev;
+ u8 port_idx;
+
+ switch (ibevent->event) {
+ case IB_EVENT_QP_FATAL:
+ case IB_EVENT_QP_ACCESS_ERR:
+ port_idx = ibevent->element.qp->port - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+ if (lnk->roce_qp)
+ ib_destroy_qp(lnk->roce_qp);
+ lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+ struct ib_qp_init_attr qp_attr = {
+ .event_handler = smc_ib_qp_event_handler,
+ .qp_context = lnk,
+ .send_cq = lnk->smcibdev->roce_cq_send,
+ .recv_cq = lnk->smcibdev->roce_cq_recv,
+ .srq = NULL,
+ .cap = {
+ /* include unsolicited rdma_writes as well,
+ * there are max. 2 RDMA_WRITE per 1 WR_SEND
+ */
+ .max_send_wr = SMC_WR_BUF_CNT * 3,
+ .max_recv_wr = SMC_WR_BUF_CNT * 3,
+ .max_send_sge = SMC_IB_MAX_SEND_SGE,
+ .max_recv_sge = 1,
+ },
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .qp_type = IB_QPT_RC,
+ };
+ int rc;
+
+ lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
+ if (IS_ERR(lnk->roce_qp))
+ lnk->roce_qp = NULL;
+ else
+ smc_wr_remember_qp_attr(lnk);
+ return rc;
+}
+
+void smc_ib_put_memory_region(struct ib_mr *mr)
+{
+ ib_dereg_mr(mr);
+}
+
+static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
+{
+ unsigned int offset = 0;
+ int sg_num;
+
+ /* map the largest prefix of a dma mapped SG list */
+ sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
+ buf_slot->sgt[link_idx].sgl,
+ buf_slot->sgt[link_idx].orig_nents,
+ &offset, PAGE_SIZE);
+
+ return sg_num;
+}
+
+/* Allocate a memory region and map the dma mapped SG list of buf_slot */
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct smc_buf_desc *buf_slot, u8 link_idx)
+{
+ if (buf_slot->mr_rx[link_idx])
+ return 0; /* already done */
+
+ buf_slot->mr_rx[link_idx] =
+ ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
+ if (IS_ERR(buf_slot->mr_rx[link_idx])) {
+ int rc;
+
+ rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
+ buf_slot->mr_rx[link_idx] = NULL;
+ return rc;
+ }
+
+ if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* synchronize buffer usage for cpu access */
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* for now there is just one DMA address */
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
+ buf_slot->sgt[lnk->link_idx].nents, i) {
+ if (!sg_dma_len(sg))
+ break;
+ ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
+ sg_dma_address(sg),
+ sg_dma_len(sg),
+ data_direction);
+ }
+}
+
+/* synchronize buffer usage for device access */
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* for now there is just one DMA address */
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
+ buf_slot->sgt[lnk->link_idx].nents, i) {
+ if (!sg_dma_len(sg))
+ break;
+ ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
+ sg_dma_address(sg),
+ sg_dma_len(sg),
+ data_direction);
+ }
+}
+
+/* Map a new TX or RX buffer SG-table to DMA */
+int smc_ib_buf_map_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ int mapped_nents;
+
+ mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
+ buf_slot->sgt[lnk->link_idx].sgl,
+ buf_slot->sgt[lnk->link_idx].orig_nents,
+ data_direction);
+ if (!mapped_nents)
+ return -ENOMEM;
+
+ return mapped_nents;
+}
+
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
+ return; /* already unmapped */
+
+ ib_dma_unmap_sg(lnk->smcibdev->ibdev,
+ buf_slot->sgt[lnk->link_idx].sgl,
+ buf_slot->sgt[lnk->link_idx].orig_nents,
+ data_direction);
+ buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
+}
+
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ struct ib_cq_init_attr cqattr = {
+ .cqe = SMC_MAX_CQE, .comp_vector = 0 };
+ int cqe_size_order, smc_order;
+ long rc;
+
+ mutex_lock(&smcibdev->mutex);
+ rc = 0;
+ if (smcibdev->initialized)
+ goto out;
+ /* the calculated number of cq entries fits to mlx5 cq allocation */
+ cqe_size_order = cache_line_size() == 128 ? 7 : 6;
+ smc_order = MAX_ORDER - cqe_size_order - 1;
+ if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
+ cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
+ smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+ smc_wr_tx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+ if (IS_ERR(smcibdev->roce_cq_send)) {
+ smcibdev->roce_cq_send = NULL;
+ goto out;
+ }
+ smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+ smc_wr_rx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+ if (IS_ERR(smcibdev->roce_cq_recv)) {
+ smcibdev->roce_cq_recv = NULL;
+ goto err;
+ }
+ smc_wr_add_dev(smcibdev);
+ smcibdev->initialized = 1;
+ goto out;
+
+err:
+ ib_destroy_cq(smcibdev->roce_cq_send);
+out:
+ mutex_unlock(&smcibdev->mutex);
+ return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ mutex_lock(&smcibdev->mutex);
+ if (!smcibdev->initialized)
+ goto out;
+ smcibdev->initialized = 0;
+ ib_destroy_cq(smcibdev->roce_cq_recv);
+ ib_destroy_cq(smcibdev->roce_cq_send);
+ smc_wr_remove_dev(smcibdev);
+out:
+ mutex_unlock(&smcibdev->mutex);
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static int smc_ib_add_dev(struct ib_device *ibdev)
+{
+ struct smc_ib_device *smcibdev;
+ u8 port_cnt;
+ int i;
+
+ if (ibdev->node_type != RDMA_NODE_IB_CA)
+ return -EOPNOTSUPP;
+
+ smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+ if (!smcibdev)
+ return -ENOMEM;
+
+ smcibdev->ibdev = ibdev;
+ INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+ atomic_set(&smcibdev->lnk_cnt, 0);
+ init_waitqueue_head(&smcibdev->lnks_deleted);
+ mutex_init(&smcibdev->mutex);
+ mutex_lock(&smc_ib_devices.mutex);
+ list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+ mutex_unlock(&smc_ib_devices.mutex);
+ ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+
+ /* trigger reading of the port attributes */
+ port_cnt = smcibdev->ibdev->phys_port_cnt;
+ pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
+ smcibdev->ibdev->name, port_cnt);
+ for (i = 0;
+ i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
+ i++) {
+ set_bit(i, &smcibdev->port_event_mask);
+ /* determine pnetids of the port */
+ if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
+ smcibdev->pnetid[i]))
+ smc_pnetid_by_table_ib(smcibdev, i + 1);
+ pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
+ "%.16s%s\n",
+ smcibdev->ibdev->name, i + 1,
+ smcibdev->pnetid[i],
+ smcibdev->pnetid_by_user[i] ?
+ " (user defined)" :
+ "");
+ }
+ schedule_work(&smcibdev->port_event_work);
+ return 0;
+}
+
+/* callback function for ib_unregister_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+ struct smc_ib_device *smcibdev = client_data;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ mutex_unlock(&smc_ib_devices.mutex);
+ pr_warn_ratelimited("smc: removing ib device %s\n",
+ smcibdev->ibdev->name);
+ smc_smcr_terminate_all(smcibdev);
+ smc_ib_cleanup_per_ibdev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler);
+ cancel_work_sync(&smcibdev->port_event_work);
+ kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+ .name = "smc_ib",
+ .add = smc_ib_add_dev,
+ .remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+ smc_ib_init_local_systemid();
+ return ib_register_client(&smc_ib_client);
+}
+
+void smc_ib_unregister_client(void)
+{
+ ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000..f90d15eae
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for IB environment
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <net/smc.h>
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+#define SMC_GID_SIZE sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE 2
+
+struct smc_ib_devices { /* list of smc ib devices definition */
+ struct list_head list;
+ struct mutex mutex; /* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device { /* ib-device infos for smc */
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
+ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][ETH_ALEN];
+ /* mac address per port*/
+ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
+ /* pnetid per port */
+ bool pnetid_by_user[SMC_MAX_PORTS];
+ /* pnetid defined by user? */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ struct work_struct port_event_work;
+ unsigned long port_event_mask;
+ DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
+ atomic_t lnk_cnt; /* number of links on ibdev */
+ wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
+ struct mutex mutex; /* protect dev setup+cleanup */
+};
+
+struct smc_buf_desc;
+struct smc_link;
+
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_reset(struct smc_link *lnk);
+int smc_ib_modify_qp_error(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct smc_buf_desc *buf_slot, u8 link_idx);
+void smc_ib_put_memory_region(struct ib_mr *mr);
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index);
+bool smc_ib_is_valid_local_systemid(void);
+#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000..8e33c0128
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * Functions for ISM device.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+
+struct smcd_dev_list smcd_dev_list = {
+ .list = LIST_HEAD_INIT(smcd_dev_list.list),
+ .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex)
+};
+
+bool smc_ism_v2_capable;
+
+/* Test if an ISM communication is possible - same CPC */
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
+{
+ return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
+ vlan_id);
+}
+
+int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
+ void *data, size_t len)
+{
+ int rc;
+
+ rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal,
+ pos->offset, data, len);
+
+ return rc < 0 ? rc : 0;
+}
+
+void smc_ism_get_system_eid(struct smcd_dev *smcd, u8 **eid)
+{
+ smcd->ops->get_system_eid(smcd, eid);
+}
+
+u16 smc_ism_get_chid(struct smcd_dev *smcd)
+{
+ return smcd->ops->get_chid(smcd);
+}
+
+/* Set a connection using this DMBE. */
+void smc_ism_set_conn(struct smc_connection *conn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
+ conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn;
+ spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
+}
+
+/* Unset a connection using this DMBE. */
+void smc_ism_unset_conn(struct smc_connection *conn)
+{
+ unsigned long flags;
+
+ if (!conn->rmb_desc)
+ return;
+
+ spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
+ conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL;
+ spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
+}
+
+/* Register a VLAN identifier with the ISM device. Use a reference count
+ * and add a VLAN identifier only when the first DMB using this VLAN is
+ * registered.
+ */
+int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *new_vlan, *vlan;
+ unsigned long flags;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ /* create new vlan entry, in case we need it */
+ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
+ if (!new_vlan)
+ return -ENOMEM;
+ new_vlan->vlanid = vlanid;
+ refcount_set(&new_vlan->refcnt, 1);
+
+ /* if there is an existing entry, increase count and return */
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ refcount_inc(&vlan->refcnt);
+ kfree(new_vlan);
+ goto out;
+ }
+ }
+
+ /* no existing entry found.
+ * add new entry to device; might fail, e.g., if HW limit reached
+ */
+ if (smcd->ops->add_vlan_id(smcd, vlanid)) {
+ kfree(new_vlan);
+ rc = -EIO;
+ goto out;
+ }
+ list_add_tail(&new_vlan->list, &smcd->vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+/* Unregister a VLAN identifier with the ISM device. Use a reference count
+ * and remove a VLAN identifier only when the last DMB using this VLAN is
+ * unregistered.
+ */
+int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *vlan;
+ unsigned long flags;
+ bool found = false;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ if (!refcount_dec_and_test(&vlan->refcnt))
+ goto out;
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rc = -ENOENT;
+ goto out; /* VLAN id not in table */
+ }
+
+ /* Found and the last reference just gone */
+ if (smcd->ops->del_vlan_id(smcd, vlanid))
+ rc = -EIO;
+ list_del(&vlan->list);
+ kfree(vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
+{
+ struct smcd_dmb dmb;
+ int rc = 0;
+
+ if (!dmb_desc->dma_addr)
+ return rc;
+
+ memset(&dmb, 0, sizeof(dmb));
+ dmb.dmb_tok = dmb_desc->token;
+ dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.cpu_addr = dmb_desc->cpu_addr;
+ dmb.dma_addr = dmb_desc->dma_addr;
+ dmb.dmb_len = dmb_desc->len;
+ rc = smcd->ops->unregister_dmb(smcd, &dmb);
+ if (!rc || rc == ISM_ERROR) {
+ dmb_desc->cpu_addr = NULL;
+ dmb_desc->dma_addr = 0;
+ }
+
+ return rc;
+}
+
+int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
+ struct smc_buf_desc *dmb_desc)
+{
+ struct smcd_dmb dmb;
+ int rc;
+
+ memset(&dmb, 0, sizeof(dmb));
+ dmb.dmb_len = dmb_len;
+ dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.vlan_id = lgr->vlan_id;
+ dmb.rgid = lgr->peer_gid;
+ rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb);
+ if (!rc) {
+ dmb_desc->sba_idx = dmb.sba_idx;
+ dmb_desc->token = dmb.dmb_tok;
+ dmb_desc->cpu_addr = dmb.cpu_addr;
+ dmb_desc->dma_addr = dmb.dma_addr;
+ dmb_desc->len = dmb.dmb_len;
+ }
+ return rc;
+}
+
+struct smc_ism_event_work {
+ struct work_struct work;
+ struct smcd_dev *smcd;
+ struct smcd_event event;
+};
+
+#define ISM_EVENT_REQUEST 0x0001
+#define ISM_EVENT_RESPONSE 0x0002
+#define ISM_EVENT_REQUEST_IR 0x00000001
+#define ISM_EVENT_CODE_SHUTDOWN 0x80
+#define ISM_EVENT_CODE_TESTLINK 0x83
+
+union smcd_sw_event_info {
+ u64 info;
+ struct {
+ u8 uid[SMC_LGR_ID_SIZE];
+ unsigned short vlan_id;
+ u16 code;
+ };
+};
+
+static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
+{
+ union smcd_sw_event_info ev_info;
+
+ ev_info.info = wrk->event.info;
+ switch (wrk->event.code) {
+ case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */
+ smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id);
+ break;
+ case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
+ if (ev_info.code == ISM_EVENT_REQUEST) {
+ ev_info.code = ISM_EVENT_RESPONSE;
+ wrk->smcd->ops->signal_event(wrk->smcd,
+ wrk->event.tok,
+ ISM_EVENT_REQUEST_IR,
+ ISM_EVENT_CODE_TESTLINK,
+ ev_info.info);
+ }
+ break;
+ }
+}
+
+int smc_ism_signal_shutdown(struct smc_link_group *lgr)
+{
+ int rc;
+ union smcd_sw_event_info ev_info;
+
+ if (lgr->peer_shutdown)
+ return 0;
+
+ memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
+ ev_info.vlan_id = lgr->vlan_id;
+ ev_info.code = ISM_EVENT_REQUEST;
+ rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid,
+ ISM_EVENT_REQUEST_IR,
+ ISM_EVENT_CODE_SHUTDOWN,
+ ev_info.info);
+ return rc;
+}
+
+/* worker for SMC-D events */
+static void smc_ism_event_work(struct work_struct *work)
+{
+ struct smc_ism_event_work *wrk =
+ container_of(work, struct smc_ism_event_work, work);
+
+ switch (wrk->event.type) {
+ case ISM_EVENT_GID: /* GID event, token is peer GID */
+ smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK);
+ break;
+ case ISM_EVENT_DMB:
+ break;
+ case ISM_EVENT_SWR: /* Software defined event */
+ smcd_handle_sw_event(wrk);
+ break;
+ }
+ kfree(wrk);
+}
+
+static void smcd_release(struct device *dev)
+{
+ struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev);
+
+ kfree(smcd->conn);
+ kfree(smcd);
+}
+
+struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
+ const struct smcd_ops *ops, int max_dmbs)
+{
+ struct smcd_dev *smcd;
+
+ smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
+ if (!smcd)
+ return NULL;
+ smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
+ GFP_KERNEL);
+ if (!smcd->conn) {
+ kfree(smcd);
+ return NULL;
+ }
+
+ smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
+ WQ_MEM_RECLAIM, name);
+ if (!smcd->event_wq) {
+ kfree(smcd->conn);
+ kfree(smcd);
+ return NULL;
+ }
+
+ smcd->dev.parent = parent;
+ smcd->dev.release = smcd_release;
+ device_initialize(&smcd->dev);
+ dev_set_name(&smcd->dev, name);
+ smcd->ops = ops;
+ if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid))
+ smc_pnetid_by_table_smcd(smcd);
+
+ spin_lock_init(&smcd->lock);
+ spin_lock_init(&smcd->lgr_lock);
+ INIT_LIST_HEAD(&smcd->vlan);
+ INIT_LIST_HEAD(&smcd->lgr_list);
+ init_waitqueue_head(&smcd->lgrs_deleted);
+ return smcd;
+}
+EXPORT_SYMBOL_GPL(smcd_alloc_dev);
+
+int smcd_register_dev(struct smcd_dev *smcd)
+{
+ int rc;
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (list_empty(&smcd_dev_list.list)) {
+ u8 *system_eid = NULL;
+
+ smc_ism_get_system_eid(smcd, &system_eid);
+ if (system_eid[24] != '0' || system_eid[28] != '0')
+ smc_ism_v2_capable = true;
+ }
+ /* sort list: devices without pnetid before devices with pnetid */
+ if (smcd->pnetid[0])
+ list_add_tail(&smcd->list, &smcd_dev_list.list);
+ else
+ list_add(&smcd->list, &smcd_dev_list.list);
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
+ dev_name(&smcd->dev), smcd->pnetid,
+ smcd->pnetid_by_user ? " (user defined)" : "");
+
+ rc = device_add(&smcd->dev);
+ if (rc) {
+ mutex_lock(&smcd_dev_list.mutex);
+ list_del(&smcd->list);
+ mutex_unlock(&smcd_dev_list.mutex);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(smcd_register_dev);
+
+void smcd_unregister_dev(struct smcd_dev *smcd)
+{
+ pr_warn_ratelimited("smc: removing smcd device %s\n",
+ dev_name(&smcd->dev));
+ mutex_lock(&smcd_dev_list.mutex);
+ list_del_init(&smcd->list);
+ mutex_unlock(&smcd_dev_list.mutex);
+ smcd->going_away = 1;
+ smc_smcd_terminate_all(smcd);
+ flush_workqueue(smcd->event_wq);
+ destroy_workqueue(smcd->event_wq);
+
+ device_del(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_unregister_dev);
+
+void smcd_free_dev(struct smcd_dev *smcd)
+{
+ put_device(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_free_dev);
+
+/* SMCD Device event handler. Called from ISM device interrupt handler.
+ * Parameters are smcd device pointer,
+ * - event->type (0 --> DMB, 1 --> GID),
+ * - event->code (event code),
+ * - event->tok (either DMB token when event type 0, or GID when event type 1)
+ * - event->time (time of day)
+ * - event->info (debug info).
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver event handler.
+ */
+void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
+{
+ struct smc_ism_event_work *wrk;
+
+ if (smcd->going_away)
+ return;
+ /* copy event to event work queue, and let it be handled there */
+ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
+ if (!wrk)
+ return;
+ INIT_WORK(&wrk->work, smc_ism_event_work);
+ wrk->smcd = smcd;
+ wrk->event = *event;
+ queue_work(smcd->event_wq, &wrk->work);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_event);
+
+/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
+ * Parameters are smcd device pointer and DMB number. Find the connection and
+ * schedule the tasklet for this connection.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver IRQ handler.
+ */
+void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
+{
+ struct smc_connection *conn = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&smcd->lock, flags);
+ conn = smcd->conn[dmbno];
+ if (conn && !conn->killed)
+ tasklet_schedule(&conn->rx_tsklet);
+ spin_unlock_irqrestore(&smcd->lock, flags);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_irq);
+
+void __init smc_ism_init(void)
+{
+ smc_ism_v2_capable = false;
+}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000..8048e09dd
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * SMC-D ISM device structure definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMCD_ISM_H
+#define SMCD_ISM_H
+
+#include <linux/uio.h>
+#include <linux/mutex.h>
+
+#include "smc.h"
+
+struct smcd_dev_list { /* List of SMCD devices */
+ struct list_head list;
+ struct mutex mutex; /* Protects list of devices */
+};
+
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+extern bool smc_ism_v2_capable; /* HW supports ISM V2 and thus
+ * System EID is defined
+ */
+
+struct smc_ism_vlanid { /* VLAN id set on ISM device */
+ struct list_head list;
+ unsigned short vlanid; /* Vlan id */
+ refcount_t refcnt; /* Reference count */
+};
+
+struct smc_ism_position { /* ISM device position to write to */
+ u64 token; /* Token of DMB */
+ u32 offset; /* Offset into DMBE */
+ u8 index; /* Index of DMBE */
+ u8 signal; /* Generate interrupt on owner side */
+};
+
+struct smcd_dev;
+
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
+void smc_ism_set_conn(struct smc_connection *conn);
+void smc_ism_unset_conn(struct smc_connection *conn);
+int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
+ struct smc_buf_desc *dmb_desc);
+int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
+ void *data, size_t len);
+int smc_ism_signal_shutdown(struct smc_link_group *lgr);
+void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid);
+u16 smc_ism_get_chid(struct smcd_dev *dev);
+void smc_ism_init(void);
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000..d5ee961ca
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,1974 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Link Layer Control (LLC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_pnet.h"
+
+#define SMC_LLC_DATA_LEN 40
+
+struct smc_llc_hdr {
+ struct smc_wr_rx_hdr common;
+ u8 length; /* 44 */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved:4,
+ add_link_rej_rsn:4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 add_link_rej_rsn:4,
+ reserved:4;
+#endif
+ u8 flags;
+};
+
+#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03
+
+struct smc_llc_msg_confirm_link { /* type 0x01 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+ u8 link_uid[SMC_LGR_ID_SIZE];
+ u8 max_links;
+ u8 reserved[9];
+};
+
+#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40
+#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1
+
+#define SMC_LLC_ADD_LNK_MAX_LINKS 2
+
+struct smc_llc_msg_add_link { /* type 0x02 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 reserved2[2];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ qp_mtu : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ reserved3 : 4;
+#endif
+ u8 initial_psn[3];
+ u8 reserved[8];
+};
+
+struct smc_llc_msg_add_link_cont_rt {
+ __be32 rmb_key;
+ __be32 rmb_key_new;
+ __be64 rmb_vaddr_new;
+};
+
+#define SMC_LLC_RKEYS_PER_CONT_MSG 2
+
+struct smc_llc_msg_add_link_cont { /* type 0x03 */
+ struct smc_llc_hdr hd;
+ u8 link_num;
+ u8 num_rkeys;
+ u8 reserved2[2];
+ struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG];
+ u8 reserved[4];
+} __packed; /* format defined in RFC7609 */
+
+#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40
+#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20
+
+struct smc_llc_msg_del_link { /* type 0x04 */
+ struct smc_llc_hdr hd;
+ u8 link_num;
+ __be32 reason;
+ u8 reserved[35];
+} __packed; /* format defined in RFC7609 */
+
+struct smc_llc_msg_test_link { /* type 0x07 */
+ struct smc_llc_hdr hd;
+ u8 user_data[16];
+ u8 reserved[24];
+};
+
+struct smc_rmb_rtoken {
+ union {
+ u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */
+ /* is actually the num of rtokens, first */
+ /* rtoken is always for the current link */
+ u8 link_id; /* link id of the rtoken */
+ };
+ __be32 rmb_key;
+ __be64 rmb_vaddr;
+} __packed; /* format defined in RFC7609 */
+
+#define SMC_LLC_RKEYS_PER_MSG 3
+
+struct smc_llc_msg_confirm_rkey { /* type 0x06 */
+ struct smc_llc_hdr hd;
+ struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
+ u8 reserved;
+};
+
+#define SMC_LLC_DEL_RKEY_MAX 8
+#define SMC_LLC_FLAG_RKEY_RETRY 0x10
+#define SMC_LLC_FLAG_RKEY_NEG 0x20
+
+struct smc_llc_msg_delete_rkey { /* type 0x09 */
+ struct smc_llc_hdr hd;
+ u8 num_rkeys;
+ u8 err_mask;
+ u8 reserved[2];
+ __be32 rkey[8];
+ u8 reserved2[4];
+};
+
+union smc_llc_msg {
+ struct smc_llc_msg_confirm_link confirm_link;
+ struct smc_llc_msg_add_link add_link;
+ struct smc_llc_msg_add_link_cont add_link_cont;
+ struct smc_llc_msg_del_link delete_link;
+
+ struct smc_llc_msg_confirm_rkey confirm_rkey;
+ struct smc_llc_msg_delete_rkey delete_rkey;
+
+ struct smc_llc_msg_test_link test_link;
+ struct {
+ struct smc_llc_hdr hdr;
+ u8 data[SMC_LLC_DATA_LEN];
+ } raw;
+};
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+struct smc_llc_qentry {
+ struct list_head list;
+ struct smc_link *link;
+ union smc_llc_msg msg;
+};
+
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc);
+
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow)
+{
+ struct smc_llc_qentry *qentry = flow->qentry;
+
+ flow->qentry = NULL;
+ return qentry;
+}
+
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow)
+{
+ struct smc_llc_qentry *qentry;
+
+ if (flow->qentry) {
+ qentry = flow->qentry;
+ flow->qentry = NULL;
+ kfree(qentry);
+ }
+}
+
+static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow,
+ struct smc_llc_qentry *qentry)
+{
+ flow->qentry = qentry;
+}
+
+static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
+ struct smc_llc_qentry *qentry)
+{
+ u8 msg_type = qentry->msg.raw.hdr.common.type;
+
+ if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) &&
+ flow_type != msg_type && !lgr->delayed_event) {
+ lgr->delayed_event = qentry;
+ return;
+ }
+ /* drop parallel or already-in-progress llc requests */
+ if (flow_type != msg_type)
+ pr_warn_once("smc: SMC-R lg %*phN dropped parallel "
+ "LLC msg: msg %d flow %d role %d\n",
+ SMC_LGR_ID_SIZE, &lgr->id,
+ qentry->msg.raw.hdr.common.type,
+ flow_type, lgr->role);
+ kfree(qentry);
+}
+
+/* try to start a new llc flow, initiated by an incoming llc msg */
+static bool smc_llc_flow_start(struct smc_llc_flow *flow,
+ struct smc_llc_qentry *qentry)
+{
+ struct smc_link_group *lgr = qentry->link->lgr;
+
+ spin_lock_bh(&lgr->llc_flow_lock);
+ if (flow->type) {
+ /* a flow is already active */
+ smc_llc_flow_parallel(lgr, flow->type, qentry);
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return false;
+ }
+ switch (qentry->msg.raw.hdr.common.type) {
+ case SMC_LLC_ADD_LINK:
+ flow->type = SMC_LLC_FLOW_ADD_LINK;
+ break;
+ case SMC_LLC_DELETE_LINK:
+ flow->type = SMC_LLC_FLOW_DEL_LINK;
+ break;
+ case SMC_LLC_CONFIRM_RKEY:
+ case SMC_LLC_DELETE_RKEY:
+ flow->type = SMC_LLC_FLOW_RKEY;
+ break;
+ default:
+ flow->type = SMC_LLC_FLOW_NONE;
+ }
+ smc_llc_flow_qentry_set(flow, qentry);
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return true;
+}
+
+/* start a new local llc flow, wait till current flow finished */
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+ enum smc_llc_flowtype type)
+{
+ enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE;
+ int rc;
+
+ /* all flows except confirm_rkey and delete_rkey are exclusive,
+ * confirm/delete rkey flows can run concurrently (local and remote)
+ */
+ if (type == SMC_LLC_FLOW_RKEY)
+ allowed_remote = SMC_LLC_FLOW_RKEY;
+again:
+ if (list_empty(&lgr->list))
+ return -ENODEV;
+ spin_lock_bh(&lgr->llc_flow_lock);
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+ (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+ lgr->llc_flow_rmt.type == allowed_remote)) {
+ lgr->llc_flow_lcl.type = type;
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return 0;
+ }
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) ||
+ (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+ (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+ lgr->llc_flow_rmt.type == allowed_remote))),
+ SMC_LLC_WAIT_TIME * 10);
+ if (!rc)
+ return -ETIMEDOUT;
+ goto again;
+}
+
+/* finish the current llc flow */
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow)
+{
+ spin_lock_bh(&lgr->llc_flow_lock);
+ memset(flow, 0, sizeof(*flow));
+ flow->type = SMC_LLC_FLOW_NONE;
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ if (!list_empty(&lgr->list) && lgr->delayed_event &&
+ flow == &lgr->llc_flow_lcl)
+ schedule_work(&lgr->llc_event_work);
+ else
+ wake_up(&lgr->llc_flow_waiter);
+}
+
+/* lnk is optional and used for early wakeup when link goes down, useful in
+ * cases where we wait for a response on the link after we sent a request
+ */
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+ struct smc_link *lnk,
+ int time_out, u8 exp_msg)
+{
+ struct smc_llc_flow *flow = &lgr->llc_flow_lcl;
+ u8 rcv_msg;
+
+ wait_event_timeout(lgr->llc_msg_waiter,
+ (flow->qentry ||
+ (lnk && !smc_link_usable(lnk)) ||
+ list_empty(&lgr->list)),
+ time_out);
+ if (!flow->qentry ||
+ (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) {
+ smc_llc_flow_qentry_del(flow);
+ goto out;
+ }
+ rcv_msg = flow->qentry->msg.raw.hdr.common.type;
+ if (exp_msg && rcv_msg != exp_msg) {
+ if (exp_msg == SMC_LLC_ADD_LINK &&
+ rcv_msg == SMC_LLC_DELETE_LINK) {
+ /* flow_start will delay the unexpected msg */
+ smc_llc_flow_start(&lgr->llc_flow_lcl,
+ smc_llc_flow_qentry_clr(flow));
+ return NULL;
+ }
+ pr_warn_once("smc: SMC-R lg %*phN dropped unexpected LLC msg: "
+ "msg %d exp %d flow %d role %d flags %x\n",
+ SMC_LGR_ID_SIZE, &lgr->id, rcv_msg, exp_msg,
+ flow->type, lgr->role,
+ flow->qentry->msg.raw.hdr.flags);
+ smc_llc_flow_qentry_del(flow);
+ }
+out:
+ return flow->qentry;
+}
+
+/********************************** send *************************************/
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ /* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ * It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL,
+ pend);
+ if (rc < 0)
+ return rc;
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+ return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_llc_msg_confirm_link *confllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+ memset(confllc, 0, sizeof(*confllc));
+ confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+ confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+ confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
+ if (reqresp == SMC_LLC_RESP)
+ confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
+ hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+ confllc->link_num = link->link_id;
+ memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE);
+ confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS;
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* send LLC confirm rkey request */
+static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_llc_msg_confirm_rkey *rkeyllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ struct smc_link *link;
+ int i, rc, rtok_ix;
+
+ if (!smc_wr_tx_link_hold(send_link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
+ memset(rkeyllc, 0, sizeof(*rkeyllc));
+ rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
+ rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
+
+ rtok_ix = 1;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ link = &send_link->lgr->lnk[i];
+ if (smc_link_active(link) && link != send_link) {
+ rkeyllc->rtoken[rtok_ix].link_id = link->link_id;
+ rkeyllc->rtoken[rtok_ix].rmb_key =
+ htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
+ rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64(
+ (u64)sg_dma_address(
+ rmb_desc->sgt[link->link_idx].sgl));
+ rtok_ix++;
+ }
+ }
+ /* rkey of send_link is in rtoken[0] */
+ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1;
+ rkeyllc->rtoken[0].rmb_key =
+ htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey);
+ rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
+ (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl));
+ /* send llc message */
+ rc = smc_wr_tx_send(send_link, pend);
+put_out:
+ smc_wr_tx_link_put(send_link);
+ return rc;
+}
+
+/* send LLC delete rkey request */
+static int smc_llc_send_delete_rkey(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_llc_msg_delete_rkey *rkeyllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
+ memset(rkeyllc, 0, sizeof(*rkeyllc));
+ rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
+ rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey);
+ rkeyllc->num_rkeys = 1;
+ rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* send ADD LINK request or response */
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+ struct smc_link *link_new,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_llc_msg_add_link *addllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ addllc = (struct smc_llc_msg_add_link *)wr_buf;
+
+ memset(addllc, 0, sizeof(*addllc));
+ addllc->hd.common.type = SMC_LLC_ADD_LINK;
+ addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+ if (reqresp == SMC_LLC_RESP)
+ addllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(addllc->sender_mac, mac, ETH_ALEN);
+ memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+ if (link_new) {
+ addllc->link_num = link_new->link_id;
+ hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num);
+ hton24(addllc->initial_psn, link_new->psn_initial);
+ if (reqresp == SMC_LLC_REQ)
+ addllc->qp_mtu = link_new->path_mtu;
+ else
+ addllc->qp_mtu = min(link_new->path_mtu,
+ link_new->peer_mtu);
+ }
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* send DELETE LINK request or response */
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+ enum smc_llc_reqresp reqresp, bool orderly,
+ u32 reason)
+{
+ struct smc_llc_msg_del_link *delllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ delllc = (struct smc_llc_msg_del_link *)wr_buf;
+
+ memset(delllc, 0, sizeof(*delllc));
+ delllc->hd.common.type = SMC_LLC_DELETE_LINK;
+ delllc->hd.length = sizeof(struct smc_llc_msg_del_link);
+ if (reqresp == SMC_LLC_RESP)
+ delllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ if (orderly)
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ if (link_del_id)
+ delllc->link_num = link_del_id;
+ else
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+ delllc->reason = htonl(reason);
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* send LLC test link request */
+static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
+{
+ struct smc_llc_msg_test_link *testllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ testllc = (struct smc_llc_msg_test_link *)wr_buf;
+ memset(testllc, 0, sizeof(*testllc));
+ testllc->hd.common.type = SMC_LLC_TEST_LINK;
+ testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
+ memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* schedule an llc send on link, may wait for buffers */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf)
+{
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/* schedule an llc send on link, may wait for buffers,
+ * and wait for send completion notification.
+ * @return 0 on success
+ */
+static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf)
+{
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
+ rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+/********************************* receive ***********************************/
+
+static int smc_llc_alloc_alt_link(struct smc_link_group *lgr,
+ enum smc_lgr_type lgr_new_t)
+{
+ int i;
+
+ if (lgr->type == SMC_LGR_SYMMETRIC ||
+ (lgr->type != SMC_LGR_SINGLE &&
+ (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)))
+ return -EMLINK;
+
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) {
+ for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--)
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED)
+ return i;
+ } else {
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED)
+ return i;
+ }
+ return -EMLINK;
+}
+
+/* return first buffer from any of the next buf lists */
+static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
+ int *buf_lst)
+{
+ struct smc_buf_desc *buf_pos;
+
+ while (*buf_lst < SMC_RMBE_SIZES) {
+ buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
+ struct smc_buf_desc, list);
+ if (buf_pos)
+ return buf_pos;
+ (*buf_lst)++;
+ }
+ return NULL;
+}
+
+/* return next rmb from buffer lists */
+static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
+ int *buf_lst,
+ struct smc_buf_desc *buf_pos)
+{
+ struct smc_buf_desc *buf_next;
+
+ if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) {
+ (*buf_lst)++;
+ return _smc_llc_get_next_rmb(lgr, buf_lst);
+ }
+ buf_next = list_next_entry(buf_pos, list);
+ return buf_next;
+}
+
+static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
+ int *buf_lst)
+{
+ *buf_lst = 0;
+ return smc_llc_get_next_rmb(lgr, buf_lst, NULL);
+}
+
+/* send one add_link_continue msg */
+static int smc_llc_add_link_cont(struct smc_link *link,
+ struct smc_link *link_new, u8 *num_rkeys_todo,
+ int *buf_lst, struct smc_buf_desc **buf_pos)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ int prim_lnk_idx, lnk_idx, i, rc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ struct smc_buf_desc *rmb;
+ u8 n;
+
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf;
+ memset(addc_llc, 0, sizeof(*addc_llc));
+
+ prim_lnk_idx = link->link_idx;
+ lnk_idx = link_new->link_idx;
+ addc_llc->link_num = link_new->link_id;
+ addc_llc->num_rkeys = *num_rkeys_todo;
+ n = *num_rkeys_todo;
+ for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) {
+ while (*buf_pos && !(*buf_pos)->used)
+ *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
+ if (!*buf_pos) {
+ addc_llc->num_rkeys = addc_llc->num_rkeys -
+ *num_rkeys_todo;
+ *num_rkeys_todo = 0;
+ break;
+ }
+ rmb = *buf_pos;
+
+ addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey);
+ addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey);
+ addc_llc->rt[i].rmb_vaddr_new =
+ cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
+
+ (*num_rkeys_todo)--;
+ *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
+ }
+ addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT;
+ addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
+ if (lgr->role == SMC_CLNT)
+ addc_llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+static int smc_llc_cli_rkey_exchange(struct smc_link *link,
+ struct smc_link *link_new)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ u8 max, num_rkeys_send, num_rkeys_recv;
+ struct smc_llc_qentry *qentry;
+ struct smc_buf_desc *buf_pos;
+ int buf_lst;
+ int rc = 0;
+ int i;
+
+ mutex_lock(&lgr->rmbs_lock);
+ num_rkeys_send = lgr->conns_num;
+ buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+ do {
+ qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK_CONT);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ break;
+ }
+ addc_llc = &qentry->msg.add_link_cont;
+ num_rkeys_recv = addc_llc->num_rkeys;
+ max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+ for (i = 0; i < max; i++) {
+ smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+ addc_llc->rt[i].rmb_key,
+ addc_llc->rt[i].rmb_vaddr_new,
+ addc_llc->rt[i].rmb_key_new);
+ num_rkeys_recv--;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
+ &buf_lst, &buf_pos);
+ if (rc)
+ break;
+ } while (num_rkeys_send || num_rkeys_recv);
+
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+}
+
+/* prepare and send an add link reject response */
+static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry)
+{
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
+ qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+ return smc_llc_send_message(qentry->link, &qentry->msg);
+}
+
+static int smc_llc_cli_conf_link(struct smc_link *link,
+ struct smc_init_info *ini,
+ struct smc_link *link_new,
+ enum smc_lgr_type lgr_new_t)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc = 0;
+
+ /* receive CONFIRM LINK request over RoCE fabric */
+ qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0);
+ if (!qentry) {
+ rc = smc_llc_send_delete_link(link, link_new->link_id,
+ SMC_LLC_REQ, false,
+ SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
+ /* received DELETE_LINK instead */
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return -ENOLINK;
+ }
+ smc_llc_save_peer_uid(qentry);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+ rc = smc_ib_modify_qp_rts(link_new);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ smc_wr_remember_qp_attr(link_new);
+
+ rc = smcr_buf_reg_lgr(link_new);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ smc_llc_link_active(link_new);
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
+ smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+ else
+ smcr_lgr_set_type(lgr, lgr_new_t);
+ return 0;
+}
+
+static void smc_llc_save_add_link_info(struct smc_link *link,
+ struct smc_llc_msg_add_link *add_llc)
+{
+ link->peer_qpn = ntoh24(add_llc->sender_qp_num);
+ memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN);
+ link->peer_psn = ntoh24(add_llc->initial_psn);
+ link->peer_mtu = add_llc->qp_mtu;
+}
+
+/* as an SMC client, process an add link request */
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
+{
+ struct smc_llc_msg_add_link *llc = &qentry->msg.add_link;
+ enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
+ struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_link *lnk_new = NULL;
+ struct smc_init_info ini;
+ int lnk_idx, rc = 0;
+
+ if (!llc->qp_mtu)
+ goto out_reject;
+
+ ini.vlan_id = lgr->vlan_id;
+ smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
+ if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+ !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) {
+ if (!ini.ib_dev)
+ goto out_reject;
+ lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
+ }
+ if (!ini.ib_dev) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini.ib_dev = link->smcibdev;
+ ini.ib_port = link->ibport;
+ }
+ lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
+ if (lnk_idx < 0)
+ goto out_reject;
+ lnk_new = &lgr->lnk[lnk_idx];
+ rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini);
+ if (rc)
+ goto out_reject;
+ smc_llc_save_add_link_info(lnk_new, llc);
+ lnk_new->link_id = llc->link_num; /* SMC server assigns link id */
+ smc_llc_link_set_uid(lnk_new);
+
+ rc = smc_ib_ready_link(lnk_new);
+ if (rc)
+ goto out_clear_lnk;
+
+ rc = smcr_buf_map_lgr(lnk_new);
+ if (rc)
+ goto out_clear_lnk;
+
+ rc = smc_llc_send_add_link(link,
+ lnk_new->smcibdev->mac[ini.ib_port - 1],
+ lnk_new->gid, lnk_new, SMC_LLC_RESP);
+ if (rc)
+ goto out_clear_lnk;
+ rc = smc_llc_cli_rkey_exchange(link, lnk_new);
+ if (rc) {
+ rc = 0;
+ goto out_clear_lnk;
+ }
+ rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t);
+ if (!rc)
+ goto out;
+out_clear_lnk:
+ lnk_new->state = SMC_LNK_INACTIVE;
+ smcr_link_clear(lnk_new, false);
+out_reject:
+ smc_llc_cli_add_link_reject(qentry);
+out:
+ kfree(qentry);
+ return rc;
+}
+
+/* as an SMC client, invite server to start the add_link processing */
+static void smc_llc_cli_add_link_invite(struct smc_link *link,
+ struct smc_llc_qentry *qentry)
+{
+ struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_init_info ini;
+
+ if (lgr->type == SMC_LGR_SYMMETRIC ||
+ lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+ goto out;
+
+ ini.vlan_id = lgr->vlan_id;
+ smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
+ if (!ini.ib_dev)
+ goto out;
+
+ smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1],
+ ini.ib_gid, NULL, SMC_LLC_REQ);
+out:
+ kfree(qentry);
+}
+
+static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++)
+ if (llc->raw.data[i])
+ return false;
+ return true;
+}
+
+static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
+{
+ if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK &&
+ smc_llc_is_empty_llc_message(llc))
+ return true;
+ return false;
+}
+
+static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
+{
+ struct smc_llc_qentry *qentry;
+
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+
+ mutex_lock(&lgr->llc_conf_mutex);
+ if (smc_llc_is_local_add_link(&qentry->msg))
+ smc_llc_cli_add_link_invite(qentry->link, qentry);
+ else
+ smc_llc_cli_add_link(qentry->link, qentry);
+ mutex_unlock(&lgr->llc_conf_mutex);
+}
+
+static int smc_llc_active_link_count(struct smc_link_group *lgr)
+{
+ int i, link_count = 0;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ link_count++;
+ }
+ return link_count;
+}
+
+/* find the asymmetric link when 3 links are established */
+static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr)
+{
+ int asym_idx = -ENOENT;
+ int i, j, k;
+ bool found;
+
+ /* determine asymmetric link */
+ found = false;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ if (!smc_link_usable(&lgr->lnk[i]) ||
+ !smc_link_usable(&lgr->lnk[j]))
+ continue;
+ if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid,
+ SMC_GID_SIZE)) {
+ found = true; /* asym_lnk is i or j */
+ break;
+ }
+ }
+ if (found)
+ break;
+ }
+ if (!found)
+ goto out; /* no asymmetric link */
+ for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) {
+ if (!smc_link_usable(&lgr->lnk[k]))
+ continue;
+ if (k != i &&
+ !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid,
+ SMC_GID_SIZE)) {
+ asym_idx = i;
+ break;
+ }
+ if (k != j &&
+ !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid,
+ SMC_GID_SIZE)) {
+ asym_idx = j;
+ break;
+ }
+ }
+out:
+ return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx];
+}
+
+static void smc_llc_delete_asym_link(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk_new = NULL, *lnk_asym;
+ struct smc_llc_qentry *qentry;
+ int rc;
+
+ lnk_asym = smc_llc_find_asym_link(lgr);
+ if (!lnk_asym)
+ return; /* no asymmetric link */
+ if (!smc_link_downing(&lnk_asym->state))
+ return;
+ lnk_new = smc_switch_conns(lgr, lnk_asym, false);
+ smc_wr_tx_wait_no_pending_sends(lnk_asym);
+ if (!lnk_new)
+ goto out_free;
+ /* change flow type from ADD_LINK into DEL_LINK */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK;
+ rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ,
+ true, SMC_LLC_DEL_NO_ASYM_NEEDED);
+ if (rc) {
+ smcr_link_down_cond(lnk_new);
+ goto out_free;
+ }
+ qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_LINK);
+ if (!qentry) {
+ smcr_link_down_cond(lnk_new);
+ goto out_free;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+out_free:
+ smcr_link_clear(lnk_asym, true);
+}
+
+static int smc_llc_srv_rkey_exchange(struct smc_link *link,
+ struct smc_link *link_new)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ u8 max, num_rkeys_send, num_rkeys_recv;
+ struct smc_llc_qentry *qentry = NULL;
+ struct smc_buf_desc *buf_pos;
+ int buf_lst;
+ int rc = 0;
+ int i;
+
+ mutex_lock(&lgr->rmbs_lock);
+ num_rkeys_send = lgr->conns_num;
+ buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+ do {
+ smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
+ &buf_lst, &buf_pos);
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK_CONT);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ goto out;
+ }
+ addc_llc = &qentry->msg.add_link_cont;
+ num_rkeys_recv = addc_llc->num_rkeys;
+ max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+ for (i = 0; i < max; i++) {
+ smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+ addc_llc->rt[i].rmb_key,
+ addc_llc->rt[i].rmb_vaddr_new,
+ addc_llc->rt[i].rmb_key_new);
+ num_rkeys_recv--;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ } while (num_rkeys_send || num_rkeys_recv);
+out:
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+}
+
+static int smc_llc_srv_conf_link(struct smc_link *link,
+ struct smc_link *link_new,
+ enum smc_lgr_type lgr_new_t)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc;
+
+ /* send CONFIRM LINK request over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ);
+ if (rc)
+ return -ENOLINK;
+ /* receive CONFIRM LINK response over the RoCE fabric */
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
+ if (!qentry ||
+ qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
+ /* send DELETE LINK */
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return -ENOLINK;
+ }
+ smc_llc_save_peer_uid(qentry);
+ smc_llc_link_active(link_new);
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
+ smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+ else
+ smcr_lgr_set_type(lgr, lgr_new_t);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return 0;
+}
+
+int smc_llc_srv_add_link(struct smc_link *link)
+{
+ enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_msg_add_link *add_llc;
+ struct smc_llc_qentry *qentry = NULL;
+ struct smc_link *link_new;
+ struct smc_init_info ini;
+ int lnk_idx, rc = 0;
+
+ /* ignore client add link recommendation, start new flow */
+ ini.vlan_id = lgr->vlan_id;
+ smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
+ if (!ini.ib_dev) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini.ib_dev = link->smcibdev;
+ ini.ib_port = link->ibport;
+ }
+ lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
+ if (lnk_idx < 0)
+ return 0;
+
+ rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini);
+ if (rc)
+ return rc;
+ link_new = &lgr->lnk[lnk_idx];
+ rc = smc_llc_send_add_link(link,
+ link_new->smcibdev->mac[ini.ib_port - 1],
+ link_new->gid, link_new, SMC_LLC_REQ);
+ if (rc)
+ goto out_err;
+ /* receive ADD LINK response over the RoCE fabric */
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ goto out_err;
+ }
+ add_llc = &qentry->msg.add_link;
+ if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) {
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ rc = -ENOLINK;
+ goto out_err;
+ }
+ if (lgr->type == SMC_LGR_SINGLE &&
+ (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+ !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
+ }
+ smc_llc_save_add_link_info(link_new, add_llc);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+ rc = smc_ib_ready_link(link_new);
+ if (rc)
+ goto out_err;
+ rc = smcr_buf_map_lgr(link_new);
+ if (rc)
+ goto out_err;
+ rc = smcr_buf_reg_lgr(link_new);
+ if (rc)
+ goto out_err;
+ rc = smc_llc_srv_rkey_exchange(link, link_new);
+ if (rc)
+ goto out_err;
+ rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t);
+ if (rc)
+ goto out_err;
+ return 0;
+out_err:
+ link_new->state = SMC_LNK_INACTIVE;
+ smcr_link_clear(link_new, false);
+ return rc;
+}
+
+static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
+{
+ struct smc_link *link = lgr->llc_flow_lcl.qentry->link;
+ int rc;
+
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+ mutex_lock(&lgr->llc_conf_mutex);
+ rc = smc_llc_srv_add_link(link);
+ if (!rc && lgr->type == SMC_LGR_SYMMETRIC) {
+ /* delete any asymmetric link */
+ smc_llc_delete_asym_link(lgr);
+ }
+ mutex_unlock(&lgr->llc_conf_mutex);
+}
+
+/* enqueue a local add_link req to trigger a new add_link flow */
+void smc_llc_add_link_local(struct smc_link *link)
+{
+ struct smc_llc_msg_add_link add_llc = {};
+
+ add_llc.hd.length = sizeof(add_llc);
+ add_llc.hd.common.type = SMC_LLC_ADD_LINK;
+ /* no dev and port needed */
+ smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc);
+}
+
+/* worker to process an add link message */
+static void smc_llc_add_link_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_add_link_work);
+
+ if (list_empty(&lgr->list)) {
+ /* link group is terminating */
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ goto out;
+ }
+
+ if (lgr->role == SMC_CLNT)
+ smc_llc_process_cli_add_link(lgr);
+ else
+ smc_llc_process_srv_add_link(lgr);
+out:
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+}
+
+/* enqueue a local del_link msg to trigger a new del_link flow,
+ * called only for role SMC_SERV
+ */
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
+{
+ struct smc_llc_msg_del_link del_llc = {};
+
+ del_llc.hd.length = sizeof(del_llc);
+ del_llc.hd.common.type = SMC_LLC_DELETE_LINK;
+ del_llc.link_num = del_link_id;
+ del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH);
+ del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc);
+}
+
+static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk_del = NULL, *lnk_asym, *lnk;
+ struct smc_llc_msg_del_link *del_llc;
+ struct smc_llc_qentry *qentry;
+ int active_links;
+ int lnk_idx;
+
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+ lnk = qentry->link;
+ del_llc = &qentry->msg.delete_link;
+
+ if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+ smc_lgr_terminate_sched(lgr);
+ goto out;
+ }
+ mutex_lock(&lgr->llc_conf_mutex);
+ /* delete single link */
+ for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) {
+ if (lgr->lnk[lnk_idx].link_id != del_llc->link_num)
+ continue;
+ lnk_del = &lgr->lnk[lnk_idx];
+ break;
+ }
+ del_llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ if (!lnk_del) {
+ /* link was not found */
+ del_llc->reason = htonl(SMC_LLC_DEL_NOLNK);
+ smc_llc_send_message(lnk, &qentry->msg);
+ goto out_unlock;
+ }
+ lnk_asym = smc_llc_find_asym_link(lgr);
+
+ del_llc->reason = 0;
+ smc_llc_send_message(lnk, &qentry->msg); /* response */
+
+ if (smc_link_downing(&lnk_del->state))
+ smc_switch_conns(lgr, lnk_del, false);
+ smcr_link_clear(lnk_del, true);
+
+ active_links = smc_llc_active_link_count(lgr);
+ if (lnk_del == lnk_asym) {
+ /* expected deletion of asym link, don't change lgr state */
+ } else if (active_links == 1) {
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ } else if (!active_links) {
+ smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+ smc_lgr_terminate_sched(lgr);
+ }
+out_unlock:
+ mutex_unlock(&lgr->llc_conf_mutex);
+out:
+ kfree(qentry);
+}
+
+/* try to send a DELETE LINK ALL request on any active link,
+ * waiting for send completion
+ */
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
+{
+ struct smc_llc_msg_del_link delllc = {};
+ int i;
+
+ delllc.hd.common.type = SMC_LLC_DELETE_LINK;
+ delllc.hd.length = sizeof(delllc);
+ if (ord)
+ delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+ delllc.reason = htonl(rsn);
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_sendable(&lgr->lnk[i]))
+ continue;
+ if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc))
+ break;
+ }
+}
+
+static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_del_link *del_llc;
+ struct smc_link *lnk, *lnk_del;
+ struct smc_llc_qentry *qentry;
+ int active_links;
+ int i;
+
+ mutex_lock(&lgr->llc_conf_mutex);
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+ lnk = qentry->link;
+ del_llc = &qentry->msg.delete_link;
+
+ if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+ /* delete entire lgr */
+ smc_llc_send_link_delete_all(lgr, true, ntohl(
+ qentry->msg.delete_link.reason));
+ smc_lgr_terminate_sched(lgr);
+ goto out;
+ }
+ /* delete single link */
+ lnk_del = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].link_id == del_llc->link_num) {
+ lnk_del = &lgr->lnk[i];
+ break;
+ }
+ }
+ if (!lnk_del)
+ goto out; /* asymmetric link already deleted */
+
+ if (smc_link_downing(&lnk_del->state)) {
+ if (smc_switch_conns(lgr, lnk_del, false))
+ smc_wr_tx_wait_no_pending_sends(lnk_del);
+ }
+ if (!list_empty(&lgr->list)) {
+ /* qentry is either a request from peer (send it back to
+ * initiate the DELETE_LINK processing), or a locally
+ * enqueued DELETE_LINK request (forward it)
+ */
+ if (!smc_llc_send_message(lnk, &qentry->msg)) {
+ struct smc_llc_qentry *qentry2;
+
+ qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_LINK);
+ if (qentry2)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ }
+ }
+ smcr_link_clear(lnk_del, true);
+
+ active_links = smc_llc_active_link_count(lgr);
+ if (active_links == 1) {
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ } else if (!active_links) {
+ smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+ smc_lgr_terminate_sched(lgr);
+ }
+
+ if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) {
+ /* trigger setup of asymm alt link */
+ smc_llc_add_link_local(lnk);
+ }
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ kfree(qentry);
+}
+
+static void smc_llc_delete_link_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_del_link_work);
+
+ if (list_empty(&lgr->list)) {
+ /* link group is terminating */
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ goto out;
+ }
+
+ if (lgr->role == SMC_CLNT)
+ smc_llc_process_cli_delete_link(lgr);
+ else
+ smc_llc_process_srv_delete_link(lgr);
+out:
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+}
+
+/* process a confirm_rkey request from peer, remote flow */
+static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_confirm_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
+ int num_entries;
+ int rk_idx;
+ int i;
+
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.confirm_rkey;
+ link = qentry->link;
+
+ num_entries = llc->rtoken[0].num_rkeys;
+ /* first rkey entry is for receiving link */
+ rk_idx = smc_rtoken_add(link,
+ llc->rtoken[0].rmb_vaddr,
+ llc->rtoken[0].rmb_key);
+ if (rk_idx < 0)
+ goto out_err;
+
+ for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++)
+ smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id,
+ llc->rtoken[i].rmb_vaddr,
+ llc->rtoken[i].rmb_key);
+ /* max links is 3 so there is no need to support conf_rkey_cont msgs */
+ goto out;
+out_err:
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY;
+out:
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
+}
+
+/* process a delete_rkey request from peer, remote flow */
+static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_delete_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
+ u8 err_mask = 0;
+ int i, max;
+
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.delete_rkey;
+ link = qentry->link;
+
+ max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
+ for (i = 0; i < max; i++) {
+ if (smc_rtoken_delete(link, llc->rkey[i]))
+ err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
+ }
+ if (err_mask) {
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->err_mask = err_mask;
+ }
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
+}
+
+static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type)
+{
+ pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: "
+ "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL);
+ smc_lgr_terminate_sched(lgr);
+}
+
+/* flush the llc event queue */
+static void smc_llc_event_flush(struct smc_link_group *lgr)
+{
+ struct smc_llc_qentry *qentry, *q;
+
+ spin_lock_bh(&lgr->llc_event_q_lock);
+ list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) {
+ list_del_init(&qentry->list);
+ kfree(qentry);
+ }
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
+{
+ union smc_llc_msg *llc = &qentry->msg;
+ struct smc_link *link = qentry->link;
+ struct smc_link_group *lgr = link->lgr;
+
+ if (!smc_link_usable(link))
+ goto out;
+
+ switch (llc->raw.hdr.common.type) {
+ case SMC_LLC_TEST_LINK:
+ llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, llc);
+ break;
+ case SMC_LLC_ADD_LINK:
+ if (list_empty(&lgr->list))
+ goto out; /* lgr is terminating */
+ if (lgr->role == SMC_CLNT) {
+ if (smc_llc_is_local_add_link(llc)) {
+ if (lgr->llc_flow_lcl.type ==
+ SMC_LLC_FLOW_ADD_LINK)
+ break; /* add_link in progress */
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl,
+ qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ }
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+ qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl,
+ qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ /* as smc server, handle client suggestion */
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ case SMC_LLC_CONFIRM_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ return;
+ }
+ break;
+ case SMC_LLC_DELETE_LINK:
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* DEL LINK REQ during ADD LINK SEQ */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ schedule_work(&lgr->llc_del_link_work);
+ }
+ return;
+ case SMC_LLC_CONFIRM_RKEY:
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_conf_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
+ case SMC_LLC_CONFIRM_RKEY_CONT:
+ /* not used because max links is 3, and 3 rkeys fit into
+ * one CONFIRM_RKEY message
+ */
+ break;
+ case SMC_LLC_DELETE_RKEY:
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_delete_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
+ default:
+ smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type);
+ break;
+ }
+out:
+ kfree(qentry);
+}
+
+/* worker to process llc messages on the event queue */
+static void smc_llc_event_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_event_work);
+ struct smc_llc_qentry *qentry;
+
+ if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
+ qentry = lgr->delayed_event;
+ lgr->delayed_event = NULL;
+ if (smc_link_usable(qentry->link))
+ smc_llc_event_handler(qentry);
+ else
+ kfree(qentry);
+ }
+
+again:
+ spin_lock_bh(&lgr->llc_event_q_lock);
+ if (!list_empty(&lgr->llc_event_q)) {
+ qentry = list_first_entry(&lgr->llc_event_q,
+ struct smc_llc_qentry, list);
+ list_del_init(&qentry->list);
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+ smc_llc_event_handler(qentry);
+ goto again;
+ }
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+/* process llc responses in tasklet context */
+static void smc_llc_rx_response(struct smc_link *link,
+ struct smc_llc_qentry *qentry)
+{
+ enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
+ struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
+ u8 llc_type = qentry->msg.raw.hdr.common.type;
+
+ switch (llc_type) {
+ case SMC_LLC_TEST_LINK:
+ if (smc_link_active(link))
+ complete(&link->llc_testlink_resp);
+ break;
+ case SMC_LLC_ADD_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ case SMC_LLC_CONFIRM_LINK:
+ if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_DELETE_LINK:
+ if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY:
+ case SMC_LLC_DELETE_RKEY:
+ if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY_CONT:
+ /* not used because max links is 3 */
+ break;
+ default:
+ smc_llc_protocol_violation(link->lgr, llc_type);
+ break;
+ }
+ kfree(qentry);
+ return;
+assign:
+ /* assign responses to the local flow, we requested them */
+ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
+ wake_up(&link->lgr->llc_msg_waiter);
+}
+
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry;
+ unsigned long flags;
+
+ qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC);
+ if (!qentry)
+ return;
+ qentry->link = link;
+ INIT_LIST_HEAD(&qentry->list);
+ memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg));
+
+ /* process responses immediately */
+ if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) {
+ smc_llc_rx_response(link, qentry);
+ return;
+ }
+
+ /* add requests to event queue */
+ spin_lock_irqsave(&lgr->llc_event_q_lock, flags);
+ list_add_tail(&qentry->list, &lgr->llc_event_q);
+ spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags);
+ queue_work(system_highpri_wq, &lgr->llc_event_work);
+}
+
+/* copy received msg and add it to the event queue */
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ union smc_llc_msg *llc = buf;
+
+ if (wc->byte_len < sizeof(*llc))
+ return; /* short message */
+ if (llc->raw.hdr.length != sizeof(*llc))
+ return; /* invalid message */
+
+ smc_llc_enqueue(link, llc);
+}
+
+/***************************** worker, utils *********************************/
+
+static void smc_llc_testlink_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(to_delayed_work(work),
+ struct smc_link, llc_testlink_wrk);
+ unsigned long next_interval;
+ unsigned long expire_time;
+ u8 user_data[16] = { 0 };
+ int rc;
+
+ if (!smc_link_active(link))
+ return; /* don't reschedule worker */
+ expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
+ if (time_is_after_jiffies(expire_time)) {
+ next_interval = expire_time - jiffies;
+ goto out;
+ }
+ reinit_completion(&link->llc_testlink_resp);
+ smc_llc_send_test_link(link, user_data);
+ /* receive TEST LINK response over RoCE fabric */
+ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
+ SMC_LLC_WAIT_TIME);
+ if (!smc_link_active(link))
+ return; /* link state changed */
+ if (rc <= 0) {
+ smcr_link_down_cond_sched(link);
+ return;
+ }
+ next_interval = link->llc_testlink_time;
+out:
+ schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
+}
+
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
+{
+ struct net *net = sock_net(smc->clcsock->sk);
+
+ INIT_WORK(&lgr->llc_event_work, smc_llc_event_work);
+ INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work);
+ INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work);
+ INIT_LIST_HEAD(&lgr->llc_event_q);
+ spin_lock_init(&lgr->llc_event_q_lock);
+ spin_lock_init(&lgr->llc_flow_lock);
+ init_waitqueue_head(&lgr->llc_flow_waiter);
+ init_waitqueue_head(&lgr->llc_msg_waiter);
+ mutex_init(&lgr->llc_conf_mutex);
+ lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
+}
+
+/* called after lgr was removed from lgr_list */
+void smc_llc_lgr_clear(struct smc_link_group *lgr)
+{
+ smc_llc_event_flush(lgr);
+ wake_up_all(&lgr->llc_flow_waiter);
+ wake_up_all(&lgr->llc_msg_waiter);
+ cancel_work_sync(&lgr->llc_event_work);
+ cancel_work_sync(&lgr->llc_add_link_work);
+ cancel_work_sync(&lgr->llc_del_link_work);
+ if (lgr->delayed_event) {
+ kfree(lgr->delayed_event);
+ lgr->delayed_event = NULL;
+ }
+}
+
+int smc_llc_link_init(struct smc_link *link)
+{
+ init_completion(&link->llc_testlink_resp);
+ INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+ return 0;
+}
+
+void smc_llc_link_active(struct smc_link *link)
+{
+ pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, "
+ "peerid %*phN, ibdev %s, ibport %d\n",
+ SMC_LGR_ID_SIZE, &link->lgr->id,
+ SMC_LGR_ID_SIZE, &link->link_uid,
+ SMC_LGR_ID_SIZE, &link->peer_link_uid,
+ link->smcibdev->ibdev->name, link->ibport);
+ link->state = SMC_LNK_ACTIVE;
+ if (link->lgr->llc_testlink_time) {
+ link->llc_testlink_time = link->lgr->llc_testlink_time;
+ schedule_delayed_work(&link->llc_testlink_wrk,
+ link->llc_testlink_time);
+ }
+}
+
+/* called in worker context */
+void smc_llc_link_clear(struct smc_link *link, bool log)
+{
+ if (log)
+ pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN"
+ ", peerid %*phN, ibdev %s, ibport %d\n",
+ SMC_LGR_ID_SIZE, &link->lgr->id,
+ SMC_LGR_ID_SIZE, &link->link_uid,
+ SMC_LGR_ID_SIZE, &link->peer_link_uid,
+ link->smcibdev->ibdev->name, link->ibport);
+ complete(&link->llc_testlink_resp);
+ cancel_delayed_work_sync(&link->llc_testlink_wrk);
+}
+
+/* register a new rtoken at the remote peer (for all links) */
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_link_group *lgr = send_link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc = 0;
+
+ rc = smc_llc_send_confirm_rkey(send_link, rmb_desc);
+ if (rc)
+ goto out;
+ /* receive CONFIRM RKEY response from server over RoCE fabric */
+ qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_RKEY);
+ if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
+ rc = -EFAULT;
+out:
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return rc;
+}
+
+/* unregister an rtoken at the remote peer */
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_llc_qentry *qentry = NULL;
+ struct smc_link *send_link;
+ int rc = 0;
+
+ send_link = smc_llc_usable_link(lgr);
+ if (!send_link)
+ return -ENOLINK;
+
+ /* protected by llc_flow control */
+ rc = smc_llc_send_delete_rkey(send_link, rmb_desc);
+ if (rc)
+ goto out;
+ /* receive DELETE RKEY response from server over RoCE fabric */
+ qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_RKEY);
+ if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
+ rc = -EFAULT;
+out:
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return rc;
+}
+
+void smc_llc_link_set_uid(struct smc_link *link)
+{
+ __be32 link_uid;
+
+ link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id);
+ memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE);
+}
+
+/* save peers link user id, used for debug purposes */
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry)
+{
+ memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid,
+ SMC_LGR_ID_SIZE);
+}
+
+/* evaluate confirm link request or response */
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+ enum smc_llc_reqresp type)
+{
+ if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */
+ qentry->link->link_id = qentry->msg.confirm_link.link_num;
+ smc_llc_link_set_uid(qentry->link);
+ }
+ if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC))
+ return -ENOTSUPP;
+ return 0;
+}
+
+/***************************** init, exit, misc ******************************/
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_LINK
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_TEST_LINK
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_ADD_LINK
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_ADD_LINK_CONT
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_DELETE_LINK
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_RKEY
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_RKEY_CONT
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_DELETE_RKEY
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_llc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000..cc00a2ec4
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for LLC (link layer control) message handling
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
+#define SMC_LLC_WAIT_TIME (2 * HZ)
+
+enum smc_llc_reqresp {
+ SMC_LLC_REQ,
+ SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+ SMC_LLC_CONFIRM_LINK = 0x01,
+ SMC_LLC_ADD_LINK = 0x02,
+ SMC_LLC_ADD_LINK_CONT = 0x03,
+ SMC_LLC_DELETE_LINK = 0x04,
+ SMC_LLC_CONFIRM_RKEY = 0x06,
+ SMC_LLC_TEST_LINK = 0x07,
+ SMC_LLC_CONFIRM_RKEY_CONT = 0x08,
+ SMC_LLC_DELETE_RKEY = 0x09,
+};
+
+#define smc_link_downing(state) \
+ (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE)
+
+/* LLC DELETE LINK Request Reason Codes */
+#define SMC_LLC_DEL_LOST_PATH 0x00010000
+#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000
+#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000
+#define SMC_LLC_DEL_PROT_VIOL 0x00040000
+#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000
+/* LLC DELETE LINK Response Reason Codes */
+#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */
+#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */
+
+/* returns a usable link of the link group, or NULL */
+static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr)
+{
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (smc_link_usable(&lgr->lnk[i]))
+ return &lgr->lnk[i];
+ return NULL;
+}
+
+/* set the termination reason code for the link group */
+static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr,
+ u32 rsn)
+{
+ if (!lgr->llc_termination_rsn)
+ lgr->llc_termination_rsn = rsn;
+}
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+ struct smc_link *link_new,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+ enum smc_llc_reqresp reqresp, bool orderly,
+ u32 reason);
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id);
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc);
+void smc_llc_lgr_clear(struct smc_link_group *lgr);
+int smc_llc_link_init(struct smc_link *link);
+void smc_llc_link_active(struct smc_link *link);
+void smc_llc_link_clear(struct smc_link *link, bool log);
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
+ struct smc_buf_desc *rmb_desc);
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
+ struct smc_buf_desc *rmb_desc);
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+ enum smc_llc_flowtype type);
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow);
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+ enum smc_llc_reqresp type);
+void smc_llc_link_set_uid(struct smc_link *link);
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry);
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+ struct smc_link *lnk,
+ int time_out, u8 exp_msg);
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow);
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow);
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
+ u32 rsn);
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
+int smc_llc_srv_add_link(struct smc_link *link);
+void smc_llc_add_link_local(struct smc_link *link);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h
new file mode 100644
index 000000000..0f4f35aa4
--- /dev/null
+++ b/net/smc/smc_netns.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications
+ *
+ * Network namespace definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMC_NETNS_H
+#define SMC_NETNS_H
+
+#include "smc_pnet.h"
+
+extern unsigned int smc_net_id;
+
+/* per-network namespace private data */
+struct smc_net {
+ struct smc_pnettable pnettable;
+ struct smc_pnetids_ndev pnetids_ndev;
+};
+#endif
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000..30bae60d6
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,1174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to configure an SMC-R PNET table
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include <net/netns/generic.h>
+#include "smc_netns.h"
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_ism.h"
+#include "smc_core.h"
+
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev);
+
+static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+ [SMC_PNETID_NAME] = {
+ .type = NLA_NUL_STRING,
+ .len = SMC_MAX_PNETID_LEN
+ },
+ [SMC_PNETID_ETHNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1
+ },
+ [SMC_PNETID_IBNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX - 1
+ },
+ [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+enum smc_pnet_nametype {
+ SMC_PNET_ETH = 1,
+ SMC_PNET_IB = 2,
+};
+
+/* pnet entry stored in pnet table */
+struct smc_pnetentry {
+ struct list_head list;
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
+ enum smc_pnet_nametype type;
+ union {
+ struct {
+ char eth_name[IFNAMSIZ + 1];
+ struct net_device *ndev;
+ };
+ struct {
+ char ib_name[IB_DEVICE_NAME_MAX + 1];
+ u8 ib_port;
+ };
+ };
+};
+
+/* Check if the pnetid is set */
+bool smc_pnet_is_pnetid_set(u8 *pnetid)
+{
+ if (pnetid[0] == 0 || pnetid[0] == _S)
+ return false;
+ return true;
+}
+
+/* Check if two given pnetids match */
+static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
+{
+ int i;
+
+ for (i = 0; i < SMC_MAX_PNETID_LEN; i++) {
+ if ((pnetid1[i] == 0 || pnetid1[i] == _S) &&
+ (pnetid2[i] == 0 || pnetid2[i] == _S))
+ break;
+ if (pnetid1[i] != pnetid2[i])
+ return false;
+ }
+ return true;
+}
+
+/* Remove a pnetid from the pnet table.
+ */
+static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct smc_ib_device *ibdev;
+ struct smcd_dev *smcd_dev;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+ int ibport;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ /* remove table entry */
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist,
+ list) {
+ if (!pnet_name ||
+ smc_pnet_match(pnetelem->pnet_name, pnet_name)) {
+ list_del(&pnetelem->list);
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) {
+ dev_put(pnetelem->ndev);
+ pr_warn_ratelimited("smc: net device %s "
+ "erased user defined "
+ "pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ }
+ kfree(pnetelem);
+ rc = 0;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ /* if this is not the initial namespace, stop here */
+ if (net != &init_net)
+ return rc;
+
+ /* remove ib devices */
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) {
+ if (ibdev->pnetid_by_user[ibport] &&
+ (!pnet_name ||
+ smc_pnet_match(pnet_name,
+ ibdev->pnetid[ibport]))) {
+ pr_warn_ratelimited("smc: ib device %s ibport "
+ "%d erased user defined "
+ "pnetid %.16s\n",
+ ibdev->ibdev->name,
+ ibport + 1,
+ ibdev->pnetid[ibport]);
+ memset(ibdev->pnetid[ibport], 0,
+ SMC_MAX_PNETID_LEN);
+ ibdev->pnetid_by_user[ibport] = false;
+ rc = 0;
+ }
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+ /* remove smcd devices */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
+ if (smcd_dev->pnetid_by_user &&
+ (!pnet_name ||
+ smc_pnet_match(pnet_name, smcd_dev->pnetid))) {
+ pr_warn_ratelimited("smc: smcd device %s "
+ "erased user defined pnetid "
+ "%.16s\n", dev_name(&smcd_dev->dev),
+ smcd_dev->pnetid);
+ memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN);
+ smcd_dev->pnetid_by_user = false;
+ rc = 0;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ return rc;
+}
+
+/* Add the reference to a given network device to the pnet table.
+ */
+static int smc_pnet_add_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
+ !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
+ dev_hold(ndev);
+ pnetelem->ndev = ndev;
+ rc = 0;
+ pr_warn_ratelimited("smc: adding net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
+
+/* Remove the reference to a given network device from the pnet table.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
+ dev_put(pnetelem->ndev);
+ pnetelem->ndev = NULL;
+ rc = 0;
+ pr_warn_ratelimited("smc: removing net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
+
+/* Apply pnetid to ib device when no pnetid is set.
+ */
+static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
+ char *pnet_name)
+{
+ bool applied = false;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) {
+ memcpy(ib_dev->pnetid[ib_port - 1], pnet_name,
+ SMC_MAX_PNETID_LEN);
+ ib_dev->pnetid_by_user[ib_port - 1] = true;
+ applied = true;
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+ return applied;
+}
+
+/* Apply pnetid to smcd device when no pnetid is set.
+ */
+static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
+{
+ bool applied = false;
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) {
+ memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
+ smcd_dev->pnetid_by_user = true;
+ applied = true;
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ return applied;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+ char *bf = skip_spaces(pnet_name);
+ size_t len = strlen(bf);
+ char *end = bf + len;
+
+ if (!len)
+ return false;
+ while (--end >= bf && isspace(*end))
+ ;
+ if (end - bf >= SMC_MAX_PNETID_LEN)
+ return false;
+ while (bf <= end) {
+ if (!isalnum(*bf))
+ return false;
+ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+ bf++;
+ }
+ *pnetid = '\0';
+ return true;
+}
+
+/* Find an infiniband device by a given name. The device might not exist. */
+static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+ struct smc_ib_device *ibdev;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (!strncmp(ibdev->ibdev->name, ib_name,
+ sizeof(ibdev->ibdev->name)) ||
+ (ibdev->ibdev->dev.parent &&
+ !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
+ IB_DEVICE_NAME_MAX - 1))) {
+ goto out;
+ }
+ }
+ ibdev = NULL;
+out:
+ mutex_unlock(&smc_ib_devices.mutex);
+ return ibdev;
+}
+
+/* Find an smcd device by a given name. The device might not exist. */
+static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
+{
+ struct smcd_dev *smcd_dev;
+
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
+ if (!strncmp(dev_name(&smcd_dev->dev), smcd_name,
+ IB_DEVICE_NAME_MAX - 1))
+ goto out;
+ }
+ smcd_dev = NULL;
+out:
+ mutex_unlock(&smcd_dev_list.mutex);
+ return smcd_dev;
+}
+
+static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
+ char *eth_name, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct net_device *ndev, *base_ndev;
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ bool new_netdev;
+ int rc;
+
+ /* check if (base) netdev already has a pnetid. If there is one, we do
+ * not want to add a pnet table entry
+ */
+ rc = -EEXIST;
+ ndev = dev_get_by_name(net, eth_name); /* dev_hold() */
+ if (ndev) {
+ base_ndev = pnet_find_base_ndev(ndev);
+ if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
+ base_ndev->dev_port, ndev_pnetid))
+ goto out_put;
+ }
+
+ /* add a new netdev entry to the pnet table if there isn't one */
+ rc = -ENOMEM;
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ goto out_put;
+ new_pe->type = SMC_PNET_ETH;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
+ new_pe->ndev = ndev;
+
+ rc = -EEXIST;
+ new_netdev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_ETH &&
+ !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
+ new_netdev = false;
+ break;
+ }
+ }
+ if (new_netdev) {
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ goto out_put;
+ }
+ if (ndev)
+ pr_warn_ratelimited("smc: net device %s "
+ "applied user defined pnetid %.16s\n",
+ new_pe->eth_name, new_pe->pnet_name);
+ return 0;
+
+out_put:
+ if (ndev)
+ dev_put(ndev);
+ return rc;
+}
+
+static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
+ u8 ib_port, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct smc_ib_device *ib_dev;
+ bool smcddev_applied = true;
+ bool ibdev_applied = true;
+ struct smcd_dev *smcd_dev;
+ bool new_ibdev;
+
+ /* try to apply the pnetid to active devices */
+ ib_dev = smc_pnet_find_ib(ib_name);
+ if (ib_dev) {
+ ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name);
+ if (ibdev_applied)
+ pr_warn_ratelimited("smc: ib device %s ibport %d "
+ "applied user defined pnetid "
+ "%.16s\n", ib_dev->ibdev->name,
+ ib_port,
+ ib_dev->pnetid[ib_port - 1]);
+ }
+ smcd_dev = smc_pnet_find_smcd(ib_name);
+ if (smcd_dev) {
+ smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name);
+ if (smcddev_applied)
+ pr_warn_ratelimited("smc: smcd device %s "
+ "applied user defined pnetid "
+ "%.16s\n", dev_name(&smcd_dev->dev),
+ smcd_dev->pnetid);
+ }
+ /* Apply fails when a device has a hardware-defined pnetid set, do not
+ * add a pnet table entry in that case.
+ */
+ if (!ibdev_applied || !smcddev_applied)
+ return -EEXIST;
+
+ /* add a new ib entry to the pnet table if there isn't one */
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ return -ENOMEM;
+ new_pe->type = SMC_PNET_IB;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
+ new_pe->ib_port = ib_port;
+
+ new_ibdev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ new_ibdev = false;
+ break;
+ }
+ }
+ if (new_ibdev) {
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ }
+ return (new_ibdev) ? 0 : -EEXIST;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ */
+static int smc_pnet_enter(struct net *net, struct nlattr *tb[])
+{
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
+ struct smc_pnettable *pnettable;
+ bool new_netdev = false;
+ bool new_ibdev = false;
+ struct smc_net *sn;
+ u8 ibport = 1;
+ char *string;
+ int rc;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ rc = -EINVAL;
+ if (!tb[SMC_PNETID_NAME])
+ goto error;
+ string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+ if (!smc_pnetid_valid(string, pnet_name))
+ goto error;
+
+ if (tb[SMC_PNETID_ETHNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+ rc = smc_pnet_add_eth(pnettable, net, string, pnet_name);
+ if (!rc)
+ new_netdev = true;
+ else if (rc != -EEXIST)
+ goto error;
+ }
+
+ /* if this is not the initial namespace, stop here */
+ if (net != &init_net)
+ return new_netdev ? 0 : -EEXIST;
+
+ rc = -EINVAL;
+ if (tb[SMC_PNETID_IBNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+ string = strim(string);
+ if (tb[SMC_PNETID_IBPORT]) {
+ ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+ if (ibport < 1 || ibport > SMC_MAX_PORTS)
+ goto error;
+ }
+ rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name);
+ if (!rc)
+ new_ibdev = true;
+ else if (rc != -EEXIST)
+ goto error;
+ }
+ return (new_netdev || new_ibdev) ? 0 : -EEXIST;
+
+error:
+ return rc;
+}
+
+/* Convert an smc_pnetentry to a netlink attribute sequence */
+static int smc_pnet_set_nla(struct sk_buff *msg,
+ struct smc_pnetentry *pnetelem)
+{
+ if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name))
+ return -1;
+ if (pnetelem->type == SMC_PNET_ETH) {
+ if (nla_put_string(msg, SMC_PNETID_ETHNAME,
+ pnetelem->eth_name))
+ return -1;
+ } else {
+ if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a"))
+ return -1;
+ }
+ if (pnetelem->type == SMC_PNET_IB) {
+ if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) ||
+ nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
+ return -1;
+ } else {
+ if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") ||
+ nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff))
+ return -1;
+ }
+
+ return 0;
+}
+
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+
+ return smc_pnet_enter(net, info->attrs);
+}
+
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[SMC_PNETID_NAME])
+ return -EINVAL;
+ return smc_pnet_remove_by_pnetid(net,
+ (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+}
+
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+ cb->args[0] = 0;
+ return 0;
+}
+
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+ u32 portid, u32 seq, u32 flags,
+ struct smc_pnetentry *pnetelem)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+ flags, SMC_PNETID_GET);
+ if (!hdr)
+ return -ENOMEM;
+ if (smc_pnet_set_nla(skb, pnetelem) < 0) {
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+ }
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
+ u32 seq, u8 *pnetid, int start_idx)
+{
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *pnetelem;
+ struct smc_net *sn;
+ int idx = 0;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ /* dump pnettable entries */
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
+ if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid))
+ continue;
+ if (idx++ < start_idx)
+ continue;
+ /* if this is not the initial namespace, dump only netdev */
+ if (net != &init_net && pnetelem->type != SMC_PNET_ETH)
+ continue;
+ if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI,
+ pnetelem)) {
+ --idx;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return idx;
+}
+
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int idx;
+
+ idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NULL, cb->args[0]);
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+/* Retrieve one PNETID entry */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct sk_buff *msg;
+ void *hdr;
+
+ if (!info->attrs[SMC_PNETID_NAME])
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq,
+ nla_data(info->attrs[SMC_PNETID_NAME]), 0);
+
+ /* finish multi part message and send it */
+ hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0,
+ NLM_F_MULTI);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+ }
+ return genlmsg_reply(msg, info);
+}
+
+/* Remove and delete all pnetids from pnet table.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+
+ smc_pnet_remove_by_pnetid(net, NULL);
+ return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition */
+static const struct genl_ops smc_pnet_ops[] = {
+ {
+ .cmd = SMC_PNETID_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = smc_pnet_get,
+ .dumpit = smc_pnet_dump,
+ .start = smc_pnet_dump_start
+ },
+ {
+ .cmd = SMC_PNETID_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_add
+ },
+ {
+ .cmd = SMC_PNETID_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_del
+ },
+ {
+ .cmd = SMC_PNETID_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_pnet_flush
+ }
+};
+
+/* SMC_PNETID family definition */
+static struct genl_family smc_pnet_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SMCR_GENL_FAMILY_NAME,
+ .version = SMCR_GENL_FAMILY_VERSION,
+ .maxattr = SMC_PNETID_MAX,
+ .policy = smc_pnet_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_pnet_ops,
+ .n_ops = ARRAY_SIZE(smc_pnet_ops)
+};
+
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe;
+ bool rc = false;
+
+ read_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pe, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ rc = true;
+ goto unlock;
+ }
+ }
+
+unlock:
+ read_unlock(&sn->pnetids_ndev.lock);
+ return rc;
+}
+
+static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pi;
+
+ pe = kzalloc(sizeof(*pe), GFP_KERNEL);
+ if (!pe)
+ return -ENOMEM;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pi, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ refcount_inc(&pi->refcnt);
+ kfree(pe);
+ goto unlock;
+ }
+ }
+ refcount_set(&pe->refcnt, 1);
+ memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN);
+ list_add_tail(&pe->list, &sn->pnetids_ndev.list);
+
+unlock:
+ write_unlock(&sn->pnetids_ndev.lock);
+ return 0;
+}
+
+static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pe2;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ if (refcount_dec_and_test(&pe->refcnt)) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ break;
+ }
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
+ u8 *ndev_pnetid)
+{
+ struct net_device *base_dev;
+
+ base_dev = __pnet_find_base_ndev(dev);
+ if (base_dev->flags & IFF_UP &&
+ !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
+ ndev_pnetid)) {
+ /* add to PNETIDs list */
+ smc_pnet_add_pnetid(net, ndev_pnetid);
+ }
+}
+
+/* create initial list of netdevice pnetids */
+static void smc_pnet_create_pnetids_list(struct net *net)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev)
+ smc_pnet_add_base_pnetid(net, dev, ndev_pnetid);
+ rtnl_unlock();
+}
+
+/* clean up list of netdevice pnetids */
+static void smc_pnet_destroy_pnetids_list(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *temp_pe;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+static int smc_pnet_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(event_dev);
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ smc_pnet_remove_by_ndev(event_dev);
+ return NOTIFY_OK;
+ case NETDEV_REGISTER:
+ smc_pnet_add_by_ndev(event_dev);
+ return NOTIFY_OK;
+ case NETDEV_UP:
+ smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
+ return NOTIFY_OK;
+ case NETDEV_DOWN:
+ event_dev = __pnet_find_base_ndev(event_dev);
+ if (!smc_pnetid_by_dev_port(event_dev->dev.parent,
+ event_dev->dev_port, ndev_pnetid)) {
+ /* remove from PNETIDs list */
+ smc_pnet_remove_pnetid(net, ndev_pnetid);
+ }
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block smc_netdev_notifier = {
+ .notifier_call = smc_pnet_netdev_event
+};
+
+/* init network namespace */
+int smc_pnet_net_init(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnettable *pnettable = &sn->pnettable;
+ struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
+
+ INIT_LIST_HEAD(&pnettable->pnetlist);
+ mutex_init(&pnettable->lock);
+ INIT_LIST_HEAD(&pnetids_ndev->list);
+ rwlock_init(&pnetids_ndev->lock);
+
+ smc_pnet_create_pnetids_list(net);
+
+ return 0;
+}
+
+int __init smc_pnet_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&smc_pnet_nl_family);
+ if (rc)
+ return rc;
+ rc = register_netdevice_notifier(&smc_netdev_notifier);
+ if (rc)
+ genl_unregister_family(&smc_pnet_nl_family);
+
+ return rc;
+}
+
+/* exit network namespace */
+void smc_pnet_net_exit(struct net *net)
+{
+ /* flush pnet table */
+ smc_pnet_remove_by_pnetid(net, NULL);
+ smc_pnet_destroy_pnetids_list(net);
+}
+
+void smc_pnet_exit(void)
+{
+ unregister_netdevice_notifier(&smc_netdev_notifier);
+ genl_unregister_family(&smc_pnet_nl_family);
+}
+
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
+{
+ int i, nest_lvl;
+
+ ASSERT_RTNL();
+ nest_lvl = ndev->lower_level;
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next;
+ ndev = netdev_lower_get_next(ndev, &lower);
+ }
+ return ndev;
+}
+
+/* Determine one base device for stacked net devices.
+ * If the lower device level contains more than one devices
+ * (for instance with bonding slaves), just the first device
+ * is used to reach a base device.
+ */
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+{
+ rtnl_lock();
+ ndev = __pnet_find_base_ndev(ndev);
+ rtnl_unlock();
+ return ndev;
+}
+
+static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
+ u8 *pnetid)
+{
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_pnetentry *pnetelem;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) {
+ /* get pnetid of netdev device */
+ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
+
+/* find a roce device for the given pnetid */
+static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev)
+{
+ struct smc_ib_device *ibdev;
+ int i;
+
+ ini->ib_dev = NULL;
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (ibdev == known_dev)
+ continue;
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) &&
+ smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away) &&
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
+ ini->ib_gid, NULL)) {
+ ini->ib_dev = ibdev;
+ ini->ib_port = i;
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* find alternate roce device with same pnet_id and vlan_id */
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev)
+{
+ _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev);
+}
+
+/* if handshake network device belongs to a roce device, return its
+ * IB device and port
+ */
+static void smc_pnet_find_rdma_dev(struct net_device *netdev,
+ struct smc_init_info *ini)
+{
+ struct smc_ib_device *ibdev;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ struct net_device *ndev;
+ int i;
+
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (!ibdev->ibdev->ops.get_netdev)
+ continue;
+ ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i);
+ if (!ndev)
+ continue;
+ dev_put(ndev);
+ if (netdev == ndev &&
+ smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away) &&
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
+ ini->ib_gid, NULL)) {
+ ini->ib_dev = ibdev;
+ ini->ib_port = i;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* Determine the corresponding IB device port based on the hardware PNETID.
+ * Searching stops at the first matching active IB device port with vlan_id
+ * configured.
+ * If nothing found, check pnetid table.
+ * If nothing found, try to use handshake device
+ */
+static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
+ struct smc_init_info *ini)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+
+ ndev = pnet_find_base_ndev(ndev);
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid) &&
+ smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
+ smc_pnet_find_rdma_dev(ndev, ini);
+ return; /* pnetid could not be determined */
+ }
+ _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL);
+}
+
+static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
+ struct smc_init_info *ini)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct smcd_dev *ismdev;
+
+ ndev = pnet_find_base_ndev(ndev);
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid) &&
+ smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid))
+ return; /* pnetid could not be determined */
+
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
+ if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
+ !ismdev->going_away &&
+ (!ini->ism_peer_gid[0] ||
+ !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
+ ismdev))) {
+ ini->ism_dev[0] = ismdev;
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ ini->ib_dev = NULL;
+ ini->ib_port = 0;
+ if (!dst)
+ goto out;
+ if (!dst->dev)
+ goto out_rel;
+
+ smc_pnet_find_roce_by_pnetid(dst->dev, ini);
+
+out_rel:
+ dst_release(dst);
+out:
+ return;
+}
+
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ ini->ism_dev[0] = NULL;
+ if (!dst)
+ goto out;
+ if (!dst->dev)
+ goto out_rel;
+
+ smc_pnet_find_ism_by_pnetid(dst->dev, ini);
+
+out_rel:
+ dst_release(dst);
+out:
+ return;
+}
+
+/* Lookup and apply a pnet table entry to the given ib device.
+ */
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
+{
+ char *ib_name = smcibdev->ibdev->name;
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *tmp_pe;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for init namespace */
+ sn = net_generic(&init_net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) &&
+ tmp_pe->ib_port == ib_port) {
+ smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ return rc;
+}
+
+/* Lookup and apply a pnet table entry to the given smcd device.
+ */
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
+{
+ const char *ib_name = dev_name(&smcddev->dev);
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *tmp_pe;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for init namespace */
+ sn = net_generic(&init_net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ return rc;
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000..80a88eea4
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * PNET table queries
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+#include <net/smc.h>
+
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+#include <asm/pnet.h>
+#endif
+
+struct smc_ib_device;
+struct smcd_dev;
+struct smc_init_info;
+struct smc_link_group;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+struct smc_pnettable {
+ struct mutex lock;
+ struct list_head pnetlist;
+};
+
+struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/
+ struct list_head list;
+ rwlock_t lock;
+};
+
+struct smc_pnetids_ndev_entry {
+ struct list_head list;
+ u8 pnetid[SMC_MAX_PNETID_LEN];
+ refcount_t refcnt;
+};
+
+static inline int smc_pnetid_by_dev_port(struct device *dev,
+ unsigned short port, u8 *pnetid)
+{
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+ return pnet_id_by_dev_port(dev, port, pnetid);
+#else
+ return -ENOENT;
+#endif
+}
+
+int smc_pnet_init(void) __init;
+int smc_pnet_net_init(struct net *net);
+void smc_pnet_exit(void);
+void smc_pnet_net_exit(struct net *net);
+void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini);
+void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini);
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port);
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcd);
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev);
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid);
+bool smc_pnet_is_pnetid_set(u8 *pnetid);
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000..3757aff6c
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,444 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation to wakeup consumers blocked with smc_rx_wait().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_wake_up(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ /* derived from sock_def_readable() */
+ /* called already in smc_listen_work() */
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+ EPOLLRDNORM | EPOLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ rcu_read_unlock();
+}
+
+/* Update consumer cursor
+ * @conn connection to update
+ * @cons consumer cursor
+ * @len number of Bytes consumed
+ * Returns:
+ * 1 if we should end our receive, 0 otherwise
+ */
+static int smc_rx_update_consumer(struct smc_sock *smc,
+ union smc_host_cursor cons, size_t len)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ bool force = false;
+ int diff, rc = 0;
+
+ smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+ /* did we process urgent data? */
+ if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+ diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+ &conn->urg_curs);
+ if (sock_flag(sk, SOCK_URGINLINE)) {
+ if (diff == 0) {
+ force = true;
+ rc = 1;
+ conn->urg_state = SMC_URG_READ;
+ }
+ } else {
+ if (diff == 1) {
+ /* skip urgent byte */
+ force = true;
+ smc_curs_add(conn->rmb_desc->len, &cons, 1);
+ conn->urg_rx_skip_pend = false;
+ } else if (diff < -1)
+ /* we read past urgent byte */
+ conn->urg_state = SMC_URG_READ;
+ }
+ }
+
+ smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
+
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn, force);
+
+ return rc;
+}
+
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_rx_update_consumer(smc, cons, len);
+}
+
+struct smc_spd_priv {
+ struct smc_sock *smc;
+ size_t len;
+};
+
+static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
+ struct smc_sock *smc = priv->smc;
+ struct smc_connection *conn;
+ struct sock *sk = &smc->sk;
+
+ if (sk->sk_state == SMC_CLOSED ||
+ sk->sk_state == SMC_PEERFINCLOSEWAIT ||
+ sk->sk_state == SMC_APPFINCLOSEWAIT)
+ goto out;
+ conn = &smc->conn;
+ lock_sock(sk);
+ smc_rx_update_cons(smc, priv->len);
+ release_sock(sk);
+ if (atomic_sub_and_test(priv->len, &conn->splice_pending))
+ smc_rx_wake_up(sk);
+out:
+ kfree(priv);
+ put_page(buf->page);
+ sock_put(sk);
+}
+
+static const struct pipe_buf_operations smc_pipe_ops = {
+ .release = smc_rx_pipe_buf_release,
+ .get = generic_pipe_buf_get
+};
+
+static void smc_rx_spd_release(struct splice_pipe_desc *spd,
+ unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
+ struct smc_sock *smc)
+{
+ struct splice_pipe_desc spd;
+ struct partial_page partial;
+ struct smc_spd_priv *priv;
+ int bytes;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ priv->len = len;
+ priv->smc = smc;
+ partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+ partial.len = len;
+ partial.private = (unsigned long)priv;
+
+ spd.nr_pages_max = 1;
+ spd.nr_pages = 1;
+ spd.pages = &smc->conn.rmb_desc->pages;
+ spd.partial = &partial;
+ spd.ops = &smc_pipe_ops;
+ spd.spd_release = smc_rx_spd_release;
+
+ bytes = splice_to_pipe(pipe, &spd);
+ if (bytes > 0) {
+ sock_hold(&smc->sk);
+ get_page(smc->conn.rmb_desc->pages);
+ atomic_add(bytes, &smc->conn.splice_pending);
+ }
+
+ return bytes;
+}
+
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv) &&
+ !atomic_read(&conn->splice_pending);
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ * @smc smc socket
+ * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * @fcrit add'l criterion to evaluate as function pointer
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn))
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct smc_cdc_conn_state_flags *cflags =
+ &conn->local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+ int rc;
+
+ if (fcrit(conn))
+ return 1;
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ add_wait_queue(sk_sleep(sk), &wait);
+ rc = sk_wait_event(sk, timeo,
+ READ_ONCE(sk->sk_err) ||
+ cflags->peer_conn_abort ||
+ READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN ||
+ conn->killed ||
+ fcrit(conn),
+ &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ return rc;
+}
+
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+ int flags)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+ struct sock *sk = &smc->sk;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_URGINLINE) ||
+ !(conn->urg_state == SMC_URG_VALID) ||
+ conn->urg_state == SMC_URG_READ)
+ return -EINVAL;
+
+ if (conn->urg_state == SMC_URG_VALID) {
+ if (!(flags & MSG_PEEK))
+ smc->conn.urg_state = SMC_URG_READ;
+ msg->msg_flags |= MSG_OOB;
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+ len = 1;
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ if (smc_curs_diff(conn->rmb_desc->len, &cons,
+ &conn->urg_curs) > 1)
+ conn->urg_rx_skip_pend = true;
+ /* Urgent Byte was already accounted for, but trigger
+ * skipping the urgent byte in non-inline case
+ */
+ if (!(flags & MSG_PEEK))
+ smc_rx_update_consumer(smc, cons, 0);
+ } else {
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ return rc ? -EFAULT : len;
+ }
+
+ if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+
+ return -EAGAIN;
+}
+
+static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc_rx_data_available(conn))
+ return true;
+ else if (conn->urg_state == SMC_URG_VALID)
+ /* we received a single urgent Byte - skip */
+ smc_rx_update_cons(smc, 0);
+ return false;
+}
+
+/* smc_rx_recvmsg - receive data from RMBE
+ * @msg: copy data to receive buffer
+ * @pipe: copy data to pipe if set - indicates splice() call
+ *
+ * rcvbuf consumer: main API called by socket layer.
+ * Called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags)
+{
+ size_t copylen, read_done = 0, read_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ int (*func)(struct smc_connection *conn);
+ union smc_host_cursor cons;
+ int readable, chunk;
+ char *rcvbuf_base;
+ struct sock *sk;
+ int splbytes;
+ long timeo;
+ int target; /* Read at least these many bytes */
+ int rc;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+
+ sk = &smc->sk;
+ if (sk->sk_state == SMC_LISTEN)
+ return -ENOTCONN;
+ if (flags & MSG_OOB)
+ return smc_rx_recv_urg(smc, msg, len, flags);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
+
+ do { /* while (read_remaining) */
+ if (read_done >= target || (pipe && read_done))
+ break;
+
+ if (conn->killed)
+ break;
+
+ if (smc_rx_recvmsg_data_available(smc))
+ goto copy;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* smc_cdc_msg_recv_action() could have run after
+ * above smc_rx_recvmsg_data_available()
+ */
+ if (smc_rx_recvmsg_data_available(smc))
+ goto copy;
+ break;
+ }
+
+ if (read_done) {
+ if (sk->sk_err ||
+ sk->sk_state == SMC_CLOSED ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sk->sk_err) {
+ read_done = sock_error(sk);
+ break;
+ }
+ if (sk->sk_state == SMC_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ read_done = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (!timeo)
+ return -EAGAIN;
+ if (signal_pending(current)) {
+ read_done = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ if (!smc_rx_data_available(conn)) {
+ smc_rx_wait(smc, &timeo, smc_rx_data_available);
+ continue;
+ }
+
+copy:
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after waiting on data above */
+ readable = atomic_read(&conn->bytes_to_rcv);
+ splbytes = atomic_read(&conn->splice_pending);
+ if (!readable || (msg && splbytes)) {
+ if (splbytes)
+ func = smc_rx_data_available_and_no_splice_pend;
+ else
+ func = smc_rx_data_available;
+ smc_rx_wait(smc, &timeo, func);
+ continue;
+ }
+
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ /* subsequent splice() calls pick up where previous left */
+ if (splbytes)
+ smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+ if (conn->urg_state == SMC_URG_VALID &&
+ sock_flag(&smc->sk, SOCK_URGINLINE) &&
+ readable > 1)
+ readable--; /* always stop at urgent Byte */
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, read_remaining, readable);
+ /* determine chunks where to read from rcvbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
+ cons.count);
+ chunk_len_sum = chunk_len;
+ chunk_off = cons.count;
+ smc_rmb_sync_sg_for_cpu(conn);
+ for (chunk = 0; chunk < 2; chunk++) {
+ if (!(flags & MSG_TRUNC)) {
+ if (msg) {
+ rc = memcpy_to_msg(msg, rcvbuf_base +
+ chunk_off,
+ chunk_len);
+ } else {
+ rc = smc_rx_splice(pipe, rcvbuf_base +
+ chunk_off, chunk_len,
+ smc);
+ }
+ if (rc < 0) {
+ if (!read_done)
+ read_done = -EFAULT;
+ smc_rmb_sync_sg_for_device(conn);
+ goto out;
+ }
+ }
+ read_remaining -= chunk_len;
+ read_done += chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in recv ring buffer */
+ }
+ smc_rmb_sync_sg_for_device(conn);
+
+ /* update cursors */
+ if (!(flags & MSG_PEEK)) {
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
+ smp_mb__after_atomic();
+ if (msg && smc_rx_update_consumer(smc, cons, copylen))
+ goto out;
+ }
+ } while (read_remaining);
+out:
+ return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_data_ready = smc_rx_wake_up;
+ atomic_set(&smc->conn.splice_pending, 0);
+ smc->conn.urg_state = SMC_URG_READ;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000..db823c97d
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+ struct pipe_inode_info *pipe, size_t len, int flags);
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+ int (*fcrit)(struct smc_connection *conn));
+static inline int smc_rx_data_available(struct smc_connection *conn)
+{
+ return atomic_read(&conn->bytes_to_rcv);
+}
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000..2429f9fc7
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/sched/signal.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+#include "smc_ism.h"
+#include "smc_tx.h"
+
+#define SMC_TX_WORK_DELAY 0
+#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
+ * called under sk_socket lock.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct smc_sock *smc = smc_sk(sk);
+ struct socket_wq *wq;
+
+ /* similar to sk_stream_write_space */
+ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait,
+ EPOLLOUT | EPOLLWRNORM |
+ EPOLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+ if (smc->sk.sk_socket &&
+ test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+ smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ long timeo;
+ int rc = 0;
+
+ /* similar to sk_stream_wait_memory */
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->killed ||
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+ rc = -EPIPE;
+ break;
+ }
+ if (smc_cdc_rxed_any_close(conn)) {
+ rc = -ECONNRESET;
+ break;
+ }
+ if (!timeo) {
+ /* ensure EPOLLOUT is subsequently generated */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ rc = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+ break; /* at least 1 byte of free & no urgent data */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk_wait_event(sk, &timeo,
+ READ_ONCE(sk->sk_err) ||
+ (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
+ smc_cdc_rxed_any_close(conn) ||
+ (atomic_read(&conn->sndbuf_space) &&
+ !conn->urg_tx_pend),
+ &wait);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+ struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+ return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+ size_t copylen, send_done = 0, send_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor prep;
+ struct sock *sk = &smc->sk;
+ char *sndbuf_base;
+ int tx_cnt_prep;
+ int writespace;
+ int rc, chunk;
+
+ /* This should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ rc = -EPIPE;
+ goto out_err;
+ }
+
+ while (msg_data_left(msg)) {
+ if (sk->sk_state == SMC_INIT)
+ return -ENOTCONN;
+ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+ (smc->sk.sk_err == ECONNABORTED) ||
+ conn->killed)
+ return -EPIPE;
+ if (smc_cdc_rxed_any_close(conn))
+ return send_done ?: -ECONNRESET;
+
+ if (msg->msg_flags & MSG_OOB)
+ conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+ if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+ if (send_done)
+ return send_done;
+ rc = smc_tx_wait(smc, msg->msg_flags);
+ if (rc)
+ goto out_err;
+ continue;
+ }
+
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_tx_wait above */
+ writespace = atomic_read(&conn->sndbuf_space);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, send_remaining, writespace);
+ /* determine start of sndbuf */
+ sndbuf_base = conn->sndbuf_desc->cpu_addr;
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
+ tx_cnt_prep = prep.count;
+ /* determine chunks where to write into sndbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
+ tx_cnt_prep);
+ chunk_len_sum = chunk_len;
+ chunk_off = tx_cnt_prep;
+ smc_sndbuf_sync_sg_for_cpu(conn);
+ for (chunk = 0; chunk < 2; chunk++) {
+ rc = memcpy_from_msg(sndbuf_base + chunk_off,
+ msg, chunk_len);
+ if (rc) {
+ smc_sndbuf_sync_sg_for_device(conn);
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ send_done += chunk_len;
+ send_remaining -= chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in send ring buffer */
+ }
+ smc_sndbuf_sync_sg_for_device(conn);
+ /* update cursors */
+ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
+ smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
+ /* increased in send tasklet smc_cdc_tx_handler() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ /* since we just produced more new data into sndbuf,
+ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+ */
+ if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+ conn->urg_tx_pend = true;
+ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+ (atomic_read(&conn->sndbuf_space) >
+ (conn->sndbuf_desc->len >> 1)))
+ /* for a corked socket defer the RDMA writes if there
+ * is still sufficient sndbuf_space available
+ */
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_CORK_DELAY);
+ else
+ smc_tx_sndbuf_nonempty(conn);
+ } /* while (msg_data_left(msg)) */
+
+ return send_done;
+
+out_err:
+ rc = sk_stream_error(sk, msg->msg_flags, rc);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(rc == -EAGAIN))
+ sk->sk_write_space(sk);
+ return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal)
+{
+ struct smc_ism_position pos;
+ int rc;
+
+ memset(&pos, 0, sizeof(pos));
+ pos.token = conn->peer_token;
+ pos.index = conn->peer_rmbe_idx;
+ pos.offset = conn->tx_off + offset;
+ pos.signal = signal;
+ rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
+ if (rc)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ return rc;
+}
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+ int num_sges, struct ib_rdma_wr *rdma_wr)
+{
+ struct smc_link_group *lgr = conn->lgr;
+ struct smc_link *link = conn->lnk;
+ int rc;
+
+ rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+ rdma_wr->wr.num_sge = num_sges;
+ rdma_wr->remote_addr =
+ lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
+ /* RMBE within RMB */
+ conn->tx_off +
+ /* offset within RMBE */
+ peer_rmbe_offset;
+ rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
+ rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
+ if (rc)
+ smcr_link_down_cond_sched(link);
+ return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+ union smc_host_cursor *prod,
+ union smc_host_cursor *sent,
+ size_t len)
+{
+ smc_curs_add(conn->peer_rmbe_size, prod, len);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ /* data in flight reduces usable snd_wnd */
+ atomic_sub(len, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_add(conn->sndbuf_desc->len, sent, len);
+}
+
+/* SMC-R helper for smc_tx_rdma_writes() */
+static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len,
+ struct smc_rdma_wr *wr_rdma_buf)
+{
+ struct smc_link *link = conn->lnk;
+
+ dma_addr_t dma_addr =
+ sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int sent_count = src_off;
+ int srcchunk, dstchunk;
+ int num_sges;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ struct ib_sge *sge =
+ wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
+
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sge[srcchunk].addr = dma_addr + src_off;
+ sge[srcchunk].length = src_len;
+ num_sges++;
+
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges,
+ &wr_rdma_buf->wr_tx_rdma[dstchunk]);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
+ sent_count);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* SMC-D helper for smc_tx_rdma_writes() */
+static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int srcchunk, dstchunk;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ void *data = conn->sndbuf_desc->cpu_addr + src_off;
+
+ rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
+ sizeof(struct smcd_cdc_msg), 0);
+ if (rc)
+ return rc;
+ dst_off += src_len;
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn,
+ struct smc_rdma_wr *wr_rdma_buf)
+{
+ size_t len, src_len, dst_off, dst_len; /* current chunk values */
+ union smc_host_cursor sent, prep, prod, cons;
+ struct smc_cdc_producer_flags *pflags;
+ int to_send, rmbespace;
+ int rc;
+
+ /* source: sndbuf */
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
+ /* cf. wmem_alloc - (snd_max - snd_una) */
+ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
+ if (to_send <= 0)
+ return 0;
+
+ /* destination: RMBE */
+ /* cf. snd_wnd */
+ rmbespace = atomic_read(&conn->peer_rmbe_space);
+ if (rmbespace <= 0)
+ return 0;
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+
+ /* if usable snd_wnd closes ask peer to advertise once it opens again */
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ pflags->write_blocked = (to_send >= rmbespace);
+ /* cf. usable snd_wnd */
+ len = min(to_send, rmbespace);
+
+ /* initialize variables for first iteration of subsequent nested loop */
+ dst_off = prod.count;
+ if (prod.wrap == cons.wrap) {
+ /* the filled destination area is unwrapped,
+ * hence the available free destination space is wrapped
+ * and we need 2 destination chunks of sum len; start with 1st
+ * which is limited by what's available in sndbuf
+ */
+ dst_len = min_t(size_t,
+ conn->peer_rmbe_size - prod.count, len);
+ } else {
+ /* the filled destination area is wrapped,
+ * hence the available free destination space is unwrapped
+ * and we need a single destination chunk of entire len
+ */
+ dst_len = len;
+ }
+ /* dst_len determines the maximum src_len */
+ if (sent.count + dst_len <= conn->sndbuf_desc->len) {
+ /* unwrapped src case: single chunk of entire dst_len */
+ src_len = dst_len;
+ } else {
+ /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+ src_len = conn->sndbuf_desc->len - sent.count;
+ }
+
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ else
+ rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len, wr_rdma_buf);
+ if (rc)
+ return rc;
+
+ if (conn->urg_tx_pend && len == to_send)
+ pflags->urg_data_present = 1;
+ smc_tx_advance_cursors(conn, &prod, &sent, len);
+ /* update connection's cursors with advanced local cursors */
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
+ /* dst: peer RMBE */
+ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
+
+ return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+ struct smc_link *link = conn->lnk;
+ struct smc_rdma_wr *wr_rdma_buf;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ if (!link || !smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
+ if (rc < 0) {
+ smc_wr_tx_link_put(link);
+ if (rc == -EBUSY) {
+ struct smc_sock *smc =
+ container_of(conn, struct smc_sock, conn);
+
+ if (smc->sk.sk_err == ECONNABORTED)
+ return sock_error(&smc->sk);
+ if (conn->killed)
+ return -EPIPE;
+ rc = 0;
+ mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
+ }
+ return rc;
+ }
+
+ spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, tx_work will restart */
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ rc = -ENOLINK;
+ goto out_unlock;
+ }
+ if (!pflags->urg_data_present) {
+ rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
+ if (rc) {
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
+ }
+
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
+
+out_unlock:
+ spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+ int rc = 0;
+
+ spin_lock_bh(&conn->send_lock);
+ if (!pflags->urg_data_present)
+ rc = smc_tx_rdma_writes(conn, NULL);
+ if (!rc)
+ rc = smcd_cdc_msg_send(conn);
+
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
+ spin_unlock_bh(&conn->send_lock);
+ return rc;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ if (conn->killed ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return -EPIPE; /* connection being aborted */
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_sndbuf_nonempty(conn);
+ else
+ rc = smcr_tx_sndbuf_nonempty(conn);
+
+ if (!rc) {
+ /* trigger socket release if connection is closing */
+ struct smc_sock *smc = container_of(conn, struct smc_sock,
+ conn);
+ smc_close_wake_tx_prepared(smc);
+ }
+ return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ int rc;
+
+ lock_sock(&smc->sk);
+ if (smc->sk.sk_err)
+ goto out;
+
+ rc = smc_tx_sndbuf_nonempty(conn);
+ if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+
+out:
+ release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
+{
+ union smc_host_cursor cfed, cons, prod;
+ int sender_free = conn->rmb_desc->len;
+ int to_confirm;
+
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
+ to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
+ if (to_confirm > conn->rmbe_update_limit) {
+ smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
+ sender_free = conn->rmb_desc->len -
+ smc_curs_diff_large(conn->rmb_desc->len,
+ &cfed, &prod);
+ }
+
+ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ force ||
+ ((to_confirm > conn->rmbe_update_limit) &&
+ ((sender_free <= (conn->rmb_desc->len / 2)) ||
+ conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ if (conn->killed ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return;
+ if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
+ !conn->killed) {
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
+ return;
+ }
+ }
+ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space = smc_tx_write_space;
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000..07e6ad762
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+ union smc_host_cursor sent, prep;
+
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
+ return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
+}
+
+void smc_tx_work(struct work_struct *work);
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000..5a81f8c9e
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend { /* control data for a pending send request */
+ u64 wr_id; /* work request id sent */
+ smc_wr_tx_handler handler;
+ enum ib_wc_status wc_status; /* CQE status */
+ struct smc_link *link;
+ u32 idx;
+ struct smc_wr_tx_pend_priv priv;
+ u8 compl_requested;
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+/* returns true if at least one tx work request is pending on the given link */
+static inline bool smc_wr_is_tx_pend(struct smc_link *link)
+{
+ if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
+ link->wr_tx_cnt) {
+ return true;
+ }
+ return false;
+}
+
+/* wait till all pending tx work requests on the given link are completed */
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
+{
+ wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
+}
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+ u32 i;
+
+ for (i = 0; i < link->wr_tx_cnt; i++) {
+ if (link->wr_tx_pends[i].wr_id == wr_id)
+ return i;
+ }
+ return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+ struct smc_wr_tx_pend pnd_snd;
+ struct smc_link *link;
+ u32 pnd_snd_idx;
+
+ link = wc->qp->qp_context;
+
+ if (wc->opcode == IB_WC_REG_MR) {
+ if (wc->status)
+ link->wr_reg_state = FAILED;
+ else
+ link->wr_reg_state = CONFIRMED;
+ smc_wr_wakeup_reg_wait(link);
+ return;
+ }
+
+ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+ if (pnd_snd_idx == link->wr_tx_cnt)
+ return;
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+ complete(&link->wr_tx_compl[pnd_snd_idx]);
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_pends[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ if (wc->status) {
+ /* terminate link */
+ smcr_link_down_cond_sched(link);
+ }
+ if (pnd_snd.handler)
+ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+ wake_up(&link->wr_tx_wait);
+}
+
+static void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int i = 0, rc;
+ int polled = 0;
+
+again:
+ polled++;
+ do {
+ memset(&wc, 0, sizeof(wc));
+ rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_send,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ for (i = 0; i < rc; i++)
+ smc_wr_tx_process_cqe(&wc[i]);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+ *idx = link->wr_tx_cnt;
+ if (!smc_link_sendable(link))
+ return -ENOLINK;
+ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+ if (!test_and_set_bit(*idx, link->wr_tx_mask))
+ return 0;
+ }
+ *idx = link->wr_tx_cnt;
+ return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ * and sets info for pending transmit tracking
+ * @link: Pointer to smc_link used to later send the message.
+ * @handler: Send completion handler function pointer.
+ * @wr_buf: Out value returns pointer to message buffer.
+ * @wr_rdma_buf: Out value returns pointer to rdma work request.
+ * @wr_pend_priv: Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wr_rdma_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_wr_tx_pend *wr_pend;
+ u32 idx = link->wr_tx_cnt;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+ int rc;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ if (in_softirq() || lgr->terminating) {
+ rc = smc_wr_tx_get_free_slot_index(link, &idx);
+ if (rc)
+ return rc;
+ } else {
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ !smc_link_sendable(link) ||
+ lgr->terminating ||
+ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+ if (!rc) {
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
+ return -EPIPE;
+ }
+ if (idx == link->wr_tx_cnt)
+ return -EPIPE;
+ }
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = &link->wr_tx_pends[idx];
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = idx;
+ wr_ib = &link->wr_tx_ibs[idx];
+ wr_ib->wr_id = wr_id;
+ *wr_buf = &link->wr_tx_bufs[idx];
+ if (wr_rdma_buf)
+ *wr_rdma_buf = &link->wr_tx_rdmas[idx];
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+ struct smc_wr_tx_pend *pend;
+
+ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+ if (pend->idx < link->wr_tx_cnt) {
+ u32 idx = pend->idx;
+
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[idx], 0,
+ sizeof(link->wr_tx_pends[idx]));
+ memset(&link->wr_tx_bufs[idx], 0,
+ sizeof(link->wr_tx_bufs[idx]));
+ test_and_clear_bit(idx, link->wr_tx_mask);
+ wake_up(&link->wr_tx_wait);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+ struct smc_wr_tx_pend *pend;
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
+ if (rc) {
+ smc_wr_tx_put_slot(link, priv);
+ smcr_link_down_cond_sched(link);
+ }
+ return rc;
+}
+
+/* Send prepared WR slot via ib_post_send and wait for send completion
+ * notification.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ unsigned long timeout)
+{
+ struct smc_wr_tx_pend *pend;
+ u32 pnd_idx;
+ int rc;
+
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ pend->compl_requested = 1;
+ pnd_idx = pend->idx;
+ init_completion(&link->wr_tx_compl[pnd_idx]);
+
+ rc = smc_wr_tx_send(link, priv);
+ if (rc)
+ return rc;
+ /* wait for completion by smc_wr_tx_process_cqe() */
+ rc = wait_for_completion_interruptible_timeout(
+ &link->wr_tx_compl[pnd_idx], timeout);
+ if (rc <= 0)
+ rc = -ENODATA;
+ if (rc > 0)
+ rc = 0;
+ return rc;
+}
+
+/* Register a memory region and wait for result. */
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
+{
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ link->wr_reg_state = POSTED;
+ link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
+ link->wr_reg.mr = mr;
+ link->wr_reg.key = mr->rkey;
+ rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
+ if (rc)
+ return rc;
+
+ atomic_inc(&link->wr_reg_refcnt);
+ rc = wait_event_interruptible_timeout(link->wr_reg_wait,
+ (link->wr_reg_state != POSTED),
+ SMC_WR_REG_MR_WAIT_TIME);
+ if (atomic_dec_and_test(&link->wr_reg_refcnt))
+ wake_up_all(&link->wr_reg_wait);
+ if (!rc) {
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
+ return -EPIPE;
+ }
+ if (rc == -ERESTARTSYS)
+ return -EINTR;
+ switch (link->wr_reg_state) {
+ case CONFIRMED:
+ rc = 0;
+ break;
+ case FAILED:
+ rc = -EIO;
+ break;
+ case POSTED:
+ rc = -EPIPE;
+ break;
+ }
+ return rc;
+}
+
+/****************************** receive queue ********************************/
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+ struct smc_wr_rx_handler *h_iter;
+ int rc = 0;
+
+ spin_lock(&smc_wr_rx_hash_lock);
+ hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+ if (h_iter->type == handler->type) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+ spin_unlock(&smc_wr_rx_hash_lock);
+ return rc;
+}
+
+/* Demultiplex a received work request based on the message type to its handler.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_wr_rx_handler *handler;
+ struct smc_wr_rx_hdr *wr_rx;
+ u64 temp_wr_id;
+ u32 index;
+
+ if (wc->byte_len < sizeof(*wr_rx))
+ return; /* short message */
+ temp_wr_id = wc->wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+ hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+ if (handler->type == wr_rx->type)
+ handler->handler(wc, wr_rx);
+ }
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+ struct smc_link *link;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ link = wc[i].qp->qp_context;
+ if (wc[i].status == IB_WC_SUCCESS) {
+ link->wr_rx_tstamp = jiffies;
+ smc_wr_rx_demultiplex(&wc[i]);
+ smc_wr_rx_post(link); /* refill WR RX */
+ } else {
+ /* handle status errors */
+ switch (wc[i].status) {
+ case IB_WC_RETRY_EXC_ERR:
+ case IB_WC_RNR_RETRY_EXC_ERR:
+ case IB_WC_WR_FLUSH_ERR:
+ smcr_link_down_cond_sched(link);
+ break;
+ default:
+ smc_wr_rx_post(link); /* refill WR RX */
+ break;
+ }
+ }
+ }
+}
+
+static void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int polled = 0;
+ int rc;
+
+again:
+ polled++;
+ do {
+ memset(&wc, 0, sizeof(wc));
+ rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK
+ | IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ smc_wr_rx_process_cqes(&wc[0], rc);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+ u32 i;
+ int rc = 0;
+
+ for (i = 0; i < link->wr_rx_cnt; i++)
+ rc = smc_wr_rx_post(link);
+ return rc;
+}
+
+/***************************** init, exit, misc ******************************/
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+ struct ib_qp_attr *attr = &lnk->qp_attr;
+ struct ib_qp_init_attr init_attr;
+
+ memset(attr, 0, sizeof(*attr));
+ memset(&init_attr, 0, sizeof(init_attr));
+ ib_query_qp(lnk->roce_qp, attr,
+ IB_QP_STATE |
+ IB_QP_CUR_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_RQ_PSN |
+ IB_QP_ALT_PATH |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_SQ_PSN |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_CAP |
+ IB_QP_DEST_QPN,
+ &init_attr);
+
+ lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+ lnk->qp_attr.cap.max_send_wr);
+ lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+ lnk->qp_attr.cap.max_recv_wr);
+}
+
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+ u32 i;
+
+ for (i = 0; i < lnk->wr_tx_cnt; i++) {
+ lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+ lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+ lnk->wr_tx_ibs[i].send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
+ }
+ for (i = 0; i < lnk->wr_rx_cnt; i++) {
+ lnk->wr_rx_sges[i].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+ lnk->wr_rx_ibs[i].num_sge = 1;
+ }
+ lnk->wr_reg.wr.next = NULL;
+ lnk->wr_reg.wr.num_sge = 0;
+ lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
+ lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
+ lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev;
+
+ if (!lnk->smcibdev)
+ return;
+ ibdev = lnk->smcibdev->ibdev;
+
+ smc_wr_wakeup_reg_wait(lnk);
+ smc_wr_wakeup_tx_wait(lnk);
+
+ smc_wr_tx_wait_no_pending_sends(lnk);
+ wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
+ wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
+
+ if (lnk->wr_rx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+ }
+ if (lnk->wr_tx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_dma_addr = 0;
+ }
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+ kfree(lnk->wr_tx_compl);
+ lnk->wr_tx_compl = NULL;
+ kfree(lnk->wr_tx_pends);
+ lnk->wr_tx_pends = NULL;
+ kfree(lnk->wr_tx_mask);
+ lnk->wr_tx_mask = NULL;
+ kfree(lnk->wr_tx_sges);
+ lnk->wr_tx_sges = NULL;
+ kfree(lnk->wr_tx_rdma_sges);
+ lnk->wr_tx_rdma_sges = NULL;
+ kfree(lnk->wr_rx_sges);
+ lnk->wr_rx_sges = NULL;
+ kfree(lnk->wr_tx_rdmas);
+ lnk->wr_tx_rdmas = NULL;
+ kfree(lnk->wr_rx_ibs);
+ lnk->wr_rx_ibs = NULL;
+ kfree(lnk->wr_tx_ibs);
+ lnk->wr_tx_ibs = NULL;
+ kfree(lnk->wr_tx_bufs);
+ lnk->wr_tx_bufs = NULL;
+ kfree(lnk->wr_rx_bufs);
+ lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+ /* allocate link related memory */
+ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+ if (!link->wr_tx_bufs)
+ goto no_mem;
+ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+ GFP_KERNEL);
+ if (!link->wr_rx_bufs)
+ goto no_mem_wr_tx_bufs;
+ link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_ibs)
+ goto no_mem_wr_rx_bufs;
+ link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_ibs)
+ goto no_mem_wr_tx_ibs;
+ link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_rdmas[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_rdmas)
+ goto no_mem_wr_rx_ibs;
+ link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_rdma_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_rdma_sges)
+ goto no_mem_wr_tx_rdmas;
+ link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_sges)
+ goto no_mem_wr_tx_rdma_sges;
+ link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_sges)
+ goto no_mem_wr_tx_sges;
+ link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT),
+ sizeof(*link->wr_tx_mask),
+ GFP_KERNEL);
+ if (!link->wr_tx_mask)
+ goto no_mem_wr_rx_sges;
+ link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_pends[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_pends)
+ goto no_mem_wr_tx_mask;
+ link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_compl[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_compl)
+ goto no_mem_wr_tx_pends;
+ return 0;
+
+no_mem_wr_tx_pends:
+ kfree(link->wr_tx_pends);
+no_mem_wr_tx_mask:
+ kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+ kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+ kfree(link->wr_tx_sges);
+no_mem_wr_tx_rdma_sges:
+ kfree(link->wr_tx_rdma_sges);
+no_mem_wr_tx_rdmas:
+ kfree(link->wr_tx_rdmas);
+no_mem_wr_rx_ibs:
+ kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+ kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+ kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+ kfree(link->wr_tx_bufs);
+no_mem:
+ return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_kill(&smcibdev->recv_tasklet);
+ tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+ (unsigned long)smcibdev);
+ tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+ (unsigned long)smcibdev);
+}
+
+int smc_wr_create_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev = lnk->smcibdev->ibdev;
+ int rc = 0;
+
+ smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+ lnk->wr_rx_id = 0;
+ lnk->wr_rx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+ lnk->wr_rx_dma_addr = 0;
+ rc = -EIO;
+ goto out;
+ }
+ lnk->wr_tx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ smc_wr_init_sge(lnk);
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ init_waitqueue_head(&lnk->wr_tx_wait);
+ atomic_set(&lnk->wr_tx_refcnt, 0);
+ init_waitqueue_head(&lnk->wr_reg_wait);
+ atomic_set(&lnk->wr_reg_refcnt, 0);
+ return rc;
+
+dma_unmap:
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+out:
+ return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000..cb58e6007
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+ struct smc_link *,
+ enum ib_wc_status);
+
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+ unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
+struct smc_wr_rx_handler {
+ struct hlist_node list; /* hash table collision resolution */
+ void (*handler)(struct ib_wc *, void *);
+ u8 type;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+ return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+ atomic_long_set(wr_tx_id, val);
+}
+
+static inline bool smc_wr_tx_link_hold(struct smc_link *link)
+{
+ if (!smc_link_sendable(link))
+ return false;
+ atomic_inc(&link->wr_tx_refcnt);
+ return true;
+}
+
+static inline void smc_wr_tx_link_put(struct smc_link *link)
+{
+ if (atomic_dec_and_test(&link->wr_tx_refcnt))
+ wake_up_all(&link->wr_tx_wait);
+}
+
+static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
+{
+ wake_up_all(&lnk->wr_tx_wait);
+}
+
+static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
+{
+ wake_up(&lnk->wr_reg_wait);
+}
+
+/* post a new receive work request to fill a completed old work request entry */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+ int rc;
+ u64 wr_id, temp_wr_id;
+ u32 index;
+
+ wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+ temp_wr_id = wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ link->wr_rx_ibs[index].wr_id = wr_id;
+ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
+ return rc;
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_rdma_wr **wrs,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ unsigned long timeout);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter,
+ smc_wr_tx_dismisser dismisser,
+ unsigned long data);
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
+
+#endif /* SMC_WR_H */