author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /net/smc |
parent | Initial commit. (diff) |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz, linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/smc')
36 files changed, 19700 insertions, 0 deletions
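
The Kconfig help text below describes SMC as a separate AF_SMC socket family that transparently upgrades AF_INET TCP connections using RoCE, and af_smc.c shows the family handler distinguishing SMCPROTO_SMC (IPv4) from SMCPROTO_SMC6 (IPv6). For orientation only, here is a minimal hypothetical userspace sketch (not part of this commit) of opening such a socket; the AF_SMC and SMCPROTO_* values are assumed from the kernel sources and defined locally because libc headers may not provide them.

```c
/* Hypothetical example, not part of the commit below: open an AF_SMC
 * stream socket. Constants mirror the kernel definitions and are
 * defined locally since they may be missing from libc headers.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43		/* reuses the AF_INET address format */
#endif
#define SMCPROTO_SMC	0	/* SMC protocol, IPv4 */
#define SMCPROTO_SMC6	1	/* SMC protocol, IPv6 */

int main(void)
{
	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);

	if (fd < 0) {
		perror("socket(AF_SMC)");	/* e.g. CONFIG_SMC not enabled */
		return 1;
	}
	/* from here on the fd is used like a TCP socket:
	 * connect()/bind()/listen()/accept(), send(), recv(), ...
	 * falling back to TCP if the peer is not SMC-capable.
	 */
	close(fd);
	return 0;
}
```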
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 0000000000..746be39967
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config SMC
+	tristate "SMC socket protocol family"
+	depends on INET && INFINIBAND
+	depends on m || ISM != m
+	help
+	  SMC-R provides a "sockets over RDMA" solution making use of
+	  RDMA over Converged Ethernet (RoCE) technology to upgrade
+	  AF_INET TCP connections transparently.
+	  The Linux implementation of the SMC-R solution is designed as
+	  a separate socket family SMC.
+
+	  Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+	tristate "SMC: socket monitoring interface"
+	depends on SMC
+	help
+	  Support for SMC socket monitoring interface used by tools such as
+	  smcss.
+
+	  if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 0000000000..875efcd126
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y += -I$(src)
+obj-$(CONFIG_SMC)	+= smc.o
+obj-$(CONFIG_SMC_DIAG)	+= smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
+smc-y += smc_tracepoint.o
+smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 0000000000..ef5b5d498e
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,3606 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ *   - support for alternate links postponed
+ *
+ * Copyright IBM Corp.
2016, 2018 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + * based on prototype from Frank Blaschka + */ + +#define KMSG_COMPONENT "smc" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/workqueue.h> +#include <linux/in.h> +#include <linux/sched/signal.h> +#include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> +#include <linux/ctype.h> +#include <linux/splice.h> + +#include <net/sock.h> +#include <net/tcp.h> +#include <net/smc.h> +#include <asm/ioctls.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include "smc_netns.h" + +#include "smc.h" +#include "smc_clc.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_ism.h" +#include "smc_pnet.h" +#include "smc_netlink.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" +#include "smc_sysctl.h" + +static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group + * creation on server + */ +static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group + * creation on client + */ + +static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ +struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +struct workqueue_struct *smc_close_wq; /* wq for close work */ + +static void smc_tcp_listen_work(struct work_struct *); +static void smc_connect_work(struct work_struct *); + +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + void *hdr; + + if (cb_ctx->pos[0]) + goto out; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_HS_LIMITATION); + if (!hdr) + return -ENOMEM; + + if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, + sock_net(skb->sk)->smc.limit_smc_hs)) + goto err; + + genlmsg_end(skb, hdr); + cb_ctx->pos[0] = 1; +out: + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = true; + return 0; +} + +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = false; + return 0; +} + +static void smc_set_keepalive(struct sock *sk, int val) +{ + struct smc_sock *smc = smc_sk(sk); + + smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); +} + +static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct smc_sock *smc; + struct sock *child; + + smc = smc_clcsock_user_data(sk); + + if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > + sk->sk_max_ack_backlog) + goto drop; + + if (sk_acceptq_is_full(&smc->sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + /* passthrough to original syn recv sock fct */ + child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, + own_req); + /* child must not inherit smc or its ops */ + if (child) { + rcu_assign_sk_user_data(child, NULL); + + /* v4-mapped sockets don't inherit parent ops. Don't restore. 
*/ + if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) + inet_csk(child)->icsk_af_ops = smc->ori_af_ops; + } + return child; + +drop: + dst_release(dst); + tcp_listendrop(sk); + return NULL; +} + +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +static struct smc_hashinfo smc_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + write_unlock_bh(&h->lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +/* This will be called before user really release sock_lock. So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + +struct proto smc_proto = { + .name = "SMC", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto); + +struct proto smc_proto6 = { + .name = "SMC6", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto6); + +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_restore_fallback_changes(struct smc_sock *smc) +{ + if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); + } +} + +static int __smc_release(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + int rc = 0; + + if (!smc->use_fallback) { + rc = smc_close_active(smc); + smc_sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } else { + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc 
= kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } + smc_restore_fallback_changes(smc); + } + + sk->sk_prot->unhash(sk); + + if (sk->sk_state == SMC_CLOSED) { + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + if (!smc->use_fallback) + smc_conn_free(&smc->conn); + } + + return rc; +} + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int old_state, rc = 0; + + if (!sk) + goto out; + + sock_hold(sk); /* sock_put below */ + smc = smc_sk(sk); + + old_state = sk->sk_state; + + /* cleanup for a dangling non-blocking connect */ + if (smc->connect_nonblock && old_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + + if (cancel_work_sync(&smc->connect_work)) + sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ + + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + + rc = __smc_release(smc); + + /* detach socket */ + sock_orphan(sk); + sock->sk = NULL; + release_sock(sk); + + sock_put(sk); /* sock_hold above */ + sock_put(sk); /* final sock_put */ +out: + return rc; +} + +static void smc_destruct(struct sock *sk) +{ + if (sk->sk_state != SMC_CLOSED) + return; + if (!sock_flag(sk, SOCK_DEAD)) + return; +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) +{ + struct smc_sock *smc; + struct proto *prot; + struct sock *sk; + + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + sk->sk_state = SMC_INIT; + sk->sk_destruct = smc_destruct; + sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); + smc = smc_sk(sk); + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_WORK(&smc->connect_work, smc_connect_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_LIST_HEAD(&smc->accept_q); + spin_lock_init(&smc->accept_q_lock); + spin_lock_init(&smc->conn.send_lock); + sk->sk_prot->hash(sk); + mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); + + return sk; +} + +static int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + + /* replicate tests from inet_bind(), to be safe wrt. 
future changes */ + rc = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + rc = -EAFNOSUPPORT; + if (addr->sin_family != AF_INET && + addr->sin_family != AF_INET6 && + addr->sin_family != AF_UNSPEC) + goto out; + /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ + if (addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr != htonl(INADDR_ANY)) + goto out; + + lock_sock(sk); + + /* Check if socket is already active */ + rc = -EINVAL; + if (sk->sk_state != SMC_INIT || smc->connect_nonblock) + goto out_rel; + + smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; + rc = kernel_bind(smc->clcsock, uaddr, addr_len); + +out_rel: + release_sock(sk); +out: + return rc; +} + +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED) | \ + (1UL << SOCK_TSTAMP_NEW)) + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + struct net *nnet = sock_net(nsk); + + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + nsk->sk_sndbuf = osk->sk_sndbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_sndbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); + else + WRITE_ONCE(nsk->sk_sndbuf, + 2 * READ_ONCE(nnet->smc.sysctl_wmem)); + } + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + nsk->sk_rcvbuf = osk->sk_rcvbuf; + } else { + if (mask == SK_FLAGS_SMC_TO_CLC) + WRITE_ONCE(nsk->sk_rcvbuf, + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); + else + WRITE_ONCE(nsk->sk_rcvbuf, + 2 * READ_ONCE(nnet->smc.sysctl_rmem)); + } +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = READ_ONCE(osk->sk_mark); + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); +} + +static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) +{ + smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); +} + +#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_DBG)) +/* copy only settings and flags relevant for smc from clc to smc socket */ +static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) +{ + smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); +} + +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return 
-EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + down_write(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; + } + up_write(&lgr->llc_conf_mutex); + return rc; +} + +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_link_group *lgr = link->lgr; + bool do_slow = false; + int i, rc = 0; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + + down_read(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + up_read(&lgr->llc_conf_mutex); + goto slow_path; + } + } + /* mr register already */ + goto fast_path; +slow_path: + do_slow = true; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_buf() + */ + down_write(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); + if (rc) + goto out; + } +fast_path: + /* exchange confirm_rkey msg with peer */ + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; + } + rmb_desc->is_conf_rkey = true; +out: + do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; +} + +static int smcr_clnt_conf_first_link(struct smc_sock *smc) +{ + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; + int rc; + + /* Receive CONFIRM LINK request from server over RoCE fabric. + * Increasing the client's timeout by twice as much as the server's + * timeout by default can temporarily avoid decline messages of + * both sides crossing or colliding + */ + qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_CL : rc; + } + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) + return SMC_CLC_DECL_RMBE_EC; + + rc = smc_ib_modify_qp_rts(link); + if (rc) + return SMC_CLC_DECL_ERR_RDYLNK; + + smc_wr_remember_qp_attr(link); + + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); + if (rc < 0) + return SMC_CLC_DECL_TIMEOUT_CL; + + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + if (link->lgr->max_links > 1) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); + } + return 0; +} + +static bool smc_isascii(char *hostname) +{ + int i; + + for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) + if (!isascii(hostname[i])) + return false; + return true; +} + +static void smc_conn_save_peer_info_fce(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)clc; + struct smc_clc_first_contact_ext *fce; + int clc_v2_len; + + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return; + + if (smc->conn.lgr->is_smcd) { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + d1); + } else { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + r1); + } + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + smc->conn.lgr->peer_os = fce->os_type; + smc->conn.lgr->peer_smc_release = fce->release; + if (smc_isascii(fce->hostname)) + memcpy(smc->conn.lgr->peer_hostname, fce->hostname, + SMC_MAX_HOSTNAME_LEN); +} + +static void smcr_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); + + smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); + smc->conn.peer_rmbe_size = bufsize; + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); +} + +static void smcd_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); + + smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; + smc->conn.peer_token = ntohll(clc->d0.token); + /* msg header takes up space in the buffer */ + smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * 
smc->conn.peer_rmbe_idx; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + if (smc->conn.lgr->is_smcd) + smcd_conn_save_peer_info(smc, clc); + else + smcr_conn_save_peer_info(smc, clc); + smc_conn_save_peer_info_fce(smc, clc); +} + +static void smc_link_save_peer_info(struct smc_link *link, + struct smc_clc_msg_accept_confirm *clc, + struct smc_init_info *ini) +{ + link->peer_qpn = ntoh24(clc->r0.qpn); + memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->r0.psn); + link->peer_mtu = clc->r0.qp_mtu; +} + +static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, + struct smc_stats_fback *fback_arr) +{ + int cnt; + + for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { + if (fback_arr[cnt].fback_code == smc->fallback_rsn) { + fback_arr[cnt].count++; + break; + } + if (!fback_arr[cnt].fback_code) { + fback_arr[cnt].fback_code = smc->fallback_rsn; + fback_arr[cnt].count++; + break; + } + } +} + +static void smc_stat_fallback(struct smc_sock *smc) +{ + struct net *net = sock_net(&smc->sk); + + mutex_lock(&net->smc.mutex_fback_rsn); + if (smc->listen_smc) { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); + net->smc.fback_rsn->srv_fback_cnt++; + } else { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); + net->smc.fback_rsn->clnt_fback_cnt++; + } + mutex_unlock(&net->smc.mutex_fback_rsn); +} + +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void 
smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + +static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +{ + int rc = 0; + + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + rc = -EBADF; + goto out; + } + + smc->use_fallback = true; + smc->fallback_rsn = reason_code; + smc_stat_fallback(smc); + trace_smc_switch_to_fallback(smc, reason_code); + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; + smc->clcsock->wq.fasync_list = + smc->sk.sk_socket->wq.fasync_list; + + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. 
+ */ + smc_fback_replace_callbacks(smc); + } +out: + mutex_unlock(&smc->clcsock_release_lock); + return rc; +} + +/* fall back during connect */ +static int smc_connect_fallback(struct smc_sock *smc, int reason_code) +{ + struct net *net = sock_net(&smc->sk); + int rc = 0; + + rc = smc_switch_to_fallback(smc, reason_code); + if (rc) { /* fallback fails */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } + smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + return 0; +} + +/* decline and fall back during connect */ +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, + u8 version) +{ + struct net *net = sock_net(&smc->sk); + int rc; + + if (reason_code < 0) { /* error, fallback is not possible */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return reason_code; + } + if (reason_code != SMC_CLC_DECL_PEERDECL) { + rc = smc_clc_send_decline(smc, reason_code, version); + if (rc < 0) { + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } + } + return smc_connect_fallback(smc, reason_code); +} + +static void smc_conn_abort(struct smc_sock *smc, int local_first) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + bool lgr_valid = false; + + if (smc_conn_lgr_valid(conn)) + lgr_valid = true; + + smc_conn_free(conn); + if (local_first && lgr_valid) + smc_lgr_cleanup_early(lgr); +} + +/* check if there is a rdma device available for this connection. */ +/* called for connect and listen */ +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) +{ + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + * used for the internal TCP socket + */ + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->check_smcrv2 && !ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; +} + +/* check if there is an ISM device available for this connection. */ +/* called for connect and listen */ +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) +{ + /* Find ISM device with same PNETID as connecting interface */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev[0]) + return SMC_CLC_DECL_NOSMCDDEV; + else + ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); + return 0; +} + +/* is chid unique for the ism devices that are already determined? */ +static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, + int cnt) +{ + int i = (!ini->ism_dev[0]) ? 
1 : 0; + + for (; i < cnt; i++) + if (ini->ism_chid[i] == chid) + return false; + return true; +} + +/* determine possible V2 ISM devices (either without PNETID or with PNETID plus + * PNETID matching net_device) + */ +static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = SMC_CLC_DECL_NOSMCDDEV; + struct smcd_dev *smcd; + int i = 1; + u16 chid; + + if (smcd_indicated(ini->smc_type_v1)) + rc = 0; /* already initialized for V1 */ + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away || smcd == ini->ism_dev[0]) + continue; + chid = smc_ism_get_chid(smcd); + if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) + continue; + if (!smc_pnet_is_pnetid_set(smcd->pnetid) || + smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { + ini->ism_dev[i] = smcd; + ini->ism_chid[i] = chid; + ini->is_smcd = true; + rc = 0; + i++; + if (i > SMC_MAX_ISM_DEVS) + break; + } + } + mutex_unlock(&smcd_dev_list.mutex); + ini->ism_offered_cnt = i - 1; + if (!ini->ism_dev[0] && !ini->ism_dev[1]) + ini->smcd_version = 0; + + return rc; +} + +/* Check for VLAN ID and register it on ISM device just for CLC handshake */ +static int smc_connect_ism_vlan_setup(struct smc_sock *smc, + struct smc_init_info *ini) +{ + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; + return 0; +} + +static int smc_find_proposal_devices(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = 0; + + /* check if there is an ism device available */ + if (!(ini->smcd_version & SMC_V1) || + smc_find_ism_device(smc, ini) || + smc_connect_ism_vlan_setup(smc, ini)) + ini->smcd_version &= ~SMC_V1; + /* else ISM V1 is supported for this connection */ + + /* check if there is an rdma device available */ + if (!(ini->smcr_version & SMC_V1) || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V1; + /* else RDMA is supported for this connection */ + + ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, + ini->smcr_version & SMC_V1); + + /* check if there is an ism v2 device available */ + if (!(ini->smcd_version & SMC_V2) || + !smc_ism_is_v2_capable() || + smc_find_ism_v2_device_clnt(smc, ini)) + ini->smcd_version &= ~SMC_V2; + + /* check if there is an rdma v2 device available */ + ini->check_smcrv2 = true; + ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; + if (!(ini->smcr_version & SMC_V2) || + smc->clcsock->sk->sk_family != AF_INET || + !smc_clc_ueid_count() || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; + + ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, + ini->smcr_version & SMC_V2); + + /* if neither ISM nor RDMA are supported, fallback */ + if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) + rc = SMC_CLC_DECL_NOSMCDEV; + + return rc; +} + +/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is + * used, the VLAN ID will be registered again during the connection setup. 
+ */ +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, + struct smc_init_info *ini) +{ + if (!smcd_indicated(ini->smc_type_v1)) + return 0; + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + +#define SMC_CLC_MAX_ACCEPT_LEN \ + (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ + sizeof(struct smc_clc_first_contact_ext_v2x) + \ + sizeof(struct smc_clc_msg_trail)) + +/* CLC handshake during connect */ +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm_v2 *aclc2, + struct smc_init_info *ini) +{ + int rc = 0; + + /* do inband token exchange */ + rc = smc_clc_send_proposal(smc, ini); + if (rc) + return rc; + /* receive SMC Accept CLC message */ + return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); +} + +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid) +{ + struct smc_init_info *alt_ini = NULL; + + memset(gidlist, 0, sizeof(*gidlist)); + memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); + + alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); + if (!alt_ini) + goto out; + + alt_ini->vlan_id = lgr->vlan_id; + alt_ini->check_smcrv2 = true; + alt_ini->smcrv2.saddr = lgr->saddr; + smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); + + if (!alt_ini->smcrv2.ib_dev_v2) + goto out; + + memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, + SMC_GID_SIZE); + +out: + kfree(alt_ini); +} + +static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(clc_v2, false); + struct net *net = sock_net(&smc->sk); + int rc; + + if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) + return 0; + + if (fce->v2_direct) { + memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); + ini->smcrv2.uses_gateway = false; + } else { + if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, + smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), + ini->smcrv2.nexthop_mac, + &ini->smcrv2.uses_gateway)) + return SMC_CLC_DECL_NOROUTE; + if (!ini->smcrv2.uses_gateway) { + /* mismatch: peer claims indirect, but its direct */ + return SMC_CLC_DECL_NOINDIRECT; + } + } + + ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + + return 0; +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + int i, reason_code = 0; + struct smc_link *link; + u8 *eid = NULL; + + ini->is_smcd = false; + ini->ib_clcqpn = ntoh24(aclc->r0.qpn); + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; + + reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); + if (reason_code) + return reason_code; + + mutex_lock(&smc_client_lgr_pending); + reason_code = smc_conn_create(smc, ini); + if (reason_code) { + mutex_unlock(&smc_client_lgr_pending); + return reason_code; + } + + smc_conn_save_peer_info(smc, aclc); + + 
if (ini->first_contact_local) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->r0.qpn) && + !memcmp(l->peer_gid, &aclc->r0.lcl.gid, + SMC_GID_SIZE) && + (aclc->hdr.version > SMC_V1 || + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac)))) { + link = l; + break; + } + } + if (!link) { + reason_code = SMC_CLC_DECL_NOSRVLINK; + goto connect_abort; + } + smc_switch_link_and_count(&smc->conn, link); + } + + /* create send buffer and rmb */ + if (smc_buf_create(smc, false)) { + reason_code = SMC_CLC_DECL_MEM; + goto connect_abort; + } + + if (ini->first_contact_local) + smc_link_save_peer_info(link, aclc, ini); + + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { + reason_code = SMC_CLC_DECL_ERR_RTOK; + goto connect_abort; + } + + smc_close_init(smc); + smc_rx_init(smc); + + if (ini->first_contact_local) { + if (smc_ib_ready_link(link)) { + reason_code = SMC_CLC_DECL_ERR_RDYLNK; + goto connect_abort; + } + } else { + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } + + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->r1.eid; + if (ini->first_contact_local) + smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, + link->smcibdev, link->gid); + } + + reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, + aclc->hdr.version, eid, ini); + if (reason_code) + goto connect_abort; + + smc_tx_init(smc); + + if (ini->first_contact_local) { + /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); + if (reason_code) + goto connect_abort; + } + mutex_unlock(&smc_client_lgr_pending); + + smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return 0; +connect_abort: + smc_conn_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_client_lgr_pending); + smc->connect_nonblock = 0; + + return reason_code; +} + +/* The server has chosen one of the proposed ISM devices for the communication. + * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. 
+ */ +static int +smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, + struct smc_init_info *ini) +{ + int i; + + for (i = 0; i < ini->ism_offered_cnt + 1; i++) { + if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + ini->ism_selected = i; + return 0; + } + } + + return -EPROTO; +} + +/* setup for ISM connection of client */ +static int smc_connect_ism(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + u8 *eid = NULL; + int rc = 0; + + ini->is_smcd = true; + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + + if (aclc->hdr.version == SMC_V2) { + struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + if (ini->first_contact_peer) { + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(aclc_v2, true); + + ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + } + + rc = smc_v2_determine_accepted_chid(aclc_v2, ini); + if (rc) + return rc; + } + ini->ism_peer_gid[ini->ism_selected] = ntohll(aclc->d0.gid); + + /* there is only one lgr role for SMC-D; use server lock */ + mutex_lock(&smc_server_lgr_pending); + rc = smc_conn_create(smc, ini); + if (rc) { + mutex_unlock(&smc_server_lgr_pending); + return rc; + } + + /* Create send and receive buffers */ + rc = smc_buf_create(smc, true); + if (rc) { + rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; + goto connect_abort; + } + + smc_conn_save_peer_info(smc, aclc); + smc_close_init(smc); + smc_rx_init(smc); + smc_tx_init(smc); + + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->d1.eid; + } + + rc = smc_clc_send_confirm(smc, ini->first_contact_local, + aclc->hdr.version, eid, ini); + if (rc) + goto connect_abort; + mutex_unlock(&smc_server_lgr_pending); + + smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return 0; +connect_abort: + smc_conn_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_server_lgr_pending); + smc->connect_nonblock = 0; + + return rc; +} + +/* check if received accept type and version matches a proposed one */ +static int smc_connect_check_aclc(struct smc_init_info *ini, + struct smc_clc_msg_accept_confirm *aclc) +{ + if (aclc->hdr.typev1 != SMC_TYPE_R && + aclc->hdr.typev1 != SMC_TYPE_D) + return SMC_CLC_DECL_MODEUNSUPP; + + if (aclc->hdr.version >= SMC_V2) { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v2)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v2))) + return SMC_CLC_DECL_MODEUNSUPP; + } else { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v1)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v1))) + return SMC_CLC_DECL_MODEUNSUPP; + } + + return 0; +} + +/* perform steps before actually connecting */ +static int __smc_connect(struct smc_sock *smc) +{ + u8 version = smc_ism_is_v2_capable() ? 
SMC_V2 : SMC_V1; + struct smc_clc_msg_accept_confirm_v2 *aclc2; + struct smc_clc_msg_accept_confirm *aclc; + struct smc_init_info *ini = NULL; + u8 *buf = NULL; + int rc = 0; + + if (smc->use_fallback) + return smc_connect_fallback(smc, smc->fallback_rsn); + + /* if peer has not signalled SMC-capability, fall back */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) + return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); + + /* IPSec connections opt out of SMC optimizations */ + if (using_ipsec(smc)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, + version); + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, + version); + + ini->smcd_version = SMC_V1 | SMC_V2; + ini->smcr_version = SMC_V1 | SMC_V2; + ini->smc_type_v1 = SMC_TYPE_B; + ini->smc_type_v2 = SMC_TYPE_B; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { + ini->smcd_version &= ~SMC_V1; + ini->smcr_version = 0; + ini->smc_type_v1 = SMC_TYPE_N; + if (!ini->smcd_version) { + rc = SMC_CLC_DECL_GETVLANERR; + goto fallback; + } + } + + rc = smc_find_proposal_devices(smc, ini); + if (rc) + goto fallback; + + buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto fallback; + } + aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf; + aclc = (struct smc_clc_msg_accept_confirm *)aclc2; + + /* perform CLC handshake */ + rc = smc_connect_clc(smc, aclc2, ini); + if (rc) { + /* -EAGAIN on timeout, see tcp_recvmsg() */ + if (rc == -EAGAIN) { + rc = -ETIMEDOUT; + smc->sk.sk_err = ETIMEDOUT; + } + goto vlan_cleanup; + } + + /* check if smc modes and versions of CLC proposal and accept match */ + rc = smc_connect_check_aclc(ini, aclc); + version = aclc->hdr.version == SMC_V1 ? 
SMC_V1 : SMC_V2; + if (rc) + goto vlan_cleanup; + + /* depending on previous steps, connect using rdma or ism */ + if (aclc->hdr.typev1 == SMC_TYPE_R) { + ini->smcr_version = version; + rc = smc_connect_rdma(smc, aclc, ini); + } else if (aclc->hdr.typev1 == SMC_TYPE_D) { + ini->smcd_version = version; + rc = smc_connect_ism(smc, aclc, ini); + } + if (rc) + goto vlan_cleanup; + + SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); + kfree(ini); + return 0; + +vlan_cleanup: + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); +fallback: + kfree(ini); + return smc_connect_decline_fallback(smc, rc, version); +} + +static void smc_connect_work(struct work_struct *work) +{ + struct smc_sock *smc = container_of(work, struct smc_sock, + connect_work); + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; + + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); + if (smc->clcsock->sk->sk_err) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; + } + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ + goto out; + } + + rc = __smc_connect(smc); + if (rc < 0) + smc->sk.sk_err = -rc; + +out: + if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (smc->sk.sk_err) { + smc->sk.sk_state_change(&smc->sk); + } else { /* allow polling before and after fallback decision */ + smc->clcsock->sk->sk_write_space(smc->clcsock->sk); + smc->sk.sk_write_space(&smc->sk); + } + } + release_sock(&smc->sk); +} + +static int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + + smc = smc_sk(sk); + + /* separate smc parameter checking to be safe */ + if (alen < sizeof(addr->sa_family)) + goto out_err; + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) + goto out_err; + + lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + + switch (sk->sk_state) { + default: + goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; + case SMC_ACTIVE: + rc = -EISCONN; + goto out; + case SMC_INIT: + break; + } + + smc_copy_sock_settings_to_clc(smc); + tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; + + if (smc->use_fallback) { + sock->state = rc ? 
SS_CONNECTING : SS_CONNECTED; + goto out; + } + sock_hold(&smc->sk); /* sock put in passive closing */ + if (flags & O_NONBLOCK) { + if (queue_work(smc_hs_wq, &smc->connect_work)) + smc->connect_nonblock = 1; + rc = -EINPROGRESS; + goto out; + } else { + rc = __smc_connect(smc); + if (rc < 0) + goto out; + } + +connected: + rc = 0; + sock->state = SS_CONNECTED; +out: + release_sock(sk); +out_err: + return rc; +} + +static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) +{ + struct socket *new_clcsock = NULL; + struct sock *lsk = &lsmc->sk; + struct sock *new_sk; + int rc = -EINVAL; + + release_sock(lsk); + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); + if (!new_sk) { + rc = -ENOMEM; + lsk->sk_err = ENOMEM; + *new_smc = NULL; + lock_sock(lsk); + goto out; + } + *new_smc = smc_sk(new_sk); + + mutex_lock(&lsmc->clcsock_release_lock); + if (lsmc->clcsock) + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); + mutex_unlock(&lsmc->clcsock_release_lock); + lock_sock(lsk); + if (rc < 0 && rc != -EAGAIN) + lsk->sk_err = -rc; + if (rc < 0 || lsk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; + smc_sock_set_flag(new_sk, SOCK_DEAD); + sock_put(new_sk); /* final */ + *new_smc = NULL; + goto out; + } + + /* new clcsock has inherited the smc listen-specific sk_data_ready + * function; switch it back to the original sk_data_ready function + */ + new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. + */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + + (*new_smc)->clcsock = new_clcsock; +out: + return rc; +} + +/* add a just created sock to the accept queue of the listen sock as + * candidate for a following socket accept call from user space + */ +static void smc_accept_enqueue(struct sock *parent, struct sock *sk) +{ + struct smc_sock *par = smc_sk(parent); + + sock_hold(sk); /* sock_put in smc_accept_unlink () */ + spin_lock(&par->accept_q_lock); + list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); +} + +/* remove a socket from the accept queue of its parental listening socket */ +static void smc_accept_unlink(struct sock *sk) +{ + struct smc_sock *par = smc_sk(sk)->listen_smc; + + spin_lock(&par->accept_q_lock); + list_del_init(&smc_sk(sk)->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + sock_put(sk); /* sock_hold in smc_accept_enqueue */ +} + +/* remove a sock from the accept queue to bind it to a new socket created + * for a socket accept call from user space + */ +struct sock *smc_accept_dequeue(struct sock *parent, + struct socket *new_sock) +{ + struct smc_sock *isk, *n; + struct sock *new_sk; + + list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { + new_sk = (struct sock *)isk; + + smc_accept_unlink(new_sk); + if (new_sk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); + if (isk->clcsock) { + sock_release(isk->clcsock); + isk->clcsock = NULL; + } + sock_put(new_sk); /* final */ + continue; + } + if 
(new_sock) { + sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; + if (isk->use_fallback) { + smc_sk(new_sk)->clcsock->file = new_sock->file; + isk->clcsock->file->private_data = isk->clcsock; + } + } + return new_sk; + } + return NULL; +} + +/* clean up for a created but never accepted sock */ +void smc_close_non_accepted(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + sock_hold(sk); /* sock_put below */ + lock_sock(sk); + if (!sk->sk_lingertime) + /* wait for peer closing */ + WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); + __smc_release(smc); + release_sock(sk); + sock_put(sk); /* sock_hold above */ + sock_put(sk); /* final sock_put */ +} + +static int smcr_serv_conf_first_link(struct smc_sock *smc) +{ + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; + int rc; + + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + + /* send CONFIRM LINK request to client over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); + if (rc < 0) + return SMC_CLC_DECL_TIMEOUT_CL; + + /* receive CONFIRM LINK response from client over the RoCE fabric */ + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; + } + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) + return SMC_CLC_DECL_RMBE_EC; + + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + if (link->lgr->max_links > 1) { + down_write(&link->lgr->llc_conf_mutex); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + up_write(&link->lgr->llc_conf_mutex); + } + return 0; +} + +/* listen worker: finish */ +static void smc_listen_out(struct smc_sock *new_smc) +{ + struct smc_sock *lsmc = new_smc->listen_smc; + struct sock *newsmcsk = &new_smc->sk; + + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_dec(&lsmc->queued_smc_hs); + + if (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + smc_accept_enqueue(&lsmc->sk, newsmcsk); + release_sock(&lsmc->sk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); + } + + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ +} + +/* listen worker: finish in state connected */ +static void smc_listen_out_connected(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; + + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; + + smc_listen_out(new_smc); +} + +/* listen worker: finish in error state */ +static void smc_listen_out_err(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; + struct net *net = sock_net(newsmcsk); + + this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ + newsmcsk->sk_state = SMC_CLOSED; + + smc_listen_out(new_smc); +} + +/* listen worker: decline and 
fall back if possible */ +static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, + int local_first, u8 version) +{ + /* RDMA setup failed, switch back to TCP */ + smc_conn_abort(new_smc, local_first); + if (reason_code < 0 || + smc_switch_to_fallback(new_smc, reason_code)) { + /* error, no fallback possible */ + smc_listen_out_err(new_smc); + return; + } + if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { + if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { + smc_listen_out_err(new_smc); + return; + } + } + smc_listen_out_connected(new_smc); +} + +/* listen worker: version checking */ +static int smc_listen_v2_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; + struct smc_clc_v2_extension *pclc_v2_ext; + int rc = SMC_CLC_DECL_PEERNOSMC; + + ini->smc_type_v1 = pclc->hdr.typev1; + ini->smc_type_v2 = pclc->hdr.typev2; + ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + if (pclc->hdr.version > SMC_V1) { + if (smcd_indicated(ini->smc_type_v2)) + ini->smcd_version |= SMC_V2; + if (smcr_indicated(ini->smc_type_v2)) + ini->smcr_version |= SMC_V2; + } + if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { + rc = SMC_CLC_DECL_PEERNOSMC; + goto out; + } + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) { + ini->smcd_version &= ~SMC_V2; + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2EXT; + goto out; + } + pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); + if (ini->smcd_version & SMC_V2) { + if (!smc_ism_is_v2_capable()) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; + } else if (!pclc_smcd_v2_ext) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } else if (!pclc_v2_ext->hdr.eid_cnt && + !pclc_v2_ext->hdr.flag.seid) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + if (ini->smcr_version & SMC_V2) { + if (!pclc_v2_ext->hdr.eid_cnt) { + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + + ini->release_nr = pclc_v2_ext->hdr.flag.release; + if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) + ini->release_nr = SMC_RELEASE; + +out: + if (!ini->smcd_version && !ini->smcr_version) + return rc; + + return 0; +} + +/* listen worker: check prefixes */ +static int smc_listen_prfx_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct socket *newclcsock = new_smc->clcsock; + + if (pclc->hdr.typev1 == SMC_TYPE_N) + return 0; + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (smc_clc_prfx_match(newclcsock, pclc_prfx)) + return SMC_CLC_DECL_DIFFPREFIX; + + return 0; +} + +/* listen worker: initialize connection and buffers */ +static int smc_listen_rdma_init(struct smc_sock *new_smc, + struct smc_init_info *ini) +{ + int rc; + + /* allocate connection / link group */ + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; + + /* create send buffer and rmb */ + if (smc_buf_create(new_smc, false)) { + smc_conn_abort(new_smc, ini->first_contact_local); + return SMC_CLC_DECL_MEM; + } + + return 0; +} + +/* listen worker: initialize connection and buffers for SMC-D */ +static int smc_listen_ism_init(struct smc_sock *new_smc, + struct smc_init_info *ini) +{ + int rc; + + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; + + /* Create send and receive buffers */ + rc = 
smc_buf_create(new_smc, true); + if (rc) { + smc_conn_abort(new_smc, ini->first_contact_local); + return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM; + } + + return 0; +} + +static bool smc_is_already_selected(struct smcd_dev *smcd, + struct smc_init_info *ini, + int matches) +{ + int i; + + for (i = 0; i < matches; i++) + if (smcd == ini->ism_dev[i]) + return true; + + return false; +} + +/* check for ISM devices matching proposed ISM devices */ +static void smc_check_ism_v2_match(struct smc_init_info *ini, + u16 proposed_chid, u64 proposed_gid, + unsigned int *matches) +{ + struct smcd_dev *smcd; + + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away) + continue; + if (smc_is_already_selected(smcd, ini, *matches)) + continue; + if (smc_ism_get_chid(smcd) == proposed_chid && + !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { + ini->ism_peer_gid[*matches] = proposed_gid; + ini->ism_dev[*matches] = smcd; + (*matches)++; + break; + } + } +} + +static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +{ + if (!ini->rc) + ini->rc = rc; +} + +static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_v2_extension *smc_v2_ext; + struct smc_clc_msg_smcd *pclc_smcd; + unsigned int matches = 0; + u8 smcd_version; + u8 *eid = NULL; + int i, rc; + + if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) + goto not_found; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + smc_v2_ext = smc_get_clc_v2_ext(pclc); + smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + + mutex_lock(&smcd_dev_list.mutex); + if (pclc_smcd->ism.chid) + /* check for ISM device matching proposed native ISM device */ + smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), + ntohll(pclc_smcd->ism.gid), &matches); + for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { + /* check for ISM devices matching proposed non-native ISM + * devices + */ + smc_check_ism_v2_match(ini, + ntohs(smcd_v2_ext->gidchid[i - 1].chid), + ntohll(smcd_v2_ext->gidchid[i - 1].gid), + &matches); + } + mutex_unlock(&smcd_dev_list.mutex); + + if (!ini->ism_dev[0]) { + smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); + goto not_found; + } + + smc_ism_get_system_eid(&eid); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, + smcd_v2_ext->system_eid, eid)) + goto not_found; + + /* separate - outside the smcd_dev_list.lock */ + smcd_version = ini->smcd_version; + for (i = 0; i < matches; i++) { + ini->smcd_version = SMC_V2; + ini->is_smcd = true; + ini->ism_selected = i; + rc = smc_listen_ism_init(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + /* try next active ISM device */ + continue; + } + return; /* matching and usable V2 ISM device found */ + } + /* no V2 ISM device could be initialized */ + ini->smcd_version = smcd_version; /* restore original value */ + ini->negotiated_eid[0] = 0; + +not_found: + ini->smcd_version &= ~SMC_V2; + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + +static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + int rc = 0; + + /* check if ISM V1 is available */ + if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + goto not_found; + ini->is_smcd = true; /* prepare ISM check */ + ini->ism_peer_gid[0] = 
ntohll(pclc_smcd->ism.gid); + rc = smc_find_ism_device(new_smc, ini); + if (rc) + goto not_found; + ini->ism_selected = 0; + rc = smc_listen_ism_init(new_smc, ini); + if (!rc) + return; /* V1 ISM device found */ + +not_found: + smc_find_ism_store_rc(rc, ini); + ini->smcd_version &= ~SMC_V1; + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + +/* listen worker: register buffers */ +static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) +{ + struct smc_connection *conn = &new_smc->conn; + + if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + return 0; +} + +static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *smc_v2_ext; + u8 smcr_version; + int rc; + + if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) + goto not_found; + + smc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + goto not_found; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + ini->check_smcrv2 = true; + ini->smcrv2.clc_sk = new_smc->clcsock->sk; + ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + goto not_found; + } + if (!ini->smcrv2.uses_gateway) + memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); + + smcr_version = ini->smcr_version; + ini->smcr_version = SMC_V2; + rc = smc_listen_rdma_init(new_smc, ini); + if (!rc) { + rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (rc) + smc_conn_abort(new_smc, ini->first_contact_local); + } + if (!rc) + return; + ini->smcr_version = smcr_version; + smc_find_ism_store_rc(rc, ini); + +not_found: + ini->smcr_version &= ~SMC_V2; + ini->smcrv2.ib_dev_v2 = NULL; + ini->check_smcrv2 = false; +} + +static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int rc; + + if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) + return SMC_CLC_DECL_NOSMCDEV; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + /* no RDMA device found */ + return SMC_CLC_DECL_NOSMCDEV; + } + rc = smc_listen_rdma_init(new_smc, ini); + if (rc) + return rc; + return smc_listen_rdma_reg(new_smc, ini->first_contact_local); +} + +/* determine the local device matching to proposal */ +static int smc_listen_find_device(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int prfx_rc; + + /* check for ISM device matching V2 proposed device */ + smc_find_ism_v2_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + + /* get 
vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) + return ini->rc ?: SMC_CLC_DECL_GETVLANERR; + + /* check for ISM device matching V1 proposed device */ + if (!prfx_rc) + smc_find_ism_v1_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + if (!smcr_indicated(pclc->hdr.typev1) && + !smcr_indicated(pclc->hdr.typev2)) + /* skip RDMA and decline */ + return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; + + /* check if RDMA V2 is available */ + smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (ini->smcrv2.ib_dev_v2) + return 0; + + /* check if RDMA V1 is available */ + if (!prfx_rc) { + int rc; + + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + return (!rc) ? 0 : ini->rc; + } + return prfx_rc; +} + +/* listen worker: finish RDMA setup */ +static int smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + bool local_first, + struct smc_init_info *ini) +{ + struct smc_link *link = new_smc->conn.lnk; + int reason_code = 0; + + if (local_first) + smc_link_save_peer_info(link, cclc, ini); + + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) + return SMC_CLC_DECL_ERR_RTOK; + + if (local_first) { + if (smc_ib_ready_link(link)) + return SMC_CLC_DECL_ERR_RDYLNK; + /* QP confirmation over RoCE fabric */ + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); + } + return reason_code; +} + +/* setup for connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_clc_msg_accept_confirm *cclc; + struct smc_clc_msg_proposal_area *buf; + struct smc_clc_msg_proposal *pclc; + struct smc_init_info *ini = NULL; + u8 proposal_version = SMC_V1; + u8 accept_version; + int rc = 0; + + if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) + return smc_listen_out_err(new_smc); + + if (new_smc->use_fallback) { + smc_listen_out_connected(new_smc); + return; + } + + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); + return; + } + + /* do inband token exchange - + * wait for and receive SMC Proposal CLC message + */ + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + pclc = (struct smc_clc_msg_proposal *)buf; + rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; + + if (pclc->hdr.version > SMC_V1) + proposal_version = SMC_V2; + + /* IPSec connections opt out of SMC optimizations */ + if (using_ipsec(new_smc)) { + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; + } + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + + /* initial version checking */ + rc = smc_listen_v2_check(new_smc, pclc, ini); + if (rc) + goto out_decl; + + rc = smc_clc_srv_v2x_features_validate(pclc, ini); + if (rc) + goto out_decl; + + mutex_lock(&smc_server_lgr_pending); + smc_close_init(new_smc); + smc_rx_init(new_smc); + smc_tx_init(new_smc); + + /* determine ISM or RoCE device used for connection */ + rc = smc_listen_find_device(new_smc, pclc, ini); + if (rc) + goto out_unlock; + + /* send SMC Accept CLC message 
*/ + accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; + rc = smc_clc_send_accept(new_smc, ini->first_contact_local, + accept_version, ini->negotiated_eid, ini); + if (rc) + goto out_unlock; + + /* SMC-D does not need this lock any more */ + if (ini->is_smcd) + mutex_unlock(&smc_server_lgr_pending); + + /* receive SMC Confirm CLC message */ + memset(buf, 0, sizeof(*buf)); + cclc = (struct smc_clc_msg_accept_confirm *)buf; + rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; + } + + rc = smc_clc_v2x_features_confirm_check(cclc, ini); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; + } + + /* fce smc release version is needed in smc_listen_rdma_finish, + * so save fce info here. + */ + smc_conn_save_peer_info_fce(new_smc, cclc); + + /* finish worker */ + if (!ini->is_smcd) { + rc = smc_listen_rdma_finish(new_smc, cclc, + ini->first_contact_local, ini); + if (rc) + goto out_unlock; + mutex_unlock(&smc_server_lgr_pending); + } + smc_conn_save_peer_info(new_smc, cclc); + smc_listen_out_connected(new_smc); + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + goto out_free; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, + proposal_version); +out_free: + kfree(ini); + kfree(buf); +} + +static void smc_tcp_listen_work(struct work_struct *work) +{ + struct smc_sock *lsmc = container_of(work, struct smc_sock, + tcp_listen_work); + struct sock *lsk = &lsmc->sk; + struct smc_sock *new_smc; + int rc = 0; + + lock_sock(lsk); + while (lsk->sk_state == SMC_LISTEN) { + rc = smc_clcsock_accept(lsmc, &new_smc); + if (rc) /* clcsock accept queue empty or error */ + goto out; + if (!new_smc) + continue; + + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + + new_smc->listen_smc = lsmc; + new_smc->use_fallback = lsmc->use_fallback; + new_smc->fallback_rsn = lsmc->fallback_rsn; + sock_hold(lsk); /* sock_put in smc_listen_work */ + INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); + smc_copy_sock_settings_to_smc(new_smc); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) + sock_put(&new_smc->sk); + } + +out: + release_sock(lsk); + sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ +} + +static void smc_clcsock_data_ready(struct sock *listen_clcsock) +{ + struct smc_sock *lsmc; + + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); + if (!lsmc) + goto out; + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); + } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); +} + +static int smc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + lock_sock(sk); + + rc = -EINVAL; + if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || + smc->connect_nonblock || sock->state != SS_UNCONNECTED) + goto out; + + rc = 0; + if (sk->sk_state == SMC_LISTEN) { + sk->sk_max_ack_backlog = backlog; + goto out; + } + /* some socket options are handled in core, so we could not apply + * them to the clc socket -- copy smc socket options to clc socket + */ + 
smc_copy_sock_settings_to_clc(smc); + if (!smc->use_fallback) + tcp_sk(smc->clcsock->sk)->syn_smc = 1; + + /* save original sk_data_ready function and establish + * smc-specific sk_data_ready function + */ + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + + /* save original ops */ + smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; + + smc->af_ops = *smc->ori_af_ops; + smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; + + inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + + rc = kernel_listen(smc->clcsock, backlog); + if (rc) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + goto out; + } + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = SMC_LISTEN; + +out: + release_sock(sk); + return rc; +} + +static int smc_accept(struct socket *sock, struct socket *new_sock, + int flags, bool kern) +{ + struct sock *sk = sock->sk, *nsk; + DECLARE_WAITQUEUE(wait, current); + struct smc_sock *lsmc; + long timeo; + int rc = 0; + + lsmc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ + lock_sock(sk); + + if (lsmc->sk.sk_state != SMC_LISTEN) { + rc = -EINVAL; + release_sock(sk); + goto out; + } + + /* Wait for an incoming connection */ + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (!(nsk = smc_accept_dequeue(sk, new_sock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + rc = -EAGAIN; + break; + } + release_sock(sk); + timeo = schedule_timeout(timeo); + /* wakeup by sk_data_ready in smc_listen_work() */ + sched_annotate_sleep(); + lock_sock(sk); + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (!rc) + rc = sock_error(nsk); + release_sock(sk); + if (rc) + goto out; + + if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + /* wait till data arrives on the socket */ + timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * + MSEC_PER_SEC); + if (smc_sk(nsk)->use_fallback) { + struct sock *clcsk = smc_sk(nsk)->clcsock->sk; + + lock_sock(clcsk); + if (skb_queue_empty(&clcsk->sk_receive_queue)) + sk_wait_data(clcsk, &timeo, NULL); + release_sock(clcsk); + } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { + lock_sock(nsk); + smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + release_sock(nsk); + } + } + +out: + sock_put(sk); /* sock_hold above */ + return rc; +} + +static int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer) +{ + struct smc_sock *smc; + + if (peer && (sock->sk->sk_state != SMC_ACTIVE) && + (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) + return -ENOTCONN; + + smc = smc_sk(sock->sk); + + return smc->clcsock->ops->getname(smc->clcsock, addr, peer); +} + +static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + lock_sock(sk); + + /* SMC does not support connect with fastopen */ + if 
(msg->msg_flags & MSG_FASTOPEN) { + /* not connected yet, fallback */ + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + if (rc) + goto out; + } else { + rc = -EINVAL; + goto out; + } + } else if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) { + rc = -EPIPE; + goto out; + } + + if (smc->use_fallback) { + rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); + } else { + rc = smc_tx_sendmsg(smc, msg, len); + SMC_STAT_TX_PAYLOAD(smc, len, rc); + } +out: + release_sock(sk); + return rc; +} + +static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } + if ((sk->sk_state == SMC_INIT) || + (sk->sk_state == SMC_LISTEN) || + (sk->sk_state == SMC_CLOSED)) + goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; + goto out; + } + + if (smc->use_fallback) { + rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); + } else { + msg->msg_namelen = 0; + rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + SMC_STAT_RX_PAYLOAD(smc, rc, rc); + } + +out: + release_sock(sk); + return rc; +} + +static __poll_t smc_accept_poll(struct sock *parent) +{ + struct smc_sock *isk = smc_sk(parent); + __poll_t mask = 0; + + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&isk->accept_q_lock); + + return mask; +} + +static __poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + __poll_t mask = 0; + + if (!sk) + return EPOLLNVAL; + + smc = smc_sk(sock->sk); + if (smc->use_fallback) { + /* delegate to CLC child sock */ + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + sk->sk_err = smc->clcsock->sk->sk_err; + } else { + if (sk->sk_state != SMC_CLOSED) + sock_poll_wait(file, sock, wait); + if (sk->sk_err) + mask |= EPOLLERR; + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + mask |= EPOLLHUP; + if (sk->sk_state == SMC_LISTEN) { + /* woken up by sk_data_ready in smc_listen_work() */ + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; + } else { + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || + sk->sk_shutdown & SEND_SHUTDOWN) { + mask |= EPOLLOUT | EPOLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= EPOLLIN | EPOLLRDNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1) + mask |= EPOLLIN; + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; + } + } + + return mask; +} + +static int smc_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + bool do_shutdown = true; + struct smc_sock *smc; + int rc = -EINVAL; + int old_state; + int rc1 = 0; + + smc = smc_sk(sk); + + if ((how < SHUT_RD) || (how > SHUT_RDWR)) + return rc; + + lock_sock(sk); + + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == 
SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + + rc = -ENOTCONN; + if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_PEERCLOSEWAIT1) && + (sk->sk_state != SMC_PEERCLOSEWAIT2) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_APPCLOSEWAIT2) && + (sk->sk_state != SMC_APPFINCLOSEWAIT)) + goto out; + if (smc->use_fallback) { + rc = kernel_sock_shutdown(smc->clcsock, how); + sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; + if (sk->sk_shutdown == SHUTDOWN_MASK) { + sk->sk_state = SMC_CLOSED; + sk->sk_socket->state = SS_UNCONNECTED; + sock_put(sk); + } + goto out; + } + switch (how) { + case SHUT_RDWR: /* shutdown in both directions */ + old_state = sk->sk_state; + rc = smc_close_active(smc); + if (old_state == SMC_ACTIVE && + sk->sk_state == SMC_PEERCLOSEWAIT1) + do_shutdown = false; + break; + case SHUT_WR: + rc = smc_close_shutdown_write(smc); + break; + case SHUT_RD: + rc = 0; + /* nothing more to do because peer is not involved */ + break; + } + if (do_shutdown && smc->clcsock) + rc1 = kernel_sock_shutdown(smc->clcsock, how); + /* map sock_shutdown_cmd constants to sk_shutdown value range */ + sk->sk_shutdown |= how + 1; + + if (sk->sk_state == SMC_CLOSED) + sock->state = SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; +out: + release_sock(sk); + return rc ? rc : rc1; +} + +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) { + rc = -EINVAL; + break; + } + if (copy_from_sockptr(&val, optval, sizeof(int))) { + rc = -EFAULT; + break; + } + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + +static int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + if (level == SOL_TCP && optname == TCP_ULP) + return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); + + smc = smc_sk(sk); + + /* generic setsockopts reaching us here always apply to the + * CLC socket + */ + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } + if (unlikely(!smc->clcsock->ops->setsockopt)) + rc = -EOPNOTSUPP; + else + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); + if (smc->clcsock->sk->sk_err) { + sk->sk_err = smc->clcsock->sk->sk_err; + sk_error_report(sk); + } + 
mutex_unlock(&smc->clcsock_release_lock); + + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + + lock_sock(sk); + if (rc || smc->use_fallback) + goto out; + switch (optname) { + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: + case TCP_FASTOPEN_NO_COOKIE: + /* option not supported by SMC */ + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + } else { + rc = -EINVAL; + } + break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (val) { + SMC_STAT_INC(smc, ndly_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (!val) { + SMC_STAT_INC(smc, cork_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } + } + break; + case TCP_DEFER_ACCEPT: + smc->sockopt_defer_accept = val; + break; + default: + break; + } +out: + release_sock(sk); + + return rc; +} + +static int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int rc; + + if (level == SOL_SMC) + return __smc_getsockopt(sock, level, optname, optval, optlen); + + smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } + /* socket options apply to the CLC socket */ + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); + return -EOPNOTSUPP; + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; +} + +static int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + union smc_host_cursor cons, urg; + struct smc_connection *conn; + struct smc_sock *smc; + int answ; + + smc = smc_sk(sock->sk); + conn = &smc->conn; + lock_sock(&smc->sk); + if (smc->use_fallback) { + if (!smc->clcsock) { + release_sock(&smc->sk); + return -EBADF; + } + answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); + release_sock(&smc->sk); + return answ; + } + switch (cmd) { + case SIOCINQ: /* same as FIONREAD */ + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); + return -EINVAL; + } + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = atomic_read(&smc->conn.bytes_to_rcv); + break; + case SIOCOUTQ: + /* output queue size (not send + not acked) */ + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); + return -EINVAL; + } + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc->conn.sndbuf_desc->len - + atomic_read(&smc->conn.sndbuf_space); + break; + case SIOCOUTQNSD: + /* output queue size (not send only) */ + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); + return -EINVAL; + } + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc_tx_prepared_sends(&smc->conn); + break; + case SIOCATMARK: + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); + return -EINVAL; + } + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) { + answ = 0; + } else { + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + 
smc_curs_copy(&urg, &conn->urg_curs, conn); + answ = smc_curs_diff(conn->rmb_desc->len, + &cons, &urg) == 1; + } + break; + default: + release_sock(&smc->sk); + return -ENOIOCTLCMD; + } + release_sock(&smc->sk); + + return put_user(answ, (int __user *)arg); +} + +/* Map the affected portions of the rmbe into an spd, note the number of bytes + * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor + * updates till whenever a respective page has been fully processed. + * Note that subsequent recv() calls have to wait till all splice() processing + * completed. + */ +static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } + if (sk->sk_state == SMC_INIT || + sk->sk_state == SMC_LISTEN || + sk->sk_state == SMC_CLOSED) + goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; + goto out; + } + + if (smc->use_fallback) { + rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, + pipe, len, flags); + } else { + if (*ppos) { + rc = -ESPIPE; + goto out; + } + if (flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; + SMC_STAT_INC(smc, splice_cnt); + rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); + } +out: + release_sock(sk); + + return rc; +} + +/* must look like tcp */ +static const struct proto_ops smc_sock_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static int __smc_create(struct net *net, struct socket *sock, int protocol, + int kern, struct socket *clcsock) +{ + int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; + struct smc_sock *smc; + struct sock *sk; + int rc; + + rc = -ESOCKTNOSUPPORT; + if (sock->type != SOCK_STREAM) + goto out; + + rc = -EPROTONOSUPPORT; + if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) + goto out; + + rc = -ENOBUFS; + sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; + sk = smc_sock_alloc(net, sock, protocol); + if (!sk) + goto out; + + /* create internal TCP socket for CLC handshake and fallback */ + smc = smc_sk(sk); + smc->use_fallback = false; /* assume rdma capability first */ + smc->fallback_rsn = 0; + + /* default behavior from limit_smc_hs in every net namespace */ + smc->limit_smc_hs = net->smc.limit_smc_hs; + + rc = 0; + if (!clcsock) { + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { + sk_common_release(sk); + goto out; + } + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. 
+ */ + sk = smc->clcsock->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); + } else { + smc->clcsock = clcsock; + } + +out: + return rc; +} + +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + return __smc_create(net, sock, protocol, kern, NULL); +} + +static const struct net_proto_family smc_sock_family_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .create = smc_create, +}; + +static int smc_ulp_init(struct sock *sk) +{ + struct socket *tcp = sk->sk_socket; + struct net *net = sock_net(sk); + struct socket *smcsock; + int protocol, ret; + + /* only TCP can be replaced */ + if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || + (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) + return -ESOCKTNOSUPPORT; + /* don't handle wq now */ + if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) + return -ENOTCONN; + + if (sk->sk_family == AF_INET) + protocol = SMCPROTO_SMC; + else + protocol = SMCPROTO_SMC6; + + smcsock = sock_alloc(); + if (!smcsock) + return -ENFILE; + + smcsock->type = SOCK_STREAM; + __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ + ret = __smc_create(net, smcsock, protocol, 1, tcp); + if (ret) { + sock_release(smcsock); /* module_put() which ops won't be NULL */ + return ret; + } + + /* replace tcp socket to smc */ + smcsock->file = tcp->file; + smcsock->file->private_data = smcsock; + smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ + smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ + tcp->file = NULL; + + return ret; +} + +static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + /* don't inherit ulp ops to child when listen */ + icsk->icsk_ulp_ops = NULL; +} + +static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { + .name = "smc", + .owner = THIS_MODULE, + .init = smc_ulp_init, + .clone = smc_ulp_clone, +}; + +unsigned int smc_net_id; + +static __net_init int smc_net_init(struct net *net) +{ + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; + return smc_pnet_net_init(net); +} + +static void __net_exit smc_net_exit(struct net *net) +{ + smc_sysctl_net_exit(net); + smc_pnet_net_exit(net); +} + +static __net_init int smc_net_stat_init(struct net *net) +{ + return smc_stats_init(net); +} + +static void __net_exit smc_net_stat_exit(struct net *net) +{ + smc_stats_exit(net); +} + +static struct pernet_operations smc_net_ops = { + .init = smc_net_init, + .exit = smc_net_exit, + .id = &smc_net_id, + .size = sizeof(struct smc_net), +}; + +static struct pernet_operations smc_net_stat_ops = { + .init = smc_net_stat_init, + .exit = smc_net_stat_exit, +}; + +static int __init smc_init(void) +{ + int rc; + + rc = register_pernet_subsys(&smc_net_ops); + if (rc) + return rc; + + rc = register_pernet_subsys(&smc_net_stat_ops); + if (rc) + goto out_pernet_subsys; + + rc = smc_ism_init(); + if (rc) + goto out_pernet_subsys_stat; + smc_clc_init(); + + rc = smc_nl_init(); + if (rc) + goto out_ism; + + rc = smc_pnet_init(); + if (rc) + goto out_nl; + + rc = -ENOMEM; + + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + if (!smc_tcp_ls_wq) + goto out_pnet; + + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + if (!smc_hs_wq) + goto out_alloc_tcp_ls_wq; + + smc_close_wq = 
alloc_workqueue("smc_close_wq", 0, 0); + if (!smc_close_wq) + goto out_alloc_hs_wq; + + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_alloc_wqs; + } + + rc = smc_llc_init(); + if (rc) { + pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); + goto out_core; + } + + rc = smc_cdc_init(); + if (rc) { + pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); + goto out_core; + } + + rc = proto_register(&smc_proto, 1); + if (rc) { + pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); + goto out_core; + } + + rc = proto_register(&smc_proto6, 1); + if (rc) { + pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); + goto out_proto; + } + + rc = sock_register(&smc_sock_family_ops); + if (rc) { + pr_err("%s: sock_register fails with %d\n", __func__, rc); + goto out_proto6; + } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); + + rc = smc_ib_register_client(); + if (rc) { + pr_err("%s: ib_register fails with %d\n", __func__, rc); + goto out_sock; + } + + rc = tcp_register_ulp(&smc_ulp_ops); + if (rc) { + pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + goto out_ib; + } + + static_branch_enable(&tcp_have_smc); + return 0; + +out_ib: + smc_ib_unregister_client(); +out_sock: + sock_unregister(PF_SMC); +out_proto6: + proto_unregister(&smc_proto6); +out_proto: + proto_unregister(&smc_proto); +out_core: + smc_core_exit(); +out_alloc_wqs: + destroy_workqueue(smc_close_wq); +out_alloc_hs_wq: + destroy_workqueue(smc_hs_wq); +out_alloc_tcp_ls_wq: + destroy_workqueue(smc_tcp_ls_wq); +out_pnet: + smc_pnet_exit(); +out_nl: + smc_nl_exit(); +out_ism: + smc_clc_exit(); + smc_ism_exit(); +out_pernet_subsys_stat: + unregister_pernet_subsys(&smc_net_stat_ops); +out_pernet_subsys: + unregister_pernet_subsys(&smc_net_ops); + + return rc; +} + +static void __exit smc_exit(void) +{ + static_branch_disable(&tcp_have_smc); + tcp_unregister_ulp(&smc_ulp_ops); + sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); + smc_ism_exit(); + destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_tcp_ls_wq); + destroy_workqueue(smc_hs_wq); + proto_unregister(&smc_proto6); + proto_unregister(&smc_proto); + smc_pnet_exit(); + smc_nl_exit(); + smc_clc_exit(); + unregister_pernet_subsys(&smc_net_stat_ops); + unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); +} + +module_init(smc_init); +module_exit(smc_exit); + +MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("smc socket address family"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_SMC); +MODULE_ALIAS_TCP_ULP("smc"); +MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc.h b/net/smc/smc.h new file mode 100644 index 0000000000..e377980b84 --- /dev/null +++ b/net/smc/smc.h @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ +#ifndef __SMC_H +#define __SMC_H + +#include <linux/socket.h> +#include <linux/types.h> +#include <linux/compiler.h> /* __aligned */ +#include <net/genetlink.h> +#include <net/sock.h> + +#include "smc_ib.h" + +#define SMC_V1 1 /* SMC version V1 */ +#define SMC_V2 2 /* SMC version V2 */ + +#define SMC_RELEASE_0 0 +#define SMC_RELEASE_1 1 +#define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */ + +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ + +#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM + * devices + */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ + +extern struct proto smc_proto; +extern struct proto smc_proto6; + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +enum smc_state { /* possible states of an SMC socket */ + SMC_ACTIVE = 1, + SMC_INIT = 2, + SMC_CLOSED = 7, + SMC_LISTEN = 10, + /* normal close */ + SMC_PEERCLOSEWAIT1 = 20, + SMC_PEERCLOSEWAIT2 = 21, + SMC_APPFINCLOSEWAIT = 24, + SMC_APPCLOSEWAIT1 = 22, + SMC_APPCLOSEWAIT2 = 23, + SMC_PEERFINCLOSEWAIT = 25, + /* abnormal close */ + SMC_PEERABORTWAIT = 26, + SMC_PROCESSABORT = 27, +}; + +struct smc_link_group; + +struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ + union { + u8 type; +#if defined(__BIG_ENDIAN_BITFIELD) + struct { + u8 llc_version:4, + llc_type:4; + }; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + struct { + u8 llc_type:4, + llc_version:4; + }; +#endif + }; +} __aligned(1); + +struct smc_cdc_conn_state_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 peer_done_writing : 1; /* Sending done indicator */ + u8 peer_conn_closed : 1; /* Peer connection closed indicator */ + u8 peer_conn_abort : 1; /* Abnormal close indicator */ + u8 reserved : 5; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 5; + u8 peer_conn_abort : 1; + u8 peer_conn_closed : 1; + u8 peer_done_writing : 1; +#endif +}; + +struct smc_cdc_producer_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ + u8 urg_data_pending : 1; /* Urgent Data Pending */ + u8 urg_data_present : 1; /* Urgent Data Present */ + u8 cons_curs_upd_req : 1; /* cursor update requested */ + u8 failover_validation : 1;/* message replay due to failover */ + u8 reserved : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 3; + u8 failover_validation : 1; + u8 cons_curs_upd_req : 1; + u8 urg_data_present : 1; + u8 urg_data_pending : 1; + u8 write_blocked : 1; +#endif +}; + +/* in host byte order */ +union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ + struct { + u16 reserved; + u16 wrap; /* window wrap sequence number */ + u32 count; /* cursor (= offset) part */ + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in host byte order, except for flag bitfields in network byte order */ +struct smc_host_cdc_msg { /* Connection Data Control message */ + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* length = 44 */ + u16 seqno; /* connection seq # */ + u32 token; /* alert_token */ + union smc_host_cursor prod; /* producer cursor */ + union smc_host_cursor cons; /* consumer cursor, + * piggy backed "ack" + */ + struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ + struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. 
status*/ + u8 reserved[18]; +} __aligned(8); + +enum smc_urg_state { + SMC_URG_VALID = 1, /* data present */ + SMC_URG_NOTYET = 2, /* data pending */ + SMC_URG_READ = 3, /* data was already read */ +}; + +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + +struct smc_connection { + struct rb_node alert_node; + struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ + u32 alert_token_local; /* unique conn. id */ + u8 peer_rmbe_idx; /* from tcp handshake */ + int peer_rmbe_size; /* size of peer rx buffer */ + atomic_t peer_rmbe_space;/* remaining free bytes in peer + * rmbe + */ + int rtoken_idx; /* idx to peer RMB rkey/addr */ + + struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ + struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ + int rmbe_size_comp; /* compressed notation */ + int rmbe_update_limit; + /* lower limit for consumer + * cursor update + */ + + struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging + * buffer for CDC msg send + * .prod cf. TCP snd_nxt + * .cons cf. TCP sends ack + */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ + union smc_host_cursor tx_curs_prep; /* tx - prepared data + * snd_max..wmem_alloc + */ + union smc_host_cursor tx_curs_sent; /* tx - sent data + * snd_nxt ? + */ + union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer + * snd-wnd-begin ? + */ + atomic_t sndbuf_space; /* remaining space in sndbuf */ + u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ + spinlock_t send_lock; /* protect wr_sends */ + atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe + * - inc when post wqe, + * - dec on polled tx cqe + */ + wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ + u32 tx_off; /* base offset in peer rmb */ + + struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. + * .prod cf. TCP rcv_nxt + * .cons cf. TCP snd_una + */ + union smc_host_cursor rx_curs_confirmed; /* confirmed to peer + * source of snd_una ? + */ + union smc_host_cursor urg_curs; /* points at urgent byte */ + enum smc_urg_state urg_state; + bool urg_tx_pend; /* urgent data staged */ + bool urg_rx_skip_pend; + /* indicate urgent oob data + * read, but previous regular + * data still pending + */ + char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ + atomic_t bytes_to_rcv; /* arrived data, + * not yet received + */ + atomic_t splice_pending; /* number of spliced bytes + * pending processing + */ +#ifndef KERNEL_HAS_ATOMIC64 + spinlock_t acurs_lock; /* protect cursors */ +#endif + struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ + struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ + u8 rx_off; /* receive offset: + * 0 for SMC-R, 32 for SMC-D + */ + u64 peer_token; /* SMC-D token of peer */ + u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ + u8 out_of_sync : 1; /* out of sync with peer */ +}; + +struct smc_sock { /* smc sock container */ + struct sock sk; + struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. 
*/ + void (*clcsk_data_ready)(struct sock *sk); + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. */ + struct smc_connection conn; /* smc connection */ + struct smc_sock *listen_smc; /* listen parent */ + struct work_struct connect_work; /* handle non-blocking connect*/ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct work_struct smc_listen_work;/* prepare new accept socket */ + struct list_head accept_q; /* sockets to be accepted */ + spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ + bool use_fallback; /* fallback to tcp */ + int fallback_rsn; /* reason for fallback */ + u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ + u8 wait_close_tx_prepared : 1; + /* shutdown wr or close + * started, waiting for unsent + * data to be sent + */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ + struct mutex clcsock_release_lock; + /* protects clcsock of a listen + * socket + * */ +}; + +#define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk) + +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + +static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + +extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +extern struct workqueue_struct *smc_close_wq; /* wq for close work */ + +#define SMC_SYSTEMID_LEN 8 + +extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ + +#define ntohll(x) be64_to_cpu(x) +#define htonll(x) cpu_to_be64(x) + +/* convert an u32 value into network byte order, store it into a 3 byte field */ +static inline void hton24(u8 *net, u32 host) +{ + __be32 t; + + t = cpu_to_be32(host); + memcpy(net, ((u8 *)&t) + 1, 3); +} + +/* convert a received 3 byte field into host byte order*/ +static inline u32 ntoh24(u8 *net) +{ + __be32 t = 0; + + memcpy(((u8 *)&t) + 1, net, 3); + return be32_to_cpu(t); +} + +#ifdef CONFIG_XFRM +static inline bool using_ipsec(struct smc_sock *smc) +{ + return (smc->clcsock->sk->sk_policy[0] || + smc->clcsock->sk->sk_policy[1]) ? 
true : false; +} +#else +static inline bool using_ipsec(struct smc_sock *smc) +{ + return false; +} +#endif + +struct smc_gidlist; + +struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); +void smc_close_non_accepted(struct sock *sk); +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid); + +/* smc handshake limitation interface for netlink */ +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); + +static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) +{ + set_bit(flag, &sk->sk_flags); +} + +#endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c new file mode 100644 index 0000000000..3c06625ceb --- /dev/null +++ b/net/smc/smc_cdc.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * handles flow control + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/spinlock.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" + +/********************************** send *************************************/ + +/* handler for send/transmission completion of a CDC msg */ +static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_connection *conn = cdcpend->conn; + struct smc_buf_desc *sndbuf_desc; + struct smc_sock *smc; + int diff; + + sndbuf_desc = conn->sndbuf_desc; + smc = container_of(conn, struct smc_sock, conn); + bh_lock_sock(&smc->sk); + if (!wc_status && sndbuf_desc) { + diff = smc_curs_diff(sndbuf_desc->len, + &cdcpend->conn->tx_curs_fin, + &cdcpend->cursor); + /* sndbuf_space is decreased in smc_sendmsg */ + smp_mb__before_atomic(); + atomic_add(diff, &cdcpend->conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; + } + + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If user owns the sock_lock, mark the connection need sending. 
+ * User context will later try to send when it release sock_lock + * in smc_release_cb() + */ + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } + WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); + + smc_tx_sndbuf_nonfull(smc); + bh_unlock_sock(&smc->sk); +} + +int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, + struct smc_cdc_tx_pend **pend) +{ + int rc; + + rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, + wr_rdma_buf, + (struct smc_wr_tx_pend_priv **)pend); + if (conn->killed) { + /* abnormal termination */ + if (!rc) + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)(*pend)); + rc = -EPIPE; + } + return rc; +} + +static inline void smc_cdc_add_pending_send(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend) +{ + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); + BUILD_BUG_ON_MSG( + offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)"); + pend->conn = conn; + pend->cursor = conn->tx_curs_sent; + pend->p_cursor = conn->local_tx_ctrl.prod; + pend->ctrl_seq = conn->tx_cdc_seq; +} + +int smc_cdc_msg_send(struct smc_connection *conn, + struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend) +{ + struct smc_link *link = conn->lnk; + union smc_host_cursor cfed; + int rc; + + smc_cdc_add_pending_send(conn, pend); + + conn->tx_cdc_seq++; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (!rc) { + smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + atomic_dec(&conn->cdc_pend_tx_wr); + } + + return rc; +} + +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_msg *peer; + int rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. 
tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + /* We need to set pend->conn here to make sure smc_cdc_tx_handler() + * can handle properly + */ + smc_cdc_add_pending_send(conn, pend); + + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (unlikely(rc)) + atomic_dec(&conn->cdc_pend_tx_wr); + + return rc; +} + +static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; + int rc; + +again: + link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); + if (rc) + goto put_out; + + spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); + if (again) + return -ENOLINK; + again = true; + goto again; + } + rc = smc_cdc_msg_send(conn, wr_buf, pend); + spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + int rc; + + if (!smc_conn_lgr_valid(conn) || + (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + return -EPIPE; + + if (conn->lgr->is_smcd) { + spin_lock_bh(&conn->send_lock); + rc = smcd_cdc_msg_send(conn); + spin_unlock_bh(&conn->send_lock); + } else { + rc = smcr_cdc_get_slot_and_msg_send(conn); + } + + return rc; +} + +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) +{ + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); +} + +/* Send a SMC-D CDC header. + * This increments the free space available in our send buffer. + * Also update the confirmed receive buffer with what was sent to the peer. 
+ */ +int smcd_cdc_msg_send(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + union smc_host_cursor curs; + struct smcd_cdc_msg cdc; + int rc, diff; + + memset(&cdc, 0, sizeof(cdc)); + cdc.common.type = SMC_CDC_MSG_TYPE; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs); + cdc.prod.wrap = curs.wrap; + cdc.prod.count = curs.count; + curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs); + cdc.cons.wrap = curs.wrap; + cdc.cons.count = curs.count; + cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); + if (rc) + return rc; + smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + /* Calculate transmitted data and increment free send buffer space */ + diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, + &conn->tx_curs_sent); + /* increased by confirmed number of bytes */ + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); + + smc_tx_sndbuf_nonfull(smc); + return rc; +} + +/********************************* receive ***********************************/ + +static inline bool smc_cdc_before(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) < 0; +} + +static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, + int *diff_prod) +{ + struct smc_connection *conn = &smc->conn; + char *base; + + /* new data included urgent business */ + smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn); + conn->urg_state = SMC_URG_VALID; + if (!sock_flag(&smc->sk, SOCK_URGINLINE)) + /* we'll skip the urgent byte, so don't account for it */ + (*diff_prod)--; + base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off; + if (conn->urg_curs.count) + conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); + else + conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1); + sk_send_sigurg(&smc->sk); +} + +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!queue_work(smc_close_wq, &conn->abort_work)) + sock_put(&smc->sk); + } +} + +static void smc_cdc_msg_recv_action(struct smc_sock *smc, + struct smc_cdc_msg *cdc) +{ + union smc_host_cursor cons_old, prod_old; + struct smc_connection *conn = &smc->conn; + int diff_cons, diff_prod; + + smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); + smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); + smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); + + diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, + &conn->local_rx_ctrl.cons); + if (diff_cons) { + /* peer_rmbe_space is decreased during data transfer with RDMA + * write + */ + smp_mb__before_atomic(); + atomic_add(diff_cons, &conn->peer_rmbe_space); + /* guarantee 0 <= 
peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + } + + diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, + &conn->local_rx_ctrl.prod); + if (diff_prod) { + if (conn->local_rx_ctrl.prod_flags.urg_data_present) + smc_cdc_handle_urg_data_arrival(smc, &diff_prod); + /* bytes_to_rcv is decreased in smc_recvmsg */ + smp_mb__before_atomic(); + atomic_add(diff_prod, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ + smp_mb__after_atomic(); + smc->sk.sk_data_ready(&smc->sk); + } else { + if (conn->local_rx_ctrl.prod_flags.write_blocked) + smc->sk.sk_data_ready(&smc->sk); + if (conn->local_rx_ctrl.prod_flags.urg_data_pending) + conn->urg_state = SMC_URG_NOTYET; + } + + /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ + if ((diff_cons && smc_tx_prepared_sends(conn)) || + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } + + if (diff_cons && conn->urg_tx_pend && + atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { + /* urg data confirmed by peer, indicate we're ready for more */ + conn->urg_tx_pend = false; + smc->sk.sk_write_space(&smc->sk); + } + + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + smc->sk.sk_err = ECONNRESET; + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + } + if (smc_cdc_rxed_any_close_or_senddone(conn)) { + smc->sk.sk_shutdown |= RCV_SHUTDOWN; + if (smc->clcsock && smc->clcsock->sk) + smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; + smc_sock_set_flag(&smc->sk, SOCK_DONE); + sock_hold(&smc->sk); /* sock_put in close_work */ + if (!queue_work(smc_close_wq, &conn->close_work)) + sock_put(&smc->sk); + } +} + +/* called under tasklet context */ +static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) +{ + sock_hold(&smc->sk); + bh_lock_sock(&smc->sk); + smc_cdc_msg_recv_action(smc, cdc); + bh_unlock_sock(&smc->sk); + sock_put(&smc->sk); /* no free sk in softirq-context */ +} + +/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ + * handler to indicate update in the DMBE. + * + * Context: + * - tasklet context + */ +static void smcd_cdc_rx_tsklet(struct tasklet_struct *t) +{ + struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet); + struct smcd_cdc_msg *data_cdc; + struct smcd_cdc_msg cdc; + struct smc_sock *smc; + + if (!conn || conn->killed) + return; + + data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; + smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn); + smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn); + smc = container_of(conn, struct smc_sock, conn); + smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); +} + +/* Initialize receive tasklet. Called from ISM device IRQ handler to start + * receiver side. 
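+ * Usage sketch (assumed caller, not added by this patch): once the
+ * tasklet has been set up here, an ISM interrupt path that notices a
+ * DMBE update for this connection only needs to do
+ *	tasklet_schedule(&conn->rx_tsklet);
+ * which runs smcd_cdc_rx_tsklet() above in softirq context.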
+ */ +void smcd_cdc_rx_init(struct smc_connection *conn) +{ + tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet); +} + +/***************************** init, exit, misc ******************************/ + +static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_cdc_msg *cdc = buf; + struct smc_connection *conn; + struct smc_link_group *lgr; + struct smc_sock *smc; + + if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) + return; /* short message */ + if (cdc->len != SMC_WR_TX_SIZE) + return; /* invalid message */ + + /* lookup connection */ + lgr = smc_get_lgr(link); + read_lock_bh(&lgr->conns_lock); + conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); + read_unlock_bh(&lgr->conns_lock); + if (!conn || conn->out_of_sync) + return; + smc = container_of(conn, struct smc_sock, conn); + + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; + } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + + smc_cdc_msg_recv(smc, cdc); +} + +static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { + { + .handler = smc_cdc_rx_handler, + .type = SMC_CDC_MSG_TYPE + }, + { + .handler = NULL, + } +}; + +int __init smc_cdc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_cdc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h new file mode 100644 index 0000000000..696cc11f23 --- /dev/null +++ b/net/smc/smc_cdc.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CDC_H +#define SMC_CDC_H + +#include <linux/kernel.h> /* max_t */ +#include <linux/atomic.h> +#include <linux/in.h> +#include <linux/compiler.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_wr.h" + +#define SMC_CDC_MSG_TYPE 0xFE + +/* in network byte order */ +union smc_cdc_cursor { /* SMC cursor */ + struct { + __be16 reserved; + __be16 wrap; + __be32 count; + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in network byte order */ +struct smc_cdc_msg { + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* 44 */ + __be16 seqno; + __be32 token; + union smc_cdc_cursor prod; + union smc_cdc_cursor cons; /* piggy backed "ack" */ + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 reserved[18]; +}; + +/* SMC-D cursor format */ +union smcd_cdc_cursor { + struct { + u16 wrap; + u32 count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + } __packed; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* CDC message for SMC-D */ +struct smcd_cdc_msg { + struct smc_wr_rx_hdr common; /* Type = 0xFE */ + u8 res1[7]; + union smcd_cdc_cursor prod; + union smcd_cdc_cursor cons; + u8 res3[8]; +} __aligned(8); + +static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) +{ + return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || + conn->local_rx_ctrl.conn_state_flags.peer_conn_closed; +} + +static inline bool smc_cdc_rxed_any_close_or_senddone( + struct smc_connection *conn) +{ + return smc_cdc_rxed_any_close(conn) || + conn->local_rx_ctrl.conn_state_flags.peer_done_writing; +} + +static inline void smc_curs_add(int size, union smc_host_cursor *curs, + int value) +{ + curs->count += value; + if (curs->count >= size) { + curs->wrap++; + curs->count -= size; + } +} + +/* Copy cursor src into tgt */ +static inline void smc_curs_copy(union smc_host_cursor *tgt, + union smc_host_cursor *src, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + tgt->acurs = src->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); +#endif +} + +static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt, + union smc_cdc_cursor *src, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + tgt->acurs = src->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); +#endif +} + +static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt, + union smcd_cdc_cursor *src, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + tgt->acurs = src->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&tgt->acurs, atomic64_read(&src->acurs)); +#endif +} + +/* calculate cursor difference between old and new, where old <= new and + * difference cannot exceed size + */ +static inline int smc_curs_diff(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap != 
new->wrap) + return max_t(int, 0, + ((size - old->count) + new->count)); + + return max_t(int, 0, (new->count - old->count)); +} + +/* calculate cursor difference between old and new - returns negative + * value in case old > new + */ +static inline int smc_curs_comp(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap > new->wrap || + (old->wrap == new->wrap && old->count > new->count)) + return -smc_curs_diff(size, new, old); + return smc_curs_diff(size, old, new); +} + +/* calculate cursor difference between old and new, where old <= new and + * difference may exceed size + */ +static inline int smc_curs_diff_large(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap < new->wrap) + return min_t(int, + (size - old->count) + new->count + + (new->wrap - old->wrap - 1) * size, + size); + + if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */ + return min_t(int, + (size - old->count) + new->count + + (new->wrap + 0xffff - old->wrap) * size, + size); + + return max_t(int, 0, (new->count - old->count)); +} + +static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, + union smc_host_cursor *local, + union smc_host_cursor *save, + struct smc_connection *conn) +{ + smc_curs_copy(save, local, conn); + peer->count = htonl(save->count); + peer->wrap = htons(save->wrap); + /* peer->reserved = htons(0); must be ensured by caller */ +} + +static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer, + struct smc_connection *conn, + union smc_host_cursor *save) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(local->seqno); + peer->token = htonl(local->token); + smc_host_cursor_to_cdc(&peer->prod, &local->prod, save, conn); + smc_host_cursor_to_cdc(&peer->cons, &local->cons, save, conn); + peer->prod_flags = local->prod_flags; + peer->conn_state_flags = local->conn_state_flags; +} + +static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, + union smc_cdc_cursor *peer, + struct smc_connection *conn) +{ + union smc_host_cursor temp, old; + union smc_cdc_cursor net; + + smc_curs_copy(&old, local, conn); + smc_curs_copy_net(&net, peer, conn); + temp.count = ntohl(net.count); + temp.wrap = ntohs(net.wrap); + if ((old.wrap > temp.wrap) && temp.wrap) + return; + if ((old.wrap == temp.wrap) && + (old.count > temp.count)) + return; + smc_curs_copy(local, &temp, conn); +} + +static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + local->common.type = peer->common.type; + local->len = peer->len; + local->seqno = ntohs(peer->seqno); + local->token = ntohl(peer->token); + smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn); + smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn); + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smcd_cdc_msg *peer, + struct smc_connection *conn) +{ + union smc_host_cursor temp; + + temp.wrap = peer->prod.wrap; + temp.count = peer->prod.count; + smc_curs_copy(&local->prod, &temp, conn); + + temp.wrap = peer->cons.wrap; + temp.count = peer->cons.count; + smc_curs_copy(&local->cons, &temp, conn); + local->prod_flags = peer->cons.prod_flags; + local->conn_state_flags = peer->cons.conn_state_flags; +} + +static inline void 
smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + if (conn->lgr->is_smcd) + smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer, conn); + else + smcr_cdc_msg_to_host(local, peer, conn); +} + +struct smc_cdc_tx_pend { + struct smc_connection *conn; /* socket connection */ + union smc_host_cursor cursor; /* tx sndbuf cursor sent */ + union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ + u16 ctrl_seq; /* conn. tx sequence # */ +}; + +int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, + struct smc_cdc_tx_pend **pend); +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn); +int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend); +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); +int smc_cdc_init(void) __init; +void smcd_cdc_rx_init(struct smc_connection *conn); + +#endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c new file mode 100644 index 0000000000..72f4d81a3f --- /dev/null +++ b/net/smc/smc_clc.c @@ -0,0 +1,1278 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016, 2018 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/in.h> +#include <linux/inetdevice.h> +#include <linux/if_ether.h> +#include <linux/sched/signal.h> +#include <linux/utsname.h> +#include <linux/ctype.h> + +#include <net/addrconf.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_ib.h" +#include "smc_ism.h" +#include "smc_netlink.h" + +#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 +#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108 +#define SMC_CLC_RECV_BUF_LEN 100 + +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; +/* eye catcher "SMCD" EBCDIC for CLC messages */ +static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; + +static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN]; + +struct smc_clc_eid_table { + rwlock_t lock; + struct list_head list; + u8 ueid_cnt; + u8 seid_enabled; +}; + +static struct smc_clc_eid_table smc_clc_eid_table; + +struct smc_clc_eid_entry { + struct list_head list; + u8 eid[SMC_MAX_EID_LEN]; +}; + +/* The size of a user EID is 32 characters. + * Valid characters should be (single-byte character set) A-Z, 0-9, '.' and '-'. + * Blanks should only be used to pad to the expected size. + * First character must be alphanumeric. + */ +static bool smc_clc_ueid_valid(char *ueid) +{ + char *end = ueid + SMC_MAX_EID_LEN; + + while (--end >= ueid && isspace(*end)) + ; + if (end < ueid) + return false; + if (!isalnum(*ueid) || islower(*ueid)) + return false; + while (ueid <= end) { + if ((!isalnum(*ueid) || islower(*ueid)) && *ueid != '.' 
&& + *ueid != '-') + return false; + ueid++; + } + return true; +} + +static int smc_clc_ueid_add(char *ueid) +{ + struct smc_clc_eid_entry *new_ueid, *tmp_ueid; + int rc; + + if (!smc_clc_ueid_valid(ueid)) + return -EINVAL; + + /* add a new ueid entry to the ueid table if there isn't one */ + new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL); + if (!new_ueid) + return -ENOMEM; + memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN); + + write_lock(&smc_clc_eid_table.lock); + if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) { + rc = -ERANGE; + goto err_out; + } + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + rc = -EEXIST; + goto err_out; + } + } + list_add_tail(&new_ueid->list, &smc_clc_eid_table.list); + smc_clc_eid_table.ueid_cnt++; + write_unlock(&smc_clc_eid_table.lock); + return 0; + +err_out: + write_unlock(&smc_clc_eid_table.lock); + kfree(new_ueid); + return rc; +} + +int smc_clc_ueid_count(void) +{ + int count; + + read_lock(&smc_clc_eid_table.lock); + count = smc_clc_eid_table.ueid_cnt; + read_unlock(&smc_clc_eid_table.lock); + + return count; +} + +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_add(ueid); +} + +/* remove one or all ueid entries from the table */ +static int smc_clc_ueid_remove(char *ueid) +{ + struct smc_clc_eid_entry *lst_ueid, *tmp_ueid; + int rc = -ENOENT; + + /* remove table entry */ + write_lock(&smc_clc_eid_table.lock); + list_for_each_entry_safe(lst_ueid, tmp_ueid, &smc_clc_eid_table.list, + list) { + if (!ueid || !memcmp(lst_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + list_del(&lst_ueid->list); + smc_clc_eid_table.ueid_cnt--; + kfree(lst_ueid); + rc = 0; + } + } + if (!rc && !smc_clc_eid_table.ueid_cnt) { + smc_clc_eid_table.seid_enabled = 1; + rc = -EAGAIN; /* indicate success and enabling of seid */ + } + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_remove(ueid); +} + +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info) +{ + smc_clc_ueid_remove(NULL); + return 0; +} + +static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, + u32 flags, char *ueid) +{ + char ueid_str[SMC_MAX_EID_LEN + 1]; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_gen_nl_family, + flags, SMC_NETLINK_DUMP_UEID); + if (!hdr) + return -ENOMEM; + memcpy(ueid_str, ueid, SMC_MAX_EID_LEN); + ueid_str[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq, + int start_idx) +{ + struct smc_clc_eid_entry *lst_ueid; + int idx = 0; + + read_lock(&smc_clc_eid_table.lock); + list_for_each_entry(lst_ueid, &smc_clc_eid_table.list, list) { + if (idx++ < start_idx) + continue; + if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI, + lst_ueid->eid)) { + --idx; + break; + } + } + read_unlock(&smc_clc_eid_table.lock); + return idx; +} + +int smc_nl_dump_ueid(struct 
sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int idx; + + idx = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb_ctx->pos[0]); + + cb_ctx->pos[0] = idx; + return skb->len; +} + +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char seid_str[SMC_MAX_EID_LEN + 1]; + u8 seid_enabled; + void *hdr; + u8 *seid; + + if (cb_ctx->pos[0]) + return skb->len; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_SEID); + if (!hdr) + return -ENOMEM; + if (!smc_ism_is_v2_capable()) + goto end; + + smc_ism_get_system_eid(&seid); + memcpy(seid_str, seid, SMC_MAX_EID_LEN); + seid_str[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str)) + goto err; + read_lock(&smc_clc_eid_table.lock); + seid_enabled = smc_clc_eid_table.seid_enabled; + read_unlock(&smc_clc_eid_table.lock); + if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled)) + goto err; +end: + genlmsg_end(skb, hdr); + cb_ctx->pos[0]++; + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info) +{ + write_lock(&smc_clc_eid_table.lock); + smc_clc_eid_table.seid_enabled = 1; + write_unlock(&smc_clc_eid_table.lock); + return 0; +} + +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + + write_lock(&smc_clc_eid_table.lock); + if (!smc_clc_eid_table.ueid_cnt) + rc = -ENOENT; + else + smc_clc_eid_table.seid_enabled = 0; + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +static bool _smc_clc_match_ueid(u8 *peer_ueid) +{ + struct smc_clc_eid_entry *tmp_ueid; + + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, peer_ueid, SMC_MAX_EID_LEN)) + return true; + } + return false; +} + +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid) +{ + bool match = false; + int i; + + negotiated_eid[0] = 0; + read_lock(&smc_clc_eid_table.lock); + if (peer_eid && local_eid && + smc_clc_eid_table.seid_enabled && + smc_v2_ext->hdr.flag.seid && + !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) { + memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN); + match = true; + goto out; + } + + for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) { + if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) { + memcpy(negotiated_eid, smc_v2_ext->user_eids[i], + SMC_MAX_EID_LEN); + match = true; + goto out; + } + } +out: + read_unlock(&smc_clc_eid_table.lock); + return match; +} + +/* check arriving CLC proposal */ +static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_msg_hdr *hdr = &pclc->hdr; + struct smc_clc_v2_extension *v2_ext; + + v2_ext = smc_get_clc_v2_ext(pclc); + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (hdr->version == SMC_V1) { + if (hdr->typev1 == SMC_TYPE_N) + return false; + if (ntohs(hdr->length) != + sizeof(*pclc) + ntohs(pclc->iparea_offset) + + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) + + sizeof(struct smc_clc_msg_trail)) + return false; + } else { + if (ntohs(hdr->length) != + sizeof(*pclc) + + sizeof(struct smc_clc_msg_smcd) + + (hdr->typev1 != SMC_TYPE_N ? 
+ sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) : 0) + + (hdr->typev2 != SMC_TYPE_N ? + sizeof(*v2_ext) + + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) + + (smcd_indicated(hdr->typev2) ? + sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt * + sizeof(struct smc_clc_smcd_gid_chid) : + 0) + + sizeof(struct smc_clc_msg_trail)) + return false; + } + return true; +} + +/* check arriving CLC accept or confirm */ +static bool +smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) +{ + struct smc_clc_msg_hdr *hdr = &clc_v2->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if ((hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || + (hdr->typev1 == SMC_TYPE_D && + ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) + return false; + } else { + if (hdr->typev1 == SMC_TYPE_D && + ntohs(hdr->length) < SMCD_CLC_ACCEPT_CONFIRM_LEN_V2) + return false; + if (hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) + return false; + } + return true; +} + +/* check arriving CLC decline */ +static bool +smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) +{ + struct smc_clc_msg_hdr *hdr = &dclc->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) + return false; + } else { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2)) + return false; + } + return true; +} + +static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, + struct smc_init_info *ini) +{ + int ret = sizeof(*fce); + + memset(fce, 0, sizeof(*fce)); + fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX; + fce->fce_v2_base.release = ini->release_nr; + memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); + if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { + ret = sizeof(struct smc_clc_first_contact_ext); + goto out; + } + + if (ini->release_nr >= SMC_RELEASE_1) { + if (!ini->is_smcd) { + fce->max_conns = ini->max_conns; + fce->max_links = ini->max_links; + } + } + +out: + return ret; +} + +/* check if received message has a correct header length and contains valid + * heading and trailing eyecatchers + */ +static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2; + struct smc_clc_msg_proposal *pclc; + struct smc_clc_msg_decline *dclc; + struct smc_clc_msg_trail *trl; + + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) + return false; + switch (clcm->type) { + case SMC_CLC_PROPOSAL: + pclc = (struct smc_clc_msg_proposal *)clcm; + if (!smc_clc_msg_prop_valid(pclc)) + return false; + trl = (struct smc_clc_msg_trail *) + ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); + break; + case SMC_CLC_ACCEPT: + case SMC_CLC_CONFIRM: + clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm; + if (!smc_clc_msg_acc_conf_valid(clc_v2)) + return false; + trl = (struct smc_clc_msg_trail *) + ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) - + sizeof(*trl)); + break; + case SMC_CLC_DECLINE: + dclc = (struct smc_clc_msg_decline *)clcm; + if (!smc_clc_msg_decl_valid(dclc)) + return false; + check_trl = false; + break; + default: + return false; + } + if (check_trl && + memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + 
memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) + return false; + return true; +} + +/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ +static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + const struct in_ifaddr *ifa; + + if (!in_dev) + return -ENODEV; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!inet_ifa_match(ipv4, ifa)) + continue; + prop->prefix_len = inet_mask_len(ifa->ifa_mask); + prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; + /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ + return 0; + } + return -ENOENT; +} + +/* fill CLC proposal msg with ipv6 prefixes from device */ +static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_ifaddr *ifa; + int cnt = 0; + + if (!in6_dev) + return -ENODEV; + /* use a maximum of 8 IPv6 prefixes from device */ + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, + &ifa->addr, ifa->prefix_len); + ipv6_prfx[cnt].prefix_len = ifa->prefix_len; + cnt++; + if (cnt == SMC_CLC_MAX_V6_PREFIX) + break; + } + prop->ipv6_prefixes_cnt = cnt; + if (cnt) + return 0; +#endif + return -ENOENT; +} + +/* retrieve and set prefixes in CLC proposal msg */ +static int smc_clc_prfx_set(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct sockaddr_storage addrs; + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr; + int rc = -ENOENT; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + /* get address to which the internal TCP socket is bound */ + if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) + goto out_rel; + /* analyze IP specific data of net_device belonging to TCP socket */ + addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + if (addrs.ss_family == PF_INET) { + /* IPv4 */ + addr = (struct sockaddr_in *)&addrs; + rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { + /* mapped IPv4 address - peer is IPv4 only */ + rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + prop); + } else { + /* IPv6 */ + rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + } + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + +/* match ipv4 addrs of dev against addr in CLC proposal */ +static int smc_clc_prfx_match4_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; + + if (!in_dev) + return -ENODEV; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && + inet_ifa_match(prop->outgoing_subnet, ifa)) + return 0; + } + + return -ENOENT; +} + +/* match ipv6 addrs of dev against addrs in CLC proposal */ +static int smc_clc_prfx_match6_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dev); + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct inet6_ifaddr *ifa; + int 
i, max; + + if (!in6_dev) + return -ENODEV; + /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ + ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); + max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + for (i = 0; i < max; i++) { + if (ifa->prefix_len == ipv6_prfx[i].prefix_len && + ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, + ifa->prefix_len)) + return 0; + } + } +#endif + return -ENOENT; +} + +/* check if proposed prefixes match one of our device prefixes */ +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) + rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + else + rc = smc_clc_prfx_match6_rcu(dst->dev, prop); + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + +/* Wait for data on the tcp-socket, analyze received data + * Returns: + * 0 if success and it was not a decline that we received. + * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. + * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. + */ +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type, unsigned long timeout) +{ + long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; + struct sock *clc_sk = smc->clcsock->sk; + struct smc_clc_msg_hdr *clcm = buf; + struct msghdr msg = {NULL, 0}; + int reason_code = 0; + struct kvec vec = {buf, buflen}; + int len, datlen, recvlen; + bool check_trl = true; + int krflags; + + /* peek the first few bytes to determine length of data to receive + * so we don't consume any subsequent CLC message or payload data + * in the TCP byte stream + */ + /* + * Caller must make sure that buflen is no less than + * sizeof(struct smc_clc_msg_hdr) + */ + krflags = MSG_PEEK | MSG_WAITALL; + clc_sk->sk_rcvtimeo = timeout; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, + sizeof(struct smc_clc_msg_hdr)); + len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (signal_pending(current)) { + reason_code = -EINTR; + clc_sk->sk_err = EINTR; + smc->sk.sk_err = EINTR; + goto out; + } + if (clc_sk->sk_err) { + reason_code = -clc_sk->sk_err; + if (clc_sk->sk_err == EAGAIN && + expected_type == SMC_CLC_DECLINE) + clc_sk->sk_err = 0; /* reset for fallback usage */ + else + smc->sk.sk_err = clc_sk->sk_err; + goto out; + } + if (!len) { /* peer has performed orderly shutdown */ + smc->sk.sk_err = ECONNRESET; + reason_code = -ECONNRESET; + goto out; + } + if (len < 0) { + if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE) + smc->sk.sk_err = -len; + reason_code = len; + goto out; + } + datlen = ntohs(clcm->length); + if ((len < sizeof(struct smc_clc_msg_hdr)) || + (clcm->version < SMC_V1) || + ((clcm->type != SMC_CLC_DECLINE) && + (clcm->type != expected_type))) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + + /* receive the complete CLC message */ + memset(&msg, 0, sizeof(struct msghdr)); + if (datlen > buflen) { + check_trl = false; + recvlen = buflen; + } else { + recvlen = datlen; + } + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); + krflags = MSG_WAITALL; + len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, 
check_trl)) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + datlen -= len; + while (datlen) { + u8 tmp[SMC_CLC_RECV_BUF_LEN]; + + vec.iov_base = &tmp; + vec.iov_len = SMC_CLC_RECV_BUF_LEN; + /* receive remaining proposal message */ + recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? + SMC_CLC_RECV_BUF_LEN : datlen; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); + len = sock_recvmsg(smc->clcsock, &msg, krflags); + datlen -= len; + } + if (clcm->type == SMC_CLC_DECLINE) { + struct smc_clc_msg_decline *dclc; + + dclc = (struct smc_clc_msg_decline *)clcm; + reason_code = SMC_CLC_DECL_PEERDECL; + smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); + if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 & + SMC_FIRST_CONTACT_MASK) { + smc->conn.lgr->sync_err = 1; + smc_lgr_terminate_sched(smc->conn.lgr); + } + } + +out: + clc_sk->sk_rcvtimeo = rcvtimeo; + return reason_code; +} + +/* send CLC DECLINE message across internal TCP socket */ +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) +{ + struct smc_clc_msg_decline *dclc_v1; + struct smc_clc_msg_decline_v2 dclc; + struct msghdr msg; + int len, send_len; + struct kvec vec; + + dclc_v1 = (struct smc_clc_msg_decline *)&dclc; + memset(&dclc, 0, sizeof(dclc)); + memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + dclc.hdr.type = SMC_CLC_DECLINE; + dclc.hdr.version = version; + dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; + dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? + SMC_FIRST_CONTACT_MASK : 0; + if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && + smc_ib_is_valid_local_systemid()) + memcpy(dclc.id_for_peer, local_systemid, + sizeof(local_systemid)); + dclc.peer_diagnosis = htonl(peer_diag_info); + if (version == SMC_V1) { + memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(*dclc_v1); + } else { + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(dclc); + } + dclc.hdr.length = htons(send_len); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &dclc; + vec.iov_len = send_len; + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + if (len < 0 || len < send_len) + len = -EPROTO; + return len > 0 ? 
0 : len; +} + +/* send CLC PROPOSAL message across internal TCP socket */ +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_proposal *pclc_base; + struct smc_clc_smcd_gid_chid *gidchids; + struct smc_clc_msg_proposal_area *pclc; + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct smc_clc_v2_extension *v2_ext; + struct smc_clc_msg_smcd *pclc_smcd; + struct smc_clc_msg_trail *trl; + struct smcd_dev *smcd; + int len, i, plen, rc; + int reason_code = 0; + struct kvec vec[8]; + struct msghdr msg; + + pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); + if (!pclc) + return -ENOMEM; + + pclc_base = &pclc->pclc_base; + pclc_smcd = &pclc->pclc_smcd; + pclc_prfx = &pclc->pclc_prfx; + ipv6_prfx = pclc->pclc_prfx_ipv6; + v2_ext = &pclc->pclc_v2_ext; + smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + gidchids = pclc->pclc_gidchids; + trl = &pclc->pclc_trl; + + pclc_base->hdr.version = SMC_V2; + pclc_base->hdr.typev1 = ini->smc_type_v1; + pclc_base->hdr.typev2 = ini->smc_type_v2; + plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl); + + /* retrieve ip prefixes for CLC proposal msg */ + if (ini->smc_type_v1 != SMC_TYPE_N) { + rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx); + if (rc) { + if (ini->smc_type_v2 == SMC_TYPE_N) { + kfree(pclc); + return SMC_CLC_DECL_CNFERR; + } + pclc_base->hdr.typev1 = SMC_TYPE_N; + } else { + pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); + plen += sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + } + + /* build SMC Proposal CLC message */ + memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + pclc_base->hdr.type = SMC_CLC_PROPOSAL; + if (smcr_indicated(ini->smc_type_v1)) { + /* add SMC-R specifics */ + memcpy(pclc_base->lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + ETH_ALEN); + } + if (smcd_indicated(ini->smc_type_v1)) { + /* add SMC-D specifics */ + if (ini->ism_dev[0]) { + smcd = ini->ism_dev[0]; + pclc_smcd->ism.gid = + htonll(smcd->ops->get_local_gid(smcd)); + pclc_smcd->ism.chid = + htons(smc_ism_get_chid(ini->ism_dev[0])); + } + } + if (ini->smc_type_v2 == SMC_TYPE_N) { + pclc_smcd->v2_ext_offset = 0; + } else { + struct smc_clc_eid_entry *ueident; + u16 v2_ext_offset; + + v2_ext->hdr.flag.release = SMC_RELEASE; + v2_ext_offset = sizeof(*pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + if (ini->smc_type_v1 != SMC_TYPE_N) + v2_ext_offset += sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + pclc_smcd->v2_ext_offset = htons(v2_ext_offset); + plen += sizeof(*v2_ext); + + read_lock(&smc_clc_eid_table.lock); + v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; + plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; + i = 0; + list_for_each_entry(ueident, &smc_clc_eid_table.list, list) { + memcpy(v2_ext->user_eids[i++], ueident->eid, + sizeof(ueident->eid)); + } + read_unlock(&smc_clc_eid_table.lock); + } + if (smcd_indicated(ini->smc_type_v2)) { + u8 *eid = NULL; + + v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; + v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; + v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - + offsetofend(struct smc_clnt_opts_area_hdr, + smcd_v2_ext_offset) + + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); + smc_ism_get_system_eid(&eid); + if 
(eid && v2_ext->hdr.flag.seid) + memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); + plen += sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + for (i = 1; i <= ini->ism_offered_cnt; i++) { + smcd = ini->ism_dev[i]; + gidchids[i - 1].gid = + htonll(smcd->ops->get_local_gid(smcd)); + gidchids[i - 1].chid = + htons(smc_ism_get_chid(ini->ism_dev[i])); + } + plen += ini->ism_offered_cnt * + sizeof(struct smc_clc_smcd_gid_chid); + } + } + if (smcr_indicated(ini->smc_type_v2)) { + memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER; + v2_ext->max_links = SMC_LINKS_PER_LGR_MAX_PREFER; + } + + pclc_base->hdr.length = htons(plen); + memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + /* send SMC Proposal CLC message */ + memset(&msg, 0, sizeof(msg)); + i = 0; + vec[i].iov_base = pclc_base; + vec[i++].iov_len = sizeof(*pclc_base); + vec[i].iov_base = pclc_smcd; + vec[i++].iov_len = sizeof(*pclc_smcd); + if (ini->smc_type_v1 != SMC_TYPE_N) { + vec[i].iov_base = pclc_prfx; + vec[i++].iov_len = sizeof(*pclc_prfx); + if (pclc_prfx->ipv6_prefixes_cnt > 0) { + vec[i].iov_base = ipv6_prfx; + vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + } + if (ini->smc_type_v2 != SMC_TYPE_N) { + vec[i].iov_base = v2_ext; + vec[i++].iov_len = sizeof(*v2_ext) + + (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); + if (smcd_indicated(ini->smc_type_v2)) { + vec[i].iov_base = smcd_v2_ext; + vec[i++].iov_len = sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + vec[i].iov_base = gidchids; + vec[i++].iov_len = ini->ism_offered_cnt * + sizeof(struct smc_clc_smcd_gid_chid); + } + } + } + vec[i].iov_base = trl; + vec[i++].iov_len = sizeof(*trl); + /* due to the few bytes needed for clc-handshake this cannot block */ + len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); + if (len < 0) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } else if (len < ntohs(pclc_base->hdr.length)) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } + + kfree(pclc); + return reason_code; +} + +/* build and send CLC CONFIRM / ACCEPT message */ +static int smc_clc_send_confirm_accept(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm_v2 *clc_v2, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini) +{ + struct smc_connection *conn = &smc->conn; + struct smc_clc_first_contact_ext_v2x fce; + struct smcd_dev *smcd = conn->lgr->smcd; + struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_fce_gid_ext gle; + struct smc_clc_msg_trail trl; + int i, len, fce_len; + struct kvec vec[5]; + struct msghdr msg; + + /* send SMC Confirm CLC msg */ + clc = (struct smc_clc_msg_accept_confirm *)clc_v2; + clc->hdr.version = version; /* SMC version */ + if (first_contact) + clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; + if (conn->lgr->is_smcd) { + /* SMC-D specific settings */ + memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + clc->hdr.typev1 = SMC_TYPE_D; + clc->d0.gid = htonll(smcd->ops->get_local_gid(smcd)); + clc->d0.token = htonll(conn->rmb_desc->token); + clc->d0.dmbe_size = conn->rmbe_size_comp; + clc->d0.dmbe_idx = 0; + memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + } else { + clc_v2->d1.chid = htons(smc_ism_get_chid(smcd)); + if (eid && eid[0]) + memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); + len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + fce_len = 
smc_clc_fill_fce(&fce, ini); + len += fce_len; + } + clc_v2->hdr.length = htons(len); + } + memcpy(trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + struct smc_link *link = conn->lnk; + + /* SMC-R specific settings */ + memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + clc->hdr.typev1 = SMC_TYPE_R; + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(clc->r0.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(clc->r0.qpn, link->roce_qp->qp_num); + clc->r0.rmb_rkey = + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); + clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); + switch (clc->hdr.type) { + case SMC_CLC_ACCEPT: + clc->r0.qp_mtu = link->path_mtu; + break; + case SMC_CLC_CONFIRM: + clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + break; + } + clc->r0.rmbe_size = conn->rmbe_size_comp; + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); + hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + fce_len = smc_clc_fill_fce(&fce, ini); + len += fce_len; + fce.fce_v2_base.v2_direct = !link->lgr->uses_gateway; + if (clc->hdr.type == SMC_CLC_CONFIRM) { + memset(&gle, 0, sizeof(gle)); + gle.gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(gle); + len += gle.gid_cnt * sizeof(gle.gid[0]); + } + } + clc_v2->hdr.length = htons(len); + } + memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + } + + memset(&msg, 0, sizeof(msg)); + i = 0; + vec[i].iov_base = clc_v2; + if (version > SMC_V1) + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? + SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : + SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) - + sizeof(trl); + else + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? 
+ SMCD_CLC_ACCEPT_CONFIRM_LEN : + SMCR_CLC_ACCEPT_CONFIRM_LEN) - + sizeof(trl); + if (version > SMC_V1 && first_contact) { + vec[i].iov_base = &fce; + vec[i++].iov_len = fce_len; + if (!conn->lgr->is_smcd) { + if (clc->hdr.type == SMC_CLC_CONFIRM) { + vec[i].iov_base = &gle; + vec[i++].iov_len = sizeof(gle); + vec[i].iov_base = &ini->smcrv2.gidlist.list; + vec[i++].iov_len = gle.gid_cnt * + sizeof(gle.gid[0]); + } + } + } + vec[i].iov_base = &trl; + vec[i++].iov_len = sizeof(trl); + return kernel_sendmsg(smc->clcsock, &msg, vec, 1, + ntohs(clc->hdr.length)); +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, + u8 version, u8 *eid, struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 cclc_v2; + int reason_code = 0; + int len; + + /* send SMC Confirm CLC msg */ + memset(&cclc_v2, 0, sizeof(cclc_v2)); + cclc_v2.hdr.type = SMC_CLC_CONFIRM; + len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, + version, eid, ini); + if (len < ntohs(cclc_v2.hdr.length)) { + if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + return reason_code; +} + +/* send CLC ACCEPT message across internal TCP socket */ +int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, + u8 version, u8 *negotiated_eid, struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 aclc_v2; + int len; + + memset(&aclc_v2, 0, sizeof(aclc_v2)); + aclc_v2.hdr.type = SMC_CLC_ACCEPT; + len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, + version, negotiated_eid, ini); + if (len < ntohs(aclc_v2.hdr.length)) + len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; + + return len > 0 ? 
0 : len; +} + +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *pclc_v2_ext; + + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; + + if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || + ini->release_nr < SMC_RELEASE_1) + return 0; + + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) + return SMC_CLC_DECL_NOV2EXT; + + if (ini->smcr_version & SMC_V2) { + ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER); + if (ini->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + + ini->max_links = min_t(u8, pclc_v2_ext->max_links, SMC_LINKS_PER_LGR_MAX_PREFER); + if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + } + + return 0; +} + +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini) +{ + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (ini->release_nr < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + ini->max_conns = fce_v2x->max_conns; + + if (fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX || + fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + ini->max_links = fce_v2x->max_links; + } + + return 0; +} + +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)cclc; + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd); + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (cclc->hdr.version == SMC_V1 || + !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return 0; + + if (ini->release_nr != fce->release) + return SMC_CLC_DECL_RELEASEERR; + + if (fce->release < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns != ini->max_conns) + return SMC_CLC_DECL_MAXCONNERR; + if (fce_v2x->max_links != ini->max_links) + return SMC_CLC_DECL_MAXLINKERR; + } + + return 0; +} + +void smc_clc_get_hostname(u8 **host) +{ + *host = &smc_hostname[0]; +} + +void __init smc_clc_init(void) +{ + struct new_utsname *u; + + memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */ + u = utsname(); + memcpy(smc_hostname, u->nodename, + min_t(size_t, strlen(u->nodename), sizeof(smc_hostname))); + + INIT_LIST_HEAD(&smc_clc_eid_table.list); + rwlock_init(&smc_clc_eid_table.lock); + smc_clc_eid_table.ueid_cnt = 0; + smc_clc_eid_table.seid_enabled = 1; +} + +void smc_clc_exit(void) +{ + smc_clc_ueid_remove(NULL); +} diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h new file mode 100644 index 0000000000..08155a96a0 --- /dev/null +++ b/net/smc/smc_clc.h @@ -0,0 +1,446 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CLC_H +#define _SMC_CLC_H + +#include <rdma/ib_verbs.h> +#include <linux/smc.h> + +#include "smc.h" +#include "smc_netlink.h" + +#define SMC_CLC_PROPOSAL 0x01 +#define SMC_CLC_ACCEPT 0x02 +#define SMC_CLC_CONFIRM 0x03 +#define SMC_CLC_DECLINE 0x04 + +#define SMC_TYPE_R 0 /* SMC-R only */ +#define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */ +#define SMC_TYPE_B 3 /* SMC-R and SMC-D */ +#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ +#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ +#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ +#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */ +#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */ +#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ +#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ +#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ +#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ +#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */ +#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */ +#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ +#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ +#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ +#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ +#define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ +#define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ +#define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */ +#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ +#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ +#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ +#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ +#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ +#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ +#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ +#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ +#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */ +#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. 
indirect mismatch*/ +#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ +#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ +#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ +#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ + +#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ + +struct smc_clc_msg_hdr { /* header1 of clc messages */ + u8 eyecatcher[4]; /* eye catcher */ + u8 type; /* proposal / accept / confirm / decline */ + __be16 length; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 version : 4, + typev2 : 2, + typev1 : 2; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 typev1 : 2, + typev2 : 2, + version : 4; +#endif +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_trail { /* trailer of clc messages */ + u8 eyecatcher[4]; +}; + +struct smc_clc_msg_local { /* header2 of clc messages */ + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */ + u8 gid[16]; /* gid of ib_device port */ + u8 mac[6]; /* mac of ib_device port */ +}; + +/* Struct would be 4 byte aligned, but it is used in an array that is sent + * to peers and must conform to RFC7609, hence we need to use packed here. + */ +struct smc_clc_ipv6_prefix { + struct in6_addr prefix; + u8 prefix_len; +} __packed; /* format defined in RFC7609 */ + +#if defined(__BIG_ENDIAN_BITFIELD) +struct smc_clc_v2_flag { + u8 release : 4, + rsvd : 3, + seid : 1; +}; +#elif defined(__LITTLE_ENDIAN_BITFIELD) +struct smc_clc_v2_flag { + u8 seid : 1, + rsvd : 3, + release : 4; +}; +#endif + +struct smc_clnt_opts_area_hdr { + u8 eid_cnt; /* number of user defined EIDs */ + u8 ism_gid_cnt; /* number of ISMv2 GIDs */ + u8 reserved1; + struct smc_clc_v2_flag flag; + u8 reserved2[2]; + __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */ +}; + +struct smc_clc_smcd_gid_chid { + __be64 gid; /* ISM GID */ + __be16 chid; /* ISMv2 CHID */ +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_clc_v2_extension { + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + u8 reserved[14]; + u8 user_eids[][SMC_MAX_EID_LEN]; +}; + +struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ + __be32 outgoing_subnet; /* subnet mask */ + u8 prefix_len; /* number of significant bits in mask */ + u8 reserved[2]; + u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ +} __aligned(4); + +struct smc_clc_msg_smcd { /* SMC-D GID information */ + struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ + __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ + u8 vendor_oui[3]; /* vendor organizationally unique identifier */ + u8 vendor_exp_options[5]; + u8 reserved[20]; +}; + +struct smc_clc_smcd_v2_extension { + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + struct smc_clc_smcd_gid_chid gidchid[]; +}; + +struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ +} __aligned(4); + +#define SMC_CLC_MAX_V6_PREFIX 8 +#define SMC_CLC_MAX_UEID 8 + +struct smc_clc_msg_proposal_area { + struct smc_clc_msg_proposal pclc_base; + struct smc_clc_msg_smcd pclc_smcd; + struct 
smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; + struct smc_clc_v2_extension pclc_v2_ext; + u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; + struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; + struct smc_clc_msg_trail pclc_trl; +}; + +struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token; /* unique connection id */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + rmbe_size : 4; +#endif + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* packet sequence number */ +} __packed; + +struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ + __be64 gid; /* Sender GID */ + __be64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved3 : 4, + dmbe_size : 4; +#endif + u16 reserved4; + __be32 linkid; /* Link identifier */ +} __packed; + +#define SMC_CLC_OS_ZOS 1 +#define SMC_CLC_OS_LINUX 2 +#define SMC_CLC_OS_AIX 3 + +struct smc_clc_first_contact_ext { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; + u8 os_type : 4, + release : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; + u8 release : 4, + os_type : 4; +#endif + u8 reserved2[2]; + u8 hostname[SMC_MAX_HOSTNAME_LEN]; +}; + +struct smc_clc_first_contact_ext_v2x { + struct smc_clc_first_contact_ext fce_v2_base; + u8 max_conns; /* for SMC-R only */ + u8 max_links; /* for SMC-R only */ + u8 reserved3[2]; + __be32 vendor_exp_options; + u8 reserved4[8]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 (Third Edition) + * (https://www.ibm.com/support/pages/node/7009315) + */ + +struct smc_clc_fce_gid_ext { + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + union { + struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ + struct { /* SMC-D */ + struct smcd_clc_msg_accept_confirm_common d0; + u32 reserved5[3]; + }; + }; +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + union { + struct { /* SMC-R */ + struct smcr_clc_msg_accept_confirm r0; + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } r1; + struct { /* SMC-D */ + struct smcd_clc_msg_accept_confirm_common d0; + __be16 chid; + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved5[8]; + } d1; + }; +}; + +struct smc_clc_msg_decline { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ +} __aligned(4); + +#define SMC_DECL_DIAG_COUNT_V2 4 /* no. 
of additional peer diagnosis codes */ + +struct smc_clc_msg_decline_v2 { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ +} __aligned(4); + +/* determine start of the prefix area within the proposal message */ +static inline struct smc_clc_msg_proposal_prefix * +smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) +{ + return (struct smc_clc_msg_proposal_prefix *) + ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); +} + +static inline bool smcr_indicated(int smc_type) +{ + return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B; +} + +static inline bool smcd_indicated(int smc_type) +{ + return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B; +} + +static inline u8 smc_indicated_type(int is_smcd, int is_smcr) +{ + if (is_smcd && is_smcr) + return SMC_TYPE_B; + if (is_smcd) + return SMC_TYPE_D; + if (is_smcr) + return SMC_TYPE_R; + return SMC_TYPE_N; +} + +/* get SMC-D info from proposal message */ +static inline struct smc_clc_msg_smcd * +smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) +{ + if (smcd_indicated(prop->hdr.typev1) && + ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) + return NULL; + + return (struct smc_clc_msg_smcd *)(prop + 1); +} + +static inline struct smc_clc_v2_extension * +smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) +{ + struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + return NULL; + + return (struct smc_clc_v2_extension *) + ((u8 *)prop_smcd + + offsetof(struct smc_clc_msg_smcd, v2_ext_offset) + + sizeof(prop_smcd->v2_ext_offset) + + ntohs(prop_smcd->v2_ext_offset)); +} + +static inline struct smc_clc_smcd_v2_extension * +smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) +{ + if (!prop_v2ext) + return NULL; + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + return NULL; + + return (struct smc_clc_smcd_v2_extension *) + ((u8 *)prop_v2ext + + offsetof(struct smc_clc_v2_extension, hdr) + + offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + + sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) + + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); +} + +static inline struct smc_clc_first_contact_ext * +smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm_v2 *clc_v2, + bool is_smcd) +{ + int clc_v2_len; + + if (clc_v2->hdr.version == SMC_V1 || + !(clc_v2->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return NULL; + + if (is_smcd) + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, d1); + else + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, r1); + + return (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + + clc_v2_len); +} + +struct smcd_dev; +struct smc_init_info; + +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop); +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type, unsigned long timeout); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version); +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); +int smc_clc_send_confirm(struct smc_sock *smc, bool 
clnt_first_contact, + u8 version, u8 *eid, struct smc_init_info *ini); +int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, + u8 version, u8 *negotiated_eid, struct smc_init_info *ini); +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini); +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini); +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini); +void smc_clc_init(void) __init; +void smc_clc_exit(void); +void smc_clc_get_hostname(u8 **host); +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid); +int smc_clc_ueid_count(void); +int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info); + +#endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c new file mode 100644 index 0000000000..10219f55aa --- /dev/null +++ b/net/smc/smc_close.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing - normal and abnormal + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/workqueue.h> +#include <linux/sched/signal.h> + +#include <net/sock.h> +#include <net/tcp.h> + +#include "smc.h" +#include "smc_tx.h" +#include "smc_cdc.h" +#include "smc_close.h" + +/* release the clcsock that is assigned to the smc_sock */ +void smc_clcsock_release(struct smc_sock *smc) +{ + struct socket *tcp; + + if (smc->listen_smc && current_work() != &smc->smc_listen_work) + cancel_work_sync(&smc->smc_listen_work); + mutex_lock(&smc->clcsock_release_lock); + if (smc->clcsock) { + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + mutex_unlock(&smc->clcsock_release_lock); +} + +static void smc_close_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + /* Close non-accepted connections */ + while ((sk = smc_accept_dequeue(parent, NULL))) + smc_close_non_accepted(sk); +} + +/* wait for sndbuf data being transmitted */ +static void smc_close_stream_wait(struct smc_sock *smc, long timeout) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + + if (!timeout) + return; + + if (!smc_tx_prepared_sends(&smc->conn)) + return; + + /* Send out corked data remaining in sndbuf */ + smc_tx_pending(&smc->conn); + + smc->wait_close_tx_prepared = 1; + add_wait_queue(sk_sleep(sk), &wait); + while (!signal_pending(current) && timeout) { + int rc; + + rc = sk_wait_event(sk, &timeout, + !smc_tx_prepared_sends(&smc->conn) || + READ_ONCE(sk->sk_err) == ECONNABORTED || + READ_ONCE(sk->sk_err) == ECONNRESET || + smc->conn.killed, + &wait); + if (rc) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + smc->wait_close_tx_prepared = 0; +} + +void smc_close_wake_tx_prepared(struct smc_sock *smc) +{ + if (smc->wait_close_tx_prepared) + /* wake up socket closing */ + smc->sk.sk_state_change(&smc->sk); +} + +static int smc_close_wr(struct smc_connection *conn) +{ + 
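+	/* half-close: flag this side as done writing; the flag reaches the peer with the CDC message sent below */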
conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static int smc_close_final(struct smc_connection *conn) +{ + if (atomic_read(&conn->bytes_to_rcv)) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +int smc_close_abort(struct smc_connection *conn) +{ + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static void smc_close_cancel_work(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + + release_sock(sk); + if (cancel_work_sync(&smc->conn.close_work)) + sock_put(sk); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); +} + +/* terminate smc socket abnormally - active abort + * link group is terminated, i.e. RDMA communication no longer possible + */ +void smc_close_active_abort(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + bool release_clcsock = false; + + if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { + sk->sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + } + switch (sk->sk_state) { + case SMC_ACTIVE: + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* (postponed) passive closing */ + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; + sock_put(sk); /* passive closing */ + break; + case SMC_PROCESSABORT: + case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; + break; + case SMC_INIT: + case SMC_PEERABORTWAIT: + case SMC_CLOSED: + break; + } + + smc_sock_set_flag(sk, SOCK_DEAD); + sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } +} + +static inline bool smc_close_sent_any_close(struct smc_connection *conn) +{ + return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort || + conn->local_tx_ctrl.conn_state_flags.peer_conn_closed; +} + +int smc_close_active(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int old_state; + long timeout; + int rc = 0; + int rc1 = 0; + + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? 
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; + + old_state = sk->sk_state; +again: + switch (sk->sk_state) { + case SMC_INIT: + sk->sk_state = SMC_CLOSED; + break; + case SMC_LISTEN: + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); /* wake up accept */ + if (smc->clcsock && smc->clcsock->sk) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + } + smc_close_cleanup_listen(sk); + release_sock(sk); + flush_work(&smc->tcp_listen_work); + lock_sock(sk); + break; + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state == SMC_ACTIVE) { + /* send close request */ + rc = smc_close_final(conn); + sk->sk_state = SMC_PEERCLOSEWAIT1; + + /* actively shutdown clcsock before peer close it, + * prevent peer from entering TIME_WAIT state. + */ + if (smc->clcsock && smc->clcsock->sk) { + rc1 = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + rc = rc ? rc : rc1; + } + } else { + /* peer event has changed the state */ + goto again; + } + break; + case SMC_APPFINCLOSEWAIT: + /* socket already shutdown wr or both (active close) */ + if (txflags->peer_done_writing && + !smc_close_sent_any_close(conn)) { + /* just shutdown wr done, send close request */ + rc = smc_close_final(conn); + } + sk->sk_state = SMC_CLOSED; + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state != SMC_APPCLOSEWAIT1 && + sk->sk_state != SMC_APPCLOSEWAIT2) + goto again; + /* confirm close from peer */ + rc = smc_close_final(conn); + if (smc_cdc_rxed_any_close(conn)) { + /* peer has closed the socket already */ + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ + } else { + /* peer has just issued a shutdown write */ + sk->sk_state = SMC_PEERFINCLOSEWAIT; + } + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (txflags->peer_done_writing && + !smc_close_sent_any_close(conn)) { + /* just shutdown wr done, send close request */ + rc = smc_close_final(conn); + } + /* peer sending PeerConnectionClosed will cause transition */ + break; + case SMC_PEERFINCLOSEWAIT: + /* peer sending PeerConnectionClosed will cause transition */ + break; + case SMC_PROCESSABORT: + rc = smc_close_abort(conn); + sk->sk_state = SMC_CLOSED; + break; + case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(sk); + return rc; +} + +static void smc_close_passive_abort_received(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + + switch (sk->sk_state) { + case SMC_INIT: + case SMC_ACTIVE: + case SMC_APPCLOSEWAIT1: + sk->sk_state = SMC_PROCESSABORT; + sock_put(sk); /* passive closing */ + break; + case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PROCESSABORT; + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (txflags->peer_done_writing && + !smc_close_sent_any_close(&smc->conn)) + /* just shutdown, but not yet closed locally */ + sk->sk_state = 
SMC_PROCESSABORT; + else + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + break; + case SMC_APPCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + break; + case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; + case SMC_PROCESSABORT: + /* nothing to do, add tracing in future patch */ + break; + } +} + +/* Either some kind of closing has been received: peer_conn_closed, + * peer_conn_abort, or peer_done_writing + * or the link group of the connection terminates abnormally. + */ +static void smc_close_passive_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + close_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smc_cdc_conn_state_flags *rxflags; + bool release_clcsock = false; + struct sock *sk = &smc->sk; + int old_state; + + lock_sock(sk); + old_state = sk->sk_state; + + rxflags = &conn->local_rx_ctrl.conn_state_flags; + if (rxflags->peer_conn_abort) { + /* peer has not received all data */ + smc_close_passive_abort_received(smc); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); + goto wakeup; + } + + switch (sk->sk_state) { + case SMC_INIT: + sk->sk_state = SMC_APPCLOSEWAIT1; + break; + case SMC_ACTIVE: + sk->sk_state = SMC_APPCLOSEWAIT1; + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ + break; + case SMC_PEERCLOSEWAIT1: + if (rxflags->peer_done_writing) + sk->sk_state = SMC_PEERCLOSEWAIT2; + fallthrough; + /* to check for closing */ + case SMC_PEERCLOSEWAIT2: + if (!smc_cdc_rxed_any_close(conn)) + break; + if (sock_flag(sk, SOCK_DEAD) && + smc_close_sent_any_close(conn)) { + /* smc_release has already been called locally */ + sk->sk_state = SMC_CLOSED; + } else { + /* just shutdown, but not yet closed locally */ + sk->sk_state = SMC_APPFINCLOSEWAIT; + } + sock_put(sk); /* passive closing */ + break; + case SMC_PEERFINCLOSEWAIT: + if (smc_cdc_rxed_any_close(conn)) { + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + } + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ + break; + case SMC_APPFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + case SMC_PROCESSABORT: + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + +wakeup: + sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ + sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ + + if (old_state != sk->sk_state) { + sk->sk_state_change(sk); + if ((sk->sk_state == SMC_CLOSED) && + (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { + smc_conn_free(conn); + if (smc->clcsock) + release_clcsock = true; + } + } + release_sock(sk); + if (release_clcsock) + smc_clcsock_release(smc); + sock_put(sk); /* sock_hold done by schedulers of close_work */ +} + +int smc_close_shutdown_write(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int old_state; + long timeout; + int rc = 0; + + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? 
+ sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; + + old_state = sk->sk_state; +again: + switch (sk->sk_state) { + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto again; + /* send close wr request */ + rc = smc_close_wr(conn); + sk->sk_state = SMC_PEERCLOSEWAIT1; + break; + case SMC_APPCLOSEWAIT1: + /* passive close */ + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state != SMC_APPCLOSEWAIT1) + goto again; + /* confirm close from peer */ + rc = smc_close_wr(conn); + sk->sk_state = SMC_APPCLOSEWAIT2; + break; + case SMC_APPCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_APPFINCLOSEWAIT: + case SMC_PROCESSABORT: + case SMC_PEERABORTWAIT: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(sk); + return rc; +} + +/* Initialize close properties on connection establishment. */ +void smc_close_init(struct smc_sock *smc) +{ + INIT_WORK(&smc->conn.close_work, smc_close_passive_work); +} diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h new file mode 100644 index 0000000000..634fea2b7c --- /dev/null +++ b/net/smc/smc_close.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CLOSE_H +#define SMC_CLOSE_H + +#include <linux/workqueue.h> + +#include "smc.h" + +#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ) +#define SMC_CLOSE_SOCK_PUT_DELAY HZ + +void smc_close_wake_tx_prepared(struct smc_sock *smc); +int smc_close_active(struct smc_sock *smc); +int smc_close_shutdown_write(struct smc_sock *smc); +void smc_close_init(struct smc_sock *smc); +void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); + +#endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c new file mode 100644 index 0000000000..d520ee62c8 --- /dev/null +++ b/net/smc/smc_core.c @@ -0,0 +1,2633 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Basic Transport Functions exploiting Infiniband API + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/socket.h> +#include <linux/if_vlan.h> +#include <linux/random.h> +#include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/reboot.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/smc.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> + +#include "smc.h" +#include "smc_clc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_wr.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_close.h" +#include "smc_ism.h" +#include "smc_netlink.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" + +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) + +struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = LIST_HEAD_INIT(smc_lgr_list.list), + .num = 0, +}; + +static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ +static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); + +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc); +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); + +static void smc_link_down_work(struct work_struct *work); + +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + +static void smc_ibdev_cnt_inc(struct smc_link *lnk) +{ + atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + +static void smc_ibdev_cnt_dec(struct smc_link *lnk) +{ + atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + +static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) +{ + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + if (!lgr->freeing) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } +} + +/* Register connection's alert token in our lookup structure. + * To use rbtrees we have to implement our own insert core. + * Requires @conns_lock + * @smc connection to register + * Returns 0 on success, != otherwise. + */ +static void smc_lgr_add_alert_token(struct smc_connection *conn) +{ + struct rb_node **link, *parent = NULL; + u32 token = conn->alert_token_local; + + link = &conn->lgr->conns_all.rb_node; + while (*link) { + struct smc_connection *cur = rb_entry(*link, + struct smc_connection, alert_node); + + parent = *link; + if (cur->alert_token_local > token) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + /* Put the new node there */ + rb_link_node(&conn->alert_node, parent, link); + rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); +} + +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? 
SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + conn->lnk = NULL; /* reset conn->lnk first */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected || lnk->link_is_asym) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected && + !lnk2->link_is_asym) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + atomic_inc(&conn->lnk->conn_cnt); + return 0; +} + +/* Register connection in link group by assigning an alert token + * registered in a search tree. + * Requires @conns_lock + * Note that '0' is a reserved value and not assigned. + */ +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) { + conn->lgr = NULL; + return rc; + } + } + /* find a new alert_token_local value not yet used by some connection + * in this link group + */ + sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ + while (!conn->alert_token_local) { + conn->alert_token_local = atomic_inc_return(&nexttoken); + if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) + conn->alert_token_local = 0; + } + smc_lgr_add_alert_token(conn); + conn->lgr->conns_num++; + return 0; +} + +/* Unregister connection and reset the alert token of the given connection< + */ +static void __smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smc_link_group *lgr = conn->lgr; + + rb_erase(&conn->alert_node, &lgr->conns_all); + if (conn->lnk) + atomic_dec(&conn->lnk->conn_cnt); + lgr->conns_num--; + conn->alert_token_local = 0; + sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ +} + +/* Unregister connection from lgr + */ +static void smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!smc_conn_lgr_valid(conn)) + return; + write_lock_bh(&lgr->conns_lock); + if (conn->alert_token_local) { + __smc_lgr_unregister_conn(conn); + } + write_unlock_bh(&lgr->conns_lock); +} + +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char hostname[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_seid[SMC_MAX_EID_LEN + 1]; + struct nlattr *attrs; + u8 *seid = NULL; + u8 *host = NULL; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_SYS_INFO); + if (!nlh) + goto errmsg; + if (cb_ctx->pos[0]) + goto errout; + attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) + goto errattr; + smc_clc_get_hostname(&host); + if (host) { + memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); + 
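+		/* the hostname is a fixed-size field and not necessarily NUL-terminated; terminate the local copy before nla_put_string() */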
hostname[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) + goto errattr; + } + if (smc_ism_is_v2_capable()) { + smc_ism_get_system_eid(&seid); + memcpy(smc_seid, seid, SMC_MAX_EID_LEN); + smc_seid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ +static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + struct nlattr *v2_attrs) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); + return -EMSGSIZE; +} + +static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + return -EMSGSIZE; +} + +static int smc_nl_fill_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_target[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *attrs, *v2_attrs; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, + lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) + goto errattr; + memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); + smc_target[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) + goto errattr; + if (lgr->smc_version > SMC_V1) { + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) + goto errattr; + } + + nla_nest_end(skb, attrs); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + return 
-EMSGSIZE; +} + +static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, + struct smc_link *link, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + u8 smc_gid_target[41]; + struct nlattr *attrs; + u32 link_uid = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LINK_SMCR); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, + atomic_read(&link->conn_cnt))) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) + goto errattr; + snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); + if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) + goto errattr; + memcpy(&link_uid, link->link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) + goto errattr; + memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->gid); + if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->peer_gid); + if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCR); + if (!nlh) + goto errmsg; + if (smc_nl_fill_lgr(lgr, skb, cb)) + goto errout; + + genlmsg_end(skb, nlh); + if (!list_links) + goto out; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) + goto errout; + } +out: + return 0; + +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[0]; + int num = 0; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&smc_lgr->lock); + cb_ctx->pos[0] = num; +} + +static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smcd_dev *smcd = lgr->smcd; + struct nlattr *attrs; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCD); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, 
SMC_GEN_LGR_SMCD); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, + smcd->ops->get_local_gid(smcd), + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) + goto errattr; + memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) + goto errattr; + if (lgr->smc_version > SMC_V1) { + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[1]; + int rc = 0, num = 0; + + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry(lgr, &dev->lgr_list, list) { + if (!lgr->is_smcd) + continue; + if (num < snum) + goto next; + rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&dev->lgr_lock); + cb_ctx->pos[1] = num; + return rc; +} + +static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smcd_dev *smcd_dev; + int snum = cb_ctx->pos[0]; + int rc = 0, num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd_dev, &dev_list->list, list) { + if (list_empty(&smcd_dev->lgr_list)) + continue; + if (num < snum) + goto next; + rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; + return rc; +} + +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = false; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = true; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + +void smc_lgr_cleanup_early(struct smc_link_group *lgr) +{ + spinlock_t *lgr_lock; + + if (!lgr) + return; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, true); +} + +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_sendable(lnk)) + lnk->state = SMC_LNK_INACTIVE; + } + wake_up_all(&lgr->llc_msg_waiter); + wake_up_all(&lgr->llc_flow_waiter); 
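+	/* the wake-ups above let pending LLC waiters re-check the now inactive link states */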
+} + +static void smc_lgr_free(struct smc_link_group *lgr); + +static void smc_lgr_free_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(to_delayed_work(work), + struct smc_link_group, + free_work); + spinlock_t *lgr_lock; + bool conns; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } + read_lock_bh(&lgr->conns_lock); + conns = RB_EMPTY_ROOT(&lgr->conns_all); + read_unlock_bh(&lgr->conns_lock); + if (!conns) { /* number of lgr connections is no longer zero */ + spin_unlock_bh(lgr_lock); + return; + } + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); + + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); + if (lgr->is_smcd && !lgr->terminating) + smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); + smc_lgr_free(lgr); +} + +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); + + __smc_lgr_terminate(lgr, true); +} + +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { +again: + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (smc_link_usable(&lgr->lnk[i]) && + lgr->lnk[i].link_id == link_id) + goto again; + } + break; + } + return link_id; +} + +static void smcr_copy_dev_info_to_link(struct smc_link *link) +{ + struct smc_ib_device *smcibdev = link->smcibdev; + + snprintf(link->ibname, sizeof(link->ibname), "%s", + smcibdev->ibdev->name); + link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; +} + +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) +{ + struct smc_ib_device *smcibdev; + u8 rndvec[3]; + int rc; + + if (lgr->smc_version == SMC_V2) { + lnk->smcibdev = ini->smcrv2.ib_dev_v2; + lnk->ibport = ini->smcrv2.ib_port_v2; + } else { + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + } + get_device(&lnk->smcibdev->ibdev->dev); + atomic_inc(&lnk->smcibdev->lnk_cnt); + refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ + lnk->clearing = 0; + lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; + lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ + lnk->link_idx = link_idx; + lnk->wr_rx_id_compl = 0; + smc_ibdev_cnt_inc(lnk); + smcr_copy_dev_info_to_link(lnk); + atomic_set(&lnk->conn_cnt, 0); + smc_llc_link_set_uid(lnk); + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); + if (!lnk->smcibdev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index, + lgr->smc_version == SMC_V2 ? 
+ &ini->smcrv2 : NULL); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + lnk->state = SMC_LNK_ACTIVATING; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk, false); +out: + smc_ibdev_cnt_dec(lnk); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold above */ + return rc; +} + +/* create a new SMC link group */ +static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr; + struct list_head *lgr_list; + struct smcd_dev *smcd; + struct smc_link *lnk; + spinlock_t *lgr_lock; + u8 link_idx; + int rc = 0; + int i; + + if (ini->is_smcd && ini->vlan_id) { + if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected], + ini->vlan_id)) { + rc = SMC_CLC_DECL_ISMVLANERR; + goto out; + } + } + + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); + if (!lgr) { + rc = SMC_CLC_DECL_MEM; + goto ism_put_vlan; + } + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + SMC_LGR_ID_SIZE, &lgr->id); + if (!lgr->tx_wq) { + rc = -ENOMEM; + goto free_lgr; + } + lgr->is_smcd = ini->is_smcd; + lgr->sync_err = 0; + lgr->terminating = 0; + lgr->freeing = 0; + lgr->vlan_id = ini->vlan_id; + refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ + init_rwsem(&lgr->sndbufs_lock); + init_rwsem(&lgr->rmbs_lock); + rwlock_init(&lgr->conns_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + INIT_LIST_HEAD(&lgr->sndbufs[i]); + INIT_LIST_HEAD(&lgr->rmbs[i]); + } + lgr->next_link_id = 0; + smc_lgr_list.num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); + INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); + lgr->conns_all = RB_ROOT; + if (ini->is_smcd) { + /* SMC-D specific settings */ + smcd = ini->ism_dev[ini->ism_selected]; + get_device(smcd->ops->get_dev(smcd)); + lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; + lgr->smcd = ini->ism_dev[ini->ism_selected]; + lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; + lgr->smc_version = ini->smcd_version; + lgr->peer_shutdown = 0; + atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); + } else { + /* SMC-R specific settings */ + struct smc_ib_device *ibdev; + int ibport; + + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + lgr->smc_version = ini->smcr_version; + memcpy(lgr->peer_systemid, ini->peer_systemid, + SMC_SYSTEMID_LEN); + if (lgr->smc_version == SMC_V2) { + ibdev = ini->smcrv2.ib_dev_v2; + ibport = ini->smcrv2.ib_port_v2; + lgr->saddr = ini->smcrv2.saddr; + lgr->uses_gateway = ini->smcrv2.uses_gateway; + memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, + ETH_ALEN); + lgr->max_conns = ini->max_conns; + lgr->max_links = ini->max_links; + } else { + ibdev = ini->ib_dev; + ibport = ini->ib_port; + lgr->max_conns = SMC_CONN_PER_LGR_MAX; + lgr->max_links = SMC_LINKS_ADD_LNK_MAX; + } + memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], + SMC_MAX_PNETID_LEN); + rc = smc_wr_alloc_lgr_mem(lgr); + if (rc) + goto free_wq; + smc_llc_lgr_init(lgr, smc); + + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); + if (rc) { + smc_wr_free_lgr_mem(lgr); + goto free_wq; + } + lgr->net = smc_ib_net(lnk->smcibdev); + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; + lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; + atomic_inc(&lgr_cnt); + } + smc->conn.lgr = lgr; + spin_lock_bh(lgr_lock); + list_add_tail(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); + return 0; + +free_wq: + destroy_workqueue(lgr->tx_wq); +free_lgr: + kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); +out: + if (rc < 0) { + if (rc == -ENOMEM) + rc = SMC_CLC_DECL_MEM; + else + rc = SMC_CLC_DECL_INTERR; + } + return rc; +} + +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. 
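+	 * The difference is added back to the send cursors and to sndbuf_space below.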
+ */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); + if (!rc) { + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); + } + return rc; +} + +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) +{ + atomic_dec(&conn->lnk->conn_cnt); + /* link_hold in smc_conn_create() */ + smcr_link_put(conn->lnk); + conn->lnk = to_lnk; + atomic_inc(&conn->lnk->conn_cnt); + /* link_put in smc_conn_free() */ + smcr_link_hold(conn->lnk); +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; + struct smc_connection *conn; + struct smc_wr_buf *wr_buf; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { + smc_lgr_terminate_sched(lgr); + return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + smc_switch_link_and_count(conn, to_lnk); + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) + goto err_out; + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + smc_switch_link_and_count(conn, to_lnk); + rc = smc_switch_cursor(smc, pend, wr_buf); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) + goto err_out; + goto again; + } + read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); + return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; 
+} + +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link_group *lgr) +{ + struct rw_semaphore *lock; /* lock buffer list */ + int rc; + + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + down_read(&lgr->llc_conf_mutex); + smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; + up_read(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } + } + + if (buf_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + down_write(lock); + list_del(&buf_desc->list); + up_write(lock); + + smc_buf_free(lgr, is_rmb, buf_desc); + } else { + /* memzero_explicit provides potential memory barrier semantics */ + memzero_explicit(buf_desc->cpu_addr, buf_desc->len); + WRITE_ONCE(buf_desc->used, 0); + } +} + +static void smc_buf_unuse(struct smc_connection *conn, + struct smc_link_group *lgr) +{ + if (conn->sndbuf_desc) { + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); + WRITE_ONCE(conn->sndbuf_desc->used, 0); + } + } + if (conn->rmb_desc) { + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, true, lgr); + } else { + memzero_explicit(conn->rmb_desc->cpu_addr, + conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); + WRITE_ONCE(conn->rmb_desc->used, 0); + } + } +} + +/* remove a finished connection from its link group */ +void smc_conn_free(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!lgr || conn->freed) + /* Connection has never been registered in a + * link group, or has already been freed. + */ + return; + + conn->freed = 1; + if (!smc_conn_lgr_valid(conn)) + /* Connection has already unregistered from + * link group. 
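+	 * Only the references taken in smc_conn_create() are dropped below.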
+ */ + goto lgr_put; + + if (lgr->is_smcd) { + if (!list_empty(&lgr->list)) + smc_ism_unset_conn(conn); + tasklet_kill(&conn->rx_tsklet); + } else { + smc_cdc_wait_pend_tx_wr(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); + } + if (!list_empty(&lgr->list)) { + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); + } + + if (!lgr->conns_num) + smc_lgr_schedule_free_work(lgr); +lgr_put: + if (!lgr->is_smcd) + smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ + smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ +} + +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + if (is_rmb || buf_desc->is_vm) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + else + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + down_write(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + up_write(&lgr->rmbs_lock); + + down_write(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + up_write(&lgr->sndbufs_lock); + } +} + +static void smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +static void __smcr_link_clear(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_ib_device *smcibdev; + + smc_wr_free_link_mem(lnk); + smc_ibdev_cnt_dec(lnk); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) +{ + if (!lnk->lgr || lnk->clearing || + lnk->state == SMC_LNK_UNUSED) + return; + lnk->clearing = 1; + lnk->peer_qpn = 0; + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); + smc_ib_modify_qp_error(lnk); + smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); + smcr_link_put(lnk); /* theoretically last link_put */ +} + +void smcr_link_hold(struct smc_link *lnk) +{ + refcount_inc(&lnk->refcnt); +} + +void smcr_link_put(struct smc_link *lnk) +{ + if (refcount_dec_and_test(&lnk->refcnt)) + __smcr_link_clear(lnk); +} + +static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); + + if 
(!buf_desc->is_vm && buf_desc->pages) + __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); + kfree(buf_desc); +} + +static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, + struct smc_buf_desc *buf_desc) +{ + if (is_dmb) { + /* restore original buf len */ + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } else { + kfree(buf_desc->cpu_addr); + } + kfree(buf_desc); +} + +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + if (lgr->is_smcd) + smcd_buf_free(lgr, is_rmb, buf_desc); + else + smcr_buf_free(lgr, is_rmb, buf_desc); +} + +static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf_desc; + struct list_head *buf_list; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + if (is_rmb) + buf_list = &lgr->rmbs[i]; + else + buf_list = &lgr->sndbufs[i]; + list_for_each_entry_safe(buf_desc, bf_desc, buf_list, + list) { + list_del(&buf_desc->list); + smc_buf_free(lgr, is_rmb, buf_desc); + } + } +} + +static void smc_lgr_free_bufs(struct smc_link_group *lgr) +{ + /* free send buffers */ + __smc_lgr_free_bufs(lgr, false); + /* free rmbs */ + __smc_lgr_free_bufs(lgr, true); +} + +/* won't be freed until no one accesses to lgr anymore */ +static void __smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_bufs(lgr); + if (lgr->is_smcd) { + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { + smc_wr_free_lgr_mem(lgr); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } + kfree(lgr); +} + +/* remove a link group */ +static void smc_lgr_free(struct smc_link_group *lgr) +{ + int i; + + if (!lgr->is_smcd) { + down_write(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i], false); + } + up_write(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + + destroy_workqueue(lgr->tx_wq); + if (lgr->is_smcd) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + } + smc_lgr_put(lgr); /* theoretically last lgr_put */ +} + +void smc_lgr_hold(struct smc_link_group *lgr) +{ + refcount_inc(&lgr->refcnt); +} + +void smc_lgr_put(struct smc_link_group *lgr) +{ + if (refcount_dec_and_test(&lgr->refcnt)) + __smc_lgr_free(lgr); +} + +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); +} + +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn, bool soft) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + smc_close_abort(conn); + conn->killed = 1; + smc->sk.sk_err = ECONNABORTED; + smc_sk_wake_ups(smc); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + if (soft) + tasklet_kill(&conn->rx_tsklet); + else + tasklet_unlock_wait(&conn->rx_tsklet); + } else { + smc_cdc_wait_pend_tx_wr(conn); + } + smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); +} + +static void smc_lgr_cleanup(struct smc_link_group *lgr) +{ + if (lgr->is_smcd) { + smc_ism_signal_shutdown(lgr); + } else { + u32 rsn = lgr->llc_termination_rsn; + + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, 
rsn); + smcr_lgr_link_deactivate_all(lgr); + } +} + +/* terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) +{ + struct smc_connection *conn; + struct smc_sock *smc; + struct rb_node *node; + + if (lgr->terminating) + return; /* lgr already terminating */ + /* cancel free_work sync, will terminate when lgr->freeing is set */ + cancel_delayed_work(&lgr->free_work); + lgr->terminating = 1; + + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); + node = rb_first(&lgr->conns_all); + while (node) { + read_unlock_bh(&lgr->conns_lock); + conn = rb_entry(node, struct smc_connection, alert_node); + smc = container_of(conn, struct smc_sock, conn); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn, soft); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); + node = rb_first(&lgr->conns_all); + } + read_unlock_bh(&lgr->conns_lock); + smc_lgr_cleanup(lgr); + smc_lgr_free(lgr); +} + +/* unlink link group and schedule termination */ +void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + list_del_init(&lgr->list); + lgr->freeing = 1; + spin_unlock_bh(lgr_lock); + schedule_work(&lgr->terminate_work); +} + +/* Called when peer lgr shutdown (regularly or abnormally) is received */ +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) +{ + struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); + + /* run common cleanup function and build free list */ + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && + (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { + if (peer_gid) /* peer triggered termination */ + lgr->peer_shutdown = 1; + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } + } + spin_unlock_bh(&dev->lgr_lock); + + /* cancel the regular free workers and actually free lgrs */ + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + schedule_work(&lgr->terminate_work); + } +} + +/* Called when an SMCD device is removed or the smc module is unloaded */ +void smc_smcd_terminate_all(struct smcd_dev *smcd) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smcd->lgr_lock); + list_splice_init(&smcd->lgr_list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + spin_unlock_bh(&smcd->lgr_lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (atomic_read(&smcd->lgr_cnt)) + wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); +} + +/* Called when an SMCR device is removed or the smc module is unloaded. + * If smcibdev is given, all SMCR link groups using this device are terminated. + * If smcibdev is NULL, all SMCR link groups are terminated. 
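+ * In both cases the function blocks until the affected links or link groups
+ * have been freed (lnk_cnt / lgr_cnt dropping to zero).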
+ */ +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + int i; + + spin_lock_bh(&smc_lgr_list.lock); + if (!smcibdev) { + list_splice_init(&smc_lgr_list.list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + } else { + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); + } + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); + __smc_lgr_terminate(lgr, false); + } + + if (smcibdev) { + if (atomic_read(&smcibdev->lnk_cnt)) + wait_event(smcibdev->lnks_deleted, + !atomic_read(&smcibdev->lnk_cnt)); + } else { + if (atomic_read(&lgr_cnt)) + wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); + } +} + +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + char *lgr_type = ""; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; + lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, lgr_type, lgr->pnet_id); +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_conn_kill(conn, true); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + struct smc_link *link; + + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER || + !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) + continue; + + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + continue; + + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (link) + smc_llc_add_link_local(link); + } + spin_unlock_bh(&smc_lgr_list.lock); +} + +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock + */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == 
SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + to_lnk = smc_switch_conns(lgr, lnk, true); + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk, true); + return; + } + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + up_write(&lgr->llc_conf_mutex); + wait_event_timeout(lgr->llc_flow_waiter, + (list_empty(&lgr->list) || + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + down_write(&lgr->llc_conf_mutex); + } + if (!list_empty(&lgr->list)) { + smc_llc_send_delete_link(to_lnk, del_link_id, + SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + smcr_link_clear(lnk, true); + } + wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_link_down(lnk); + } +} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); + schedule_work(&lnk->link_down_wrk); + } +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_all(&lgr->llc_msg_waiter); + down_write(&lgr->llc_conf_mutex); + smcr_link_down(link); + up_write(&lgr->llc_conf_mutex); +} + +static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + unsigned short *vlan_id = (unsigned short *)priv->data; + + if (is_vlan_dev(lower_dev)) { + *vlan_id = vlan_dev_vlan_id(lower_dev); + return 1; + } + + return 0; +} + +/* Determine vlan of internal TCP socket. 
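+ * Returns 0 on success; ini->vlan_id is set if the route's net_device (or one
+ * of its lower devices) is a VLAN device, and stays 0 otherwise.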
*/ +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct netdev_nested_priv priv; + struct net_device *ndev; + int rc = 0; + + ini->vlan_id = 0; + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + ndev = dst->dev; + if (is_vlan_dev(ndev)) { + ini->vlan_id = vlan_dev_vlan_id(ndev); + goto out_rel; + } + + priv.data = (void *)&ini->vlan_id; + rtnl_lock(); + netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); + rtnl_unlock(); + +out_rel: + dst_release(dst); +out: + return rc; +} + +static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, + u8 peer_systemid[], + u8 peer_gid[], + u8 peer_mac_v1[], + enum smc_lgr_role role, u32 clcqpn, + struct net *net) +{ + struct smc_link *lnk; + int i; + + if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + + if (!smc_link_active(lnk)) + continue; + /* use verbs API to check netns, instead of lgr->net */ + if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) + return false; + if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && + !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && + (smcr_version == SMC_V2 || + !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) + return true; + } + return false; +} + +static bool smcd_lgr_match(struct smc_link_group *lgr, + struct smcd_dev *smcismdev, u64 peer_gid) +{ + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; +} + +/* create a new SMC connection (and a new link group if necessary) */ +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_connection *conn = &smc->conn; + struct net *net = sock_net(&smc->sk); + struct list_head *lgr_list; + struct smc_link_group *lgr; + enum smc_lgr_role role; + spinlock_t *lgr_lock; + int rc = 0; + + lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : + &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock : + &smc_lgr_list.lock; + ini->first_contact_local = 1; + role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + if (role == SMC_CLNT && ini->first_contact_peer) + /* create new link group as well */ + goto create; + + /* determine if an existing link group can be reused */ + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { + write_lock_bh(&lgr->conns_lock); + if ((ini->is_smcd ? + smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], + ini->ism_peer_gid[ini->ism_selected]) : + smcr_lgr_match(lgr, ini->smcr_version, + ini->peer_systemid, + ini->peer_gid, ini->peer_mac, role, + ini->ib_clcqpn, net)) && + !lgr->sync_err && + (ini->smcd_version == SMC_V2 || + lgr->vlan_id == ini->vlan_id) && + (role == SMC_CLNT || ini->is_smcd || + (lgr->conns_num < lgr->max_conns && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { + /* link group found */ + ini->first_contact_local = 0; + conn->lgr = lgr; + rc = smc_lgr_register_conn(conn, false); + write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); + break; + } + write_unlock_bh(&lgr->conns_lock); + } + spin_unlock_bh(lgr_lock); + if (rc) + return rc; + + if (role == SMC_CLNT && !ini->first_contact_peer && + ini->first_contact_local) { + /* Server reuses a link group, but Client wants to start + * a new one + * send out_of_sync decline, reason synchr. 
error + */ + return SMC_CLC_DECL_SYNCERR; + } + +create: + if (ini->first_contact_local) { + rc = smc_lgr_create(smc, ini); + if (rc) + goto out; + lgr = conn->lgr; + write_lock_bh(&lgr->conns_lock); + rc = smc_lgr_register_conn(conn, true); + write_unlock_bh(&lgr->conns_lock); + if (rc) { + smc_lgr_cleanup_early(lgr); + goto out; + } + } + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + if (!conn->lgr->is_smcd) + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ + conn->freed = 0; + conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; + conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; + conn->urg_state = SMC_URG_READ; + init_waitqueue_head(&conn->cdc_pend_tx_wq); + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); + if (ini->is_smcd) { + conn->rx_off = sizeof(struct smcd_cdc_msg); + smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } else { + conn->rx_off = 0; + } +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&conn->acurs_lock); +#endif + +out: + return rc; +} + +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. + */ +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) +{ + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + +/* try to reuse a sndbuf or rmb description slot for a certain + * buffer size; if not available, return NULL + */ +static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, + struct rw_semaphore *lock, + struct list_head *buf_list) +{ + struct smc_buf_desc *buf_slot; + + down_read(lock); + list_for_each_entry(buf_slot, buf_list, list) { + if (cmpxchg(&buf_slot->used, 0, 1) == 0) { + up_read(lock); + return buf_slot; + } + } + up_read(lock); + return NULL; +} + +/* one of the conditions for announcing a receiver's current window size is + * that it "results in a minimum increase in the window size of 10% of the + * receive buffer space" [RFC7609] + */ +static inline int smc_rmb_wnd_update_limit(int rmbe_size) +{ + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); +} + +/* map an buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); + if (rc) + return rc; + + if (buf_desc->is_vm) { + /* 
virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + } + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != nents) { + rc = -EAGAIN; + goto free_table; + } + + buf_desc->is_dma_need_sync |= + smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; + + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + +/* register a new buf on IB device, rmb or vzalloced sndbuf + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; + return -EFAULT; + } + buf_desc->is_reg_mr[link->link_idx] = true; + } + return 0; +} + +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + down_write(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + up_write(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + /* reg all RMBs for a new link */ + down_write(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if (!buf_desc->used) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->rmbs_lock); + return rc; + } + } + } + up_write(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + 
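+ /* only vzalloced (is_vm) sndbufs carry a memory region to register;
+  * physically contiguous sndbufs are skipped
+  */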
down_write(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + up_write(&lgr->sndbufs_lock); + return rc; + } + } + } + up_write(&lgr->sndbufs_lock); + return rc; +} + +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + + /* try to alloc a new buffer */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } + return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); +} + +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0, cnt = 0; + + /* protect against parallel link reconfiguration */ + down_read(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (!smc_link_usable(lnk)) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + rc = -ENOMEM; + goto out; + } + cnt++; + } +out: + up_read(&lgr->llc_conf_mutex); + if (!rc && !cnt) + rc = -EINVAL; + return rc; +} + +static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, + bool is_dmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + int rc; + + /* try to alloc a new DMB */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + if (is_dmb) { + rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); + if (rc) { + kfree(buf_desc); + if (rc == -ENOMEM) + return ERR_PTR(-EAGAIN); + if (rc == -ENOSPC) + return ERR_PTR(-ENOSPC); + return ERR_PTR(-EIO); + } + buf_desc->pages = virt_to_page(buf_desc->cpu_addr); + /* CDC header stored in buf. 
So, pretend it was smaller */ + buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); + } else { + buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->len = bufsize; + } + return buf_desc; +} + +static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) +{ + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + struct list_head *buf_list; + int bufsize, bufsize_comp; + struct rw_semaphore *lock; /* lock buffer list */ + bool is_dgraded = false; + + if (is_rmb) + /* use socket recv buffer size (w/o overhead) as start value */ + bufsize = smc->sk.sk_rcvbuf / 2; + else + /* use socket send buffer size (w/o overhead) as start value */ + bufsize = smc->sk.sk_sndbuf / 2; + + for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { + if (is_rmb) { + lock = &lgr->rmbs_lock; + buf_list = &lgr->rmbs[bufsize_comp]; + } else { + lock = &lgr->sndbufs_lock; + buf_list = &lgr->sndbufs[bufsize_comp]; + } + bufsize = smc_uncompress_bufsize(bufsize_comp); + + /* check for reusable slot in the link group */ + buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); + if (buf_desc) { + buf_desc->is_dma_need_sync = 0; + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); + break; /* found reusable slot */ + } + + if (is_smcd) + buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); + else + buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + + if (PTR_ERR(buf_desc) == -ENOMEM) + break; + if (IS_ERR(buf_desc)) { + if (!is_dgraded) { + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); + } + continue; + } + + SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + buf_desc->used = 1; + down_write(lock); + list_add(&buf_desc->list, buf_list); + up_write(lock); + break; /* found */ + } + + if (IS_ERR(buf_desc)) + return PTR_ERR(buf_desc); + + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, is_rmb, lgr); + return -ENOMEM; + } + } + + if (is_rmb) { + conn->rmb_desc = buf_desc; + conn->rmbe_size_comp = bufsize_comp; + smc->sk.sk_rcvbuf = bufsize * 2; + atomic_set(&conn->bytes_to_rcv, 0); + conn->rmbe_update_limit = + smc_rmb_wnd_update_limit(buf_desc->len); + if (is_smcd) + smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ + } else { + conn->sndbuf_desc = buf_desc; + smc->sk.sk_sndbuf = bufsize * 2; + atomic_set(&conn->sndbuf_space, bufsize); + } + return 0; +} + +void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) +{ + if (!conn->sndbuf_desc->is_dma_need_sync) + return; + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) + return; + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); +} + +void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) +{ + int i; + + if (!conn->rmb_desc->is_dma_need_sync) + return; + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) + return; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } +} + +/* create the send and receive buffer for an SMC socket; + * receive buffers are called RMBs; + * (even though the SMC protocol allows 
more than one RMB-element per RMB, + * the Linux implementation uses just one RMB-element per RMB, i.e. uses an + * extra RMB for every connection in a link group + */ +int smc_buf_create(struct smc_sock *smc, bool is_smcd) +{ + int rc; + + /* create send buffer */ + rc = __smc_buf_create(smc, is_smcd, false); + if (rc) + return rc; + /* create rmb */ + rc = __smc_buf_create(smc, is_smcd, true); + if (rc) { + down_write(&smc->conn.lgr->sndbufs_lock); + list_del(&smc->conn.sndbuf_desc->list); + up_write(&smc->conn.lgr->sndbufs_lock); + smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); + smc->conn.sndbuf_desc = NULL; + } + return rc; +} + +static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) +{ + int i; + + for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { + if (!test_and_set_bit(i, lgr->rtokens_used_mask)) + return i; + } + return -ENOSPC; +} + +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + +/* add a new rtoken from peer */ +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) +{ + struct smc_link_group *lgr = smc_get_lgr(lnk); + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && + test_bit(i, lgr->rtokens_used_mask)) { + /* already in list */ + return i; + } + } + i = smc_rmb_reserve_rtoken_idx(lgr); + if (i < 0) + return i; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; + return i; +} + +/* delete an rtoken from all links */ +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) +{ + struct smc_link_group *lgr = smc_get_lgr(lnk); + u32 rkey = ntohl(nw_rkey); + int i, j; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + test_bit(i, lgr->rtokens_used_mask)) { + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } + clear_bit(i, lgr->rtokens_used_mask); + return 0; + } + } + return -ENOENT; +} + +/* save rkey and dma_addr received from peer during clc handshake */ +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link 
*lnk, + struct smc_clc_msg_accept_confirm *clc) +{ + conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, + clc->r0.rmb_rkey); + if (conn->rtoken_idx < 0) + return conn->rtoken_idx; + return 0; +} + +static void smc_core_going_away(void) +{ + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; + + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); + } + mutex_unlock(&smc_ib_devices.mutex); + + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; + } + mutex_unlock(&smcd_dev_list.mutex); +} + +/* Clean up all SMC link groups */ +static void smc_lgrs_shutdown(void) +{ + struct smcd_dev *smcd; + + smc_core_going_away(); + + smc_smcr_terminate_all(NULL); + + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + smc_smcd_terminate_all(smcd); + mutex_unlock(&smcd_dev_list.mutex); +} + +static int smc_core_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + smc_lgrs_shutdown(); + smc_ib_unregister_client(); + smc_ism_exit(); + return 0; +} + +static struct notifier_block smc_reboot_notifier = { + .notifier_call = smc_core_reboot_event, +}; + +int __init smc_core_init(void) +{ + return register_reboot_notifier(&smc_reboot_notifier); +} + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + unregister_reboot_notifier(&smc_reboot_notifier); + smc_lgrs_shutdown(); +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h new file mode 100644 index 0000000000..120027d404 --- /dev/null +++ b/net/smc/smc_core.h @@ -0,0 +1,596 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for SMC Connections, Link Groups and Links + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CORE_H +#define _SMC_CORE_H + +#include <linux/atomic.h> +#include <linux/smc.h> +#include <linux/pci.h> +#include <rdma/ib_verbs.h> +#include <net/genetlink.h> + +#include "smc.h" +#include "smc_ib.h" + +#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ +#define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, + * also is the default value for SMC-R v1 and v2.0 + */ +#define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distrubutions may modify it to a value between + * 16-255 as needed. 
+ */ + +struct smc_lgr_list { /* list of link group definition */ + struct list_head list; + spinlock_t lock; /* protects list of link groups */ + u32 num; /* unique link group number */ +}; + +enum smc_lgr_role { /* possible roles of a link group */ + SMC_CLNT, /* client */ + SMC_SERV /* server */ +}; + +enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ + SMC_LNK_INACTIVE, /* link is inactive */ + SMC_LNK_ACTIVATING, /* link is being activated */ + SMC_LNK_ACTIVE, /* link is active */ +}; + +#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ +#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ + +struct smc_wr_buf { + u8 raw[SMC_WR_BUF_SIZE]; +}; + +struct smc_wr_v2_buf { + u8 raw[SMC_WR_BUF_V2_SIZE]; +}; + +#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ + +enum smc_wr_reg_state { + POSTED, /* ib_wr_reg_mr request posted */ + CONFIRMED, /* ib_wr_reg_mr response: successful */ + FAILED /* ib_wr_reg_mr response: failure */ +}; + +struct smc_rdma_sge { /* sges for RDMA writes */ + struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE]; +}; + +#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per + * message send + */ + +struct smc_rdma_sges { /* sges per message send */ + struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES]; +}; + +struct smc_rdma_wr { /* work requests per message + * send + */ + struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; +}; + +#define SMC_LGR_ID_SIZE 4 + +struct smc_link { + struct smc_ib_device *smcibdev; /* ib-device */ + u8 ibport; /* port - values 1 | 2 */ + struct ib_pd *roce_pd; /* IB protection domain, + * unique for every RoCE QP + */ + struct ib_qp *roce_qp; /* IB queue pair */ + struct ib_qp_attr qp_attr; /* IB queue pair attributes */ + + struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ + struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ + struct ib_sge *wr_tx_sges; /* WR send gather meta data */ + struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ + struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ + struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ + /* above four vectors have wr_tx_cnt elements and use the same index */ + struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ + struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ + struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ + dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ + atomic_long_t wr_tx_id; /* seq # of last sent WR */ + unsigned long *wr_tx_mask; /* bit mask of used indexes */ + u32 wr_tx_cnt; /* number of WR send buffers */ + wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + struct { + struct percpu_ref wr_tx_refs; + } ____cacheline_aligned_in_smp; + struct completion tx_ref_comp; + + struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ + struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ + /* above three vectors have wr_rx_cnt elements and use the same index */ + dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ + u64 wr_rx_id; /* seq # of last recv WR */ + u64 wr_rx_id_compl; /* seq # of last completed WR */ + u32 wr_rx_cnt; /* number of WR recv buffers */ + unsigned long wr_rx_tstamp; /* jiffies when last 
buf rx */ + wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ + + struct ib_reg_wr wr_reg; /* WR register memory region */ + wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ + struct { + struct percpu_ref wr_reg_refs; + } ____cacheline_aligned_in_smp; + struct completion reg_ref_comp; + enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ + u8 sgid_index; /* gid index for vlan id */ + u32 peer_qpn; /* QP number of peer */ + enum ib_mtu path_mtu; /* used mtu */ + enum ib_mtu peer_mtu; /* mtu size of peer */ + u32 psn_initial; /* QP tx initial packet seqno */ + u32 peer_psn; /* QP rx initial packet seqno */ + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ + u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ + u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? */ + u8 clearing : 1; /* link is being cleared */ + refcount_t refcnt; /* link reference count */ + struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ + char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ + int ndev_ifidx; /* network device ifindex */ + + enum smc_link_state state; /* state of link */ + struct delayed_work llc_testlink_wrk; /* testlink worker */ + struct completion llc_testlink_resp; /* wait for rx of testlink */ + int llc_testlink_time; /* testlink interval */ + atomic_t conn_cnt; /* connections on this link */ +}; + +/* For now we just allow one parallel link per link group. The SMC protocol + * allows more (up to 8). + */ +#define SMC_LINKS_PER_LGR_MAX 3 +#define SMC_SINGLE_LINK 0 +#define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ +#define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the + * default value for smc-r v1.0 and v2.0 + */ +#define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distrubutions may modify it to a value between + * 1-2 as needed. + */ + +/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ +struct smc_buf_desc { + struct list_head list; + void *cpu_addr; /* virtual address of buffer */ + struct page *pages; + int len; /* length of buffer */ + u32 used; /* currently used / unused */ + union { + struct { /* SMC-R */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf + * incl. 
rkey provided to peer + * and lkey provided to local + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_dma_need_sync; + u8 is_reg_err; + /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ + }; + struct { /* SMC-D */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ + }; + }; +}; + +struct smc_rtoken { /* address/key of remote RMB */ + u64 dma_addr; + u32 rkey; +}; + +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ + +struct smcd_dev; + +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_REQ_ADD_LINK = 5, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + +struct smc_link_group { + struct list_head list; + struct rb_root conns_all; /* connection tree */ + rwlock_t conns_lock; /* protects conns_all */ + unsigned int conns_num; /* current # of connections */ + unsigned short vlan_id; /* vlan id of link group */ + + struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ + struct rw_semaphore sndbufs_lock; /* protects tx buffers */ + struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ + struct rw_semaphore rmbs_lock; /* protects rx buffers */ + + u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ + struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ + struct workqueue_struct *tx_wq; /* wq for conn. 
tx workers */ + u8 sync_err : 1; /* lgr no longer fits to peer */ + u8 terminating : 1;/* lgr is terminating */ + u8 freeing : 1; /* lgr is being freed */ + + refcount_t refcnt; /* lgr reference count */ + bool is_smcd; /* SMC-R or SMC-D */ + u8 smc_version; + u8 negotiated_eid[SMC_MAX_EID_LEN]; + u8 peer_os; /* peer operating system */ + u8 peer_smc_release; + u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; + union { + struct { /* SMC-R */ + enum smc_lgr_role role; + /* client or server */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; + /* smc link */ + struct smc_wr_v2_buf *wr_rx_buf_v2; + /* WR v2 recv payload buffer */ + struct smc_wr_v2_buf *wr_tx_buf_v2; + /* WR v2 send payload buffer */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); + /* used rtoken elements */ + u8 next_link_id; + enum smc_lgr_type type; + enum smcr_buf_type buf_type; + /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct rw_semaphore llc_conf_mutex; + /* protects lgr reconfig. */ + struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; + struct work_struct llc_event_work; + /* llc event worker */ + wait_queue_head_t llc_flow_waiter; + /* w4 next llc event */ + wait_queue_head_t llc_msg_waiter; + /* w4 next llc msg */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ + int llc_testlink_time; + /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ + u8 nexthop_mac[ETH_ALEN]; + u8 uses_gateway; + __be32 saddr; + /* net namespace */ + struct net *net; + u8 max_conns; + /* max conn can be assigned to lgr */ + u8 max_links; + /* max links can be added in lgr */ + }; + struct { /* SMC-D */ + u64 peer_gid; + /* Peer GID (remote) */ + struct smcd_dev *smcd; + /* ISM device for VLAN reg. 
*/ + u8 peer_shutdown : 1; + /* peer triggered shutdownn */ + }; + }; +}; + +struct smc_clc_msg_local; + +#define GID_LIST_SIZE 2 + +struct smc_gidlist { + u8 len; + u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; +}; + +struct smc_init_info_smcrv2 { + /* Input fields */ + __be32 saddr; + struct sock *clc_sk; + __be32 daddr; + + /* Output fields when saddr is set */ + struct smc_ib_device *ib_dev_v2; + u8 ib_port_v2; + u8 ib_gid_v2[SMC_GID_SIZE]; + + /* Additional output fields when clc_sk and daddr is set as well */ + u8 uses_gateway; + u8 nexthop_mac[ETH_ALEN]; + + struct smc_gidlist gidlist; +}; + +struct smc_init_info { + u8 is_smcd; + u8 smc_type_v1; + u8 smc_type_v2; + u8 release_nr; + u8 max_conns; + u8 max_links; + u8 first_contact_peer; + u8 first_contact_local; + unsigned short vlan_id; + u32 rc; + u8 negotiated_eid[SMC_MAX_EID_LEN]; + /* SMC-R */ + u8 smcr_version; + u8 check_smcrv2; + u8 peer_gid[SMC_GID_SIZE]; + u8 peer_mac[ETH_ALEN]; + u8 peer_systemid[SMC_SYSTEMID_LEN]; + struct smc_ib_device *ib_dev; + u8 ib_gid[SMC_GID_SIZE]; + u8 ib_port; + u32 ib_clcqpn; + struct smc_init_info_smcrv2 smcrv2; + /* SMC-D */ + u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; + struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; + u16 ism_chid[SMC_MAX_ISM_DEVS + 1]; + u8 ism_offered_cnt; /* # of ISM devices offered */ + u8 ism_selected; /* index of selected ISM dev*/ + u8 smcd_version; +}; + +/* Find the connection associated with the given alert token in the link group. + * To use rbtrees we have to implement our own search core. + * Requires @conns_lock + * @token alert token to search for + * @lgr link group to search in + * Returns connection associated with token if found, NULL otherwise. + */ +static inline struct smc_connection *smc_lgr_find_conn( + u32 token, struct smc_link_group *lgr) +{ + struct smc_connection *res = NULL; + struct rb_node *node; + + node = lgr->conns_all.rb_node; + while (node) { + struct smc_connection *cur = rb_entry(node, + struct smc_connection, alert_node); + + if (cur->alert_token_local > token) { + node = node->rb_left; + } else { + if (cur->alert_token_local < token) { + node = node->rb_right; + } else { + res = cur; + break; + } + } + } + + return res; +} + +static inline bool smc_conn_lgr_valid(struct smc_connection *conn) +{ + return conn->lgr && conn->alert_token_local; +} + +/* + * Returns true if the specified link is usable. + * + * usable means the link is ready to receive RDMA messages, map memory + * on the link, etc. This doesn't ensure we are able to send RDMA messages + * on this link, if sending RDMA messages is needed, use smc_link_sendable() + */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + +/* + * Returns true if the specified link is ready to receive AND send RDMA + * messages. + * + * For the client side in first contact, the underlying QP may still in + * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() + * is not strong enough. 
For those places that need to send any CDC or LLC + * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead + */ +static inline bool smc_link_sendable(struct smc_link *lnk) +{ + return smc_link_usable(lnk) && + lnk->qp_attr.cur_qp_state == IB_QPS_RTS; +} + +static inline bool smc_link_active(struct smc_link *lnk) +{ + return lnk->state == SMC_LNK_ACTIVE; +} + +static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +struct smc_pci_dev { + __u32 pci_fid; + __u16 pci_pchid; + __u16 pci_vendor; + __u16 pci_device; + __u8 pci_id[SMC_PCI_ID_STR_LEN]; +}; + +static inline void smc_set_pci_values(struct pci_dev *pci_dev, + struct smc_pci_dev *smc_dev) +{ + smc_dev->pci_vendor = pci_dev->vendor; + smc_dev->pci_device = pci_dev->device; + snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", + pci_name(pci_dev)); +#if IS_ENABLED(CONFIG_S390) + { /* Set s390 specific PCI information */ + struct zpci_dev *zdev; + + zdev = to_zpci(pci_dev); + smc_dev->pci_fid = zdev->fid; + smc_dev->pci_pchid = zdev->pchid; + } +#endif +} + +struct smc_sock; +struct smc_clc_msg_accept_confirm; + +void smc_lgr_cleanup_early(struct smc_link_group *lgr); +void smc_lgr_terminate_sched(struct smc_link_group *lgr); +void smc_lgr_hold(struct smc_link_group *lgr); +void smc_lgr_put(struct smc_link_group *lgr); +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, + unsigned short vlan); +void smc_smcd_terminate_all(struct smcd_dev *dev); +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); +int smc_buf_create(struct smc_sock *smc, bool is_smcd); +int smc_uncompress_bufsize(u8 compressed); +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, + struct smc_clc_msg_accept_confirm *clc); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); +void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); +void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); + +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); +int smc_core_init(void); +void smc_core_exit(void); + +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); +void smcr_link_clear(struct smc_link *lnk, bool log); +void smcr_link_hold(struct smc_link *lnk); +void smcr_link_put(struct smc_link *lnk); +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk); +int smcr_buf_map_lgr(struct smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void 
smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); + +static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) +{ + return link->lgr; +} +#endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 0000000000..37833b96b5 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sock_diag.h> +#include <linux/inet_diag.h> +#include <linux/smc_diag.h> +#include <net/netlink.h> +#include <net/smc.h> + +#include "smc.h" +#include "smc_core.h" + +struct smc_diag_dump_ctx { + int pos[2]; +}; + +static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) +{ + return (struct smc_diag_dump_ctx *)cb->ctx; +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + memset(r, 0, sizeof(*r)); + r->diag_family = sk->sk_family; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + if (sk->sk_protocol == SMCPROTO_SMC) { + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_protocol == SMCPROTO_SMC6) { + memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, + sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); + memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, + sizeof(smc->clcsock->sk->sk_v6_daddr)); +#endif + } +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct smc_diag_fallback fallback; + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + if (smc->use_fallback) + r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; + else if (smc_conn_lgr_valid(&smc->conn) && 
smc->conn.lgr->is_smcd) + r->diag_mode = SMC_DIAG_MODE_SMCD; + else + r->diag_mode = SMC_DIAG_MODE_SMCR; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + fallback.reason = smc->fallback_rsn; + fallback.peer_diagnosis = smc->peer_diagnosis; + if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && + smc->conn.alert_token_local) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_desc ? + conn->sndbuf_desc->len : 0, + .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + .tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && + !list_empty(&smc->conn.lgr->list)) { + struct smc_link *link = smc->conn.lnk; + + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = link->ibport, + .lnk[0].link_id = link->link_id, + }; + + memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, + sizeof(link->smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && + !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; + + memset(&dinfo, 0, sizeof(dinfo)); + + dinfo.linkid = *((u32 *)conn->lgr->id); + dinfo.peer_gid = conn->lgr->peer_gid; + dinfo.my_gid = smcd->ops->get_local_gid(smcd); + dinfo.token = conn->rmb_desc->token; + dinfo.peer_token = conn->peer_token; + + if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, + struct netlink_callback *cb, int p_type) +{ + struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); + struct net *net = sock_net(skb->sk); + int snum = cb_ctx->pos[p_type]; + struct nlattr *bc = NULL; + struct hlist_head *head; + int rc = 0, num = 0; + struct sock *sk; + + 
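+ /* walk this proto's smc hash under its read lock; cb_ctx->pos remembers
+  * how far a previous dump pass got, so multi-part dumps can resume
+  */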
read_lock(&prot->h.smc_hash->lock); + head = &prot->h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num < snum) + goto next; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc < 0) + goto out; +next: + num++; + } + +out: + read_unlock(&prot->h.smc_hash->lock); + cb_ctx->pos[p_type] = num; + return rc; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int rc = 0; + + rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); + if (!rc) + smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); + return skb->len; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); +MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c new file mode 100644 index 0000000000..89981dbe46 --- /dev/null +++ b/net/smc/smc_ib.c @@ -0,0 +1,1018 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * IB infrastructure: + * Establish SMC-R as an Infiniband Client to be notified about added and + * removed IB devices of type RDMA. + * Determine device and port characteristics for these IB devices. + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/etherdevice.h> +#include <linux/if_vlan.h> +#include <linux/random.h> +#include <linux/workqueue.h> +#include <linux/scatterlist.h> +#include <linux/wait.h> +#include <linux/mutex.h> +#include <linux/inetdevice.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> + +#include "smc_pnet.h" +#include "smc_ib.h" +#include "smc_core.h" +#include "smc_wr.h" +#include "smc.h" +#include "smc_netlink.h" + +#define SMC_MAX_CQE 32766 /* max. 
# of completion queue elements */ + +#define SMC_QP_MIN_RNR_TIMER 5 +#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ +#define SMC_QP_RETRY_CNT 7 /* 7: infinite */ +#define SMC_QP_RNR_RETRY 7 /* 7: infinite */ + +struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ + .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex), + .list = LIST_HEAD_INIT(smc_ib_devices.list), +}; + +u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ + +static int smc_ib_modify_qp_init(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = lnk->ibport; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_REMOTE_WRITE; + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | IB_QP_PORT); +} + +static int smc_ib_modify_qp_rtr(struct smc_link *lnk) +{ + enum ib_qp_attr_mask qp_attr_mask = + IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | + IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; + struct ib_qp_attr qp_attr; + u8 hop_lim = 1; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTR; + qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); + qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + hop_lim = IPV6_DEFAULT_HOPLIMIT; + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0); + rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac, + sizeof(lnk->lgr->nexthop_mac)); + else + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); + qp_attr.dest_qp_num = lnk->peer_qpn; + qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ + qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming + * requests + */ + qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; + + return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); +} + +int smc_ib_modify_qp_rts(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */ + qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */ + qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */ + qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */ + qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and + * atomic ops allowed + */ + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | + IB_QP_SQ_PSN | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC); +} + +int smc_ib_modify_qp_error(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); +} + +int smc_ib_ready_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = smc_get_lgr(lnk); + int rc = 0; + + rc = smc_ib_modify_qp_init(lnk); + if (rc) + goto out; + + rc = smc_ib_modify_qp_rtr(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + IB_CQ_SOLICITED_MASK); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + + if (lgr->role == 
SMC_SERV) { + rc = smc_ib_modify_qp_rts(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + } +out: + return rc; +} + +static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + const struct ib_gid_attr *attr; + int rc; + + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0); + if (IS_ERR(attr)) + return -ENODEV; + + rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]); + rdma_put_gid_attr(attr); + return rc; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. + */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); +} + +bool smc_ib_is_valid_local_systemid(void) +{ + return !is_zero_ether_addr(&local_systemid[2]); +} + +static void smc_ib_init_local_systemid(void) +{ + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway) +{ + struct neighbour *neigh = NULL; + struct rtable *rt = NULL; + struct flowi4 fl4 = { + .saddr = saddr, + .daddr = daddr + }; + + if (daddr == cpu_to_be32(INADDR_NONE)) + goto out; + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out; + if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) + goto out; + neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); + if (neigh) { + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + return 0; + } +out: + return -ENOENT; +} + +static int smc_ib_determine_gid_rcu(const struct net_device *ndev, + const struct ib_gid_attr *attr, + u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) +{ + if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } + if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { + struct in_device *in_dev = __in_dev_get_rcu(ndev); + struct net *net = dev_net(ndev); + const struct in_ifaddr *ifa; + bool subnet_match = false; + + if (!in_dev) + goto out; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!inet_ifa_match(smcrv2->saddr, ifa)) + continue; + subnet_match = true; + break; + } + if (!subnet_match) + goto out; + if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr, + smcrv2->daddr, + smcrv2->nexthop_mac, + &smcrv2->uses_gateway)) + goto out; + + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } +out: + return -ENODEV; +} + +/* determine the gid for an ib-device port and vlan id */ +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) +{ + const struct ib_gid_attr *attr; + const struct net_device *ndev; + int i; + + for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); + if (IS_ERR(attr)) + continue; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(attr); + if (!IS_ERR(ndev) && + 
((!vlan_id && !is_vlan_dev(ndev)) || + (vlan_id && is_vlan_dev(ndev) && + vlan_dev_vlan_id(ndev) == vlan_id))) { + if (!smc_ib_determine_gid_rcu(ndev, attr, gid, + sgid_index, smcrv2)) { + rcu_read_unlock(); + rdma_put_gid_attr(attr); + return 0; + } + } + rcu_read_unlock(); + rdma_put_gid_attr(attr); + } + return -ENODEV; +} + +/* check if gid is still defined on smcibdev */ +static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2, + struct smc_ib_device *smcibdev, u8 ibport) +{ + const struct ib_gid_attr *attr; + bool rc = false; + int i; + + for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); + if (IS_ERR(attr)) + continue; + + rcu_read_lock(); + if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) || + (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + !(ipv6_addr_type((const struct in6_addr *)&attr->gid) + & IPV6_ADDR_LINKLOCAL))) + if (!memcmp(gid, &attr->gid, SMC_GID_SIZE)) + rc = true; + rcu_read_unlock(); + rdma_put_gid_attr(attr); + } + return rc; +} + +/* check all links if the gid is still defined on smcibdev */ +static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr; + int i; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (!smc_ib_check_link_gid(lgr->lnk[i].gid, + lgr->smc_version == SMC_V2, + smcibdev, ibport)) + smcr_port_err(smcibdev, ibport); + } + } + spin_unlock_bh(&smc_lgr_list.lock); +} + +static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ + rc = smc_ib_fill_mac(smcibdev, ibport); + if (rc) + goto out; + if (!smc_ib_is_valid_local_systemid() && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + +/* process context wrapper for might_sleep smc_ib_remember_port_attr */ +static void smc_ib_port_event_work(struct work_struct *work) +{ + struct smc_ib_device *smcibdev = container_of( + work, struct smc_ib_device, port_event_work); + u8 port_idx; + + for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { + smc_ib_remember_port_attr(smcibdev, port_idx + 1); + clear_bit(port_idx, &smcibdev->port_event_mask); + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); + smcr_port_err(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); + smc_ib_gid_check(smcibdev, port_idx + 1); + } + } +} + +/* can be called in IRQ context */ +static void smc_ib_global_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + struct smc_ib_device *smcibdev; + bool schedule = false; + u8 port_idx; + + smcibdev = container_of(handler, struct smc_ib_device, event_handler); + + switch (ibevent->event) { + case IB_EVENT_DEVICE_FATAL: + /* terminate all ports on 
device */ + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { + set_bit(port_idx, &smcibdev->port_event_mask); + if (!test_and_set_bit(port_idx, + smcibdev->ports_going_away)) + schedule = true; + } + if (schedule) + schedule_work(&smcibdev->port_event_work); + break; + case IB_EVENT_PORT_ACTIVE: + port_idx = ibevent->element.port_num - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (test_and_clear_bit(port_idx, smcibdev->ports_going_away)) + schedule_work(&smcibdev->port_event_work); + break; + case IB_EVENT_PORT_ERR: + port_idx = ibevent->element.port_num - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) + schedule_work(&smcibdev->port_event_work); + break; + case IB_EVENT_GID_CHANGE: + port_idx = ibevent->element.port_num - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); + break; + default: + break; + } +} + +void smc_ib_dealloc_protection_domain(struct smc_link *lnk) +{ + if (lnk->roce_pd) + ib_dealloc_pd(lnk->roce_pd); + lnk->roce_pd = NULL; +} + +int smc_ib_create_protection_domain(struct smc_link *lnk) +{ + int rc; + + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + rc = PTR_ERR_OR_ZERO(lnk->roce_pd); + if (IS_ERR(lnk->roce_pd)) + lnk->roce_pd = NULL; + return rc; +} + +static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, + struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr; + bool rc = false; + int i; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (lgr->is_smcd) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (lgr->type == SMC_LGR_SINGLE || + lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { + rc = true; + goto out; + } + } + } +out: + spin_unlock_bh(&smc_lgr->lock); + return rc; +} + +static int smc_nl_handle_dev_port(struct sk_buff *skb, + struct ib_device *ibdev, + struct smc_ib_device *smcibdev, + int port) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *port_attrs; + unsigned char port_state; + int lnk_count = 0; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); + if (!port_attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, + smcibdev->pnetid_by_user[port])) + goto errattr; + memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, + smcibdev->ndev_ifidx[port])) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) + goto errattr; + port_state = smc_ib_port_active(smcibdev, port + 1); + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) + goto errattr; + lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) + goto errattr; + nla_nest_end(skb, port_attrs); + return 0; +errattr: + nla_nest_cancel(skb, port_attrs); +errout: + return -EMSGSIZE; +} + +static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) + return false; + if (nla_put_u16(skb, 
SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) + return false; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) + return false; + return true; +} + +static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + struct smc_pci_dev smc_pci_dev; + struct pci_dev *pci_dev; + unsigned char is_crit; + struct nlattr *attrs; + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCR); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); + if (!attrs) + goto errout; + is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) + goto errattr; + if (smcibdev->ibdev->dev.parent) { + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if (!smc_nl_handle_pci_values(&smc_pci_dev, skb)) + goto errattr; + } + snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); + if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) + goto errattr; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(smcibdev->ibdev, i)) + continue; + if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, + smcibdev, i - 1)) + goto errattr; + } + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_ib_device *smcibdev; + int snum = cb_ctx->pos[0]; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcibdev, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); + return skb->len; +} + +static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) +{ + struct smc_link *lnk = (struct smc_link *)priv; + struct smc_ib_device *smcibdev = lnk->smcibdev; + u8 port_idx; + + switch (ibevent->event) { + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_ACCESS_ERR: + port_idx = ibevent->element.qp->port - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) + schedule_work(&smcibdev->port_event_work); + break; + default: + break; + } +} + +void smc_ib_destroy_queue_pair(struct smc_link *lnk) +{ + if (lnk->roce_qp) + ib_destroy_qp(lnk->roce_qp); + lnk->roce_qp = NULL; +} + +/* create a queue pair within the protection domain for a link */ +int smc_ib_create_queue_pair(struct smc_link *lnk) +{ + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + struct ib_qp_init_attr qp_attr = { + .event_handler = smc_ib_qp_event_handler, + .qp_context = lnk, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, + .srq = NULL, + .cap = { + /* include unsolicited rdma_writes as well, + * there are max. 
2 RDMA_WRITE per 1 WR_SEND + */ + .max_send_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_send_sge = SMC_IB_MAX_SEND_SGE, + .max_recv_sge = sges_per_buf, + .max_inline_data = 0, + }, + .sq_sig_type = IB_SIGNAL_REQ_WR, + .qp_type = IB_QPT_RC, + }; + int rc; + + lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); + rc = PTR_ERR_OR_ZERO(lnk->roce_qp); + if (IS_ERR(lnk->roce_qp)) + lnk->roce_qp = NULL; + else + smc_wr_remember_qp_attr(lnk); + return rc; +} + +void smc_ib_put_memory_region(struct ib_mr *mr) +{ + ib_dereg_mr(mr); +} + +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) +{ + unsigned int offset = 0; + int sg_num; + + /* map the largest prefix of a dma mapped SG list */ + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, + &offset, PAGE_SIZE); + + return sg_num; +} + +/* Allocate a memory region and map the dma mapped SG list of buf_slot */ +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct smc_buf_desc *buf_slot, u8 link_idx) +{ + if (buf_slot->mr[link_idx]) + return 0; /* already done */ + + buf_slot->mr[link_idx] = + ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); + if (IS_ERR(buf_slot->mr[link_idx])) { + int rc; + + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; + return rc; + } + + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) + return -EINVAL; + + return 0; +} + +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot) +{ + struct scatterlist *sg; + unsigned int i; + bool ret = false; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + if (dma_need_sync(lnk->smcibdev->ibdev->dma_device, + sg_dma_address(sg))) { + ret = true; + goto out; + } + } + +out: + return ret; +} + +/* synchronize buffer usage for cpu access */ +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + struct scatterlist *sg; + unsigned int i; + + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, + sg_dma_address(sg), + sg_dma_len(sg), + data_direction); + } +} + +/* synchronize buffer usage for device access */ +void smc_ib_sync_sg_for_device(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + struct scatterlist *sg; + unsigned int i; + + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, + sg_dma_address(sg), + sg_dma_len(sg), + data_direction); + } +} + +/* Map a new TX or RX buffer SG-table to DMA */ +int smc_ib_buf_map_sg(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + int mapped_nents; + + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, + data_direction); + if (!mapped_nents) + return -ENOMEM; + + return 
mapped_nents; +} + +void smc_ib_buf_unmap_sg(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) + return; /* already unmapped */ + + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, + data_direction); + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; +} + +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) +{ + struct ib_cq_init_attr cqattr = { + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + int cqe_size_order, smc_order; + long rc; + + mutex_lock(&smcibdev->mutex); + rc = 0; + if (smcibdev->initialized) + goto out; + /* the calculated number of cq entries fits to mlx5 cq allocation */ + cqe_size_order = cache_line_size() == 128 ? 7 : 6; + smc_order = MAX_ORDER - cqe_size_order; + if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) + cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; + goto out; + } + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; + } + smc_wr_add_dev(smcibdev); + smcibdev->initialized = 1; + goto out; + +err: + ib_destroy_cq(smcibdev->roce_cq_send); +out: + mutex_unlock(&smcibdev->mutex); + return rc; +} + +static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) +{ + mutex_lock(&smcibdev->mutex); + if (!smcibdev->initialized) + goto out; + smcibdev->initialized = 0; + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); +out: + mutex_unlock(&smcibdev->mutex); +} + +static struct ib_client smc_ib_client; + +static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) +{ + struct ib_device *ibdev = smcibdev->ibdev; + struct net_device *ndev; + + if (!ibdev->ops.get_netdev) + return; + ndev = ibdev->ops.get_netdev(ibdev, port + 1); + if (ndev) { + smcibdev->ndev_ifidx[port] = ndev->ifindex; + dev_put(ndev); + } +} + +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) +{ + struct smc_ib_device *smcibdev; + struct ib_device *libdev; + struct net_device *lndev; + u8 port_cnt; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { + libdev = smcibdev->ibdev; + if (!libdev->ops.get_netdev) + continue; + lndev = libdev->ops.get_netdev(libdev, i + 1); + dev_put(lndev); + if (lndev != ndev) + continue; + if (event == NETDEV_REGISTER) + smcibdev->ndev_ifidx[i] = ndev->ifindex; + if (event == NETDEV_UNREGISTER) + smcibdev->ndev_ifidx[i] = 0; + } + } + mutex_unlock(&smc_ib_devices.mutex); +} + +/* callback function for ib_register_client() */ +static int smc_ib_add_dev(struct ib_device *ibdev) +{ + struct smc_ib_device *smcibdev; + u8 port_cnt; + int i; + + if (ibdev->node_type != RDMA_NODE_IB_CA) + return -EOPNOTSUPP; + + smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); + if (!smcibdev) + return -ENOMEM; + + smcibdev->ibdev = ibdev; + INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); + 
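+ /* finish initializing the new smc_ib_device before it is added to the
+  * global smc_ib_devices list and before IB events can reference it
+  */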
atomic_set(&smcibdev->lnk_cnt, 0); + init_waitqueue_head(&smcibdev->lnks_deleted); + mutex_init(&smcibdev->mutex); + mutex_lock(&smc_ib_devices.mutex); + list_add_tail(&smcibdev->list, &smc_ib_devices.list); + mutex_unlock(&smc_ib_devices.mutex); + ib_set_client_data(ibdev, &smc_ib_client, smcibdev); + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + + /* trigger reading of the port attributes */ + port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); + for (i = 0; + i < min_t(size_t, port_cnt, SMC_MAX_PORTS); + i++) { + set_bit(i, &smcibdev->port_event_mask); + /* determine pnetids of the port */ + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); + smc_copy_netdev_ifindex(smcibdev, i); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); + } + schedule_work(&smcibdev->port_event_work); + return 0; +} + +/* callback function for ib_unregister_client() */ +static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) +{ + struct smc_ib_device *smcibdev = client_data; + + mutex_lock(&smc_ib_devices.mutex); + list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ + mutex_unlock(&smc_ib_devices.mutex); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); + smc_smcr_terminate_all(smcibdev); + smc_ib_cleanup_per_ibdev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); + cancel_work_sync(&smcibdev->port_event_work); + kfree(smcibdev); +} + +static struct ib_client smc_ib_client = { + .name = "smc_ib", + .add = smc_ib_add_dev, + .remove = smc_ib_remove_dev, +}; + +int __init smc_ib_register_client(void) +{ + smc_ib_init_local_systemid(); + return ib_register_client(&smc_ib_client); +} + +void smc_ib_unregister_client(void) +{ + ib_unregister_client(&smc_ib_client); +} diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h new file mode 100644 index 0000000000..ef8ac2b754 --- /dev/null +++ b/net/smc/smc_ib.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for IB environment + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_IB_H +#define _SMC_IB_H + +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <rdma/ib_verbs.h> +#include <net/smc.h> + +#define SMC_MAX_PORTS 2 /* Max # of ports */ +#define SMC_GID_SIZE sizeof(union ib_gid) + +#define SMC_IB_MAX_SEND_SGE 2 + +struct smc_ib_devices { /* list of smc ib devices definition */ + struct list_head list; + struct mutex mutex; /* protects list of smc ib devices */ +}; + +extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ +extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ + +struct smc_ib_device { /* ib-device infos for smc */ + struct list_head list; + struct ib_device *ibdev; + struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. 
port attrs */ + struct ib_event_handler event_handler; /* global ib_event handler */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + char mac[SMC_MAX_PORTS][ETH_ALEN]; + /* mac address per port*/ + u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; + /* pnetid per port */ + bool pnetid_by_user[SMC_MAX_PORTS]; + /* pnetid defined by user? */ + u8 initialized : 1; /* ib dev CQ, evthdl done */ + struct work_struct port_event_work; + unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); + atomic_t lnk_cnt; /* number of links on ibdev */ + wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ + struct mutex mutex; /* protect dev setup+cleanup */ + atomic_t lnk_cnt_by_port[SMC_MAX_PORTS]; + /* number of links per port */ + int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ +}; + +static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) +{ + struct in6_addr *addr6 = (struct in6_addr *)gid; + + if (ipv6_addr_v4mapped(addr6) || + !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2])) + return addr6->s6_addr32[3]; + return cpu_to_be32(INADDR_NONE); +} + +static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev) +{ + if (smcibdev && smcibdev->ibdev) + return read_pnet(&smcibdev->ibdev->coredev.rdma_net); + return NULL; +} + +struct smc_init_info_smcrv2; +struct smc_buf_desc; +struct smc_link; + +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event); +int smc_ib_register_client(void) __init; +void smc_ib_unregister_client(void); +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_buf_map_sg(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_buf_unmap_sg(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_dealloc_protection_domain(struct smc_link *lnk); +int smc_ib_create_protection_domain(struct smc_link *lnk); +void smc_ib_destroy_queue_pair(struct smc_link *lnk); +int smc_ib_create_queue_pair(struct smc_link *lnk); +int smc_ib_ready_link(struct smc_link *lnk); +int smc_ib_modify_qp_rts(struct smc_link *lnk); +int smc_ib_modify_qp_error(struct smc_link *lnk); +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct smc_buf_desc *buf_slot, u8 link_idx); +void smc_ib_put_memory_region(struct ib_mr *mr); +bool smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot); +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_sync_sg_for_device(struct smc_link *lnk, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2); +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway); +bool smc_ib_is_valid_local_systemid(void); +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); +#endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c new file mode 100644 index 0000000000..fbee249309 --- /dev/null +++ b/net/smc/smc_ism.c @@ -0,0 +1,554 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * Functions for ISM device. + * + * Copyright IBM Corp. 2018 + */ + +#include <linux/if_vlan.h> +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <asm/page.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_pnet.h" +#include "smc_netlink.h" +#include "linux/ism.h" + +struct smcd_dev_list smcd_dev_list = { + .list = LIST_HEAD_INIT(smcd_dev_list.list), + .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) +}; + +static bool smc_ism_v2_capable; +static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; + +#if IS_ENABLED(CONFIG_ISM) +static void smcd_register_dev(struct ism_dev *ism); +static void smcd_unregister_dev(struct ism_dev *ism); +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask); + +static struct ism_client smc_ism_client = { + .name = "SMC-D", + .add = smcd_register_dev, + .remove = smcd_unregister_dev, + .handle_event = smcd_handle_event, + .handle_irq = smcd_handle_irq, +}; +#endif + +/* Test if an ISM communication is possible - same CPC */ +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +{ + return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, + vlan_id); +} + +void smc_ism_get_system_eid(u8 **eid) +{ + if (!smc_ism_v2_capable) + *eid = NULL; + else + *eid = smc_ism_v2_system_eid; +} + +u16 smc_ism_get_chid(struct smcd_dev *smcd) +{ + return smcd->ops->get_chid(smcd); +} + +/* HW supports ISM V2 and thus System EID is defined */ +bool smc_ism_is_v2_capable(void) +{ + return smc_ism_v2_capable; +} + +/* Set a connection using this DMBE. */ +void smc_ism_set_conn(struct smc_connection *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Unset a connection using this DMBE. */ +void smc_ism_unset_conn(struct smc_connection *conn) +{ + unsigned long flags; + + if (!conn->rmb_desc) + return; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Register a VLAN identifier with the ISM device. Use a reference count + * and add a VLAN identifier only when the first DMB using this VLAN is + * registered. + */ +int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *new_vlan, *vlan; + unsigned long flags; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + /* create new vlan entry, in case we need it */ + new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + new_vlan->vlanid = vlanid; + refcount_set(&new_vlan->refcnt, 1); + + /* if there is an existing entry, increase count and return */ + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + refcount_inc(&vlan->refcnt); + kfree(new_vlan); + goto out; + } + } + + /* no existing entry found. 
+ * add new entry to device; might fail, e.g., if HW limit reached + */ + if (smcd->ops->add_vlan_id(smcd, vlanid)) { + kfree(new_vlan); + rc = -EIO; + goto out; + } + list_add_tail(&new_vlan->list, &smcd->vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +/* Unregister a VLAN identifier with the ISM device. Use a reference count + * and remove a VLAN identifier only when the last DMB using this VLAN is + * unregistered. + */ +int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *vlan; + unsigned long flags; + bool found = false; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + if (!refcount_dec_and_test(&vlan->refcnt)) + goto out; + found = true; + break; + } + } + if (!found) { + rc = -ENOENT; + goto out; /* VLAN id not in table */ + } + + /* Found and the last reference just gone */ + if (smcd->ops->del_vlan_id(smcd, vlanid)) + rc = -EIO; + list_del(&vlan->list); + kfree(vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc = 0; + + if (!dmb_desc->dma_addr) + return rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = dmb_desc->token; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.cpu_addr = dmb_desc->cpu_addr; + dmb.dma_addr = dmb_desc->dma_addr; + dmb.dmb_len = dmb_desc->len; + rc = smcd->ops->unregister_dmb(smcd, &dmb); + if (!rc || rc == ISM_ERROR) { + dmb_desc->cpu_addr = NULL; + dmb_desc->dma_addr = 0; + } + + return rc; +} + +int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, + struct smc_buf_desc *dmb_desc) +{ +#if IS_ENABLED(CONFIG_ISM) + struct smcd_dmb dmb; + int rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_len = dmb_len; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.vlan_id = lgr->vlan_id; + dmb.rgid = lgr->peer_gid; + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +#else + return 0; +#endif +} + +static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smc_pci_dev smc_pci_dev; + struct nlattr *port_attrs; + struct nlattr *attrs; + struct ism_dev *ism; + int use_cnt = 0; + void *nlh; + + ism = smcd->priv; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCD); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); + if (!attrs) + goto errout; + use_cnt = atomic_read(&smcd->lgr_cnt); + if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) + goto errattr; + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) + goto errattr; + if 
(nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) + goto errattr; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); + if (!port_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) + goto errportattr; + memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errportattr; + + nla_nest_end(skb, port_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errportattr: + nla_nest_cancel(skb, port_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + nlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int snum = cb_ctx->pos[0]; + struct smcd_dev *smcd; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcd_dev(smcd, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + +#if IS_ENABLED(CONFIG_ISM) +struct smc_ism_event_work { + struct work_struct work; + struct smcd_dev *smcd; + struct ism_event event; +}; + +#define ISM_EVENT_REQUEST 0x0001 +#define ISM_EVENT_RESPONSE 0x0002 +#define ISM_EVENT_REQUEST_IR 0x00000001 +#define ISM_EVENT_CODE_SHUTDOWN 0x80 +#define ISM_EVENT_CODE_TESTLINK 0x83 + +union smcd_sw_event_info { + u64 info; + struct { + u8 uid[SMC_LGR_ID_SIZE]; + unsigned short vlan_id; + u16 code; + }; +}; + +static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) +{ + union smcd_sw_event_info ev_info; + + ev_info.info = wrk->event.info; + switch (wrk->event.code) { + case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); + break; + case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ + if (ev_info.code == ISM_EVENT_REQUEST) { + ev_info.code = ISM_EVENT_RESPONSE; + wrk->smcd->ops->signal_event(wrk->smcd, + wrk->event.tok, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } + break; + } +} + +/* worker for SMC-D events */ +static void smc_ism_event_work(struct work_struct *work) +{ + struct smc_ism_event_work *wrk = + container_of(work, struct smc_ism_event_work, work); + + switch (wrk->event.type) { + case ISM_EVENT_GID: /* GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); + break; + case ISM_EVENT_DMB: + break; + case ISM_EVENT_SWR: /* Software defined event */ + smcd_handle_sw_event(wrk); + break; + } + kfree(wrk); +} + +static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + smcd->conn = devm_kcalloc(parent, max_dmbs, + sizeof(struct smc_connection *), GFP_KERNEL); + if (!smcd->conn) + return NULL; + + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", + WQ_MEM_RECLAIM, name); + if (!smcd->event_wq) + return NULL; + + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); + INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + 
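+ /* lgrs_deleted lets teardown wait until every link group using this
+  * device has been freed
+  */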
init_waitqueue_head(&smcd->lgrs_deleted); + return smcd; +} + +static void smcd_register_dev(struct ism_dev *ism) +{ + const struct smcd_ops *ops = ism_get_smcd_ops(); + struct smcd_dev *smcd; + + if (!ops) + return; + + smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + if (!smcd) + return; + smcd->priv = ism; + ism_set_priv(ism, &smc_ism_client, smcd); + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + + mutex_lock(&smcd_dev_list.mutex); + if (list_empty(&smcd_dev_list.list)) { + u8 *system_eid = NULL; + + system_eid = smcd->ops->get_system_eid(); + if (smcd->ops->supports_v2()) { + smc_ism_v2_capable = true; + memcpy(smc_ism_v2_system_eid, system_eid, + SMC_MAX_EID_LEN); + } + } + /* sort list: devices without pnetid before devices with pnetid */ + if (smcd->pnetid[0]) + list_add_tail(&smcd->list, &smcd_dev_list.list); + else + list_add(&smcd->list, &smcd_dev_list.list); + mutex_unlock(&smcd_dev_list.mutex); + + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? " (user defined)" : ""); + + return; +} + +static void smcd_unregister_dev(struct ism_dev *ism) +{ + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ism->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); + mutex_lock(&smcd_dev_list.mutex); + list_del_init(&smcd->list); + mutex_unlock(&smcd_dev_list.mutex); + destroy_workqueue(smcd->event_wq); +} + +/* SMCD Device event handler. Called from ISM device interrupt handler. + * Parameters are ism device pointer, + * - event->type (0 --> DMB, 1 --> GID), + * - event->code (event code), + * - event->tok (either DMB token when event type 0, or GID when event type 1) + * - event->time (time of day) + * - event->info (debug info). + * + * Context: + * - Function called in IRQ context from ISM device driver event handler. + */ +static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +{ + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smc_ism_event_work *wrk; + + if (smcd->going_away) + return; + /* copy event to event work queue, and let it be handled there */ + wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + if (!wrk) + return; + INIT_WORK(&wrk->work, smc_ism_event_work); + wrk->smcd = smcd; + wrk->event = *event; + queue_work(smcd->event_wq, &wrk->work); +} + +/* SMCD Device interrupt handler. Called from ISM device interrupt handler. + * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. + * Find the connection and schedule the tasklet for this connection. + * + * Context: + * - Function called in IRQ context from ISM device driver IRQ handler. 
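+ * - Looks up the connection under smcd->lock and schedules its RX
+ *   tasklet only if the connection has not been killed yet.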
+ */ +static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, + u16 dmbemask) +{ + struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smc_connection *conn = NULL; + unsigned long flags; + + spin_lock_irqsave(&smcd->lock, flags); + conn = smcd->conn[dmbno]; + if (conn && !conn->killed) + tasklet_schedule(&conn->rx_tsklet); + spin_unlock_irqrestore(&smcd->lock, flags); +} +#endif + +int smc_ism_signal_shutdown(struct smc_link_group *lgr) +{ + int rc = 0; +#if IS_ENABLED(CONFIG_ISM) + union smcd_sw_event_info ev_info; + + if (lgr->peer_shutdown) + return 0; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); +#endif + return rc; +} + +int smc_ism_init(void) +{ + int rc = 0; + +#if IS_ENABLED(CONFIG_ISM) + smc_ism_v2_capable = false; + memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); + + rc = ism_register_client(&smc_ism_client); +#endif + return rc; +} + +void smc_ism_exit(void) +{ +#if IS_ENABLED(CONFIG_ISM) + ism_unregister_client(&smc_ism_client); +#endif +} diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h new file mode 100644 index 0000000000..832b2f42d7 --- /dev/null +++ b/net/smc/smc_ism.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * SMC-D ISM device structure definitions. + * + * Copyright IBM Corp. 2018 + */ + +#ifndef SMCD_ISM_H +#define SMCD_ISM_H + +#include <linux/uio.h> +#include <linux/types.h> +#include <linux/mutex.h> + +#include "smc.h" + +struct smcd_dev_list { /* List of SMCD devices */ + struct list_head list; + struct mutex mutex; /* Protects list of devices */ +}; + +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ + +struct smc_ism_vlanid { /* VLAN id set on ISM device */ + struct list_head list; + unsigned short vlanid; /* Vlan id */ + refcount_t refcnt; /* Reference count */ +}; + +struct smcd_dev; + +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +void smc_ism_set_conn(struct smc_connection *conn); +void smc_ism_unset_conn(struct smc_connection *conn); +int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, + struct smc_buf_desc *dmb_desc); +int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +int smc_ism_signal_shutdown(struct smc_link_group *lgr); +void smc_ism_get_system_eid(u8 **eid); +u16 smc_ism_get_chid(struct smcd_dev *dev); +bool smc_ism_is_v2_capable(void); +int smc_ism_init(void); +void smc_ism_exit(void); +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); + +static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + return rc < 0 ? rc : 0; +} + +#endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c new file mode 100644 index 0000000000..018ce8133b --- /dev/null +++ b/net/smc/smc_llc.c @@ -0,0 +1,2365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Link Layer Control (LLC) + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <net/tcp.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_llc.h" +#include "smc_pnet.h" + +#define SMC_LLC_DATA_LEN 40 + +struct smc_llc_hdr { + struct smc_wr_rx_hdr common; + union { + struct { + u8 length; /* 44 */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 add_link_rej_rsn:4, + reserved:4; +#endif + }; + u16 length_v2; /* 44 - 8192*/ + }; + u8 flags; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 + +struct smc_llc_msg_confirm_link { /* type 0x01 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 link_uid[SMC_LGR_ID_SIZE]; + u8 max_links; + u8 max_conns; + u8 reserved[8]; +}; + +#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 +#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 + +struct smc_llc_msg_add_link { /* type 0x02 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 reserved2[2]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif + u8 initial_psn[3]; + u8 reserved[8]; +}; + +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +struct smc_llc_msg_add_link_v2_ext { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; +#endif + u8 reserved2; + u8 client_target_gid[SMC_GID_SIZE]; + u8 reserved3[8]; + u16 num_rkeys; + struct smc_llc_msg_add_link_cont_rt rt[]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_llc_msg_req_add_link_v2 { + struct smc_llc_hdr hd; + u8 reserved[20]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + +#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 +#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 + +struct smc_llc_msg_del_link { /* type 0x04 */ + struct smc_llc_hdr hd; + u8 link_num; + __be32 reason; + u8 reserved[35]; +} __packed; /* format defined in RFC7609 */ + +struct smc_llc_msg_test_link { /* type 0x07 */ + struct smc_llc_hdr hd; + u8 user_data[16]; + u8 reserved[24]; +}; + +struct smc_rmb_rtoken { + union { + u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */ + /* is actually the num of rtokens, first */ + /* rtoken is always for the current link */ + u8 link_id; /* link id of the rtoken */ + }; + __be32 rmb_key; + __be64 rmb_vaddr; +} __packed; /* format defined in RFC7609 */ + +#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG_V2 255 + +struct smc_llc_msg_confirm_rkey { /* type 0x06 */ + struct smc_llc_hdr hd; + struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; + u8 reserved; +}; + +#define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 +#define 
SMC_LLC_FLAG_RKEY_NEG 0x20 + +struct smc_llc_msg_delete_rkey { /* type 0x09 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 err_mask; + u8 reserved[2]; + __be32 rkey[8]; + u8 reserved2[4]; +}; + +struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 num_inval_rkeys; + u8 reserved[2]; + __be32 rkey[]; +}; + +union smc_llc_msg { + struct smc_llc_msg_confirm_link confirm_link; + struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_req_add_link_v2 req_add_link; + struct smc_llc_msg_add_link_cont add_link_cont; + struct smc_llc_msg_del_link delete_link; + + struct smc_llc_msg_confirm_rkey confirm_rkey; + struct smc_llc_msg_delete_rkey delete_rkey; + + struct smc_llc_msg_test_link test_link; + struct { + struct smc_llc_hdr hdr; + u8 data[SMC_LLC_DATA_LEN]; + } raw; +}; + +#define SMC_LLC_FLAG_RESP 0x80 + +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, + struct smc_llc_qentry *qentry) +{ + u8 msg_type = qentry->msg.raw.hdr.common.llc_type; + + if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) && + flow_type != msg_type && !lgr->delayed_event) { + lgr->delayed_event = qentry; + return; + } + /* drop parallel or already-in-progress llc requests */ + if (flow_type != msg_type) + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel " + "LLC msg: msg %d flow %d role %d\n", + SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, + qentry->msg.raw.hdr.common.type, + flow_type, lgr->role); + kfree(qentry); +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + smc_llc_flow_parallel(lgr, flow->type, qentry); + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.llc_type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + smc_llc_flow_qentry_set(flow, qentry); + spin_unlock_bh(&lgr->llc_flow_lock); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + 
return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote))), + SMC_LLC_WAIT_TIME * 10); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up(&lgr->llc_flow_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + u8 rcv_msg; + + wait_event_timeout(lgr->llc_msg_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type; + if (exp_msg && rcv_msg != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + rcv_msg == SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: " + "msg %d exp %d flow %d role %d flags %x\n", + SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, + rcv_msg, exp_msg, + flow->type, lgr->role, + flow->qentry->msg.raw.hdr.flags); + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + +/********************************** send *************************************/ + +struct smc_llc_tx_pend { +}; + +/* handler for send/transmission completion of an LLC msg */ +static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + /* future work: handle wc_status error for recovery and failover */ +} + +/** + * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits + * @link: Pointer to SMC link used for sending LLC control message. + * @wr_buf: Out variable returning pointer to work request payload buffer. + * @pend: Out variable returning pointer to private pending WR tracking. + * It's the context the transmit complete handler will get. + * + * Reserves and pre-fills an entry for a pending work request send/tx. + * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx. + * Can sleep due to smc_get_ctrl_buf (if not in softirq context). + * + * Return: 0 on success, otherwise an error value. 
+ */ +static int smc_llc_add_pending_send(struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL, + pend); + if (rc < 0) + return rc; + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)"); + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)"); + return 0; +} + +static int smc_llc_add_pending_send_v2(struct smc_link *link, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + return 0; +} + +static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr, + struct smc_link_group *lgr, size_t len) +{ + if (lgr->smc_version == SMC_V2) { + hdr->common.llc_version = SMC_V2; + hdr->length_v2 = len; + } else { + hdr->common.llc_version = 0; + hdr->length = len; + } +} + +/* high-level API to send LLC confirm link */ +int smc_llc_send_confirm_link(struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_confirm_link *confllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + confllc = (struct smc_llc_msg_confirm_link *)wr_buf; + memset(confllc, 0, sizeof(*confllc)); + confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK; + smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc)); + confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; + if (reqresp == SMC_LLC_RESP) + confllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); + hton24(confllc->sender_qp_num, link->roce_qp->qp_num); + confllc->link_num = link->link_id; + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LINKS_ADD_LNK_MAX; + if (link->lgr->smc_version == SMC_V2 && + link->lgr->peer_smc_release >= SMC_RELEASE_1) { + confllc->max_conns = link->lgr->max_conns; + confllc->max_links = link->lgr->max_links; + } + /* send llc message */ + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* send LLC confirm rkey request */ +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_msg_confirm_rkey *rkeyllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_link *link; + int i, rc, rtok_ix; + + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); + if (rc) + goto put_out; + rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc)); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if 
(smc_link_active(link) && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; + rkeyllc->rtoken[0].rmb_key = + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); + /* send llc message */ + rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); + return rc; +} + +/* send LLC delete rkey request */ +static int smc_llc_send_delete_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_msg_delete_rkey *rkeyllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); + rkeyllc->num_rkeys = 1; + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos) + return _smc_llc_get_next_rmb(lgr, buf_lst); + + if (list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, + struct smc_link *link, struct smc_link *link_new) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_buf_desc *buf_pos; + int prim_lnk_idx, lnk_idx, i; + struct smc_buf_desc *rmb; + int len = sizeof(*ext); + int buf_lst; + + ext->v2_direct = !lgr->uses_gateway; + memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + down_write(&lgr->rmbs_lock); + ext->num_rkeys = lgr->conns_num; + if (!ext->num_rkeys) + goto out; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + for (i = 0; i < ext->num_rkeys; i++) { + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + if (!buf_pos) + break; + rmb = buf_pos; + ext->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + 
ext->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + ext->rt[i].rmb_vaddr_new = rmb->is_vm ? + cpu_to_be64((uintptr_t)rmb->cpu_addr) : + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + } + len += i * sizeof(ext->rt[0]); +out: + up_write(&lgr->rmbs_lock); + return len; +} + +/* send ADD LINK request or response */ +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_add_link_v2_ext *ext = NULL; + struct smc_llc_msg_add_link *addllc; + struct smc_wr_tx_pend_priv *pend; + int len = sizeof(*addllc); + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + if (link->lgr->smc_version == SMC_V2) { + struct smc_wr_v2_buf *wr_buf; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + ext = (struct smc_llc_msg_add_link_v2_ext *) + &wr_buf->raw[sizeof(*addllc)]; + memset(ext, 0, SMC_WR_TX_SIZE); + } else { + struct smc_wr_buf *wr_buf; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + } + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.llc_type = SMC_LLC_ADD_LINK; + if (reqresp == SMC_LLC_RESP) + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } + if (ext && link_new) + len += smc_llc_fill_ext_v2(ext, link, link_new); + smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len); + /* send llc message */ + if (link->lgr->smc_version == SMC_V2) + rc = smc_wr_tx_v2_send(link, pend, len); + else + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* send DELETE LINK request or response */ +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) +{ + struct smc_llc_msg_del_link *delllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + delllc = (struct smc_llc_msg_del_link *)wr_buf; + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc)); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* send LLC test link request */ +static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) +{ + struct smc_llc_msg_test_link *testllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + 
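/*
 * TEST LINK round trip: smc_llc_testlink_work() sends this request when
 * nothing has been received on the link for llc_testlink_time, the peer
 * echoes it with SMC_LLC_FLAG_RESP set, and smc_llc_rx_response()
 * completes link->llc_testlink_resp.  If no echo arrives within
 * SMC_LLC_WAIT_TIME, the worker takes the link down via
 * smcr_link_down_cond_sched().
 */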
testllc = (struct smc_llc_msg_test_link *)wr_buf; + memset(testllc, 0, sizeof(*testllc)); + testllc->hd.common.llc_type = SMC_LLC_TEST_LINK; + smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc)); + memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. + * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +/********************************* receive ***********************************/ + +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) +{ + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} + +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; + + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; + } + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) : + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + down_write(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + up_write(&lgr->rmbs_lock); + return rc; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_save_peer_uid(qentry); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, 
SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + return 0; +} + +static void smc_llc_save_add_link_rkeys(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_v2_ext *ext; + struct smc_link_group *lgr = link->lgr; + int max, i; + + ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + SMC_WR_TX_SIZE); + max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + down_write(&lgr->rmbs_lock); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + ext->rt[i].rmb_key, + ext->rt[i].rmb_vaddr_new, + ext->rt[i].rmb_key_new); + } + up_write(&lgr->rmbs_lock); +} + +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) +{ + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; + struct smc_link *lnk_new = NULL; + int lnk_idx, rc = 0; + + if (!llc->qp_mtu) + goto out_reject; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out_reject; + } + + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out_reject; + } + + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid); + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + (lgr->smc_version == SMC_V2 || + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) { + if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); + + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[lnk_new->ibport - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, lnk_new); + } else { + rc = smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; + } + 
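/*
 * SMC-R V1 path: the rkeys for all existing RMBs were just exchanged
 * with ADD_LINK_CONT messages (at most SMC_LLC_RKEYS_PER_CONT_MSG
 * rtoken pairs per message) in smc_llc_cli_rkey_exchange().  On the
 * V2 path above, the same information arrived in the ADD LINK v2
 * extension and was stored by smc_llc_save_add_link_rkeys() instead.
 */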
} + rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t); + if (!rc) + goto out; +out_clear_lnk: + lnk_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(lnk_new, false); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(ini); + kfree(qentry); + return rc; +} + +static void smc_llc_send_request_add_link(struct smc_link *link) +{ + struct smc_llc_msg_req_add_link_v2 *llc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_v2_buf *wr_buf; + struct smc_gidlist gidlist; + int rc, len, i; + + if (!smc_wr_tx_link_hold(link)) + return; + if (link->lgr->type == SMC_LGR_SYMMETRIC || + link->lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto put_out; + + smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid); + if (gidlist.len <= 1) + goto put_out; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf; + memset(llc, 0, SMC_WR_TX_SIZE); + + llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK; + for (i = 0; i < gidlist.len; i++) + memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0])); + llc->gid_cnt = gidlist.len; + len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0])); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, len); + rc = smc_wr_tx_v2_send(link, pend, len); + if (!rc) + /* set REQ_ADD_LINK flow and wait for response from peer */ + link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK; +put_out: + smc_wr_tx_link_put(link); +} + +/* as an SMC client, invite server to start the add_link processing */ +static void smc_llc_cli_add_link_invite(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; + + if (lgr->smc_version == SMC_V2) { + smc_llc_send_request_add_link(link); + goto out; + } + + if (lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto out; + + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + goto out; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) + goto out; + + ini->vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!ini->ib_dev) + goto out; + + smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1], + ini->ib_gid, NULL, SMC_LLC_REQ); +out: + kfree(ini); + kfree(qentry); +} + +static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++) + if (llc->raw.data[i]) + return false; + return true; +} + +static bool smc_llc_is_local_add_link(union smc_llc_msg *llc) +{ + if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK && + smc_llc_is_empty_llc_message(llc)) + return true; + return false; +} + +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + down_write(&lgr->llc_conf_mutex); + if (smc_llc_is_local_add_link(&qentry->msg)) + smc_llc_cli_add_link_invite(qentry->link, qentry); + else + smc_llc_cli_add_link(qentry->link, qentry); + up_write(&lgr->llc_conf_mutex); +} + +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine 
asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } + } + if (found) + break; + } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; + } + } +out: + return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx]; +} + +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; + + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + lnk_new = smc_switch_conns(lgr, lnk_asym, false); + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym, true); +} + +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + down_write(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + up_write(&lgr->rmbs_lock); + return rc; +} + +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry || + qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { + /* send DELETE LINK */ + 
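/*
 * No (or an unexpected) CONFIRM LINK response arrived within
 * SMC_LLC_WAIT_FIRST_TIME, so the server gives up on the new link:
 * it is deleted with reason SMC_LLC_DEL_LOST_PATH and the add-link
 * flow fails with -ENOLINK.
 */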
smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_save_peer_uid(qentry); + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; +} + +static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data)); + smc_llc_send_message(qentry->link, &qentry->msg); +} + +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry) +{ + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + bool send_req_add_link_resp = false; + struct smc_link *link_new = NULL; + struct smc_init_info *ini = NULL; + int lnk_idx, rc = 0; + + if (req_qentry && + req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK) + send_req_add_link_resp = true; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out; + } + + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out; + } + + /* ignore client add link recommendation, start new flow */ + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + if (send_req_add_link_resp) { + struct smc_llc_msg_req_add_link_v2 *req_add = + &req_qentry->msg.req_add_link; + + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]); + } + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) { + rc = 0; + goto out; + } + + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); + if (rc) + goto out; + link_new = &lgr->lnk[lnk_idx]; + + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[link_new->ibport-1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + send_req_add_link_resp = false; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; + } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + (lgr->smc_version == SMC_V2 || + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = 
smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, link_new); + } else { + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + } + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); + if (rc) + goto out_err; + kfree(ini); + return 0; +out_err: + if (link_new) { + link_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(link_new, false); + } +out: + kfree(ini); + if (send_req_add_link_resp) + smc_llc_send_req_add_link_response(req_qentry); + return rc; +} + +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) +{ + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + struct smc_llc_qentry *qentry; + int rc; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + down_write(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link, qentry); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + smc_llc_delete_asym_link(lgr); + } + up_write(&lgr->llc_conf_mutex); + kfree(qentry); +} + +/* enqueue a local add_link req to trigger a new add_link flow */ +void smc_llc_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {}; + + add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK; + smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc)); + /* no dev and port needed */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); + else + smc_llc_process_srv_add_link(lgr); +out: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) +{ + struct smc_llc_msg_del_link del_llc = {}; + + del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc)); + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + down_write(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + 
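/*
 * Tear the link down: smc_link_downing() flips its state from
 * SMC_LNK_ACTIVE to SMC_LNK_INACTIVE (see the cmpxchg helper in
 * smc_llc.h), smc_switch_conns() migrates the connections that still
 * use it, and smcr_link_clear() releases it.  The link group type is
 * then recomputed from the remaining active links: one active link
 * means SMC_LGR_SINGLE, none means SMC_LGR_NONE plus scheduled
 * termination; removing the expected asymmetric link leaves the type
 * unchanged.
 */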
if (smc_link_downing(&lnk_del->state)) + smc_switch_conns(lgr, lnk_del, false); + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); + } +out_unlock: + up_write(&lgr->llc_conf_mutex); +out: + kfree(qentry); +} + +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) +{ + struct smc_llc_msg_del_link delllc = {}; + int i; + + delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc)); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_sendable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} + +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + down_write(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; + } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ + + if (smc_link_downing(&lnk_del->state)) { + if (smc_switch_conns(lgr, lnk_del, false)) + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if (!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (qentry2) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } + } + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); + } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + smc_llc_add_link_local(lnk); + } +out: + up_write(&lgr->llc_conf_mutex); + kfree(qentry); +} + +static void smc_llc_delete_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + +/* process a confirm_rkey 
request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + if (num_entries > SMC_LLC_RKEYS_PER_MSG) + goto out_err; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + u8 err_mask = 0; + int i, max; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + + if (lgr->smc_version == SMC_V2) { + struct smc_llc_msg_delete_rkey_v2 *llcv2; + + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + llcv2->num_inval_rkeys = 0; + + max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llcv2->rkey[i])) + llcv2->num_inval_rkeys++; + } + memset(&llc->rkey[0], 0, sizeof(llc->rkey)); + memset(&llc->reserved2, 0, sizeof(llc->reserved2)); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + if (llcv2->num_inval_rkeys) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = llcv2->num_inval_rkeys; + } + goto finish; + } + + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } +finish: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} + +/* flush the llc event queue */ +static void smc_llc_event_flush(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; + + if (!smc_link_usable(link)) + 
goto out; + + switch (llc->raw.hdr.common.llc_type) { + case SMC_LLC_TEST_LINK: + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); + break; + case SMC_LLC_ADD_LINK: + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (smc_llc_is_local_add_link(llc)) { + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_ADD_LINK) + break; /* add_link in progress */ + if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + return; + } + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up(&lgr->llc_msg_waiter); + return; + } + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_REQ_ADD_LINK) { + /* server started add_link processing */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + schedule_work(&lgr->llc_add_link_work); + return; + } + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + schedule_work(&lgr->llc_add_link_work); + } + return; + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up(&lgr->llc_msg_waiter); + return; + } + break; + case SMC_LLC_DELETE_LINK: + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up(&lgr->llc_msg_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + return; + case SMC_LLC_CONFIRM_RKEY: + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ + break; + case SMC_LLC_DELETE_RKEY: + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; + case SMC_LLC_REQ_ADD_LINK: + /* handle response here, smc_llc_flow_stop() cannot be called + * in tasklet context + */ + if (lgr->role == SMC_CLNT && + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK && + (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) { + smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl); + } else if (lgr->role == SMC_SERV) { + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + schedule_work(&lgr->llc_add_link_work); + } + return; + } + break; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); + break; + } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct 
smc_llc_qentry *qentry; + + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + if (smc_link_usable(qentry->link)) + smc_llc_event_handler(qentry); + else + kfree(qentry); + } + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type; + struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl; + u8 llc_type = qentry->msg.raw.hdr.common.llc_type; + + switch (llc_type) { + case SMC_LLC_TEST_LINK: + if (smc_link_active(link)) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_ADD_LINK: + case SMC_LLC_ADD_LINK_CONT: + case SMC_LLC_CONFIRM_LINK: + if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_DELETE_LINK: + if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* not used because max links is 3 */ + break; + default: + smc_llc_protocol_violation(link->lgr, + qentry->msg.raw.hdr.common.type); + break; + } + kfree(qentry); + return; +assign: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up(&link->lgr->llc_msg_waiter); +} + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + unsigned long flags; + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if ((llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) && + llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + queue_work(system_highpri_wq, &lgr->llc_event_work); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (!llc->raw.hdr.common.llc_version) { + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + } else { + if (llc->raw.hdr.length_v2 < sizeof(*llc)) + return; /* invalid message */ + } + + smc_llc_enqueue(link, llc); +} + +/***************************** worker, utils *********************************/ + +static void smc_llc_testlink_work(struct work_struct *work) +{ + struct smc_link *link = container_of(to_delayed_work(work), + struct smc_link, llc_testlink_wrk); + unsigned long next_interval; + unsigned long 
expire_time; + u8 user_data[16] = { 0 }; + int rc; + + if (!smc_link_active(link)) + return; /* don't reschedule worker */ + expire_time = link->wr_rx_tstamp + link->llc_testlink_time; + if (time_is_after_jiffies(expire_time)) { + next_interval = expire_time - jiffies; + goto out; + } + reinit_completion(&link->llc_testlink_resp); + smc_llc_send_test_link(link, user_data); + /* receive TEST LINK response over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, + SMC_LLC_WAIT_TIME); + if (!smc_link_active(link)) + return; /* link state changed */ + if (rc <= 0) { + smcr_link_down_cond_sched(link); + return; + } + next_interval = link->llc_testlink_time; +out: + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); +} + +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) +{ + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_flow_waiter); + init_waitqueue_head(&lgr->llc_msg_waiter); + init_rwsem(&lgr->llc_conf_mutex); + lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); +} + +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) +{ + smc_llc_event_flush(lgr); + wake_up_all(&lgr->llc_flow_waiter); + wake_up_all(&lgr->llc_msg_waiter); + cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; + } +} + +int smc_llc_link_init(struct smc_link *link) +{ + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; +} + +void smc_llc_link_active(struct smc_link *link) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + link->state = SMC_LNK_ACTIVE; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time; + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); + } +} + +/* called in worker context */ +void smc_llc_link_clear(struct smc_link *link, bool log) +{ + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); +} + +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); + if (rc) + goto out; + /* receive CONFIRM RKEY response from server over RoCE fabric */ + qentry = smc_llc_wait(lgr, send_link, 
SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return rc; +} + +/* unregister an rtoken at the remote peer */ +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc) +{ + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; + int rc = 0; + + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); + if (rc) + goto out; + /* receive DELETE RKEY response from server over RoCE fabric */ + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return rc; +} + +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + +/***************************** init, exit, misc ******************************/ + +static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_CONT + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY + }, + /* V2 types */ + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_REQ_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY_V2 + }, + { + .handler = NULL, + } +}; + +int __init smc_llc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_llc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h new file mode 100644 index 0000000000..7e7a3162c6 --- /dev/null +++ b/net/smc/smc_llc.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared 
Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for LLC (link layer control) message handling + * + * Copyright IBM Corp. 2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_LLC_H +#define SMC_LLC_H + +#include "smc_wr.h" + +#define SMC_LLC_FLAG_RESP 0x80 + +#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) +#define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ) + +enum smc_llc_reqresp { + SMC_LLC_REQ, + SMC_LLC_RESP +}; + +enum smc_llc_msg_type { + SMC_LLC_CONFIRM_LINK = 0x01, + SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, + SMC_LLC_DELETE_LINK = 0x04, + SMC_LLC_REQ_ADD_LINK = 0x05, + SMC_LLC_CONFIRM_RKEY = 0x06, + SMC_LLC_TEST_LINK = 0x07, + SMC_LLC_CONFIRM_RKEY_CONT = 0x08, + SMC_LLC_DELETE_RKEY = 0x09, + /* V2 types */ + SMC_LLC_CONFIRM_LINK_V2 = 0x21, + SMC_LLC_ADD_LINK_V2 = 0x22, + SMC_LLC_DELETE_LINK_V2 = 0x24, + SMC_LLC_REQ_ADD_LINK_V2 = 0x25, + SMC_LLC_CONFIRM_RKEY_V2 = 0x26, + SMC_LLC_TEST_LINK_V2 = 0x27, + SMC_LLC_DELETE_RKEY_V2 = 0x29, +}; + +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + +/* transmit */ +int smc_llc_send_confirm_link(struct smc_link *lnk, + enum smc_llc_reqresp reqresp); +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, + enum smc_llc_reqresp reqresp); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); +int smc_llc_link_init(struct smc_link *link); +void smc_llc_link_active(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); +int smc_llc_do_confirm_rkey(struct smc_link *send_link, + struct smc_buf_desc *rmb_desc); +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct 
smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry); +void smc_llc_add_link_local(struct smc_link *link); +int smc_llc_init(void) __init; + +#endif /* SMC_LLC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c new file mode 100644 index 0000000000..621c46c700 --- /dev/null +++ b/net/smc/smc_netlink.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to interact with SMC module + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/if.h> +#include <linux/smc.h> + +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_ib.h" +#include "smc_clc.h" +#include "smc_stats.h" +#include "smc_netlink.h" + +const struct nla_policy +smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { + [SMC_NLA_EID_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [SMC_NLA_EID_TABLE_ENTRY] = { .type = NLA_STRING, + .len = SMC_MAX_EID_LEN, + }, +}; + +#define SMC_CMD_MAX_ATTR 1 +/* SMC_GENL generic netlink operation definition */ +static const struct genl_ops smc_gen_nl_ops[] = { + { + .cmd = SMC_NETLINK_GET_SYS_INFO, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_sys_info, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_LINK_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_link, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_device, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_device, + }, + { + .cmd = SMC_NETLINK_GET_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_stats, + }, + { + .cmd = SMC_NETLINK_GET_FBACK_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_fback_stats, + }, + { + .cmd = SMC_NETLINK_DUMP_UEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_ueid, + }, + { + .cmd = SMC_NETLINK_ADD_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_add_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_REMOVE_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_remove_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_FLUSH_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_flush_ueid, + }, + { + .cmd = SMC_NETLINK_DUMP_SEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_seid, + }, + { + .cmd = SMC_NETLINK_ENABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_seid, + }, + { + .cmd = SMC_NETLINK_DISABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_seid, + }, + { + .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_hs_limitation, + }, + { + .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, + .flags = 
GENL_ADMIN_PERM, + .doit = smc_nl_enable_hs_limitation, + }, + { + .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_hs_limitation, + }, +}; + +static const struct nla_policy smc_gen_nl_policy[2] = { + [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, +}; + +/* SMC_GENL family definition */ +struct genl_family smc_gen_nl_family __ro_after_init = { + .hdrsize = 0, + .name = SMC_GENL_FAMILY_NAME, + .version = SMC_GENL_FAMILY_VERSION, + .maxattr = SMC_CMD_MAX_ATTR, + .policy = smc_gen_nl_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_gen_nl_ops, + .n_ops = ARRAY_SIZE(smc_gen_nl_ops), + .resv_start_op = SMC_NETLINK_DISABLE_HS_LIMITATION + 1, +}; + +int __init smc_nl_init(void) +{ + return genl_register_family(&smc_gen_nl_family); +} + +void smc_nl_exit(void) +{ + genl_unregister_family(&smc_gen_nl_family); +} diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h new file mode 100644 index 0000000000..e8c6c3f0e9 --- /dev/null +++ b/net/smc/smc_netlink.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC Generic netlink operations + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#ifndef _SMC_NETLINK_H +#define _SMC_NETLINK_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +extern struct genl_family smc_gen_nl_family; + +extern const struct nla_policy smc_gen_ueid_policy[]; + +struct smc_nl_dmp_ctx { + int pos[3]; +}; + +static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) +{ + return (struct smc_nl_dmp_ctx *)c->ctx; +} + +int smc_nl_init(void) __init; +void smc_nl_exit(void); + +#endif diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h new file mode 100644 index 0000000000..0f4f35aa43 --- /dev/null +++ b/net/smc/smc_netns.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications + * + * Network namespace definitions. + * + * Copyright IBM Corp. 2018 + */ + +#ifndef SMC_NETNS_H +#define SMC_NETNS_H + +#include "smc_pnet.h" + +extern unsigned int smc_net_id; + +/* per-network namespace private data */ +struct smc_net { + struct smc_pnettable pnettable; + struct smc_pnetids_ndev pnetids_ndev; +}; +#endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c new file mode 100644 index 0000000000..11775401df --- /dev/null +++ b/net/smc/smc_pnet.c @@ -0,0 +1,1210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to configure an SMC-R PNET table + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/if.h> +#include <uapi/linux/smc.h> + +#include <rdma/ib_verbs.h> + +#include <net/netns/generic.h> +#include "smc_netns.h" + +#include "smc_pnet.h" +#include "smc_ib.h" +#include "smc_ism.h" +#include "smc_core.h" + +static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); +static struct net_device *pnet_find_base_ndev(struct net_device *ndev); + +static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { + [SMC_PNETID_NAME] = { + .type = NLA_NUL_STRING, + .len = SMC_MAX_PNETID_LEN + }, + [SMC_PNETID_ETHNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 + }, + [SMC_PNETID_IBNAME] = { + .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1 + }, + [SMC_PNETID_IBPORT] = { .type = NLA_U8 } +}; + +static struct genl_family smc_pnet_nl_family; + +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, +}; + +/* pnet entry stored in pnet table */ +struct smc_pnetentry { + struct list_head list; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + netdevice_tracker dev_tracker; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; +}; + +/* Check if the pnetid is set */ +bool smc_pnet_is_pnetid_set(u8 *pnetid) +{ + if (pnetid[0] == 0 || pnetid[0] == _S) + return false; + return true; +} + +/* Check if two given pnetids match */ +static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) +{ + int i; + + for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { + if ((pnetid1[i] == 0 || pnetid1[i] == _S) && + (pnetid2[i] == 0 || pnetid2[i] == _S)) + break; + if (pnetid1[i] != pnetid2[i]) + return false; + } + return true; +} + +/* Remove a pnetid from the pnet table. 
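smc_pnet_is_pnetid_set() and smc_pnet_match() just above treat a PNETID not as a C string but as a fixed 16-byte field that may be padded with zero bytes or blanks. The following is a minimal standalone sketch of that comparison rule (plain userspace C, not part of this patch; the function and variable names are invented for the sketch, and _S is assumed to be the ASCII blank, 0x20):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SMC_MAX_PNETID_LEN 16   /* same limit as in the kernel code above */

/* Compare two PNETIDs the way smc_pnet_match() does: the fields match when
 * every byte is equal, or when both fields end (0x00 or blank) at the same
 * position.
 */
static bool pnetid_match(const unsigned char *p1, const unsigned char *p2)
{
        int i;

        for (i = 0; i < SMC_MAX_PNETID_LEN; i++) {
                if ((p1[i] == 0 || p1[i] == ' ') &&
                    (p2[i] == 0 || p2[i] == ' '))
                        break;                  /* both ended -> match */
                if (p1[i] != p2[i])
                        return false;
        }
        return true;
}

int main(void)
{
        unsigned char blank_padded[SMC_MAX_PNETID_LEN + 1] = "NET1            "; /* "NET1" + 12 blanks */
        unsigned char zero_padded[SMC_MAX_PNETID_LEN + 1]  = "NET1";             /* rest is 0x00 */
        unsigned char other[SMC_MAX_PNETID_LEN + 1]        = "NET25";

        printf("%d %d\n", pnetid_match(blank_padded, zero_padded),
               pnetid_match(blank_padded, other));
        /* prints "1 0": the padding style does not matter, the content does */
        return 0;
}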
+ */ +static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct smc_ib_device *ibdev; + struct smcd_dev *smcd; + struct smc_net *sn; + int rc = -ENOENT; + int ibport; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + /* remove table entry */ + mutex_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, + list) { + if (!pnet_name || + smc_pnet_match(pnetelem->pnet_name, pnet_name)) { + list_del(&pnetelem->list); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { + netdev_put(pnetelem->ndev, + &pnetelem->dev_tracker); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } + kfree(pnetelem); + rc = 0; + } + } + mutex_unlock(&pnettable->lock); + + /* if this is not the initial namespace, stop here */ + if (net != &init_net) + return rc; + + /* remove ib devices */ + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { + if (ibdev->pnetid_by_user[ibport] && + (!pnet_name || + smc_pnet_match(pnet_name, + ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); + memset(ibdev->pnetid[ibport], 0, + SMC_MAX_PNETID_LEN); + ibdev->pnetid_by_user[ibport] = false; + rc = 0; + } + } + } + mutex_unlock(&smc_ib_devices.mutex); + /* remove smcd devices */ + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->pnetid_by_user && + (!pnet_name || + smc_pnet_match(pnet_name, smcd->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", + dev_name(smcd->ops->get_dev(smcd)), + smcd->pnetid); + memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); + smcd->pnetid_by_user = false; + rc = 0; + } + } + mutex_unlock(&smcd_dev_list.mutex); + return rc; +} + +/* Add the reference to a given network device to the pnet table. + */ +static int smc_pnet_add_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); + pnetelem->ndev = ndev; + rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + break; + } + } + mutex_unlock(&pnettable->lock); + return rc; +} + +/* Remove the reference to a given network device from the pnet table. 
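smc_pnet_remove_by_pnetid() above frees matching pnet table entries while it is still walking the list, which is why it uses list_for_each_entry_safe(): the next element is fetched before the current one may be freed. A minimal userspace sketch of the same idea on a plain singly linked list (the struct and function names are invented for the sketch, and NULL as the name means "flush everything", mirroring the convention above):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pnet_entry {
        struct pnet_entry *next;
        char name[17];
};

/* Remove every entry whose name matches, or all entries when name is NULL. */
static int remove_matching(struct pnet_entry **head, const char *name)
{
        struct pnet_entry *cur, *next;
        int removed = 0;

        for (cur = *head; cur; cur = next) {
                next = cur->next;               /* remember before freeing */
                if (!name || !strcmp(cur->name, name)) {
                        *head = next;           /* unlink ... */
                        free(cur);              /* ... then free */
                        removed++;
                } else {
                        head = &cur->next;      /* keep entry, advance anchor */
                }
        }
        return removed ? 0 : -1;                /* -ENOENT-like result */
}

int main(void)
{
        const char *names[] = { "NET1", "NET2", "NET1" };
        struct pnet_entry *list = NULL, *e;
        int i;

        for (i = 0; i < 3; i++) {
                e = calloc(1, sizeof(*e));
                if (!e)
                        return 1;
                strcpy(e->name, names[i]);
                e->next = list;
                list = e;
        }
        printf("remove NET1: %d\n", remove_matching(&list, "NET1"));  /* 0 */
        for (e = list; e; e = e->next)
                printf("left: %s\n", e->name);                        /* NET2 */
        remove_matching(&list, NULL);                                 /* flush */
        return 0;
}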
+ */ +static int smc_pnet_remove_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); + pnetelem->ndev = NULL; + rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + break; + } + } + mutex_unlock(&pnettable->lock); + return rc; +} + +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + bool applied = false; + + mutex_lock(&smc_ib_devices.mutex); + if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + mutex_unlock(&smc_ib_devices.mutex); + return applied; +} + +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + bool applied = false; + + mutex_lock(&smcd_dev_list.mutex); + if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; + } + mutex_unlock(&smcd_dev_list.mutex); + return applied; +} + +/* The limit for pnetid is 16 characters. + * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. + * Lower case letters are converted to upper case. + * Interior blanks should not be used. + */ +static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) +{ + char *bf = skip_spaces(pnet_name); + size_t len = strlen(bf); + char *end = bf + len; + + if (!len) + return false; + while (--end >= bf && isspace(*end)) + ; + if (end - bf >= SMC_MAX_PNETID_LEN) + return false; + while (bf <= end) { + if (!isalnum(*bf)) + return false; + *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; + bf++; + } + *pnetid = '\0'; + return true; +} + +/* Find an infiniband device by a given name. The device might not exist. */ +static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +{ + struct smc_ib_device *ibdev; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (!strncmp(ibdev->ibdev->name, ib_name, + sizeof(ibdev->ibdev->name)) || + (ibdev->ibdev->dev.parent && + !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, + IB_DEVICE_NAME_MAX - 1))) { + goto out; + } + } + ibdev = NULL; +out: + mutex_unlock(&smc_ib_devices.mutex); + return ibdev; +} + +/* Find an smcd device by a given name. The device might not exist. 
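smc_pnetid_valid() above enforces the rules spelled out in its comment: surrounding blanks are stripped, at most 16 characters remain, only alphanumerics are accepted, and lower case is folded to upper case. A standalone userspace mirror of those rules (assumptions: plain ASCII input; the function name pnetid_valid is invented for the sketch):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SMC_MAX_PNETID_LEN 16

/* Trim blanks, check length and character set, and upper-case the result. */
static bool pnetid_valid(const char *in, char *out)
{
        const char *start = in, *end;
        size_t len;

        while (*start && isspace((unsigned char)*start))
                start++;
        end = start + strlen(start);
        while (end > start && isspace((unsigned char)end[-1]))
                end--;
        len = end - start;
        if (!len || len > SMC_MAX_PNETID_LEN)
                return false;
        while (start < end) {
                if (!isalnum((unsigned char)*start))
                        return false;
                *out++ = toupper((unsigned char)*start);
                start++;
        }
        *out = '\0';
        return true;
}

int main(void)
{
        char pnetid[SMC_MAX_PNETID_LEN + 1];

        printf("%d\n", pnetid_valid("  net1 ", pnetid)); /* 1, pnetid == "NET1" */
        printf("%d\n", pnetid_valid("net-1", pnetid));   /* 0, '-' is rejected  */
        return 0;
}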
*/ +static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) +{ + struct smcd_dev *smcd_dev; + + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { + if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + smcd_name, IB_DEVICE_NAME_MAX - 1)) + goto out; + } + smcd_dev = NULL; +out: + mutex_unlock(&smcd_dev_list.mutex); + return smcd_dev; +} + +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + rc = -EEXIST; + new_netdev = true; + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + if (ndev) { + new_pe->ndev = ndev; + netdev_tracker_alloc(ndev, &new_pe->dev_tracker, + GFP_ATOMIC); + } + list_add_tail(&new_pe->list, &pnettable->pnetlist); + mutex_unlock(&pnettable->lock); + } else { + mutex_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); + return 0; + +out_put: + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd; + struct device *dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) { + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", ib_dev->ibdev->name, + ib_port, + ib_dev->pnetid[ib_port - 1]); + } + smcd = smc_pnet_find_smcd(ib_name); + if (smcd) { + smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); + if (smcddev_applied) { + dev = smcd->ops->get_dev(smcd); + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(dev), + smcd->pnetid); + } + } + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. 
+ */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + mutex_unlock(&pnettable->lock); + } else { + mutex_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. + */ +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) +{ + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; + int rc; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + rc = -EINVAL; + if (!tb[SMC_PNETID_NAME]) + goto error; + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnet_name)) + goto error; + + if (tb[SMC_PNETID_ETHNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) + goto error; + } + + /* if this is not the initial namespace, stop here */ + if (net != &init_net) + return new_netdev ? 0 : -EEXIST; + + rc = -EINVAL; + if (tb[SMC_PNETID_IBNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) + goto error; + } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; + } + return (new_netdev || new_ibdev) ? 
0 : -EEXIST; + +error: + return rc; +} + +/* Convert an smc_pnetentry to a netlink attribute sequence */ +static int smc_pnet_set_nla(struct sk_buff *msg, + struct smc_pnetentry *pnetelem) +{ + if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) + return -1; + if (pnetelem->type == SMC_PNET_ETH) { + if (nla_put_string(msg, SMC_PNETID_ETHNAME, + pnetelem->eth_name)) + return -1; + } else { + if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) + return -1; + } + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || + nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) + return -1; + } else { + if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || + nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) + return -1; + } + + return 0; +} + +static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + + return smc_pnet_enter(net, info->attrs); +} + +static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + + if (!info->attrs[SMC_PNETID_NAME]) + return -EINVAL; + return smc_pnet_remove_by_pnetid(net, + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); +} + +static int smc_pnet_dump_start(struct netlink_callback *cb) +{ + cb->args[0] = 0; + return 0; +} + +static int smc_pnet_dumpinfo(struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, + struct smc_pnetentry *pnetelem) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, + flags, SMC_PNETID_GET); + if (!hdr) + return -ENOMEM; + if (smc_pnet_set_nla(skb, pnetelem) < 0) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, + u32 seq, u8 *pnetid, int start_idx) +{ + struct smc_pnettable *pnettable; + struct smc_pnetentry *pnetelem; + struct smc_net *sn; + int idx = 0; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + /* dump pnettable entries */ + mutex_lock(&pnettable->lock); + list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { + if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) + continue; + if (idx++ < start_idx) + continue; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; + if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, + pnetelem)) { + --idx; + break; + } + } + mutex_unlock(&pnettable->lock); + return idx; +} + +static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int idx; + + idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NULL, cb->args[0]); + + cb->args[0] = idx; + return skb->len; +} + +/* Retrieve one PNETID entry */ +static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct sk_buff *msg; + void *hdr; + + if (!info->attrs[SMC_PNETID_NAME]) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq, + nla_data(info->attrs[SMC_PNETID_NAME]), 0); + + /* finish multi part message and send it */ + hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, + NLM_F_MULTI); + if (!hdr) { + nlmsg_free(msg); + return -EMSGSIZE; + } + return genlmsg_reply(msg, info); +} + +/* Remove and delete all pnetids from pnet 
table. + */ +static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + + smc_pnet_remove_by_pnetid(net, NULL); + return 0; +} + +/* SMC_PNETID generic netlink operation definition */ +static const struct genl_ops smc_pnet_ops[] = { + { + .cmd = SMC_PNETID_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + /* can be retrieved by unprivileged users */ + .doit = smc_pnet_get, + .dumpit = smc_pnet_dump, + .start = smc_pnet_dump_start + }, + { + .cmd = SMC_PNETID_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = smc_pnet_add + }, + { + .cmd = SMC_PNETID_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = smc_pnet_del + }, + { + .cmd = SMC_PNETID_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = smc_pnet_flush + } +}; + +/* SMC_PNETID family definition */ +static struct genl_family smc_pnet_nl_family __ro_after_init = { + .hdrsize = 0, + .name = SMCR_GENL_FAMILY_NAME, + .version = SMCR_GENL_FAMILY_VERSION, + .maxattr = SMC_PNETID_MAX, + .policy = smc_pnet_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_pnet_ops, + .n_ops = ARRAY_SIZE(smc_pnet_ops), + .resv_start_op = SMC_PNETID_FLUSH + 1, +}; + +bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe; + bool rc = false; + + read_lock(&sn->pnetids_ndev.lock); + list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { + if (smc_pnet_match(pnetid, pe->pnetid)) { + rc = true; + goto unlock; + } + } + +unlock: + read_unlock(&sn->pnetids_ndev.lock); + return rc; +} + +static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *pi; + + pe = kzalloc(sizeof(*pe), GFP_KERNEL); + if (!pe) + return -ENOMEM; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { + if (smc_pnet_match(pnetid, pe->pnetid)) { + refcount_inc(&pi->refcnt); + kfree(pe); + goto unlock; + } + } + refcount_set(&pe->refcnt, 1); + memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); + list_add_tail(&pe->list, &sn->pnetids_ndev.list); + +unlock: + write_unlock(&sn->pnetids_ndev.lock); + return 0; +} + +static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *pe2; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { + if (smc_pnet_match(pnetid, pe->pnetid)) { + if (refcount_dec_and_test(&pe->refcnt)) { + list_del(&pe->list); + kfree(pe); + } + break; + } + } + write_unlock(&sn->pnetids_ndev.lock); +} + +static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, + u8 *ndev_pnetid) +{ + struct net_device *base_dev; + + base_dev = __pnet_find_base_ndev(dev); + if (base_dev->flags & IFF_UP && + !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, + ndev_pnetid)) { + /* add to PNETIDs list */ + smc_pnet_add_pnetid(net, ndev_pnetid); + } +} + +/* create initial list of netdevice pnetids */ +static void smc_pnet_create_pnetids_list(struct net *net) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) + smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); + 
rtnl_unlock(); +} + +/* clean up list of netdevice pnetids */ +static void smc_pnet_destroy_pnetids_list(struct net *net) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *temp_pe; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { + list_del(&pe->list); + kfree(pe); + } + write_unlock(&sn->pnetids_ndev.lock); +} + +static int smc_pnet_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(event_dev); + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + smc_pnet_remove_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); + return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); + return NOTIFY_OK; + case NETDEV_UP: + smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); + return NOTIFY_OK; + case NETDEV_DOWN: + event_dev = __pnet_find_base_ndev(event_dev); + if (!smc_pnetid_by_dev_port(event_dev->dev.parent, + event_dev->dev_port, ndev_pnetid)) { + /* remove from PNETIDs list */ + smc_pnet_remove_pnetid(net, ndev_pnetid); + } + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block smc_netdev_notifier = { + .notifier_call = smc_pnet_netdev_event +}; + +/* init network namespace */ +int smc_pnet_net_init(struct net *net) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnettable *pnettable = &sn->pnettable; + struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; + + INIT_LIST_HEAD(&pnettable->pnetlist); + mutex_init(&pnettable->lock); + INIT_LIST_HEAD(&pnetids_ndev->list); + rwlock_init(&pnetids_ndev->lock); + + smc_pnet_create_pnetids_list(net); + + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; + + return 0; +} + +int __init smc_pnet_init(void) +{ + int rc; + + rc = genl_register_family(&smc_pnet_nl_family); + if (rc) + return rc; + rc = register_netdevice_notifier(&smc_netdev_notifier); + if (rc) + genl_unregister_family(&smc_pnet_nl_family); + + return rc; +} + +/* exit network namespace */ +void smc_pnet_net_exit(struct net *net) +{ + /* flush pnet table */ + smc_pnet_remove_by_pnetid(net, NULL); + smc_pnet_destroy_pnetids_list(net); +} + +void smc_pnet_exit(void) +{ + unregister_netdevice_notifier(&smc_netdev_notifier); + genl_unregister_family(&smc_pnet_nl_family); +} + +static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) +{ + int i, nest_lvl; + + ASSERT_RTNL(); + nest_lvl = ndev->lower_level; + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = netdev_lower_get_next(ndev, &lower); + } + return ndev; +} + +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. 
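__pnet_find_base_ndev() above descends through the lower-device hierarchy of a stacked interface (for example a VLAN on top of a bond on top of a physical port) and, as the comment notes, simply follows the first lower device at every level. A toy model of that walk (the struct here is a hypothetical stand-in, not the kernel's struct net_device):

#include <stdio.h>

/* Each level knows one representative lower device, like taking the first
 * entry of adj_list.lower in the kernel code above.
 */
struct toy_dev {
        const char *name;
        struct toy_dev *first_lower;    /* NULL for a base (physical) device */
        int lower_level;                /* levels stacked below this device */
};

static struct toy_dev *find_base_dev(struct toy_dev *dev)
{
        int i;

        for (i = 0; i < dev->lower_level && dev->first_lower; i++)
                dev = dev->first_lower;
        return dev;
}

int main(void)
{
        struct toy_dev eth0  = { "eth0",      NULL,   0 };
        struct toy_dev bond0 = { "bond0",     &eth0,  1 };
        struct toy_dev vlan  = { "bond0.100", &bond0, 2 };

        printf("base of %s is %s\n", vlan.name, find_base_dev(&vlan)->name);
        /* prints: base of bond0.100 is eth0 */
        return 0;
}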
+ */ +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) +{ + rtnl_lock(); + ndev = __pnet_find_base_ndev(ndev); + rtnl_unlock(); + return ndev; +} + +static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, + u8 *pnetid) +{ + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_pnetentry *pnetelem; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { + /* get pnetid of netdev device */ + memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); + rc = 0; + break; + } + } + mutex_unlock(&pnettable->lock); + return rc; +} + +static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, + struct smc_init_info *ini) +{ + if (!ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, + NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + return 0; + } + if (ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, + NULL, &ini->smcrv2)) { + ini->smcrv2.ib_dev_v2 = ibdev; + ini->smcrv2.ib_port_v2 = i; + return 0; + } + return -ENODEV; +} + +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini, + struct smc_ib_device *known_dev, + struct net *net) +{ + struct smc_ib_device *ibdev; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev || + !rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + goto out; + } + } + } +out: + mutex_unlock(&smc_ib_devices.mutex); +} + +/* find alternate roce device with same pnet_id, vlan_id and net namespace */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + struct net *net = lgr->net; + + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); +} + +/* if handshake network device belongs to a roce device, return its + * IB device and port + */ +static void smc_pnet_find_rdma_dev(struct net_device *netdev, + struct smc_init_info *ini) +{ + struct net *net = dev_net(netdev); + struct smc_ib_device *ibdev; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + struct net_device *ndev; + int i; + + /* check rdma net namespace */ + if (!rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (!ibdev->ibdev->ops.get_netdev) + continue; + ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i); + if (!ndev) + continue; + dev_put(ndev); + if (netdev == ndev && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + break; + } + } + } + mutex_unlock(&smc_ib_devices.mutex); +} + +/* Determine the corresponding IB device port based on the hardware PNETID. + * Searching stops at the first matching active IB device port with vlan_id + * configured. 
+ * If nothing found, check pnetid table. + * If nothing found, try to use handshake device + */ +static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, + struct smc_init_info *ini) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net *net; + + ndev = pnet_find_base_ndev(ndev); + net = dev_net(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { + smc_pnet_find_rdma_dev(ndev, ini); + return; /* pnetid could not be determined */ + } + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); +} + +static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, + struct smc_init_info *ini) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smcd_dev *ismdev; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) + return; /* pnetid could not be determined */ + + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(ismdev, &smcd_dev_list.list, list) { + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away && + (!ini->ism_peer_gid[0] || + !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id, + ismdev))) { + ini->ism_dev[0] = ismdev; + break; + } + } + mutex_unlock(&smcd_dev_list.mutex); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. + */ +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) +{ + struct dst_entry *dst = sk_dst_get(sk); + + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + smc_pnet_find_roce_by_pnetid(dst->dev, ini); + +out_rel: + dst_release(dst); +out: + return; +} + +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) +{ + struct dst_entry *dst = sk_dst_get(sk); + + ini->ism_dev[0] = NULL; + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + smc_pnet_find_ism_by_pnetid(dst->dev, ini); + +out_rel: + dst_release(dst); +out: + return; +} + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + mutex_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. 
+ */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + mutex_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h new file mode 100644 index 0000000000..80a88eea49 --- /dev/null +++ b/net/smc/smc_pnet.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * PNET table queries + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#ifndef _SMC_PNET_H +#define _SMC_PNET_H + +#include <net/smc.h> + +#if IS_ENABLED(CONFIG_HAVE_PNETID) +#include <asm/pnet.h> +#endif + +struct smc_ib_device; +struct smcd_dev; +struct smc_init_info; +struct smc_link_group; + +/** + * struct smc_pnettable - SMC PNET table anchor + * @lock: Lock for list action + * @pnetlist: List of PNETIDs + */ +struct smc_pnettable { + struct mutex lock; + struct list_head pnetlist; +}; + +struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/ + struct list_head list; + rwlock_t lock; +}; + +struct smc_pnetids_ndev_entry { + struct list_head list; + u8 pnetid[SMC_MAX_PNETID_LEN]; + refcount_t refcnt; +}; + +static inline int smc_pnetid_by_dev_port(struct device *dev, + unsigned short port, u8 *pnetid) +{ +#if IS_ENABLED(CONFIG_HAVE_PNETID) + return pnet_id_by_dev_port(dev, port, pnetid); +#else + return -ENOENT; +#endif +} + +int smc_pnet_init(void) __init; +int smc_pnet_net_init(struct net *net); +void smc_pnet_exit(void); +void smc_pnet_net_exit(struct net *net); +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); +bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid); +bool smc_pnet_is_pnetid_set(u8 *pnetid); +#endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c new file mode 100644 index 0000000000..9a2f3638d1 --- /dev/null +++ b/net/smc/smc_rx.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * copy new RMBE data into user space + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/sched/signal.h> +#include <linux/splice.h> + +#include <net/sock.h> +#include <trace/events/sock.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_cdc.h" +#include "smc_tx.h" /* smc_tx_consumer_update() */ +#include "smc_rx.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" + +/* callback implementation to wakeup consumers blocked with smc_rx_wait(). + * indirectly called by smc_cdc_msg_recv_action(). 
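The receive code below tracks how far the consumer has advanced in the RMBE with a cursor that wraps around the end of the buffer (see the smc_curs_add() and smc_curs_comp() calls in smc_rx_update_consumer()). A simplified standalone model of that bookkeeping is shown here; the real cursor type and its smc_curs_* helpers are defined elsewhere in this patch, so the struct and field names below are illustrative only, and the add helper assumes the added length never exceeds the buffer size:

#include <stdio.h>

/* Simplified cursor into a ring buffer of 'size' bytes: 'count' is the
 * offset within the buffer, 'wrap' counts how often the end was crossed.
 */
struct ring_cursor {
        unsigned int wrap;
        unsigned int count;
};

static void cursor_add(unsigned int size, struct ring_cursor *c, unsigned int len)
{
        c->count += len;
        if (c->count >= size) {         /* crossed the end of the buffer */
                c->wrap++;
                c->count -= size;
        }
}

/* Distance from old to cur, assuming cur is at most one wrap ahead. */
static unsigned int cursor_diff(unsigned int size, const struct ring_cursor *old,
                                const struct ring_cursor *cur)
{
        if (old->wrap != cur->wrap)
                return size - old->count + cur->count;
        return cur->count - old->count;
}

int main(void)
{
        struct ring_cursor cons = { 0, 60 }, prod = cons;

        cursor_add(64, &prod, 10);      /* write 10 bytes into a 64-byte ring */
        printf("wrap=%u count=%u readable=%u\n",
               prod.wrap, prod.count, cursor_diff(64, &cons, &prod));
        /* prints: wrap=1 count=6 readable=10 */
        return 0;
}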
+ */ +static void smc_rx_wake_up(struct sock *sk) +{ + struct socket_wq *wq; + + trace_sk_data_ready(sk); + + /* derived from sock_def_readable() */ + /* called already in smc_listen_work() */ + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | + EPOLLRDNORM | EPOLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + rcu_read_unlock(); +} + +/* Update consumer cursor + * @conn connection to update + * @cons consumer cursor + * @len number of Bytes consumed + * Returns: + * 1 if we should end our receive, 0 otherwise + */ +static int smc_rx_update_consumer(struct smc_sock *smc, + union smc_host_cursor cons, size_t len) +{ + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + bool force = false; + int diff, rc = 0; + + smc_curs_add(conn->rmb_desc->len, &cons, len); + + /* did we process urgent data? */ + if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { + diff = smc_curs_comp(conn->rmb_desc->len, &cons, + &conn->urg_curs); + if (sock_flag(sk, SOCK_URGINLINE)) { + if (diff == 0) { + force = true; + rc = 1; + conn->urg_state = SMC_URG_READ; + } + } else { + if (diff == 1) { + /* skip urgent byte */ + force = true; + smc_curs_add(conn->rmb_desc->len, &cons, 1); + conn->urg_rx_skip_pend = false; + } else if (diff < -1) + /* we read past urgent byte */ + conn->urg_state = SMC_URG_READ; + } + } + + smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn); + + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn, force); + + return rc; +} + +static void smc_rx_update_cons(struct smc_sock *smc, size_t len) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_rx_update_consumer(smc, cons, len); +} + +struct smc_spd_priv { + struct smc_sock *smc; + size_t len; +}; + +static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; + struct smc_sock *smc = priv->smc; + struct smc_connection *conn; + struct sock *sk = &smc->sk; + + if (sk->sk_state == SMC_CLOSED || + sk->sk_state == SMC_PEERFINCLOSEWAIT || + sk->sk_state == SMC_APPFINCLOSEWAIT) + goto out; + conn = &smc->conn; + lock_sock(sk); + smc_rx_update_cons(smc, priv->len); + release_sock(sk); + if (atomic_sub_and_test(priv->len, &conn->splice_pending)) + smc_rx_wake_up(sk); +out: + kfree(priv); + put_page(buf->page); + sock_put(sk); +} + +static const struct pipe_buf_operations smc_pipe_ops = { + .release = smc_rx_pipe_buf_release, + .get = generic_pipe_buf_get +}; + +static void smc_rx_spd_release(struct splice_pipe_desc *spd, + unsigned int i) +{ + put_page(spd->pages[i]); +} + +static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, + struct smc_sock *smc) +{ + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; + struct splice_pipe_desc spd; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; + + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? 
+ PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; + spd.ops = &smc_pipe_ops; + spd.spd_release = smc_rx_spd_release; + + bytes = splice_to_pipe(pipe, &spd); + if (bytes > 0) { + sock_hold(&smc->sk); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } + atomic_add(bytes, &smc->conn.splice_pending); + } + kfree(priv); + kfree(partial); + kfree(pages); + + return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; +} + +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv) && + !atomic_read(&conn->splice_pending); +} + +/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted + * @smc smc socket + * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @fcrit add'l criterion to evaluate as function pointer + * Returns: + * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. + * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). 
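For virtually contiguous RMBs, smc_rx_splice() above hands the pipe one buffer per page, so it computes the page count as PAGE_ALIGN(len + offset) / PAGE_SIZE and walks the region page by page. A small standalone illustration of that arithmetic (4 KiB pages assumed; the function name is invented for the sketch):

#include <stdio.h>

#define PAGE_SIZE 4096u
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Print the (offset, len) pair contributed by each page when 'len' bytes
 * start at byte 'offset' of the first page, like the per-page loop above.
 */
static void split_into_pages(unsigned int offset, unsigned int len)
{
        unsigned int nr_pages = PAGE_ALIGN(len + offset) / PAGE_SIZE;
        unsigned int left = len, i;

        printf("%u byte(s) at page offset %u -> %u page(s)\n", len, offset, nr_pages);
        for (i = 0; i < nr_pages; i++) {
                unsigned int chunk = PAGE_SIZE - offset < left ?
                                     PAGE_SIZE - offset : left;

                printf("  page %u: offset %u, len %u\n", i, offset, chunk);
                left -= chunk;
                offset = 0;     /* later pages start at their beginning */
        }
}

int main(void)
{
        split_into_pages(3000, 5000);   /* spans two pages: 1096 + 3904 bytes */
        return 0;
}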
+ */ +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + int rc; + + if (fcrit(conn)) + return 1; + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + add_wait_queue(sk_sleep(sk), &wait); + rc = sk_wait_event(sk, timeo, + READ_ONCE(sk->sk_err) || + cflags->peer_conn_abort || + READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || + conn->killed || + fcrit(conn), + &wait); + remove_wait_queue(sk_sleep(sk), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + return rc; +} + +static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, + int flags) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + struct sock *sk = &smc->sk; + int rc = 0; + + if (sock_flag(sk, SOCK_URGINLINE) || + !(conn->urg_state == SMC_URG_VALID) || + conn->urg_state == SMC_URG_READ) + return -EINVAL; + + SMC_STAT_INC(smc, urg_data_cnt); + if (conn->urg_state == SMC_URG_VALID) { + if (!(flags & MSG_PEEK)) + smc->conn.urg_state = SMC_URG_READ; + msg->msg_flags |= MSG_OOB; + if (len > 0) { + if (!(flags & MSG_TRUNC)) + rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); + len = 1; + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + if (smc_curs_diff(conn->rmb_desc->len, &cons, + &conn->urg_curs) > 1) + conn->urg_rx_skip_pend = true; + /* Urgent Byte was already accounted for, but trigger + * skipping the urgent byte in non-inline case + */ + if (!(flags & MSG_PEEK)) + smc_rx_update_consumer(smc, cons, 0); + } else { + msg->msg_flags |= MSG_TRUNC; + } + + return rc ? -EFAULT : len; + } + + if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + + return -EAGAIN; +} + +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + +/* smc_rx_recvmsg - receive data from RMBE + * @msg: copy data to receive buffer + * @pipe: copy data to pipe if set - indicates splice() call + * + * rcvbuf consumer: main API called by socket layer. + * Called under sk lock. 
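smc_rx_recvmsg() below copies at most two chunks per pass: one from the consumer offset up to the end of the RMBE, and, if the readable data wraps, a second one from the start of the buffer. A standalone sketch of that split (plain memcpy() instead of memcpy_to_msg(); buffer size, contents and cursor position are chosen arbitrarily for the example):

#include <stdio.h>
#include <string.h>

/* Copy 'copylen' bytes starting at ring offset 'cons' out of a ring buffer
 * of 'size' bytes, in at most two chunks, like the chunk loop in
 * smc_rx_recvmsg().
 */
static void ring_read(const char *ring, size_t size, size_t cons,
                      size_t copylen, char *dst)
{
        size_t chunk_len = copylen < size - cons ? copylen : size - cons;

        memcpy(dst, ring + cons, chunk_len);            /* 1st (or only) chunk */
        if (chunk_len < copylen)                        /* wrapped: 2nd chunk  */
                memcpy(dst + chunk_len, ring, copylen - chunk_len);
}

int main(void)
{
        char ring[8] = { 'E', 'F', 'G', 'H', 'A', 'B', 'C', 'D' };
        char out[9] = { 0 };

        ring_read(ring, sizeof(ring), 4, 8, out);       /* read wraps past the end */
        printf("%s\n", out);                            /* prints ABCDEFGH */
        return 0;
}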
+ */ +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags) +{ + size_t copylen, read_done = 0, read_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + int (*func)(struct smc_connection *conn); + union smc_host_cursor cons; + int readable, chunk; + char *rcvbuf_base; + struct sock *sk; + int splbytes; + long timeo; + int target; /* Read at least these many bytes */ + int rc; + + if (unlikely(flags & MSG_ERRQUEUE)) + return -EINVAL; /* future work for sk.sk_family == AF_SMC */ + + sk = &smc->sk; + if (sk->sk_state == SMC_LISTEN) + return -ENOTCONN; + if (flags & MSG_OOB) + return smc_rx_recv_urg(smc, msg, len, flags); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + readable = atomic_read(&conn->bytes_to_rcv); + if (readable >= conn->rmb_desc->len) + SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); + + if (len < readable) + SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); + /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ + rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; + + do { /* while (read_remaining) */ + if (read_done >= target || (pipe && read_done)) + break; + + if (conn->killed) + break; + + if (smc_rx_recvmsg_data_available(smc)) + goto copy; + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; + break; + } + + if (read_done) { + if (sk->sk_err || + sk->sk_state == SMC_CLOSED || + !timeo || + signal_pending(current)) + break; + } else { + if (sk->sk_err) { + read_done = sock_error(sk); + break; + } + if (sk->sk_state == SMC_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. 
+ */ + read_done = -ENOTCONN; + break; + } + break; + } + if (!timeo) + return -EAGAIN; + if (signal_pending(current)) { + read_done = sock_intr_errno(timeo); + break; + } + } + + if (!smc_rx_data_available(conn)) { + smc_rx_wait(smc, &timeo, smc_rx_data_available); + continue; + } + +copy: + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after waiting on data above */ + readable = atomic_read(&conn->bytes_to_rcv); + splbytes = atomic_read(&conn->splice_pending); + if (!readable || (msg && splbytes)) { + if (splbytes) + func = smc_rx_data_available_and_no_splice_pend; + else + func = smc_rx_data_available; + smc_rx_wait(smc, &timeo, func); + continue; + } + + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + /* subsequent splice() calls pick up where previous left */ + if (splbytes) + smc_curs_add(conn->rmb_desc->len, &cons, splbytes); + if (conn->urg_state == SMC_URG_VALID && + sock_flag(&smc->sk, SOCK_URGINLINE) && + readable > 1) + readable--; /* always stop at urgent Byte */ + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); + /* determine chunks where to read from rcvbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - + cons.count); + chunk_len_sum = chunk_len; + chunk_off = cons.count; + smc_rmb_sync_sg_for_cpu(conn); + for (chunk = 0; chunk < 2; chunk++) { + if (!(flags & MSG_TRUNC)) { + if (msg) { + rc = memcpy_to_msg(msg, rcvbuf_base + + chunk_off, + chunk_len); + } else { + rc = smc_rx_splice(pipe, rcvbuf_base + + chunk_off, chunk_len, + smc); + } + if (rc < 0) { + if (!read_done) + read_done = -EFAULT; + goto out; + } + } + read_remaining -= chunk_len; + read_done += chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in recv ring buffer */ + } + + /* update cursors */ + if (!(flags & MSG_PEEK)) { + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ + smp_mb__after_atomic(); + if (msg && smc_rx_update_consumer(smc, cons, copylen)) + goto out; + } + + trace_smc_rx_recvmsg(smc, copylen); + } while (read_remaining); +out: + return read_done; +} + +/* Initialize receive properties on connection establishment. NB: not __init! */ +void smc_rx_init(struct smc_sock *smc) +{ + smc->sk.sk_data_ready = smc_rx_wake_up; + atomic_set(&smc->conn.splice_pending, 0); + smc->conn.urg_state = SMC_URG_READ; +} diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h new file mode 100644 index 0000000000..db823c97d8 --- /dev/null +++ b/net/smc/smc_rx.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_RX_H +#define SMC_RX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" + +void smc_rx_init(struct smc_sock *smc); + +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags); +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)); +static inline int smc_rx_data_available(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv); +} + +#endif /* SMC_RX_H */ diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c new file mode 100644 index 0000000000..ca14c0f3a0 --- /dev/null +++ b/net/smc/smc_stats.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC statistics netlink routines + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> +#include <net/genetlink.h> +#include <net/sock.h> +#include "smc_netlink.h" +#include "smc_stats.h" + +int smc_stats_init(struct net *net) +{ + net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + if (!net->smc.fback_rsn) + goto err_fback; + net->smc.smc_stats = alloc_percpu(struct smc_stats); + if (!net->smc.smc_stats) + goto err_stats; + mutex_init(&net->smc.mutex_fback_rsn); + return 0; + +err_stats: + kfree(net->smc.fback_rsn); +err_fback: + return -ENOMEM; +} + +void smc_stats_exit(struct net *net) +{ + kfree(net->smc.fback_rsn); + if (net->smc.smc_stats) + free_percpu(net->smc.smc_stats); +} + +static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_rmbcnt *stats_rmb_cnt; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TX_RMB_STATS) + stats_rmb_cnt = &stats->smc[tech].rmb_tx; + else + stats_rmb_cnt = &stats->smc[tech].rmb_rx; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, + stats_rmb_cnt->reuse_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, + stats_rmb_cnt->buf_size_small_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, + stats_rmb_cnt->buf_size_small_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, + stats_rmb_cnt->buf_full_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, + stats_rmb_cnt->buf_full_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, + stats_rmb_cnt->alloc_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, + stats_rmb_cnt->dgrade_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_memsize *stats_pload; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) + stats_pload = &stats->smc[tech].tx_pd; + else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) + stats_pload = &stats->smc[tech].rx_pd; + else if (type == 
SMC_NLA_STATS_T_TX_RMB_SIZE) + stats_pload = &stats->smc[tech].tx_rmbsize; + else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) + stats_pload = &stats->smc[tech].rx_rmbsize; + else + goto errout; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, + stats_pload->buf[SMC_BUF_8K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, + stats_pload->buf[SMC_BUF_16K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, + stats_pload->buf[SMC_BUF_32K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, + stats_pload->buf[SMC_BUF_64K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, + stats_pload->buf[SMC_BUF_128K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, + stats_pload->buf[SMC_BUF_256K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, + stats_pload->buf[SMC_BUF_512K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, + stats_pload->buf[SMC_BUF_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, + stats_pload->buf[SMC_BUF_G_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, + struct smc_stats *stats, int tech) +{ + struct smc_stats_tech *smc_tech; + struct nlattr *attrs; + + smc_tech = &stats->smc[tech]; + if (tech == SMC_TYPE_D) + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); + else + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); + + if (!attrs) + goto errout; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_SIZE)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, + smc_tech->clnt_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, + smc_tech->clnt_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, + smc_tech->srv_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, + smc_tech->srv_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, + smc_tech->rx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, + smc_tech->tx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, + smc_tech->rx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, + smc_tech->tx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, 
SMC_NLA_STATS_T_SENDPAGE_CNT, + 0, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, + smc_tech->cork_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, + smc_tech->ndly_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, + smc_tech->splice_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, + smc_tech->urg_data_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +int smc_nl_get_stats(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + struct smc_stats *stats; + struct nlattr *attrs; + int cpu, i, size; + void *nlh; + u64 *src; + u64 *sum; + + if (cb_ctx->pos[0]) + goto errmsg; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_STATS); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_STATS); + if (!attrs) + goto errnest; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + goto erralloc; + size = sizeof(*stats) / sizeof(u64); + for_each_possible_cpu(cpu) { + src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); + sum = (u64 *)stats; + for (i = 0; i < size; i++) + *(sum++) += *(src++); + } + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) + goto errattr; + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, + stats->clnt_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, + stats->srv_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + kfree(stats); + return skb->len; + +errattr: + kfree(stats); +erralloc: + nla_nest_cancel(skb, attrs); +errnest: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_get_fback_details(struct sk_buff *skb, + struct netlink_callback *cb, int pos, + bool is_srv) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int cnt_reported = cb_ctx->pos[2]; + struct smc_stats_fback *trgt_arr; + struct nlattr *attrs; + int rc = 0; + void *nlh; + + if (is_srv) + trgt_arr = &net->smc.fback_rsn->srv[0]; + else + trgt_arr = &net->smc.fback_rsn->clnt[0]; + if (!trgt_arr[pos].fback_code) + return -ENODATA; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_FBACK_STATS); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) + goto errattr; + if (!cnt_reported) { + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, + net->smc.fback_rsn->srv_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, + net->smc.fback_rsn->clnt_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + cnt_reported = 1; + } + + if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, + trgt_arr[pos].fback_code)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, + trgt_arr[pos].count)) + goto errattr; + + cb_ctx->pos[2] = cnt_reported; + nla_nest_end(skb, attrs); + 
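/*
 * Illustrative sketch, not from the patch above: smc_nl_get_stats() folds
 * the per-CPU copies of struct smc_stats into one total by walking the
 * struct as a flat array of u64, which works only because every member is
 * a u64 counter. A minimal user-space analogue of that aggregation step,
 * with a made-up counter struct (compile with -std=c11):
 */
#include <stddef.h>
#include <stdint.h>

struct demo_stats {			/* hypothetical: u64 members only */
	uint64_t rx_cnt;
	uint64_t tx_cnt;
	uint64_t rx_bytes;
	uint64_t tx_bytes;
};

static void demo_sum_percpu(struct demo_stats *total,
			    const struct demo_stats *percpu, int ncpus)
{
	size_t words = sizeof(*total) / sizeof(uint64_t);

	for (int cpu = 0; cpu < ncpus; cpu++) {
		const uint64_t *src = (const uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)total;

		for (size_t i = 0; i < words; i++)
			dst[i] += src[i];	/* same trick as the kernel loop */
	}
}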
genlmsg_end(skb, nlh); + return rc; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int rc_srv = 0, rc_clnt = 0, k; + int skip_serv = cb_ctx->pos[1]; + int snum = cb_ctx->pos[0]; + bool is_srv = true; + + mutex_lock(&net->smc.mutex_fback_rsn); + for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { + if (k < snum) + continue; + if (!skip_serv) { + rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); + if (rc_srv && rc_srv != -ENODATA) + break; + } else { + skip_serv = 0; + } + rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); + if (rc_clnt && rc_clnt != -ENODATA) { + skip_serv = 1; + break; + } + if (rc_clnt == -ENODATA && rc_srv == -ENODATA) + break; + } + mutex_unlock(&net->smc.mutex_fback_rsn); + cb_ctx->pos[1] = skip_serv; + cb_ctx->pos[0] = k; + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h new file mode 100644 index 0000000000..9d32058db2 --- /dev/null +++ b/net/smc/smc_stats.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Macros for SMC statistics + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ + +#ifndef NET_SMC_SMC_STATS_H_ +#define NET_SMC_SMC_STATS_H_ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> + +#include "smc_clc.h" + +#define SMC_MAX_FBACK_RSN_CNT 30 + +enum { + SMC_BUF_8K, + SMC_BUF_16K, + SMC_BUF_32K, + SMC_BUF_64K, + SMC_BUF_128K, + SMC_BUF_256K, + SMC_BUF_512K, + SMC_BUF_1024K, + SMC_BUF_G_1024K, + SMC_BUF_MAX, +}; + +struct smc_stats_fback { + int fback_code; + u16 count; +}; + +struct smc_stats_rsn { + struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; + struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; + u64 srv_fback_cnt; + u64 clnt_fback_cnt; +}; + +struct smc_stats_rmbcnt { + u64 buf_size_small_peer_cnt; + u64 buf_size_small_cnt; + u64 buf_full_peer_cnt; + u64 buf_full_cnt; + u64 reuse_cnt; + u64 alloc_cnt; + u64 dgrade_cnt; +}; + +struct smc_stats_memsize { + u64 buf[SMC_BUF_MAX]; +}; + +struct smc_stats_tech { + struct smc_stats_memsize tx_rmbsize; + struct smc_stats_memsize rx_rmbsize; + struct smc_stats_memsize tx_pd; + struct smc_stats_memsize rx_pd; + struct smc_stats_rmbcnt rmb_tx; + struct smc_stats_rmbcnt rmb_rx; + u64 clnt_v1_succ_cnt; + u64 clnt_v2_succ_cnt; + u64 srv_v1_succ_cnt; + u64 srv_v2_succ_cnt; + u64 urg_data_cnt; + u64 splice_cnt; + u64 cork_cnt; + u64 ndly_cnt; + u64 rx_bytes; + u64 tx_bytes; + u64 rx_cnt; + u64 tx_cnt; +}; + +struct smc_stats { + struct smc_stats_tech smc[2]; + u64 clnt_hshake_err_cnt; + u64 srv_hshake_err_cnt; +}; + +#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ +do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_tech) t = (_tech); \ + typeof(_len) l = (_len); \ + int _pos; \ + typeof(_rc) r = (_rc); \ + int m = SMC_BUF_MAX - 1; \ + this_cpu_inc((*stats).smc[t].key ## _cnt); \ + if (r <= 0 || l <= 0) \ + break; \ + _pos = fls64((l - 1) >> 13); \ + _pos = (_pos <= m) ? 
_pos : m; \ + this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*stats).smc[t].key ## _bytes, r); \ +} \ +while (0) + +#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +do { \ + typeof(_len) _l = (_len); \ + typeof(_tech) t = (_tech); \ + int _pos; \ + int m = SMC_BUF_MAX - 1; \ + if (_l <= 0) \ + break; \ + _pos = fls((_l - 1) >> 13); \ + _pos = (_pos <= m) ? _pos : m; \ + this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ +} \ +while (0) + +#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ + this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) + +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +do { \ + struct net *_net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + typeof(_len) l = (_len); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ +} \ +while (0) + +#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \ +do { \ + struct net *net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \ +} \ +while (0) + +#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, reuse, is_smcd, is_rx) + +#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, alloc, is_smcd, is_rx) + +#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx) + +#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, false) + +#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, false) + +#define 
SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, true) + +#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, true) + +#define SMC_STAT_INC(_smc, type) \ +do { \ + typeof(_smc) __smc = _smc; \ + bool is_smcd = !(__smc)->conn.lnk; \ + struct net *net = sock_net(&(__smc)->sk); \ + struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \ + if ((is_smcd)) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ + else \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \ +} \ +while (0) + +#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \ +do { \ + typeof(_aclc) acl = (_aclc); \ + bool is_v2 = (acl->hdr.version == SMC_V2); \ + bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \ +} \ +while (0) + +#define SMC_STAT_SERV_SUCC_INC(net, _ini) \ +do { \ + typeof(_ini) i = (_ini); \ + bool is_smcd = (i->is_smcd); \ + u8 version = is_smcd ? i->smcd_version : i->smcr_version; \ + bool is_v2 = (version & SMC_V2); \ + typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \ +} \ +while (0) + +int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_stats_init(struct net *net); +void smc_stats_exit(struct net *net); + +#endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 0000000000..5cbc18c6e6 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
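/*
 * Illustrative sketch, not from the patch above: the SMC_STAT_PAYLOAD_SUB
 * and SMC_STAT_RMB_SIZE_SUB macros in smc_stats.h bin a byte count into
 * the 8K..>1M histogram buckets via fls((len - 1) >> 13), clamped to
 * SMC_BUF_MAX - 1. The same mapping as a standalone helper with an
 * open-coded fls() (bucket indices follow the SMC_BUF_* enum):
 */
#include <stdint.h>

static int demo_size_bucket(uint64_t len)
{
	uint64_t v;
	int pos = 0;

	if (!len)
		return 0;	/* the macros skip len <= 0 before binning */
	v = (len - 1) >> 13;	/* 13 = log2(8K), the smallest bucket */
	while (v) {		/* open-coded fls64(): index of highest set bit */
		pos++;
		v >>= 1;
	}
	return pos <= 8 ? pos : 8;	/* clamp to SMC_BUF_G_1024K */
}

/*
 * demo_size_bucket(8192)  == 0 -> SMC_BUF_8K
 * demo_size_bucket(8193)  == 1 -> SMC_BUF_16K
 * demo_size_bucket(70000) == 4 -> SMC_BUF_128K
 */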
+ * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/init.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_llc.h" +#include "smc_sysctl.h" + +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); + +static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "smcr_testlink_time", + .data = &init_net.smc.sysctl_smcr_testlink_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, + }, + { } +}; + +int __net_init smc_sysctl_net_init(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, + ARRAY_SIZE(smc_table)); + if (!net->smc.smc_hdr) + goto err_reg; + + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; + WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); + WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +void __net_exit smc_sysctl_net_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->smc.smc_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->smc.smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 0000000000..0becc11bd2 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); + +#else + +static inline int smc_sysctl_net_init(struct net *net) +{ + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; +} + +static inline void smc_sysctl_net_exit(struct net *net) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c new file mode 100644 index 0000000000..8d47ced5a4 --- /dev/null +++ b/net/smc/smc_tracepoint.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define CREATE_TRACE_POINTS +#include "smc_tracepoint.h" + +EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback); +EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg); +EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg); +EXPORT_TRACEPOINT_SYMBOL(smcr_link_down); diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h new file mode 100644 index 0000000000..9fc5e586d2 --- /dev/null +++ b/net/smc/smc_tracepoint.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM smc + +#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SMC_H + +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/tracepoint.h> +#include <net/ipv6.h> +#include "smc.h" +#include "smc_core.h" + +TRACE_EVENT(smc_switch_to_fallback, + + TP_PROTO(const struct smc_sock *smc, int fallback_rsn), + + TP_ARGS(smc, fallback_rsn), + + TP_STRUCT__entry( + __field(const void *, sk) + __field(const void *, clcsk) + __field(u64, net_cookie) + __field(int, fallback_rsn) + ), + + TP_fast_assign( + const struct sock *sk = &smc->sk; + const struct sock *clcsk = smc->clcsock->sk; + + __entry->sk = sk; + __entry->clcsk = clcsk; + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->fallback_rsn = fallback_rsn; + ), + + TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", + __entry->sk, __entry->clcsk, + __entry->net_cookie, __entry->fallback_rsn) +); + +DECLARE_EVENT_CLASS(smc_msg_event, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len), + + TP_STRUCT__entry( + __field(const void *, smc) + __field(u64, net_cookie) + __field(size_t, len) + __string(name, smc->conn.lnk->ibname) + ), + + TP_fast_assign( + const struct sock *sk = &smc->sk; + + __entry->smc = smc; + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->len = len; + __assign_str(name, smc->conn.lnk->ibname); + ), + + TP_printk("smc=%p net=%llu len=%zu dev=%s", + __entry->smc, __entry->net_cookie, + __entry->len, __get_str(name)) +); + +DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +TRACE_EVENT(smcr_link_down, + + TP_PROTO(const struct smc_link *lnk, void *location), + + TP_ARGS(lnk, location), + + TP_STRUCT__entry( + __field(const void *, lnk) + __field(const void *, lgr) + __field(u64, net_cookie) + __field(int, state) + __string(name, lnk->ibname) + __field(void *, location) + ), + + TP_fast_assign( + const struct smc_link_group *lgr = lnk->lgr; + + __entry->lnk = lnk; + __entry->lgr = lgr; + __entry->net_cookie = lgr->net->net_cookie; + __entry->state = lnk->state; + __assign_str(name, lnk->ibname); + __entry->location = location; + ), + + 
TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", + __entry->lnk, __entry->lgr, __entry->net_cookie, + __entry->state, __get_str(name), + __entry->location) +); + +#endif /* _TRACE_SMC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE smc_tracepoint + +#include <trace/define_trace.h> diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c new file mode 100644 index 0000000000..3b0ff3b589 --- /dev/null +++ b/net/smc/smc_tx.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer. + * Producer: + * Copy user space data into send buffer, if send buffer space available. + * Consumer: + * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available. + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <linux/sched/signal.h> + +#include <net/sock.h> +#include <net/tcp.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_close.h" +#include "smc_ism.h" +#include "smc_tx.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" + +#define SMC_TX_WORK_DELAY 0 + +/***************************** sndbuf producer *******************************/ + +/* callback implementation for sk.sk_write_space() + * to wakeup sndbuf producers that blocked with smc_tx_wait(). + * called under sk_socket lock. + */ +static void smc_tx_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct smc_sock *smc = smc_sk(sk); + struct socket_wq *wq; + + /* similar to sk_stream_write_space */ + if (atomic_read(&smc->conn.sndbuf_space) && sock) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) + SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); + clear_bit(SOCK_NOSPACE, &sock->flags); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, + EPOLLOUT | EPOLLWRNORM | + EPOLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} + +/* Wakeup sndbuf producers that blocked with smc_tx_wait(). + * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). 
+ */ +void smc_tx_sndbuf_nonfull(struct smc_sock *smc) +{ + if (smc->sk.sk_socket && + test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) + smc->sk.sk_write_space(&smc->sk); +} + +/* blocks sndbuf producer until at least one byte of free space available + * or urgent Byte was consumed + */ +static int smc_tx_wait(struct smc_sock *smc, int flags) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + long timeo; + int rc = 0; + + /* similar to sk_stream_wait_memory */ + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || + conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { + rc = -EPIPE; + break; + } + if (smc_cdc_rxed_any_close(conn)) { + rc = -ECONNRESET; + break; + } + if (!timeo) { + /* ensure EPOLLOUT is subsequently generated */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + rc = -EAGAIN; + break; + } + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) + break; /* at least 1 byte of free & no urgent data */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk_wait_event(sk, &timeo, + READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + smc_cdc_rxed_any_close(conn) || + (atomic_read(&conn->sndbuf_space) && + !conn->urg_tx_pend), + &wait); + } + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, + sock_net(&smc->sk)->smc.sysctl_autocorking_size); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc)) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + +/* sndbuf producer: main API called by socket layer. + * called under sock lock. 
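/*
 * Illustrative sketch, not from the patch above: smc_should_autocork()
 * keeps batching while an earlier CDC send is still in flight and the
 * batch stays below min(sndbuf/2, sysctl autocorking_size). The decision
 * restated as a pure function over plain integers (all names made up):
 */
#include <stdbool.h>
#include <stdint.h>

static bool demo_should_autocork(uint32_t pending_cdc_wrs,
				 uint32_t prepared_bytes,
				 uint32_t sndbuf_len,
				 uint32_t autocork_limit)
{
	uint32_t corking_size = sndbuf_len / 2;

	if (autocork_limit < corking_size)
		corking_size = autocork_limit;
	if (pending_cdc_wrs == 0)		/* first message never corks */
		return false;
	if (prepared_bytes > corking_size)	/* cap burst size */
		return false;
	return true;
}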
+ */ +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) +{ + size_t copylen, send_done = 0, send_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor prep; + struct sock *sk = &smc->sk; + char *sndbuf_base; + int tx_cnt_prep; + int writespace; + int rc, chunk; + + /* This should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + rc = -EPIPE; + goto out_err; + } + + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + + if (len > conn->sndbuf_desc->len) + SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); + + if (len > conn->peer_rmbe_size) + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); + + if (msg->msg_flags & MSG_OOB) + SMC_STAT_INC(smc, urg_data_cnt); + + while (msg_data_left(msg)) { + if (smc->sk.sk_shutdown & SEND_SHUTDOWN || + (smc->sk.sk_err == ECONNABORTED) || + conn->killed) + return -EPIPE; + if (smc_cdc_rxed_any_close(conn)) + return send_done ?: -ECONNRESET; + + if (msg->msg_flags & MSG_OOB) + conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; + + if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { + if (send_done) + return send_done; + rc = smc_tx_wait(smc, msg->msg_flags); + if (rc) + goto out_err; + continue; + } + + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_tx_wait above */ + writespace = atomic_read(&conn->sndbuf_space); + /* not more than what user space asked for */ + copylen = min_t(size_t, send_remaining, writespace); + /* determine start of sndbuf */ + sndbuf_base = conn->sndbuf_desc->cpu_addr; + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); + tx_cnt_prep = prep.count; + /* determine chunks where to write into sndbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - + tx_cnt_prep); + chunk_len_sum = chunk_len; + chunk_off = tx_cnt_prep; + for (chunk = 0; chunk < 2; chunk++) { + rc = memcpy_from_msg(sndbuf_base + chunk_off, + msg, chunk_len); + if (rc) { + smc_sndbuf_sync_sg_for_device(conn); + if (send_done) + return send_done; + goto out_err; + } + send_done += chunk_len; + send_remaining -= chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in send ring buffer */ + } + smc_sndbuf_sync_sg_for_device(conn); + /* update cursors */ + smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); + smc_curs_copy(&conn->tx_curs_prep, &prep, conn); + /* increased in send tasklet smc_cdc_tx_handler() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + /* since we just produced more new data into sndbuf, + * trigger sndbuf consumer: RDMA write into peer RMBE and CDC + */ + if ((msg->msg_flags & MSG_OOB) && !send_remaining) + conn->urg_tx_pend = true; + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion + */ + if (!smc_tx_should_cork(smc, msg)) + smc_tx_sndbuf_nonempty(conn); + + trace_smc_tx_sendmsg(smc, copylen); + } /* while (msg_data_left(msg)) */ + + return send_done; + +out_err: + rc = sk_stream_error(sk, msg->msg_flags, rc); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(rc == -EAGAIN)) + 
sk->sk_write_space(sk); + return rc; +} + +/***************************** sndbuf consumer *******************************/ + +/* sndbuf consumer: actual data transfer of one target chunk with ISM write */ +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal) +{ + int rc; + + rc = smc_ism_write(conn->lgr->smcd, conn->peer_token, + conn->peer_rmbe_idx, signal, conn->tx_off + offset, + data, len); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + +/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ +static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, + int num_sges, struct ib_rdma_wr *rdma_wr) +{ + struct smc_link_group *lgr = conn->lgr; + struct smc_link *link = conn->lnk; + int rc; + + rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); + rdma_wr->wr.num_sge = num_sges; + rdma_wr->remote_addr = + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + + /* RMBE within RMB */ + conn->tx_off + + /* offset within RMBE */ + peer_rmbe_offset; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; + rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); + if (rc) + smcr_link_down_cond_sched(link); + return rc; +} + +/* sndbuf consumer */ +static inline void smc_tx_advance_cursors(struct smc_connection *conn, + union smc_host_cursor *prod, + union smc_host_cursor *sent, + size_t len) +{ + smc_curs_add(conn->peer_rmbe_size, prod, len); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + /* data in flight reduces usable snd_wnd */ + atomic_sub(len, &conn->peer_rmbe_space); + /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + smc_curs_add(conn->sndbuf_desc->len, sent, len); +} + +/* SMC-R helper for smc_tx_rdma_writes() */ +static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len, + struct smc_rdma_wr *wr_rdma_buf) +{ + struct smc_link *link = conn->lnk; + + dma_addr_t dma_addr = + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + int src_len_sum = src_len, dst_len_sum = dst_len; + int sent_count = src_off; + int srcchunk, dstchunk; + int num_sges; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; + struct ib_sge *sge = wr->wr.sg_list; + u64 base_addr = dma_addr; + + if (dst_len < link->qp_attr.cap.max_inline_data) { + base_addr = virt_addr; + wr->wr.send_flags |= IB_SEND_INLINE; + } else { + wr->wr.send_flags &= ~IB_SEND_INLINE; + } + + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? 
+ (virt_addr + src_off) : (base_addr + src_off); + sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; + num_sges++; + + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - + sent_count); + src_len_sum = src_len; + } + return 0; +} + +/* SMC-D helper for smc_tx_rdma_writes() */ +static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + int src_len_sum = src_len, dst_len_sum = dst_len; + int srcchunk, dstchunk; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + void *data = conn->sndbuf_desc->cpu_addr + src_off; + + rc = smcd_tx_ism_write(conn, data, src_len, dst_off + + sizeof(struct smcd_cdc_msg), 0); + if (rc) + return rc; + dst_off += src_len; + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); + src_len_sum = src_len; + } + return 0; +} + +/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; + * usable snd_wnd as max transmit + */ +static int smc_tx_rdma_writes(struct smc_connection *conn, + struct smc_rdma_wr *wr_rdma_buf) +{ + size_t len, src_len, dst_off, dst_len; /* current chunk values */ + union smc_host_cursor sent, prep, prod, cons; + struct smc_cdc_producer_flags *pflags; + int to_send, rmbespace; + int rc; + + /* source: sndbuf */ + smc_curs_copy(&sent, &conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); + /* cf. wmem_alloc - (snd_max - snd_una) */ + to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); + if (to_send <= 0) + return 0; + + /* destination: RMBE */ + /* cf. snd_wnd */ + rmbespace = atomic_read(&conn->peer_rmbe_space); + if (rmbespace <= 0) { + struct smc_sock *smc = container_of(conn, struct smc_sock, + conn); + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + return 0; + } + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + /* if usable snd_wnd closes ask peer to advertise once it opens again */ + pflags = &conn->local_tx_ctrl.prod_flags; + pflags->write_blocked = (to_send >= rmbespace); + /* cf. 
usable snd_wnd */ + len = min(to_send, rmbespace); + + /* initialize variables for first iteration of subsequent nested loop */ + dst_off = prod.count; + if (prod.wrap == cons.wrap) { + /* the filled destination area is unwrapped, + * hence the available free destination space is wrapped + * and we need 2 destination chunks of sum len; start with 1st + * which is limited by what's available in sndbuf + */ + dst_len = min_t(size_t, + conn->peer_rmbe_size - prod.count, len); + } else { + /* the filled destination area is wrapped, + * hence the available free destination space is unwrapped + * and we need a single destination chunk of entire len + */ + dst_len = len; + } + /* dst_len determines the maximum src_len */ + if (sent.count + dst_len <= conn->sndbuf_desc->len) { + /* unwrapped src case: single chunk of entire dst_len */ + src_len = dst_len; + } else { + /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ + src_len = conn->sndbuf_desc->len - sent.count; + } + + if (conn->lgr->is_smcd) + rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + else + rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len, wr_rdma_buf); + if (rc) + return rc; + + if (conn->urg_tx_pend && len == to_send) + pflags->urg_data_present = 1; + smc_tx_advance_cursors(conn, &prod, &sent, len); + /* update connection's cursors with advanced local cursors */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); + /* dst: peer RMBE */ + smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ + + return 0; +} + +/* Wakeup sndbuf consumers from any context (IRQ or process) + * since there is more data to transmit; usable snd_wnd as max transmit + */ +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; + struct smc_rdma_wr *wr_rdma_buf; + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int rc; + + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); + if (rc < 0) { + smc_wr_tx_link_put(link); + if (rc == -EBUSY) { + struct smc_sock *smc = + container_of(conn, struct smc_sock, conn); + + if (smc->sk.sk_err == ECONNABORTED) + return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; + rc = 0; + mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); + } + return rc; + } + + spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } + if (!pflags->urg_data_present) { + rc = smc_tx_rdma_writes(conn, wr_rdma_buf); + if (rc) { + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + goto out_unlock; + } + } + + rc = smc_cdc_msg_send(conn, wr_buf, pend); + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } + +out_unlock: + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_link_put(link); + return rc; +} + +static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + int rc = 0; + + spin_lock_bh(&conn->send_lock); + if (!pflags->urg_data_present) + rc = smc_tx_rdma_writes(conn, NULL); + if (!rc) + rc = smcd_cdc_msg_send(conn); + + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + 
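/*
 * Illustrative sketch, not from the patch above: the chunk handling here
 * (and in smc_tx_sendmsg()/smc_rx_recvmsg()) always splits a transfer
 * against the ring buffer into at most two pieces: one up to the wrap
 * point, one restarting at offset 0. The bare pattern, with made-up
 * names; callers guarantee off < ring_len and len <= ring_len, as the
 * cursor/space accounting does in the kernel code:
 */
#include <stddef.h>
#include <string.h>

static void demo_ring_write(char *ring, size_t ring_len, size_t off,
			    const char *src, size_t len)
{
	size_t first = ring_len - off;		/* room before the wrap */

	if (first > len)
		first = len;
	memcpy(ring + off, src, first);		/* 1st (possibly only) chunk */
	memcpy(ring, src + first, len - first);	/* 2nd chunk, may be 0 bytes */
}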
pflags->urg_data_present = 0; + } + spin_unlock_bh(&conn->send_lock); + return rc; +} + +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } + + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } + if (conn->lgr->is_smcd) + rc = smcd_tx_sndbuf_nonempty(conn); + else + rc = smcr_tx_sndbuf_nonempty(conn); + + if (!rc) { + /* trigger socket release if connection is closing */ + smc_close_wake_tx_prepared(smc); + } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + + return rc; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit. The caller + * must hold sock lock. + */ +void smc_tx_pending(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc; + + if (smc->sk.sk_err) + return; + + rc = smc_tx_sndbuf_nonempty(conn); + if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit in locked + * sock. 
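/*
 * Illustrative sketch, not from the patch above: smc_tx_sndbuf_nonempty()
 * lets only one caller push at a time; concurrent callers just bump
 * tx_pushing and return, and the active pusher loops again if the counter
 * did not drop back to zero. The same pattern with C11 atomics and
 * made-up names:
 */
#include <stdatomic.h>

static atomic_int demo_pushing;		/* counterpart of conn->tx_pushing */

static void demo_do_push(void)
{
	/* stand-in for the real transmit step (__smc_tx_sndbuf_nonempty()) */
}

static void demo_push(void)
{
	/* someone is already pushing; they will notice our request below */
	if (atomic_fetch_add(&demo_pushing, 1) + 1 > 1)
		return;

	do {
		/* collapse all queued requests into this round */
		atomic_store(&demo_pushing, 1);
		demo_do_push();
		/* nonzero result: someone asked again while we pushed */
	} while (atomic_fetch_sub(&demo_pushing, 1) - 1 != 0);
}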
+ */ +void smc_tx_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(to_delayed_work(work), + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_tx_pending(conn); + release_sock(&smc->sk); +} + +void smc_tx_consumer_update(struct smc_connection *conn, bool force) +{ + union smc_host_cursor cfed, cons, prod; + int sender_free = conn->rmb_desc->len; + int to_confirm; + + smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); + to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); + if (to_confirm > conn->rmbe_update_limit) { + smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); + sender_free = conn->rmb_desc->len - + smc_curs_diff_large(conn->rmb_desc->len, + &cfed, &prod); + } + + if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + force || + ((to_confirm > conn->rmbe_update_limit) && + ((sender_free <= (conn->rmb_desc->len / 2)) || + conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; + if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && + !conn->killed) { + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); + return; + } + } + if (conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/***************************** send initialize *******************************/ + +/* Initialize send properties on connection establishment. NB: not __init! */ +void smc_tx_init(struct smc_sock *smc) +{ + smc->sk.sk_write_space = smc_tx_write_space; +} diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h new file mode 100644 index 0000000000..a59f370b8b --- /dev/null +++ b/net/smc/smc_tx.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer + * + * Copyright IBM Corp. 
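/*
 * Illustrative sketch, not from the patch above: smc_tx_consumer_update()
 * only emits a CDC consumer-cursor update when the peer asked for one, a
 * caller forces it, or enough bytes were consumed AND the sender is short
 * on space (or flagged write_blocked). The decision as a pure function
 * with made-up parameter names:
 */
#include <stdbool.h>
#include <stdint.h>

static bool demo_send_cons_update(bool peer_requested, bool force,
				  bool peer_write_blocked,
				  uint32_t to_confirm, uint32_t update_limit,
				  uint32_t sender_free, uint32_t rmb_len)
{
	if (peer_requested || force)
		return true;
	return to_confirm > update_limit &&
	       (sender_free <= rmb_len / 2 || peer_write_blocked);
}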
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_TX_H +#define SMC_TX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" +#include "smc_cdc.h" + +static inline int smc_tx_prepared_sends(struct smc_connection *conn) +{ + union smc_host_cursor sent, prep; + + smc_curs_copy(&sent, &conn->tx_curs_sent, conn); + smc_curs_copy(&prep, &conn->tx_curs_prep, conn); + return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); +} + +void smc_tx_pending(struct smc_connection *conn); +void smc_tx_work(struct work_struct *work); +void smc_tx_init(struct smc_sock *smc); +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sndbuf_nonempty(struct smc_connection *conn); +void smc_tx_sndbuf_nonfull(struct smc_sock *smc); +void smc_tx_consumer_update(struct smc_connection *conn, bool force); +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal); + +#endif /* SMC_TX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c new file mode 100644 index 0000000000..0021065a60 --- /dev/null +++ b/net/smc/smc_wr.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Work requests (WR) of type ib_post_send or ib_post_recv respectively + * are submitted to either RC SQ or RC RQ respectively + * (reliably connected send/receive queue) + * and become work queue entries (WQEs). + * While an SQ WR/WQE is pending, we track it until transmission completion. + * Through a send or receive completion queue (CQ) respectively, + * we get completion queue entries (CQEs) [aka work completions (WCs)]. + * Since the CQ callback is called from IRQ context, we split work by using + * bottom halves implemented by tasklets. + * + * SMC uses this to exchange LLC (link layer control) + * and CDC (connection data control) messages. + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#include <linux/atomic.h> +#include <linux/hashtable.h> +#include <linux/wait.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_wr.h" + +#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. 
queue elements in 1 poll */ + +#define SMC_WR_RX_HASH_BITS 4 +static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS); +static DEFINE_SPINLOCK(smc_wr_rx_hash_lock); + +struct smc_wr_tx_pend { /* control data for a pending send request */ + u64 wr_id; /* work request id sent */ + smc_wr_tx_handler handler; + enum ib_wc_status wc_status; /* CQE status */ + struct smc_link *link; + u32 idx; + struct smc_wr_tx_pend_priv priv; + u8 compl_requested; +}; + +/******************************** send queue *********************************/ + +/*------------------------------- completion --------------------------------*/ + +/* returns true if at least one tx work request is pending on the given link */ +static inline bool smc_wr_is_tx_pend(struct smc_link *link) +{ + return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt); +} + +/* wait till all pending tx work requests on the given link are completed */ +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +{ + wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); +} + +static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) +{ + u32 i; + + for (i = 0; i < link->wr_tx_cnt; i++) { + if (link->wr_tx_pends[i].wr_id == wr_id) + return i; + } + return link->wr_tx_cnt; +} + +static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) +{ + struct smc_wr_tx_pend pnd_snd; + struct smc_link *link; + u32 pnd_snd_idx; + + link = wc->qp->qp_context; + + if (wc->opcode == IB_WC_REG_MR) { + if (wc->status) + link->wr_reg_state = FAILED; + else + link->wr_reg_state = CONFIRMED; + smc_wr_wakeup_reg_wait(link); + return; + } + + pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); + if (pnd_snd_idx == link->wr_tx_cnt) { + if (link->lgr->smc_version != SMC_V2 || + link->wr_tx_v2_pend->wr_id != wc->wr_id) + return; + link->wr_tx_v2_pend->wc_status = wc->status; + memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } else { + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], + sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + } + + if (wc->status) { + if (link->lgr->smc_version == SMC_V2) { + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } + /* terminate link */ + smcr_link_down_cond_sched(link); + } + if (pnd_snd.handler) + pnd_snd.handler(&pnd_snd.priv, link, wc->status); + wake_up(&link->wr_tx_wait); +} + +static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) +{ + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int i = 0, rc; + int polled = 0; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + for (i = 0; i < rc; i++) + 
smc_wr_tx_process_cqe(&wc[i]); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->send_tasklet); +} + +/*---------------------------- request submission ---------------------------*/ + +static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) +{ + *idx = link->wr_tx_cnt; + if (!smc_link_sendable(link)) + return -ENOLINK; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { + if (!test_and_set_bit(*idx, link->wr_tx_mask)) + return 0; + } + *idx = link->wr_tx_cnt; + return -EBUSY; +} + +/** + * smc_wr_tx_get_free_slot() - returns buffer for message assembly, + * and sets info for pending transmit tracking + * @link: Pointer to smc_link used to later send the message. + * @handler: Send completion handler function pointer. + * @wr_buf: Out value returns pointer to message buffer. + * @wr_rdma_buf: Out value returns pointer to rdma work request. + * @wr_pend_priv: Out value returns pointer serving as handler context. + * + * Return: 0 on success, or -errno on error. + */ +int smc_wr_tx_get_free_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wr_rdma_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_wr_tx_pend *wr_pend; + u32 idx = link->wr_tx_cnt; + struct ib_send_wr *wr_ib; + u64 wr_id; + int rc; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + if (in_softirq() || lgr->terminating) { + rc = smc_wr_tx_get_free_slot_index(link, &idx); + if (rc) + return rc; + } else { + rc = wait_event_interruptible_timeout( + link->wr_tx_wait, + !smc_link_sendable(link) || + lgr->terminating || + (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), + SMC_WR_TX_WAIT_FREE_SLOT_TIME); + if (!rc) { + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); + return -EPIPE; + } + if (idx == link->wr_tx_cnt) + return -EPIPE; + } + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = &link->wr_tx_pends[idx]; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = idx; + wr_ib = &link->wr_tx_ibs[idx]; + wr_ib->wr_id = wr_id; + *wr_buf = &link->wr_tx_bufs[idx]; + if (wr_rdma_buf) + *wr_rdma_buf = &link->wr_tx_rdmas[idx]; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + + if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) + return -EBUSY; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = link->wr_tx_v2_pend; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = link->wr_tx_cnt; + wr_ib = link->wr_tx_v2_ib; + wr_ib->wr_id = wr_id; + *wr_buf = link->lgr->wr_tx_buf_v2; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv) +{ + struct smc_wr_tx_pend *pend; + + pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); + if (pend->idx < link->wr_tx_cnt) { + u32 idx = pend->idx; + + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[idx], 0, + sizeof(link->wr_tx_pends[idx])); + 
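/*
 * Illustrative sketch, not from the patch above:
 * smc_wr_tx_get_free_slot_index() claims a send slot lock-free by scanning
 * wr_tx_mask with for_each_clear_bit() and winning it via
 * test_and_set_bit(); smc_wr_tx_put_slot() clears the bit and wakes
 * waiters. The core idea with C11 atomics and made-up names
 * (waiting/timeout omitted):
 */
#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_SLOTS 16			/* counterpart of link->wr_tx_cnt */

static atomic_bool demo_slot_busy[DEMO_SLOTS];	/* counterpart of wr_tx_mask */

static int demo_get_slot(void)		/* returns slot index, or -1 if all busy */
{
	for (int i = 0; i < DEMO_SLOTS; i++) {
		/* like test_and_set_bit(): only the winner sees "was clear" */
		if (!atomic_exchange(&demo_slot_busy[i], true))
			return i;
	}
	return -1;			/* kernel code returns -EBUSY here */
}

static void demo_put_slot(int i)
{
	/* like test_and_clear_bit(); the kernel also wakes wr_tx_wait */
	atomic_store(&demo_slot_busy[i], false);
}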
memset(&link->wr_tx_bufs[idx], 0, + sizeof(link->wr_tx_bufs[idx])); + test_and_clear_bit(idx, link->wr_tx_mask); + wake_up(&link->wr_tx_wait); + return 1; + } else if (link->lgr->smc_version == SMC_V2 && + pend->idx == link->wr_tx_cnt) { + /* Large v2 buffer */ + memset(&link->wr_tx_v2_pend, 0, + sizeof(link->wr_tx_v2_pend)); + memset(&link->lgr->wr_tx_buf_v2, 0, + sizeof(link->lgr->wr_tx_buf_v2)); + return 1; + } + + return 0; +} + +/* Send prepared WR slot via ib_post_send. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) +{ + struct smc_wr_tx_pend *pend; + int rc; + + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + pend = container_of(priv, struct smc_wr_tx_pend, priv); + rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); + if (rc) { + smc_wr_tx_put_slot(link, priv); + smcr_link_down_cond_sched(link); + } + return rc; +} + +int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + int len) +{ + int rc; + + link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); + if (rc) { + smc_wr_tx_put_slot(link, priv); + smcr_link_down_cond_sched(link); + } + return rc; +} + +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + u32 pnd_idx; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + pnd_idx = pend->idx; + init_completion(&link->wr_tx_compl[pnd_idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pnd_idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + +/* Register a memory region and wait for result. 
*/ +int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) +{ + int rc; + + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + link->wr_reg_state = POSTED; + link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; + link->wr_reg.mr = mr; + link->wr_reg.key = mr->rkey; + rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); + if (rc) + return rc; + + percpu_ref_get(&link->wr_reg_refs); + rc = wait_event_interruptible_timeout(link->wr_reg_wait, + (link->wr_reg_state != POSTED), + SMC_WR_REG_MR_WAIT_TIME); + percpu_ref_put(&link->wr_reg_refs); + if (!rc) { + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); + return -EPIPE; + } + if (rc == -ERESTARTSYS) + return -EINTR; + switch (link->wr_reg_state) { + case CONFIRMED: + rc = 0; + break; + case FAILED: + rc = -EIO; + break; + case POSTED: + rc = -EPIPE; + break; + } + return rc; +} + +/****************************** receive queue ********************************/ + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) +{ + struct smc_wr_rx_handler *h_iter; + int rc = 0; + + spin_lock(&smc_wr_rx_hash_lock); + hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) { + if (h_iter->type == handler->type) { + rc = -EEXIST; + goto out_unlock; + } + } + hash_add(smc_wr_rx_hash, &handler->list, handler->type); +out_unlock: + spin_unlock(&smc_wr_rx_hash_lock); + return rc; +} + +/* Demultiplex a received work request based on the message type to its handler. + * Relies on smc_wr_rx_hash having been completely filled before any IB WRs, + * and not being modified any more afterwards so we don't need to lock it. + */ +static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_wr_rx_handler *handler; + struct smc_wr_rx_hdr *wr_rx; + u64 temp_wr_id; + u32 index; + + if (wc->byte_len < sizeof(*wr_rx)) + return; /* short message */ + temp_wr_id = wc->wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { + if (handler->type == wr_rx->type) + handler->handler(wc, wr_rx); + } +} + +static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +{ + struct smc_link *link; + int i; + + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; + link->wr_rx_id_compl = wc[i].wr_id; + if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; + smc_wr_rx_demultiplex(&wc[i]); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + /* handle status errors */ + switch (wc[i].status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + smcr_link_down_cond_sched(link); + if (link->wr_rx_id_compl == link->wr_rx_id) + wake_up(&link->wr_rx_empty_wait); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ + break; + } + } + } +} + +static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) +{ + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int polled = 0; + int rc; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + smc_wr_rx_process_cqes(&wc[0], rc); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_rx_cq_handler(struct ib_cq 
*ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->recv_tasklet); +} + +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + return rc; +} + +/***************************** init, exit, misc ******************************/ + +void smc_wr_remember_qp_attr(struct smc_link *lnk) +{ + struct ib_qp_attr *attr = &lnk->qp_attr; + struct ib_qp_init_attr init_attr; + + memset(attr, 0, sizeof(*attr)); + memset(&init_attr, 0, sizeof(init_attr)); + ib_query_qp(lnk->roce_qp, attr, + IB_QP_STATE | + IB_QP_CUR_STATE | + IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_RQ_PSN | + IB_QP_ALT_PATH | + IB_QP_MIN_RNR_TIMER | + IB_QP_SQ_PSN | + IB_QP_PATH_MIG_STATE | + IB_QP_CAP | + IB_QP_DEST_QPN, + &init_attr); + + lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->qp_attr.cap.max_send_wr); + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->qp_attr.cap.max_recv_wr); +} + +static void smc_wr_init_sge(struct smc_link *lnk) +{ + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); + u32 i; + + for (i = 0; i < lnk->wr_tx_cnt; i++) { + lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) : + lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; + lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey = + lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_ibs[i].next = NULL; + lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; + lnk->wr_tx_ibs[i].num_sge = 1; + lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; + lnk->wr_tx_ibs[i].send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send_inline) + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; + lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; + lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; + lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge; + lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = + lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; + } + + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; + lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; + lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; + + lnk->wr_tx_v2_ib->next = NULL; + lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; + lnk->wr_tx_v2_ib->num_sge = 1; + lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. + * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer + * and the same buffer for all sges. When a larger message arrived then + * the content of the first small sge is copied to the beginning of + * the larger spillover buffer, allowing easy data mapping. 
+ */ + for (i = 0; i < lnk->wr_rx_cnt; i++) { + int x = i * sges_per_buf; + + lnk->wr_rx_sges[x].addr = + lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_sges[x + 1].addr = + lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].length = + SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].lkey = + lnk->roce_pd->local_dma_lkey; + } + lnk->wr_rx_ibs[i].next = NULL; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; + lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + } + lnk->wr_reg.wr.next = NULL; + lnk->wr_reg.wr.num_sge = 0; + lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED; + lnk->wr_reg.wr.opcode = IB_WR_REG_MR; + lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; +} + +void smc_wr_free_link(struct smc_link *lnk) +{ + struct ib_device *ibdev; + + if (!lnk->smcibdev) + return; + ibdev = lnk->smcibdev->ibdev; + + smc_wr_drain_cq(lnk); + smc_wr_wakeup_reg_wait(lnk); + smc_wr_wakeup_tx_wait(lnk); + + smc_wr_tx_wait_no_pending_sends(lnk); + percpu_ref_kill(&lnk->wr_reg_refs); + wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_kill(&lnk->wr_tx_refs); + wait_for_completion(&lnk->tx_ref_comp); + + if (lnk->wr_rx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; + } + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } + if (lnk->wr_tx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + lnk->wr_tx_dma_addr = 0; + } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } +} + +void smc_wr_free_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return; + + kfree(lgr->wr_rx_buf_v2); + lgr->wr_rx_buf_v2 = NULL; + kfree(lgr->wr_tx_buf_v2); + lgr->wr_tx_buf_v2 = NULL; +} + +void smc_wr_free_link_mem(struct smc_link *lnk) +{ + kfree(lnk->wr_tx_v2_ib); + lnk->wr_tx_v2_ib = NULL; + kfree(lnk->wr_tx_v2_sge); + lnk->wr_tx_v2_sge = NULL; + kfree(lnk->wr_tx_v2_pend); + lnk->wr_tx_v2_pend = NULL; + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; + kfree(lnk->wr_tx_pends); + lnk->wr_tx_pends = NULL; + bitmap_free(lnk->wr_tx_mask); + lnk->wr_tx_mask = NULL; + kfree(lnk->wr_tx_sges); + lnk->wr_tx_sges = NULL; + kfree(lnk->wr_tx_rdma_sges); + lnk->wr_tx_rdma_sges = NULL; + kfree(lnk->wr_rx_sges); + lnk->wr_rx_sges = NULL; + kfree(lnk->wr_tx_rdmas); + lnk->wr_tx_rdmas = NULL; + kfree(lnk->wr_rx_ibs); + lnk->wr_rx_ibs = NULL; + kfree(lnk->wr_tx_ibs); + lnk->wr_tx_ibs = NULL; + kfree(lnk->wr_tx_bufs); + lnk->wr_tx_bufs = NULL; + kfree(lnk->wr_rx_bufs); + lnk->wr_rx_bufs = NULL; +} + +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return 0; + + lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_rx_buf_v2) + return -ENOMEM; + lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_tx_buf_v2) { + kfree(lgr->wr_rx_buf_v2); + return -ENOMEM; + } + return 0; +} + +int smc_wr_alloc_link_mem(struct smc_link *link) +{ + int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 
2 : 1; + + /* allocate link related memory */ + link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + if (!link->wr_tx_bufs) + goto no_mem; + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + GFP_KERNEL); + if (!link->wr_rx_bufs) + goto no_mem_wr_tx_bufs; + link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), + GFP_KERNEL); + if (!link->wr_tx_ibs) + goto no_mem_wr_rx_bufs; + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_ibs[0]), + GFP_KERNEL); + if (!link->wr_rx_ibs) + goto no_mem_wr_tx_ibs; + link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_rdmas[0]), + GFP_KERNEL); + if (!link->wr_tx_rdmas) + goto no_mem_wr_rx_ibs; + link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_rdma_sges[0]), + GFP_KERNEL); + if (!link->wr_tx_rdma_sges) + goto no_mem_wr_tx_rdmas; + link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), + GFP_KERNEL); + if (!link->wr_tx_sges) + goto no_mem_wr_tx_rdma_sges; + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_sges[0]) * sges_per_buf, + GFP_KERNEL); + if (!link->wr_rx_sges) + goto no_mem_wr_tx_sges; + link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); + if (!link->wr_tx_mask) + goto no_mem_wr_rx_sges; + link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_pends[0]), + GFP_KERNEL); + if (!link->wr_tx_pends) + goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; + + if (link->lgr->smc_version == SMC_V2) { + link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), + GFP_KERNEL); + if (!link->wr_tx_v2_ib) + goto no_mem_tx_compl; + link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), + GFP_KERNEL); + if (!link->wr_tx_v2_sge) + goto no_mem_v2_ib; + link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), + GFP_KERNEL); + if (!link->wr_tx_v2_pend) + goto no_mem_v2_sge; + } + return 0; + +no_mem_v2_sge: + kfree(link->wr_tx_v2_sge); +no_mem_v2_ib: + kfree(link->wr_tx_v2_ib); +no_mem_tx_compl: + kfree(link->wr_tx_compl); +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); +no_mem_wr_tx_mask: + kfree(link->wr_tx_mask); +no_mem_wr_rx_sges: + kfree(link->wr_rx_sges); +no_mem_wr_tx_sges: + kfree(link->wr_tx_sges); +no_mem_wr_tx_rdma_sges: + kfree(link->wr_tx_rdma_sges); +no_mem_wr_tx_rdmas: + kfree(link->wr_tx_rdmas); +no_mem_wr_rx_ibs: + kfree(link->wr_rx_ibs); +no_mem_wr_tx_ibs: + kfree(link->wr_tx_ibs); +no_mem_wr_rx_bufs: + kfree(link->wr_rx_bufs); +no_mem_wr_tx_bufs: + kfree(link->wr_tx_bufs); +no_mem: + return -ENOMEM; +} + +void smc_wr_remove_dev(struct smc_ib_device *smcibdev) +{ + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); +} + +void smc_wr_add_dev(struct smc_ib_device *smcibdev) +{ + tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); +} + +static void smcr_wr_tx_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs); + + complete(&lnk->tx_ref_comp); +} + +static void smcr_wr_reg_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs); + + complete(&lnk->reg_ref_comp); +} + +int smc_wr_create_link(struct smc_link *lnk) +{ + struct ib_device *ibdev = lnk->smcibdev->ibdev; + int rc = 0; + + smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); + lnk->wr_rx_id = 0; + lnk->wr_rx_dma_addr = 
ib_dma_map_single( + ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { + lnk->wr_rx_dma_addr = 0; + rc = -EIO; + goto out; + } + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) { + lnk->wr_tx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + } + lnk->wr_tx_dma_addr = ib_dma_map_single( + ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) { + rc = -EIO; + goto dma_unmap; + } + smc_wr_init_sge(lnk); + bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); + init_waitqueue_head(&lnk->wr_tx_wait); + rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); + if (rc) + goto dma_unmap; + init_completion(&lnk->tx_ref_comp); + init_waitqueue_head(&lnk->wr_reg_wait); + rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); + if (rc) + goto dma_unmap; + init_completion(&lnk->reg_ref_comp); + init_waitqueue_head(&lnk->wr_rx_empty_wait); + return rc; + +dma_unmap: + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; +out: + return rc; +} diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h new file mode 100644 index 0000000000..f3008dda22 --- /dev/null +++ b/net/smc/smc_wr.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#ifndef SMC_WR_H +#define SMC_WR_H + +#include <linux/atomic.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_core.h" + +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ + +#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) + +#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ + +#define SMC_WR_TX_PEND_PRIV_SIZE 32 + +struct smc_wr_tx_pend_priv { + u8 priv[SMC_WR_TX_PEND_PRIV_SIZE]; +}; + +typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *, + struct smc_link *, + enum ib_wc_status); + +typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *, + unsigned long); + +typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *); + +struct smc_wr_rx_handler { + struct hlist_node list; /* hash table collision resolution */ + void (*handler)(struct ib_wc *, void *); + u8 type; +}; + +/* Only used by RDMA write WRs. 
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly + */ +static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) +{ + return atomic_long_inc_return(&link->wr_tx_id); +} + +static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) +{ + atomic_long_set(wr_tx_id, val); +} + +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_sendable(link)) + return false; + percpu_ref_get(&link->wr_tx_refs); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + percpu_ref_put(&link->wr_tx_refs); +} + +static inline void smc_wr_drain_cq(struct smc_link *lnk) +{ + wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); +} + +static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) +{ + wake_up_all(&lnk->wr_tx_wait); +} + +static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) +{ + wake_up(&lnk->wr_reg_wait); +} + +/* post a new receive work request to fill a completed old work request entry */ +static inline int smc_wr_rx_post(struct smc_link *link) +{ + int rc; + u64 wr_id, temp_wr_id; + u32 index; + + wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + temp_wr_id = wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + link->wr_rx_ibs[index].wr_id = wr_id; + rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + return rc; +} + +int smc_wr_create_link(struct smc_link *lnk); +int smc_wr_alloc_link_mem(struct smc_link *lnk); +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); +void smc_wr_free_link(struct smc_link *lnk); +void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_free_lgr_mem(struct smc_link_group *lgr); +void smc_wr_remember_qp_attr(struct smc_link *lnk); +void smc_wr_remove_dev(struct smc_ib_device *smcibdev); +void smc_wr_add_dev(struct smc_ib_device *smcibdev); + +int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_rdma_wr **wrs, + struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_v2_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *priv, int len); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); +int smc_wr_rx_post_init(struct smc_link *link); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr); + +#endif /* SMC_WR_H */ |
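
As a rough illustration of how the TX interface declared above is meant to be driven, the sketch below shows a hypothetical caller. example_send() and example_tx_handler() are invented names for this sketch only; the sequence (pin the link, reserve a slot, fill the 44-byte buffer, post, drop the link reference) follows what the prototypes and smc_wr.c imply, and it assumes the declarations from smc_wr.h are in scope.

/* Hypothetical caller sketch -- not part of the patch above. */
static void example_tx_handler(struct smc_wr_tx_pend_priv *priv,
			       struct smc_link *link,
			       enum ib_wc_status status)
{
	/* invoked from the send tasklet once the posted WR completes */
}

static int example_send(struct smc_link *link)
{
	struct smc_wr_tx_pend_priv *pend;
	struct smc_wr_buf *wr_buf;
	int rc;

	if (!smc_wr_tx_link_hold(link))	/* pin the link against teardown */
		return -ENOLINK;
	rc = smc_wr_tx_get_free_slot(link, example_tx_handler,
				     &wr_buf, NULL, &pend);
	if (rc)
		goto out;
	/* ... assemble the (SMC_WR_TX_SIZE byte) message in *wr_buf ... */
	rc = smc_wr_tx_send(link, pend);	/* puts the slot back on error */
out:
	smc_wr_tx_link_put(link);
	return rc;
}

When a caller must block until its completion handler has run, smc_wr_tx_send_wait() replaces smc_wr_tx_send() and additionally waits on the per-slot completion tracked in wr_tx_compl.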