From 10ebd8556b7990499c896a550e3d416b444211e6 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 02:23:07 +0000
Subject: [PATCH] add led

---
 kernel/net/smc/af_smc.c | 1584 ++++++++++++++++++++++++++++++++++++++++------------------
 1 files changed, 1,092 insertions(+), 492 deletions(-)

diff --git a/kernel/net/smc/af_smc.c b/kernel/net/smc/af_smc.c
index 4c904ab..8ab8492 100644
--- a/kernel/net/smc/af_smc.c
+++ b/kernel/net/smc/af_smc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
@@ -24,11 +25,17 @@
 #include <linux/in.h>
 #include <linux/sched/signal.h>
 #include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
 
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <net/smc.h>
 #include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "smc_netns.h"
 
 #include "smc.h"
 #include "smc_clc.h"
@@ -42,9 +49,15 @@
 #include "smc_rx.h"
 #include "smc_close.h"
 
-static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
-						 * creation
+static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
+						 * creation on server
 						 */
+static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
+						 * creation on client
+						 */
+
+struct workqueue_struct	*smc_hs_wq;	/* wq for handshake work */
+struct workqueue_struct	*smc_close_wq;	/* wq for close work */
 
 static void smc_tcp_listen_work(struct work_struct *);
 static void smc_connect_work(struct work_struct *);
@@ -115,21 +128,74 @@
 };
 EXPORT_SYMBOL_GPL(smc_proto6);
 
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+	if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
+		smc->clcsock->file->private_data = smc->sk.sk_socket;
+		smc->clcsock->file = NULL;
+	}
+}
+
+static int __smc_release(struct smc_sock *smc)
+{
+	struct sock *sk = &smc->sk;
+	int rc = 0;
+
+	if (!smc->use_fallback) {
+		rc = smc_close_active(smc);
+		sock_set_flag(sk, SOCK_DEAD);
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+	} else {
+		if (sk->sk_state != SMC_CLOSED) {
+			if (sk->sk_state != SMC_LISTEN &&
+			    sk->sk_state != SMC_INIT)
+				sock_put(sk); /* passive closing */
+			if (sk->sk_state == SMC_LISTEN) {
+				/* wake up clcsock accept */
+				rc = kernel_sock_shutdown(smc->clcsock,
+							  SHUT_RDWR);
+			}
+			sk->sk_state = SMC_CLOSED;
+			sk->sk_state_change(sk);
+		}
+		smc_restore_fallback_changes(smc);
+	}
+
+	sk->sk_prot->unhash(sk);
+
+	if (sk->sk_state == SMC_CLOSED) {
+		if (smc->clcsock) {
+			release_sock(sk);
+			smc_clcsock_release(smc);
+			lock_sock(sk);
+		}
+		if (!smc->use_fallback)
+			smc_conn_free(&smc->conn);
+	}
+
+	return rc;
+}
+
 static int smc_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
-	int rc = 0;
+	int old_state, rc = 0;
 
 	if (!sk)
 		goto out;
 
+	sock_hold(sk); /* sock_put below */
 	smc = smc_sk(sk);
 
+	old_state = sk->sk_state;
+
 	/* cleanup for a dangling non-blocking connect */
-	flush_work(&smc->connect_work);
-	kfree(smc->connect_info);
-	smc->connect_info = NULL;
+	if (smc->connect_nonblock && old_state == SMC_INIT)
+		tcp_abort(smc->clcsock->sk, ECONNABORTED);
+
+	if (cancel_work_sync(&smc->connect_work))
+		sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
 
 	if (sk->sk_state == SMC_LISTEN)
 		/* smc_close_non_accepted() is called and acquires
@@ -139,38 +205,18 @@
 	else
 		lock_sock(sk);
 
-	if (!smc->use_fallback) {
-		rc = smc_close_active(smc);
-		sock_set_flag(sk, SOCK_DEAD);
-		sk->sk_shutdown |= SHUTDOWN_MASK;
-	}
+	if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
+	    !smc->use_fallback)
+		smc_close_active_abort(smc);
 
-	sk->sk_prot->unhash(sk);
-
-	if (smc->clcsock) {
-		if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
-			/* wake up clcsock accept */
-			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
-		}
-		mutex_lock(&smc->clcsock_release_lock);
-		sock_release(smc->clcsock);
-		smc->clcsock = NULL;
-		mutex_unlock(&smc->clcsock_release_lock);
-	}
-	if (smc->use_fallback) {
-		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
-			sock_put(sk); /* passive closing */
-		sk->sk_state = SMC_CLOSED;
-		sk->sk_state_change(sk);
-	}
+	rc = __smc_release(smc);
 
 	/* detach socket */
 	sock_orphan(sk);
 	sock->sk = NULL;
-	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
-		smc_conn_free(&smc->conn);
 	release_sock(sk);
 
+	sock_put(sk); /* sock_hold above */
 	sock_put(sk); /* final sock_put */
 out:
 	return rc;
@@ -245,7 +291,7 @@
 
 	/* Check if socket is already active */
 	rc = -EINVAL;
-	if (sk->sk_state != SMC_INIT)
+	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
 		goto out_rel;
 
 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
@@ -289,7 +335,8 @@
 			     (1UL << SOCK_RXQ_OVFL) | \
 			     (1UL << SOCK_WIFI_STATUS) | \
 			     (1UL << SOCK_NOFCS) | \
-			     (1UL << SOCK_FILTER_LOCKED))
+			     (1UL << SOCK_FILTER_LOCKED) | \
+			     (1UL << SOCK_TSTAMP_NEW))
 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
  * clc socket (since smc is not called for these options from net/core)
  */
@@ -308,47 +355,61 @@
 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 }
 
-/* register a new rmb, optionally send confirm_rkey msg to register with peer */
-static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
-		       bool conf_rkey)
+/* register the new rmb on all links */
+static int smcr_lgr_reg_rmbs(struct smc_link *link,
+			     struct smc_buf_desc *rmb_desc)
 {
-	/* register memory region for new rmb */
-	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
-		rmb_desc->regerr = 1;
-		return -EFAULT;
+	struct smc_link_group *lgr = link->lgr;
+	int i, rc = 0;
+
+	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+	if (rc)
+		return rc;
+	/* protect against parallel smc_llc_cli_rkey_exchange() and
+	 * parallel smcr_link_reg_rmb()
+	 */
+	mutex_lock(&lgr->llc_conf_mutex);
+	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+		if (!smc_link_active(&lgr->lnk[i]))
+			continue;
+		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
+		if (rc)
+			goto out;
 	}
-	if (!conf_rkey)
-		return 0;
+
 	/* exchange confirm_rkey msg with peer */
-	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
-		rmb_desc->regerr = 1;
-		return -EFAULT;
+	rc = smc_llc_do_confirm_rkey(link, rmb_desc);
+	if (rc) {
+		rc = -EFAULT;
+		goto out;
 	}
-	return 0;
+	rmb_desc->is_conf_rkey = true;
+out:
+	mutex_unlock(&lgr->llc_conf_mutex);
+	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+	return rc;
 }
 
-static int smc_clnt_conf_first_link(struct smc_sock *smc)
+static int smcr_clnt_conf_first_link(struct smc_sock *smc)
 {
-	struct net *net = sock_net(smc->clcsock->sk);
-	struct smc_link_group *lgr = smc->conn.lgr;
-	struct smc_link *link;
-	int rest;
+	struct smc_link *link = smc->conn.lnk;
+	struct smc_llc_qentry *qentry;
 	int rc;
 
-	link = &lgr->lnk[SMC_SINGLE_LINK];
 	/* receive CONFIRM LINK request from server over RoCE fabric */
-	rest = wait_for_completion_interruptible_timeout(
-		&link->llc_confirm,
-		SMC_LLC_WAIT_FIRST_TIME);
-	if (rest <= 0) {
+	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+			      SMC_LLC_CONFIRM_LINK);
+	if (!qentry) {
 		struct smc_clc_msg_decline dclc;
 
 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
-				      SMC_CLC_DECLINE);
-		return rc;
+				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
 	}
-
-	if (link->llc_confirm_rc)
+	smc_llc_save_peer_uid(qentry);
+	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
+	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+	if (rc)
 		return SMC_CLC_DECL_RMBE_EC;
 
 	rc = smc_ib_modify_qp_rts(link);
@@ -357,60 +418,86 @@
 
 	smc_wr_remember_qp_attr(link);
 
-	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
+	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
 		return SMC_CLC_DECL_ERR_REGRMB;
+
+	/* confirm_rkey is implicit on 1st contact */
+	smc->conn.rmb_desc->is_conf_rkey = true;
 
 	/* send CONFIRM LINK response over RoCE fabric */
 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
 	if (rc < 0)
 		return SMC_CLC_DECL_TIMEOUT_CL;
 
-	/* receive ADD LINK request from server over RoCE fabric */
-	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
-							 SMC_LLC_WAIT_TIME);
-	if (rest <= 0) {
+	smc_llc_link_active(link);
+	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+	/* optional 2nd link, receive ADD LINK request from server */
+	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+			      SMC_LLC_ADD_LINK);
+	if (!qentry) {
 		struct smc_clc_msg_decline dclc;
 
 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
-				      SMC_CLC_DECLINE);
+				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+		if (rc == -EAGAIN)
+			rc = 0; /* no DECLINE received, go with one link */
 		return rc;
 	}
-
-	/* send add link reject message, only one link supported for now */
-	rc = smc_llc_send_add_link(link,
-				   link->smcibdev->mac[link->ibport - 1],
-				   link->gid, SMC_LLC_RESP);
-	if (rc < 0)
-		return SMC_CLC_DECL_TIMEOUT_AL;
-
-	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
-
+	smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
+	smc_llc_cli_add_link(link, qentry);
 	return 0;
 }
 
 static void smcr_conn_save_peer_info(struct smc_sock *smc,
 				     struct smc_clc_msg_accept_confirm *clc)
 {
-	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+	int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
 
-	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
-	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+	smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+	smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
 	smc->conn.peer_rmbe_size = bufsize;
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
 }
 
+static bool smc_isascii(char *hostname)
+{
+	int i;
+
+	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+		if (!isascii(hostname[i]))
+			return false;
+	return true;
+}
+
 static void smcd_conn_save_peer_info(struct smc_sock *smc,
 				     struct smc_clc_msg_accept_confirm *clc)
 {
-	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+	int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
 
-	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
-	smc->conn.peer_token = clc->token;
+	smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+	smc->conn.peer_token = clc->d0.token;
 	/* msg header takes up space in the buffer */
 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
+	if (clc->hdr.version > SMC_V1 &&
+	    (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
+		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+			(struct smc_clc_msg_accept_confirm_v2 *)clc;
+		struct smc_clc_first_contact_ext *fce =
+			(struct smc_clc_first_contact_ext *)
+				(((u8 *)clc_v2) + sizeof(*clc_v2));
+
+		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
+		       SMC_MAX_EID_LEN);
+		smc->conn.lgr->peer_os = fce->os_type;
+		smc->conn.lgr->peer_smc_release = fce->release;
+		if (smc_isascii(fce->hostname))
+			memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+			       SMC_MAX_HOSTNAME_LEN);
+	}
 }
 
 static void smc_conn_save_peer_info(struct smc_sock *smc,
@@ -425,26 +512,53 @@
 static void smc_link_save_peer_info(struct smc_link *link,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
-	link->peer_qpn = ntoh24(clc->qpn);
-	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
-	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
-	link->peer_psn = ntoh24(clc->psn);
-	link->peer_mtu = clc->qp_mtu;
+	link->peer_qpn = ntoh24(clc->r0.qpn);
+	memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+	link->peer_psn = ntoh24(clc->r0.psn);
+	link->peer_mtu = clc->r0.qp_mtu;
+}
+
+static void smc_switch_to_fallback(struct smc_sock *smc)
+{
+	wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
+	wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
+	unsigned long flags;
+
+	smc->use_fallback = true;
+	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
+		smc->clcsock->file = smc->sk.sk_socket->file;
+		smc->clcsock->file->private_data = smc->clcsock;
+		smc->clcsock->wq.fasync_list =
+			smc->sk.sk_socket->wq.fasync_list;
+
+		/* There may be some entries remaining in
+		 * smc socket->wq, which should be removed
+		 * to clcsocket->wq during the fallback.
+		 */
+		spin_lock_irqsave(&smc_wait->lock, flags);
+		spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
+		list_splice_init(&smc_wait->head, &clc_wait->head);
+		spin_unlock(&clc_wait->lock);
+		spin_unlock_irqrestore(&smc_wait->lock, flags);
+	}
 }
 
 /* fall back during connect */
 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
 {
-	smc->use_fallback = true;
+	smc_switch_to_fallback(smc);
 	smc->fallback_rsn = reason_code;
 	smc_copy_sock_settings_to_clc(smc);
+	smc->connect_nonblock = 0;
 	if (smc->sk.sk_state == SMC_INIT)
 		smc->sk.sk_state = SMC_ACTIVE;
 	return 0;
 }
 
 /* decline and fall back during connect */
-static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+					u8 version)
 {
 	int rc;
 
@@ -454,7 +568,7 @@
 		return reason_code;
 	}
 	if (reason_code != SMC_CLC_DECL_PEERDECL) {
-		rc = smc_clc_send_decline(smc, reason_code);
+		rc = smc_clc_send_decline(smc, reason_code, version);
 		if (rc < 0) {
 			if (smc->sk.sk_state == SMC_INIT)
 				sock_put(&smc->sk); /* passive closing */
@@ -465,190 +579,367 @@
 }
 
 /* abort connecting */
-static int smc_connect_abort(struct smc_sock *smc, int reason_code,
-			     int local_contact)
+static void smc_connect_abort(struct smc_sock *smc, int local_first)
 {
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(smc->conn.lgr);
-	mutex_unlock(&smc_create_lgr_pending);
-	smc_conn_free(&smc->conn);
-	return reason_code;
+	if (local_first)
+		smc_lgr_cleanup_early(&smc->conn);
+	else
+		smc_conn_free(&smc->conn);
 }
 
 /* check if there is a rdma device available for this connection. */
 /* called for connect and listen */
-static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
-			  u8 *ibport, unsigned short vlan_id, u8 gid[])
+static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
 {
-	int reason_code = 0;
-
 	/* PNET table look up: search active ib_device and port
 	 * within same PNETID that also contains the ethernet device
 	 * used for the internal TCP socket
 	 */
-	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
-				    gid);
-	if (!(*ibdev))
-		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-
-	return reason_code;
+	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
+	if (!ini->ib_dev)
+		return SMC_CLC_DECL_NOSMCRDEV;
+	return 0;
 }
 
 /* check if there is an ISM device available for this connection. */
 /* called for connect and listen */
-static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
+static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
 {
 	/* Find ISM device with same PNETID as connecting interface  */
-	smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
-	if (!(*ismdev))
-		return SMC_CLC_DECL_CNFERR; /* configuration error */
+	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
+	if (!ini->ism_dev[0])
+		return SMC_CLC_DECL_NOSMCDDEV;
+	else
+		ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
 	return 0;
+}
+
+/* is chid unique for the ism devices that are already determined? */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+					   int cnt)
+{
+	int i = (!ini->ism_dev[0]) ? 1 : 0;
+
+	for (; i < cnt; i++)
+		if (ini->ism_chid[i] == chid)
+			return false;
+	return true;
+}
+
+/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
+ * PNETID matching net_device)
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+				       struct smc_init_info *ini)
+{
+	int rc = SMC_CLC_DECL_NOSMCDDEV;
+	struct smcd_dev *smcd;
+	int i = 1;
+	u16 chid;
+
+	if (smcd_indicated(ini->smc_type_v1))
+		rc = 0;		/* already initialized for V1 */
+	mutex_lock(&smcd_dev_list.mutex);
+	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+		if (smcd->going_away || smcd == ini->ism_dev[0])
+			continue;
+		chid = smc_ism_get_chid(smcd);
+		if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
+			continue;
+		if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
+		    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
+			ini->ism_dev[i] = smcd;
+			ini->ism_chid[i] = chid;
+			ini->is_smcd = true;
+			rc = 0;
+			i++;
+			if (i > SMC_MAX_ISM_DEVS)
+				break;
+		}
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+	ini->ism_offered_cnt = i - 1;
+	if (!ini->ism_dev[0] && !ini->ism_dev[1])
+		ini->smcd_version = 0;
+
+	return rc;
 }
 
 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
-				      struct smcd_dev *ismdev,
-				      unsigned short vlan_id)
+				      struct smc_init_info *ini)
 {
-	if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
-		return SMC_CLC_DECL_CNFERR;
+	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
+		return SMC_CLC_DECL_ISMVLANERR;
 	return 0;
+}
+
+static int smc_find_proposal_devices(struct smc_sock *smc,
+				     struct smc_init_info *ini)
+{
+	int rc = 0;
+
+	/* check if there is an ism device available */
+	if (ini->smcd_version & SMC_V1) {
+		if (smc_find_ism_device(smc, ini) ||
+		    smc_connect_ism_vlan_setup(smc, ini)) {
+			if (ini->smc_type_v1 == SMC_TYPE_B)
+				ini->smc_type_v1 = SMC_TYPE_R;
+			else
+				ini->smc_type_v1 = SMC_TYPE_N;
+		} /* else ISM V1 is supported for this connection */
+		if (smc_find_rdma_device(smc, ini)) {
+			if (ini->smc_type_v1 == SMC_TYPE_B)
+				ini->smc_type_v1 = SMC_TYPE_D;
+			else
+				ini->smc_type_v1 = SMC_TYPE_N;
+		} /* else RDMA is supported for this connection */
+	}
+	if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini))
+		ini->smc_type_v2 = SMC_TYPE_N;
+
+	/* if neither ISM nor RDMA are supported, fallback */
+	if (!smcr_indicated(ini->smc_type_v1) &&
+	    ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+		rc = SMC_CLC_DECL_NOSMCDEV;
+
+	return rc;
 }
 
 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
  * used, the VLAN ID will be registered again during the connection setup.
  */
-static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
-					struct smcd_dev *ismdev,
-					unsigned short vlan_id)
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
+					struct smc_init_info *ini)
 {
-	if (!is_smcd)
+	if (!smcd_indicated(ini->smc_type_v1))
 		return 0;
-	if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
+	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
 		return SMC_CLC_DECL_CNFERR;
 	return 0;
 }
 
+#define SMC_CLC_MAX_ACCEPT_LEN \
+	(sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+	 sizeof(struct smc_clc_first_contact_ext) + \
+	 sizeof(struct smc_clc_msg_trail))
+
 /* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc, int smc_type,
-			   struct smc_clc_msg_accept_confirm *aclc,
-			   struct smc_ib_device *ibdev, u8 ibport,
-			   u8 gid[], struct smcd_dev *ismdev)
+static int smc_connect_clc(struct smc_sock *smc,
+			   struct smc_clc_msg_accept_confirm_v2 *aclc2,
+			   struct smc_init_info *ini)
 {
 	int rc = 0;
 
 	/* do inband token exchange */
-	rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
+	rc = smc_clc_send_proposal(smc, ini);
 	if (rc)
 		return rc;
 	/* receive SMC Accept CLC message */
-	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
+	return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
 }
 
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc,
 			    struct smc_clc_msg_accept_confirm *aclc,
-			    struct smc_ib_device *ibdev, u8 ibport)
+			    struct smc_init_info *ini)
 {
-	int local_contact = SMC_FIRST_CONTACT;
+	int i, reason_code = 0;
 	struct smc_link *link;
-	int reason_code = 0;
 
-	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
-					ibport, &aclc->lcl, NULL, 0);
-	if (local_contact < 0) {
-		if (local_contact == -ENOMEM)
-			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
-		else if (local_contact == -ENOLINK)
-			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
-		else
-			reason_code = SMC_CLC_DECL_INTERR; /* other error */
-		return smc_connect_abort(smc, reason_code, 0);
+	ini->is_smcd = false;
+	ini->ib_lcl = &aclc->r0.lcl;
+	ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+	mutex_lock(&smc_client_lgr_pending);
+	reason_code = smc_conn_create(smc, ini);
+	if (reason_code) {
+		mutex_unlock(&smc_client_lgr_pending);
+		return reason_code;
 	}
-	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
 	smc_conn_save_peer_info(smc, aclc);
 
-	/* create send buffer and rmb */
-	if (smc_buf_create(smc, false))
-		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+	if (ini->first_contact_local) {
+		link = smc->conn.lnk;
+	} else {
+		/* set link that was assigned by server */
+		link = NULL;
+		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+			struct smc_link *l = &smc->conn.lgr->lnk[i];
 
-	if (local_contact == SMC_FIRST_CONTACT)
+			if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+			    !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+				    SMC_GID_SIZE) &&
+			    !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+				    sizeof(l->peer_mac))) {
+				link = l;
+				break;
+			}
+		}
+		if (!link) {
+			reason_code = SMC_CLC_DECL_NOSRVLINK;
+			goto connect_abort;
+		}
+		smc->conn.lnk = link;
+	}
+
+	/* create send buffer and rmb */
+	if (smc_buf_create(smc, false)) {
+		reason_code = SMC_CLC_DECL_MEM;
+		goto connect_abort;
+	}
+
+	if (ini->first_contact_local)
 		smc_link_save_peer_info(link, aclc);
 
-	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
-		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
-					 local_contact);
+	if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+		reason_code = SMC_CLC_DECL_ERR_RTOK;
+		goto connect_abort;
+	}
 
 	smc_close_init(smc);
 	smc_rx_init(smc);
 
-	if (local_contact == SMC_FIRST_CONTACT) {
-		if (smc_ib_ready_link(link))
-			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
-						 local_contact);
+	if (ini->first_contact_local) {
+		if (smc_ib_ready_link(link)) {
+			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+			goto connect_abort;
+		}
 	} else {
-		if (!smc->conn.rmb_desc->reused &&
-		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
-			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
-						 local_contact);
+		if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+			reason_code = SMC_CLC_DECL_ERR_REGRMB;
+			goto connect_abort;
+		}
 	}
 	smc_rmb_sync_sg_for_device(&smc->conn);
 
-	reason_code = smc_clc_send_confirm(smc);
+	reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+					   SMC_V1);
 	if (reason_code)
-		return smc_connect_abort(smc, reason_code, local_contact);
+		goto connect_abort;
 
 	smc_tx_init(smc);
 
-	if (local_contact == SMC_FIRST_CONTACT) {
+	if (ini->first_contact_local) {
 		/* QP confirmation over RoCE fabric */
-		reason_code = smc_clnt_conf_first_link(smc);
+		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+		reason_code = smcr_clnt_conf_first_link(smc);
+		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
 		if (reason_code)
-			return smc_connect_abort(smc, reason_code,
-						 local_contact);
+			goto connect_abort;
 	}
-	mutex_unlock(&smc_create_lgr_pending);
+	mutex_unlock(&smc_client_lgr_pending);
 
 	smc_copy_sock_settings_to_clc(smc);
+	smc->connect_nonblock = 0;
 	if (smc->sk.sk_state == SMC_INIT)
 		smc->sk.sk_state = SMC_ACTIVE;
 
 	return 0;
+connect_abort:
+	smc_connect_abort(smc, ini->first_contact_local);
+	mutex_unlock(&smc_client_lgr_pending);
+	smc->connect_nonblock = 0;
+
+	return reason_code;
+}
+
+/* The server has chosen one of the proposed ISM devices for the communication.
+ * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+			       struct smc_init_info *ini)
+{
+	int i;
+
+	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+		if (ini->ism_chid[i] == ntohs(aclc->chid)) {
+			ini->ism_selected = i;
+			return 0;
+		}
+	}
+
+	return -EPROTO;
 }
 
 /* setup for ISM connection of client */
 static int smc_connect_ism(struct smc_sock *smc,
 			   struct smc_clc_msg_accept_confirm *aclc,
-			   struct smcd_dev *ismdev)
+			   struct smc_init_info *ini)
 {
-	int local_contact = SMC_FIRST_CONTACT;
 	int rc = 0;
 
-	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
-					NULL, ismdev, aclc->gid);
-	if (local_contact < 0)
-		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
+	ini->is_smcd = true;
+	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+	if (aclc->hdr.version == SMC_V2) {
+		struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+		rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+		if (rc)
+			return rc;
+	}
+	ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
+
+	/* there is only one lgr role for SMC-D; use server lock */
+	mutex_lock(&smc_server_lgr_pending);
+	rc = smc_conn_create(smc, ini);
+	if (rc) {
+		mutex_unlock(&smc_server_lgr_pending);
+		return rc;
+	}
 
 	/* Create send and receive buffers */
-	if (smc_buf_create(smc, true))
-		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+	rc = smc_buf_create(smc, true);
+	if (rc) {
+		rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+		goto connect_abort;
+	}
 
 	smc_conn_save_peer_info(smc, aclc);
 	smc_close_init(smc);
 	smc_rx_init(smc);
 	smc_tx_init(smc);
 
-	rc = smc_clc_send_confirm(smc);
+	rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+				  aclc->hdr.version);
 	if (rc)
-		return smc_connect_abort(smc, rc, local_contact);
-	mutex_unlock(&smc_create_lgr_pending);
+		goto connect_abort;
+	mutex_unlock(&smc_server_lgr_pending);
 
 	smc_copy_sock_settings_to_clc(smc);
+	smc->connect_nonblock = 0;
 	if (smc->sk.sk_state == SMC_INIT)
 		smc->sk.sk_state = SMC_ACTIVE;
+
+	return 0;
+connect_abort:
+	smc_connect_abort(smc, ini->first_contact_local);
+	mutex_unlock(&smc_server_lgr_pending);
+	smc->connect_nonblock = 0;
+
+	return rc;
+}
+
+/* check if received accept type and version matches a proposed one */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+				  struct smc_clc_msg_accept_confirm *aclc)
+{
+	if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+	     !smcr_indicated(ini->smc_type_v1)) ||
+	    (aclc->hdr.typev1 == SMC_TYPE_D &&
+	     ((!smcd_indicated(ini->smc_type_v1) &&
+	       !smcd_indicated(ini->smc_type_v2)) ||
+	      (aclc->hdr.version == SMC_V1 &&
+	       !smcd_indicated(ini->smc_type_v1)) ||
+	      (aclc->hdr.version == SMC_V2 &&
+	       !smcd_indicated(ini->smc_type_v2)))))
+		return SMC_CLC_DECL_MODEUNSUPP;
 
 	return 0;
 }
@@ -656,17 +947,12 @@
 /* perform steps before actually connecting */
 static int __smc_connect(struct smc_sock *smc)
 {
-	bool ism_supported = false, rdma_supported = false;
-	struct smc_clc_msg_accept_confirm aclc;
-	struct smc_ib_device *ibdev;
-	struct smcd_dev *ismdev;
-	u8 gid[SMC_GID_SIZE];
-	unsigned short vlan;
-	int smc_type;
+	u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
+	struct smc_clc_msg_accept_confirm_v2 *aclc2;
+	struct smc_clc_msg_accept_confirm *aclc;
+	struct smc_init_info *ini = NULL;
+	u8 *buf = NULL;
 	int rc = 0;
-	u8 ibport;
-
-	sock_hold(&smc->sk); /* sock put in passive closing */
 
 	if (smc->use_fallback)
 		return smc_connect_fallback(smc, smc->fallback_rsn);
@@ -675,74 +961,107 @@
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
 		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
 
-	/* IPSec connections opt out of SMC-R optimizations */
+	/* IPSec connections opt out of SMC optimizations */
 	if (using_ipsec(smc))
-		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+						    version);
 
-	/* check for VLAN ID */
-	if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
-		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini)
+		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+						    version);
 
-	/* check if there is an ism device available */
-	if (!smc_check_ism(smc, &ismdev) &&
-	    !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
-		/* ISM is supported for this connection */
-		ism_supported = true;
-		smc_type = SMC_TYPE_D;
+	ini->smcd_version = SMC_V1;
+	ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0;
+	ini->smc_type_v1 = SMC_TYPE_B;
+	ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N;
+
+	/* get vlan id from IP device */
+	if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+		ini->smcd_version &= ~SMC_V1;
+		ini->smc_type_v1 = SMC_TYPE_N;
+		if (!ini->smcd_version) {
+			rc = SMC_CLC_DECL_GETVLANERR;
+			goto fallback;
+		}
 	}
 
-	/* check if there is a rdma device available */
-	if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
-		/* RDMA is supported for this connection */
-		rdma_supported = true;
-		if (ism_supported)
-			smc_type = SMC_TYPE_B; /* both */
-		else
-			smc_type = SMC_TYPE_R; /* only RDMA */
-	}
+	rc = smc_find_proposal_devices(smc, ini);
+	if (rc)
+		goto fallback;
 
-	/* if neither ISM nor RDMA are supported, fallback */
-	if (!rdma_supported && !ism_supported)
-		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
+	buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+	if (!buf) {
+		rc = SMC_CLC_DECL_MEM;
+		goto fallback;
+	}
+	aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+	aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
 
 	/* perform CLC handshake */
-	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
-	if (rc) {
-		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
-		return smc_connect_decline_fallback(smc, rc);
-	}
+	rc = smc_connect_clc(smc, aclc2, ini);
+	if (rc)
+		goto vlan_cleanup;
+
+	/* check if smc modes and versions of CLC proposal and accept match */
+	rc = smc_connect_check_aclc(ini, aclc);
+	version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+	ini->smcd_version = version;
+	if (rc)
+		goto vlan_cleanup;
 
 	/* depending on previous steps, connect using rdma or ism */
-	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
-		rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
-	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
-		rc = smc_connect_ism(smc, &aclc, ismdev);
-	else
-		rc = SMC_CLC_DECL_MODEUNSUPP;
-	if (rc) {
-		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
-		return smc_connect_decline_fallback(smc, rc);
-	}
+	if (aclc->hdr.typev1 == SMC_TYPE_R)
+		rc = smc_connect_rdma(smc, aclc, ini);
+	else if (aclc->hdr.typev1 == SMC_TYPE_D)
+		rc = smc_connect_ism(smc, aclc, ini);
+	if (rc)
+		goto vlan_cleanup;
 
-	smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
+	smc_connect_ism_vlan_cleanup(smc, ini);
+	kfree(buf);
+	kfree(ini);
 	return 0;
+
+vlan_cleanup:
+	smc_connect_ism_vlan_cleanup(smc, ini);
+	kfree(buf);
+fallback:
+	kfree(ini);
+	return smc_connect_decline_fallback(smc, rc, version);
 }
 
 static void smc_connect_work(struct work_struct *work)
 {
 	struct smc_sock *smc = container_of(work, struct smc_sock,
 					    connect_work);
-	int rc;
+	long timeo = smc->sk.sk_sndtimeo;
+	int rc = 0;
 
-	lock_sock(&smc->sk);
-	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
-			    smc->connect_info->alen, smc->connect_info->flags);
+	if (!timeo)
+		timeo = MAX_SCHEDULE_TIMEOUT;
+	lock_sock(smc->clcsock->sk);
 	if (smc->clcsock->sk->sk_err) {
 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
-		goto out;
+	} else if ((1 << smc->clcsock->sk->sk_state) &
+					(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
+		if ((rc == -EPIPE) &&
+		    ((1 << smc->clcsock->sk->sk_state) &
+					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
+			rc = 0;
 	}
-	if (rc < 0) {
-		smc->sk.sk_err = -rc;
+	release_sock(smc->clcsock->sk);
+	lock_sock(&smc->sk);
+	if (rc != 0 || smc->sk.sk_err) {
+		smc->sk.sk_state = SMC_CLOSED;
+		if (rc == -EPIPE || rc == -EAGAIN)
+			smc->sk.sk_err = EPIPE;
+		else if (rc == -ECONNREFUSED)
+			smc->sk.sk_err = ECONNREFUSED;
+		else if (signal_pending(current))
+			smc->sk.sk_err = -sock_intr_errno(timeo);
+		sock_put(&smc->sk); /* passive closing */
 		goto out;
 	}
 
@@ -751,12 +1070,14 @@
 		smc->sk.sk_err = -rc;
 
 out:
-	if (smc->sk.sk_err)
-		smc->sk.sk_state_change(&smc->sk);
-	else
-		smc->sk.sk_write_space(&smc->sk);
-	kfree(smc->connect_info);
-	smc->connect_info = NULL;
+	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
+		if (smc->sk.sk_err) {
+			smc->sk.sk_state_change(&smc->sk);
+		} else { /* allow polling before and after fallback decision */
+			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
+			smc->sk.sk_write_space(&smc->sk);
+		}
+	}
 	release_sock(&smc->sk);
 }
 
@@ -789,26 +1110,22 @@
 
 	smc_copy_sock_settings_to_clc(smc);
 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	if (smc->connect_nonblock) {
+		rc = -EALREADY;
+		goto out;
+	}
+	rc = kernel_connect(smc->clcsock, addr, alen, flags);
+	if (rc && rc != -EINPROGRESS)
+		goto out;
+
+	if (smc->use_fallback)
+		goto out;
+	sock_hold(&smc->sk); /* sock put in passive closing */
 	if (flags & O_NONBLOCK) {
-		if (smc->connect_info) {
-			rc = -EALREADY;
-			goto out;
-		}
-		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
-		if (!smc->connect_info) {
-			rc = -ENOMEM;
-			goto out;
-		}
-		smc->connect_info->alen = alen;
-		smc->connect_info->flags = flags ^ O_NONBLOCK;
-		memcpy(&smc->connect_info->addr, addr, alen);
-		schedule_work(&smc->connect_work);
+		if (queue_work(smc_hs_wq, &smc->connect_work))
+			smc->connect_nonblock = 1;
 		rc = -EINPROGRESS;
 	} else {
-		rc = kernel_connect(smc->clcsock, addr, alen, flags);
-		if (rc)
-			goto out;
-
 		rc = __smc_connect(smc);
 		if (rc < 0)
 			goto out;
@@ -842,10 +1159,10 @@
 
 	mutex_lock(&lsmc->clcsock_release_lock);
 	if (lsmc->clcsock)
-		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+		rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
 	mutex_unlock(&lsmc->clcsock_release_lock);
 	lock_sock(lsk);
-	if  (rc < 0)
+	if  (rc < 0 && rc != -EAGAIN)
 		lsk->sk_err = -rc;
 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
 		new_sk->sk_prot->unhash(new_sk);
@@ -858,6 +1175,10 @@
 		goto out;
 	}
 
+	/* new clcsock has inherited the smc listen-specific sk_data_ready
+	 * function; switch it back to the original sk_data_ready function
+	 */
+	new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
 	(*new_smc)->clcsock = new_clcsock;
 out:
 	return rc;
@@ -911,8 +1232,13 @@
 			sock_put(new_sk); /* final */
 			continue;
 		}
-		if (new_sock)
+		if (new_sock) {
 			sock_graft(new_sk, new_sock);
+			if (isk->use_fallback) {
+				smc_sk(new_sk)->clcsock->file = new_sock->file;
+				isk->clcsock->file->private_data = isk->clcsock;
+			}
+		}
 		return new_sk;
 	}
 	return NULL;
@@ -923,45 +1249,24 @@
 {
 	struct smc_sock *smc = smc_sk(sk);
 
+	sock_hold(sk); /* sock_put below */
 	lock_sock(sk);
 	if (!sk->sk_lingertime)
 		/* wait for peer closing */
 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
-	if (!smc->use_fallback) {
-		smc_close_active(smc);
-		sock_set_flag(sk, SOCK_DEAD);
-		sk->sk_shutdown |= SHUTDOWN_MASK;
-	}
-	sk->sk_prot->unhash(sk);
-	if (smc->clcsock) {
-		struct socket *tcp;
-
-		tcp = smc->clcsock;
-		smc->clcsock = NULL;
-		sock_release(tcp);
-	}
-	if (smc->use_fallback) {
-		sock_put(sk); /* passive closing */
-		sk->sk_state = SMC_CLOSED;
-	} else {
-		if (sk->sk_state == SMC_CLOSED)
-			smc_conn_free(&smc->conn);
-	}
+	__smc_release(smc);
 	release_sock(sk);
+	sock_put(sk); /* sock_hold above */
 	sock_put(sk); /* final sock_put */
 }
 
-static int smc_serv_conf_first_link(struct smc_sock *smc)
+static int smcr_serv_conf_first_link(struct smc_sock *smc)
 {
-	struct net *net = sock_net(smc->clcsock->sk);
-	struct smc_link_group *lgr = smc->conn.lgr;
-	struct smc_link *link;
-	int rest;
+	struct smc_link *link = smc->conn.lnk;
+	struct smc_llc_qentry *qentry;
 	int rc;
 
-	link = &lgr->lnk[SMC_SINGLE_LINK];
-
-	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
+	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
 		return SMC_CLC_DECL_ERR_REGRMB;
 
 	/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -970,40 +1275,29 @@
 		return SMC_CLC_DECL_TIMEOUT_CL;
 
 	/* receive CONFIRM LINK response from client over the RoCE fabric */
-	rest = wait_for_completion_interruptible_timeout(
-		&link->llc_confirm_resp,
-		SMC_LLC_WAIT_FIRST_TIME);
-	if (rest <= 0) {
+	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
+			      SMC_LLC_CONFIRM_LINK);
+	if (!qentry) {
 		struct smc_clc_msg_decline dclc;
 
 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
-				      SMC_CLC_DECLINE);
-		return rc;
+				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
 	}
-
-	if (link->llc_confirm_resp_rc)
+	smc_llc_save_peer_uid(qentry);
+	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
+	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+	if (rc)
 		return SMC_CLC_DECL_RMBE_EC;
 
-	/* send ADD LINK request to client over the RoCE fabric */
-	rc = smc_llc_send_add_link(link,
-				   link->smcibdev->mac[link->ibport - 1],
-				   link->gid, SMC_LLC_REQ);
-	if (rc < 0)
-		return SMC_CLC_DECL_TIMEOUT_AL;
+	/* confirm_rkey is implicit on 1st contact */
+	smc->conn.rmb_desc->is_conf_rkey = true;
 
-	/* receive ADD LINK response from client over the RoCE fabric */
-	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
-							 SMC_LLC_WAIT_TIME);
-	if (rest <= 0) {
-		struct smc_clc_msg_decline dclc;
+	smc_llc_link_active(link);
+	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
 
-		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
-				      SMC_CLC_DECLINE);
-		return rc;
-	}
-
-	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
-
+	/* initial contact - try to establish second link */
+	smc_llc_srv_add_link(link);
 	return 0;
 }
 
@@ -1013,13 +1307,13 @@
 	struct smc_sock *lsmc = new_smc->listen_smc;
 	struct sock *newsmcsk = &new_smc->sk;
 
-	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
+		release_sock(&lsmc->sk);
 	} else { /* no longer listening */
 		smc_close_non_accepted(newsmcsk);
 	}
-	release_sock(&lsmc->sk);
 
 	/* Wake up accept */
 	lsmc->sk.sk_data_ready(&lsmc->sk);
@@ -1031,7 +1325,6 @@
 {
 	struct sock *newsmcsk = &new_smc->sk;
 
-	sk_refcnt_debug_inc(newsmcsk);
 	if (newsmcsk->sk_state == SMC_INIT)
 		newsmcsk->sk_state = SMC_ACTIVE;
 
@@ -1046,27 +1339,27 @@
 	if (newsmcsk->sk_state == SMC_INIT)
 		sock_put(&new_smc->sk); /* passive closing */
 	newsmcsk->sk_state = SMC_CLOSED;
-	smc_conn_free(&new_smc->conn);
 
 	smc_listen_out(new_smc);
 }
 
 /* listen worker: decline and fall back if possible */
 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
-			       int local_contact)
+			       int local_first, u8 version)
 {
 	/* RDMA setup failed, switch back to TCP */
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(new_smc->conn.lgr);
+	if (local_first)
+		smc_lgr_cleanup_early(&new_smc->conn);
+	else
+		smc_conn_free(&new_smc->conn);
 	if (reason_code < 0) { /* error, no fallback possible */
 		smc_listen_out_err(new_smc);
 		return;
 	}
-	smc_conn_free(&new_smc->conn);
-	new_smc->use_fallback = true;
+	smc_switch_to_fallback(new_smc);
 	new_smc->fallback_rsn = reason_code;
 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
-		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+		if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
 			smc_listen_out_err(new_smc);
 			return;
 		}
@@ -1074,34 +1367,73 @@
 	smc_listen_out_connected(new_smc);
 }
 
+/* listen worker: version checking */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+			       struct smc_clc_msg_proposal *pclc,
+			       struct smc_init_info *ini)
+{
+	struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
+	struct smc_clc_v2_extension *pclc_v2_ext;
+
+	ini->smc_type_v1 = pclc->hdr.typev1;
+	ini->smc_type_v2 = pclc->hdr.typev2;
+	ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
+	if (pclc->hdr.version > SMC_V1)
+		ini->smcd_version |=
+				ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
+	if (!smc_ism_v2_capable) {
+		ini->smcd_version &= ~SMC_V2;
+		goto out;
+	}
+	pclc_v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!pclc_v2_ext) {
+		ini->smcd_version &= ~SMC_V2;
+		goto out;
+	}
+	pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
+	if (!pclc_smcd_v2_ext)
+		ini->smcd_version &= ~SMC_V2;
+
+out:
+	if (!ini->smcd_version) {
+		if (pclc->hdr.typev1 == SMC_TYPE_B ||
+		    pclc->hdr.typev2 == SMC_TYPE_B)
+			return SMC_CLC_DECL_NOSMCDEV;
+		if (pclc->hdr.typev1 == SMC_TYPE_D ||
+		    pclc->hdr.typev2 == SMC_TYPE_D)
+			return SMC_CLC_DECL_NOSMCDDEV;
+		return SMC_CLC_DECL_NOSMCRDEV;
+	}
+
+	return 0;
+}
+
 /* listen worker: check prefixes */
-static int smc_listen_rdma_check(struct smc_sock *new_smc,
+static int smc_listen_prfx_check(struct smc_sock *new_smc,
 				 struct smc_clc_msg_proposal *pclc)
 {
 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
 	struct socket *newclcsock = new_smc->clcsock;
 
+	if (pclc->hdr.typev1 == SMC_TYPE_N)
+		return 0;
 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
-		return SMC_CLC_DECL_CNFERR;
+		return SMC_CLC_DECL_DIFFPREFIX;
 
 	return 0;
 }
 
 /* listen worker: initialize connection and buffers */
 static int smc_listen_rdma_init(struct smc_sock *new_smc,
-				struct smc_clc_msg_proposal *pclc,
-				struct smc_ib_device *ibdev, u8 ibport,
-				int *local_contact)
+				struct smc_init_info *ini)
 {
+	int rc;
+
 	/* allocate connection / link group */
-	*local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
-					 &pclc->lcl, NULL, 0);
-	if (*local_contact < 0) {
-		if (*local_contact == -ENOMEM)
-			return SMC_CLC_DECL_MEM;/* insufficient memory*/
-		return SMC_CLC_DECL_INTERR; /* other error */
-	}
+	rc = smc_conn_create(new_smc, ini);
+	if (rc)
+		return rc;
 
 	/* create send buffer and rmb */
 	if (smc_buf_create(new_smc, false))
@@ -1112,109 +1444,266 @@
 
 /* listen worker: initialize connection and buffers for SMC-D */
 static int smc_listen_ism_init(struct smc_sock *new_smc,
-			       struct smc_clc_msg_proposal *pclc,
-			       struct smcd_dev *ismdev,
-			       int *local_contact)
+			       struct smc_init_info *ini)
 {
-	struct smc_clc_msg_smcd *pclc_smcd;
+	int rc;
 
-	pclc_smcd = smc_get_clc_msg_smcd(pclc);
-	*local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
-					 ismdev, pclc_smcd->gid);
-	if (*local_contact < 0) {
-		if (*local_contact == -ENOMEM)
-			return SMC_CLC_DECL_MEM;/* insufficient memory*/
-		return SMC_CLC_DECL_INTERR; /* other error */
-	}
-
-	/* Check if peer can be reached via ISM device */
-	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
-			    new_smc->conn.lgr->vlan_id,
-			    new_smc->conn.lgr->smcd)) {
-		if (*local_contact == SMC_FIRST_CONTACT)
-			smc_lgr_forget(new_smc->conn.lgr);
-		smc_conn_free(&new_smc->conn);
-		return SMC_CLC_DECL_CNFERR;
-	}
+	rc = smc_conn_create(new_smc, ini);
+	if (rc)
+		return rc;
 
 	/* Create send and receive buffers */
-	if (smc_buf_create(new_smc, true)) {
-		if (*local_contact == SMC_FIRST_CONTACT)
-			smc_lgr_forget(new_smc->conn.lgr);
-		smc_conn_free(&new_smc->conn);
-		return SMC_CLC_DECL_MEM;
+	rc = smc_buf_create(new_smc, true);
+	if (rc) {
+		if (ini->first_contact_local)
+			smc_lgr_cleanup_early(&new_smc->conn);
+		else
+			smc_conn_free(&new_smc->conn);
+		return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
+					 SMC_CLC_DECL_MEM;
 	}
 
 	return 0;
 }
 
-/* listen worker: register buffers */
-static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+				    struct smc_init_info *ini,
+				    int matches)
 {
-	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+	int i;
 
-	if (local_contact != SMC_FIRST_CONTACT) {
-		if (!new_smc->conn.rmb_desc->reused) {
-			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
-				return SMC_CLC_DECL_ERR_REGRMB;
+	for (i = 0; i < matches; i++)
+		if (smcd == ini->ism_dev[i])
+			return true;
+
+	return false;
+}
+
+/* check for ISM devices matching proposed ISM devices */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+				   u16 proposed_chid, u64 proposed_gid,
+				   unsigned int *matches)
+{
+	struct smcd_dev *smcd;
+
+	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+		if (smcd->going_away)
+			continue;
+		if (smc_is_already_selected(smcd, ini, *matches))
+			continue;
+		if (smc_ism_get_chid(smcd) == proposed_chid &&
+		    !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+			ini->ism_peer_gid[*matches] = proposed_gid;
+			ini->ism_dev[*matches] = smcd;
+			(*matches)++;
+			break;
 		}
+	}
+}
+
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+					struct smc_clc_msg_proposal *pclc,
+					struct smc_init_info *ini)
+{
+	struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+	struct smc_clc_v2_extension *smc_v2_ext;
+	struct smc_clc_msg_smcd *pclc_smcd;
+	unsigned int matches = 0;
+	u8 smcd_version;
+	u8 *eid = NULL;
+	int i;
+
+	if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+		goto not_found;
+
+	pclc_smcd = smc_get_clc_msg_smcd(pclc);
+	smc_v2_ext = smc_get_clc_v2_ext(pclc);
+	smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+	if (!smcd_v2_ext ||
+	    !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */
+		goto not_found;
+
+	mutex_lock(&smcd_dev_list.mutex);
+	if (pclc_smcd->ism.chid)
+		/* check for ISM device matching proposed native ISM device */
+		smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+				       ntohll(pclc_smcd->ism.gid), &matches);
+	for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
+		/* check for ISM devices matching proposed non-native ISM
+		 * devices
+		 */
+		smc_check_ism_v2_match(ini,
+				       ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+				       ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+				       &matches);
+	}
+	mutex_unlock(&smcd_dev_list.mutex);
+
+	if (ini->ism_dev[0]) {
+		smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+		if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN))
+			goto not_found;
+	} else {
+		goto not_found;
+	}
+
+	/* separate - outside the smcd_dev_list.lock */
+	smcd_version = ini->smcd_version;
+	for (i = 0; i < matches; i++) {
+		ini->smcd_version = SMC_V2;
+		ini->is_smcd = true;
+		ini->ism_selected = i;
+		if (smc_listen_ism_init(new_smc, ini))
+			/* try next active ISM device */
+			continue;
+		return; /* matching and usable V2 ISM device found */
+	}
+	/* no V2 ISM device could be initialized */
+	ini->smcd_version = smcd_version;	/* restore original value */
+
+not_found:
+	ini->smcd_version &= ~SMC_V2;
+	ini->ism_dev[0] = NULL;
+	ini->is_smcd = false;
+}
+
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+					struct smc_clc_msg_proposal *pclc,
+					struct smc_init_info *ini)
+{
+	struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+
+	/* check if ISM V1 is available */
+	if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+		goto not_found;
+	ini->is_smcd = true; /* prepare ISM check */
+	ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
+	if (smc_find_ism_device(new_smc, ini))
+		goto not_found;
+	ini->ism_selected = 0;
+	if (!smc_listen_ism_init(new_smc, ini))
+		return;		/* V1 ISM device found */
+
+not_found:
+	ini->ism_dev[0] = NULL;
+	ini->is_smcd = false;
+}
+
+/* listen worker: register buffers */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
+{
+	struct smc_connection *conn = &new_smc->conn;
+
+	if (!local_first) {
+		if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
+			return SMC_CLC_DECL_ERR_REGRMB;
 	}
 	smc_rmb_sync_sg_for_device(&new_smc->conn);
 
 	return 0;
 }
 
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+					struct smc_clc_msg_proposal *pclc,
+					struct smc_init_info *ini)
+{
+	int rc;
+
+	if (!smcr_indicated(ini->smc_type_v1))
+		return SMC_CLC_DECL_NOSMCDEV;
+
+	/* prepare RDMA check */
+	ini->ib_lcl = &pclc->lcl;
+	rc = smc_find_rdma_device(new_smc, ini);
+	if (rc) {
+		/* no RDMA device found */
+		if (ini->smc_type_v1 == SMC_TYPE_B)
+			/* neither ISM nor RDMA device found */
+			rc = SMC_CLC_DECL_NOSMCDEV;
+		return rc;
+	}
+	rc = smc_listen_rdma_init(new_smc, ini);
+	if (rc)
+		return rc;
+	return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching to proposal */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+				  struct smc_clc_msg_proposal *pclc,
+				  struct smc_init_info *ini)
+{
+	int rc;
+
+	/* check for ISM device matching V2 proposed device */
+	smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+	if (ini->ism_dev[0])
+		return 0;
+
+	if (!(ini->smcd_version & SMC_V1))
+		return SMC_CLC_DECL_NOSMCDEV;
+
+	/* check for matching IP prefix and subnet length */
+	rc = smc_listen_prfx_check(new_smc, pclc);
+	if (rc)
+		return rc;
+
+	/* get vlan id from IP device */
+	if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+		return SMC_CLC_DECL_GETVLANERR;
+
+	/* check for ISM device matching V1 proposed device */
+	smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+	if (ini->ism_dev[0])
+		return 0;
+
+	if (pclc->hdr.typev1 == SMC_TYPE_D)
+		return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */
+
+	/* check if RDMA is available */
+	return smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+}
+
 /* listen worker: finish RDMA setup */
 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
 				  struct smc_clc_msg_accept_confirm *cclc,
-				  int local_contact)
+				  bool local_first)
 {
-	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+	struct smc_link *link = new_smc->conn.lnk;
 	int reason_code = 0;
 
-	if (local_contact == SMC_FIRST_CONTACT)
+	if (local_first)
 		smc_link_save_peer_info(link, cclc);
 
-	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
-		reason_code = SMC_CLC_DECL_ERR_RTOK;
-		goto decline;
-	}
+	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+		return SMC_CLC_DECL_ERR_RTOK;
 
-	if (local_contact == SMC_FIRST_CONTACT) {
-		if (smc_ib_ready_link(link)) {
-			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
-			goto decline;
-		}
+	if (local_first) {
+		if (smc_ib_ready_link(link))
+			return SMC_CLC_DECL_ERR_RDYLNK;
 		/* QP confirmation over RoCE fabric */
-		reason_code = smc_serv_conf_first_link(new_smc);
-		if (reason_code)
-			goto decline;
+		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+		reason_code = smcr_serv_conf_first_link(new_smc);
+		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
 	}
-	return 0;
-
-decline:
-	mutex_unlock(&smc_create_lgr_pending);
-	smc_listen_decline(new_smc, reason_code, local_contact);
 	return reason_code;
 }
 
-/* setup for RDMA connection of server */
+/* setup for connection of server */
 static void smc_listen_work(struct work_struct *work)
 {
 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
 						smc_listen_work);
+	u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
 	struct socket *newclcsock = new_smc->clcsock;
-	struct smc_clc_msg_accept_confirm cclc;
+	struct smc_clc_msg_accept_confirm *cclc;
+	struct smc_clc_msg_proposal_area *buf;
 	struct smc_clc_msg_proposal *pclc;
-	struct smc_ib_device *ibdev;
-	bool ism_supported = false;
-	struct smcd_dev *ismdev;
-	u8 buf[SMC_CLC_MAX_LEN];
-	int local_contact = 0;
-	unsigned short vlan;
-	int reason_code = 0;
+	struct smc_init_info *ini = NULL;
 	int rc = 0;
-	u8 ibport;
+
+	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
+		return smc_listen_out_err(new_smc);
 
 	if (new_smc->use_fallback) {
 		smc_listen_out_connected(new_smc);
@@ -1223,7 +1712,7 @@
 
 	/* check if peer is smc capable */
 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
-		new_smc->use_fallback = true;
+		smc_switch_to_fallback(new_smc);
 		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
 		smc_listen_out_connected(new_smc);
 		return;
@@ -1232,73 +1721,86 @@
 	/* do inband token exchange -
 	 * wait for and receive SMC Proposal CLC message
 	 */
-	pclc = (struct smc_clc_msg_proposal *)&buf;
-	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
-				       SMC_CLC_PROPOSAL);
-	if (reason_code) {
-		smc_listen_decline(new_smc, reason_code, 0);
-		return;
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf) {
+		rc = SMC_CLC_DECL_MEM;
+		goto out_decl;
 	}
+	pclc = (struct smc_clc_msg_proposal *)buf;
+	rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
+			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
+	if (rc)
+		goto out_decl;
+	version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
 
-	/* IPSec connections opt out of SMC-R optimizations */
+	/* IPSec connections opt out of SMC optimizations */
 	if (using_ipsec(new_smc)) {
-		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
-		return;
+		rc = SMC_CLC_DECL_IPSEC;
+		goto out_decl;
 	}
 
-	mutex_lock(&smc_create_lgr_pending);
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini) {
+		rc = SMC_CLC_DECL_MEM;
+		goto out_decl;
+	}
+
+	/* initial version checking */
+	rc = smc_listen_v2_check(new_smc, pclc, ini);
+	if (rc)
+		goto out_decl;
+
+	mutex_lock(&smc_server_lgr_pending);
 	smc_close_init(new_smc);
 	smc_rx_init(new_smc);
 	smc_tx_init(new_smc);
 
-	/* check if ISM is available */
-	if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
-	    !smc_check_ism(new_smc, &ismdev) &&
-	    !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
-		ism_supported = true;
-	}
-
-	/* check if RDMA is available */
-	if (!ism_supported &&
-	    ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
-	     smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
-	     smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
-	     smc_listen_rdma_check(new_smc, pclc) ||
-	     smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
-				  &local_contact) ||
-	     smc_listen_rdma_reg(new_smc, local_contact))) {
-		/* SMC not supported, decline */
-		mutex_unlock(&smc_create_lgr_pending);
-		smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
-				   local_contact);
-		return;
-	}
+	/* determine ISM or RoCE device used for connection */
+	rc = smc_listen_find_device(new_smc, pclc, ini);
+	if (rc)
+		goto out_unlock;
 
 	/* send SMC Accept CLC message */
-	rc = smc_clc_send_accept(new_smc, local_contact);
-	if (rc) {
-		mutex_unlock(&smc_create_lgr_pending);
-		smc_listen_decline(new_smc, rc, local_contact);
-		return;
-	}
+	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+				 ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1);
+	if (rc)
+		goto out_unlock;
+
+	/* SMC-D does not need this lock any more */
+	if (ini->is_smcd)
+		mutex_unlock(&smc_server_lgr_pending);
 
 	/* receive SMC Confirm CLC message */
-	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
-				       SMC_CLC_CONFIRM);
-	if (reason_code) {
-		mutex_unlock(&smc_create_lgr_pending);
-		smc_listen_decline(new_smc, reason_code, local_contact);
-		return;
+	memset(buf, 0, sizeof(*buf));
+	cclc = (struct smc_clc_msg_accept_confirm *)buf;
+	rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
+			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
+	if (rc) {
+		if (!ini->is_smcd)
+			goto out_unlock;
+		goto out_decl;
 	}
 
 	/* finish worker */
-	if (!ism_supported) {
-		if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
-			return;
+	if (!ini->is_smcd) {
+		rc = smc_listen_rdma_finish(new_smc, cclc,
+					    ini->first_contact_local);
+		if (rc)
+			goto out_unlock;
+		mutex_unlock(&smc_server_lgr_pending);
 	}
-	smc_conn_save_peer_info(new_smc, &cclc);
-	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_save_peer_info(new_smc, cclc);
 	smc_listen_out_connected(new_smc);
+	goto out_free;
+
+out_unlock:
+	mutex_unlock(&smc_server_lgr_pending);
+out_decl:
+	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+			   version);
+out_free:
+	kfree(ini);
+	kfree(buf);
 }
 
 static void smc_tcp_listen_work(struct work_struct *work)
@@ -1312,7 +1814,7 @@
 	lock_sock(lsk);
 	while (lsk->sk_state == SMC_LISTEN) {
 		rc = smc_clcsock_accept(lsmc, &new_smc);
-		if (rc)
+		if (rc) /* clcsock accept queue empty or error */
 			goto out;
 		if (!new_smc)
 			continue;
@@ -1326,13 +1828,29 @@
 		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
 		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
-		if (!schedule_work(&new_smc->smc_listen_work))
+		if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
 			sock_put(&new_smc->sk);
 	}
 
 out:
 	release_sock(lsk);
-	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
+	sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+	struct smc_sock *lsmc;
+
+	lsmc = (struct smc_sock *)
+	       ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
+	if (!lsmc)
+		return;
+	lsmc->clcsk_data_ready(listen_clcsock);
+	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+		if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+			sock_put(&lsmc->sk);
+	}
 }
 
 static int smc_listen(struct socket *sock, int backlog)
@@ -1345,7 +1863,8 @@
 	lock_sock(sk);
 
 	rc = -EINVAL;
-	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
+	    smc->connect_nonblock)
 		goto out;
 
 	rc = 0;
@@ -1360,16 +1879,21 @@
 	if (!smc->use_fallback)
 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
+	/* save original sk_data_ready function and establish
+	 * smc-specific sk_data_ready function
+	 */
+	smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
+	smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
+	smc->clcsock->sk->sk_user_data =
+		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
 	rc = kernel_listen(smc->clcsock, backlog);
-	if (rc)
+	if (rc) {
+		smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
 		goto out;
+	}
 	sk->sk_max_ack_backlog = backlog;
 	sk->sk_ack_backlog = 0;
 	sk->sk_state = SMC_LISTEN;
-	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
-	sock_hold(sk); /* sock_hold in tcp_listen_worker */
-	if (!schedule_work(&smc->tcp_listen_work))
-		sock_put(sk);
 
 out:
 	release_sock(sk);
@@ -1464,23 +1988,26 @@
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
-	int rc = -EPIPE;
+	int rc;
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if ((sk->sk_state != SMC_ACTIVE) &&
-	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
-	    (sk->sk_state != SMC_INIT))
-		goto out;
 
+	/* SMC does not support connect with fastopen */
 	if (msg->msg_flags & MSG_FASTOPEN) {
-		if (sk->sk_state == SMC_INIT) {
-			smc->use_fallback = true;
+		/* not connected yet, fallback */
+		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+			smc_switch_to_fallback(smc);
 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
 		} else {
 			rc = -EINVAL;
 			goto out;
 		}
+	} else if ((sk->sk_state != SMC_ACTIVE) &&
+		   (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+		   (sk->sk_state != SMC_INIT)) {
+		rc = -EPIPE;
+		goto out;
 	}
 
 	if (smc->use_fallback)
@@ -1501,6 +2028,11 @@
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
+	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+		/* socket was connected before, no more data to read */
+		rc = 0;
+		goto out;
+	}
 	if ((sk->sk_state == SMC_INIT) ||
 	    (sk->sk_state == SMC_LISTEN) ||
 	    (sk->sk_state == SMC_CLOSED))
@@ -1540,8 +2072,8 @@
 			     poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	__poll_t mask = 0;
 	struct smc_sock *smc;
+	__poll_t mask = 0;
 
 	if (!sk)
 		return EPOLLNVAL;
@@ -1551,8 +2083,6 @@
 		/* delegate to CLC child sock */
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		sk->sk_err = smc->clcsock->sk->sk_err;
-		if (sk->sk_err)
-			mask |= EPOLLERR;
 	} else {
 		if (sk->sk_state != SMC_CLOSED)
 			sock_poll_wait(file, sock, wait);
@@ -1563,9 +2093,14 @@
 			mask |= EPOLLHUP;
 		if (sk->sk_state == SMC_LISTEN) {
 			/* woken up by sk_data_ready in smc_listen_work() */
-			mask = smc_accept_poll(sk);
+			mask |= smc_accept_poll(sk);
+		} else if (smc->use_fallback) { /* as result of connect_work()*/
+			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
+							   wait);
+			sk->sk_err = smc->clcsock->sk->sk_err;
 		} else {
-			if (atomic_read(&smc->conn.sndbuf_space) ||
+			if ((sk->sk_state != SMC_INIT &&
+			     atomic_read(&smc->conn.sndbuf_space)) ||
 			    sk->sk_shutdown & SEND_SHUTDOWN) {
 				mask |= EPOLLOUT | EPOLLWRNORM;
 			} else {
@@ -1613,8 +2148,10 @@
 	if (smc->use_fallback) {
 		rc = kernel_sock_shutdown(smc->clcsock, how);
 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
-		if (sk->sk_shutdown == SHUTDOWN_MASK)
+		if (sk->sk_shutdown == SHUTDOWN_MASK) {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk);
+		}
 		goto out;
 	}
 	switch (how) {
@@ -1644,7 +2181,7 @@
 }
 
 static int smc_setsockopt(struct socket *sock, int level, int optname,
-			  char __user *optval, unsigned int optlen)
+			  sockptr_t optval, unsigned int optlen)
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
@@ -1658,51 +2195,53 @@
 	/* generic setsockopts reaching us here always apply to the
 	 * CLC socket
 	 */
-	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
-					   optval, optlen);
+	if (unlikely(!smc->clcsock->ops->setsockopt))
+		rc = -EOPNOTSUPP;
+	else
+		rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+						   optval, optlen);
 	if (smc->clcsock->sk->sk_err) {
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		sk->sk_error_report(sk);
 	}
-	if (rc)
-		return rc;
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
-	if (get_user(val, (int __user *)optval))
+	if (copy_from_sockptr(&val, optval, sizeof(int)))
 		return -EFAULT;
 
 	lock_sock(sk);
+	if (rc || smc->use_fallback)
+		goto out;
 	switch (optname) {
 	case TCP_FASTOPEN:
 	case TCP_FASTOPEN_CONNECT:
 	case TCP_FASTOPEN_KEY:
 	case TCP_FASTOPEN_NO_COOKIE:
 		/* option not supported by SMC */
-		if (sk->sk_state == SMC_INIT) {
-			smc->use_fallback = true;
+		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+			smc_switch_to_fallback(smc);
 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
 		} else {
-			if (!smc->use_fallback)
-				rc = -EINVAL;
+			rc = -EINVAL;
 		}
 		break;
 	case TCP_NODELAY:
 		if (sk->sk_state != SMC_INIT &&
 		    sk->sk_state != SMC_LISTEN &&
 		    sk->sk_state != SMC_CLOSED) {
-			if (val && !smc->use_fallback)
-				mod_delayed_work(system_wq, &smc->conn.tx_work,
-						 0);
+			if (val)
+				mod_delayed_work(smc->conn.lgr->tx_wq,
+						 &smc->conn.tx_work, 0);
 		}
 		break;
 	case TCP_CORK:
 		if (sk->sk_state != SMC_INIT &&
 		    sk->sk_state != SMC_LISTEN &&
 		    sk->sk_state != SMC_CLOSED) {
-			if (!val && !smc->use_fallback)
-				mod_delayed_work(system_wq, &smc->conn.tx_work,
-						 0);
+			if (!val)
+				mod_delayed_work(smc->conn.lgr->tx_wq,
+						 &smc->conn.tx_work, 0);
 		}
 		break;
 	case TCP_DEFER_ACCEPT:
@@ -1711,6 +2250,7 @@
 	default:
 		break;
 	}
+out:
 	release_sock(sk);
 
 	return rc;
@@ -1723,6 +2263,8 @@
 
 	smc = smc_sk(sock->sk);
 	/* socket options apply to the CLC socket */
+	if (unlikely(!smc->clcsock->ops->getsockopt))
+		return -EOPNOTSUPP;
 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
 					     optval, optlen);
 }
@@ -1848,7 +2390,11 @@
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-
+	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+		/* socket was connected before, no more data to read */
+		rc = 0;
+		goto out;
+	}
 	if (sk->sk_state == SMC_INIT ||
 	    sk->sk_state == SMC_LISTEN ||
 	    sk->sk_state == SMC_CLOSED)
@@ -1947,30 +2493,71 @@
 	.create	= smc_create,
 };
 
+unsigned int smc_net_id;
+
+static __net_init int smc_net_init(struct net *net)
+{
+	return smc_pnet_net_init(net);
+}
+
+static void __net_exit smc_net_exit(struct net *net)
+{
+	smc_pnet_net_exit(net);
+}
+
+static struct pernet_operations smc_net_ops = {
+	.init = smc_net_init,
+	.exit = smc_net_exit,
+	.id   = &smc_net_id,
+	.size = sizeof(struct smc_net),
+};
+
 static int __init smc_init(void)
 {
 	int rc;
 
-	rc = smc_pnet_init();
+	rc = register_pernet_subsys(&smc_net_ops);
 	if (rc)
 		return rc;
+
+	smc_ism_init();
+	smc_clc_init();
+
+	rc = smc_pnet_init();
+	if (rc)
+		goto out_pernet_subsys;
+
+	rc = -ENOMEM;
+	smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+	if (!smc_hs_wq)
+		goto out_pnet;
+
+	smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+	if (!smc_close_wq)
+		goto out_alloc_hs_wq;
+
+	rc = smc_core_init();
+	if (rc) {
+		pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+		goto out_alloc_wqs;
+	}
 
 	rc = smc_llc_init();
 	if (rc) {
 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
-		goto out_pnet;
+		goto out_core;
 	}
 
 	rc = smc_cdc_init();
 	if (rc) {
 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
-		goto out_pnet;
+		goto out_core;
 	}
 
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
-		goto out_pnet;
+		goto out_core;
 	}
 
 	rc = proto_register(&smc_proto6, 1);
@@ -2002,20 +2589,33 @@
 	proto_unregister(&smc_proto6);
 out_proto:
 	proto_unregister(&smc_proto);
+out_core:
+	smc_core_exit();
+out_alloc_wqs:
+	destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+	destroy_workqueue(smc_hs_wq);
 out_pnet:
 	smc_pnet_exit();
+out_pernet_subsys:
+	unregister_pernet_subsys(&smc_net_ops);
+
 	return rc;
 }
 
 static void __exit smc_exit(void)
 {
-	smc_core_exit();
 	static_branch_disable(&tcp_have_smc);
-	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
+	smc_core_exit();
+	smc_ib_unregister_client();
+	destroy_workqueue(smc_close_wq);
+	destroy_workqueue(smc_hs_wq);
 	proto_unregister(&smc_proto6);
 	proto_unregister(&smc_proto);
 	smc_pnet_exit();
+	unregister_pernet_subsys(&smc_net_ops);
+	rcu_barrier();
 }
 
 module_init(smc_init);

--
Gitblit v1.6.2