From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/net/smc/af_smc.c | 1584 ++++++++++++++++++++++++++++++++++++++++------------------
1 files changed, 1,092 insertions(+), 492 deletions(-)
diff --git a/kernel/net/smc/af_smc.c b/kernel/net/smc/af_smc.c
index 4c904ab..8ab8492 100644
--- a/kernel/net/smc/af_smc.c
+++ b/kernel/net/smc/af_smc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
@@ -24,11 +25,17 @@
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "smc_netns.h"
#include "smc.h"
#include "smc_clc.h"
@@ -42,9 +49,15 @@
#include "smc_rx.h"
#include "smc_close.h"
-static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
- * creation
+static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
+ * creation on server
*/
+static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
+ * creation on client
+ */
+
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
@@ -115,21 +128,74 @@
};
EXPORT_SYMBOL_GPL(smc_proto6);
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+ if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
+ smc->clcsock->file->private_data = smc->sk.sk_socket;
+ smc->clcsock->file = NULL;
+ }
+}
+
+static int __smc_release(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+ int rc = 0;
+
+ if (!smc->use_fallback) {
+ rc = smc_close_active(smc);
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ } else {
+ if (sk->sk_state != SMC_CLOSED) {
+ if (sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_INIT)
+ sock_put(sk); /* passive closing */
+ if (sk->sk_state == SMC_LISTEN) {
+ /* wake up clcsock accept */
+ rc = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ }
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ }
+ smc_restore_fallback_changes(smc);
+ }
+
+ sk->sk_prot->unhash(sk);
+
+ if (sk->sk_state == SMC_CLOSED) {
+ if (smc->clcsock) {
+ release_sock(sk);
+ smc_clcsock_release(smc);
+ lock_sock(sk);
+ }
+ if (!smc->use_fallback)
+ smc_conn_free(&smc->conn);
+ }
+
+ return rc;
+}
+
static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
- int rc = 0;
+ int old_state, rc = 0;
if (!sk)
goto out;
+ sock_hold(sk); /* sock_put below */
smc = smc_sk(sk);
+ old_state = sk->sk_state;
+
/* cleanup for a dangling non-blocking connect */
- flush_work(&smc->connect_work);
- kfree(smc->connect_info);
- smc->connect_info = NULL;
+ if (smc->connect_nonblock && old_state == SMC_INIT)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
+
+ if (cancel_work_sync(&smc->connect_work))
+ sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
if (sk->sk_state == SMC_LISTEN)
/* smc_close_non_accepted() is called and acquires
@@ -139,38 +205,18 @@
else
lock_sock(sk);
- if (!smc->use_fallback) {
- rc = smc_close_active(smc);
- sock_set_flag(sk, SOCK_DEAD);
- sk->sk_shutdown |= SHUTDOWN_MASK;
- }
+ if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
+ !smc->use_fallback)
+ smc_close_active_abort(smc);
- sk->sk_prot->unhash(sk);
-
- if (smc->clcsock) {
- if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
- /* wake up clcsock accept */
- rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
- }
- mutex_lock(&smc->clcsock_release_lock);
- sock_release(smc->clcsock);
- smc->clcsock = NULL;
- mutex_unlock(&smc->clcsock_release_lock);
- }
- if (smc->use_fallback) {
- if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
- sock_put(sk); /* passive closing */
- sk->sk_state = SMC_CLOSED;
- sk->sk_state_change(sk);
- }
+ rc = __smc_release(smc);
/* detach socket */
sock_orphan(sk);
sock->sk = NULL;
- if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
- smc_conn_free(&smc->conn);
release_sock(sk);
+ sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
out:
return rc;
@@ -245,7 +291,7 @@
/* Check if socket is already active */
rc = -EINVAL;
- if (sk->sk_state != SMC_INIT)
+ if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
goto out_rel;
smc->clcsock->sk->sk_reuse = sk->sk_reuse;
@@ -289,7 +335,8 @@
(1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_WIFI_STATUS) | \
(1UL << SOCK_NOFCS) | \
- (1UL << SOCK_FILTER_LOCKED))
+ (1UL << SOCK_FILTER_LOCKED) | \
+ (1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
* clc socket (since smc is not called for these options from net/core)
*/
@@ -308,47 +355,61 @@
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
-/* register a new rmb, optionally send confirm_rkey msg to register with peer */
-static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
- bool conf_rkey)
+/* register the new rmb on all links */
+static int smcr_lgr_reg_rmbs(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
{
- /* register memory region for new rmb */
- if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
- rmb_desc->regerr = 1;
- return -EFAULT;
+ struct smc_link_group *lgr = link->lgr;
+ int i, rc = 0;
+
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (rc)
+ return rc;
+ /* protect against parallel smc_llc_cli_rkey_exchange() and
+ * parallel smcr_link_reg_rmb()
+ */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
+ if (rc)
+ goto out;
}
- if (!conf_rkey)
- return 0;
+
/* exchange confirm_rkey msg with peer */
- if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
- rmb_desc->regerr = 1;
- return -EFAULT;
+ rc = smc_llc_do_confirm_rkey(link, rmb_desc);
+ if (rc) {
+ rc = -EFAULT;
+ goto out;
}
- return 0;
+ rmb_desc->is_conf_rkey = true;
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ return rc;
}
-static int smc_clnt_conf_first_link(struct smc_sock *smc)
+static int smcr_clnt_conf_first_link(struct smc_sock *smc)
{
- struct net *net = sock_net(smc->clcsock->sk);
- struct smc_link_group *lgr = smc->conn.lgr;
- struct smc_link *link;
- int rest;
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
int rc;
- link = &lgr->lnk[SMC_SINGLE_LINK];
/* receive CONFIRM LINK request from server over RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(
- &link->llc_confirm,
- SMC_LLC_WAIT_FIRST_TIME);
- if (rest <= 0) {
+ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
-
- if (link->llc_confirm_rc)
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
return SMC_CLC_DECL_RMBE_EC;
rc = smc_ib_modify_qp_rts(link);
@@ -357,60 +418,86 @@
smc_wr_remember_qp_attr(link);
- if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
+ if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_ERR_REGRMB;
+
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
/* send CONFIRM LINK response over RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
if (rc < 0)
return SMC_CLC_DECL_TIMEOUT_CL;
- /* receive ADD LINK request from server over RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(&link->llc_add,
- SMC_LLC_WAIT_TIME);
- if (rest <= 0) {
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+ /* optional 2nd link, receive ADD LINK request from server */
+ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ if (rc == -EAGAIN)
+ rc = 0; /* no DECLINE received, go with one link */
return rc;
}
-
- /* send add link reject message, only one link supported for now */
- rc = smc_llc_send_add_link(link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_RESP);
- if (rc < 0)
- return SMC_CLC_DECL_TIMEOUT_AL;
-
- smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
-
+ smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
+ smc_llc_cli_add_link(link, qentry);
return 0;
}
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
- smc->conn.peer_rmbe_idx = clc->rmbe_idx;
- smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
+static bool smc_isascii(char *hostname)
+{
+ int i;
+
+ for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+ if (!isascii(hostname[i]))
+ return false;
+ return true;
+}
+
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
- smc->conn.peer_rmbe_idx = clc->dmbe_idx;
- smc->conn.peer_token = clc->token;
+ smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+ smc->conn.peer_token = clc->d0.token;
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
+ if (clc->hdr.version > SMC_V1 &&
+ (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)clc;
+ struct smc_clc_first_contact_ext *fce =
+ (struct smc_clc_first_contact_ext *)
+ (((u8 *)clc_v2) + sizeof(*clc_v2));
+
+ memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
+ SMC_MAX_EID_LEN);
+ smc->conn.lgr->peer_os = fce->os_type;
+ smc->conn.lgr->peer_smc_release = fce->release;
+ if (smc_isascii(fce->hostname))
+ memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+ SMC_MAX_HOSTNAME_LEN);
+ }
}
static void smc_conn_save_peer_info(struct smc_sock *smc,
@@ -425,26 +512,53 @@
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
- link->peer_qpn = ntoh24(clc->qpn);
- memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
- memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
- link->peer_psn = ntoh24(clc->psn);
- link->peer_mtu = clc->qp_mtu;
+ link->peer_qpn = ntoh24(clc->r0.qpn);
+ memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->r0.psn);
+ link->peer_mtu = clc->r0.qp_mtu;
+}
+
+static void smc_switch_to_fallback(struct smc_sock *smc)
+{
+ wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
+ wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
+ unsigned long flags;
+
+ smc->use_fallback = true;
+ if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
+ smc->clcsock->file = smc->sk.sk_socket->file;
+ smc->clcsock->file->private_data = smc->clcsock;
+ smc->clcsock->wq.fasync_list =
+ smc->sk.sk_socket->wq.fasync_list;
+
+ /* Waiters may still be queued on the smc
+ * socket's wait queue; they have to be moved
+ * to the clcsock's wait queue on fallback.
+ */
+ spin_lock_irqsave(&smc_wait->lock, flags);
+ spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
+ list_splice_init(&smc_wait->head, &clc_wait->head);
+ spin_unlock(&clc_wait->lock);
+ spin_unlock_irqrestore(&smc_wait->lock, flags);
+ }
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
- smc->use_fallback = true;
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = reason_code;
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
}
/* decline and fall back during connect */
-static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+ u8 version)
{
int rc;
@@ -454,7 +568,7 @@
return reason_code;
}
if (reason_code != SMC_CLC_DECL_PEERDECL) {
- rc = smc_clc_send_decline(smc, reason_code);
+ rc = smc_clc_send_decline(smc, reason_code, version);
if (rc < 0) {
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
@@ -465,190 +579,367 @@
}
/* abort connecting */
-static int smc_connect_abort(struct smc_sock *smc, int reason_code,
- int local_contact)
+static void smc_connect_abort(struct smc_sock *smc, int local_first)
{
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(smc->conn.lgr);
- mutex_unlock(&smc_create_lgr_pending);
- smc_conn_free(&smc->conn);
- return reason_code;
+ if (local_first)
+ smc_lgr_cleanup_early(&smc->conn);
+ else
+ smc_conn_free(&smc->conn);
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
-static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
- u8 *ibport, unsigned short vlan_id, u8 gid[])
+static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
{
- int reason_code = 0;
-
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
- smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
- gid);
- if (!(*ibdev))
- reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-
- return reason_code;
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
+ if (!ini->ib_dev)
+ return SMC_CLC_DECL_NOSMCRDEV;
+ return 0;
}
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
-static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
+static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
- smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
- if (!(*ismdev))
- return SMC_CLC_DECL_CNFERR; /* configuration error */
+ smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
+ if (!ini->ism_dev[0])
+ return SMC_CLC_DECL_NOSMCDDEV;
+ else
+ ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
return 0;
+}
+
+/* return true if none of the first cnt ISM devices in ini uses this chid */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+ int cnt)
+{
+ int i = (!ini->ism_dev[0]) ? 1 : 0; /* skip slot 0 when it is unset */
+
+ for (; i < cnt; i++)
+ if (ini->ism_chid[i] == chid)
+ return false;
+ return true;
+}
+
+/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
+ * PNETID matching net_device)
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = SMC_CLC_DECL_NOSMCDDEV;
+ struct smcd_dev *smcd;
+ int i = 1; /* slot 0 is filled by smc_find_ism_device() */
+ u16 chid;
+
+ if (smcd_indicated(ini->smc_type_v1))
+ rc = 0; /* already initialized for V1 */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away || smcd == ini->ism_dev[0])
+ continue;
+ chid = smc_ism_get_chid(smcd);
+ if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
+ continue;
+ if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
+ smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
+ ini->ism_dev[i] = smcd;
+ ini->ism_chid[i] = chid;
+ ini->is_smcd = true;
+ rc = 0;
+ i++;
+ if (i > SMC_MAX_ISM_DEVS)
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ ini->ism_offered_cnt = i - 1; /* number of slots filled above */
+ if (!ini->ism_dev[0] && !ini->ism_dev[1])
+ ini->smcd_version = 0;
+
+ return rc;
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
- struct smcd_dev *ismdev,
- unsigned short vlan_id)
+ struct smc_init_info *ini)
{
- if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
- return SMC_CLC_DECL_CNFERR;
+ if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
+ return SMC_CLC_DECL_ISMVLANERR;
return 0;
+}
+
+static int smc_find_proposal_devices(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* check if there is an ism device available */
+ if (ini->smcd_version & SMC_V1) {
+ if (smc_find_ism_device(smc, ini) ||
+ smc_connect_ism_vlan_setup(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_R;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else ISM V1 is supported for this connection */
+ if (smc_find_rdma_device(smc, ini)) {
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ ini->smc_type_v1 = SMC_TYPE_D;
+ else
+ ini->smc_type_v1 = SMC_TYPE_N;
+ } /* else RDMA is supported for this connection */
+ }
+ if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini))
+ ini->smc_type_v2 = SMC_TYPE_N;
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (!smcr_indicated(ini->smc_type_v1) &&
+ ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+ rc = SMC_CLC_DECL_NOSMCDEV;
+
+ return rc;
}
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
-static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
- struct smcd_dev *ismdev,
- unsigned short vlan_id)
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
+ struct smc_init_info *ini)
{
- if (!is_smcd)
+ if (!smcd_indicated(ini->smc_type_v1))
return 0;
- if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
+ if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
+#define SMC_CLC_MAX_ACCEPT_LEN \
+ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+ sizeof(struct smc_clc_first_contact_ext) + \
+ sizeof(struct smc_clc_msg_trail))
+
/* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc, int smc_type,
- struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport,
- u8 gid[], struct smcd_dev *ismdev)
+static int smc_connect_clc(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *aclc2,
+ struct smc_init_info *ini)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
+ rc = smc_clc_send_proposal(smc, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
- return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
+ return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+ SMC_CLC_ACCEPT, CLC_WAIT_TIME);
}
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport)
+ struct smc_init_info *ini)
{
- int local_contact = SMC_FIRST_CONTACT;
+ int i, reason_code = 0;
struct smc_link *link;
- int reason_code = 0;
- mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
- ibport, &aclc->lcl, NULL, 0);
- if (local_contact < 0) {
- if (local_contact == -ENOMEM)
- reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
- else if (local_contact == -ENOLINK)
- reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
- else
- reason_code = SMC_CLC_DECL_INTERR; /* other error */
- return smc_connect_abort(smc, reason_code, 0);
+ ini->is_smcd = false;
+ ini->ib_lcl = &aclc->r0.lcl;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ mutex_lock(&smc_client_lgr_pending);
+ reason_code = smc_conn_create(smc, ini);
+ if (reason_code) {
+ mutex_unlock(&smc_client_lgr_pending);
+ return reason_code;
}
- link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
smc_conn_save_peer_info(smc, aclc);
- /* create send buffer and rmb */
- if (smc_buf_create(smc, false))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+ if (ini->first_contact_local) {
+ link = smc->conn.lnk;
+ } else {
+ /* set link that was assigned by server */
+ link = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *l = &smc->conn.lgr->lnk[i];
- if (local_contact == SMC_FIRST_CONTACT)
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac))) {
+ link = l;
+ break;
+ }
+ }
+ if (!link) {
+ reason_code = SMC_CLC_DECL_NOSRVLINK;
+ goto connect_abort;
+ }
+ smc->conn.lnk = link;
+ }
+
+ /* create send buffer and rmb */
+ if (smc_buf_create(smc, false)) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
+
+ if (ini->first_contact_local)
smc_link_save_peer_info(link, aclc);
- if (smc_rmb_rtoken_handling(&smc->conn, aclc))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
- local_contact);
+ if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
+ goto connect_abort;
+ }
smc_close_init(smc);
smc_rx_init(smc);
- if (local_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
- local_contact);
+ if (ini->first_contact_local) {
+ if (smc_ib_ready_link(link)) {
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+ goto connect_abort;
+ }
} else {
- if (!smc->conn.rmb_desc->reused &&
- smc_reg_rmb(link, smc->conn.rmb_desc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
- local_contact);
+ if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGRMB;
+ goto connect_abort;
+ }
}
smc_rmb_sync_sg_for_device(&smc->conn);
- reason_code = smc_clc_send_confirm(smc);
+ reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+ SMC_V1);
if (reason_code)
- return smc_connect_abort(smc, reason_code, local_contact);
+ goto connect_abort;
smc_tx_init(smc);
- if (local_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
/* QP confirmation over RoCE fabric */
- reason_code = smc_clnt_conf_first_link(smc);
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_clnt_conf_first_link(smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
if (reason_code)
- return smc_connect_abort(smc, reason_code,
- local_contact);
+ goto connect_abort;
}
- mutex_unlock(&smc_create_lgr_pending);
+ mutex_unlock(&smc_client_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
+connect_abort:
+ smc_connect_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_client_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return reason_code;
+}
+
+/* Find the proposed ISM device whose CHID the server sent in CLC ACCEPT and
+ * record its slot in ini->ism_selected; returns 0, or -EPROTO if none matches.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+ struct smc_init_info *ini)
+{
+ int i;
+
+ for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+ if (ini->ism_chid[i] == ntohs(aclc->chid)) {
+ ini->ism_selected = i;
+ return 0;
+ }
+ }
+
+ return -EPROTO;
}
/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
- struct smcd_dev *ismdev)
+ struct smc_init_info *ini)
{
- int local_contact = SMC_FIRST_CONTACT;
int rc = 0;
- mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
- NULL, ismdev, aclc->gid);
- if (local_contact < 0)
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
+ ini->is_smcd = true;
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ if (aclc->hdr.version == SMC_V2) {
+ struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+ if (rc)
+ return rc;
+ }
+ ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
+
+ /* there is only one lgr role for SMC-D; use server lock */
+ mutex_lock(&smc_server_lgr_pending);
+ rc = smc_conn_create(smc, ini);
+ if (rc) {
+ mutex_unlock(&smc_server_lgr_pending);
+ return rc;
+ }
/* Create send and receive buffers */
- if (smc_buf_create(smc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+ rc = smc_buf_create(smc, true);
+ if (rc) {
+ rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
- rc = smc_clc_send_confirm(smc);
+ rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version);
if (rc)
- return smc_connect_abort(smc, rc, local_contact);
- mutex_unlock(&smc_create_lgr_pending);
+ goto connect_abort;
+ mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
+ smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+connect_abort:
+ smc_connect_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_server_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return rc;
+}
+
+/* check that type and version of the received accept match a proposed one */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+ struct smc_clc_msg_accept_confirm *aclc)
+{
+ if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+ !smcr_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.typev1 == SMC_TYPE_D &&
+ ((!smcd_indicated(ini->smc_type_v1) &&
+ !smcd_indicated(ini->smc_type_v2)) ||
+ (aclc->hdr.version == SMC_V1 &&
+ !smcd_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.version == SMC_V2 &&
+ !smcd_indicated(ini->smc_type_v2)))))
+ return SMC_CLC_DECL_MODEUNSUPP; /* accepted mode was not proposed */
return 0;
}
@@ -656,17 +947,12 @@
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
- bool ism_supported = false, rdma_supported = false;
- struct smc_clc_msg_accept_confirm aclc;
- struct smc_ib_device *ibdev;
- struct smcd_dev *ismdev;
- u8 gid[SMC_GID_SIZE];
- unsigned short vlan;
- int smc_type;
+ u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
+ struct smc_clc_msg_accept_confirm_v2 *aclc2;
+ struct smc_clc_msg_accept_confirm *aclc;
+ struct smc_init_info *ini = NULL;
+ u8 *buf = NULL;
int rc = 0;
- u8 ibport;
-
- sock_hold(&smc->sk); /* sock put in passive closing */
if (smc->use_fallback)
return smc_connect_fallback(smc, smc->fallback_rsn);
@@ -675,74 +961,107 @@
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
- /* IPSec connections opt out of SMC-R optimizations */
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(smc))
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+ version);
- /* check for VLAN ID */
- if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+ version);
- /* check if there is an ism device available */
- if (!smc_check_ism(smc, &ismdev) &&
- !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
- /* ISM is supported for this connection */
- ism_supported = true;
- smc_type = SMC_TYPE_D;
+ ini->smcd_version = SMC_V1;
+ ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0;
+ ini->smc_type_v1 = SMC_TYPE_B;
+ ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+ ini->smcd_version &= ~SMC_V1;
+ ini->smc_type_v1 = SMC_TYPE_N;
+ if (!ini->smcd_version) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto fallback;
+ }
}
- /* check if there is a rdma device available */
- if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
- /* RDMA is supported for this connection */
- rdma_supported = true;
- if (ism_supported)
- smc_type = SMC_TYPE_B; /* both */
- else
- smc_type = SMC_TYPE_R; /* only RDMA */
- }
+ rc = smc_find_proposal_devices(smc, ini);
+ if (rc)
+ goto fallback;
- /* if neither ISM nor RDMA are supported, fallback */
- if (!rdma_supported && !ism_supported)
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
+ buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto fallback;
+ }
+ aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+ aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
/* perform CLC handshake */
- rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
- if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
- return smc_connect_decline_fallback(smc, rc);
- }
+ rc = smc_connect_clc(smc, aclc2, ini);
+ if (rc)
+ goto vlan_cleanup;
+
+ /* check if smc modes and versions of CLC proposal and accept match */
+ rc = smc_connect_check_aclc(ini, aclc);
+ version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+ ini->smcd_version = version;
+ if (rc)
+ goto vlan_cleanup;
/* depending on previous steps, connect using rdma or ism */
- if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
- rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
- else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
- rc = smc_connect_ism(smc, &aclc, ismdev);
- else
- rc = SMC_CLC_DECL_MODEUNSUPP;
- if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
- return smc_connect_decline_fallback(smc, rc);
- }
+ if (aclc->hdr.typev1 == SMC_TYPE_R)
+ rc = smc_connect_rdma(smc, aclc, ini);
+ else if (aclc->hdr.typev1 == SMC_TYPE_D)
+ rc = smc_connect_ism(smc, aclc, ini);
+ if (rc)
+ goto vlan_cleanup;
- smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+ kfree(ini);
return 0;
+
+vlan_cleanup:
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+fallback:
+ kfree(ini);
+ return smc_connect_decline_fallback(smc, rc, version);
}
static void smc_connect_work(struct work_struct *work)
{
struct smc_sock *smc = container_of(work, struct smc_sock,
connect_work);
- int rc;
+ long timeo = smc->sk.sk_sndtimeo;
+ int rc = 0;
- lock_sock(&smc->sk);
- rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
- smc->connect_info->alen, smc->connect_info->flags);
+ if (!timeo)
+ timeo = MAX_SCHEDULE_TIMEOUT;
+ lock_sock(smc->clcsock->sk);
if (smc->clcsock->sk->sk_err) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
- goto out;
+ } else if ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
+ if ((rc == -EPIPE) &&
+ ((1 << smc->clcsock->sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
+ rc = 0;
}
- if (rc < 0) {
- smc->sk.sk_err = -rc;
+ release_sock(smc->clcsock->sk);
+ lock_sock(&smc->sk);
+ if (rc != 0 || smc->sk.sk_err) {
+ smc->sk.sk_state = SMC_CLOSED;
+ if (rc == -EPIPE || rc == -EAGAIN)
+ smc->sk.sk_err = EPIPE;
+ else if (rc == -ECONNREFUSED)
+ smc->sk.sk_err = ECONNREFUSED;
+ else if (signal_pending(current))
+ smc->sk.sk_err = -sock_intr_errno(timeo);
+ sock_put(&smc->sk); /* passive closing */
goto out;
}
@@ -751,12 +1070,14 @@
smc->sk.sk_err = -rc;
out:
- if (smc->sk.sk_err)
- smc->sk.sk_state_change(&smc->sk);
- else
- smc->sk.sk_write_space(&smc->sk);
- kfree(smc->connect_info);
- smc->connect_info = NULL;
+ if (!sock_flag(&smc->sk, SOCK_DEAD)) {
+ if (smc->sk.sk_err) {
+ smc->sk.sk_state_change(&smc->sk);
+ } else { /* allow polling before and after fallback decision */
+ smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
+ smc->sk.sk_write_space(&smc->sk);
+ }
+ }
release_sock(&smc->sk);
}
@@ -789,26 +1110,22 @@
smc_copy_sock_settings_to_clc(smc);
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ if (smc->connect_nonblock) {
+ rc = -EALREADY;
+ goto out;
+ }
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc && rc != -EINPROGRESS)
+ goto out;
+
+ if (smc->use_fallback)
+ goto out;
+ sock_hold(&smc->sk); /* sock put in passive closing */
if (flags & O_NONBLOCK) {
- if (smc->connect_info) {
- rc = -EALREADY;
- goto out;
- }
- smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
- if (!smc->connect_info) {
- rc = -ENOMEM;
- goto out;
- }
- smc->connect_info->alen = alen;
- smc->connect_info->flags = flags ^ O_NONBLOCK;
- memcpy(&smc->connect_info->addr, addr, alen);
- schedule_work(&smc->connect_work);
+ if (queue_work(smc_hs_wq, &smc->connect_work))
+ smc->connect_nonblock = 1;
rc = -EINPROGRESS;
} else {
- rc = kernel_connect(smc->clcsock, addr, alen, flags);
- if (rc)
- goto out;
-
rc = __smc_connect(smc);
if (rc < 0)
goto out;
@@ -842,10 +1159,10 @@
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
- if (rc < 0)
+ if (rc < 0 && rc != -EAGAIN)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
@@ -858,6 +1175,10 @@
goto out;
}
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
@@ -911,8 +1232,13 @@
sock_put(new_sk); /* final */
continue;
}
- if (new_sock)
+ if (new_sock) {
sock_graft(new_sk, new_sock);
+ if (isk->use_fallback) {
+ smc_sk(new_sk)->clcsock->file = new_sock->file;
+ isk->clcsock->file->private_data = isk->clcsock;
+ }
+ }
return new_sk;
}
return NULL;
@@ -923,45 +1249,24 @@
{
struct smc_sock *smc = smc_sk(sk);
+ sock_hold(sk); /* sock_put below */
lock_sock(sk);
if (!sk->sk_lingertime)
/* wait for peer closing */
sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
- if (!smc->use_fallback) {
- smc_close_active(smc);
- sock_set_flag(sk, SOCK_DEAD);
- sk->sk_shutdown |= SHUTDOWN_MASK;
- }
- sk->sk_prot->unhash(sk);
- if (smc->clcsock) {
- struct socket *tcp;
-
- tcp = smc->clcsock;
- smc->clcsock = NULL;
- sock_release(tcp);
- }
- if (smc->use_fallback) {
- sock_put(sk); /* passive closing */
- sk->sk_state = SMC_CLOSED;
- } else {
- if (sk->sk_state == SMC_CLOSED)
- smc_conn_free(&smc->conn);
- }
+ __smc_release(smc);
release_sock(sk);
+ sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
}
-static int smc_serv_conf_first_link(struct smc_sock *smc)
+static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
- struct net *net = sock_net(smc->clcsock->sk);
- struct smc_link_group *lgr = smc->conn.lgr;
- struct smc_link *link;
- int rest;
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
int rc;
- link = &lgr->lnk[SMC_SINGLE_LINK];
-
- if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
+ if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_ERR_REGRMB;
/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -970,40 +1275,29 @@
return SMC_CLC_DECL_TIMEOUT_CL;
/* receive CONFIRM LINK response from client over the RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(
- &link->llc_confirm_resp,
- SMC_LLC_WAIT_FIRST_TIME);
- if (rest <= 0) {
+ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
+ SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
+ return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
-
- if (link->llc_confirm_resp_rc)
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
return SMC_CLC_DECL_RMBE_EC;
- /* send ADD LINK request to client over the RoCE fabric */
- rc = smc_llc_send_add_link(link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_REQ);
- if (rc < 0)
- return SMC_CLC_DECL_TIMEOUT_AL;
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
- /* receive ADD LINK response from client over the RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
- SMC_LLC_WAIT_TIME);
- if (rest <= 0) {
- struct smc_clc_msg_decline dclc;
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
- rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE);
- return rc;
- }
-
- smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
-
+ /* initial contact - try to establish second link */
+ smc_llc_srv_add_link(link);
return 0;
}
@@ -1013,13 +1307,13 @@
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
- lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
if (lsmc->sk.sk_state == SMC_LISTEN) {
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ release_sock(&lsmc->sk);
} else { /* no longer listening */
smc_close_non_accepted(newsmcsk);
}
- release_sock(&lsmc->sk);
/* Wake up accept */
lsmc->sk.sk_data_ready(&lsmc->sk);
@@ -1031,7 +1325,6 @@
{
struct sock *newsmcsk = &new_smc->sk;
- sk_refcnt_debug_inc(newsmcsk);
if (newsmcsk->sk_state == SMC_INIT)
newsmcsk->sk_state = SMC_ACTIVE;
@@ -1046,27 +1339,27 @@
if (newsmcsk->sk_state == SMC_INIT)
sock_put(&new_smc->sk); /* passive closing */
newsmcsk->sk_state = SMC_CLOSED;
- smc_conn_free(&new_smc->conn);
smc_listen_out(new_smc);
}
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
- int local_contact)
+ int local_first, u8 version)
{
/* RDMA setup failed, switch back to TCP */
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
+ if (local_first)
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
if (reason_code < 0) { /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
- smc_conn_free(&new_smc->conn);
- new_smc->use_fallback = true;
+ smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
- if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+ if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
smc_listen_out_err(new_smc);
return;
}
@@ -1074,34 +1367,73 @@
smc_listen_out_connected(new_smc);
}
+/* listen worker: version checking; returns 0 or an SMC_CLC_DECL_* code */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
+ struct smc_clc_v2_extension *pclc_v2_ext;
+
+ ini->smc_type_v1 = pclc->hdr.typev1; /* keep the proposed types in ini */
+ ini->smc_type_v2 = pclc->hdr.typev2;
+ ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
+ if (pclc->hdr.version > SMC_V1) /* V2 type is only honoured above V1 */
+ ini->smcd_version |=
+ ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
+ if (!smc_ism_v2_capable) { /* V2 bit is dropped when this flag is unset */
+ ini->smcd_version &= ~SMC_V2;
+ goto out;
+ }
+ pclc_v2_ext = smc_get_clc_v2_ext(pclc);
+ if (!pclc_v2_ext) { /* no V2 extension obtained from the proposal */
+ ini->smcd_version &= ~SMC_V2;
+ goto out;
+ }
+ pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
+ if (!pclc_smcd_v2_ext) /* no SMC-D V2 extension obtained */
+ ini->smcd_version &= ~SMC_V2;
+
+out:
+ if (!ini->smcd_version) { /* no version left: choose the decline reason */
+ if (pclc->hdr.typev1 == SMC_TYPE_B ||
+ pclc->hdr.typev2 == SMC_TYPE_B)
+ return SMC_CLC_DECL_NOSMCDEV;
+ if (pclc->hdr.typev1 == SMC_TYPE_D ||
+ pclc->hdr.typev2 == SMC_TYPE_D)
+ return SMC_CLC_DECL_NOSMCDDEV;
+ return SMC_CLC_DECL_NOSMCRDEV;
+ }
+
+ return 0;
+}
+
/* listen worker: check prefixes */
-static int smc_listen_rdma_check(struct smc_sock *new_smc,
+static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc)
{
struct smc_clc_msg_proposal_prefix *pclc_prfx;
struct socket *newclcsock = new_smc->clcsock;
+ if (pclc->hdr.typev1 == SMC_TYPE_N) /* no V1 type proposed: skip the prefix check */
+ return 0;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
- return SMC_CLC_DECL_CNFERR;
+ return SMC_CLC_DECL_DIFFPREFIX; /* non-zero match result is reported as a prefix difference */
return 0;
}
/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
- struct smc_clc_msg_proposal *pclc,
- struct smc_ib_device *ibdev, u8 ibport,
- int *local_contact)
+ struct smc_init_info *ini)
{
+ int rc;
+
/* allocate connection / link group */
- *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
- &pclc->lcl, NULL, 0);
- if (*local_contact < 0) {
- if (*local_contact == -ENOMEM)
- return SMC_CLC_DECL_MEM;/* insufficient memory*/
- return SMC_CLC_DECL_INTERR; /* other error */
- }
+ rc = smc_conn_create(new_smc, ini);
+ if (rc)
+ return rc;
/* create send buffer and rmb */
if (smc_buf_create(new_smc, false))
@@ -1112,109 +1444,266 @@
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
- struct smc_clc_msg_proposal *pclc,
- struct smcd_dev *ismdev,
- int *local_contact)
+ struct smc_init_info *ini)
{
- struct smc_clc_msg_smcd *pclc_smcd;
+ int rc;
- pclc_smcd = smc_get_clc_msg_smcd(pclc);
- *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
- ismdev, pclc_smcd->gid);
- if (*local_contact < 0) {
- if (*local_contact == -ENOMEM)
- return SMC_CLC_DECL_MEM;/* insufficient memory*/
- return SMC_CLC_DECL_INTERR; /* other error */
- }
-
- /* Check if peer can be reached via ISM device */
- if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
- new_smc->conn.lgr->vlan_id,
- new_smc->conn.lgr->smcd)) {
- if (*local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_CNFERR;
- }
+ rc = smc_conn_create(new_smc, ini); /* all connection parameters now travel in ini */
+ if (rc)
+ return rc; /* smc_conn_create() result is passed through unchanged */
/* Create send and receive buffers */
- if (smc_buf_create(new_smc, true)) {
- if (*local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_MEM;
+ rc = smc_buf_create(new_smc, true);
+ if (rc) { /* buffer setup failed: undo smc_conn_create() before returning */
+ if (ini->first_contact_local)
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
+ return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : /* -ENOSPC gets its own decline code */
+ SMC_CLC_DECL_MEM;
}
return 0;
}
-/* listen worker: register buffers */
-static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+ struct smc_init_info *ini,
+ int matches)
{
- struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+ int i;
- if (local_contact != SMC_FIRST_CONTACT) {
- if (!new_smc->conn.rmb_desc->reused) {
- if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
- return SMC_CLC_DECL_ERR_REGRMB;
+ for (i = 0; i < matches; i++) /* only the first 'matches' entries of ini->ism_dev[] are compared */
+ if (smcd == ini->ism_dev[i])
+ return true; /* smcd is already stored in ini->ism_dev[] */
+
+ return false;
+}
+
+/* check smcd_dev_list for one ISM device matching the proposed CHID/GID; the caller in this file holds smcd_dev_list.mutex */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+ u16 proposed_chid, u64 proposed_gid,
+ unsigned int *matches)
+{
+ struct smcd_dev *smcd;
+
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away)
+ continue;
+ if (smc_is_already_selected(smcd, ini, *matches)) /* never store the same device twice */
+ continue;
+ if (smc_ism_get_chid(smcd) == proposed_chid &&
+ !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+ ini->ism_peer_gid[*matches] = proposed_gid; /* peer GID and device share one index */
+ ini->ism_dev[*matches] = smcd;
+ (*matches)++;
+ break; /* at most one device is recorded per call */
}
+ }
+}
+
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{ /* server side: select and initialize a V2 ISM device; result is reported via ini (ism_dev[0] is NULL on failure) */
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_v2_extension *smc_v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ unsigned int matches = 0;
+ u8 smcd_version;
+ u8 *eid = NULL;
+ int i;
+
+ if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+ goto not_found;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ smc_v2_ext = smc_get_clc_v2_ext(pclc);
+ smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+ if (!smcd_v2_ext ||
+ !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */
+ goto not_found;
+
+ mutex_lock(&smcd_dev_list.mutex); /* held while smcd_dev_list is searched below */
+ if (pclc_smcd->ism.chid)
+ /* check for ISM device matching proposed native ISM device */
+ smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+ ntohll(pclc_smcd->ism.gid), &matches);
+ for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
+ /* check for ISM devices matching proposed non-native ISM
+ * devices; i is 1-based, gidchid[] is indexed with i - 1
+ */
+ smc_check_ism_v2_match(ini,
+ ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+ ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+ &matches);
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ if (ini->ism_dev[0]) { /* only the first matched device's system EID is compared */
+ smc_ism_get_system_eid(ini->ism_dev[0], &eid);
+ if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN))
+ goto not_found;
+ } else {
+ goto not_found;
+ }
+
+ /* separate loop - runs outside the smcd_dev_list.mutex */
+ smcd_version = ini->smcd_version; /* saved so it can be restored if no device initializes */
+ for (i = 0; i < matches; i++) {
+ ini->smcd_version = SMC_V2;
+ ini->is_smcd = true;
+ ini->ism_selected = i;
+ if (smc_listen_ism_init(new_smc, ini))
+ /* try next active ISM device */
+ continue;
+ return; /* matching and usable V2 ISM device found */
+ }
+ /* no V2 ISM device could be initialized */
+ ini->smcd_version = smcd_version; /* restore original value */
+
+not_found:
+ ini->smcd_version &= ~SMC_V2; /* V2 is ruled out for the rest of the handshake */
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{ /* server side: select and initialize a V1 ISM device; ini->ism_dev[0] is NULL on failure */
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+
+ /* check if ISM V1 is available */
+ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+ goto not_found;
+ ini->is_smcd = true; /* prepare ISM check */
+ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); /* peer GID from the proposal, converted with ntohll() */
+ if (smc_find_ism_device(new_smc, ini))
+ goto not_found;
+ ini->ism_selected = 0; /* V1 always uses slot 0 */
+ if (!smc_listen_ism_init(new_smc, ini))
+ return; /* V1 ISM device found */
+
+not_found:
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+/* listen worker: register buffers; returns 0 or SMC_CLC_DECL_ERR_REGRMB */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
+{
+ struct smc_connection *conn = &new_smc->conn;
+
+ if (!local_first) { /* on first contact smcr_serv_conf_first_link() registers the RMB instead */
+ if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
+ return SMC_CLC_DECL_ERR_REGRMB;
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
return 0;
}
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{ /* server side: find an RDMA device, then create and register the connection; returns 0 or a decline code */
+ int rc;
+
+ if (!smcr_indicated(ini->smc_type_v1))
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* prepare RDMA check */
+ ini->ib_lcl = &pclc->lcl; /* points into the proposal buffer, no copy is made */
+ rc = smc_find_rdma_device(new_smc, ini);
+ if (rc) {
+ /* no RDMA device found */
+ if (ini->smc_type_v1 == SMC_TYPE_B)
+ /* neither ISM nor RDMA device found */
+ rc = SMC_CLC_DECL_NOSMCDEV;
+ return rc;
+ }
+ rc = smc_listen_rdma_init(new_smc, ini);
+ if (rc)
+ return rc;
+ return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching the proposal (ISM V2, ISM V1, then RDMA V1); returns 0 or a decline code */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ /* check for ISM device matching V2 proposed device */
+ smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0]) /* the finder leaves ism_dev[0] set only on success */
+ return 0;
+
+ if (!(ini->smcd_version & SMC_V1)) /* V2 failed and V1 is not available */
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* check for matching IP prefix and subnet length */
+ rc = smc_listen_prfx_check(new_smc, pclc);
+ if (rc)
+ return rc;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+ return SMC_CLC_DECL_GETVLANERR;
+
+ /* check for ISM device matching V1 proposed device */
+ smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (pclc->hdr.typev1 == SMC_TYPE_D)
+ return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */
+
+ /* check if RDMA is available */
+ return smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+}
+
/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
struct smc_clc_msg_accept_confirm *cclc,
- int local_contact)
+ bool local_first)
{
- struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+ struct smc_link *link = new_smc->conn.lnk; /* link is taken from the connection itself */
int reason_code = 0;
- if (local_contact == SMC_FIRST_CONTACT)
+ if (local_first)
smc_link_save_peer_info(link, cclc);
- if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
- reason_code = SMC_CLC_DECL_ERR_RTOK;
- goto decline;
- }
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK; /* no unlock or decline here any more; only the code is returned */
- if (local_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_ERR_RDYLNK;
- goto decline;
- }
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
/* QP confirmation over RoCE fabric */
- reason_code = smc_serv_conf_first_link(new_smc);
- if (reason_code)
- goto decline;
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); /* flow is started before the confirm-link exchange */
+ reason_code = smcr_serv_conf_first_link(new_smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); /* stopped whether or not the exchange succeeded */
}
- return 0;
-
-decline:
- mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, reason_code, local_contact);
return reason_code;
}
-/* setup for RDMA connection of server */
+/* setup for connection of server */
static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
+ u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
struct socket *newclcsock = new_smc->clcsock;
- struct smc_clc_msg_accept_confirm cclc;
+ struct smc_clc_msg_accept_confirm *cclc;
+ struct smc_clc_msg_proposal_area *buf;
struct smc_clc_msg_proposal *pclc;
- struct smc_ib_device *ibdev;
- bool ism_supported = false;
- struct smcd_dev *ismdev;
- u8 buf[SMC_CLC_MAX_LEN];
- int local_contact = 0;
- unsigned short vlan;
- int reason_code = 0;
+ struct smc_init_info *ini = NULL;
int rc = 0;
- u8 ibport;
+
+ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
+ return smc_listen_out_err(new_smc);
if (new_smc->use_fallback) {
smc_listen_out_connected(new_smc);
@@ -1223,7 +1712,7 @@
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
- new_smc->use_fallback = true;
+ smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
smc_listen_out_connected(new_smc);
return;
@@ -1232,73 +1721,86 @@
/* do inband token exchange -
* wait for and receive SMC Proposal CLC message
*/
- pclc = (struct smc_clc_msg_proposal *)&buf;
- reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
- SMC_CLC_PROPOSAL);
- if (reason_code) {
- smc_listen_decline(new_smc, reason_code, 0);
- return;
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
}
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
+ SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
+ if (rc)
+ goto out_decl;
+ version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
- /* IPSec connections opt out of SMC-R optimizations */
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(new_smc)) {
- smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
- return;
+ rc = SMC_CLC_DECL_IPSEC;
+ goto out_decl;
}
- mutex_lock(&smc_create_lgr_pending);
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+
+ /* initial version checking */
+ rc = smc_listen_v2_check(new_smc, pclc, ini);
+ if (rc)
+ goto out_decl;
+
+ mutex_lock(&smc_server_lgr_pending);
smc_close_init(new_smc);
smc_rx_init(new_smc);
smc_tx_init(new_smc);
- /* check if ISM is available */
- if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
- !smc_check_ism(new_smc, &ismdev) &&
- !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
- ism_supported = true;
- }
-
- /* check if RDMA is available */
- if (!ism_supported &&
- ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
- smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
- smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
- smc_listen_rdma_check(new_smc, pclc) ||
- smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
- &local_contact) ||
- smc_listen_rdma_reg(new_smc, local_contact))) {
- /* SMC not supported, decline */
- mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
- local_contact);
- return;
- }
+ /* determine ISM or RoCE device used for connection */
+ rc = smc_listen_find_device(new_smc, pclc, ini);
+ if (rc)
+ goto out_unlock;
/* send SMC Accept CLC message */
- rc = smc_clc_send_accept(new_smc, local_contact);
- if (rc) {
- mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, rc, local_contact);
- return;
- }
+ rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+ ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1);
+ if (rc)
+ goto out_unlock;
+
+ /* SMC-D does not need this lock any more */
+ if (ini->is_smcd)
+ mutex_unlock(&smc_server_lgr_pending);
/* receive SMC Confirm CLC message */
- reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
- SMC_CLC_CONFIRM);
- if (reason_code) {
- mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, reason_code, local_contact);
- return;
+ memset(buf, 0, sizeof(*buf));
+ cclc = (struct smc_clc_msg_accept_confirm *)buf;
+ rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
+ SMC_CLC_CONFIRM, CLC_WAIT_TIME);
+ if (rc) {
+ if (!ini->is_smcd)
+ goto out_unlock;
+ goto out_decl;
}
/* finish worker */
- if (!ism_supported) {
- if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
- return;
+ if (!ini->is_smcd) {
+ rc = smc_listen_rdma_finish(new_smc, cclc,
+ ini->first_contact_local);
+ if (rc)
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
}
- smc_conn_save_peer_info(new_smc, &cclc);
- mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_save_peer_info(new_smc, cclc);
smc_listen_out_connected(new_smc);
+ goto out_free;
+
+out_unlock:
+ mutex_unlock(&smc_server_lgr_pending);
+out_decl:
+ smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+ version);
+out_free:
+ kfree(ini);
+ kfree(buf);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -1312,7 +1814,7 @@
lock_sock(lsk);
while (lsk->sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
- if (rc)
+ if (rc) /* clcsock accept queue empty or error */
goto out;
if (!new_smc)
continue;
@@ -1326,13 +1828,29 @@
new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
- if (!schedule_work(&new_smc->smc_listen_work))
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
}
out:
release_sock(lsk);
- sock_put(&lsmc->sk); /* sock_hold in smc_listen */
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{ /* sk_data_ready callback that smc_listen() installs on the listening clcsock */
+ struct smc_sock *lsmc;
+
+ lsmc = (struct smc_sock *) /* mask off the SK_USER_DATA_NOCOPY bit that smc_listen() ORs into sk_user_data */
+ ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
+ if (!lsmc)
+ return;
+ lsmc->clcsk_data_ready(listen_clcsock); /* chain to the original callback saved in smc_listen() */
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk); /* nothing was queued, so drop the reference taken above */
+ }
}
static int smc_listen(struct socket *sock, int backlog)
@@ -1345,7 +1863,8 @@
lock_sock(sk);
rc = -EINVAL;
- if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+ if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
+ smc->connect_nonblock)
goto out;
rc = 0;
@@ -1360,16 +1879,21 @@
if (!smc->use_fallback)
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ /* save original sk_data_ready function and establish
+ * smc-specific sk_data_ready function
+ */
+ smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
+ smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
rc = kernel_listen(smc->clcsock, backlog);
- if (rc)
+ if (rc) {
+ smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
goto out;
+ }
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
- INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
- sock_hold(sk); /* sock_hold in tcp_listen_worker */
- if (!schedule_work(&smc->tcp_listen_work))
- sock_put(sk);
out:
release_sock(sk);
@@ -1464,23 +1988,26 @@
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
- int rc = -EPIPE;
+ int rc;
smc = smc_sk(sk);
lock_sock(sk);
- if ((sk->sk_state != SMC_ACTIVE) &&
- (sk->sk_state != SMC_APPCLOSEWAIT1) &&
- (sk->sk_state != SMC_INIT))
- goto out;
+ /* SMC does not support connect with fastopen */
if (msg->msg_flags & MSG_FASTOPEN) {
- if (sk->sk_state == SMC_INIT) {
- smc->use_fallback = true;
+ /* not connected yet, fallback */
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
rc = -EINVAL;
goto out;
}
+ } else if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_INIT)) {
+ rc = -EPIPE;
+ goto out;
}
if (smc->use_fallback)
@@ -1501,6 +2028,11 @@
smc = smc_sk(sk);
lock_sock(sk);
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
if ((sk->sk_state == SMC_INIT) ||
(sk->sk_state == SMC_LISTEN) ||
(sk->sk_state == SMC_CLOSED))
@@ -1540,8 +2072,8 @@
poll_table *wait)
{
struct sock *sk = sock->sk;
- __poll_t mask = 0;
struct smc_sock *smc;
+ __poll_t mask = 0;
if (!sk)
return EPOLLNVAL;
@@ -1551,8 +2083,6 @@
/* delegate to CLC child sock */
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
sk->sk_err = smc->clcsock->sk->sk_err;
- if (sk->sk_err)
- mask |= EPOLLERR;
} else {
if (sk->sk_state != SMC_CLOSED)
sock_poll_wait(file, sock, wait);
@@ -1563,9 +2093,14 @@
mask |= EPOLLHUP;
if (sk->sk_state == SMC_LISTEN) {
/* woken up by sk_data_ready in smc_listen_work() */
- mask = smc_accept_poll(sk);
+ mask |= smc_accept_poll(sk);
+ } else if (smc->use_fallback) { /* as result of connect_work()*/
+ mask |= smc->clcsock->ops->poll(file, smc->clcsock,
+ wait);
+ sk->sk_err = smc->clcsock->sk->sk_err;
} else {
- if (atomic_read(&smc->conn.sndbuf_space) ||
+ if ((sk->sk_state != SMC_INIT &&
+ atomic_read(&smc->conn.sndbuf_space)) ||
sk->sk_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else {
@@ -1613,8 +2148,10 @@
if (smc->use_fallback) {
rc = kernel_sock_shutdown(smc->clcsock, how);
sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
- if (sk->sk_shutdown == SHUTDOWN_MASK)
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
sk->sk_state = SMC_CLOSED;
+ sock_put(sk);
+ }
goto out;
}
switch (how) {
@@ -1644,7 +2181,7 @@
}
static int smc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
@@ -1658,51 +2195,53 @@
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
- rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ if (unlikely(!smc->clcsock->ops->setsockopt))
+ rc = -EOPNOTSUPP;
+ else
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
if (smc->clcsock->sk->sk_err) {
sk->sk_err = smc->clcsock->sk->sk_err;
sk->sk_error_report(sk);
}
- if (rc)
- return rc;
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
lock_sock(sk);
+ if (rc || smc->use_fallback)
+ goto out;
switch (optname) {
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
- if (sk->sk_state == SMC_INIT) {
- smc->use_fallback = true;
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
+ smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
- if (!smc->use_fallback)
- rc = -EINVAL;
+ rc = -EINVAL;
}
break;
case TCP_NODELAY:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (val && !smc->use_fallback)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ if (val)
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_CORK:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (!val && !smc->use_fallback)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ if (!val)
+ mod_delayed_work(smc->conn.lgr->tx_wq,
+ &smc->conn.tx_work, 0);
}
break;
case TCP_DEFER_ACCEPT:
@@ -1711,6 +2250,7 @@
default:
break;
}
+out:
release_sock(sk);
return rc;
@@ -1723,6 +2263,8 @@
smc = smc_sk(sock->sk);
/* socket options apply to the CLC socket */
+ if (unlikely(!smc->clcsock->ops->getsockopt))
+ return -EOPNOTSUPP;
return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
optval, optlen);
}
@@ -1848,7 +2390,11 @@
smc = smc_sk(sk);
lock_sock(sk);
-
+ if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* socket was connected before, no more data to read */
+ rc = 0;
+ goto out;
+ }
if (sk->sk_state == SMC_INIT ||
sk->sk_state == SMC_LISTEN ||
sk->sk_state == SMC_CLOSED)
@@ -1947,30 +2493,71 @@
.create = smc_create,
};
+unsigned int smc_net_id; /* referenced by smc_net_ops.id below */
+
+static __net_init int smc_net_init(struct net *net)
+{
+ return smc_pnet_net_init(net); /* per-namespace init is delegated to the pnet code */
+}
+
+static void __net_exit smc_net_exit(struct net *net)
+{
+ smc_pnet_net_exit(net);
+}
+
+static struct pernet_operations smc_net_ops = { /* registered in smc_init(), unregistered in smc_exit() */
+ .init = smc_net_init,
+ .exit = smc_net_exit,
+ .id = &smc_net_id,
+ .size = sizeof(struct smc_net), /* presumably the per-namespace data size - confirm against smc_netns.h */
+};
+
static int __init smc_init(void)
{
int rc;
- rc = smc_pnet_init();
+ rc = register_pernet_subsys(&smc_net_ops);
if (rc)
return rc;
+
+ smc_ism_init();
+ smc_clc_init();
+
+ rc = smc_pnet_init();
+ if (rc)
+ goto out_pernet_subsys;
+
+ rc = -ENOMEM;
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_pnet;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
+ rc = smc_core_init();
+ if (rc) {
+ pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+ goto out_alloc_wqs;
+ }
rc = smc_llc_init();
if (rc) {
pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = smc_cdc_init();
if (rc) {
pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto, 1);
if (rc) {
pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto6, 1);
@@ -2002,20 +2589,33 @@
proto_unregister(&smc_proto6);
out_proto:
proto_unregister(&smc_proto);
+out_core:
+ smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
out_pnet:
smc_pnet_exit();
+out_pernet_subsys:
+ unregister_pernet_subsys(&smc_net_ops);
+
return rc;
}
static void __exit smc_exit(void)
{
- smc_core_exit();
static_branch_disable(&tcp_have_smc);
- smc_ib_unregister_client();
sock_unregister(PF_SMC);
+ smc_core_exit(); /* moved: now runs after sock_unregister(PF_SMC) */
+ smc_ib_unregister_client();
+ destroy_workqueue(smc_close_wq); /* both workqueues were allocated in smc_init() */
+ destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
+ unregister_pernet_subsys(&smc_net_ops); /* counterpart of register_pernet_subsys() in smc_init() */
+ rcu_barrier(); /* wait for pending RCU callbacks before the exit function returns */
}
module_init(smc_init);
--
Gitblit v1.6.2