From 10ebd8556b7990499c896a550e3d416b444211e6 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 02:23:07 +0000 Subject: [PATCH] add led --- kernel/net/smc/af_smc.c | 1584 ++++++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 1,092 insertions(+), 492 deletions(-) diff --git a/kernel/net/smc/af_smc.c b/kernel/net/smc/af_smc.c index 4c904ab..8ab8492 100644 --- a/kernel/net/smc/af_smc.c +++ b/kernel/net/smc/af_smc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * @@ -24,11 +25,17 @@ #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> +#include <linux/ctype.h> #include <net/sock.h> #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include "smc_netns.h" #include "smc.h" #include "smc_clc.h" @@ -42,9 +49,15 @@ #include "smc_rx.h" #include "smc_close.h" -static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group - * creation +static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group + * creation on server */ +static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group + * creation on client + */ + +struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); @@ -115,21 +128,74 @@ }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_restore_fallback_changes(struct smc_sock *smc) +{ + if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; + } +} + +static int __smc_release(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + int rc = 0; + + if (!smc->use_fallback) { + rc = smc_close_active(smc); + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } else { + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } + smc_restore_fallback_changes(smc); + } + + sk->sk_prot->unhash(sk); + + if (sk->sk_state == SMC_CLOSED) { + if (smc->clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } + if (!smc->use_fallback) + smc_conn_free(&smc->conn); + } + + return rc; +} + static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; + sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - flush_work(&smc->connect_work); - kfree(smc->connect_info); - smc->connect_info = NULL; + if (smc->connect_nonblock && old_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + + if (cancel_work_sync(&smc->connect_work)) + sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -139,38 +205,18 @@ else lock_sock(sk); - if (!smc->use_fallback) { - rc = smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); - sk->sk_prot->unhash(sk); - - if (smc->clcsock) { - if (smc->use_fallback && sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); - } - mutex_lock(&smc->clcsock_release_lock); - sock_release(smc->clcsock); - smc->clcsock = NULL; - mutex_unlock(&smc->clcsock_release_lock); - } - if (smc->use_fallback) { - if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); - } + rc = __smc_release(smc); /* detach socket */ sock_orphan(sk); sock->sk = NULL; - if (!smc->use_fallback && sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -245,7 +291,7 @@ /* Check if socket is already active */ rc = -EINVAL; - if (sk->sk_state != SMC_INIT) + if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; @@ -289,7 +335,8 @@ (1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_WIFI_STATUS) | \ (1UL << SOCK_NOFCS) | \ - (1UL << SOCK_FILTER_LOCKED)) + (1UL << SOCK_FILTER_LOCKED) | \ + (1UL << SOCK_TSTAMP_NEW)) /* copy only relevant settings and flags of SOL_SOCKET level from smc to * clc socket (since smc is not called for these options from net/core) */ @@ -308,47 +355,61 @@ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, optionally send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link *link, + struct smc_buf_desc *rmb_desc) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_rmb() + */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + if (rc) + goto out; } - if (!conf_rkey) - return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; } - return 0; + rmb_desc->is_conf_rkey = true; +out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -357,60 +418,86 @@ smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; + + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); return 0; } static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); - smc->conn.peer_rmbe_idx = clc->rmbe_idx; - smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); + smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } +static bool smc_isascii(char *hostname) +{ + int i; + + for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) + if (!isascii(hostname[i])) + return false; + return true; +} + static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); - smc->conn.peer_rmbe_idx = clc->dmbe_idx; - smc->conn.peer_token = clc->token; + smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; + smc->conn.peer_token = clc->d0.token; /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; + if (clc->hdr.version > SMC_V1 && + (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)clc; + struct smc_clc_first_contact_ext *fce = + (struct smc_clc_first_contact_ext *) + (((u8 *)clc_v2) + sizeof(*clc_v2)); + + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid, + SMC_MAX_EID_LEN); + smc->conn.lgr->peer_os = fce->os_type; + smc->conn.lgr->peer_smc_release = fce->release; + if (smc_isascii(fce->hostname)) + memcpy(smc->conn.lgr->peer_hostname, fce->hostname, + SMC_MAX_HOSTNAME_LEN); + } } static void smc_conn_save_peer_info(struct smc_sock *smc, @@ -425,26 +512,53 @@ static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc) { - link->peer_qpn = ntoh24(clc->qpn); - memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE); - memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac)); - link->peer_psn = ntoh24(clc->psn); - link->peer_mtu = clc->qp_mtu; + link->peer_qpn = ntoh24(clc->r0.qpn); + memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->r0.psn); + link->peer_mtu = clc->r0.qp_mtu; +} + +static void smc_switch_to_fallback(struct smc_sock *smc) +{ + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + unsigned long flags; + + smc->use_fallback = true; + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; + smc->clcsock->wq.fasync_list = + smc->sk.sk_socket->wq.fasync_list; + + /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. + */ + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); + } } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; } /* decline and fall back during connect */ -static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, + u8 version) { int rc; @@ -454,7 +568,7 @@ return reason_code; } if (reason_code != SMC_CLC_DECL_PEERDECL) { - rc = smc_clc_send_decline(smc, reason_code); + rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ @@ -465,190 +579,367 @@ } /* abort connecting */ -static int smc_connect_abort(struct smc_sock *smc, int reason_code, - int local_contact) +static void smc_connect_abort(struct smc_sock *smc, int local_first) { - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); - return reason_code; + if (local_first) + smc_lgr_cleanup_early(&smc->conn); + else + smc_conn_free(&smc->conn); } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ -static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { - int reason_code = 0; - /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, - gid); - if (!(*ibdev)) - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - - return reason_code; + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; } /* check if there is an ISM device available for this connection. */ /* called for connect and listen */ -static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ - smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); - if (!(*ismdev)) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev[0]) + return SMC_CLC_DECL_NOSMCDDEV; + else + ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); return 0; +} + +/* is chid unique for the ism devices that are already determined? */ +static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, + int cnt) +{ + int i = (!ini->ism_dev[0]) ? 1 : 0; + + for (; i < cnt; i++) + if (ini->ism_chid[i] == chid) + return false; + return true; +} + +/* determine possible V2 ISM devices (either without PNETID or with PNETID plus + * PNETID matching net_device) + */ +static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = SMC_CLC_DECL_NOSMCDDEV; + struct smcd_dev *smcd; + int i = 1; + u16 chid; + + if (smcd_indicated(ini->smc_type_v1)) + rc = 0; /* already initialized for V1 */ + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away || smcd == ini->ism_dev[0]) + continue; + chid = smc_ism_get_chid(smcd); + if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) + continue; + if (!smc_pnet_is_pnetid_set(smcd->pnetid) || + smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { + ini->ism_dev[i] = smcd; + ini->ism_chid[i] = chid; + ini->is_smcd = true; + rc = 0; + i++; + if (i > SMC_MAX_ISM_DEVS) + break; + } + } + mutex_unlock(&smcd_dev_list.mutex); + ini->ism_offered_cnt = i - 1; + if (!ini->ism_dev[0] && !ini->ism_dev[1]) + ini->smcd_version = 0; + + return rc; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { - if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) - return SMC_CLC_DECL_CNFERR; + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; return 0; +} + +static int smc_find_proposal_devices(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = 0; + + /* check if there is an ism device available */ + if (ini->smcd_version & SMC_V1) { + if (smc_find_ism_device(smc, ini) || + smc_connect_ism_vlan_setup(smc, ini)) { + if (ini->smc_type_v1 == SMC_TYPE_B) + ini->smc_type_v1 = SMC_TYPE_R; + else + ini->smc_type_v1 = SMC_TYPE_N; + } /* else ISM V1 is supported for this connection */ + if (smc_find_rdma_device(smc, ini)) { + if (ini->smc_type_v1 == SMC_TYPE_B) + ini->smc_type_v1 = SMC_TYPE_D; + else + ini->smc_type_v1 = SMC_TYPE_N; + } /* else RDMA is supported for this connection */ + } + if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini)) + ini->smc_type_v2 = SMC_TYPE_N; + + /* if neither ISM nor RDMA are supported, fallback */ + if (!smcr_indicated(ini->smc_type_v1) && + ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) + rc = SMC_CLC_DECL_NOSMCDEV; + + return rc; } /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, - struct smcd_dev *ismdev, - unsigned short vlan_id) +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, + struct smc_init_info *ini) { - if (!is_smcd) + if (!smcd_indicated(ini->smc_type_v1)) return 0; - if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } +#define SMC_CLC_MAX_ACCEPT_LEN \ + (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ + sizeof(struct smc_clc_first_contact_ext) + \ + sizeof(struct smc_clc_msg_trail)) + /* CLC handshake during connect */ -static int smc_connect_clc(struct smc_sock *smc, int smc_type, - struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport, - u8 gid[], struct smcd_dev *ismdev) +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm_v2 *aclc2, + struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); + rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); + return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); } /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; - mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, &aclc->lcl, NULL, 0); - if (local_contact < 0) { - if (local_contact == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (local_contact == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ - else - reason_code = SMC_CLC_DECL_INTERR; /* other error */ - return smc_connect_abort(smc, reason_code, 0); + ini->is_smcd = false; + ini->ib_lcl = &aclc->r0.lcl; + ini->ib_clcqpn = ntoh24(aclc->r0.qpn); + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + + mutex_lock(&smc_client_lgr_pending); + reason_code = smc_conn_create(smc, ini); + if (reason_code) { + mutex_unlock(&smc_client_lgr_pending); + return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; smc_conn_save_peer_info(smc, aclc); - /* create send buffer and rmb */ - if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + if (ini->first_contact_local) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; - if (local_contact == SMC_FIRST_CONTACT) + if (l->peer_qpn == ntoh24(aclc->r0.qpn) && + !memcmp(l->peer_gid, &aclc->r0.lcl.gid, + SMC_GID_SIZE) && + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac))) { + link = l; + break; + } + } + if (!link) { + reason_code = SMC_CLC_DECL_NOSRVLINK; + goto connect_abort; + } + smc->conn.lnk = link; + } + + /* create send buffer and rmb */ + if (smc_buf_create(smc, false)) { + reason_code = SMC_CLC_DECL_MEM; + goto connect_abort; + } + + if (ini->first_contact_local) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - local_contact); + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { + reason_code = SMC_CLC_DECL_ERR_RTOK; + goto connect_abort; + } smc_close_init(smc); smc_rx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { - if (smc_ib_ready_link(link)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - local_contact); + if (ini->first_contact_local) { + if (smc_ib_ready_link(link)) { + reason_code = SMC_CLC_DECL_ERR_RDYLNK; + goto connect_abort; + } } else { - if (!smc->conn.rmb_desc->reused && - smc_reg_rmb(link, smc->conn.rmb_desc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - local_contact); + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGRMB; + goto connect_abort; + } } smc_rmb_sync_sg_for_device(&smc->conn); - reason_code = smc_clc_send_confirm(smc); + reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, + SMC_V1); if (reason_code) - return smc_connect_abort(smc, reason_code, local_contact); + goto connect_abort; smc_tx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) - return smc_connect_abort(smc, reason_code, - local_contact); + goto connect_abort; } - mutex_unlock(&smc_create_lgr_pending); + mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; +connect_abort: + smc_connect_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_client_lgr_pending); + smc->connect_nonblock = 0; + + return reason_code; +} + +/* The server has chosen one of the proposed ISM devices for the communication. + * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. + */ +static int +smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, + struct smc_init_info *ini) +{ + int i; + + for (i = 0; i < ini->ism_offered_cnt + 1; i++) { + if (ini->ism_chid[i] == ntohs(aclc->chid)) { + ini->ism_selected = i; + return 0; + } + } + + return -EPROTO; } /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smcd_dev *ismdev) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; int rc = 0; - mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, - NULL, ismdev, aclc->gid); - if (local_contact < 0) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); + ini->is_smcd = true; + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + + if (aclc->hdr.version == SMC_V2) { + struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + rc = smc_v2_determine_accepted_chid(aclc_v2, ini); + if (rc) + return rc; + } + ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid; + + /* there is only one lgr role for SMC-D; use server lock */ + mutex_lock(&smc_server_lgr_pending); + rc = smc_conn_create(smc, ini); + if (rc) { + mutex_unlock(&smc_server_lgr_pending); + return rc; + } /* Create send and receive buffers */ - if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + rc = smc_buf_create(smc, true); + if (rc) { + rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; + goto connect_abort; + } smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); - rc = smc_clc_send_confirm(smc); + rc = smc_clc_send_confirm(smc, ini->first_contact_local, + aclc->hdr.version); if (rc) - return smc_connect_abort(smc, rc, local_contact); - mutex_unlock(&smc_create_lgr_pending); + goto connect_abort; + mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; + + return 0; +connect_abort: + smc_connect_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_server_lgr_pending); + smc->connect_nonblock = 0; + + return rc; +} + +/* check if received accept type and version matches a proposed one */ +static int smc_connect_check_aclc(struct smc_init_info *ini, + struct smc_clc_msg_accept_confirm *aclc) +{ + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v1)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + ((!smcd_indicated(ini->smc_type_v1) && + !smcd_indicated(ini->smc_type_v2)) || + (aclc->hdr.version == SMC_V1 && + !smcd_indicated(ini->smc_type_v1)) || + (aclc->hdr.version == SMC_V2 && + !smcd_indicated(ini->smc_type_v2))))) + return SMC_CLC_DECL_MODEUNSUPP; return 0; } @@ -656,17 +947,12 @@ /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { - bool ism_supported = false, rdma_supported = false; - struct smc_clc_msg_accept_confirm aclc; - struct smc_ib_device *ibdev; - struct smcd_dev *ismdev; - u8 gid[SMC_GID_SIZE]; - unsigned short vlan; - int smc_type; + u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; + struct smc_clc_msg_accept_confirm_v2 *aclc2; + struct smc_clc_msg_accept_confirm *aclc; + struct smc_init_info *ini = NULL; + u8 *buf = NULL; int rc = 0; - u8 ibport; - - sock_hold(&smc->sk); /* sock put in passive closing */ if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); @@ -675,74 +961,107 @@ if (!tcp_sk(smc->clcsock->sk)->syn_smc) return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); - /* IPSec connections opt out of SMC-R optimizations */ + /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(smc)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, + version); - /* check for VLAN ID */ - if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, + version); - /* check if there is an ism device available */ - if (!smc_check_ism(smc, &ismdev) && - !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { - /* ISM is supported for this connection */ - ism_supported = true; - smc_type = SMC_TYPE_D; + ini->smcd_version = SMC_V1; + ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0; + ini->smc_type_v1 = SMC_TYPE_B; + ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { + ini->smcd_version &= ~SMC_V1; + ini->smc_type_v1 = SMC_TYPE_N; + if (!ini->smcd_version) { + rc = SMC_CLC_DECL_GETVLANERR; + goto fallback; + } } - /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { - /* RDMA is supported for this connection */ - rdma_supported = true; - if (ism_supported) - smc_type = SMC_TYPE_B; /* both */ - else - smc_type = SMC_TYPE_R; /* only RDMA */ - } + rc = smc_find_proposal_devices(smc, ini); + if (rc) + goto fallback; - /* if neither ISM nor RDMA are supported, fallback */ - if (!rdma_supported && !ism_supported) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); + buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto fallback; + } + aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf; + aclc = (struct smc_clc_msg_accept_confirm *)aclc2; /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); - if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); - return smc_connect_decline_fallback(smc, rc); - } + rc = smc_connect_clc(smc, aclc2, ini); + if (rc) + goto vlan_cleanup; + + /* check if smc modes and versions of CLC proposal and accept match */ + rc = smc_connect_check_aclc(ini, aclc); + version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; + ini->smcd_version = version; + if (rc) + goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ - if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); - else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, ismdev); - else - rc = SMC_CLC_DECL_MODEUNSUPP; - if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); - return smc_connect_decline_fallback(smc, rc); - } + if (aclc->hdr.typev1 == SMC_TYPE_R) + rc = smc_connect_rdma(smc, aclc, ini); + else if (aclc->hdr.typev1 == SMC_TYPE_D) + rc = smc_connect_ism(smc, aclc, ini); + if (rc) + goto vlan_cleanup; - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); + kfree(ini); return 0; + +vlan_cleanup: + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); +fallback: + kfree(ini); + return smc_connect_decline_fallback(smc, rc, version); } static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - int rc; + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; - lock_sock(&smc->sk); - rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, - smc->connect_info->alen, smc->connect_info->flags); + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; - goto out; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; } - if (rc < 0) { - smc->sk.sk_err = -rc; + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ goto out; } @@ -751,12 +1070,14 @@ smc->sk.sk_err = -rc; out: - if (smc->sk.sk_err) - smc->sk.sk_state_change(&smc->sk); - else - smc->sk.sk_write_space(&smc->sk); - kfree(smc->connect_info); - smc->connect_info = NULL; + if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (smc->sk.sk_err) { + smc->sk.sk_state_change(&smc->sk); + } else { /* allow polling before and after fallback decision */ + smc->clcsock->sk->sk_write_space(smc->clcsock->sk); + smc->sk.sk_write_space(&smc->sk); + } + } release_sock(&smc->sk); } @@ -789,26 +1110,22 @@ smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; + + if (smc->use_fallback) + goto out; + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { - if (smc->connect_info) { - rc = -EALREADY; - goto out; - } - smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); - if (!smc->connect_info) { - rc = -ENOMEM; - goto out; - } - smc->connect_info->alen = alen; - smc->connect_info->flags = flags ^ O_NONBLOCK; - memcpy(&smc->connect_info->addr, addr, alen); - schedule_work(&smc->connect_work); + if (queue_work(smc_hs_wq, &smc->connect_work)) + smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; - rc = __smc_connect(smc); if (rc < 0) goto out; @@ -842,10 +1159,10 @@ mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); - if (rc < 0) + if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); @@ -858,6 +1175,10 @@ goto out; } + /* new clcsock has inherited the smc listen-specific sk_data_ready + * function; switch it back to the original sk_data_ready function + */ + new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -911,8 +1232,13 @@ sock_put(new_sk); /* final */ continue; } - if (new_sock) + if (new_sock) { sock_graft(new_sk, new_sock); + if (isk->use_fallback) { + smc_sk(new_sk)->clcsock->file = new_sock->file; + isk->clcsock->file->private_data = isk->clcsock; + } + } return new_sk; } return NULL; @@ -923,45 +1249,24 @@ { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) { - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } - sk->sk_prot->unhash(sk); - if (smc->clcsock) { - struct socket *tcp; - - tcp = smc->clcsock; - smc->clcsock = NULL; - sock_release(tcp); - } - if (smc->use_fallback) { - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - } else { - if (sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); - } + __smc_release(smc); release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -970,40 +1275,29 @@ return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE); - return rc; - } - - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); - + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link); return 0; } @@ -1013,13 +1307,13 @@ struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); + release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } - release_sock(&lsmc->sk); /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); @@ -1031,7 +1325,6 @@ { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; @@ -1046,27 +1339,27 @@ if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); smc_listen_out(new_smc); } /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, - int local_contact) + int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + if (local_first) + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { - if (smc_clc_send_decline(new_smc, reason_code) < 0) { + if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); return; } @@ -1074,34 +1367,73 @@ smc_listen_out_connected(new_smc); } +/* listen worker: version checking */ +static int smc_listen_v2_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; + struct smc_clc_v2_extension *pclc_v2_ext; + + ini->smc_type_v1 = pclc->hdr.typev1; + ini->smc_type_v2 = pclc->hdr.typev2; + ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0; + if (pclc->hdr.version > SMC_V1) + ini->smcd_version |= + ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0; + if (!smc_ism_v2_capable) { + ini->smcd_version &= ~SMC_V2; + goto out; + } + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) { + ini->smcd_version &= ~SMC_V2; + goto out; + } + pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); + if (!pclc_smcd_v2_ext) + ini->smcd_version &= ~SMC_V2; + +out: + if (!ini->smcd_version) { + if (pclc->hdr.typev1 == SMC_TYPE_B || + pclc->hdr.typev2 == SMC_TYPE_B) + return SMC_CLC_DECL_NOSMCDEV; + if (pclc->hdr.typev1 == SMC_TYPE_D || + pclc->hdr.typev2 == SMC_TYPE_D) + return SMC_CLC_DECL_NOSMCDDEV; + return SMC_CLC_DECL_NOSMCRDEV; + } + + return 0; +} + /* listen worker: check prefixes */ -static int smc_listen_rdma_check(struct smc_sock *new_smc, +static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; + if (pclc->hdr.typev1 == SMC_TYPE_N) + return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_ib_device *ibdev, u8 ibport, - int *local_contact) + struct smc_init_info *ini) { + int rc; + /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, - &pclc->lcl, NULL, 0); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) @@ -1112,109 +1444,266 @@ /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smcd_dev *ismdev, - int *local_contact) + struct smc_init_info *ini) { - struct smc_clc_msg_smcd *pclc_smcd; + int rc; - pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, - ismdev, pclc_smcd->gid); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } - - /* Check if peer can be reached via ISM device */ - if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, - new_smc->conn.lgr->vlan_id, - new_smc->conn.lgr->smcd)) { - if (*local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_CNFERR; - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* Create send and receive buffers */ - if (smc_buf_create(new_smc, true)) { - if (*local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_MEM; + rc = smc_buf_create(new_smc, true); + if (rc) { + if (ini->first_contact_local) + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); + return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM; } return 0; } -/* listen worker: register buffers */ -static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +static bool smc_is_already_selected(struct smcd_dev *smcd, + struct smc_init_info *ini, + int matches) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + int i; - if (local_contact != SMC_FIRST_CONTACT) { - if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_ERR_REGRMB; + for (i = 0; i < matches; i++) + if (smcd == ini->ism_dev[i]) + return true; + + return false; +} + +/* check for ISM devices matching proposed ISM devices */ +static void smc_check_ism_v2_match(struct smc_init_info *ini, + u16 proposed_chid, u64 proposed_gid, + unsigned int *matches) +{ + struct smcd_dev *smcd; + + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away) + continue; + if (smc_is_already_selected(smcd, ini, *matches)) + continue; + if (smc_ism_get_chid(smcd) == proposed_chid && + !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { + ini->ism_peer_gid[*matches] = proposed_gid; + ini->ism_dev[*matches] = smcd; + (*matches)++; + break; } + } +} + +static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_v2_extension *smc_v2_ext; + struct smc_clc_msg_smcd *pclc_smcd; + unsigned int matches = 0; + u8 smcd_version; + u8 *eid = NULL; + int i; + + if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) + goto not_found; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + smc_v2_ext = smc_get_clc_v2_ext(pclc); + smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!smcd_v2_ext || + !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */ + goto not_found; + + mutex_lock(&smcd_dev_list.mutex); + if (pclc_smcd->ism.chid) + /* check for ISM device matching proposed native ISM device */ + smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), + ntohll(pclc_smcd->ism.gid), &matches); + for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { + /* check for ISM devices matching proposed non-native ISM + * devices + */ + smc_check_ism_v2_match(ini, + ntohs(smcd_v2_ext->gidchid[i - 1].chid), + ntohll(smcd_v2_ext->gidchid[i - 1].gid), + &matches); + } + mutex_unlock(&smcd_dev_list.mutex); + + if (ini->ism_dev[0]) { + smc_ism_get_system_eid(ini->ism_dev[0], &eid); + if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN)) + goto not_found; + } else { + goto not_found; + } + + /* separate - outside the smcd_dev_list.lock */ + smcd_version = ini->smcd_version; + for (i = 0; i < matches; i++) { + ini->smcd_version = SMC_V2; + ini->is_smcd = true; + ini->ism_selected = i; + if (smc_listen_ism_init(new_smc, ini)) + /* try next active ISM device */ + continue; + return; /* matching and usable V2 ISM device found */ + } + /* no V2 ISM device could be initialized */ + ini->smcd_version = smcd_version; /* restore original value */ + +not_found: + ini->smcd_version &= ~SMC_V2; + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + +static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + + /* check if ISM V1 is available */ + if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + goto not_found; + ini->is_smcd = true; /* prepare ISM check */ + ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); + if (smc_find_ism_device(new_smc, ini)) + goto not_found; + ini->ism_selected = 0; + if (!smc_listen_ism_init(new_smc, ini)) + return; /* V1 ISM device found */ + +not_found: + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + +/* listen worker: register buffers */ +static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) +{ + struct smc_connection *conn = &new_smc->conn; + + if (!local_first) { + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) + return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); return 0; } +static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int rc; + + if (!smcr_indicated(ini->smc_type_v1)) + return SMC_CLC_DECL_NOSMCDEV; + + /* prepare RDMA check */ + ini->ib_lcl = &pclc->lcl; + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + /* no RDMA device found */ + if (ini->smc_type_v1 == SMC_TYPE_B) + /* neither ISM nor RDMA device found */ + rc = SMC_CLC_DECL_NOSMCDEV; + return rc; + } + rc = smc_listen_rdma_init(new_smc, ini); + if (rc) + return rc; + return smc_listen_rdma_reg(new_smc, ini->first_contact_local); +} + +/* determine the local device matching to proposal */ +static int smc_listen_find_device(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int rc; + + /* check for ISM device matching V2 proposed device */ + smc_find_ism_v2_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + if (!(ini->smcd_version & SMC_V1)) + return SMC_CLC_DECL_NOSMCDEV; + + /* check for matching IP prefix and subnet length */ + rc = smc_listen_prfx_check(new_smc, pclc); + if (rc) + return rc; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) + return SMC_CLC_DECL_GETVLANERR; + + /* check for ISM device matching V1 proposed device */ + smc_find_ism_v1_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + if (pclc->hdr.typev1 == SMC_TYPE_D) + return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */ + + /* check if RDMA is available */ + return smc_find_rdma_v1_device_serv(new_smc, pclc, ini); +} + /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, - int local_contact) + bool local_first) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; - if (local_contact == SMC_FIRST_CONTACT) + if (local_first) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { - reason_code = SMC_CLC_DECL_ERR_RTOK; - goto decline; - } + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) + return SMC_CLC_DECL_ERR_RTOK; - if (local_contact == SMC_FIRST_CONTACT) { - if (smc_ib_ready_link(link)) { - reason_code = SMC_CLC_DECL_ERR_RDYLNK; - goto decline; - } + if (local_first) { + if (smc_ib_ready_link(link)) + return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code) - goto decline; + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); } - return 0; - -decline: - mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } -/* setup for RDMA connection of server */ +/* setup for connection of server */ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); + u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1; struct socket *newclcsock = new_smc->clcsock; - struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_accept_confirm *cclc; + struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *ibdev; - bool ism_supported = false; - struct smcd_dev *ismdev; - u8 buf[SMC_CLC_MAX_LEN]; - int local_contact = 0; - unsigned short vlan; - int reason_code = 0; + struct smc_init_info *ini = NULL; int rc = 0; - u8 ibport; + + if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) + return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); @@ -1223,7 +1712,7 @@ /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; @@ -1232,73 +1721,86 @@ /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ - pclc = (struct smc_clc_msg_proposal *)&buf; - reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL); - if (reason_code) { - smc_listen_decline(new_smc, reason_code, 0); - return; + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; } + pclc = (struct smc_clc_msg_proposal *)buf; + rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; + version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version; - /* IPSec connections opt out of SMC-R optimizations */ + /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { - smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); - return; + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; } - mutex_lock(&smc_create_lgr_pending); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + + /* initial version checking */ + rc = smc_listen_v2_check(new_smc, pclc, ini); + if (rc) + goto out_decl; + + mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); - /* check if ISM is available */ - if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && - !smc_check_ism(new_smc, &ismdev) && - !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { - ism_supported = true; - } - - /* check if RDMA is available */ - if (!ism_supported && - ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || - smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact))) { - /* SMC not supported, decline */ - mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, - local_contact); - return; - } + /* determine ISM or RoCE device used for connection */ + rc = smc_listen_find_device(new_smc, pclc, ini); + if (rc) + goto out_unlock; /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) { - mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, rc, local_contact); - return; - } + rc = smc_clc_send_accept(new_smc, ini->first_contact_local, + ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1); + if (rc) + goto out_unlock; + + /* SMC-D does not need this lock any more */ + if (ini->is_smcd) + mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); - if (reason_code) { - mutex_unlock(&smc_create_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); - return; + memset(buf, 0, sizeof(*buf)); + cclc = (struct smc_clc_msg_accept_confirm *)buf; + rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; } /* finish worker */ - if (!ism_supported) { - if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) - return; + if (!ini->is_smcd) { + rc = smc_listen_rdma_finish(new_smc, cclc, + ini->first_contact_local); + if (rc) + goto out_unlock; + mutex_unlock(&smc_server_lgr_pending); } - smc_conn_save_peer_info(new_smc, &cclc); - mutex_unlock(&smc_create_lgr_pending); + smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); + goto out_free; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, + version); +out_free: + kfree(ini); + kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1312,7 +1814,7 @@ lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); - if (rc) + if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; @@ -1326,13 +1828,29 @@ new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!schedule_work(&new_smc->smc_listen_work)) + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); - sock_put(&lsmc->sk); /* sock_hold in smc_listen */ + sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ +} + +static void smc_clcsock_data_ready(struct sock *listen_clcsock) +{ + struct smc_sock *lsmc; + + lsmc = (struct smc_sock *) + ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY); + if (!lsmc) + return; + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); + } } static int smc_listen(struct socket *sock, int backlog) @@ -1345,7 +1863,8 @@ lock_sock(sk); rc = -EINVAL; - if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN)) + if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || + smc->connect_nonblock) goto out; rc = 0; @@ -1360,16 +1879,21 @@ if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; + /* save original sk_data_ready function and establish + * smc-specific sk_data_ready function + */ + smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; + smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); rc = kernel_listen(smc->clcsock, backlog); - if (rc) + if (rc) { + smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; goto out; + } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); - sock_hold(sk); /* sock_hold in tcp_listen_worker */ - if (!schedule_work(&smc->tcp_listen_work)) - sock_put(sk); out: release_sock(sk); @@ -1464,23 +1988,26 @@ { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = -EPIPE; + int rc; smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && - (sk->sk_state != SMC_APPCLOSEWAIT1) && - (sk->sk_state != SMC_INIT)) - goto out; + /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { - if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + /* not connected yet, fallback */ + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; goto out; } + } else if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) { + rc = -EPIPE; + goto out; } if (smc->use_fallback) @@ -1501,6 +2028,11 @@ smc = smc_sk(sk); lock_sock(sk); + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || (sk->sk_state == SMC_CLOSED)) @@ -1540,8 +2072,8 @@ poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; struct smc_sock *smc; + __poll_t mask = 0; if (!sk) return EPOLLNVAL; @@ -1551,8 +2083,6 @@ /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) - mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); @@ -1563,9 +2093,14 @@ mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ - mask = smc_accept_poll(sk); + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; } else { - if (atomic_read(&smc->conn.sndbuf_space) || + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { @@ -1613,8 +2148,10 @@ if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { @@ -1644,7 +2181,7 @@ } static int smc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1658,51 +2195,53 @@ /* generic setsockopts reaching us here always apply to the * CLC socket */ - rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + if (unlikely(!smc->clcsock->ops->setsockopt)) + rc = -EOPNOTSUPP; + else + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk->sk_error_report(sk); } - if (rc) - return rc; if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); + if (rc || smc->use_fallback) + goto out; switch (optname) { case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ - if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { - if (!smc->use_fallback) - rc = -EINVAL; + rc = -EINVAL; } break; case TCP_NODELAY: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val && !smc->use_fallback) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + if (val) + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val && !smc->use_fallback) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + if (!val) + mod_delayed_work(smc->conn.lgr->tx_wq, + &smc->conn.tx_work, 0); } break; case TCP_DEFER_ACCEPT: @@ -1711,6 +2250,7 @@ default: break; } +out: release_sock(sk); return rc; @@ -1723,6 +2263,8 @@ smc = smc_sk(sock->sk); /* socket options apply to the CLC socket */ + if (unlikely(!smc->clcsock->ops->getsockopt)) + return -EOPNOTSUPP; return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); } @@ -1848,7 +2390,11 @@ smc = smc_sk(sk); lock_sock(sk); - + if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { + /* socket was connected before, no more data to read */ + rc = 0; + goto out; + } if (sk->sk_state == SMC_INIT || sk->sk_state == SMC_LISTEN || sk->sk_state == SMC_CLOSED) @@ -1947,30 +2493,71 @@ .create = smc_create, }; +unsigned int smc_net_id; + +static __net_init int smc_net_init(struct net *net) +{ + return smc_pnet_net_init(net); +} + +static void __net_exit smc_net_exit(struct net *net) +{ + smc_pnet_net_exit(net); +} + +static struct pernet_operations smc_net_ops = { + .init = smc_net_init, + .exit = smc_net_exit, + .id = &smc_net_id, + .size = sizeof(struct smc_net), +}; + static int __init smc_init(void) { int rc; - rc = smc_pnet_init(); + rc = register_pernet_subsys(&smc_net_ops); if (rc) return rc; + + smc_ism_init(); + smc_clc_init(); + + rc = smc_pnet_init(); + if (rc) + goto out_pernet_subsys; + + rc = -ENOMEM; + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + if (!smc_hs_wq) + goto out_pnet; + + smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + if (!smc_close_wq) + goto out_alloc_hs_wq; + + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_alloc_wqs; + } rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto6, 1); @@ -2002,20 +2589,33 @@ proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); +out_core: + smc_core_exit(); +out_alloc_wqs: + destroy_workqueue(smc_close_wq); +out_alloc_hs_wq: + destroy_workqueue(smc_hs_wq); out_pnet: smc_pnet_exit(); +out_pernet_subsys: + unregister_pernet_subsys(&smc_net_ops); + return rc; } static void __exit smc_exit(void) { - smc_core_exit(); static_branch_disable(&tcp_have_smc); - smc_ib_unregister_client(); sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); + destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); + unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); } module_init(smc_init); -- Gitblit v1.6.2