From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/cifs/smbdirect.c | 849 ++++++++++++++++++++++++-------------------------------- 1 files changed, 360 insertions(+), 489 deletions(-) diff --git a/kernel/fs/cifs/smbdirect.c b/kernel/fs/cifs/smbdirect.c index 784628e..bcc6110 100644 --- a/kernel/fs/cifs/smbdirect.c +++ b/kernel/fs/cifs/smbdirect.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2017, Microsoft Corporation. * * Author(s): Long Li <longli@microsoft.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #include <linux/module.h> #include <linux/highmem.h> @@ -164,95 +155,6 @@ #define log_rdma_mr(level, fmt, args...) \ log_rdma(level, LOG_RDMA_MR, fmt, ##args) -/* - * Destroy the transport and related RDMA and memory resources - * Need to go through all the pending counters and make sure on one is using - * the transport while it is destroyed - */ -static void smbd_destroy_rdma_work(struct work_struct *work) -{ - struct smbd_response *response; - struct smbd_connection *info = - container_of(work, struct smbd_connection, destroy_work); - unsigned long flags; - - log_rdma_event(INFO, "destroying qp\n"); - ib_drain_qp(info->id->qp); - rdma_destroy_qp(info->id); - - /* Unblock all I/O waiting on the send queue */ - wake_up_interruptible_all(&info->wait_send_queue); - - log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); - log_rdma_event(INFO, "cancelling send immediate work\n"); - cancel_delayed_work_sync(&info->send_immediate_work); - - log_rdma_event(INFO, "wait for all send to finish\n"); - wait_event(info->wait_smbd_send_pending, - info->smbd_send_pending == 0); - - log_rdma_event(INFO, "wait for all recv to finish\n"); - wake_up_interruptible(&info->wait_reassembly_queue); - wait_event(info->wait_smbd_recv_pending, - info->smbd_recv_pending == 0); - - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); - wait_event(info->wait_send_payload_pending, - atomic_read(&info->send_payload_pending) == 0); - - log_rdma_event(INFO, "freeing mr list\n"); - wake_up_interruptible_all(&info->wait_mr); - wait_event(info->wait_for_mr_cleanup, - atomic_read(&info->mr_used_count) == 0); - destroy_mr_list(info); - - /* It's not posssible for upper layer to get to reassembly */ - log_rdma_event(INFO, "drain the reassembly queue\n"); - do { - spin_lock_irqsave(&info->reassembly_queue_lock, flags); - response = _get_first_reassembly(info); - if (response) { - list_del(&response->list); - spin_unlock_irqrestore( - &info->reassembly_queue_lock, flags); - put_receive_buffer(info, response); - } else - spin_unlock_irqrestore(&info->reassembly_queue_lock, flags); - } while (response); - - info->reassembly_data_length = 0; - - log_rdma_event(INFO, "free receive buffers\n"); - wait_event(info->wait_receive_queues, - info->count_receive_queue 
+ info->count_empty_packet_queue - == info->receive_credit_max); - destroy_receive_buffers(info); - - ib_free_cq(info->send_cq); - ib_free_cq(info->recv_cq); - ib_dealloc_pd(info->pd); - rdma_destroy_id(info->id); - - /* free mempools */ - mempool_destroy(info->request_mempool); - kmem_cache_destroy(info->request_cache); - - mempool_destroy(info->response_mempool); - kmem_cache_destroy(info->response_cache); - - info->transport_status = SMBD_DESTROYED; - wake_up_all(&info->wait_destroy); -} - -static int smbd_process_disconnected(struct smbd_connection *info) -{ - schedule_work(&info->destroy_work); - return 0; -} - static void smbd_disconnect_rdma_work(struct work_struct *work) { struct smbd_connection *info = @@ -319,7 +221,9 @@ } info->transport_status = SMBD_DISCONNECTED; - smbd_process_disconnected(info); + wake_up_interruptible(&info->disconn_wait); + wake_up_interruptible(&info->wait_reassembly_queue); + wake_up_interruptible_all(&info->wait_send_queue); break; default: @@ -380,28 +284,22 @@ request->sge[i].length, DMA_TO_DEVICE); - if (request->has_payload) { - if (atomic_dec_and_test(&request->info->send_payload_pending)) - wake_up(&request->info->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&request->info->send_pending)) - wake_up(&request->info->wait_send_pending); - } + if (atomic_dec_and_test(&request->info->send_pending)) + wake_up(&request->info->wait_send_pending); + + wake_up(&request->info->wait_post_send); mempool_free(request, request->info->request_mempool); } static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) { - log_rdma_event(INFO, "resp message min_version %u max_version %u " - "negotiated_version %u credits_requested %u " - "credits_granted %u status %u max_readwrite_size %u " - "preferred_send_size %u max_receive_size %u " - "max_fragmented_size %u\n", - resp->min_version, resp->max_version, resp->negotiated_version, - resp->credits_requested, resp->credits_granted, resp->status, - resp->max_readwrite_size, resp->preferred_send_size, - resp->max_receive_size, resp->max_fragmented_size); + log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n", + resp->min_version, resp->max_version, + resp->negotiated_version, resp->credits_requested, + resp->credits_granted, resp->status, + resp->max_readwrite_size, resp->preferred_send_size, + resp->max_receive_size, resp->max_fragmented_size); } /* @@ -479,27 +377,6 @@ return true; } -/* - * Check and schedule to send an immediate packet - * This is used to extend credtis to remote peer to keep the transport busy - */ -static void check_and_send_immediate(struct smbd_connection *info) -{ - if (info->transport_status != SMBD_CONNECTED) - return; - - info->send_immediate = true; - - /* - * Promptly send a packet if our peer is running low on receive - * credits - */ - if (atomic_read(&info->receive_credits) < - info->receive_credit_target - 1) - queue_delayed_work( - info->workqueue, &info->send_immediate_work, 0); -} - static void smbd_post_send_credits(struct work_struct *work) { int ret = 0; @@ -549,29 +426,16 @@ info->new_credits_offered += ret; spin_unlock(&info->lock_new_credits_offered); - atomic_add(ret, &info->receive_credits); - - /* Check if we can post new receive and grant credits to peer */ - check_and_send_immediate(info); -} - -static void smbd_recv_done_work(struct work_struct *work) -{ - struct 
smbd_connection *info = - container_of(work, struct smbd_connection, recv_done_work); - - /* - * We may have new send credits granted from remote peer - * If any sender is blcoked on lack of credets, unblock it - */ - if (atomic_read(&info->send_credits)) - wake_up_interruptible(&info->wait_send_queue); - - /* - * Check if we need to send something to remote peer to - * grant more credits or respond to KEEP_ALIVE packet - */ - check_and_send_immediate(info); + /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ + info->send_immediate = true; + if (atomic_read(&info->receive_credits) < + info->receive_credit_target - 1) { + if (info->keep_alive_requested == KEEP_ALIVE_PENDING || + info->send_immediate) { + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(info); + } + } } /* Called from softirq, when recv is done */ @@ -583,10 +447,9 @@ struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d " - "byte_len=%d pkey_index=%x\n", - response, response->type, wc->status, wc->opcode, - wc->byte_len, wc->pkey_index); + log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + response, response->type, wc->status, wc->opcode, + wc->byte_len, wc->pkey_index); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", @@ -642,15 +505,21 @@ atomic_dec(&info->receive_credits); info->receive_credit_target = le16_to_cpu(data_transfer->credits_requested); - atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); + if (le16_to_cpu(data_transfer->credits_granted)) { + atomic_add(le16_to_cpu(data_transfer->credits_granted), + &info->send_credits); + /* + * We have new send credits granted from remote peer + * If any sender is waiting for credits, unblock it + */ + wake_up_interruptible(&info->wait_send_queue); + } - log_incoming(INFO, "data flags %d data_offset %d " - "data_length %d remaining_data_length %d\n", - le16_to_cpu(data_transfer->flags), - le32_to_cpu(data_transfer->data_offset), - le32_to_cpu(data_transfer->data_length), - le32_to_cpu(data_transfer->remaining_data_length)); + log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", + le16_to_cpu(data_transfer->flags), + le32_to_cpu(data_transfer->data_offset), + le32_to_cpu(data_transfer->data_length), + le32_to_cpu(data_transfer->remaining_data_length)); /* Send a KEEP_ALIVE response right away if requested */ info->keep_alive_requested = KEEP_ALIVE_NONE; @@ -659,7 +528,6 @@ info->keep_alive_requested = KEEP_ALIVE_PENDING; } - queue_work(info->workqueue, &info->recv_done_work); return; default: @@ -759,14 +627,10 @@ } if (!frwr_is_supported(&info->id->device->attrs)) { - log_rdma_event(ERR, - "Fast Registration Work Requests " - "(FRWR) is not supported\n"); - log_rdma_event(ERR, - "Device capability flags = %llx " - "max_fast_reg_page_list_len = %u\n", - info->id->device->attrs.device_cap_flags, - info->id->device->attrs.max_fast_reg_page_list_len); + log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n"); + log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", + info->id->device->attrs.device_cap_flags, + info->id->device->attrs.max_fast_reg_page_list_len); rc = -EPROTONOSUPPORT; goto out2; } @@ -852,7 +716,6 @@ request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); - request->has_payload = false; 
atomic_inc(&info->send_pending); rc = ib_post_send(info->id->qp, &send_wr, NULL); if (!rc) @@ -909,120 +772,9 @@ return 0; } -/* - * Build and prepare the SMBD packet header - * This function waits for avaialbe send credits and build a SMBD packet - * header. The caller then optional append payload to the packet after - * the header - * intput values - * size: the size of the payload - * remaining_data_length: remaining data to send if this is part of a - * fragmented packet - * output values - * request_out: the request allocated from this function - * return values: 0 on success, otherwise actual error code returned - */ -static int smbd_create_header(struct smbd_connection *info, - int size, int remaining_data_length, - struct smbd_request **request_out) -{ - struct smbd_request *request; - struct smbd_data_transfer *packet; - int header_length; - int rc; - - /* Wait for send credits. A SMBD packet needs one credit */ - rc = wait_event_interruptible(info->wait_send_queue, - atomic_read(&info->send_credits) > 0 || - info->transport_status != SMBD_CONNECTED); - if (rc) - return rc; - - if (info->transport_status != SMBD_CONNECTED) { - log_outgoing(ERR, "disconnected not sending\n"); - return -ENOENT; - } - atomic_dec(&info->send_credits); - - request = mempool_alloc(info->request_mempool, GFP_KERNEL); - if (!request) { - rc = -ENOMEM; - goto err; - } - - request->info = info; - - /* Fill in the packet header */ - packet = smbd_request_payload(request); - packet->credits_requested = cpu_to_le16(info->send_credit_target); - packet->credits_granted = - cpu_to_le16(manage_credits_prior_sending(info)); - info->send_immediate = false; - - packet->flags = 0; - if (manage_keep_alive_before_sending(info)) - packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); - - packet->reserved = 0; - if (!size) - packet->data_offset = 0; - else - packet->data_offset = cpu_to_le32(24); - packet->data_length = cpu_to_le32(size); - packet->remaining_data_length = cpu_to_le32(remaining_data_length); - packet->padding = 0; - - log_outgoing(INFO, "credits_requested=%d credits_granted=%d " - "data_offset=%d data_length=%d remaining_data_length=%d\n", - le16_to_cpu(packet->credits_requested), - le16_to_cpu(packet->credits_granted), - le32_to_cpu(packet->data_offset), - le32_to_cpu(packet->data_length), - le32_to_cpu(packet->remaining_data_length)); - - /* Map the packet to DMA */ - header_length = sizeof(struct smbd_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!size) - header_length = offsetof(struct smbd_data_transfer, padding); - - request->num_sge = 1; - request->sge[0].addr = ib_dma_map_single(info->id->device, - (void *)packet, - header_length, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { - mempool_free(request, info->request_mempool); - rc = -EIO; - goto err; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = info->pd->local_dma_lkey; - - *request_out = request; - return 0; - -err: - atomic_inc(&info->send_credits); - return rc; -} - -static void smbd_destroy_header(struct smbd_connection *info, - struct smbd_request *request) -{ - - ib_dma_unmap_single(info->id->device, - request->sge[0].addr, - request->sge[0].length, - DMA_TO_DEVICE); - mempool_free(request, info->request_mempool); - atomic_inc(&info->send_credits); -} - /* Post the send request */ static int smbd_post_send(struct smbd_connection *info, - struct smbd_request *request, bool has_payload) + struct smbd_request *request) { struct 
ib_send_wr send_wr; int rc, i; @@ -1047,25 +799,11 @@ send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - if (has_payload) { - request->has_payload = true; - atomic_inc(&info->send_payload_pending); - } else { - request->has_payload = false; - atomic_inc(&info->send_pending); - } - rc = ib_post_send(info->id->qp, &send_wr, NULL); if (rc) { log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - if (has_payload) { - if (atomic_dec_and_test(&info->send_payload_pending)) - wake_up(&info->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); - } smbd_disconnect_rdma_connection(info); + rc = -EAGAIN; } else /* Reset timer for idle connection after packet is sent */ mod_delayed_work(info->workqueue, &info->idle_timer_work, @@ -1079,42 +817,150 @@ { int num_sgs; int i, rc; + int header_length; struct smbd_request *request; + struct smbd_data_transfer *packet; + int new_credits; struct scatterlist *sg; - rc = smbd_create_header( - info, data_length, remaining_data_length, &request); +wait_credit: + /* Wait for send credits. A SMBD packet needs one credit */ + rc = wait_event_interruptible(info->wait_send_queue, + atomic_read(&info->send_credits) > 0 || + info->transport_status != SMBD_CONNECTED); if (rc) - return rc; + goto err_wait_credit; + if (info->transport_status != SMBD_CONNECTED) { + log_outgoing(ERR, "disconnected not sending on wait_credit\n"); + rc = -EAGAIN; + goto err_wait_credit; + } + if (unlikely(atomic_dec_return(&info->send_credits) < 0)) { + atomic_inc(&info->send_credits); + goto wait_credit; + } + +wait_send_queue: + wait_event(info->wait_post_send, + atomic_read(&info->send_pending) < info->send_credit_target || + info->transport_status != SMBD_CONNECTED); + + if (info->transport_status != SMBD_CONNECTED) { + log_outgoing(ERR, "disconnected not sending on wait_send_queue\n"); + rc = -EAGAIN; + goto err_wait_send_queue; + } + + if (unlikely(atomic_inc_return(&info->send_pending) > + info->send_credit_target)) { + atomic_dec(&info->send_pending); + goto wait_send_queue; + } + + request = mempool_alloc(info->request_mempool, GFP_KERNEL); + if (!request) { + rc = -ENOMEM; + goto err_alloc; + } + + request->info = info; + + /* Fill in the packet header */ + packet = smbd_request_payload(request); + packet->credits_requested = cpu_to_le16(info->send_credit_target); + + new_credits = manage_credits_prior_sending(info); + atomic_add(new_credits, &info->receive_credits); + packet->credits_granted = cpu_to_le16(new_credits); + + info->send_immediate = false; + + packet->flags = 0; + if (manage_keep_alive_before_sending(info)) + packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); + + packet->reserved = 0; + if (!data_length) + packet->data_offset = 0; + else + packet->data_offset = cpu_to_le32(24); + packet->data_length = cpu_to_le32(data_length); + packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->padding = 0; + + log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", + le16_to_cpu(packet->credits_requested), + le16_to_cpu(packet->credits_granted), + le32_to_cpu(packet->data_offset), + le32_to_cpu(packet->data_length), + le32_to_cpu(packet->remaining_data_length)); + + /* Map the packet to DMA */ + header_length = sizeof(struct smbd_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!data_length) + header_length = offsetof(struct smbd_data_transfer, padding); + + 
request->num_sge = 1; + request->sge[0].addr = ib_dma_map_single(info->id->device, + (void *)packet, + header_length, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + rc = -EIO; + request->sge[0].addr = 0; + goto err_dma; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = info->pd->local_dma_lkey; + + /* Fill in the packet data payload */ num_sgs = sgl ? sg_nents(sgl) : 0; for_each_sg(sgl, sg, num_sgs, i) { request->sge[i+1].addr = ib_dma_map_page(info->id->device, sg_page(sg), - sg->offset, sg->length, DMA_BIDIRECTIONAL); + sg->offset, sg->length, DMA_TO_DEVICE); if (ib_dma_mapping_error( info->id->device, request->sge[i+1].addr)) { rc = -EIO; request->sge[i+1].addr = 0; - goto dma_mapping_failure; + goto err_dma; } request->sge[i+1].length = sg->length; request->sge[i+1].lkey = info->pd->local_dma_lkey; request->num_sge++; } - rc = smbd_post_send(info, request, data_length); + rc = smbd_post_send(info, request); if (!rc) return 0; -dma_mapping_failure: - for (i = 1; i < request->num_sge; i++) +err_dma: + for (i = 0; i < request->num_sge; i++) if (request->sge[i].addr) ib_dma_unmap_single(info->id->device, request->sge[i].addr, request->sge[i].length, DMA_TO_DEVICE); - smbd_destroy_header(info, request); + mempool_free(request, info->request_mempool); + + /* roll back receive credits and credits to be offered */ + spin_lock(&info->lock_new_credits_offered); + info->new_credits_offered += new_credits; + spin_unlock(&info->lock_new_credits_offered); + atomic_sub(new_credits, &info->receive_credits); + +err_alloc: + if (atomic_dec_and_test(&info->send_pending)) + wake_up(&info->wait_send_pending); + +err_wait_send_queue: + /* roll back send credits and pending */ + atomic_inc(&info->send_credits); + +err_wait_credit: return rc; } @@ -1222,11 +1068,9 @@ response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, - "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x " - "iov.lkey=%x\n", - rc, response->sge.addr, - response->sge.length, response->sge.lkey); + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + rc, response->sge.addr, + response->sge.length, response->sge.lkey); if (rc) return rc; @@ -1436,25 +1280,6 @@ mempool_free(response, info->response_mempool); } -/* - * Check and send an immediate or keep alive packet - * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1 - * Connection.KeepaliveRequested and Connection.SendImmediate - * The idea is to extend credits to server as soon as it becomes available - */ -static void send_immediate_work(struct work_struct *work) -{ - struct smbd_connection *info = container_of( - work, struct smbd_connection, - send_immediate_work.work); - - if (info->keep_alive_requested == KEEP_ALIVE_PENDING || - info->send_immediate) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); - } -} - /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ static void idle_connection_timer(struct work_struct *work) { @@ -1478,21 +1303,98 @@ info->keep_alive_interval*HZ); } -/* Destroy this SMBD connection, called from upper layer */ -void smbd_destroy(struct smbd_connection *info) +/* + * Destroy the transport and related RDMA and memory resources + * Need to go through all the pending counters and make sure no one is using + * the transport while it is destroyed + */ +void smbd_destroy(struct TCP_Server_Info *server) { + struct smbd_connection *info = server->smbd_conn; + struct
smbd_response *response; + unsigned long flags; + + if (!info) { + log_rdma_event(INFO, "rdma session already destroyed\n"); + return; + } + log_rdma_event(INFO, "destroying rdma session\n"); + if (info->transport_status != SMBD_DISCONNECTED) { + rdma_disconnect(server->smbd_conn->id); + log_rdma_event(INFO, "wait for transport being disconnected\n"); + wait_event_interruptible( + info->disconn_wait, + info->transport_status == SMBD_DISCONNECTED); + } - /* Kick off the disconnection process */ - smbd_disconnect_rdma_connection(info); + log_rdma_event(INFO, "destroying qp\n"); + ib_drain_qp(info->id->qp); + rdma_destroy_qp(info->id); - log_rdma_event(INFO, "wait for transport being destroyed\n"); - wait_event(info->wait_destroy, - info->transport_status == SMBD_DESTROYED); + log_rdma_event(INFO, "cancelling idle timer\n"); + cancel_delayed_work_sync(&info->idle_timer_work); + + log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); + wait_event(info->wait_send_pending, + atomic_read(&info->send_pending) == 0); + + /* It's not possible for upper layer to get to reassembly */ + log_rdma_event(INFO, "drain the reassembly queue\n"); + do { + spin_lock_irqsave(&info->reassembly_queue_lock, flags); + response = _get_first_reassembly(info); + if (response) { + list_del(&response->list); + spin_unlock_irqrestore( + &info->reassembly_queue_lock, flags); + put_receive_buffer(info, response); + } else + spin_unlock_irqrestore( + &info->reassembly_queue_lock, flags); + } while (response); + info->reassembly_data_length = 0; + + log_rdma_event(INFO, "free receive buffers\n"); + wait_event(info->wait_receive_queues, + info->count_receive_queue + info->count_empty_packet_queue + == info->receive_credit_max); + destroy_receive_buffers(info); + + /* + * For performance reasons, memory registration and deregistration + * are not locked by srv_mutex. It is possible some processes are + * blocked on transport srv_mutex while holding memory registration. + * Release the transport srv_mutex to allow them to hit the failure + * path when sending data, and then release memory registrations.
+ */ + log_rdma_event(INFO, "freeing mr list\n"); + wake_up_interruptible_all(&info->wait_mr); + while (atomic_read(&info->mr_used_count)) { + mutex_unlock(&server->srv_mutex); + msleep(1000); + mutex_lock(&server->srv_mutex); + } + destroy_mr_list(info); + + ib_free_cq(info->send_cq); + ib_free_cq(info->recv_cq); + ib_dealloc_pd(info->pd); + rdma_destroy_id(info->id); + + /* free mempools */ + mempool_destroy(info->request_mempool); + kmem_cache_destroy(info->request_cache); + + mempool_destroy(info->response_mempool); + kmem_cache_destroy(info->response_cache); + + info->transport_status = SMBD_DESTROYED; destroy_workqueue(info->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); + server->smbd_conn = NULL; } /* @@ -1514,16 +1416,8 @@ */ if (server->smbd_conn->transport_status == SMBD_CONNECTED) { log_rdma_event(INFO, "disconnecting transport\n"); - smbd_disconnect_rdma_connection(server->smbd_conn); + smbd_destroy(server); } - - /* wait until the transport is destroyed */ - if (!wait_event_timeout(server->smbd_conn->wait_destroy, - server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ)) - return -EAGAIN; - - destroy_workqueue(server->smbd_conn->workqueue); - kfree(server->smbd_conn); create_conn: log_rdma_event(INFO, "creating rdma session\n"); @@ -1552,7 +1446,7 @@ char name[MAX_NAME_LEN]; int rc; - snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( name, @@ -1568,7 +1462,7 @@ if (!info->request_mempool) goto out1; - snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); info->response_cache = kmem_cache_create( name, @@ -1584,7 +1478,7 @@ if (!info->response_mempool) goto out3; - snprintf(name, MAX_NAME_LEN, "smbd_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); info->workqueue = create_workqueue(name); if (!info->workqueue) goto out4; @@ -1635,25 +1529,19 @@ if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering send_credit_target = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_send_credit_target, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + smbd_send_credit_target, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); goto config_failed; } if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering receive_credit_max = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_receive_credit_max, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. 
Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + smbd_receive_credit_max, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); goto config_failed; } @@ -1679,15 +1567,17 @@ info->send_cq = NULL; info->recv_cq = NULL; - info->send_cq = ib_alloc_cq(info->id->device, info, - info->send_credit_target, 0, IB_POLL_SOFTIRQ); + info->send_cq = + ib_alloc_cq_any(info->id->device, info, + info->send_credit_target, IB_POLL_SOFTIRQ); if (IS_ERR(info->send_cq)) { info->send_cq = NULL; goto alloc_cq_failed; } - info->recv_cq = ib_alloc_cq(info->id->device, info, - info->receive_credit_max, 0, IB_POLL_SOFTIRQ); + info->recv_cq = + ib_alloc_cq_any(info->id->device, info, + info->receive_credit_max, IB_POLL_SOFTIRQ); if (IS_ERR(info->recv_cq)) { info->recv_cq = NULL; goto alloc_cq_failed; @@ -1726,7 +1616,7 @@ info->responder_resources); /* Need to send IRD/ORD in private data for iWARP */ - info->id->device->get_port_immutable( + info->id->device->ops.get_port_immutable( info->id->device, info->id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { ird_ord_hdr[0] = info->responder_resources; @@ -1741,12 +1631,13 @@ conn_param.retry_count = SMBD_CM_RETRY; conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; conn_param.flow_control = 0; - init_waitqueue_head(&info->wait_destroy); log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", &addr_in->sin_addr, port); init_waitqueue_head(&info->conn_wait); + init_waitqueue_head(&info->disconn_wait); + init_waitqueue_head(&info->wait_reassembly_queue); rc = rdma_connect(info->id, &conn_param); if (rc) { log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); @@ -1770,28 +1661,16 @@ } init_waitqueue_head(&info->wait_send_queue); - init_waitqueue_head(&info->wait_reassembly_queue); - INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); - INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work); queue_delayed_work(info->workqueue, &info->idle_timer_work, info->keep_alive_interval*HZ); - - init_waitqueue_head(&info->wait_smbd_send_pending); - info->smbd_send_pending = 0; - - init_waitqueue_head(&info->wait_smbd_recv_pending); - info->smbd_recv_pending = 0; init_waitqueue_head(&info->wait_send_pending); atomic_set(&info->send_pending, 0); - init_waitqueue_head(&info->wait_send_payload_pending); - atomic_set(&info->send_payload_pending, 0); + init_waitqueue_head(&info->wait_post_send); INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); - INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work); - INIT_WORK(&info->recv_done_work, smbd_recv_done_work); INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); info->new_credits_offered = 0; spin_lock_init(&info->lock_new_credits_offered); @@ -1812,7 +1691,8 @@ allocate_mr_failed: /* At this point, need to a full transport shutdown */ - smbd_destroy(info); + server->smbd_conn = info; + smbd_destroy(server); return NULL; negotiation_failed: @@ -1884,11 +1764,6 @@ int rc; again: - if (info->transport_status != SMBD_CONNECTED) { - log_read(ERR, "disconnected\n"); - return -ENODEV; - } - /* * No need to hold the reassembly queue lock all the time as we are * the only one reading from the front of the queue. 
The transport @@ -1974,11 +1849,9 @@ to_read -= to_copy; data_read += to_copy; - log_read(INFO, "_get_first_reassembly memcpy %d bytes " - "data_transfer_length-offset=%d after that " - "to_read=%d data_read=%d offset=%d\n", - to_copy, data_length - offset, - to_read, data_read, offset); + log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n", + to_copy, data_length - offset, + to_read, data_read, offset); } spin_lock_irq(&info->reassembly_queue_lock); @@ -1987,10 +1860,9 @@ spin_unlock_irq(&info->reassembly_queue_lock); info->first_entry_offset = offset; - log_read(INFO, "returning to thread data_read=%d " - "reassembly_data_length=%d first_entry_offset=%d\n", - data_read, info->reassembly_data_length, - info->first_entry_offset); + log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", + data_read, info->reassembly_data_length, + info->first_entry_offset); read_rfc1002_done: return data_read; } @@ -2002,7 +1874,12 @@ info->transport_status != SMBD_CONNECTED); /* Don't return any data if interrupted */ if (rc) - return -ENODEV; + return rc; + + if (info->transport_status != SMBD_CONNECTED) { + log_read(ERR, "disconnected\n"); + return -ECONNABORTED; + } goto again; } @@ -2054,16 +1931,22 @@ unsigned int to_read, page_offset; int rc; - info->smbd_recv_pending++; + if (iov_iter_rw(&msg->msg_iter) == WRITE) { + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "Invalid msg iter dir %u\n", + iov_iter_rw(&msg->msg_iter)); + rc = -EINVAL; + goto out; + } - switch (msg->msg_iter.type) { - case READ | ITER_KVEC: + switch (iov_iter_type(&msg->msg_iter)) { + case ITER_KVEC: buf = msg->msg_iter.kvec->iov_base; to_read = msg->msg_iter.kvec->iov_len; rc = smbd_recv_buf(info, buf, to_read); break; - case READ | ITER_BVEC: + case ITER_BVEC: page = msg->msg_iter.bvec->bv_page; page_offset = msg->msg_iter.bvec->bv_offset; to_read = msg->msg_iter.bvec->bv_len; @@ -2072,14 +1955,12 @@ default: /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "CIFS: invalid msg type %d\n", - msg->msg_iter.type); + cifs_dbg(VFS, "Invalid msg type %d\n", + iov_iter_type(&msg->msg_iter)); rc = -EINVAL; } - info->smbd_recv_pending--; - wake_up(&info->wait_smbd_recv_pending); - +out: /* SMBDirect will read it all or nothing */ if (rc > 0) msg->msg_iter.count = 0; @@ -2108,9 +1989,8 @@ struct smb_rqst *rqst; int rqst_idx; - info->smbd_send_pending++; if (info->transport_status != SMBD_CONNECTED) { - rc = -ENODEV; + rc = -EAGAIN; goto done; } @@ -2123,16 +2003,17 @@ for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length + sizeof(struct smbd_data_transfer) > - info->max_fragmented_send_size) { + if (remaining_data_length > info->max_fragmented_send_size) { log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); rc = -EINVAL; goto done; } - rqst_idx = 0; + log_write(INFO, "num_rqst=%d total length=%u\n", + num_rqst, remaining_data_length); + rqst_idx = 0; next_rqst: rqst = &rqst_array[rqst_idx]; iov = rqst->rq_iov; @@ -2143,10 +2024,9 @@ dump_smb(iov[i].iov_base, iov[i].iov_len); - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d " - "rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d 
buflen=%lu\n", + rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); start = i = 0; buflen = 0; @@ -2156,11 +2036,9 @@ if (i > start) { remaining_data_length -= (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d " - "i=%d nvecs=%d " - "remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); + log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", + start, i, i - start, + remaining_data_length); rc = smbd_post_send_data( info, &iov[start], i-start, remaining_data_length); @@ -2169,10 +2047,9 @@ } else { /* iov[start] is too big, break it */ nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d" - " break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); + log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", + start, iov[start].iov_base, + buflen, nvecs); for (j = 0; j < nvecs; j++) { vec.iov_base = (char *)iov[start].iov_base + @@ -2184,11 +2061,9 @@ max_iov_size*(nvecs-1); remaining_data_length -= vec.iov_len; log_write(INFO, - "sending vec j=%d iov_base=%p" - " iov_len=%zu " - "remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); + "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", + j, vec.iov_base, vec.iov_len, + remaining_data_length); rc = smbd_post_send_data( info, &vec, 1, remaining_data_length); @@ -2206,11 +2081,9 @@ if (i == rqst->rq_nvec) { /* send out all remaining vecs */ remaining_data_length -= buflen; - log_write(INFO, - "sending iov[] from start=%d i=%d " - "nvecs=%d remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); + log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", + start, i, i - start, + remaining_data_length); rc = smbd_post_send_data(info, &iov[start], i-start, remaining_data_length); if (rc) @@ -2234,10 +2107,9 @@ if (j == nvecs-1) size = buflen - j*max_iov_size; remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d" - " remaining_data_length=%d\n", - i, j*max_iov_size+offset, size, - remaining_data_length); + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, + remaining_data_length); rc = smbd_post_send_page( info, rqst->rq_pages[i], j*max_iov_size + offset, @@ -2259,11 +2131,8 @@ * that means all the I/Os have been out and we are good to return */ - wait_event(info->wait_send_payload_pending, - atomic_read(&info->send_payload_pending) == 0); - - info->smbd_send_pending--; - wake_up(&info->wait_smbd_send_pending); + wait_event(info->wait_send_pending, + atomic_read(&info->send_pending) == 0); return rc; } @@ -2298,8 +2167,7 @@ int rc; list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_INVALIDATED || - smbdirect_mr->state == MR_ERROR) { + if (smbdirect_mr->state == MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2315,33 +2183,27 @@ info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, - "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", + info->mr_type, + info->max_frmr_depth); smbd_disconnect_rdma_connection(info); continue; } + } else + /* This MR is being used, don't recover it */ + continue; - if 
(smbdirect_mr->state == MR_INVALIDATED) - ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, - smbdirect_mr->dir); + smbdirect_mr->state = MR_READY; - smbdirect_mr->state = MR_READY; - - /* smbdirect_mr->state is updated by this function - * and is read and updated by I/O issuing CPUs trying - * to get a MR, the call to atomic_inc_return - * implicates a memory barrier and guarantees this - * value is updated before waking up any calls to - * get_mr() from the I/O issuing CPUs - */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); - } + /* smbdirect_mr->state is updated by this function + * and is read and updated by I/O issuing CPUs trying + * to get a MR, the call to atomic_inc_return + * implicates a memory barrier and guarantees this + * value is updated before waking up any calls to + * get_mr() from the I/O issuing CPUs + */ + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); } } @@ -2378,6 +2240,7 @@ atomic_set(&info->mr_ready_count, 0); atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); @@ -2386,9 +2249,8 @@ smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", + info->mr_type, info->max_frmr_depth); goto out; } smbdirect_mr->sgl = kcalloc( @@ -2406,13 +2268,13 @@ list_add_tail(&smbdirect_mr->list, &info->mr_list); atomic_inc(&info->mr_ready_count); } - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); return 0; out: kfree(smbdirect_mr); list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); kfree(smbdirect_mr->sgl); kfree(smbdirect_mr); @@ -2631,11 +2493,20 @@ */ smbdirect_mr->state = MR_INVALIDATED; - /* - * Schedule the work to do MR recovery for future I/Os - * MR recovery is slow and we don't want it to block the current I/O - */ - queue_work(info->workqueue, &info->mr_recovery_work); + if (smbdirect_mr->state == MR_INVALIDATED) { + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + smbdirect_mr->state = MR_READY; + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); + } else + /* + * Schedule the work to do MR recovery for future I/Os. + * MR recovery is slow and we don't want it to block the + * current I/O + */ + queue_work(info->workqueue, &info->mr_recovery_work); done: if (atomic_dec_and_test(&info->mr_used_count)) -- Gitblit v1.6.2