hc
2024-05-10 10ebd8556b7990499c896a550e3d416b444211e6
kernel/net/smc/smc_wr.c
....@@ -44,11 +44,28 @@
4444 struct smc_link *link;
4545 u32 idx;
4646 struct smc_wr_tx_pend_priv priv;
47
+ u8 compl_requested;
4748 };
4849
4950 /******************************** send queue *********************************/
5051
5152 /*------------------------------- completion --------------------------------*/
53
+
54
+/* Returns true if at least one tx work request is pending on the given link.
+ * A slot is considered pending while its bit is set in link->wr_tx_mask;
+ * only the first link->wr_tx_cnt bits of the mask are examined.
+ * find_first_bit() yielding the searched size (wr_tx_cnt) is treated as
+ * "no bit set", i.e. nothing pending.
+ */
55
+static inline bool smc_wr_is_tx_pend(struct smc_link *link)
56
+{
57
+ if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
58
+ link->wr_tx_cnt) {
59
+ return true;
60
+ }
61
+ return false;
62
+}
63
+
64
+/* Wait till all pending tx work requests on the given link are completed.
+ * Sleeps on link->wr_tx_wait via wait_event() until smc_wr_is_tx_pend()
+ * reports that no tx slot is pending any more. No timeout is passed, so
+ * the caller blocks for as long as any tx slot stays pending.
+ */
65
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
66
+{
67
+ wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
68
+}
5269
5370 static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
5471 {
....@@ -66,7 +83,6 @@
6683 struct smc_wr_tx_pend pnd_snd;
6784 struct smc_link *link;
6885 u32 pnd_snd_idx;
69
- int i;
7086
7187 link = wc->qp->qp_context;
7288
....@@ -75,7 +91,7 @@
7591 link->wr_reg_state = FAILED;
7692 else
7793 link->wr_reg_state = CONFIRMED;
78
- wake_up(&link->wr_reg_wait);
94
+ smc_wr_wakeup_reg_wait(link);
7995 return;
8096 }
8197
....@@ -83,6 +99,8 @@
8399 if (pnd_snd_idx == link->wr_tx_cnt)
84100 return;
85101 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
102
+ if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
103
+ complete(&link->wr_tx_compl[pnd_snd_idx]);
86104 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
87105 /* clear the full struct smc_wr_tx_pend including .priv */
88106 memset(&link->wr_tx_pends[pnd_snd_idx], 0,
....@@ -92,16 +110,8 @@
92110 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
93111 return;
94112 if (wc->status) {
95
- for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
96
- /* clear full struct smc_wr_tx_pend including .priv */
97
- memset(&link->wr_tx_pends[i], 0,
98
- sizeof(link->wr_tx_pends[i]));
99
- memset(&link->wr_tx_bufs[i], 0,
100
- sizeof(link->wr_tx_bufs[i]));
101
- clear_bit(i, link->wr_tx_mask);
102
- }
103
- /* terminate connections of this link group abnormally */
104
- smc_lgr_terminate(smc_get_lgr(link));
113
+ /* terminate link */
114
+ smcr_link_down_cond_sched(link);
105115 }
106116 if (pnd_snd.handler)
107117 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
....@@ -146,6 +156,8 @@
146156 static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
147157 {
148158 *idx = link->wr_tx_cnt;
159
+ if (!smc_link_sendable(link))
160
+ return -ENOLINK;
149161 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
150162 if (!test_and_set_bit(*idx, link->wr_tx_mask))
151163 return 0;
....@@ -160,6 +172,7 @@
160172 * @link: Pointer to smc_link used to later send the message.
161173 * @handler: Send completion handler function pointer.
162174 * @wr_buf: Out value returns pointer to message buffer.
175
+ * @wr_rdma_buf: Out value returns pointer to rdma work request.
163176 * @wr_pend_priv: Out value returns pointer serving as handler context.
164177 *
165178 * Return: 0 on success, or -errno on error.
....@@ -167,8 +180,10 @@
167180 int smc_wr_tx_get_free_slot(struct smc_link *link,
168181 smc_wr_tx_handler handler,
169182 struct smc_wr_buf **wr_buf,
183
+ struct smc_rdma_wr **wr_rdma_buf,
170184 struct smc_wr_tx_pend_priv **wr_pend_priv)
171185 {
186
+ struct smc_link_group *lgr = smc_get_lgr(link);
172187 struct smc_wr_tx_pend *wr_pend;
173188 u32 idx = link->wr_tx_cnt;
174189 struct ib_send_wr *wr_ib;
....@@ -177,19 +192,20 @@
177192
178193 *wr_buf = NULL;
179194 *wr_pend_priv = NULL;
180
- if (in_softirq()) {
195
+ if (in_softirq() || lgr->terminating) {
181196 rc = smc_wr_tx_get_free_slot_index(link, &idx);
182197 if (rc)
183198 return rc;
184199 } else {
185
- rc = wait_event_timeout(
200
+ rc = wait_event_interruptible_timeout(
186201 link->wr_tx_wait,
187
- link->state == SMC_LNK_INACTIVE ||
202
+ !smc_link_sendable(link) ||
203
+ lgr->terminating ||
188204 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
189205 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
190206 if (!rc) {
191
- /* timeout - terminate connections */
192
- smc_lgr_terminate(smc_get_lgr(link));
207
+ /* timeout - terminate link */
208
+ smcr_link_down_cond_sched(link);
193209 return -EPIPE;
194210 }
195211 if (idx == link->wr_tx_cnt)
....@@ -204,6 +220,8 @@
204220 wr_ib = &link->wr_tx_ibs[idx];
205221 wr_ib->wr_id = wr_id;
206222 *wr_buf = &link->wr_tx_bufs[idx];
223
+ if (wr_rdma_buf)
224
+ *wr_rdma_buf = &link->wr_tx_rdmas[idx];
207225 *wr_pend_priv = &wr_pend->priv;
208226 return 0;
209227 }
....@@ -218,11 +236,12 @@
218236 u32 idx = pend->idx;
219237
220238 /* clear the full struct smc_wr_tx_pend including .priv */
221
- memset(&link->wr_tx_pends[pend->idx], 0,
222
- sizeof(link->wr_tx_pends[pend->idx]));
223
- memset(&link->wr_tx_bufs[pend->idx], 0,
224
- sizeof(link->wr_tx_bufs[pend->idx]));
239
+ memset(&link->wr_tx_pends[idx], 0,
240
+ sizeof(link->wr_tx_pends[idx]));
241
+ memset(&link->wr_tx_bufs[idx], 0,
242
+ sizeof(link->wr_tx_bufs[idx]));
225243 test_and_clear_bit(idx, link->wr_tx_mask);
244
+ wake_up(&link->wr_tx_wait);
226245 return 1;
227246 }
228247
....@@ -243,8 +262,37 @@
243262 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
244263 if (rc) {
245264 smc_wr_tx_put_slot(link, priv);
246
- smc_lgr_terminate(smc_get_lgr(link));
265
+ smcr_link_down_cond_sched(link);
247266 }
267
+ return rc;
268
+}
269
+
270
+/* Send prepared WR slot via ib_post_send and wait for send completion
271
+ * notification.
272
+ * @link: link the prepared slot belongs to; the wr_tx_compl[] entry of the
+ *        slot is initialized here before the send is posted
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
273
+ * @timeout: passed unchanged to wait_for_completion_interruptible_timeout()
+ *           (presumably in jiffies - confirm against callers)
+ *
+ * compl_requested is set on the pending entry before posting so that the
+ * send completion path signals wr_tx_compl[idx] for this slot. The slot
+ * index is copied into a local first and only that copy is used for the
+ * wait.
+ *
+ * Return: 0 if the completion was signalled in time, the non-zero result
+ * of smc_wr_tx_send() if sending failed, or -ENODATA if the wait returned
+ * a value <= 0 (timed out or interrupted).
+ */
274
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
275
+ unsigned long timeout)
276
+{
277
+ struct smc_wr_tx_pend *pend;
278
+ u32 pnd_idx;
279
+ int rc;
280
+
281
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
282
+ pend->compl_requested = 1;
283
+ pnd_idx = pend->idx;
284
+ init_completion(&link->wr_tx_compl[pnd_idx]);
285
+
286
+ rc = smc_wr_tx_send(link, priv);
287
+ if (rc)
288
+ return rc;
289
+ /* wait for completion by smc_wr_tx_process_cqe() */
290
+ rc = wait_for_completion_interruptible_timeout(
291
+ &link->wr_tx_compl[pnd_idx], timeout);
292
+ if (rc <= 0)
293
+ rc = -ENODATA;
294
+ if (rc > 0)
295
+ rc = 0;
248296 return rc;
249297 }
250298
....@@ -263,12 +311,15 @@
263311 if (rc)
264312 return rc;
265313
314
+ atomic_inc(&link->wr_reg_refcnt);
266315 rc = wait_event_interruptible_timeout(link->wr_reg_wait,
267316 (link->wr_reg_state != POSTED),
268317 SMC_WR_REG_MR_WAIT_TIME);
318
+ if (atomic_dec_and_test(&link->wr_reg_refcnt))
319
+ wake_up_all(&link->wr_reg_wait);
269320 if (!rc) {
270
- /* timeout - terminate connections */
271
- smc_lgr_terminate(smc_get_lgr(link));
321
+ /* timeout - terminate link */
322
+ smcr_link_down_cond_sched(link);
272323 return -EPIPE;
273324 }
274325 if (rc == -ERESTARTSYS)
....@@ -285,25 +336,6 @@
285336 break;
286337 }
287338 return rc;
288
-}
289
-
290
-void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
291
- smc_wr_tx_filter filter,
292
- smc_wr_tx_dismisser dismisser,
293
- unsigned long data)
294
-{
295
- struct smc_wr_tx_pend_priv *tx_pend;
296
- struct smc_wr_rx_hdr *wr_tx;
297
- int i;
298
-
299
- for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
300
- wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
301
- if (wr_tx->type != wr_tx_hdr_type)
302
- continue;
303
- tx_pend = &link->wr_tx_pends[i].priv;
304
- if (filter(tx_pend, data))
305
- dismisser(tx_pend);
306
- }
307339 }
308340
309341 /****************************** receive queue ********************************/
....@@ -366,10 +398,7 @@
366398 case IB_WC_RETRY_EXC_ERR:
367399 case IB_WC_RNR_RETRY_EXC_ERR:
368400 case IB_WC_WR_FLUSH_ERR:
369
- /* terminate connections of this link group
370
- * abnormally
371
- */
372
- smc_lgr_terminate(smc_get_lgr(link));
401
+ smcr_link_down_cond_sched(link);
373402 break;
374403 default:
375404 smc_wr_rx_post(link); /* refill WR RX */
....@@ -465,12 +494,26 @@
465494 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
466495 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
467496 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
497
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
498
+ lnk->roce_pd->local_dma_lkey;
499
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
500
+ lnk->roce_pd->local_dma_lkey;
501
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
502
+ lnk->roce_pd->local_dma_lkey;
503
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
504
+ lnk->roce_pd->local_dma_lkey;
468505 lnk->wr_tx_ibs[i].next = NULL;
469506 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
470507 lnk->wr_tx_ibs[i].num_sge = 1;
471508 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
472509 lnk->wr_tx_ibs[i].send_flags =
473510 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
511
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
512
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
513
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
514
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
515
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
516
+ lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
474517 }
475518 for (i = 0; i < lnk->wr_rx_cnt; i++) {
476519 lnk->wr_rx_sges[i].addr =
....@@ -492,12 +535,16 @@
492535 {
493536 struct ib_device *ibdev;
494537
495
- memset(lnk->wr_tx_mask, 0,
496
- BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
497
-
498538 if (!lnk->smcibdev)
499539 return;
500540 ibdev = lnk->smcibdev->ibdev;
541
+
542
+ smc_wr_wakeup_reg_wait(lnk);
543
+ smc_wr_wakeup_tx_wait(lnk);
544
+
545
+ smc_wr_tx_wait_no_pending_sends(lnk);
546
+ wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
547
+ wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
501548
502549 if (lnk->wr_rx_dma_addr) {
503550 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
....@@ -515,14 +562,20 @@
515562
516563 void smc_wr_free_link_mem(struct smc_link *lnk)
517564 {
565
+ kfree(lnk->wr_tx_compl);
566
+ lnk->wr_tx_compl = NULL;
518567 kfree(lnk->wr_tx_pends);
519568 lnk->wr_tx_pends = NULL;
520569 kfree(lnk->wr_tx_mask);
521570 lnk->wr_tx_mask = NULL;
522571 kfree(lnk->wr_tx_sges);
523572 lnk->wr_tx_sges = NULL;
573
+ kfree(lnk->wr_tx_rdma_sges);
574
+ lnk->wr_tx_rdma_sges = NULL;
524575 kfree(lnk->wr_rx_sges);
525576 lnk->wr_rx_sges = NULL;
577
+ kfree(lnk->wr_tx_rdmas);
578
+ lnk->wr_tx_rdmas = NULL;
526579 kfree(lnk->wr_rx_ibs);
527580 lnk->wr_rx_ibs = NULL;
528581 kfree(lnk->wr_tx_ibs);
....@@ -552,10 +605,20 @@
552605 GFP_KERNEL);
553606 if (!link->wr_rx_ibs)
554607 goto no_mem_wr_tx_ibs;
608
+ link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
609
+ sizeof(link->wr_tx_rdmas[0]),
610
+ GFP_KERNEL);
611
+ if (!link->wr_tx_rdmas)
612
+ goto no_mem_wr_rx_ibs;
613
+ link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
614
+ sizeof(link->wr_tx_rdma_sges[0]),
615
+ GFP_KERNEL);
616
+ if (!link->wr_tx_rdma_sges)
617
+ goto no_mem_wr_tx_rdmas;
555618 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
556619 GFP_KERNEL);
557620 if (!link->wr_tx_sges)
558
- goto no_mem_wr_rx_ibs;
621
+ goto no_mem_wr_tx_rdma_sges;
559622 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
560623 sizeof(link->wr_rx_sges[0]),
561624 GFP_KERNEL);
....@@ -571,14 +634,25 @@
571634 GFP_KERNEL);
572635 if (!link->wr_tx_pends)
573636 goto no_mem_wr_tx_mask;
637
+ link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
638
+ sizeof(link->wr_tx_compl[0]),
639
+ GFP_KERNEL);
640
+ if (!link->wr_tx_compl)
641
+ goto no_mem_wr_tx_pends;
574642 return 0;
575643
644
+no_mem_wr_tx_pends:
645
+ kfree(link->wr_tx_pends);
576646 no_mem_wr_tx_mask:
577647 kfree(link->wr_tx_mask);
578648 no_mem_wr_rx_sges:
579649 kfree(link->wr_rx_sges);
580650 no_mem_wr_tx_sges:
581651 kfree(link->wr_tx_sges);
652
+no_mem_wr_tx_rdma_sges:
653
+ kfree(link->wr_tx_rdma_sges);
654
+no_mem_wr_tx_rdmas:
655
+ kfree(link->wr_tx_rdmas);
582656 no_mem_wr_rx_ibs:
583657 kfree(link->wr_rx_ibs);
584658 no_mem_wr_tx_ibs:
....@@ -631,7 +705,9 @@
631705 memset(lnk->wr_tx_mask, 0,
632706 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
633707 init_waitqueue_head(&lnk->wr_tx_wait);
708
+ atomic_set(&lnk->wr_tx_refcnt, 0);
634709 init_waitqueue_head(&lnk->wr_reg_wait);
710
+ atomic_set(&lnk->wr_reg_refcnt, 0);
635711 return rc;
636712
637713 dma_unmap: