hc
2024-05-10 10ebd8556b7990499c896a550e3d416b444211e6
kernel/net/smc/smc_ib.c
....@@ -15,6 +15,8 @@
1515 #include <linux/random.h>
1616 #include <linux/workqueue.h>
1717 #include <linux/scatterlist.h>
18
+#include <linux/wait.h>
19
+#include <linux/mutex.h>
1820 #include <rdma/ib_verbs.h>
1921 #include <rdma/ib_cache.h>
2022
....@@ -32,15 +34,11 @@
3234 #define SMC_QP_RNR_RETRY 7 /* 7: infinite */
3335
3436 struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
35
- .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
37
+ .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
3638 .list = LIST_HEAD_INIT(smc_ib_devices.list),
3739 };
3840
39
-#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
40
-
41
-u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
42
- * identifier
43
- */
41
+u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
4442
4543 static int smc_ib_modify_qp_init(struct smc_link *lnk)
4644 {
....@@ -102,12 +100,12 @@
102100 IB_QP_MAX_QP_RD_ATOMIC);
103101 }
104102
105
-int smc_ib_modify_qp_reset(struct smc_link *lnk)
103
+int smc_ib_modify_qp_error(struct smc_link *lnk)
106104 {
107105 struct ib_qp_attr qp_attr;
108106
109107 memset(&qp_attr, 0, sizeof(qp_attr));
110
- qp_attr.qp_state = IB_QPS_RESET;
108
+ qp_attr.qp_state = IB_QPS_ERR;
111109 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
112110 }
113111
....@@ -146,18 +144,13 @@
146144 static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
147145 {
148146 const struct ib_gid_attr *attr;
149
- int rc = 0;
147
+ int rc;
150148
151149 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
152150 if (IS_ERR(attr))
153151 return -ENODEV;
154152
155
- if (attr->ndev)
156
- memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr,
157
- ETH_ALEN);
158
- else
159
- rc = -ENODEV;
160
-
153
+ rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
161154 rdma_put_gid_attr(attr);
162155 return rc;
163156 }
....@@ -172,6 +165,15 @@
172165 {
173166 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
174167 sizeof(smcibdev->mac[ibport - 1]));
168
+}
169
+
170
+bool smc_ib_is_valid_local_systemid(void)
171
+{
172
+ return !is_zero_ether_addr(&local_systemid[2]);
173
+}
174
+
175
+static void smc_ib_init_local_systemid(void)
176
+{
175177 get_random_bytes(&local_systemid[0], 2);
176178 }
177179
....@@ -185,6 +187,7 @@
185187 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
186188 {
187189 const struct ib_gid_attr *attr;
190
+ const struct net_device *ndev;
188191 int i;
189192
190193 for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
....@@ -192,11 +195,14 @@
192195 if (IS_ERR(attr))
193196 continue;
194197
195
- if (attr->ndev &&
196
- ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
197
- (vlan_id && is_vlan_dev(attr->ndev) &&
198
- vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
198
+ rcu_read_lock();
199
+ ndev = rdma_read_gid_attr_ndev_rcu(attr);
200
+ if (!IS_ERR(ndev) &&
201
+ ((!vlan_id && !is_vlan_dev(ndev)) ||
202
+ (vlan_id && is_vlan_dev(ndev) &&
203
+ vlan_dev_vlan_id(ndev) == vlan_id)) &&
199204 attr->gid_type == IB_GID_TYPE_ROCE) {
205
+ rcu_read_unlock();
200206 if (gid)
201207 memcpy(gid, &attr->gid, SMC_GID_SIZE);
202208 if (sgid_index)
....@@ -204,6 +210,7 @@
204210 rdma_put_gid_attr(attr);
205211 return 0;
206212 }
213
+ rcu_read_unlock();
207214 rdma_put_gid_attr(attr);
208215 }
209216 return -ENODEV;
....@@ -223,8 +230,7 @@
223230 rc = smc_ib_fill_mac(smcibdev, ibport);
224231 if (rc)
225232 goto out;
226
- if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
227
- sizeof(local_systemid)) &&
233
+ if (!smc_ib_is_valid_local_systemid() &&
228234 smc_ib_port_active(smcibdev, ibport))
229235 /* create unique system identifier */
230236 smc_ib_define_local_systemid(smcibdev, ibport);
....@@ -242,8 +248,13 @@
242248 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
243249 smc_ib_remember_port_attr(smcibdev, port_idx + 1);
244250 clear_bit(port_idx, &smcibdev->port_event_mask);
245
- if (!smc_ib_port_active(smcibdev, port_idx + 1))
246
- smc_port_terminate(smcibdev, port_idx + 1);
251
+ if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
252
+ set_bit(port_idx, smcibdev->ports_going_away);
253
+ smcr_port_err(smcibdev, port_idx + 1);
254
+ } else {
255
+ clear_bit(port_idx, smcibdev->ports_going_away);
256
+ smcr_port_add(smcibdev, port_idx + 1);
257
+ }
247258 }
248259 }
249260
....@@ -252,15 +263,43 @@
252263 struct ib_event *ibevent)
253264 {
254265 struct smc_ib_device *smcibdev;
266
+ bool schedule = false;
255267 u8 port_idx;
256268
257269 smcibdev = container_of(handler, struct smc_ib_device, event_handler);
258270
259271 switch (ibevent->event) {
260
- case IB_EVENT_PORT_ERR:
261272 case IB_EVENT_DEVICE_FATAL:
273
+ /* terminate all ports on device */
274
+ for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
275
+ set_bit(port_idx, &smcibdev->port_event_mask);
276
+ if (!test_and_set_bit(port_idx,
277
+ smcibdev->ports_going_away))
278
+ schedule = true;
279
+ }
280
+ if (schedule)
281
+ schedule_work(&smcibdev->port_event_work);
282
+ break;
262283 case IB_EVENT_PORT_ACTIVE:
263284 port_idx = ibevent->element.port_num - 1;
285
+ if (port_idx >= SMC_MAX_PORTS)
286
+ break;
287
+ set_bit(port_idx, &smcibdev->port_event_mask);
288
+ if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
289
+ schedule_work(&smcibdev->port_event_work);
290
+ break;
291
+ case IB_EVENT_PORT_ERR:
292
+ port_idx = ibevent->element.port_num - 1;
293
+ if (port_idx >= SMC_MAX_PORTS)
294
+ break;
295
+ set_bit(port_idx, &smcibdev->port_event_mask);
296
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
297
+ schedule_work(&smcibdev->port_event_work);
298
+ break;
299
+ case IB_EVENT_GID_CHANGE:
300
+ port_idx = ibevent->element.port_num - 1;
301
+ if (port_idx >= SMC_MAX_PORTS)
302
+ break;
264303 set_bit(port_idx, &smcibdev->port_event_mask);
265304 schedule_work(&smcibdev->port_event_work);
266305 break;
....@@ -289,18 +328,19 @@
289328
290329 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
291330 {
292
- struct smc_ib_device *smcibdev =
293
- (struct smc_ib_device *)ibevent->device;
331
+ struct smc_link *lnk = (struct smc_link *)priv;
332
+ struct smc_ib_device *smcibdev = lnk->smcibdev;
294333 u8 port_idx;
295334
296335 switch (ibevent->event) {
297
- case IB_EVENT_DEVICE_FATAL:
298
- case IB_EVENT_GID_CHANGE:
299
- case IB_EVENT_PORT_ERR:
336
+ case IB_EVENT_QP_FATAL:
300337 case IB_EVENT_QP_ACCESS_ERR:
301
- port_idx = ibevent->element.port_num - 1;
338
+ port_idx = ibevent->element.qp->port - 1;
339
+ if (port_idx >= SMC_MAX_PORTS)
340
+ break;
302341 set_bit(port_idx, &smcibdev->port_event_mask);
303
- schedule_work(&smcibdev->port_event_work);
342
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
343
+ schedule_work(&smcibdev->port_event_work);
304344 break;
305345 default:
306346 break;
....@@ -351,15 +391,15 @@
351391 ib_dereg_mr(mr);
352392 }
353393
354
-static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot)
394
+static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
355395 {
356396 unsigned int offset = 0;
357397 int sg_num;
358398
359399 /* map the largest prefix of a dma mapped SG list */
360
- sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK],
361
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
362
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
400
+ sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
401
+ buf_slot->sgt[link_idx].sgl,
402
+ buf_slot->sgt[link_idx].orig_nents,
363403 &offset, PAGE_SIZE);
364404
365405 return sg_num;
....@@ -367,29 +407,29 @@
367407
368408 /* Allocate a memory region and map the dma mapped SG list of buf_slot */
369409 int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
370
- struct smc_buf_desc *buf_slot)
410
+ struct smc_buf_desc *buf_slot, u8 link_idx)
371411 {
372
- if (buf_slot->mr_rx[SMC_SINGLE_LINK])
412
+ if (buf_slot->mr_rx[link_idx])
373413 return 0; /* already done */
374414
375
- buf_slot->mr_rx[SMC_SINGLE_LINK] =
415
+ buf_slot->mr_rx[link_idx] =
376416 ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
377
- if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) {
417
+ if (IS_ERR(buf_slot->mr_rx[link_idx])) {
378418 int rc;
379419
380
- rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]);
381
- buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL;
420
+ rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
421
+ buf_slot->mr_rx[link_idx] = NULL;
382422 return rc;
383423 }
384424
385
- if (smc_ib_map_mr_sg(buf_slot) != 1)
425
+ if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
386426 return -EINVAL;
387427
388428 return 0;
389429 }
390430
391431 /* synchronize buffer usage for cpu access */
392
-void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
432
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
393433 struct smc_buf_desc *buf_slot,
394434 enum dma_data_direction data_direction)
395435 {
....@@ -397,11 +437,11 @@
397437 unsigned int i;
398438
399439 /* for now there is just one DMA address */
400
- for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
401
- buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
440
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
441
+ buf_slot->sgt[lnk->link_idx].nents, i) {
402442 if (!sg_dma_len(sg))
403443 break;
404
- ib_dma_sync_single_for_cpu(smcibdev->ibdev,
444
+ ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
405445 sg_dma_address(sg),
406446 sg_dma_len(sg),
407447 data_direction);
....@@ -409,7 +449,7 @@
409449 }
410450
411451 /* synchronize buffer usage for device access */
412
-void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
452
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
413453 struct smc_buf_desc *buf_slot,
414454 enum dma_data_direction data_direction)
415455 {
....@@ -417,11 +457,11 @@
417457 unsigned int i;
418458
419459 /* for now there is just one DMA address */
420
- for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
421
- buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
460
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
461
+ buf_slot->sgt[lnk->link_idx].nents, i) {
422462 if (!sg_dma_len(sg))
423463 break;
424
- ib_dma_sync_single_for_device(smcibdev->ibdev,
464
+ ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
425465 sg_dma_address(sg),
426466 sg_dma_len(sg),
427467 data_direction);
....@@ -429,15 +469,15 @@
429469 }
430470
431471 /* Map a new TX or RX buffer SG-table to DMA */
432
-int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
472
+int smc_ib_buf_map_sg(struct smc_link *lnk,
433473 struct smc_buf_desc *buf_slot,
434474 enum dma_data_direction data_direction)
435475 {
436476 int mapped_nents;
437477
438
- mapped_nents = ib_dma_map_sg(smcibdev->ibdev,
439
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
440
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
478
+ mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
479
+ buf_slot->sgt[lnk->link_idx].sgl,
480
+ buf_slot->sgt[lnk->link_idx].orig_nents,
441481 data_direction);
442482 if (!mapped_nents)
443483 return -ENOMEM;
....@@ -445,18 +485,18 @@
445485 return mapped_nents;
446486 }
447487
448
-void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
488
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
449489 struct smc_buf_desc *buf_slot,
450490 enum dma_data_direction data_direction)
451491 {
452
- if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address)
492
+ if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
453493 return; /* already unmapped */
454494
455
- ib_dma_unmap_sg(smcibdev->ibdev,
456
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
457
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
495
+ ib_dma_unmap_sg(lnk->smcibdev->ibdev,
496
+ buf_slot->sgt[lnk->link_idx].sgl,
497
+ buf_slot->sgt[lnk->link_idx].orig_nents,
458498 data_direction);
459
- buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
499
+ buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
460500 }
461501
462502 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
....@@ -466,6 +506,10 @@
466506 int cqe_size_order, smc_order;
467507 long rc;
468508
509
+ mutex_lock(&smcibdev->mutex);
510
+ rc = 0;
511
+ if (smcibdev->initialized)
512
+ goto out;
469513 /* the calculated number of cq entries fits to mlx5 cq allocation */
470514 cqe_size_order = cache_line_size() == 128 ? 7 : 6;
471515 smc_order = MAX_ORDER - cqe_size_order - 1;
....@@ -477,7 +521,7 @@
477521 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
478522 if (IS_ERR(smcibdev->roce_cq_send)) {
479523 smcibdev->roce_cq_send = NULL;
480
- return rc;
524
+ goto out;
481525 }
482526 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
483527 smc_wr_rx_cq_handler, NULL,
....@@ -489,45 +533,52 @@
489533 }
490534 smc_wr_add_dev(smcibdev);
491535 smcibdev->initialized = 1;
492
- return rc;
536
+ goto out;
493537
494538 err:
495539 ib_destroy_cq(smcibdev->roce_cq_send);
540
+out:
541
+ mutex_unlock(&smcibdev->mutex);
496542 return rc;
497543 }
498544
499545 static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
500546 {
547
+ mutex_lock(&smcibdev->mutex);
501548 if (!smcibdev->initialized)
502
- return;
549
+ goto out;
503550 smcibdev->initialized = 0;
504
- smc_wr_remove_dev(smcibdev);
505551 ib_destroy_cq(smcibdev->roce_cq_recv);
506552 ib_destroy_cq(smcibdev->roce_cq_send);
553
+ smc_wr_remove_dev(smcibdev);
554
+out:
555
+ mutex_unlock(&smcibdev->mutex);
507556 }
508557
509558 static struct ib_client smc_ib_client;
510559
511560 /* callback function for ib_register_client() */
512
-static void smc_ib_add_dev(struct ib_device *ibdev)
561
+static int smc_ib_add_dev(struct ib_device *ibdev)
513562 {
514563 struct smc_ib_device *smcibdev;
515564 u8 port_cnt;
516565 int i;
517566
518567 if (ibdev->node_type != RDMA_NODE_IB_CA)
519
- return;
568
+ return -EOPNOTSUPP;
520569
521570 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
522571 if (!smcibdev)
523
- return;
572
+ return -ENOMEM;
524573
525574 smcibdev->ibdev = ibdev;
526575 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
527
-
528
- spin_lock(&smc_ib_devices.lock);
576
+ atomic_set(&smcibdev->lnk_cnt, 0);
577
+ init_waitqueue_head(&smcibdev->lnks_deleted);
578
+ mutex_init(&smcibdev->mutex);
579
+ mutex_lock(&smc_ib_devices.mutex);
529580 list_add_tail(&smcibdev->list, &smc_ib_devices.list);
530
- spin_unlock(&smc_ib_devices.lock);
581
+ mutex_unlock(&smc_ib_devices.mutex);
531582 ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
532583 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
533584 smc_ib_global_event_handler);
....@@ -535,30 +586,39 @@
535586
536587 /* trigger reading of the port attributes */
537588 port_cnt = smcibdev->ibdev->phys_port_cnt;
589
+ pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
590
+ smcibdev->ibdev->name, port_cnt);
538591 for (i = 0;
539592 i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
540593 i++) {
541594 set_bit(i, &smcibdev->port_event_mask);
542595 /* determine pnetids of the port */
543
- smc_pnetid_by_dev_port(ibdev->dev.parent, i,
544
- smcibdev->pnetid[i]);
596
+ if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
597
+ smcibdev->pnetid[i]))
598
+ smc_pnetid_by_table_ib(smcibdev, i + 1);
599
+ pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
600
+ "%.16s%s\n",
601
+ smcibdev->ibdev->name, i + 1,
602
+ smcibdev->pnetid[i],
603
+ smcibdev->pnetid_by_user[i] ?
604
+ " (user defined)" :
605
+ "");
545606 }
546607 schedule_work(&smcibdev->port_event_work);
608
+ return 0;
547609 }
548610
549
-/* callback function for ib_register_client() */
611
+/* callback function for ib_unregister_client() */
550612 static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
551613 {
552
- struct smc_ib_device *smcibdev;
614
+ struct smc_ib_device *smcibdev = client_data;
553615
554
- smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
555
- if (!smcibdev || smcibdev->ibdev != ibdev)
556
- return;
557
- ib_set_client_data(ibdev, &smc_ib_client, NULL);
558
- spin_lock(&smc_ib_devices.lock);
616
+ mutex_lock(&smc_ib_devices.mutex);
559617 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
560
- spin_unlock(&smc_ib_devices.lock);
561
- smc_pnet_remove_by_ibdev(smcibdev);
618
+ mutex_unlock(&smc_ib_devices.mutex);
619
+ pr_warn_ratelimited("smc: removing ib device %s\n",
620
+ smcibdev->ibdev->name);
621
+ smc_smcr_terminate_all(smcibdev);
562622 smc_ib_cleanup_per_ibdev(smcibdev);
563623 ib_unregister_event_handler(&smcibdev->event_handler);
564624 cancel_work_sync(&smcibdev->port_event_work);
....@@ -573,6 +633,7 @@
573633
574634 int __init smc_ib_register_client(void)
575635 {
636
+ smc_ib_init_local_systemid();
576637 return ib_register_client(&smc_ib_client);
577638 }
578639