commit 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
Date:   2024-10-22

--- a/kernel/net/rds/ib_rdma.c
+++ b/kernel/net/rds/ib_rdma.c
@@ -37,11 +37,15 @@
 
 #include "rds_single_path.h"
 #include "ib_mr.h"
+#include "rds.h"
 
 struct workqueue_struct *rds_ib_mr_wq;
+struct rds_ib_dereg_odp_mr {
+        struct work_struct work;
+        struct ib_mr *mr;
+};
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
+static void rds_ib_odp_mr_worker(struct work_struct *work);
 
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
@@ -177,7 +181,7 @@
         struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
 
         iinfo->rdma_mr_max = pool_1m->max_items;
-        iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+        iinfo->rdma_mr_size = pool_1m->max_pages;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -187,7 +191,7 @@
         struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
 
         iinfo6->rdma_mr_max = pool_1m->max_items;
-        iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+        iinfo6->rdma_mr_size = pool_1m->max_pages;
 }
 #endif
 
@@ -195,12 +199,11 @@
 {
         struct rds_ib_mr *ibmr = NULL;
         struct llist_node *ret;
-        unsigned long *flag;
+        unsigned long flags;
 
-        preempt_disable();
-        flag = this_cpu_ptr(&clean_list_grace);
-        set_bit(CLEAN_LIST_BUSY_BIT, flag);
+        spin_lock_irqsave(&pool->clean_lock, flags);
         ret = llist_del_first(&pool->clean_list);
+        spin_unlock_irqrestore(&pool->clean_lock, flags);
         if (ret) {
                 ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
                 if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,27 +212,16 @@
                         rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
         }
 
-        clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-        preempt_enable();
         return ibmr;
-}
-
-static inline void wait_clean_list_grace(void)
-{
-        int cpu;
-        unsigned long *flag;
-
-        for_each_online_cpu(cpu) {
-                flag = &per_cpu(clean_list_grace, cpu);
-                while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-                        cpu_relax();
-        }
 }
 
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
         struct rds_ib_mr *ibmr = trans_private;
         struct rds_ib_device *rds_ibdev = ibmr->device;
+
+        if (ibmr->odp)
+                return;
 
         switch (direction) {
         case DMA_FROM_DEVICE:
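The two hunks above retire the per-CPU clean_list_grace busy bit and wait_clean_list_grace() in favour of the pool->clean_lock spinlock introduced by this series. The rule being enforced is the usual llist one: concurrent llist_add() callers need no lock, but concurrent llist_del_first() callers (or a del_first racing with previously removed nodes being re-added) must be serialized. The sketch below shows only that consumer pattern; pop_clean() and struct item are illustrative names, not part of the patch.

/*
 * Illustrative sketch, not part of the patch: the consumer side of a
 * lockless llist.  llist_add() needs no lock, but removers must be
 * serialized -- here by the same kind of spinlock as pool->clean_lock.
 */
#include <linux/llist.h>
#include <linux/spinlock.h>

struct item {
        struct llist_node llnode;
};

static struct item *pop_clean(struct llist_head *clean_list,
                              spinlock_t *clean_lock)
{
        struct llist_node *node;
        unsigned long flags;

        spin_lock_irqsave(clean_lock, flags);  /* serialize llist_del_first() */
        node = llist_del_first(clean_list);
        spin_unlock_irqrestore(clean_lock, flags);

        return node ? llist_entry(node, struct item, llnode) : NULL;
}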
@@ -324,8 +316,7 @@
  * of clusters. Each cluster has linked llist nodes of
  * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-                                struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
                                 struct llist_node **nodes_head,
                                 struct llist_node **nodes_tail)
 {
@@ -402,41 +393,36 @@
          */
         dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
         dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-        if (free_all)
+        if (free_all) {
+                unsigned long flags;
+
+                spin_lock_irqsave(&pool->clean_lock, flags);
                 llist_append_to_list(&pool->clean_list, &unmap_list);
+                spin_unlock_irqrestore(&pool->clean_lock, flags);
+        }
 
         free_goal = rds_ib_flush_goal(pool, free_all);
 
         if (list_empty(&unmap_list))
                 goto out;
 
-        if (pool->use_fastreg)
-                rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
-        else
-                rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
+        rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
         if (!list_empty(&unmap_list)) {
-                /* we have to make sure that none of the things we're about
-                 * to put on the clean list would race with other cpus trying
-                 * to pull items off. The llist would explode if we managed to
-                 * remove something from the clean list and then add it back again
-                 * while another CPU was spinning on that same item in llist_del_first.
-                 *
-                 * This is pretty unlikely, but just in case wait for an llist grace period
-                 * here before adding anything back into the clean list.
-                 */
-                wait_clean_list_grace();
+                unsigned long flags;
 
-                list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+                list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
                 if (ibmr_ret) {
                         *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
                         clean_nodes = clean_nodes->next;
                 }
                 /* more than one entry in llist nodes */
-                if (clean_nodes)
+                if (clean_nodes) {
+                        spin_lock_irqsave(&pool->clean_lock, flags);
                         llist_add_batch(clean_nodes, clean_tail,
                                         &pool->clean_list);
-
+                        spin_unlock_irqrestore(&pool->clean_lock, flags);
+                }
         }
 
         atomic_sub(unpinned, &pool->free_pinned);
@@ -471,7 +457,7 @@
                                 rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
                         else
                                 rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-                        return ERR_PTR(-EAGAIN);
+                        break;
                 }
 
                 /* We do have some empty MRs. Flush them out. */
@@ -485,7 +471,7 @@
                         return ibmr;
         }
 
-        return ibmr;
+        return NULL;
 }
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -503,11 +489,18 @@
 
         rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
+        if (ibmr->odp) {
+                /* An MR created and marked as use_once.  We use delayed work
+                 * because there is a chance that we are in interrupt context
+                 * and cannot call ib_dereg_mr() directly.
+                 */
+                INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+                queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+                return;
+        }
+
         /* Return it to the pool's free list */
-        if (rds_ibdev->use_fastreg)
-                rds_ib_free_frmr_list(ibmr);
-        else
-                rds_ib_free_fmr_list(ibmr);
+        rds_ib_free_frmr_list(ibmr);
 
         atomic_add(ibmr->sg_len, &pool->free_pinned);
         atomic_inc(&pool->dirty_count);
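The new ibmr->odp branch above defers the real teardown because rds_ib_free_mr() may be reached from atomic context while ib_dereg_mr() can sleep. The general shape of that punt-to-workqueue pattern, with hypothetical names (my_obj, my_release_worker), is roughly the following; rds_ib_odp_mr_worker() at the end of this patch plays the same role for ODP MRs.

/* Hypothetical sketch of the defer-to-workqueue pattern used above. */
#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_obj {
        struct delayed_work work;
        /* ... a resource whose release may sleep ... */
};

static void my_release_worker(struct work_struct *work)
{
        struct my_obj *obj = container_of(work, struct my_obj, work.work);

        /* process context: safe to sleep while releasing the resource */
        kfree(obj);
}

static void my_obj_free(struct my_obj *obj)
{
        /* may run in atomic context: hand the work to a workqueue */
        INIT_DELAYED_WORK(&obj->work, my_release_worker);
        queue_delayed_work(system_wq, &obj->work, 0);
}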
@@ -547,9 +540,17 @@
         up_read(&rds_ib_devices_lock);
 }
 
+u32 rds_ib_get_lkey(void *trans_private)
+{
+        struct rds_ib_mr *ibmr = trans_private;
+
+        return ibmr->u.mr->lkey;
+}
+
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                     struct rds_sock *rs, u32 *key_ret,
-                    struct rds_connection *conn)
+                    struct rds_connection *conn,
+                    u64 start, u64 length, int need_odp)
 {
         struct rds_ib_device *rds_ibdev;
         struct rds_ib_mr *ibmr = NULL;
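With the widened prototype, rds_ib_get_mr() now also receives the user buffer's start address, its length, and an ODP mode alongside the existing scatterlist. A hypothetical call site, illustrative only (the real caller sits in the generic RDS rdma path, not in this file, and the rds/ib headers are assumed to be in scope):

/* Hypothetical caller -- illustrative only, not the actual RDS code. */
static void *example_map(struct scatterlist *sg, unsigned long nents,
                         struct rds_sock *rs, u32 *key_ret,
                         struct rds_connection *conn,
                         u64 addr, u64 bytes, bool use_odp)
{
        return rds_ib_get_mr(sg, nents, rs, key_ret, conn, addr, bytes,
                             use_odp ? ODP_VIRTUAL : ODP_NOT_NEEDED);
}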
@@ -562,6 +563,51 @@
                 goto out;
         }
 
+        if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+                u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+                int access_flags =
+                        (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+                         IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+                         IB_ACCESS_ON_DEMAND);
+                struct ib_sge sge = {};
+                struct ib_mr *ib_mr;
+
+                if (!rds_ibdev->odp_capable) {
+                        ret = -EOPNOTSUPP;
+                        goto out;
+                }
+
+                ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+                                       access_flags);
+
+                if (IS_ERR(ib_mr)) {
+                        rdsdebug("rds_ib_get_user_mr returned %d\n",
+                                 IS_ERR(ib_mr));
+                        ret = PTR_ERR(ib_mr);
+                        goto out;
+                }
+                if (key_ret)
+                        *key_ret = ib_mr->rkey;
+
+                ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+                if (!ibmr) {
+                        ib_dereg_mr(ib_mr);
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ibmr->u.mr = ib_mr;
+                ibmr->odp = 1;
+
+                sge.addr = virt_addr;
+                sge.length = length;
+                sge.lkey = ib_mr->lkey;
+
+                ib_advise_mr(rds_ibdev->pd,
+                             IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+                             IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+                return ibmr;
+        }
+
         if (conn)
                 ic = conn->c_transport_data;
 
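The hunk above leans on definitions that arrive with the companion rds.h/ib_mr.h changes rather than with ib_rdma.c itself: the need_odp values, the odp flag, u.mr member and delayed work of struct rds_ib_mr, and the device's odp_capable bit. For orientation only, an assumed sketch of those pieces (the authoritative definitions live in the headers and may differ):

/* Assumed, for orientation only -- see the companion rds.h / ib_mr.h changes. */
#define ODP_NOT_NEEDED  0       /* ordinary pool/FRWR registration        */
#define ODP_ZEROBASED   1       /* ODP MR, RDMA offsets are zero-based    */
#define ODP_VIRTUAL     2       /* ODP MR, RDMA uses the virtual address  */
/*
 * struct rds_ib_mr is assumed to grow an "odp:1" flag, a "u.mr" member
 * holding the ib_mr returned by ib_reg_user_mr(), and a delayed_work
 * "work" used by rds_ib_odp_mr_worker(); struct rds_ib_device is assumed
 * to carry an "odp_capable" bit derived from the HCA's ODP capabilities.
 */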
@@ -570,10 +616,7 @@
                 goto out;
         }
 
-        if (rds_ibdev->use_fastreg)
-                ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
-        else
-                ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+        ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
         if (IS_ERR(ibmr)) {
                 ret = PTR_ERR(ibmr);
                 pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
@@ -610,25 +653,23 @@
         init_llist_head(&pool->free_list);
         init_llist_head(&pool->drop_list);
         init_llist_head(&pool->clean_list);
+        spin_lock_init(&pool->clean_lock);
         mutex_init(&pool->flush_lock);
         init_waitqueue_head(&pool->flush_wait);
         INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
         if (pool_type == RDS_IB_MR_1M_POOL) {
                 /* +1 allows for unaligned MRs */
-                pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+                pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
                 pool->max_items = rds_ibdev->max_1m_mrs;
         } else {
                 /* pool_type == RDS_IB_MR_8K_POOL */
-                pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+                pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
                 pool->max_items = rds_ibdev->max_8k_mrs;
         }
 
-        pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
-        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
-        pool->fmr_attr.page_shift = PAGE_SHIFT;
+        pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
         pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
-        pool->use_fastreg = rds_ibdev->use_fastreg;
 
         return pool;
 }
@@ -649,3 +690,12 @@
 {
         destroy_workqueue(rds_ib_mr_wq);
 }
+
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+        struct rds_ib_mr *ibmr;
+
+        ibmr = container_of(work, struct rds_ib_mr, work.work);
+        ib_dereg_mr(ibmr->u.mr);
+        kfree(ibmr);
+}