 .. |  .. |
 37 |  37 |
 38 |  38 |  #include "rds_single_path.h"
 39 |  39 |  #include "ib_mr.h"
    |  40 | +#include "rds.h"
 40 |  41 |
 41 |  42 |  struct workqueue_struct *rds_ib_mr_wq;
    |  43 | +struct rds_ib_dereg_odp_mr {
    |  44 | +        struct work_struct work;
    |  45 | +        struct ib_mr *mr;
    |  46 | +};
 42 |  47 |
 43 |     | -static DEFINE_PER_CPU(unsigned long, clean_list_grace);
 44 |     | -#define CLEAN_LIST_BUSY_BIT 0
    |  48 | +static void rds_ib_odp_mr_worker(struct work_struct *work);
 45 |  49 |
 46 |  50 |  static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 47 |  51 |  {
 .. |  .. |
177 | 181 |          struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
178 | 182 |
179 | 183 |          iinfo->rdma_mr_max = pool_1m->max_items;
180 |     | -        iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
    | 184 | +        iinfo->rdma_mr_size = pool_1m->max_pages;
181 | 185 |  }
182 | 186 |
183 | 187 |  #if IS_ENABLED(CONFIG_IPV6)
 .. |  .. |
187 | 191 |          struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
188 | 192 |
189 | 193 |          iinfo6->rdma_mr_max = pool_1m->max_items;
190 |     | -        iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
    | 194 | +        iinfo6->rdma_mr_size = pool_1m->max_pages;
191 | 195 |  }
192 | 196 |  #endif
193 | 197 |
 .. |  .. |
195 | 199 |  {
196 | 200 |          struct rds_ib_mr *ibmr = NULL;
197 | 201 |          struct llist_node *ret;
198 |     | -        unsigned long *flag;
    | 202 | +        unsigned long flags;
199 | 203 |
200 |     | -        preempt_disable();
201 |     | -        flag = this_cpu_ptr(&clean_list_grace);
202 |     | -        set_bit(CLEAN_LIST_BUSY_BIT, flag);
    | 204 | +        spin_lock_irqsave(&pool->clean_lock, flags);
203 | 205 |          ret = llist_del_first(&pool->clean_list);
    | 206 | +        spin_unlock_irqrestore(&pool->clean_lock, flags);
204 | 207 |          if (ret) {
205 | 208 |                  ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
206 | 209 |                  if (pool->pool_type == RDS_IB_MR_8K_POOL)
 .. |  .. |
209 | 212 |                          rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
210 | 213 |          }
211 | 214 |
212 |     | -        clear_bit(CLEAN_LIST_BUSY_BIT, flag);
213 |     | -        preempt_enable();
214 | 215 |          return ibmr;
215 |     | -}
216 |     | -
217 |     | -static inline void wait_clean_list_grace(void)
218 |     | -{
219 |     | -        int cpu;
220 |     | -        unsigned long *flag;
221 |     | -
222 |     | -        for_each_online_cpu(cpu) {
223 |     | -                flag = &per_cpu(clean_list_grace, cpu);
224 |     | -                while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
225 |     | -                        cpu_relax();
226 |     | -        }
227 | 216 |  }
228 | 217 |
229 | 218 |  void rds_ib_sync_mr(void *trans_private, int direction)
230 | 219 |  {
231 | 220 |          struct rds_ib_mr *ibmr = trans_private;
232 | 221 |          struct rds_ib_device *rds_ibdev = ibmr->device;
    | 222 | +
    | 223 | +        if (ibmr->odp)
    | 224 | +                return;
233 | 225 |
234 | 226 |          switch (direction) {
235 | 227 |          case DMA_FROM_DEVICE:
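The hunk above replaces the per-CPU CLEAN_LIST_BUSY_BIT grace-bit scheme with a per-pool spinlock around llist_del_first(). include/linux/llist.h only guarantees lock-free operation for producers (llist_add()); once more than one consumer can call llist_del_first() on the same list, those consumers must be serialized, which is what pool->clean_lock now does. A minimal, self-contained sketch of that pattern follows; struct item, struct item_pool and the item_pool_*() helpers are made-up names for illustration, not part of the RDS code.

```c
#include <linux/llist.h>
#include <linux/spinlock.h>

struct item {
	struct llist_node llnode;
	int payload;
};

struct item_pool {
	struct llist_head clean_list;	/* producers are lock-free */
	spinlock_t clean_lock;		/* serializes llist_del_first() consumers */
};

static void item_pool_init(struct item_pool *pool)
{
	init_llist_head(&pool->clean_list);
	spin_lock_init(&pool->clean_lock);
}

/* Producers may run concurrently without the lock. */
static void item_pool_put(struct item_pool *pool, struct item *it)
{
	llist_add(&it->llnode, &pool->clean_list);
}

/* Consumers must serialize llist_del_first() against each other. */
static struct item *item_pool_get(struct item_pool *pool)
{
	struct llist_node *node;
	unsigned long flags;

	spin_lock_irqsave(&pool->clean_lock, flags);
	node = llist_del_first(&pool->clean_list);
	spin_unlock_irqrestore(&pool->clean_lock, flags);

	return node ? llist_entry(node, struct item, llnode) : NULL;
}
```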
 .. |  .. |
324 | 316 |   * of clusters. Each cluster has linked llist nodes of
325 | 317 |   * MR_CLUSTER_SIZE mrs that are ready for reuse.
326 | 318 |   */
327 |     | -static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
328 |     | -                                struct list_head *list,
    | 319 | +static void list_to_llist_nodes(struct list_head *list,
329 | 320 |                                  struct llist_node **nodes_head,
330 | 321 |                                  struct llist_node **nodes_tail)
331 | 322 |  {
 .. |  .. |
402 | 393 |   */
403 | 394 |          dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
404 | 395 |          dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
405 |     | -        if (free_all)
    | 396 | +        if (free_all) {
    | 397 | +                unsigned long flags;
    | 398 | +
    | 399 | +                spin_lock_irqsave(&pool->clean_lock, flags);
406 | 400 |                  llist_append_to_list(&pool->clean_list, &unmap_list);
    | 401 | +                spin_unlock_irqrestore(&pool->clean_lock, flags);
    | 402 | +        }
407 | 403 |
408 | 404 |          free_goal = rds_ib_flush_goal(pool, free_all);
409 | 405 |
410 | 406 |          if (list_empty(&unmap_list))
411 | 407 |                  goto out;
412 | 408 |
413 |     | -        if (pool->use_fastreg)
414 |     | -                rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
415 |     | -        else
416 |     | -                rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
    | 409 | +        rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
417 | 410 |
418 | 411 |          if (!list_empty(&unmap_list)) {
419 |     | -                /* we have to make sure that none of the things we're about
420 |     | -                 * to put on the clean list would race with other cpus trying
421 |     | -                 * to pull items off. The llist would explode if we managed to
422 |     | -                 * remove something from the clean list and then add it back again
423 |     | -                 * while another CPU was spinning on that same item in llist_del_first.
424 |     | -                 *
425 |     | -                 * This is pretty unlikely, but just in case wait for an llist grace period
426 |     | -                 * here before adding anything back into the clean list.
427 |     | -                 */
428 |     | -                wait_clean_list_grace();
    | 412 | +                unsigned long flags;
429 | 413 |
430 |     | -                list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
    | 414 | +                list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
431 | 415 |                  if (ibmr_ret) {
432 | 416 |                          *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
433 | 417 |                          clean_nodes = clean_nodes->next;
434 | 418 |                  }
435 | 419 |                  /* more than one entry in llist nodes */
436 |     | -                if (clean_nodes)
    | 420 | +                if (clean_nodes) {
    | 421 | +                        spin_lock_irqsave(&pool->clean_lock, flags);
437 | 422 |                          llist_add_batch(clean_nodes, clean_tail,
438 | 423 |                                          &pool->clean_list);
439 |     | -
    | 424 | +                        spin_unlock_irqrestore(&pool->clean_lock, flags);
    | 425 | +                }
440 | 426 |          }
441 | 427 |
442 | 428 |          atomic_sub(unpinned, &pool->free_pinned);
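The flush side takes the same pool->clean_lock wherever it removes entries from clean_list: the free_all drain goes through the local llist_append_to_list() helper, which drains the llist with llist_del_all() and therefore must be serialized against the llist_del_first() in rds_ib_reuse_mr(), and the re-population with llist_add_batch() is done under the lock as well, so every clean_list update apart from plain llist_add() happens under a single lock. Continuing the hypothetical item_pool sketch above, the flush-side counterpart could look like this; item_pool_flush() is illustrative, not RDS code.

```c
/* Drain the clean list under the lock (llist_del_all() must be serialized
 * against llist_del_first() per include/linux/llist.h), rework the items,
 * then push the survivors back in one batch.
 */
static void item_pool_flush(struct item_pool *pool)
{
	struct llist_node *batch, *last;
	unsigned long flags;

	spin_lock_irqsave(&pool->clean_lock, flags);
	batch = llist_del_all(&pool->clean_list);
	spin_unlock_irqrestore(&pool->clean_lock, flags);

	if (!batch)
		return;

	/* ... unmap or recycle the items reachable from 'batch' ... */

	/* Re-populate: find the tail, then add the whole chain back. */
	last = batch;
	while (last->next)
		last = last->next;

	spin_lock_irqsave(&pool->clean_lock, flags);
	llist_add_batch(batch, last, &pool->clean_list);
	spin_unlock_irqrestore(&pool->clean_lock, flags);
}
```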
 .. |  .. |
471 | 457 |                                  rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
472 | 458 |                          else
473 | 459 |                                  rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
474 |     | -                        return ERR_PTR(-EAGAIN);
    | 460 | +                        break;
475 | 461 |                  }
476 | 462 |
477 | 463 |          /* We do have some empty MRs. Flush them out. */
 .. |  .. |
485 | 471 |                          return ibmr;
486 | 472 |          }
487 | 473 |
488 |     | -        return ibmr;
    | 474 | +        return NULL;
489 | 475 |  }
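Taken together, the two hunks above change the failure convention of rds_ib_try_reuse_ibmr(): on a depleted pool the retry loop now breaks out and the function returns NULL instead of ERR_PTR(-EAGAIN), so callers test for a NULL pointer rather than IS_ERR(). A hedged sketch of the resulting caller-side pattern is below; rds_ib_reuse_or_alloc() is a hypothetical helper, and the real caller (rds_ib_alloc_frmr() in net/rds/ib_frmr.c) differs in detail.

```c
#include <linux/err.h>
#include <linux/slab.h>

#include "ib_mr.h"	/* RDS MR structs and pool helpers */

/* Illustrative only: NULL now simply means "nothing reusable", not an
 * error code, so the caller falls back to allocating a fresh MR.
 */
static struct rds_ib_mr *rds_ib_reuse_or_alloc(struct rds_ib_mr_pool *pool)
{
	struct rds_ib_mr *ibmr;

	ibmr = rds_ib_try_reuse_ibmr(pool);
	if (ibmr)
		return ibmr;		/* recycled MR, ready to use */

	/* Nothing reusable: allocate a new MR for this pool. */
	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
	if (!ibmr)
		return ERR_PTR(-ENOMEM);

	ibmr->pool = pool;
	return ibmr;
}
```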
 .. |  .. |
503 | 489 |
504 | 490 |          rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
505 | 491 |
    | 492 | +        if (ibmr->odp) {
    | 493 | +                /* An MR created and marked as use_once. We use delayed work,
    | 494 | +                 * because there is a chance that we are in interrupt and can't
    | 495 | +                 * call ib_dereg_mr() directly.
    | 496 | +                 */
    | 497 | +                INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
    | 498 | +                queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
    | 499 | +                return;
    | 500 | +        }
    | 501 | +
506 | 502 |          /* Return it to the pool's free list */
507 |     | -        if (rds_ibdev->use_fastreg)
508 |     | -                rds_ib_free_frmr_list(ibmr);
509 |     | -        else
510 |     | -                rds_ib_free_fmr_list(ibmr);
    | 503 | +        rds_ib_free_frmr_list(ibmr);
511 | 504 |
512 | 505 |          atomic_add(ibmr->sg_len, &pool->free_pinned);
513 | 506 |          atomic_inc(&pool->dirty_count);
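rds_ib_free_mr() can be reached from contexts where sleeping is not allowed, while ib_dereg_mr() may sleep, so for ODP MRs the deregistration is pushed onto rds_ib_mr_wq as a delayed work item with a delay of 0 ("run as soon as a worker is free"). The real worker, rds_ib_odp_mr_worker(), appears at the end of this patch; the generic shape of the pattern, including the container_of() lookup through the work_struct embedded in a delayed_work, is sketched below with made-up names (struct deferred_release and the deferred_release*() functions).

```c
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Illustrative wrapper: defer a sleepable cleanup out of atomic context. */
struct deferred_release {
	struct delayed_work work;
	void *resource;
};

static void deferred_release_worker(struct work_struct *work)
{
	/* delayed_work embeds a work_struct, hence the .work.work path. */
	struct deferred_release *rel =
		container_of(work, struct deferred_release, work.work);

	/* Sleepable cleanup goes here (ib_dereg_mr() in the RDS case). */
	kfree(rel->resource);
	kfree(rel);
}

/* May be called from atomic context; only queues the work. */
static void deferred_release(struct workqueue_struct *wq, void *resource)
{
	struct deferred_release *rel = kzalloc(sizeof(*rel), GFP_ATOMIC);

	if (!rel)
		return;		/* a real user would need a fallback here */

	rel->resource = resource;
	INIT_DELAYED_WORK(&rel->work, deferred_release_worker);
	queue_delayed_work(wq, &rel->work, 0);	/* delay 0: run ASAP */
}
```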
 .. |  .. |
547 | 540 |          up_read(&rds_ib_devices_lock);
548 | 541 |  }
549 | 542 |
    | 543 | +u32 rds_ib_get_lkey(void *trans_private)
    | 544 | +{
    | 545 | +        struct rds_ib_mr *ibmr = trans_private;
    | 546 | +
    | 547 | +        return ibmr->u.mr->lkey;
    | 548 | +}
    | 549 | +
550 | 550 |  void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
551 | 551 |                      struct rds_sock *rs, u32 *key_ret,
552 |     | -                    struct rds_connection *conn)
    | 552 | +                    struct rds_connection *conn,
    | 553 | +                    u64 start, u64 length, int need_odp)
553 | 554 |  {
554 | 555 |          struct rds_ib_device *rds_ibdev;
555 | 556 |          struct rds_ib_mr *ibmr = NULL;
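rds_ib_get_lkey() exposes the local key of the underlying ib_mr so the send path can build SGEs that point straight at ODP-registered user memory. A sketch of how a caller might use it follows; fill_sge_from_odp_mr() and its parameters are illustrative, and only rds_ib_get_lkey() (as added by this patch, declared in the RDS IB headers) and struct ib_sge come from the patch and the verbs API.

```c
#include <rdma/ib_verbs.h>

u32 rds_ib_get_lkey(void *trans_private);	/* declaration added by this patch */

/* Illustrative only: fill a scatter/gather element for a send WR from an
 * ODP-backed RDS MR.  'trans_private' is the value RDS stored when the MR
 * was registered (a struct rds_ib_mr in the IB transport).
 */
static void fill_sge_from_odp_mr(struct ib_sge *sge, void *trans_private,
				 u64 addr, u32 len)
{
	sge->addr = addr;		/* user virtual address of the buffer */
	sge->length = len;
	sge->lkey = rds_ib_get_lkey(trans_private);
}
```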
 .. |  .. |
562 | 563 |                  goto out;
563 | 564 |          }
564 | 565 |
    | 566 | +        if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
    | 567 | +                u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
    | 568 | +                int access_flags =
    | 569 | +                        (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
    | 570 | +                         IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
    | 571 | +                         IB_ACCESS_ON_DEMAND);
    | 572 | +                struct ib_sge sge = {};
    | 573 | +                struct ib_mr *ib_mr;
    | 574 | +
    | 575 | +                if (!rds_ibdev->odp_capable) {
    | 576 | +                        ret = -EOPNOTSUPP;
    | 577 | +                        goto out;
    | 578 | +                }
    | 579 | +
    | 580 | +                ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
    | 581 | +                                       access_flags);
    | 582 | +
    | 583 | +                if (IS_ERR(ib_mr)) {
    | 584 | +                        rdsdebug("rds_ib_get_user_mr returned %d\n",
    | 585 | +                                 IS_ERR(ib_mr));
    | 586 | +                        ret = PTR_ERR(ib_mr);
    | 587 | +                        goto out;
    | 588 | +                }
    | 589 | +                if (key_ret)
    | 590 | +                        *key_ret = ib_mr->rkey;
    | 591 | +
    | 592 | +                ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
    | 593 | +                if (!ibmr) {
    | 594 | +                        ib_dereg_mr(ib_mr);
    | 595 | +                        ret = -ENOMEM;
    | 596 | +                        goto out;
    | 597 | +                }
    | 598 | +                ibmr->u.mr = ib_mr;
    | 599 | +                ibmr->odp = 1;
    | 600 | +
    | 601 | +                sge.addr = virt_addr;
    | 602 | +                sge.length = length;
    | 603 | +                sge.lkey = ib_mr->lkey;
    | 604 | +
    | 605 | +                ib_advise_mr(rds_ibdev->pd,
    | 606 | +                             IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
    | 607 | +                             IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
    | 608 | +                return ibmr;
    | 609 | +        }
    | 610 | +
565 | 611 |          if (conn)
566 | 612 |                  ic = conn->c_transport_data;
567 | 613 |
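In the ODP branch above, ODP_ZEROBASED registers the MR with an IOVA of 0, so the remote side addresses the region by byte offset, while ODP_VIRTUAL keeps the user virtual address `start` as the IOVA. The ib_advise_mr() call then asks the driver to prefault the ODP pages (PREFETCH_WRITE advice with the FLUSH flag), so the first RDMA access is less likely to stall on a page fault; its return value is ignored here, presumably because the prefetch is only an optimization. The difference between the two addressing modes on the initiating side is sketched below; build_rdma_write() and its parameters are illustrative, and only struct ib_rdma_wr and IB_WR_RDMA_WRITE come from the verbs API.

```c
#include <rdma/ib_verbs.h>

/* Illustrative: remote address computation for the two ODP modes.
 * 'rkey' is what rds_ib_get_mr() returned via *key_ret, 'base' is the
 * user virtual address the region was registered with ('start'), and
 * 'offset' is the byte offset into the region being targeted.
 */
static void build_rdma_write(struct ib_rdma_wr *wr, u32 rkey,
			     u64 base, u64 offset, bool zerobased)
{
	wr->wr.opcode = IB_WR_RDMA_WRITE;
	wr->rkey = rkey;
	/* ODP_ZEROBASED: the rkey's address space starts at 0, use the offset.
	 * ODP_VIRTUAL:   the rkey's address space is the user VA itself.
	 */
	wr->remote_addr = zerobased ? offset : base + offset;
}
```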
 .. |  .. |
570 | 616 |                  goto out;
571 | 617 |          }
572 | 618 |
573 |     | -        if (rds_ibdev->use_fastreg)
574 |     | -                ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
575 |     | -        else
576 |     | -                ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
    | 619 | +        ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
577 | 620 |          if (IS_ERR(ibmr)) {
578 | 621 |                  ret = PTR_ERR(ibmr);
579 | 622 |                  pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
 .. |  .. |
610 | 653 |          init_llist_head(&pool->free_list);
611 | 654 |          init_llist_head(&pool->drop_list);
612 | 655 |          init_llist_head(&pool->clean_list);
    | 656 | +        spin_lock_init(&pool->clean_lock);
613 | 657 |          mutex_init(&pool->flush_lock);
614 | 658 |          init_waitqueue_head(&pool->flush_wait);
615 | 659 |          INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
616 | 660 |
617 | 661 |          if (pool_type == RDS_IB_MR_1M_POOL) {
618 | 662 |                  /* +1 allows for unaligned MRs */
619 |     | -                pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
    | 663 | +                pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
620 | 664 |                  pool->max_items = rds_ibdev->max_1m_mrs;
621 | 665 |          } else {
622 | 666 |                  /* pool_type == RDS_IB_MR_8K_POOL */
623 |     | -                pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
    | 667 | +                pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
624 | 668 |                  pool->max_items = rds_ibdev->max_8k_mrs;
625 | 669 |          }
626 | 670 |
627 |     | -        pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
628 |     | -        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
629 |     | -        pool->fmr_attr.page_shift = PAGE_SHIFT;
    | 671 | +        pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
630 | 672 |          pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
631 |     | -        pool->use_fastreg = rds_ibdev->use_fastreg;
632 | 673 |
633 | 674 |          return pool;
634 | 675 |  }
 .. |  .. |
649 | 690 |  {
650 | 691 |          destroy_workqueue(rds_ib_mr_wq);
651 | 692 |  }
    | 693 | +
    | 694 | +static void rds_ib_odp_mr_worker(struct work_struct *work)
    | 695 | +{
    | 696 | +        struct rds_ib_mr *ibmr;
    | 697 | +
    | 698 | +        ibmr = container_of(work, struct rds_ib_mr, work.work);
    | 699 | +        ib_dereg_mr(ibmr->u.mr);
    | 700 | +        kfree(ibmr);
    | 701 | +}