From 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 13 May 2024 10:30:14 +0000
Subject: [PATCH] RDMA/mlx5: rework ODP page fault handling
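
Rework the mlx5 on-demand paging (ODP) fault path: track implicit child
MRs in an xarray protected by SRCU instead of walking the umem interval
tree, service page fault events on a dedicated EQ with its own workqueue
and mempool, report SRQ and XRC transport ODP capabilities, and add
advise_mr()-based prefetch support.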
---
kernel/drivers/infiniband/hw/mlx5/odp.c | 1712 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 1188 insertions(+), 524 deletions(-)
diff --git a/kernel/drivers/infiniband/hw/mlx5/odp.c b/kernel/drivers/infiniband/hw/mlx5/odp.c
index 453e5c4..5c853ec 100644
--- a/kernel/drivers/infiniband/hw/mlx5/odp.c
+++ b/kernel/drivers/infiniband/hw/mlx5/odp.c
@@ -36,6 +36,47 @@
#include "mlx5_ib.h"
#include "cmd.h"
+#include "qp.h"
+
+#include <linux/mlx5/eq.h>
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+ u32 bytes_committed;
+ u32 token;
+ u8 event_subtype;
+ u8 type;
+ union {
+ /* Initiator or send message responder pagefault details. */
+ struct {
+ /* Received packet size, only valid for responders. */
+ u32 packet_size;
+ /*
+ * Number of resource holding WQE, depends on type.
+ */
+ u32 wq_num;
+ /*
+ * WQE index. Refers to either the send queue or
+ * receive queue, according to event_subtype.
+ */
+ u16 wqe_index;
+ } wqe;
+ /* RDMA responder pagefault details */
+ struct {
+ u32 r_key;
+ /*
+ * Received packet size, minimal size page fault
+ * resolution required for forward progress.
+ */
+ u32 packet_size;
+ u32 rdma_op_len;
+ u64 rdma_va;
+ } rdma;
+ };
+
+ struct mlx5_ib_pf_eq *eq;
+ struct work_struct work;
+};
#define MAX_PREFETCH_LEN (4*1024*1024U)
@@ -53,145 +94,224 @@
static u64 mlx5_imr_ksm_entries;
-static int check_parent(struct ib_umem_odp *odp,
- struct mlx5_ib_mr *parent)
+static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *imr, int flags)
{
- struct mlx5_ib_mr *mr = odp->private;
-
- return mr && mr->parent == parent && !odp->dying;
-}
-
-static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
-{
- struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
- struct ib_ucontext *ctx = odp->umem->context;
- struct rb_node *rb;
-
- down_read(&ctx->umem_rwsem);
- while (1) {
- rb = rb_next(&odp->interval_tree.rb);
- if (!rb)
- goto not_found;
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
- if (check_parent(odp, parent))
- goto end;
- }
-not_found:
- odp = NULL;
-end:
- up_read(&ctx->umem_rwsem);
- return odp;
-}
-
-static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
- u64 start, u64 length,
- struct mlx5_ib_mr *parent)
-{
- struct ib_umem_odp *odp;
- struct rb_node *rb;
-
- down_read(&ctx->umem_rwsem);
- odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
- if (!odp)
- goto end;
-
- while (1) {
- if (check_parent(odp, parent))
- goto end;
- rb = rb_next(&odp->interval_tree.rb);
- if (!rb)
- goto not_found;
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
- if (ib_umem_start(odp->umem) > start + length)
- goto not_found;
- }
-not_found:
- odp = NULL;
-end:
- up_read(&ctx->umem_rwsem);
- return odp;
-}
-
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
- size_t nentries, struct mlx5_ib_mr *mr, int flags)
-{
- struct ib_pd *pd = mr->ibmr.pd;
- struct ib_ucontext *ctx = pd->uobject->context;
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
- struct ib_umem_odp *odp;
- unsigned long va;
- int i;
+ struct mlx5_klm *end = pklm + nentries;
if (flags & MLX5_IB_UPD_XLT_ZAP) {
- for (i = 0; i < nentries; i++, pklm++) {
+ for (; pklm != end; pklm++, idx++) {
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- pklm->key = cpu_to_be32(dev->null_mkey);
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
pklm->va = 0;
}
return;
}
- odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
- nentries * MLX5_IMR_MTT_SIZE, mr);
+ /*
+ * The locking here is pretty subtle. Ideally the implicit_children
+ * xarray would be protected by the umem_mutex, however that is not
+ * possible. Instead this uses a weaker update-then-lock pattern:
+ *
+ * srcu_read_lock()
+ * xa_store()
+ * mutex_lock(umem_mutex)
+ * mlx5_ib_update_xlt()
+ * mutex_unlock(umem_mutex)
+ * destroy lkey
+ *
+ * i.e. any change to the xarray must be followed by the locked update_xlt
+ * before destroying.
+ *
+ * The umem_mutex provides the acquire/release semantic needed to make
+ * the xa_store() visible to a racing thread. While SRCU is not
+ * technically required, using it gives consistent use of the SRCU
+ * locking around the xarray.
+ */
+ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
+ lockdep_assert_held(&imr->dev->odp_srcu);
- for (i = 0; i < nentries; i++, pklm++) {
+ for (; pklm != end; pklm++, idx++) {
+ struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
+
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
- va = (offset + i) * MLX5_IMR_MTT_SIZE;
- if (odp && odp->umem->address == va) {
- struct mlx5_ib_mr *mtt = odp->private;
-
+ if (mtt) {
pklm->key = cpu_to_be32(mtt->ibmr.lkey);
- odp = odp_next(odp);
+ pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
} else {
- pklm->key = cpu_to_be32(dev->null_mkey);
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
+ pklm->va = 0;
}
- mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
- i, va, be32_to_cpu(pklm->key));
}
}
-static void mr_leaf_free_action(struct work_struct *work)
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
- struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
- int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
- struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+ u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
- mr->parent = NULL;
- synchronize_srcu(&mr->dev->mr_srcu);
+ if (umem_dma & ODP_READ_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_READ;
+ if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_WRITE;
- ib_umem_release(odp->umem);
- if (imr->live)
- mlx5_ib_update_xlt(imr, idx, 1, 0,
- MLX5_IB_UPD_XLT_INDIRECT |
- MLX5_IB_UPD_XLT_ATOMIC);
- mlx5_mr_cache_free(mr->dev, mr);
-
- if (atomic_dec_and_test(&imr->num_leaf_free))
- wake_up(&imr->q_leaf_free);
+ return mtt_entry;
}
-void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
- unsigned long end)
+static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ dma_addr_t pa;
+ size_t i;
+
+ if (flags & MLX5_IB_UPD_XLT_ZAP)
+ return;
+
+ for (i = 0; i < nentries; i++) {
+ pa = odp->dma_list[idx + i];
+ pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+ }
+}
+
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+ struct mlx5_ib_mr *mr, int flags)
+{
+ if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+ populate_klm(xlt, idx, nentries, mr, flags);
+ } else {
+ populate_mtt(xlt, idx, nentries, mr, flags);
+ }
+}
+
+static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+ /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
+ mutex_lock(&odp->umem_mutex);
+ if (odp->npages) {
+ mlx5_mr_cache_invalidate(mr);
+ ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
+ ib_umem_end(odp));
+ WARN_ON(odp->npages);
+ }
+ odp->private = NULL;
+ mutex_unlock(&odp->umem_mutex);
+
+ if (!mr->cache_ent) {
+ mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+ WARN_ON(mr->descs);
+ }
+}
+
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
+ * empty here, parallel page faults could have raced with the free process and
+ * added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
+{
+ struct mlx5_ib_mr *imr = mr->parent;
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ int srcu_key;
+
+ /* implicit_child_mr's are not allowed to have deferred work */
+ WARN_ON(atomic_read(&mr->num_deferred_work));
+
+ if (need_imr_xlt) {
+ srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
+ mutex_lock(&odp_imr->umem_mutex);
+ mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ mutex_unlock(&odp_imr->umem_mutex);
+ srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
+ }
+
+ dma_fence_odp_mr(mr);
+
+ mr->parent = NULL;
+ mlx5_mr_cache_free(mr->dev, mr);
+ ib_umem_odp_release(odp);
+ if (atomic_dec_and_test(&imr->num_deferred_work))
+ wake_up(&imr->q_deferred_work);
+}
+
+static void free_implicit_child_mr_work(struct work_struct *work)
+{
+ struct mlx5_ib_mr *mr =
+ container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+ free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+ struct mlx5_ib_mr *mr =
+ container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+ /* Freeing a MR is a sleeping operation, so bounce to a work queue */
+ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+ queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+ struct mlx5_ib_mr *imr = mr->parent;
+
+ xa_lock(&imr->implicit_children);
+ /*
+ * This can race with mlx5_ib_free_implicit_mr(), the first one to
+ * reach the xa lock wins the race and destroys the MR.
+ */
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+ mr)
+ goto out_unlock;
+
+ atomic_inc(&imr->num_deferred_work);
+ call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+ free_implicit_child_mr_rcu);
+
+out_unlock:
+ xa_unlock(&imr->implicit_children);
+}
+
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(mni, struct ib_umem_odp, notifier);
struct mlx5_ib_mr *mr;
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
sizeof(struct mlx5_mtt)) - 1;
u64 idx = 0, blk_start_idx = 0;
+ u64 invalidations = 0;
+ unsigned long start;
+ unsigned long end;
int in_block = 0;
u64 addr;
- if (!umem || !umem->odp_data) {
- pr_err("invalidation called on NULL umem or non-ODP umem\n");
- return;
- }
+ if (!mmu_notifier_range_blockable(range))
+ return false;
- mr = umem->odp_data->private;
+ mutex_lock(&umem_odp->umem_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
+ /*
+ * If npages is zero then umem_odp->private may not be setup yet. This
+ * does not complete until after the first page is mapped for DMA.
+ */
+ if (!umem_odp->npages)
+ goto out;
+ mr = umem_odp->private;
- if (!mr || !mr->ibmr.pd)
- return;
-
- start = max_t(u64, ib_umem_start(umem), start);
- end = min_t(u64, ib_umem_end(umem), end);
+ start = max_t(u64, ib_umem_start(umem_odp), range->start);
+ end = min_t(u64, ib_umem_end(umem_odp), range->end);
/*
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -199,21 +319,23 @@
 * overwrite the same MTTs. Concurrent invalidations might race us,
* but they will write 0s as well, so no difference in the end result.
*/
-
- for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
- idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+ for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
+ idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
/*
* Strive to write the MTTs in chunks, but avoid overwriting
 * non-existing MTTs. The heuristic here can be improved to
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
- if (umem->odp_data->dma_list[idx] &
+ if (umem_odp->dma_list[idx] &
(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
}
+
+ /* Count page invalidations */
+ invalidations += idx - blk_start_idx + 1;
} else {
u64 umr_offset = idx & umr_block_mask;
@@ -231,21 +353,27 @@
idx - blk_start_idx + 1, 0,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
+
+ mlx5_update_odp_stats(mr, invalidations, invalidations);
+
/*
* We are now sure that the device will not access the
* memory. We can safely unmap it, and mark it as dirty if
* needed.
*/
- ib_umem_odp_unmap_dma_pages(umem, start, end);
+ ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
- if (unlikely(!umem->npages && mr->parent &&
- !umem->odp_data->dying)) {
- WRITE_ONCE(umem->odp_data->dying, 1);
- atomic_inc(&mr->parent->num_leaf_free);
- schedule_work(&umem->odp_data->work);
- }
+ if (unlikely(!umem_odp->npages && mr->parent))
+ destroy_unused_implicit_child_mr(mr);
+out:
+ mutex_unlock(&umem_odp->umem_mutex);
+ return true;
}
+
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+ .invalidate = mlx5_ib_invalidate_range,
+};
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
@@ -253,7 +381,8 @@
memset(caps, 0, sizeof(*caps));
- if (!MLX5_CAP_GEN(dev->mdev, pg))
+ if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+ !mlx5_ib_can_load_pas_with_umr(dev, 0))
return;
caps->general_caps = IB_ODP_SUPPORT;
@@ -265,6 +394,9 @@
if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+ if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+ caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
@@ -281,12 +413,32 @@
if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+ if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+ caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+
+ if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+ caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
MLX5_CAP_GEN(dev->mdev, null_mkey) &&
- MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+ MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+ !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
-
- return;
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
@@ -295,318 +447,402 @@
{
int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
pfault->wqe.wq_num : pfault->token;
- int ret = mlx5_core_page_fault_resume(dev->mdev,
- pfault->token,
- wq_num,
- pfault->type,
- error);
- if (ret)
- mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
- wq_num);
-}
-
-static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
- struct ib_umem *umem,
- bool ksm, int access_flags)
-{
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
- struct mlx5_ib_mr *mr;
+ u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
int err;
- mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
- MLX5_IMR_MTT_CACHE_ENTRY);
+ MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
+ MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
+ MLX5_SET(page_fault_resume_in, in, token, pfault->token);
+ MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+ MLX5_SET(page_fault_resume_in, in, error, !!error);
- if (IS_ERR(mr))
- return mr;
-
- mr->ibmr.pd = pd;
-
- mr->dev = dev;
- mr->access_flags = access_flags;
- mr->mmkey.iova = 0;
- mr->umem = umem;
-
- if (ksm) {
- err = mlx5_ib_update_xlt(mr, 0,
- mlx5_imr_ksm_entries,
- MLX5_KSM_PAGE_SHIFT,
- MLX5_IB_UPD_XLT_INDIRECT |
- MLX5_IB_UPD_XLT_ZAP |
- MLX5_IB_UPD_XLT_ENABLE);
-
- } else {
- err = mlx5_ib_update_xlt(mr, 0,
- MLX5_IMR_MTT_ENTRIES,
- PAGE_SHIFT,
- MLX5_IB_UPD_XLT_ZAP |
- MLX5_IB_UPD_XLT_ENABLE |
- MLX5_IB_UPD_XLT_ATOMIC);
- }
-
+ err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
if (err)
- goto fail;
-
- mr->ibmr.lkey = mr->mmkey.key;
- mr->ibmr.rkey = mr->mmkey.key;
-
- mr->live = 1;
-
- mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
- mr->mmkey.key, dev->mdev, mr);
-
- return mr;
-
-fail:
- mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
- mlx5_mr_cache_free(dev, mr);
-
- return ERR_PTR(err);
+ mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
+ wq_num, err);
}
-static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
- u64 io_virt, size_t bcnt)
+static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+ unsigned long idx)
{
- struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
- struct ib_umem_odp *odp, *result = NULL;
- u64 addr = io_virt & MLX5_IMR_MTT_MASK;
- int nentries = 0, start_idx = 0, ret;
- struct mlx5_ib_mr *mtt;
- struct ib_umem *umem;
+ struct ib_umem_odp *odp;
+ struct mlx5_ib_mr *mr;
+ struct mlx5_ib_mr *ret;
+ int err;
- mutex_lock(&mr->umem->odp_data->umem_mutex);
- odp = odp_lookup(ctx, addr, 1, mr);
+ odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
+ idx * MLX5_IMR_MTT_SIZE,
+ MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+ if (IS_ERR(odp))
+ return ERR_CAST(odp);
- mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
- io_virt, bcnt, addr, odp);
+ ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY,
+ imr->access_flags);
+ if (IS_ERR(mr))
+ goto out_umem;
-next_mr:
- if (likely(odp)) {
- if (nentries)
- nentries++;
- } else {
- umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
- if (IS_ERR(umem)) {
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
- return ERR_CAST(umem);
- }
+ mr->ibmr.pd = imr->ibmr.pd;
+ mr->umem = &odp->umem;
+ mr->ibmr.lkey = mr->mmkey.key;
+ mr->ibmr.rkey = mr->mmkey.key;
+ mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
+ mr->parent = imr;
+ odp->private = mr;
- mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
- if (IS_ERR(mtt)) {
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
- ib_umem_release(umem);
- return ERR_CAST(mtt);
- }
-
- odp = umem->odp_data;
- odp->private = mtt;
- mtt->umem = umem;
- mtt->mmkey.iova = addr;
- mtt->parent = mr;
- INIT_WORK(&odp->work, mr_leaf_free_action);
-
- if (!nentries)
- start_idx = addr >> MLX5_IMR_MTT_SHIFT;
- nentries++;
+ err = mlx5_ib_update_xlt(mr, 0,
+ MLX5_IMR_MTT_ENTRIES,
+ PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out_mr;
}
- /* Return first odp if region not covered by single one */
- if (likely(!result))
- result = odp;
-
- addr += MLX5_IMR_MTT_SIZE;
- if (unlikely(addr < io_virt + bcnt)) {
- odp = odp_next(odp);
- if (odp && odp->umem->address != addr)
- odp = NULL;
- goto next_mr;
- }
-
- if (unlikely(nentries)) {
- ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
- MLX5_IB_UPD_XLT_INDIRECT |
- MLX5_IB_UPD_XLT_ATOMIC);
- if (ret) {
- mlx5_ib_err(dev, "Failed to update PAS\n");
- result = ERR_PTR(ret);
+ /*
+ * Once the store to either xarray completes any error unwind has to
+ * use synchronize_srcu(). Avoid this with xa_reserve()
+ */
+ ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+ GFP_KERNEL);
+ if (unlikely(ret)) {
+ if (xa_is_err(ret)) {
+ ret = ERR_PTR(xa_err(ret));
+ goto out_mr;
}
+ /*
+ * Another thread beat us to creating the child mr, use
+ * theirs.
+ */
+ goto out_mr;
}
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
- return result;
+ mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
+ return mr;
+
+out_mr:
+ mlx5_mr_cache_free(imr->dev, mr);
+out_umem:
+ ib_umem_odp_release(odp);
+ return ret;
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ struct ib_udata *udata,
int access_flags)
{
- struct ib_ucontext *ctx = pd->ibpd.uobject->context;
+ struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
+ struct ib_umem_odp *umem_odp;
struct mlx5_ib_mr *imr;
- struct ib_umem *umem;
+ int err;
- umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
- if (IS_ERR(umem))
- return ERR_CAST(umem);
+ umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
+ if (IS_ERR(umem_odp))
+ return ERR_CAST(umem_odp);
- imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+ imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
if (IS_ERR(imr)) {
- ib_umem_release(umem);
- return ERR_CAST(imr);
+ err = PTR_ERR(imr);
+ goto out_umem;
}
- imr->umem = umem;
- init_waitqueue_head(&imr->q_leaf_free);
- atomic_set(&imr->num_leaf_free, 0);
+ imr->ibmr.pd = &pd->ibpd;
+ imr->mmkey.iova = 0;
+ imr->umem = &umem_odp->umem;
+ imr->ibmr.lkey = imr->mmkey.key;
+ imr->ibmr.rkey = imr->mmkey.key;
+ imr->umem = &umem_odp->umem;
+ imr->is_odp_implicit = true;
+ atomic_set(&imr->num_deferred_work, 0);
+ init_waitqueue_head(&imr->q_deferred_work);
+ xa_init(&imr->implicit_children);
+ err = mlx5_ib_update_xlt(imr, 0,
+ mlx5_imr_ksm_entries,
+ MLX5_KSM_PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE);
+ if (err)
+ goto out_mr;
+
+ err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
+ &imr->mmkey, GFP_KERNEL));
+ if (err)
+ goto out_mr;
+
+ mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
return imr;
-}
-
-static int mr_leaf_free(struct ib_umem *umem, u64 start,
- u64 end, void *cookie)
-{
- struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
-
- if (mr->parent != imr)
- return 0;
-
- ib_umem_odp_unmap_dma_pages(umem,
- ib_umem_start(umem),
- ib_umem_end(umem));
-
- if (umem->odp_data->dying)
- return 0;
-
- WRITE_ONCE(umem->odp_data->dying, 1);
- atomic_inc(&imr->num_leaf_free);
- schedule_work(&umem->odp_data->work);
-
- return 0;
+out_mr:
+ mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+ mlx5_mr_cache_free(dev, imr);
+out_umem:
+ ib_umem_odp_release(umem_odp);
+ return ERR_PTR(err);
}
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
- struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+ struct mlx5_ib_dev *dev = imr->dev;
+ struct list_head destroy_list;
+ struct mlx5_ib_mr *mtt;
+ struct mlx5_ib_mr *tmp;
+ unsigned long idx;
- down_read(&ctx->umem_rwsem);
- rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
- mr_leaf_free, true, imr);
- up_read(&ctx->umem_rwsem);
+ INIT_LIST_HEAD(&destroy_list);
- wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+ xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+ /*
+ * This stops the SRCU protected page fault path from touching either
+ * the imr or any children. The page fault path can only reach the
+ * children xarray via the imr.
+ */
+ synchronize_srcu(&dev->odp_srcu);
+
+ /*
+ * All work on the prefetch list must be completed, xa_erase() prevented
+ * new work from being created.
+ */
+ wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
+
+ /*
+ * At this point it is forbidden for any other thread to enter
+ * pagefault_mr() on this imr. It is already forbidden to call
+ * pagefault_mr() on an implicit child. Due to this additions to
+ * implicit_children are prevented.
+ */
+
+ /*
+ * Block destroy_unused_implicit_child_mr() from incrementing
+ * num_deferred_work.
+ */
+ xa_lock(&imr->implicit_children);
+ xa_for_each (&imr->implicit_children, idx, mtt) {
+ __xa_erase(&imr->implicit_children, idx);
+ list_add(&mtt->odp_destroy.elm, &destroy_list);
+ }
+ xa_unlock(&imr->implicit_children);
+
+ /*
+ * Wait for any concurrent destroy_unused_implicit_child_mr() to
+ * complete.
+ */
+ wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
+
+ /*
+ * Fence the imr before we destroy the children. This allows us to
+ * skip updating the XLT of the imr during destroy of the child mkey
+ * the imr points to.
+ */
+ mlx5_mr_cache_invalidate(imr);
+
+ list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+ free_implicit_child_mr(mtt, false);
+
+ mlx5_mr_cache_free(dev, imr);
+ ib_umem_odp_release(odp_imr);
}
-static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
- u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+/**
+ * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
+ /* Prevent new page faults and prefetch requests from succeeding */
+ xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&mr->dev->odp_srcu);
+
+ wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+
+ dma_fence_odp_mr(mr);
+}
+
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
+#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
+#define MLX5_PF_FLAGS_ENABLE BIT(3)
+static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
+ u64 user_va, size_t bcnt, u32 *bytes_mapped,
+ u32 flags)
+{
+ int page_shift, ret, np;
+ bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
u64 access_mask;
- int npages = 0, page_shift, np;
- u64 start_idx, page_mask;
- struct ib_umem_odp *odp;
- int current_seq;
- size_t size;
- int ret;
+ u64 start_idx;
+ bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
+ u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
- if (!mr->umem->odp_data->page_list) {
- odp = implicit_mr_get_data(mr, io_virt, bcnt);
+ if (flags & MLX5_PF_FLAGS_ENABLE)
+ xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
- if (IS_ERR(odp))
- return PTR_ERR(odp);
- mr = odp->private;
-
- } else {
- odp = mr->umem->odp_data;
- }
-
-next_mr:
- size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
-
- page_shift = mr->umem->page_shift;
- page_mask = ~(BIT(page_shift) - 1);
- start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+ page_shift = odp->page_shift;
+ start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
- if (mr->umem->writable)
+ if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
- current_seq = READ_ONCE(odp->notifiers_seq);
+ np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
+ if (np < 0)
+ return np;
+
/*
- * Ensure the sequence number is valid for some time before we call
- * gup.
+ * No need to check whether the MTTs really belong to this MR, since
+ * ib_umem_odp_map_dma_and_lock already checks this.
*/
- smp_rmb();
-
- ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
- access_mask, current_seq);
-
- if (ret < 0)
- goto out;
-
- np = ret;
-
- mutex_lock(&odp->umem_mutex);
- if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
- /*
- * No need to check whether the MTTs really belong to
- * this MR, since ib_umem_odp_map_dma_pages already
- * checks this.
- */
- ret = mlx5_ib_update_xlt(mr, start_idx, np,
- page_shift, MLX5_IB_UPD_XLT_ATOMIC);
- } else {
- ret = -EAGAIN;
- }
+ ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
mutex_unlock(&odp->umem_mutex);
if (ret < 0) {
if (ret != -EAGAIN)
- mlx5_ib_err(dev, "Failed to update mkey page tables\n");
+ mlx5_ib_err(mr->dev,
+ "Failed to update mkey page tables\n");
goto out;
}
if (bytes_mapped) {
u32 new_mappings = (np << page_shift) -
- (io_virt - round_down(io_virt, 1 << page_shift));
- *bytes_mapped += min_t(u32, new_mappings, size);
+ (user_va - round_down(user_va, 1 << page_shift));
+
+ *bytes_mapped += min_t(u32, new_mappings, bcnt);
}
- npages += np << (page_shift - PAGE_SHIFT);
- bcnt -= size;
-
- if (unlikely(bcnt)) {
- struct ib_umem_odp *next;
-
- io_virt += size;
- next = odp_next(odp);
- if (unlikely(!next || next->umem->address != io_virt)) {
- mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
- io_virt, next);
- return -EAGAIN;
- }
- odp = next;
- mr = odp->private;
- goto next_mr;
- }
-
- return npages;
+ return np << (page_shift - PAGE_SHIFT);
out:
- if (ret == -EAGAIN) {
- if (mr->parent || !odp->dying) {
- unsigned long timeout =
- msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+ return ret;
+}
- if (!wait_for_completion_timeout(
- &odp->notifier_completion,
- timeout)) {
- mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
- current_seq, odp->notifiers_seq);
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
+ struct ib_umem_odp *odp_imr, u64 user_va,
+ size_t bcnt, u32 *bytes_mapped, u32 flags)
+{
+ unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+ unsigned long upd_start_idx = end_idx + 1;
+ unsigned long upd_len = 0;
+ unsigned long npages = 0;
+ int err;
+ int ret;
+
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
+ mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+ return -EFAULT;
+
+ /* Fault each child mr that intersects with our interval. */
+ while (bcnt) {
+ unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+ struct ib_umem_odp *umem_odp;
+ struct mlx5_ib_mr *mtt;
+ u64 len;
+
+ mtt = xa_load(&imr->implicit_children, idx);
+ if (unlikely(!mtt)) {
+ mtt = implicit_get_child_mr(imr, idx);
+ if (IS_ERR(mtt)) {
+ ret = PTR_ERR(mtt);
+ goto out;
}
- } else {
- /* The MR is being killed, kill the QP as well. */
- ret = -EFAULT;
+ upd_start_idx = min(upd_start_idx, idx);
+ upd_len = idx - upd_start_idx + 1;
}
+
+ umem_odp = to_ib_umem_odp(mtt->umem);
+ len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
+ user_va;
+
+ ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
+ bytes_mapped, flags);
+ if (ret < 0)
+ goto out;
+ user_va += len;
+ bcnt -= len;
+ npages += ret;
}
+ ret = npages;
+
+ /*
+ * Any time the implicit_children are changed we must perform an
+ * update of the xlt before exiting to ensure the HW and the
+ * implicit_children remains synchronized.
+ */
+out:
+ if (likely(!upd_len))
+ return ret;
+
+ /*
+ * Notice this is not strictly ordered right, the KSM is updated after
+ * the implicit_children is updated, so a parallel page fault could
+ * see a MR that is not yet visible in the KSM. This is similar to a
+ * parallel page fault seeing a MR that is being concurrently removed
+ * from the KSM. Both of these improbable situations are resolved
+ * safely by resuming the HW and then taking another page fault. The
+ * next pagefault handler will see the new information.
+ */
+ mutex_lock(&odp_imr->umem_mutex);
+ err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ mutex_unlock(&odp_imr->umem_mutex);
+ if (err) {
+ mlx5_ib_err(imr->dev, "Failed to update PAS\n");
+ return err;
+ }
return ret;
+}
+
+/*
+ * Returns:
+ * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
+ * not accessible, or the MR is no longer valid.
+ * -EAGAIN/-ENOMEM: The operation should be retried
+ *
+ * -EINVAL/others: General internal malfunction
+ * >0: Number of pages mapped
+ */
+static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
+ u32 *bytes_mapped, u32 flags)
+{
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+ lockdep_assert_held(&mr->dev->odp_srcu);
+ if (unlikely(io_virt < mr->mmkey.iova))
+ return -EFAULT;
+
+ if (!odp->is_implicit_odp) {
+ u64 user_va;
+
+ if (check_add_overflow(io_virt - mr->mmkey.iova,
+ (u64)odp->umem.address, &user_va))
+ return -EFAULT;
+ if (unlikely(user_va >= ib_umem_end(odp) ||
+ ib_umem_end(odp) - user_va < bcnt))
+ return -EFAULT;
+ return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
+ flags);
+ }
+ return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+ flags);
+}
+
+int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
+{
+ u32 flags = MLX5_PF_FLAGS_SNAPSHOT;
+ int ret;
+
+ if (enable)
+ flags |= MLX5_PF_FLAGS_ENABLE;
+
+ ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem),
+ mr->umem->address, mr->umem->length, NULL,
+ flags);
+ return ret >= 0 ? 0 : ret;
}
struct pf_frame {
@@ -616,6 +852,30 @@
size_t bcnt;
int depth;
};
+
+static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
+{
+ if (!mmkey)
+ return false;
+ if (mmkey->type == MLX5_MKEY_MW)
+ return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
+ return mmkey->key == key;
+}
+
+static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
+{
+ struct mlx5_ib_mw *mw;
+ struct mlx5_ib_devx_mr *devx_mr;
+
+ if (mmkey->type == MLX5_MKEY_MW) {
+ mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+ return mw->ndescs;
+ }
+
+ devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
+ mmkey);
+ return devx_mr->ndescs;
+}
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
@@ -629,27 +889,43 @@
* abort the page fault handling.
*/
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
- u32 key, u64 io_virt, size_t bcnt,
+ struct ib_pd *pd, u32 key,
+ u64 io_virt, size_t bcnt,
u32 *bytes_committed,
u32 *bytes_mapped)
{
int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
struct pf_frame *head = NULL, *frame;
struct mlx5_core_mkey *mmkey;
- struct mlx5_ib_mw *mw;
struct mlx5_ib_mr *mr;
struct mlx5_klm *pklm;
u32 *out = NULL;
size_t offset;
+ int ndescs;
- srcu_key = srcu_read_lock(&dev->mr_srcu);
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
next_mr:
- mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
- if (!mmkey || mmkey->key != key) {
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+ if (!mmkey) {
+ mlx5_ib_dbg(
+ dev,
+ "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
+ if (bytes_mapped)
+ *bytes_mapped += bcnt;
+ /*
+ * The user could specify a SGL with multiple lkeys and only
+ * some of them are ODP. Treat the non-ODP ones as fully
+ * faulted.
+ */
+ ret = 0;
+ goto srcu_unlock;
+ }
+ if (!mkey_is_eq(mmkey, key)) {
mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
ret = -EFAULT;
goto srcu_unlock;
@@ -658,22 +934,20 @@
switch (mmkey->type) {
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
- if (!mr->live || !mr->ibmr.pd) {
- mlx5_ib_dbg(dev, "got dead MR\n");
- ret = -EFAULT;
- goto srcu_unlock;
- }
- ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+ ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
if (ret < 0)
goto srcu_unlock;
+
+ mlx5_update_odp_stats(mr, faults, ret);
npages += ret;
ret = 0;
break;
case MLX5_MKEY_MW:
- mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+ case MLX5_MKEY_INDIRECT_DEVX:
+ ndescs = get_indirect_num_descs(mmkey);
if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
mlx5_ib_dbg(dev, "indirection level exceeded\n");
@@ -682,7 +956,7 @@
}
outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
- sizeof(*pklm) * (mw->ndescs - 2);
+ sizeof(*pklm) * (ndescs - 2);
if (outlen > cur_outlen) {
kfree(out);
@@ -697,14 +971,14 @@
pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
bsf0_klm0_pas_mtt0_1);
- ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
+ ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
if (ret)
goto srcu_unlock;
offset = io_virt - MLX5_GET64(query_mkey_out, out,
memory_key_mkey_entry.start_addr);
- for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
+ for (i = 0; bcnt && i < ndescs; i++, pklm++) {
if (offset >= be32_to_cpu(pklm->bcount)) {
offset -= be32_to_cpu(pklm->bcount);
continue;
@@ -756,7 +1030,7 @@
}
kfree(out);
- srcu_read_unlock(&dev->mr_srcu, srcu_key);
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
*bytes_committed = 0;
return ret ? ret : npages;
}
@@ -764,7 +1038,6 @@
/**
* Parse a series of data segments for page fault handling.
*
- * @qp the QP on which the fault occurred.
* @pfault contains page fault information.
* @wqe points at the first data segment in the WQE.
* @wqe_end points after the end of the WQE.
@@ -781,9 +1054,9 @@
*/
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault,
- struct mlx5_ib_qp *qp, void *wqe,
+ void *wqe,
void *wqe_end, u32 *bytes_mapped,
- u32 *total_wqe_bytes, int receive_queue)
+ u32 *total_wqe_bytes, bool receive_queue)
{
int ret = 0, npages = 0;
u64 io_virt;
@@ -791,10 +1064,6 @@
u32 byte_count;
size_t bcnt;
int inline_segment;
-
- /* Skip SRQ next-WQE segment. */
- if (receive_queue && qp->ibqp.srq)
- wqe += sizeof(struct mlx5_wqe_srq_next_seg);
if (bytes_mapped)
*bytes_mapped = 0;
@@ -839,7 +1108,8 @@
continue;
}
- ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+ ret = pagefault_single_data_segment(dev, NULL, key,
+ io_virt, bcnt,
&pfault->bytes_committed,
bytes_mapped);
if (ret < 0)
@@ -849,17 +1119,6 @@
return ret < 0 ? ret : npages;
}
-
-static const u32 mlx5_ib_odp_opcode_cap[] = {
- [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND,
- [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE,
- [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE,
- [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ,
- [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC,
- [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC,
-};
/*
* Parse initiator WQE. Advances the wqe pointer to point at the
@@ -871,12 +1130,8 @@
{
struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
u16 wqe_index = pfault->wqe.wqe_index;
- u32 transport_caps;
struct mlx5_base_av *av;
unsigned ds, opcode;
-#if defined(DEBUG)
- u32 ctrl_wqe_index, ctrl_qpn;
-#endif
u32 qpn = qp->trans_qp.base.mqp.qpn;
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
@@ -892,54 +1147,16 @@
return -EFAULT;
}
-#if defined(DEBUG)
- ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
- MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
- MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
- if (wqe_index != ctrl_wqe_index) {
- mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
- wqe_index, qpn,
- ctrl_wqe_index);
- return -EFAULT;
- }
-
- ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
- MLX5_WQE_CTRL_QPN_SHIFT;
- if (qpn != ctrl_qpn) {
- mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
- wqe_index, qpn,
- ctrl_qpn);
- return -EFAULT;
- }
-#endif /* DEBUG */
-
*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
*wqe += sizeof(*ctrl);
opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
MLX5_WQE_CTRL_OPCODE_MASK;
- switch (qp->ibqp.qp_type) {
- case IB_QPT_RC:
- transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
- break;
- case IB_QPT_UD:
- transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
- break;
- default:
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
- qp->ibqp.qp_type);
- return -EFAULT;
- }
+ if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
+ *wqe += sizeof(struct mlx5_wqe_xrc_seg);
- if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
- !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
- opcode);
- return -EFAULT;
- }
-
- if (qp->ibqp.qp_type != IB_QPT_RC) {
+ if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
av = *wqe;
if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
*wqe += sizeof(struct mlx5_av);
@@ -964,22 +1181,35 @@
}
/*
- * Parse responder WQE. Advances the wqe pointer to point at the
- * scatter-gather list, and set wqe_end to the end of the WQE.
+ * Parse responder WQE and set wqe_end to the end of the WQE.
*/
-static int mlx5_ib_mr_responder_pfault_handler(
- struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
- struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
+static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_srq *srq,
+ void **wqe, void **wqe_end,
+ int wqe_length)
+{
+ int wqe_size = 1 << srq->msrq.wqe_shift;
+
+ if (wqe_size > wqe_length) {
+ mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
+ return -EFAULT;
+ }
+
+ *wqe_end = *wqe + wqe_size;
+ *wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+ return 0;
+}
+
+static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_qp *qp,
+ void *wqe, void **wqe_end,
+ int wqe_length)
{
struct mlx5_ib_wq *wq = &qp->rq;
int wqe_size = 1 << wq->wqe_shift;
- if (qp->ibqp.srq) {
- mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
- return -EFAULT;
- }
-
- if (qp->wq_sig) {
+ if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
return -EFAULT;
}
@@ -989,99 +1219,138 @@
return -EFAULT;
}
- switch (qp->ibqp.qp_type) {
- case IB_QPT_RC:
- if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
- IB_ODP_SUPPORT_RECV))
- goto invalid_transport_or_opcode;
- break;
- default:
-invalid_transport_or_opcode:
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
- qp->ibqp.qp_type);
- return -EFAULT;
- }
-
- *wqe_end = *wqe + wqe_size;
+ *wqe_end = wqe + wqe_size;
return 0;
}
-static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
- u32 wq_num)
+static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
+ u32 wq_num, int pf_type)
{
- struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
+ struct mlx5_core_rsc_common *common = NULL;
+ struct mlx5_core_srq *srq;
- if (!mqp) {
- mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
- return NULL;
+ switch (pf_type) {
+ case MLX5_WQE_PF_TYPE_RMP:
+ srq = mlx5_cmd_get_srq(dev, wq_num);
+ if (srq)
+ common = &srq->common;
+ break;
+ case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
+ case MLX5_WQE_PF_TYPE_RESP:
+ case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
+ common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
+ break;
+ default:
+ break;
}
+ return common;
+}
+
+static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
+{
+ struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
+
return to_mibqp(mqp);
+}
+
+static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
+{
+ struct mlx5_core_srq *msrq =
+ container_of(res, struct mlx5_core_srq, common);
+
+ return to_mibsrq(msrq);
}
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault)
{
- int ret;
- void *wqe, *wqe_end;
- u32 bytes_mapped, total_wqe_bytes;
- char *buffer = NULL;
- int resume_with_error = 1;
+ bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
u16 wqe_index = pfault->wqe.wqe_index;
- int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
+ void *wqe, *wqe_start = NULL, *wqe_end = NULL;
+ u32 bytes_mapped, total_wqe_bytes;
+ struct mlx5_core_rsc_common *res;
+ int resume_with_error = 1;
struct mlx5_ib_qp *qp;
+ size_t bytes_copied;
+ int ret = 0;
- buffer = (char *)__get_free_page(GFP_KERNEL);
- if (!buffer) {
+ res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
+ if (!res) {
+ mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
+ return;
+ }
+
+ if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
+ res->res != MLX5_RES_XSRQ) {
+ mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
+ pfault->type);
+ goto resolve_page_fault;
+ }
+
+ wqe_start = (void *)__get_free_page(GFP_KERNEL);
+ if (!wqe_start) {
mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
goto resolve_page_fault;
}
- qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
- if (!qp)
- goto resolve_page_fault;
+ wqe = wqe_start;
+ qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
+ if (qp && sq) {
+ ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
+ if (ret)
+ goto read_user;
+ ret = mlx5_ib_mr_initiator_pfault_handler(
+ dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
+ } else if (qp && !sq) {
+ ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
+ if (ret)
+ goto read_user;
+ ret = mlx5_ib_mr_responder_pfault_handler_rq(
+ dev, qp, wqe, &wqe_end, bytes_copied);
+ } else if (!qp) {
+ struct mlx5_ib_srq *srq = res_to_srq(res);
- ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
- PAGE_SIZE, &qp->trans_qp.base);
- if (ret < 0) {
- mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
- ret, wqe_index, pfault->token);
- goto resolve_page_fault;
+ ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
+ &bytes_copied);
+ if (ret)
+ goto read_user;
+ ret = mlx5_ib_mr_responder_pfault_handler_srq(
+ dev, srq, &wqe, &wqe_end, bytes_copied);
}
- wqe = buffer;
- if (requestor)
- ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
- &wqe_end, ret);
- else
- ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
- &wqe_end, ret);
- if (ret < 0)
+ if (ret < 0 || wqe >= wqe_end)
goto resolve_page_fault;
- if (wqe >= wqe_end) {
- mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
- goto resolve_page_fault;
- }
+ ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
+ &total_wqe_bytes, !sq);
+ if (ret == -EAGAIN)
+ goto out;
- ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
- &bytes_mapped, &total_wqe_bytes,
- !requestor);
- if (ret == -EAGAIN) {
- resume_with_error = 0;
+ if (ret < 0 || total_wqe_bytes > bytes_mapped)
goto resolve_page_fault;
- } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
- goto resolve_page_fault;
- }
+out:
+ ret = 0;
resume_with_error = 0;
+
+read_user:
+ if (ret)
+ mlx5_ib_err(
+ dev,
+ "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
+ ret, wqe_index, pfault->token);
+
resolve_page_fault:
mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
pfault->wqe.wq_num, resume_with_error,
pfault->type);
- free_page((unsigned long)buffer);
+ mlx5_core_res_put(res);
+ free_page((unsigned long)wqe_start);
}
static int pages_in_range(u64 address, u32 length)
@@ -1123,7 +1392,7 @@
prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
}
- ret = pagefault_single_data_segment(dev, rkey, address, length,
+ ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
&pfault->bytes_committed, NULL);
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
@@ -1149,7 +1418,7 @@
if (prefetch_activated) {
u32 bytes_committed = 0;
- ret = pagefault_single_data_segment(dev, rkey, address,
+ ret = pagefault_single_data_segment(dev, NULL, rkey, address,
prefetch_len,
&bytes_committed, NULL);
if (ret < 0 && ret != -EAGAIN) {
@@ -1159,10 +1428,8 @@
}
}
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
- struct mlx5_pagefault *pfault)
+static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
- struct mlx5_ib_dev *dev = context;
u8 event_subtype = pfault->event_subtype;
switch (event_subtype) {
@@ -1177,6 +1444,212 @@
event_subtype);
mlx5_ib_page_fault_resume(dev, pfault, 1);
}
+}
+
+static void mlx5_ib_eqe_pf_action(struct work_struct *work)
+{
+ struct mlx5_pagefault *pfault = container_of(work,
+ struct mlx5_pagefault,
+ work);
+ struct mlx5_ib_pf_eq *eq = pfault->eq;
+
+ mlx5_ib_pfault(eq->dev, pfault);
+ mempool_free(pfault, eq->pool);
+}
+
+static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
+{
+ struct mlx5_eqe_page_fault *pf_eqe;
+ struct mlx5_pagefault *pfault;
+ struct mlx5_eqe *eqe;
+ int cc = 0;
+
+ while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
+ pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
+ if (!pfault) {
+ schedule_work(&eq->work);
+ break;
+ }
+
+ pf_eqe = &eqe->data.page_fault;
+ pfault->event_subtype = eqe->sub_type;
+ pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+ eqe->sub_type, pfault->bytes_committed);
+
+ switch (eqe->sub_type) {
+ case MLX5_PFAULT_SUBTYPE_RDMA:
+ /* RDMA based event */
+ pfault->type =
+ be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+ pfault->token =
+ be32_to_cpu(pf_eqe->rdma.pftype_token) &
+ MLX5_24BIT_MASK;
+ pfault->rdma.r_key =
+ be32_to_cpu(pf_eqe->rdma.r_key);
+ pfault->rdma.packet_size =
+ be16_to_cpu(pf_eqe->rdma.packet_length);
+ pfault->rdma.rdma_op_len =
+ be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+ pfault->rdma.rdma_va =
+ be64_to_cpu(pf_eqe->rdma.rdma_va);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+ pfault->type, pfault->token,
+ pfault->rdma.r_key);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+ pfault->rdma.rdma_op_len,
+ pfault->rdma.rdma_va);
+ break;
+
+ case MLX5_PFAULT_SUBTYPE_WQE:
+ /* WQE based event */
+ pfault->type =
+ (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
+ pfault->token =
+ be32_to_cpu(pf_eqe->wqe.token);
+ pfault->wqe.wq_num =
+ be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+ MLX5_24BIT_MASK;
+ pfault->wqe.wqe_index =
+ be16_to_cpu(pf_eqe->wqe.wqe_index);
+ pfault->wqe.packet_size =
+ be16_to_cpu(pf_eqe->wqe.packet_length);
+ mlx5_ib_dbg(eq->dev,
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+ pfault->type, pfault->token,
+ pfault->wqe.wq_num,
+ pfault->wqe.wqe_index);
+ break;
+
+ default:
+ mlx5_ib_warn(eq->dev,
+ "Unsupported page fault event sub-type: 0x%02hhx\n",
+ eqe->sub_type);
+ /* Unsupported page faults should still be
+ * resolved by the page fault handler
+ */
+ }
+
+ pfault->eq = eq;
+ INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
+ queue_work(eq->wq, &pfault->work);
+
+ cc = mlx5_eq_update_cc(eq->core, ++cc);
+ }
+
+ mlx5_eq_update_ci(eq->core, cc, 1);
+}
+
+static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
+ void *data)
+{
+ struct mlx5_ib_pf_eq *eq =
+ container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
+ unsigned long flags;
+
+ if (spin_trylock_irqsave(&eq->lock, flags)) {
+ mlx5_ib_eq_pf_process(eq);
+ spin_unlock_irqrestore(&eq->lock, flags);
+ } else {
+ schedule_work(&eq->work);
+ }
+
+ return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Cheap workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+ while (pool->curr_nr < pool->min_nr)
+ mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void mlx5_ib_eq_pf_action(struct work_struct *work)
+{
+ struct mlx5_ib_pf_eq *eq =
+ container_of(work, struct mlx5_ib_pf_eq, work);
+
+ mempool_refill(eq->pool);
+
+ spin_lock_irq(&eq->lock);
+ mlx5_ib_eq_pf_process(eq);
+ spin_unlock_irq(&eq->lock);
+}
+
+enum {
+ MLX5_IB_NUM_PF_EQE = 0x1000,
+ MLX5_IB_NUM_PF_DRAIN = 64,
+};
+
+static int
+mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+ struct mlx5_eq_param param = {};
+ int err;
+
+ INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
+ spin_lock_init(&eq->lock);
+ eq->dev = dev;
+
+ eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
+ sizeof(struct mlx5_pagefault));
+ if (!eq->pool)
+ return -ENOMEM;
+
+ eq->wq = alloc_workqueue("mlx5_ib_page_fault",
+ WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
+ MLX5_NUM_CMD_EQE);
+ if (!eq->wq) {
+ err = -ENOMEM;
+ goto err_mempool;
+ }
+
+ eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
+ param = (struct mlx5_eq_param) {
+ .irq_index = 0,
+ .nent = MLX5_IB_NUM_PF_EQE,
+ };
+ param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
+ eq->core = mlx5_eq_create_generic(dev->mdev, &param);
+ if (IS_ERR(eq->core)) {
+ err = PTR_ERR(eq->core);
+ goto err_wq;
+ }
+ err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
+ if (err) {
+ mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
+ goto err_eq;
+ }
+
+ return 0;
+err_eq:
+ mlx5_eq_destroy_generic(dev->mdev, eq->core);
+err_wq:
+ destroy_workqueue(eq->wq);
+err_mempool:
+ mempool_destroy(eq->pool);
+ return err;
+}
+
+static int
+mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
+{
+ int err;
+
+ mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
+ err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
+ cancel_work_sync(&eq->work);
+ destroy_workqueue(eq->wq);
+ mempool_destroy(eq->pool);
+
+ return err;
}
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
@@ -1205,9 +1678,18 @@
}
}
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+ .advise_mr = mlx5_ib_advise_mr,
+};
+
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
- int ret;
+ int ret = 0;
+
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+ return ret;
+
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
@@ -1217,7 +1699,17 @@
}
}
- return 0;
+ ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
+
+ return ret;
+}
+
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
+{
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
+ return;
+
+ mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}
int mlx5_ib_odp_init(void)
@@ -1228,3 +1720,175 @@
return 0;
}
+struct prefetch_mr_work {
+ struct work_struct work;
+ u32 pf_flags;
+ u32 num_sge;
+ struct {
+ u64 io_virt;
+ struct mlx5_ib_mr *mr;
+ size_t length;
+ } frags[];
+};
+
+static void destroy_prefetch_work(struct prefetch_mr_work *work)
+{
+ u32 i;
+
+ for (i = 0; i < work->num_sge; ++i)
+ if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
+ wake_up(&work->frags[i].mr->q_deferred_work);
+ kvfree(work);
+}
+
+static struct mlx5_ib_mr *
+get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+ u32 lkey)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_core_mkey *mmkey;
+ struct ib_umem_odp *odp;
+ struct mlx5_ib_mr *mr;
+
+ lockdep_assert_held(&dev->odp_srcu);
+
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
+ if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
+ return NULL;
+
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+ if (mr->ibmr.pd != pd)
+ return NULL;
+
+ odp = to_ib_umem_odp(mr->umem);
+
+ /* prefetch with write-access must be supported by the MR */
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+ !odp->umem.writable)
+ return NULL;
+
+ return mr;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
+{
+ struct prefetch_mr_work *work =
+ container_of(w, struct prefetch_mr_work, work);
+ struct mlx5_ib_dev *dev;
+ u32 bytes_mapped = 0;
+ int srcu_key;
+ int ret;
+ u32 i;
+
+ /* We rely on IB/core to execute this work only when num_sge != 0. */
+ WARN_ON(!work->num_sge);
+ dev = work->frags[0].mr->dev;
+ /* SRCU should be held when calling to mlx5_odp_populate_xlt() */
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
+ for (i = 0; i < work->num_sge; ++i) {
+ ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
+ work->frags[i].length, &bytes_mapped,
+ work->pf_flags);
+ if (ret <= 0)
+ continue;
+ mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
+ }
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+
+ destroy_prefetch_work(work);
+}
+
+static bool init_prefetch_work(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 pf_flags, struct prefetch_mr_work *work,
+ struct ib_sge *sg_list, u32 num_sge)
+{
+ u32 i;
+
+ INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+ work->pf_flags = pf_flags;
+
+ for (i = 0; i < num_sge; ++i) {
+ work->frags[i].io_virt = sg_list[i].addr;
+ work->frags[i].length = sg_list[i].length;
+ work->frags[i].mr =
+ get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+ if (!work->frags[i].mr) {
+ work->num_sge = i;
+ return false;
+ }
+
+ /* Keep the MR pointer valid outside the SRCU */
+ atomic_inc(&work->frags[i].mr->num_deferred_work);
+ }
+ work->num_sge = num_sge;
+ return true;
+}
+
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 pf_flags, struct ib_sge *sg_list,
+ u32 num_sge)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ u32 bytes_mapped = 0;
+ int srcu_key;
+ int ret = 0;
+ u32 i;
+
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
+ for (i = 0; i < num_sge; ++i) {
+ struct mlx5_ib_mr *mr;
+
+ mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+ if (!mr) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
+ &bytes_mapped, pf_flags);
+ if (ret < 0)
+ goto out;
+ mlx5_update_odp_stats(mr, prefetch, ret);
+ }
+ ret = 0;
+
+out:
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ return ret;
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice,
+ u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ u32 pf_flags = 0;
+ struct prefetch_mr_work *work;
+ int srcu_key;
+
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+ pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+ pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
+
+ if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+ return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
+ num_sge);
+
+ work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
+ if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ destroy_prefetch_work(work);
+ return -EINVAL;
+ }
+ queue_work(system_unbound_wq, &work->work);
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
+ return 0;
+}
--
Gitblit v1.6.2