hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/infiniband/hw/mlx5/odp.c
@@ -36,6 +36,47 @@
 
 #include "mlx5_ib.h"
 #include "cmd.h"
+#include "qp.h"
+
+#include <linux/mlx5/eq.h>
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+	u32 bytes_committed;
+	u32 token;
+	u8 event_subtype;
+	u8 type;
+	union {
+		/* Initiator or send message responder pagefault details. */
+		struct {
+			/* Received packet size, only valid for responders. */
+			u32 packet_size;
+			/*
+			 * Number of resource holding WQE, depends on type.
+			 */
+			u32 wq_num;
+			/*
+			 * WQE index. Refers to either the send queue or
+			 * receive queue, according to event_subtype.
+			 */
+			u16 wqe_index;
+		} wqe;
+		/* RDMA responder pagefault details */
+		struct {
+			u32 r_key;
+			/*
+			 * Received packet size, minimal size page fault
+			 * resolution required for forward progress.
+			 */
+			u32 packet_size;
+			u32 rdma_op_len;
+			u64 rdma_va;
+		} rdma;
+	};
+
+	struct mlx5_ib_pf_eq *eq;
+	struct work_struct work;
+};
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
@@ -53,145 +94,224 @@
5394
5495 static u64 mlx5_imr_ksm_entries;
5596
56
-static int check_parent(struct ib_umem_odp *odp,
57
- struct mlx5_ib_mr *parent)
97
+static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
98
+ struct mlx5_ib_mr *imr, int flags)
5899 {
59
- struct mlx5_ib_mr *mr = odp->private;
60
-
61
- return mr && mr->parent == parent && !odp->dying;
62
-}
63
-
64
-static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
65
-{
66
- struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
67
- struct ib_ucontext *ctx = odp->umem->context;
68
- struct rb_node *rb;
69
-
70
- down_read(&ctx->umem_rwsem);
71
- while (1) {
72
- rb = rb_next(&odp->interval_tree.rb);
73
- if (!rb)
74
- goto not_found;
75
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
76
- if (check_parent(odp, parent))
77
- goto end;
78
- }
79
-not_found:
80
- odp = NULL;
81
-end:
82
- up_read(&ctx->umem_rwsem);
83
- return odp;
84
-}
85
-
86
-static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
87
- u64 start, u64 length,
88
- struct mlx5_ib_mr *parent)
89
-{
90
- struct ib_umem_odp *odp;
91
- struct rb_node *rb;
92
-
93
- down_read(&ctx->umem_rwsem);
94
- odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
95
- if (!odp)
96
- goto end;
97
-
98
- while (1) {
99
- if (check_parent(odp, parent))
100
- goto end;
101
- rb = rb_next(&odp->interval_tree.rb);
102
- if (!rb)
103
- goto not_found;
104
- odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
105
- if (ib_umem_start(odp->umem) > start + length)
106
- goto not_found;
107
- }
108
-not_found:
109
- odp = NULL;
110
-end:
111
- up_read(&ctx->umem_rwsem);
112
- return odp;
113
-}
114
-
115
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
116
- size_t nentries, struct mlx5_ib_mr *mr, int flags)
117
-{
118
- struct ib_pd *pd = mr->ibmr.pd;
119
- struct ib_ucontext *ctx = pd->uobject->context;
120
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
121
- struct ib_umem_odp *odp;
122
- unsigned long va;
123
- int i;
100
+ struct mlx5_klm *end = pklm + nentries;
124101
125102 if (flags & MLX5_IB_UPD_XLT_ZAP) {
126
- for (i = 0; i < nentries; i++, pklm++) {
103
+ for (; pklm != end; pklm++, idx++) {
127104 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
128
- pklm->key = cpu_to_be32(dev->null_mkey);
105
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
129106 pklm->va = 0;
130107 }
131108 return;
132109 }
133110
134
- odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
135
- nentries * MLX5_IMR_MTT_SIZE, mr);
111
+ /*
112
+ * The locking here is pretty subtle. Ideally the implicit_children
113
+ * xarray would be protected by the umem_mutex, however that is not
114
+ * possible. Instead this uses a weaker update-then-lock pattern:
115
+ *
116
+ * srcu_read_lock()
117
+ * xa_store()
118
+ * mutex_lock(umem_mutex)
119
+ * mlx5_ib_update_xlt()
120
+ * mutex_unlock(umem_mutex)
121
+ * destroy lkey
122
+ *
123
+ * ie any change the xarray must be followed by the locked update_xlt
124
+ * before destroying.
125
+ *
126
+ * The umem_mutex provides the acquire/release semantic needed to make
127
+ * the xa_store() visible to a racing thread. While SRCU is not
128
+ * technically required, using it gives consistent use of the SRCU
129
+ * locking around the xarray.
130
+ */
131
+ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
132
+ lockdep_assert_held(&imr->dev->odp_srcu);
136133
137
- for (i = 0; i < nentries; i++, pklm++) {
134
+ for (; pklm != end; pklm++, idx++) {
135
+ struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
136
+
138137 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
139
- va = (offset + i) * MLX5_IMR_MTT_SIZE;
140
- if (odp && odp->umem->address == va) {
141
- struct mlx5_ib_mr *mtt = odp->private;
142
-
138
+ if (mtt) {
143139 pklm->key = cpu_to_be32(mtt->ibmr.lkey);
144
- odp = odp_next(odp);
140
+ pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
145141 } else {
146
- pklm->key = cpu_to_be32(dev->null_mkey);
142
+ pklm->key = cpu_to_be32(imr->dev->null_mkey);
143
+ pklm->va = 0;
147144 }
148
- mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
149
- i, va, be32_to_cpu(pklm->key));
150145 }
151146 }
152147
153
-static void mr_leaf_free_action(struct work_struct *work)
148
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
154149 {
155
- struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
156
- int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
157
- struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
150
+ u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
158151
159
- mr->parent = NULL;
160
- synchronize_srcu(&mr->dev->mr_srcu);
152
+ if (umem_dma & ODP_READ_ALLOWED_BIT)
153
+ mtt_entry |= MLX5_IB_MTT_READ;
154
+ if (umem_dma & ODP_WRITE_ALLOWED_BIT)
155
+ mtt_entry |= MLX5_IB_MTT_WRITE;
161156
162
- ib_umem_release(odp->umem);
163
- if (imr->live)
164
- mlx5_ib_update_xlt(imr, idx, 1, 0,
165
- MLX5_IB_UPD_XLT_INDIRECT |
166
- MLX5_IB_UPD_XLT_ATOMIC);
167
- mlx5_mr_cache_free(mr->dev, mr);
168
-
169
- if (atomic_dec_and_test(&imr->num_leaf_free))
170
- wake_up(&imr->q_leaf_free);
157
+ return mtt_entry;
171158 }
172159
173
-void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
174
- unsigned long end)
160
+static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
161
+ struct mlx5_ib_mr *mr, int flags)
175162 {
163
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
164
+ dma_addr_t pa;
165
+ size_t i;
166
+
167
+ if (flags & MLX5_IB_UPD_XLT_ZAP)
168
+ return;
169
+
170
+ for (i = 0; i < nentries; i++) {
171
+ pa = odp->dma_list[idx + i];
172
+ pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
173
+ }
174
+}
175
+
176
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
177
+ struct mlx5_ib_mr *mr, int flags)
178
+{
179
+ if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
180
+ populate_klm(xlt, idx, nentries, mr, flags);
181
+ } else {
182
+ populate_mtt(xlt, idx, nentries, mr, flags);
183
+ }
184
+}
185
+
186
+static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
187
+{
188
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
189
+
190
+ /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
191
+ mutex_lock(&odp->umem_mutex);
192
+ if (odp->npages) {
193
+ mlx5_mr_cache_invalidate(mr);
194
+ ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
195
+ ib_umem_end(odp));
196
+ WARN_ON(odp->npages);
197
+ }
198
+ odp->private = NULL;
199
+ mutex_unlock(&odp->umem_mutex);
200
+
201
+ if (!mr->cache_ent) {
202
+ mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
203
+ WARN_ON(mr->descs);
204
+ }
205
+}
206
+
207
+/*
208
+ * This must be called after the mr has been removed from implicit_children
209
+ * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
210
+ * empty here, parallel page faults could have raced with the free process and
211
+ * added pages to it.
212
+ */
213
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
214
+{
215
+ struct mlx5_ib_mr *imr = mr->parent;
216
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
217
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
218
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
219
+ int srcu_key;
220
+
221
+ /* implicit_child_mr's are not allowed to have deferred work */
222
+ WARN_ON(atomic_read(&mr->num_deferred_work));
223
+
224
+ if (need_imr_xlt) {
225
+ srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
226
+ mutex_lock(&odp_imr->umem_mutex);
227
+ mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
228
+ MLX5_IB_UPD_XLT_INDIRECT |
229
+ MLX5_IB_UPD_XLT_ATOMIC);
230
+ mutex_unlock(&odp_imr->umem_mutex);
231
+ srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
232
+ }
233
+
234
+ dma_fence_odp_mr(mr);
235
+
236
+ mr->parent = NULL;
237
+ mlx5_mr_cache_free(mr->dev, mr);
238
+ ib_umem_odp_release(odp);
239
+ if (atomic_dec_and_test(&imr->num_deferred_work))
240
+ wake_up(&imr->q_deferred_work);
241
+}
242
+
243
+static void free_implicit_child_mr_work(struct work_struct *work)
244
+{
245
+ struct mlx5_ib_mr *mr =
246
+ container_of(work, struct mlx5_ib_mr, odp_destroy.work);
247
+
248
+ free_implicit_child_mr(mr, true);
249
+}
250
+
251
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
252
+{
253
+ struct mlx5_ib_mr *mr =
254
+ container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
255
+
256
+ /* Freeing a MR is a sleeping operation, so bounce to a work queue */
257
+ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
258
+ queue_work(system_unbound_wq, &mr->odp_destroy.work);
259
+}
260
+
261
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
262
+{
263
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
264
+ unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
265
+ struct mlx5_ib_mr *imr = mr->parent;
266
+
267
+ xa_lock(&imr->implicit_children);
268
+ /*
269
+ * This can race with mlx5_ib_free_implicit_mr(), the first one to
270
+ * reach the xa lock wins the race and destroys the MR.
271
+ */
272
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
273
+ mr)
274
+ goto out_unlock;
275
+
276
+ atomic_inc(&imr->num_deferred_work);
277
+ call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
278
+ free_implicit_child_mr_rcu);
279
+
280
+out_unlock:
281
+ xa_unlock(&imr->implicit_children);
282
+}
283
+
284
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
285
+ const struct mmu_notifier_range *range,
286
+ unsigned long cur_seq)
287
+{
288
+ struct ib_umem_odp *umem_odp =
289
+ container_of(mni, struct ib_umem_odp, notifier);
176290 struct mlx5_ib_mr *mr;
177291 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
178292 sizeof(struct mlx5_mtt)) - 1;
179293 u64 idx = 0, blk_start_idx = 0;
294
+ u64 invalidations = 0;
295
+ unsigned long start;
296
+ unsigned long end;
180297 int in_block = 0;
181298 u64 addr;
182299
183
- if (!umem || !umem->odp_data) {
184
- pr_err("invalidation called on NULL umem or non-ODP umem\n");
185
- return;
186
- }
300
+ if (!mmu_notifier_range_blockable(range))
301
+ return false;
187302
188
- mr = umem->odp_data->private;
303
+ mutex_lock(&umem_odp->umem_mutex);
304
+ mmu_interval_set_seq(mni, cur_seq);
305
+ /*
306
+ * If npages is zero then umem_odp->private may not be setup yet. This
307
+ * does not complete until after the first page is mapped for DMA.
308
+ */
309
+ if (!umem_odp->npages)
310
+ goto out;
311
+ mr = umem_odp->private;
189312
190
- if (!mr || !mr->ibmr.pd)
191
- return;
192
-
193
- start = max_t(u64, ib_umem_start(umem), start);
194
- end = min_t(u64, ib_umem_end(umem), end);
313
+ start = max_t(u64, ib_umem_start(umem_odp), range->start);
314
+ end = min_t(u64, ib_umem_end(umem_odp), range->end);
195315
196316 /*
197317 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -199,21 +319,23 @@
 	 * overwrite the same MTTs. Concurent invalidations might race us,
 	 * but they will write 0s as well, so no difference in the end result.
 	 */
-
-	for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
-		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		/*
 		 * Strive to write the MTTs in chunks, but avoid overwriting
 		 * non-existing MTTs. The huristic here can be improved to
 		 * estimate the cost of another UMR vs. the cost of bigger
 		 * UMR.
 		 */
-		if (umem->odp_data->dma_list[idx] &
+		if (umem_odp->dma_list[idx] &
 		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
 			if (!in_block) {
 				blk_start_idx = idx;
 				in_block = 1;
 			}
+
+			/* Count page invalidations */
+			invalidations += idx - blk_start_idx + 1;
 		} else {
 			u64 umr_offset = idx & umr_block_mask;
 
@@ -231,21 +353,27 @@
 					   idx - blk_start_idx + 1, 0,
 					   MLX5_IB_UPD_XLT_ZAP |
 					   MLX5_IB_UPD_XLT_ATOMIC);
+
+	mlx5_update_odp_stats(mr, invalidations, invalidations);
+
 	/*
 	 * We are now sure that the device will not access the
 	 * memory. We can safely unmap it, and mark it as dirty if
 	 * needed.
 	 */
 
-	ib_umem_odp_unmap_dma_pages(umem, start, end);
+	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
-		     !umem->odp_data->dying)) {
-		WRITE_ONCE(umem->odp_data->dying, 1);
-		atomic_inc(&mr->parent->num_leaf_free);
-		schedule_work(&umem->odp_data->work);
-	}
+	if (unlikely(!umem_odp->npages && mr->parent))
+		destroy_unused_implicit_child_mr(mr);
+out:
+	mutex_unlock(&umem_odp->umem_mutex);
+	return true;
 }
+
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+	.invalidate = mlx5_ib_invalidate_range,
+};
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -253,7 +381,8 @@
 
 	memset(caps, 0, sizeof(*caps));
 
-	if (!MLX5_CAP_GEN(dev->mdev, pg))
+	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
+	    !mlx5_ib_can_load_pas_with_umr(dev, 0))
 		return;
 
 	caps->general_caps = IB_ODP_SUPPORT;
@@ -265,6 +394,9 @@
 
 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
+		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
@@ -281,12 +413,32 @@
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+
+	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
+		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
-	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
-
-	return;
 }
 
 static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
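A side note, not part of the patch: the capability bits wired up in this hunk are what applications later see through the verbs query path. The sketch below is a minimal, illustrative userspace probe only; the device choice and error handling are placeholder assumptions, and IBV_ODP_SUPPORT_SRQ_RECV needs a reasonably recent rdma-core.

/* Illustrative sketch -- build with: cc odp_probe.c -libverbs */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **list = ibv_get_device_list(NULL);
	struct ibv_device_attr_ex attr;
	struct ibv_context *ctx;

	if (!list || !list[0])
		return 1;
	ctx = ibv_open_device(list[0]);	/* assumption: first device is the mlx5 port */
	if (!ctx || ibv_query_device_ex(ctx, NULL, &attr))
		return 1;

	if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT)
		printf("ODP: supported\n");
	if (attr.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SRQ_RECV)
		printf("ODP: RC SRQ receive supported\n");

	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}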
@@ -295,318 +447,402 @@
295447 {
296448 int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
297449 pfault->wqe.wq_num : pfault->token;
298
- int ret = mlx5_core_page_fault_resume(dev->mdev,
299
- pfault->token,
300
- wq_num,
301
- pfault->type,
302
- error);
303
- if (ret)
304
- mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
305
- wq_num);
306
-}
307
-
308
-static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
309
- struct ib_umem *umem,
310
- bool ksm, int access_flags)
311
-{
312
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
313
- struct mlx5_ib_mr *mr;
450
+ u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
314451 int err;
315452
316
- mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
317
- MLX5_IMR_MTT_CACHE_ENTRY);
453
+ MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
454
+ MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
455
+ MLX5_SET(page_fault_resume_in, in, token, pfault->token);
456
+ MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
457
+ MLX5_SET(page_fault_resume_in, in, error, !!error);
318458
319
- if (IS_ERR(mr))
320
- return mr;
321
-
322
- mr->ibmr.pd = pd;
323
-
324
- mr->dev = dev;
325
- mr->access_flags = access_flags;
326
- mr->mmkey.iova = 0;
327
- mr->umem = umem;
328
-
329
- if (ksm) {
330
- err = mlx5_ib_update_xlt(mr, 0,
331
- mlx5_imr_ksm_entries,
332
- MLX5_KSM_PAGE_SHIFT,
333
- MLX5_IB_UPD_XLT_INDIRECT |
334
- MLX5_IB_UPD_XLT_ZAP |
335
- MLX5_IB_UPD_XLT_ENABLE);
336
-
337
- } else {
338
- err = mlx5_ib_update_xlt(mr, 0,
339
- MLX5_IMR_MTT_ENTRIES,
340
- PAGE_SHIFT,
341
- MLX5_IB_UPD_XLT_ZAP |
342
- MLX5_IB_UPD_XLT_ENABLE |
343
- MLX5_IB_UPD_XLT_ATOMIC);
344
- }
345
-
459
+ err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
346460 if (err)
347
- goto fail;
348
-
349
- mr->ibmr.lkey = mr->mmkey.key;
350
- mr->ibmr.rkey = mr->mmkey.key;
351
-
352
- mr->live = 1;
353
-
354
- mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
355
- mr->mmkey.key, dev->mdev, mr);
356
-
357
- return mr;
358
-
359
-fail:
360
- mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
361
- mlx5_mr_cache_free(dev, mr);
362
-
363
- return ERR_PTR(err);
461
+ mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
462
+ wq_num, err);
364463 }
365464
366
-static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
367
- u64 io_virt, size_t bcnt)
465
+static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
466
+ unsigned long idx)
368467 {
369
- struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
370
- struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
371
- struct ib_umem_odp *odp, *result = NULL;
372
- u64 addr = io_virt & MLX5_IMR_MTT_MASK;
373
- int nentries = 0, start_idx = 0, ret;
374
- struct mlx5_ib_mr *mtt;
375
- struct ib_umem *umem;
468
+ struct ib_umem_odp *odp;
469
+ struct mlx5_ib_mr *mr;
470
+ struct mlx5_ib_mr *ret;
471
+ int err;
376472
377
- mutex_lock(&mr->umem->odp_data->umem_mutex);
378
- odp = odp_lookup(ctx, addr, 1, mr);
473
+ odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
474
+ idx * MLX5_IMR_MTT_SIZE,
475
+ MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
476
+ if (IS_ERR(odp))
477
+ return ERR_CAST(odp);
379478
380
- mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
381
- io_virt, bcnt, addr, odp);
479
+ ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY,
480
+ imr->access_flags);
481
+ if (IS_ERR(mr))
482
+ goto out_umem;
382483
383
-next_mr:
384
- if (likely(odp)) {
385
- if (nentries)
386
- nentries++;
387
- } else {
388
- umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
389
- if (IS_ERR(umem)) {
390
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
391
- return ERR_CAST(umem);
392
- }
484
+ mr->ibmr.pd = imr->ibmr.pd;
485
+ mr->umem = &odp->umem;
486
+ mr->ibmr.lkey = mr->mmkey.key;
487
+ mr->ibmr.rkey = mr->mmkey.key;
488
+ mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
489
+ mr->parent = imr;
490
+ odp->private = mr;
393491
394
- mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
395
- if (IS_ERR(mtt)) {
396
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
397
- ib_umem_release(umem);
398
- return ERR_CAST(mtt);
399
- }
400
-
401
- odp = umem->odp_data;
402
- odp->private = mtt;
403
- mtt->umem = umem;
404
- mtt->mmkey.iova = addr;
405
- mtt->parent = mr;
406
- INIT_WORK(&odp->work, mr_leaf_free_action);
407
-
408
- if (!nentries)
409
- start_idx = addr >> MLX5_IMR_MTT_SHIFT;
410
- nentries++;
492
+ err = mlx5_ib_update_xlt(mr, 0,
493
+ MLX5_IMR_MTT_ENTRIES,
494
+ PAGE_SHIFT,
495
+ MLX5_IB_UPD_XLT_ZAP |
496
+ MLX5_IB_UPD_XLT_ENABLE);
497
+ if (err) {
498
+ ret = ERR_PTR(err);
499
+ goto out_mr;
411500 }
412501
413
- /* Return first odp if region not covered by single one */
414
- if (likely(!result))
415
- result = odp;
416
-
417
- addr += MLX5_IMR_MTT_SIZE;
418
- if (unlikely(addr < io_virt + bcnt)) {
419
- odp = odp_next(odp);
420
- if (odp && odp->umem->address != addr)
421
- odp = NULL;
422
- goto next_mr;
423
- }
424
-
425
- if (unlikely(nentries)) {
426
- ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
427
- MLX5_IB_UPD_XLT_INDIRECT |
428
- MLX5_IB_UPD_XLT_ATOMIC);
429
- if (ret) {
430
- mlx5_ib_err(dev, "Failed to update PAS\n");
431
- result = ERR_PTR(ret);
502
+ /*
503
+ * Once the store to either xarray completes any error unwind has to
504
+ * use synchronize_srcu(). Avoid this with xa_reserve()
505
+ */
506
+ ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
507
+ GFP_KERNEL);
508
+ if (unlikely(ret)) {
509
+ if (xa_is_err(ret)) {
510
+ ret = ERR_PTR(xa_err(ret));
511
+ goto out_mr;
432512 }
513
+ /*
514
+ * Another thread beat us to creating the child mr, use
515
+ * theirs.
516
+ */
517
+ goto out_mr;
433518 }
434519
435
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
436
- return result;
520
+ mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
521
+ return mr;
522
+
523
+out_mr:
524
+ mlx5_mr_cache_free(imr->dev, mr);
525
+out_umem:
526
+ ib_umem_odp_release(odp);
527
+ return ret;
437528 }
438529
439530 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
531
+ struct ib_udata *udata,
440532 int access_flags)
441533 {
442
- struct ib_ucontext *ctx = pd->ibpd.uobject->context;
534
+ struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
535
+ struct ib_umem_odp *umem_odp;
443536 struct mlx5_ib_mr *imr;
444
- struct ib_umem *umem;
537
+ int err;
445538
446
- umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
447
- if (IS_ERR(umem))
448
- return ERR_CAST(umem);
539
+ umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
540
+ if (IS_ERR(umem_odp))
541
+ return ERR_CAST(umem_odp);
449542
450
- imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
543
+ imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
451544 if (IS_ERR(imr)) {
452
- ib_umem_release(umem);
453
- return ERR_CAST(imr);
545
+ err = PTR_ERR(imr);
546
+ goto out_umem;
454547 }
455548
456
- imr->umem = umem;
457
- init_waitqueue_head(&imr->q_leaf_free);
458
- atomic_set(&imr->num_leaf_free, 0);
549
+ imr->ibmr.pd = &pd->ibpd;
550
+ imr->mmkey.iova = 0;
551
+ imr->umem = &umem_odp->umem;
552
+ imr->ibmr.lkey = imr->mmkey.key;
553
+ imr->ibmr.rkey = imr->mmkey.key;
554
+ imr->umem = &umem_odp->umem;
555
+ imr->is_odp_implicit = true;
556
+ atomic_set(&imr->num_deferred_work, 0);
557
+ init_waitqueue_head(&imr->q_deferred_work);
558
+ xa_init(&imr->implicit_children);
459559
560
+ err = mlx5_ib_update_xlt(imr, 0,
561
+ mlx5_imr_ksm_entries,
562
+ MLX5_KSM_PAGE_SHIFT,
563
+ MLX5_IB_UPD_XLT_INDIRECT |
564
+ MLX5_IB_UPD_XLT_ZAP |
565
+ MLX5_IB_UPD_XLT_ENABLE);
566
+ if (err)
567
+ goto out_mr;
568
+
569
+ err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
570
+ &imr->mmkey, GFP_KERNEL));
571
+ if (err)
572
+ goto out_mr;
573
+
574
+ mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
460575 return imr;
461
-}
462
-
463
-static int mr_leaf_free(struct ib_umem *umem, u64 start,
464
- u64 end, void *cookie)
465
-{
466
- struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
467
-
468
- if (mr->parent != imr)
469
- return 0;
470
-
471
- ib_umem_odp_unmap_dma_pages(umem,
472
- ib_umem_start(umem),
473
- ib_umem_end(umem));
474
-
475
- if (umem->odp_data->dying)
476
- return 0;
477
-
478
- WRITE_ONCE(umem->odp_data->dying, 1);
479
- atomic_inc(&imr->num_leaf_free);
480
- schedule_work(&umem->odp_data->work);
481
-
482
- return 0;
576
+out_mr:
577
+ mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
578
+ mlx5_mr_cache_free(dev, imr);
579
+out_umem:
580
+ ib_umem_odp_release(umem_odp);
581
+ return ERR_PTR(err);
483582 }
484583
485584 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
486585 {
487
- struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
586
+ struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
587
+ struct mlx5_ib_dev *dev = imr->dev;
588
+ struct list_head destroy_list;
589
+ struct mlx5_ib_mr *mtt;
590
+ struct mlx5_ib_mr *tmp;
591
+ unsigned long idx;
488592
489
- down_read(&ctx->umem_rwsem);
490
- rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
491
- mr_leaf_free, true, imr);
492
- up_read(&ctx->umem_rwsem);
593
+ INIT_LIST_HEAD(&destroy_list);
493594
494
- wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
595
+ xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
596
+ /*
597
+ * This stops the SRCU protected page fault path from touching either
598
+ * the imr or any children. The page fault path can only reach the
599
+ * children xarray via the imr.
600
+ */
601
+ synchronize_srcu(&dev->odp_srcu);
602
+
603
+ /*
604
+ * All work on the prefetch list must be completed, xa_erase() prevented
605
+ * new work from being created.
606
+ */
607
+ wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
608
+
609
+ /*
610
+ * At this point it is forbidden for any other thread to enter
611
+ * pagefault_mr() on this imr. It is already forbidden to call
612
+ * pagefault_mr() on an implicit child. Due to this additions to
613
+ * implicit_children are prevented.
614
+ */
615
+
616
+ /*
617
+ * Block destroy_unused_implicit_child_mr() from incrementing
618
+ * num_deferred_work.
619
+ */
620
+ xa_lock(&imr->implicit_children);
621
+ xa_for_each (&imr->implicit_children, idx, mtt) {
622
+ __xa_erase(&imr->implicit_children, idx);
623
+ list_add(&mtt->odp_destroy.elm, &destroy_list);
624
+ }
625
+ xa_unlock(&imr->implicit_children);
626
+
627
+ /*
628
+ * Wait for any concurrent destroy_unused_implicit_child_mr() to
629
+ * complete.
630
+ */
631
+ wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
632
+
633
+ /*
634
+ * Fence the imr before we destroy the children. This allows us to
635
+ * skip updating the XLT of the imr during destroy of the child mkey
636
+ * the imr points to.
637
+ */
638
+ mlx5_mr_cache_invalidate(imr);
639
+
640
+ list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
641
+ free_implicit_child_mr(mtt, false);
642
+
643
+ mlx5_mr_cache_free(dev, imr);
644
+ ib_umem_odp_release(odp_imr);
495645 }
496646
497
-static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
498
- u64 io_virt, size_t bcnt, u32 *bytes_mapped)
647
+/**
648
+ * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
649
+ * @mr: to fence
650
+ *
651
+ * On return no parallel threads will be touching this MR and no DMA will be
652
+ * active.
653
+ */
654
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
499655 {
656
+ /* Prevent new page faults and prefetch requests from succeeding */
657
+ xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
658
+
659
+ /* Wait for all running page-fault handlers to finish. */
660
+ synchronize_srcu(&mr->dev->odp_srcu);
661
+
662
+ wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
663
+
664
+ dma_fence_odp_mr(mr);
665
+}
666
+
667
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
668
+#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
669
+#define MLX5_PF_FLAGS_ENABLE BIT(3)
670
+static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
671
+ u64 user_va, size_t bcnt, u32 *bytes_mapped,
672
+ u32 flags)
673
+{
674
+ int page_shift, ret, np;
675
+ bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
500676 u64 access_mask;
501
- int npages = 0, page_shift, np;
502
- u64 start_idx, page_mask;
503
- struct ib_umem_odp *odp;
504
- int current_seq;
505
- size_t size;
506
- int ret;
677
+ u64 start_idx;
678
+ bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
679
+ u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
507680
508
- if (!mr->umem->odp_data->page_list) {
509
- odp = implicit_mr_get_data(mr, io_virt, bcnt);
681
+ if (flags & MLX5_PF_FLAGS_ENABLE)
682
+ xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
510683
511
- if (IS_ERR(odp))
512
- return PTR_ERR(odp);
513
- mr = odp->private;
514
-
515
- } else {
516
- odp = mr->umem->odp_data;
517
- }
518
-
519
-next_mr:
520
- size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
521
-
522
- page_shift = mr->umem->page_shift;
523
- page_mask = ~(BIT(page_shift) - 1);
524
- start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
684
+ page_shift = odp->page_shift;
685
+ start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
525686 access_mask = ODP_READ_ALLOWED_BIT;
526687
527
- if (mr->umem->writable)
688
+ if (odp->umem.writable && !downgrade)
528689 access_mask |= ODP_WRITE_ALLOWED_BIT;
529690
530
- current_seq = READ_ONCE(odp->notifiers_seq);
691
+ np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
692
+ if (np < 0)
693
+ return np;
694
+
531695 /*
532
- * Ensure the sequence number is valid for some time before we call
533
- * gup.
696
+ * No need to check whether the MTTs really belong to this MR, since
697
+ * ib_umem_odp_map_dma_and_lock already checks this.
534698 */
535
- smp_rmb();
536
-
537
- ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
538
- access_mask, current_seq);
539
-
540
- if (ret < 0)
541
- goto out;
542
-
543
- np = ret;
544
-
545
- mutex_lock(&odp->umem_mutex);
546
- if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
547
- /*
548
- * No need to check whether the MTTs really belong to
549
- * this MR, since ib_umem_odp_map_dma_pages already
550
- * checks this.
551
- */
552
- ret = mlx5_ib_update_xlt(mr, start_idx, np,
553
- page_shift, MLX5_IB_UPD_XLT_ATOMIC);
554
- } else {
555
- ret = -EAGAIN;
556
- }
699
+ ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
557700 mutex_unlock(&odp->umem_mutex);
558701
559702 if (ret < 0) {
560703 if (ret != -EAGAIN)
561
- mlx5_ib_err(dev, "Failed to update mkey page tables\n");
704
+ mlx5_ib_err(mr->dev,
705
+ "Failed to update mkey page tables\n");
562706 goto out;
563707 }
564708
565709 if (bytes_mapped) {
566710 u32 new_mappings = (np << page_shift) -
567
- (io_virt - round_down(io_virt, 1 << page_shift));
568
- *bytes_mapped += min_t(u32, new_mappings, size);
711
+ (user_va - round_down(user_va, 1 << page_shift));
712
+
713
+ *bytes_mapped += min_t(u32, new_mappings, bcnt);
569714 }
570715
571
- npages += np << (page_shift - PAGE_SHIFT);
572
- bcnt -= size;
573
-
574
- if (unlikely(bcnt)) {
575
- struct ib_umem_odp *next;
576
-
577
- io_virt += size;
578
- next = odp_next(odp);
579
- if (unlikely(!next || next->umem->address != io_virt)) {
580
- mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
581
- io_virt, next);
582
- return -EAGAIN;
583
- }
584
- odp = next;
585
- mr = odp->private;
586
- goto next_mr;
587
- }
588
-
589
- return npages;
716
+ return np << (page_shift - PAGE_SHIFT);
590717
591718 out:
592
- if (ret == -EAGAIN) {
593
- if (mr->parent || !odp->dying) {
594
- unsigned long timeout =
595
- msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
719
+ return ret;
720
+}
596721
597
- if (!wait_for_completion_timeout(
598
- &odp->notifier_completion,
599
- timeout)) {
600
- mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
601
- current_seq, odp->notifiers_seq);
722
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
723
+ struct ib_umem_odp *odp_imr, u64 user_va,
724
+ size_t bcnt, u32 *bytes_mapped, u32 flags)
725
+{
726
+ unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
727
+ unsigned long upd_start_idx = end_idx + 1;
728
+ unsigned long upd_len = 0;
729
+ unsigned long npages = 0;
730
+ int err;
731
+ int ret;
732
+
733
+ if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
734
+ mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
735
+ return -EFAULT;
736
+
737
+ /* Fault each child mr that intersects with our interval. */
738
+ while (bcnt) {
739
+ unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
740
+ struct ib_umem_odp *umem_odp;
741
+ struct mlx5_ib_mr *mtt;
742
+ u64 len;
743
+
744
+ mtt = xa_load(&imr->implicit_children, idx);
745
+ if (unlikely(!mtt)) {
746
+ mtt = implicit_get_child_mr(imr, idx);
747
+ if (IS_ERR(mtt)) {
748
+ ret = PTR_ERR(mtt);
749
+ goto out;
602750 }
603
- } else {
604
- /* The MR is being killed, kill the QP as well. */
605
- ret = -EFAULT;
751
+ upd_start_idx = min(upd_start_idx, idx);
752
+ upd_len = idx - upd_start_idx + 1;
606753 }
754
+
755
+ umem_odp = to_ib_umem_odp(mtt->umem);
756
+ len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
757
+ user_va;
758
+
759
+ ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
760
+ bytes_mapped, flags);
761
+ if (ret < 0)
762
+ goto out;
763
+ user_va += len;
764
+ bcnt -= len;
765
+ npages += ret;
607766 }
608767
768
+ ret = npages;
769
+
770
+ /*
771
+ * Any time the implicit_children are changed we must perform an
772
+ * update of the xlt before exiting to ensure the HW and the
773
+ * implicit_children remains synchronized.
774
+ */
775
+out:
776
+ if (likely(!upd_len))
777
+ return ret;
778
+
779
+ /*
780
+ * Notice this is not strictly ordered right, the KSM is updated after
781
+ * the implicit_children is updated, so a parallel page fault could
782
+ * see a MR that is not yet visible in the KSM. This is similar to a
783
+ * parallel page fault seeing a MR that is being concurrently removed
784
+ * from the KSM. Both of these improbable situations are resolved
785
+ * safely by resuming the HW and then taking another page fault. The
786
+ * next pagefault handler will see the new information.
787
+ */
788
+ mutex_lock(&odp_imr->umem_mutex);
789
+ err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
790
+ MLX5_IB_UPD_XLT_INDIRECT |
791
+ MLX5_IB_UPD_XLT_ATOMIC);
792
+ mutex_unlock(&odp_imr->umem_mutex);
793
+ if (err) {
794
+ mlx5_ib_err(imr->dev, "Failed to update PAS\n");
795
+ return err;
796
+ }
609797 return ret;
798
+}
799
+
800
+/*
801
+ * Returns:
802
+ * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
803
+ * not accessible, or the MR is no longer valid.
804
+ * -EAGAIN/-ENOMEM: The operation should be retried
805
+ *
806
+ * -EINVAL/others: General internal malfunction
807
+ * >0: Number of pages mapped
808
+ */
809
+static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
810
+ u32 *bytes_mapped, u32 flags)
811
+{
812
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
813
+
814
+ lockdep_assert_held(&mr->dev->odp_srcu);
815
+ if (unlikely(io_virt < mr->mmkey.iova))
816
+ return -EFAULT;
817
+
818
+ if (!odp->is_implicit_odp) {
819
+ u64 user_va;
820
+
821
+ if (check_add_overflow(io_virt - mr->mmkey.iova,
822
+ (u64)odp->umem.address, &user_va))
823
+ return -EFAULT;
824
+ if (unlikely(user_va >= ib_umem_end(odp) ||
825
+ ib_umem_end(odp) - user_va < bcnt))
826
+ return -EFAULT;
827
+ return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
828
+ flags);
829
+ }
830
+ return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
831
+ flags);
832
+}
833
+
834
+int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
835
+{
836
+ u32 flags = MLX5_PF_FLAGS_SNAPSHOT;
837
+ int ret;
838
+
839
+ if (enable)
840
+ flags |= MLX5_PF_FLAGS_ENABLE;
841
+
842
+ ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem),
843
+ mr->umem->address, mr->umem->length, NULL,
844
+ flags);
845
+ return ret >= 0 ? 0 : ret;
610846 }
611847
612848 struct pf_frame {
@@ -616,6 +852,30 @@
 	size_t bcnt;
 	int depth;
 };
+
+static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
+{
+	if (!mmkey)
+		return false;
+	if (mmkey->type == MLX5_MKEY_MW)
+		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
+	return mmkey->key == key;
+}
+
+static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
+{
+	struct mlx5_ib_mw *mw;
+	struct mlx5_ib_devx_mr *devx_mr;
+
+	if (mmkey->type == MLX5_MKEY_MW) {
+		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+		return mw->ndescs;
+	}
+
+	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
+			       mmkey);
+	return devx_mr->ndescs;
+}
 
 /*
  * Handle a single data segment in a page-fault WQE or RDMA region.
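A side note, not part of the patch: mkey_is_eq() above compares memory-window keys only by their base mkey, because the low 8 "variant" bits of an MW rkey can be changed by the application while the mkey index stays fixed. A small standalone sketch of that masking, assuming the 0xffffff00 mask that mlx5_base_mkey() applies in the mlx5 headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors mlx5_base_mkey(): drop the low 8 variant bits, keep the index. */
static uint32_t base_mkey(uint32_t key)
{
	return key & 0xffffff00u;
}

/* Loose comparison used for memory windows: same index, any variant. */
static bool mw_key_matches(uint32_t stored_key, uint32_t wqe_key)
{
	return base_mkey(stored_key) == base_mkey(wqe_key);
}

int main(void)
{
	/* Same 24-bit index, different 8-bit variants -> match. */
	printf("%d\n", mw_key_matches(0x00012301, 0x00012302));
	/* Different index -> no match. */
	printf("%d\n", mw_key_matches(0x00012301, 0x00045601));
	return 0;
}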
@@ -629,27 +889,43 @@
  * abort the page fault handling.
  */
 static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-					 u32 key, u64 io_virt, size_t bcnt,
+					 struct ib_pd *pd, u32 key,
+					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
-	struct mlx5_ib_mw *mw;
 	struct mlx5_ib_mr *mr;
 	struct mlx5_klm *pklm;
 	u32 *out = NULL;
 	size_t offset;
+	int ndescs;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
-	if (!mmkey || mmkey->key != key) {
+	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+	if (!mmkey) {
+		mlx5_ib_dbg(
+			dev,
+			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+			key);
+		if (bytes_mapped)
+			*bytes_mapped += bcnt;
+		/*
+		 * The user could specify a SGL with multiple lkeys and only
+		 * some of them are ODP. Treat the non-ODP ones as fully
+		 * faulted.
+		 */
+		ret = 0;
+		goto srcu_unlock;
+	}
+	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
 		goto srcu_unlock;
@@ -658,22 +934,20 @@
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		if (!mr->live || !mr->ibmr.pd) {
-			mlx5_ib_dbg(dev, "got dead MR\n");
-			ret = -EFAULT;
-			goto srcu_unlock;
-		}
 
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
 			goto srcu_unlock;
+
+		mlx5_update_odp_stats(mr, faults, ret);
 
 		npages += ret;
 		ret = 0;
 		break;
 
 	case MLX5_MKEY_MW:
-		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
+	case MLX5_MKEY_INDIRECT_DEVX:
+		ndescs = get_indirect_num_descs(mmkey);
 
 		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
 			mlx5_ib_dbg(dev, "indirection level exceeded\n");
@@ -682,7 +956,7 @@
 		}
 
 		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
-			sizeof(*pklm) * (mw->ndescs - 2);
+			sizeof(*pklm) * (ndescs - 2);
 
 		if (outlen > cur_outlen) {
 			kfree(out);
@@ -697,14 +971,14 @@
 		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
 						       bsf0_klm0_pas_mtt0_1);
 
-		ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
+		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
 		if (ret)
 			goto srcu_unlock;
 
 		offset = io_virt - MLX5_GET64(query_mkey_out, out,
 					      memory_key_mkey_entry.start_addr);
 
-		for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
+		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
 			if (offset >= be32_to_cpu(pklm->bcount)) {
 				offset -= be32_to_cpu(pklm->bcount);
 				continue;
@@ -756,7 +1030,7 @@
 	}
 	kfree(out);
 
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -764,7 +1038,6 @@
 /**
  * Parse a series of data segments for page fault handling.
  *
- * @qp the QP on which the fault occurred.
  * @pfault contains page fault information.
  * @wqe points at the first data segment in the WQE.
  * @wqe_end points after the end of the WQE.
@@ -781,9 +1054,9 @@
  */
 static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 				   struct mlx5_pagefault *pfault,
-				   struct mlx5_ib_qp *qp, void *wqe,
+				   void *wqe,
 				   void *wqe_end, u32 *bytes_mapped,
-				   u32 *total_wqe_bytes, int receive_queue)
+				   u32 *total_wqe_bytes, bool receive_queue)
 {
 	int ret = 0, npages = 0;
 	u64 io_virt;
@@ -791,10 +1064,6 @@
 	u32 byte_count;
 	size_t bcnt;
 	int inline_segment;
-
-	/* Skip SRQ next-WQE segment. */
-	if (receive_queue && qp->ibqp.srq)
-		wqe += sizeof(struct mlx5_wqe_srq_next_seg);
 
 	if (bytes_mapped)
 		*bytes_mapped = 0;
@@ -839,7 +1108,8 @@
 			continue;
 		}
 
-		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
+		ret = pagefault_single_data_segment(dev, NULL, key,
+						    io_virt, bcnt,
 						    &pfault->bytes_committed,
 						    bytes_mapped);
 		if (ret < 0)
@@ -849,17 +1119,6 @@
 
 	return ret < 0 ? ret : npages;
 }
-
-static const u32 mlx5_ib_odp_opcode_cap[] = {
-	[MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND,
-	[MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE,
-	[MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE,
-	[MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ,
-	[MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC,
-	[MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC,
-};
 
 /*
  * Parse initiator WQE. Advances the wqe pointer to point at the
@@ -871,12 +1130,8 @@
 {
 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
 	u16 wqe_index = pfault->wqe.wqe_index;
-	u32 transport_caps;
 	struct mlx5_base_av *av;
 	unsigned ds, opcode;
-#if defined(DEBUG)
-	u32 ctrl_wqe_index, ctrl_qpn;
-#endif
 	u32 qpn = qp->trans_qp.base.mqp.qpn;
 
 	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
@@ -892,54 +1147,16 @@
 		return -EFAULT;
 	}
 
-#if defined(DEBUG)
-	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
-			MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
-			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
-	if (wqe_index != ctrl_wqe_index) {
-		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
-			    wqe_index, qpn,
-			    ctrl_wqe_index);
-		return -EFAULT;
-	}
-
-	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
-		MLX5_WQE_CTRL_QPN_SHIFT;
-	if (qpn != ctrl_qpn) {
-		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
-			    wqe_index, qpn,
-			    ctrl_qpn);
-		return -EFAULT;
-	}
-#endif /* DEBUG */
-
 	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
 	*wqe += sizeof(*ctrl);
 
 	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
 		 MLX5_WQE_CTRL_OPCODE_MASK;
 
-	switch (qp->ibqp.qp_type) {
-	case IB_QPT_RC:
-		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
-		break;
-	case IB_QPT_UD:
-		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
-		break;
-	default:
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
-			    qp->ibqp.qp_type);
-		return -EFAULT;
-	}
+	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
+		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
 
-	if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
-		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
-		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
-			    opcode);
-		return -EFAULT;
-	}
-
-	if (qp->ibqp.qp_type != IB_QPT_RC) {
+	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
 		av = *wqe;
 		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
 			*wqe += sizeof(struct mlx5_av);
@@ -964,22 +1181,35 @@
 }
 
 /*
- * Parse responder WQE. Advances the wqe pointer to point at the
- * scatter-gather list, and set wqe_end to the end of the WQE.
+ * Parse responder WQE and set wqe_end to the end of the WQE.
  */
-static int mlx5_ib_mr_responder_pfault_handler(
-	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
-	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
+static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
+						   struct mlx5_ib_srq *srq,
+						   void **wqe, void **wqe_end,
+						   int wqe_length)
+{
+	int wqe_size = 1 << srq->msrq.wqe_shift;
+
+	if (wqe_size > wqe_length) {
+		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
+		return -EFAULT;
+	}
+
+	*wqe_end = *wqe + wqe_size;
+	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+	return 0;
+}
+
+static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
+						  struct mlx5_ib_qp *qp,
+						  void *wqe, void **wqe_end,
+						  int wqe_length)
 {
 	struct mlx5_ib_wq *wq = &qp->rq;
 	int wqe_size = 1 << wq->wqe_shift;
 
-	if (qp->ibqp.srq) {
-		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
-		return -EFAULT;
-	}
-
-	if (qp->wq_sig) {
+	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
 		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
 		return -EFAULT;
 	}
@@ -989,99 +1219,138 @@
9891219 return -EFAULT;
9901220 }
9911221
992
- switch (qp->ibqp.qp_type) {
993
- case IB_QPT_RC:
994
- if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
995
- IB_ODP_SUPPORT_RECV))
996
- goto invalid_transport_or_opcode;
997
- break;
998
- default:
999
-invalid_transport_or_opcode:
1000
- mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
1001
- qp->ibqp.qp_type);
1002
- return -EFAULT;
1003
- }
1004
-
1005
- *wqe_end = *wqe + wqe_size;
1222
+ *wqe_end = wqe + wqe_size;
10061223
10071224 return 0;
10081225 }
10091226
1010
-static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
1011
- u32 wq_num)
1227
+static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
1228
+ u32 wq_num, int pf_type)
10121229 {
1013
- struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);
1230
+ struct mlx5_core_rsc_common *common = NULL;
1231
+ struct mlx5_core_srq *srq;
10141232
1015
- if (!mqp) {
1016
- mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
1017
- return NULL;
1233
+ switch (pf_type) {
1234
+ case MLX5_WQE_PF_TYPE_RMP:
1235
+ srq = mlx5_cmd_get_srq(dev, wq_num);
1236
+ if (srq)
1237
+ common = &srq->common;
1238
+ break;
1239
+ case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
1240
+ case MLX5_WQE_PF_TYPE_RESP:
1241
+ case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
1242
+ common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
1243
+ break;
1244
+ default:
1245
+ break;
10181246 }
10191247
1248
+ return common;
1249
+}
1250
+
1251
+static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
1252
+{
1253
+ struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
1254
+
10201255 return to_mibqp(mqp);
1256
+}
1257
+
1258
+static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
1259
+{
1260
+ struct mlx5_core_srq *msrq =
1261
+ container_of(res, struct mlx5_core_srq, common);
1262
+
1263
+ return to_mibsrq(msrq);
10211264 }
10221265
10231266 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
10241267 struct mlx5_pagefault *pfault)
10251268 {
1026
- int ret;
1027
- void *wqe, *wqe_end;
1028
- u32 bytes_mapped, total_wqe_bytes;
1029
- char *buffer = NULL;
1030
- int resume_with_error = 1;
1269
+ bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
10311270 u16 wqe_index = pfault->wqe.wqe_index;
1032
- int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
1271
+ void *wqe, *wqe_start = NULL, *wqe_end = NULL;
1272
+ u32 bytes_mapped, total_wqe_bytes;
1273
+ struct mlx5_core_rsc_common *res;
1274
+ int resume_with_error = 1;
10331275 struct mlx5_ib_qp *qp;
1276
+ size_t bytes_copied;
1277
+ int ret = 0;
10341278
1035
- buffer = (char *)__get_free_page(GFP_KERNEL);
1036
- if (!buffer) {
1279
+ res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
1280
+ if (!res) {
1281
+ mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
1282
+ return;
1283
+ }
1284
+
1285
+ if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
1286
+ res->res != MLX5_RES_XSRQ) {
1287
+ mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
1288
+ pfault->type);
1289
+ goto resolve_page_fault;
1290
+ }
1291
+
1292
+ wqe_start = (void *)__get_free_page(GFP_KERNEL);
1293
+ if (!wqe_start) {
10371294 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
10381295 goto resolve_page_fault;
10391296 }
10401297
1041
- qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
1042
- if (!qp)
1043
- goto resolve_page_fault;
1298
+ wqe = wqe_start;
1299
+ qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
1300
+ if (qp && sq) {
1301
+ ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
1302
+ &bytes_copied);
1303
+ if (ret)
1304
+ goto read_user;
1305
+ ret = mlx5_ib_mr_initiator_pfault_handler(
1306
+ dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
1307
+ } else if (qp && !sq) {
1308
+ ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
1309
+ &bytes_copied);
1310
+ if (ret)
1311
+ goto read_user;
1312
+ ret = mlx5_ib_mr_responder_pfault_handler_rq(
1313
+ dev, qp, wqe, &wqe_end, bytes_copied);
1314
+ } else if (!qp) {
1315
+ struct mlx5_ib_srq *srq = res_to_srq(res);
10441316
1045
- ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
1046
- PAGE_SIZE, &qp->trans_qp.base);
1047
- if (ret < 0) {
1048
- mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
1049
- ret, wqe_index, pfault->token);
1050
- goto resolve_page_fault;
1317
+ ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
1318
+ &bytes_copied);
1319
+ if (ret)
1320
+ goto read_user;
1321
+ ret = mlx5_ib_mr_responder_pfault_handler_srq(
1322
+ dev, srq, &wqe, &wqe_end, bytes_copied);
10511323 }
10521324
1053
- wqe = buffer;
1054
- if (requestor)
1055
- ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
1056
- &wqe_end, ret);
1057
- else
1058
- ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
1059
- &wqe_end, ret);
1060
- if (ret < 0)
1325
+ if (ret < 0 || wqe >= wqe_end)
10611326 goto resolve_page_fault;
10621327
1063
- if (wqe >= wqe_end) {
1064
- mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
1065
- goto resolve_page_fault;
1066
- }
1328
+ ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
1329
+ &total_wqe_bytes, !sq);
1330
+ if (ret == -EAGAIN)
1331
+ goto out;
10671332
1068
- ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
1069
- &bytes_mapped, &total_wqe_bytes,
1070
- !requestor);
1071
- if (ret == -EAGAIN) {
1072
- resume_with_error = 0;
1333
+ if (ret < 0 || total_wqe_bytes > bytes_mapped)
10731334 goto resolve_page_fault;
1074
- } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
1075
- goto resolve_page_fault;
1076
- }
10771335
1336
+out:
1337
+ ret = 0;
10781338 resume_with_error = 0;
1339
+
1340
+read_user:
1341
+ if (ret)
1342
+ mlx5_ib_err(
1343
+ dev,
1344
+ "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
1345
+ ret, wqe_index, pfault->token);
1346
+
10791347 resolve_page_fault:
10801348 mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
10811349 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
10821350 pfault->wqe.wq_num, resume_with_error,
10831351 pfault->type);
1084
- free_page((unsigned long)buffer);
1352
+ mlx5_core_res_put(res);
1353
+ free_page((unsigned long)wqe_start);
10851354 }
10861355
10871356 static int pages_in_range(u64 address, u32 length)
@@ -1123,7 +1392,7 @@
 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
 	}
 
-	ret = pagefault_single_data_segment(dev, rkey, address, length,
+	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
 					    &pfault->bytes_committed, NULL);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
@@ -1149,7 +1418,7 @@
 	if (prefetch_activated) {
 		u32 bytes_committed = 0;
 
-		ret = pagefault_single_data_segment(dev, rkey, address,
+		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
 						    prefetch_len,
 						    &bytes_committed, NULL);
 		if (ret < 0 && ret != -EAGAIN) {
@@ -1159,10 +1428,8 @@
 	}
 }
 
-void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
-		    struct mlx5_pagefault *pfault)
+static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
 {
-	struct mlx5_ib_dev *dev = context;
 	u8 event_subtype = pfault->event_subtype;
 
 	switch (event_subtype) {
@@ -1177,6 +1444,212 @@
11771444 event_subtype);
11781445 mlx5_ib_page_fault_resume(dev, pfault, 1);
11791446 }
1447
+}
1448
+
1449
+static void mlx5_ib_eqe_pf_action(struct work_struct *work)
1450
+{
1451
+ struct mlx5_pagefault *pfault = container_of(work,
1452
+ struct mlx5_pagefault,
1453
+ work);
1454
+ struct mlx5_ib_pf_eq *eq = pfault->eq;
1455
+
1456
+ mlx5_ib_pfault(eq->dev, pfault);
1457
+ mempool_free(pfault, eq->pool);
1458
+}
1459
+
1460
+static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
1461
+{
1462
+ struct mlx5_eqe_page_fault *pf_eqe;
1463
+ struct mlx5_pagefault *pfault;
1464
+ struct mlx5_eqe *eqe;
1465
+ int cc = 0;
1466
+
1467
+ while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
1468
+ pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
1469
+ if (!pfault) {
1470
+ schedule_work(&eq->work);
1471
+ break;
1472
+ }
1473
+
1474
+ pf_eqe = &eqe->data.page_fault;
1475
+ pfault->event_subtype = eqe->sub_type;
1476
+ pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
1477
+
1478
+ mlx5_ib_dbg(eq->dev,
1479
+ "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
1480
+ eqe->sub_type, pfault->bytes_committed);
1481
+
1482
+ switch (eqe->sub_type) {
1483
+ case MLX5_PFAULT_SUBTYPE_RDMA:
1484
+ /* RDMA based event */
1485
+ pfault->type =
1486
+ be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
1487
+ pfault->token =
1488
+ be32_to_cpu(pf_eqe->rdma.pftype_token) &
1489
+ MLX5_24BIT_MASK;
1490
+ pfault->rdma.r_key =
1491
+ be32_to_cpu(pf_eqe->rdma.r_key);
1492
+ pfault->rdma.packet_size =
1493
+ be16_to_cpu(pf_eqe->rdma.packet_length);
1494
+ pfault->rdma.rdma_op_len =
1495
+ be32_to_cpu(pf_eqe->rdma.rdma_op_len);
1496
+ pfault->rdma.rdma_va =
1497
+ be64_to_cpu(pf_eqe->rdma.rdma_va);
1498
+ mlx5_ib_dbg(eq->dev,
1499
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
1500
+ pfault->type, pfault->token,
1501
+ pfault->rdma.r_key);
1502
+ mlx5_ib_dbg(eq->dev,
1503
+ "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
1504
+ pfault->rdma.rdma_op_len,
1505
+ pfault->rdma.rdma_va);
1506
+ break;
1507
+
1508
+ case MLX5_PFAULT_SUBTYPE_WQE:
1509
+ /* WQE based event */
1510
+ pfault->type =
1511
+ (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
1512
+ pfault->token =
1513
+ be32_to_cpu(pf_eqe->wqe.token);
1514
+ pfault->wqe.wq_num =
1515
+ be32_to_cpu(pf_eqe->wqe.pftype_wq) &
1516
+ MLX5_24BIT_MASK;
1517
+ pfault->wqe.wqe_index =
1518
+ be16_to_cpu(pf_eqe->wqe.wqe_index);
1519
+ pfault->wqe.packet_size =
1520
+ be16_to_cpu(pf_eqe->wqe.packet_length);
1521
+ mlx5_ib_dbg(eq->dev,
1522
+ "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
1523
+ pfault->type, pfault->token,
1524
+ pfault->wqe.wq_num,
1525
+ pfault->wqe.wqe_index);
1526
+ break;
1527
+
1528
+ default:
1529
+ mlx5_ib_warn(eq->dev,
1530
+ "Unsupported page fault event sub-type: 0x%02hhx\n",
1531
+ eqe->sub_type);
1532
+ /* Unsupported page faults should still be
1533
+ * resolved by the page fault handler
1534
+ */
1535
+ }
1536
+
1537
+ pfault->eq = eq;
1538
+ INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
1539
+ queue_work(eq->wq, &pfault->work);
1540
+
1541
+ cc = mlx5_eq_update_cc(eq->core, ++cc);
1542
+ }
1543
+
1544
+ mlx5_eq_update_ci(eq->core, cc, 1);
1545
+}
1546
+
1547
+static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
1548
+ void *data)
1549
+{
1550
+ struct mlx5_ib_pf_eq *eq =
1551
+ container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
1552
+ unsigned long flags;
1553
+
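+ /*
+ * Notifier callback invoked from EQ interrupt handling: process the
+ * EQ directly if the lock is free, otherwise defer to the workqueue.
+ */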
1554
+ if (spin_trylock_irqsave(&eq->lock, flags)) {
1555
+ mlx5_ib_eq_pf_process(eq);
1556
+ spin_unlock_irqrestore(&eq->lock, flags);
1557
+ } else {
1558
+ schedule_work(&eq->work);
1559
+ }
1560
+
1561
+ return IRQ_HANDLED;
1562
+}
1563
+
1564
+/* mempool_refill() was proposed but unfortunately wasn't accepted
1565
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
1566
+ * This open-coded refill is a cheap workaround.
1567
+ */
1568
+static void mempool_refill(mempool_t *pool)
1569
+{
1570
+ while (pool->curr_nr < pool->min_nr)
1571
+ mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
1572
+}
1573
+
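+/*
+ * Process-context worker: top up the pagefault mempool, then drain the EQ
+ * with the lock held and interrupts disabled.
+ */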
1574
+static void mlx5_ib_eq_pf_action(struct work_struct *work)
1575
+{
1576
+ struct mlx5_ib_pf_eq *eq =
1577
+ container_of(work, struct mlx5_ib_pf_eq, work);
1578
+
1579
+ mempool_refill(eq->pool);
1580
+
1581
+ spin_lock_irq(&eq->lock);
1582
+ mlx5_ib_eq_pf_process(eq);
1583
+ spin_unlock_irq(&eq->lock);
1584
+}
1585
+
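+/* EQ depth and the number of pre-allocated pagefault descriptors. */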
1586
+enum {
1587
+ MLX5_IB_NUM_PF_EQE = 0x1000,
1588
+ MLX5_IB_NUM_PF_DRAIN = 64,
1589
+};
1590
+
1591
+static int
1592
+mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1593
+{
1594
+ struct mlx5_eq_param param = {};
1595
+ int err;
1596
+
1597
+ INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
1598
+ spin_lock_init(&eq->lock);
1599
+ eq->dev = dev;
1600
+
1601
+ eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
1602
+ sizeof(struct mlx5_pagefault));
1603
+ if (!eq->pool)
1604
+ return -ENOMEM;
1605
+
1606
+ eq->wq = alloc_workqueue("mlx5_ib_page_fault",
1607
+ WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
1608
+ MLX5_NUM_CMD_EQE);
1609
+ if (!eq->wq) {
1610
+ err = -ENOMEM;
1611
+ goto err_mempool;
1612
+ }
1613
+
1614
+ eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
1615
+ param = (struct mlx5_eq_param) {
1616
+ .irq_index = 0,
1617
+ .nent = MLX5_IB_NUM_PF_EQE,
1618
+ };
1619
+ param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
1620
+ eq->core = mlx5_eq_create_generic(dev->mdev, &param);
1621
+ if (IS_ERR(eq->core)) {
1622
+ err = PTR_ERR(eq->core);
1623
+ goto err_wq;
1624
+ }
1625
+ err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
1626
+ if (err) {
1627
+ mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
1628
+ goto err_eq;
1629
+ }
1630
+
1631
+ return 0;
1632
+err_eq:
1633
+ mlx5_eq_destroy_generic(dev->mdev, eq->core);
1634
+err_wq:
1635
+ destroy_workqueue(eq->wq);
1636
+err_mempool:
1637
+ mempool_destroy(eq->pool);
1638
+ return err;
1639
+}
1640
+
1641
+static int
1642
+mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1643
+{
1644
+ int err;
1645
+
1646
+ mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
1647
+ err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
1648
+ cancel_work_sync(&eq->work);
1649
+ destroy_workqueue(eq->wq);
1650
+ mempool_destroy(eq->pool);
1651
+
1652
+ return err;
11801653 }
11811654
11821655 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
....@@ -1205,9 +1678,18 @@
12051678 }
12061679 }
12071680
1681
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
1682
+ .advise_mr = mlx5_ib_advise_mr,
1683
+};
1684
+
12081685 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
12091686 {
1210
- int ret;
1687
+ int ret = 0;
1688
+
1689
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1690
+ return ret;
1691
+
1692
+ ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
12111693
12121694 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
12131695 ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
....@@ -1217,7 +1699,17 @@
12171699 }
12181700 }
12191701
1220
- return 0;
1702
+ ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);
1703
+
1704
+ return ret;
1705
+}
1706
+
1707
+void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
1708
+{
1709
+ if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1710
+ return;
1711
+
1712
+ mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
12211713 }
12221714
12231715 int mlx5_ib_odp_init(void)
....@@ -1228,3 +1720,175 @@
12281720 return 0;
12291721 }
12301722
1723
+struct prefetch_mr_work {
1724
+ struct work_struct work;
1725
+ u32 pf_flags;
1726
+ u32 num_sge;
1727
+ struct {
1728
+ u64 io_virt;
1729
+ struct mlx5_ib_mr *mr;
1730
+ size_t length;
1731
+ } frags[];
1732
+};
1733
+
1734
+static void destroy_prefetch_work(struct prefetch_mr_work *work)
1735
+{
1736
+ u32 i;
1737
+
1738
+ for (i = 0; i < work->num_sge; ++i)
1739
+ if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
1740
+ wake_up(&work->frags[i].mr->q_deferred_work);
1741
+ kvfree(work);
1742
+}
1743
+
1744
+static struct mlx5_ib_mr *
1745
+get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
1746
+ u32 lkey)
1747
+{
1748
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
1749
+ struct mlx5_core_mkey *mmkey;
1750
+ struct ib_umem_odp *odp;
1751
+ struct mlx5_ib_mr *mr;
1752
+
1753
+ lockdep_assert_held(&dev->odp_srcu);
1754
+
1755
+ mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
1756
+ if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
1757
+ return NULL;
1758
+
1759
+ mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
1760
+
1761
+ if (mr->ibmr.pd != pd)
1762
+ return NULL;
1763
+
1764
+ odp = to_ib_umem_odp(mr->umem);
1765
+
1766
+ /* prefetch with write-access must be supported by the MR */
1767
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1768
+ !odp->umem.writable)
1769
+ return NULL;
1770
+
1771
+ return mr;
1772
+}
1773
+
1774
+static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
1775
+{
1776
+ struct prefetch_mr_work *work =
1777
+ container_of(w, struct prefetch_mr_work, work);
1778
+ struct mlx5_ib_dev *dev;
1779
+ u32 bytes_mapped = 0;
1780
+ int srcu_key;
1781
+ int ret;
1782
+ u32 i;
1783
+
1784
+ /* We rely on IB/core to execute this work only when num_sge != 0. */
1785
+ WARN_ON(!work->num_sge);
1786
+ dev = work->frags[0].mr->dev;
1787
+ /* SRCU should be held when calling mlx5_odp_populate_xlt() */
1788
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
1789
+ for (i = 0; i < work->num_sge; ++i) {
1790
+ ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
1791
+ work->frags[i].length, &bytes_mapped,
1792
+ work->pf_flags);
1793
+ if (ret <= 0)
1794
+ continue;
1795
+ mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
1796
+ }
1797
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
1798
+
1799
+ destroy_prefetch_work(work);
1800
+}
1801
+
1802
+static bool init_prefetch_work(struct ib_pd *pd,
1803
+ enum ib_uverbs_advise_mr_advice advice,
1804
+ u32 pf_flags, struct prefetch_mr_work *work,
1805
+ struct ib_sge *sg_list, u32 num_sge)
1806
+{
1807
+ u32 i;
1808
+
1809
+ INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
1810
+ work->pf_flags = pf_flags;
1811
+
1812
+ for (i = 0; i < num_sge; ++i) {
1813
+ work->frags[i].io_virt = sg_list[i].addr;
1814
+ work->frags[i].length = sg_list[i].length;
1815
+ work->frags[i].mr =
1816
+ get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1817
+ if (!work->frags[i].mr) {
1818
+ work->num_sge = i;
1819
+ return false;
1820
+ }
1821
+
1822
+ /* Keep the MR pointer valid outside the SRCU read lock */
1823
+ atomic_inc(&work->frags[i].mr->num_deferred_work);
1824
+ }
1825
+ work->num_sge = num_sge;
1826
+ return true;
1827
+}
1828
+
1829
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
1830
+ enum ib_uverbs_advise_mr_advice advice,
1831
+ u32 pf_flags, struct ib_sge *sg_list,
1832
+ u32 num_sge)
1833
+{
1834
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
1835
+ u32 bytes_mapped = 0;
1836
+ int srcu_key;
1837
+ int ret = 0;
1838
+ u32 i;
1839
+
1840
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
1841
+ for (i = 0; i < num_sge; ++i) {
1842
+ struct mlx5_ib_mr *mr;
1843
+
1844
+ mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1845
+ if (!mr) {
1846
+ ret = -ENOENT;
1847
+ goto out;
1848
+ }
1849
+ ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
1850
+ &bytes_mapped, pf_flags);
1851
+ if (ret < 0)
1852
+ goto out;
1853
+ mlx5_update_odp_stats(mr, prefetch, ret);
1854
+ }
1855
+ ret = 0;
1856
+
1857
+out:
1858
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
1859
+ return ret;
1860
+}
1861
+
1862
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
1863
+ enum ib_uverbs_advise_mr_advice advice,
1864
+ u32 flags, struct ib_sge *sg_list, u32 num_sge)
1865
+{
1866
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
1867
+ u32 pf_flags = 0;
1868
+ struct prefetch_mr_work *work;
1869
+ int srcu_key;
1870
+
1871
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
1872
+ pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
1873
+
1874
+ if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1875
+ pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
1876
+
1877
+ if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
1878
+ return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
1879
+ num_sge);
1880
+
1881
+ work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
1882
+ if (!work)
1883
+ return -ENOMEM;
1884
+
1885
+ srcu_key = srcu_read_lock(&dev->odp_srcu);
1886
+ if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
1887
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
1888
+ destroy_prefetch_work(work);
1889
+ return -EINVAL;
1890
+ }
1891
+ queue_work(system_unbound_wq, &work->work);
1892
+ srcu_read_unlock(&dev->odp_srcu, srcu_key);
1893
+ return 0;
1894
+}