2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/drivers/infiniband/hw/mlx5/mr.c
@@ -47,10 +47,69 @@
 
 #define MLX5_UMR_ALIGN 2048
 
+static void
+create_mkey_callback(int status, struct mlx5_async_work *context);
+
+static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
+					  struct ib_pd *pd)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
+	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
+	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
+	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
+	MLX5_SET(mkc, mkc, lr, 1);
+
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
+		MLX5_SET(mkc, mkc, relaxed_ordering_write,
+			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
+		MLX5_SET(mkc, mkc, relaxed_ordering_read,
+			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
+
+	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET64(mkc, mkc, start_addr, start_addr);
+}
+
+static void
+assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
+		    u32 *in)
+{
+	u8 key = atomic_inc_return(&dev->mkey_var);
+	void *mkc;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, mkey_7_0, key);
+	mkey->key = key;
+}
+
+static int
+mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
+		    u32 *in, int inlen)
+{
+	assign_mkey_variant(dev, mkey, in);
+	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
+}
+
+static int
+mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
+		       struct mlx5_core_mkey *mkey,
+		       struct mlx5_async_ctx *async_ctx,
+		       u32 *in, int inlen, u32 *out, int outlen,
+		       struct mlx5_async_work *context)
+{
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	assign_mkey_variant(dev, mkey, in);
+	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
+				create_mkey_callback, context);
+}
+
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 {
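Illustrative sketch (not part of the patch above): the new assign_mkey_variant()/mlx5_ib_create_mkey() helpers in this hunk stamp an atomically incremented 8-bit "variant" into the low byte of each mkey, so a recycled hardware mkey index comes back under a different key value. The following minimal user-space model of that idea uses hypothetical simplified types (fake_dev, make_mkey) rather than the real mlx5 structures.

/*
 * Sketch only: mimics the rolling 8-bit mkey variant from assign_mkey_variant().
 * dev->mkey_var stands in for the driver's per-device atomic counter.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_dev {
	atomic_uint mkey_var;	/* counterpart of dev->mkey_var in the patch */
};

/* Combine a firmware-assigned index (upper bits) with the variant byte. */
static uint32_t make_mkey(struct fake_dev *dev, uint32_t hw_index)
{
	uint8_t variant = (uint8_t)(atomic_fetch_add(&dev->mkey_var, 1) + 1);

	return (hw_index << 8) | variant;
}

int main(void)
{
	struct fake_dev dev = { .mkey_var = 0 };

	/* The same hardware index used twice yields two distinct mkeys. */
	printf("first  use of index 0x42: 0x%08x\n", (unsigned)make_mkey(&dev, 0x42));
	printf("second use of index 0x42: 0x%08x\n", (unsigned)make_mkey(&dev, 0x42));
	return 0;
}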
@@ -59,113 +118,79 @@
59118
60119 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
61120 {
62
- int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
121
+ WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
63122
64
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
65
- /* Wait until all page fault handlers using the mr complete. */
66
- synchronize_srcu(&dev->mr_srcu);
67
-#endif
68
-
69
- return err;
123
+ return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
70124 }
71125
72
-static int order2idx(struct mlx5_ib_dev *dev, int order)
73
-{
74
- struct mlx5_mr_cache *cache = &dev->cache;
75
-
76
- if (order < cache->ent[0].order)
77
- return 0;
78
- else
79
- return order - cache->ent[0].order;
80
-}
81
-
82
-static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
126
+static inline bool mlx5_ib_pas_fits_in_mr(struct mlx5_ib_mr *mr, u64 start,
127
+ u64 length)
83128 {
84129 return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
85130 length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
86131 }
87132
88
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
89
-static void update_odp_mr(struct mlx5_ib_mr *mr)
133
+static void create_mkey_callback(int status, struct mlx5_async_work *context)
90134 {
91
- if (mr->umem->odp_data) {
92
- /*
93
- * This barrier prevents the compiler from moving the
94
- * setting of umem->odp_data->private to point to our
95
- * MR, before reg_umr finished, to ensure that the MR
96
- * initialization have finished before starting to
97
- * handle invalidations.
98
- */
99
- smp_wmb();
100
- mr->umem->odp_data->private = mr;
101
- /*
102
- * Make sure we will see the new
103
- * umem->odp_data->private value in the invalidation
104
- * routines, before we can get page faults on the
105
- * MR. Page faults can happen once we put the MR in
106
- * the tree, below this line. Without the barrier,
107
- * there can be a fault handling and an invalidation
108
- * before umem->odp_data->private == mr is visible to
109
- * the invalidation handler.
110
- */
111
- smp_wmb();
112
- }
113
-}
114
-#endif
115
-
116
-static void reg_mr_callback(int status, void *context)
117
-{
118
- struct mlx5_ib_mr *mr = context;
135
+ struct mlx5_ib_mr *mr =
136
+ container_of(context, struct mlx5_ib_mr, cb_work);
119137 struct mlx5_ib_dev *dev = mr->dev;
120
- struct mlx5_mr_cache *cache = &dev->cache;
121
- int c = order2idx(dev, mr->order);
122
- struct mlx5_cache_ent *ent = &cache->ent[c];
123
- u8 key;
138
+ struct mlx5_cache_ent *ent = mr->cache_ent;
124139 unsigned long flags;
125
- struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
126
- int err;
127140
128
- spin_lock_irqsave(&ent->lock, flags);
129
- ent->pending--;
130
- spin_unlock_irqrestore(&ent->lock, flags);
131141 if (status) {
132142 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
133143 kfree(mr);
134
- dev->fill_delay = 1;
144
+ spin_lock_irqsave(&ent->lock, flags);
145
+ ent->pending--;
146
+ WRITE_ONCE(dev->fill_delay, 1);
147
+ spin_unlock_irqrestore(&ent->lock, flags);
135148 mod_timer(&dev->delay_timer, jiffies + HZ);
136149 return;
137150 }
138151
139152 mr->mmkey.type = MLX5_MKEY_MR;
140
- spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
141
- key = dev->mdev->priv.mkey_key++;
142
- spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
143
- mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
153
+ mr->mmkey.key |= mlx5_idx_to_mkey(
154
+ MLX5_GET(create_mkey_out, mr->out, mkey_index));
144155
145
- cache->last_add = jiffies;
156
+ WRITE_ONCE(dev->cache.last_add, jiffies);
146157
147158 spin_lock_irqsave(&ent->lock, flags);
148159 list_add_tail(&mr->list, &ent->head);
149
- ent->cur++;
150
- ent->size++;
160
+ ent->available_mrs++;
161
+ ent->total_mrs++;
162
+ /* If we are doing fill_to_high_water then keep going. */
163
+ queue_adjust_cache_locked(ent);
164
+ ent->pending--;
151165 spin_unlock_irqrestore(&ent->lock, flags);
152
-
153
- write_lock_irqsave(&table->lock, flags);
154
- err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
155
- &mr->mmkey);
156
- if (err)
157
- pr_err("Error inserting to mkey tree. 0x%x\n", -err);
158
- write_unlock_irqrestore(&table->lock, flags);
159
-
160
- if (!completion_done(&ent->compl))
161
- complete(&ent->compl);
162166 }
163167
164
-static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
168
+static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
165169 {
166
- struct mlx5_mr_cache *cache = &dev->cache;
167
- struct mlx5_cache_ent *ent = &cache->ent[c];
168
- int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
170
+ struct mlx5_ib_mr *mr;
171
+
172
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
173
+ if (!mr)
174
+ return NULL;
175
+ mr->order = ent->order;
176
+ mr->cache_ent = ent;
177
+ mr->dev = ent->dev;
178
+
179
+ set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
180
+ MLX5_SET(mkc, mkc, free, 1);
181
+ MLX5_SET(mkc, mkc, umr_en, 1);
182
+ MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
183
+ MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
184
+
185
+ MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
186
+ MLX5_SET(mkc, mkc, log_page_size, ent->page);
187
+ return mr;
188
+}
189
+
190
+/* Asynchronously schedule new MRs to be populated in the cache. */
191
+static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
192
+{
193
+ size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
169194 struct mlx5_ib_mr *mr;
170195 void *mkc;
171196 u32 *in;
@@ -178,42 +203,29 @@
178203
179204 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
180205 for (i = 0; i < num; i++) {
181
- if (ent->pending >= MAX_PENDING_REG_MR) {
182
- err = -EAGAIN;
183
- break;
184
- }
185
-
186
- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
206
+ mr = alloc_cache_mr(ent, mkc);
187207 if (!mr) {
188208 err = -ENOMEM;
189209 break;
190210 }
191
- mr->order = ent->order;
192
- mr->allocated_from_cache = 1;
193
- mr->dev = dev;
194
-
195
- MLX5_SET(mkc, mkc, free, 1);
196
- MLX5_SET(mkc, mkc, umr_en, 1);
197
- MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
198
- MLX5_SET(mkc, mkc, access_mode_4_2,
199
- (ent->access_mode >> 2) & 0x7);
200
-
201
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
202
- MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
203
- MLX5_SET(mkc, mkc, log_page_size, ent->page);
204
-
205211 spin_lock_irq(&ent->lock);
212
+ if (ent->pending >= MAX_PENDING_REG_MR) {
213
+ err = -EAGAIN;
214
+ spin_unlock_irq(&ent->lock);
215
+ kfree(mr);
216
+ break;
217
+ }
206218 ent->pending++;
207219 spin_unlock_irq(&ent->lock);
208
- err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
209
- in, inlen,
210
- mr->out, sizeof(mr->out),
211
- reg_mr_callback, mr);
220
+ err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
221
+ &ent->dev->async_ctx, in, inlen,
222
+ mr->out, sizeof(mr->out),
223
+ &mr->cb_work);
212224 if (err) {
213225 spin_lock_irq(&ent->lock);
214226 ent->pending--;
215227 spin_unlock_irq(&ent->lock);
216
- mlx5_ib_warn(dev, "create mkey failed %d\n", err);
228
+ mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
217229 kfree(mr);
218230 break;
219231 }
@@ -223,36 +235,89 @@
223235 return err;
224236 }
225237
226
-static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
238
+/* Synchronously create a MR in the cache */
239
+static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
227240 {
228
- struct mlx5_mr_cache *cache = &dev->cache;
229
- struct mlx5_cache_ent *ent = &cache->ent[c];
230
- struct mlx5_ib_mr *tmp_mr;
241
+ size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
231242 struct mlx5_ib_mr *mr;
232
- LIST_HEAD(del_list);
233
- int i;
243
+ void *mkc;
244
+ u32 *in;
245
+ int err;
234246
235
- for (i = 0; i < num; i++) {
236
- spin_lock_irq(&ent->lock);
237
- if (list_empty(&ent->head)) {
238
- spin_unlock_irq(&ent->lock);
239
- break;
240
- }
241
- mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
242
- list_move(&mr->list, &del_list);
243
- ent->cur--;
244
- ent->size--;
245
- spin_unlock_irq(&ent->lock);
246
- mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
247
+ in = kzalloc(inlen, GFP_KERNEL);
248
+ if (!in)
249
+ return ERR_PTR(-ENOMEM);
250
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
251
+
252
+ mr = alloc_cache_mr(ent, mkc);
253
+ if (!mr) {
254
+ err = -ENOMEM;
255
+ goto free_in;
247256 }
248257
249
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
250
- synchronize_srcu(&dev->mr_srcu);
251
-#endif
258
+ err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
259
+ if (err)
260
+ goto free_mr;
252261
253
- list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
254
- list_del(&mr->list);
255
- kfree(mr);
262
+ mr->mmkey.type = MLX5_MKEY_MR;
263
+ WRITE_ONCE(ent->dev->cache.last_add, jiffies);
264
+ spin_lock_irq(&ent->lock);
265
+ ent->total_mrs++;
266
+ spin_unlock_irq(&ent->lock);
267
+ kfree(in);
268
+ return mr;
269
+free_mr:
270
+ kfree(mr);
271
+free_in:
272
+ kfree(in);
273
+ return ERR_PTR(err);
274
+}
275
+
276
+static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
277
+{
278
+ struct mlx5_ib_mr *mr;
279
+
280
+ lockdep_assert_held(&ent->lock);
281
+ if (list_empty(&ent->head))
282
+ return;
283
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
284
+ list_del(&mr->list);
285
+ ent->available_mrs--;
286
+ ent->total_mrs--;
287
+ spin_unlock_irq(&ent->lock);
288
+ mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
289
+ kfree(mr);
290
+ spin_lock_irq(&ent->lock);
291
+}
292
+
293
+static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
294
+ bool limit_fill)
295
+{
296
+ int err;
297
+
298
+ lockdep_assert_held(&ent->lock);
299
+
300
+ while (true) {
301
+ if (limit_fill)
302
+ target = ent->limit * 2;
303
+ if (target == ent->available_mrs + ent->pending)
304
+ return 0;
305
+ if (target > ent->available_mrs + ent->pending) {
306
+ u32 todo = target - (ent->available_mrs + ent->pending);
307
+
308
+ spin_unlock_irq(&ent->lock);
309
+ err = add_keys(ent, todo);
310
+ if (err == -EAGAIN)
311
+ usleep_range(3000, 5000);
312
+ spin_lock_irq(&ent->lock);
313
+ if (err) {
314
+ if (err != -EAGAIN)
315
+ return err;
316
+ } else
317
+ return 0;
318
+ } else {
319
+ remove_cache_mr_locked(ent);
320
+ }
256321 }
257322 }
258323
@@ -260,37 +325,38 @@
260325 size_t count, loff_t *pos)
261326 {
262327 struct mlx5_cache_ent *ent = filp->private_data;
263
- struct mlx5_ib_dev *dev = ent->dev;
264
- char lbuf[20] = {0};
265
- u32 var;
328
+ u32 target;
266329 int err;
267
- int c;
268330
269
- count = min(count, sizeof(lbuf) - 1);
270
- if (copy_from_user(lbuf, buf, count))
271
- return -EFAULT;
331
+ err = kstrtou32_from_user(buf, count, 0, &target);
332
+ if (err)
333
+ return err;
272334
273
- c = order2idx(dev, ent->order);
274
-
275
- if (sscanf(lbuf, "%u", &var) != 1)
276
- return -EINVAL;
277
-
278
- if (var < ent->limit)
279
- return -EINVAL;
280
-
281
- if (var > ent->size) {
282
- do {
283
- err = add_keys(dev, c, var - ent->size);
284
- if (err && err != -EAGAIN)
285
- return err;
286
-
287
- usleep_range(3000, 5000);
288
- } while (err);
289
- } else if (var < ent->size) {
290
- remove_keys(dev, c, ent->size - var);
335
+ /*
336
+ * Target is the new value of total_mrs the user requests, however we
337
+ * cannot free MRs that are in use. Compute the target value for
338
+ * available_mrs.
339
+ */
340
+ spin_lock_irq(&ent->lock);
341
+ if (target < ent->total_mrs - ent->available_mrs) {
342
+ err = -EINVAL;
343
+ goto err_unlock;
291344 }
345
+ target = target - (ent->total_mrs - ent->available_mrs);
346
+ if (target < ent->limit || target > ent->limit*2) {
347
+ err = -EINVAL;
348
+ goto err_unlock;
349
+ }
350
+ err = resize_available_mrs(ent, target, false);
351
+ if (err)
352
+ goto err_unlock;
353
+ spin_unlock_irq(&ent->lock);
292354
293355 return count;
356
+
357
+err_unlock:
358
+ spin_unlock_irq(&ent->lock);
359
+ return err;
294360 }
295361
296362 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
@@ -300,7 +366,7 @@
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
 	if (err < 0)
 		return err;
 
@@ -318,32 +384,23 @@
318384 size_t count, loff_t *pos)
319385 {
320386 struct mlx5_cache_ent *ent = filp->private_data;
321
- struct mlx5_ib_dev *dev = ent->dev;
322
- char lbuf[20] = {0};
323387 u32 var;
324388 int err;
325
- int c;
326389
327
- count = min(count, sizeof(lbuf) - 1);
328
- if (copy_from_user(lbuf, buf, count))
329
- return -EFAULT;
390
+ err = kstrtou32_from_user(buf, count, 0, &var);
391
+ if (err)
392
+ return err;
330393
331
- c = order2idx(dev, ent->order);
332
-
333
- if (sscanf(lbuf, "%u", &var) != 1)
334
- return -EINVAL;
335
-
336
- if (var > ent->size)
337
- return -EINVAL;
338
-
394
+ /*
395
+ * Upon set we immediately fill the cache to high water mark implied by
396
+ * the limit.
397
+ */
398
+ spin_lock_irq(&ent->lock);
339399 ent->limit = var;
340
-
341
- if (ent->cur < ent->limit) {
342
- err = add_keys(dev, c, 2 * ent->limit - ent->cur);
343
- if (err)
344
- return err;
345
- }
346
-
400
+ err = resize_available_mrs(ent, 0, true);
401
+ spin_unlock_irq(&ent->lock);
402
+ if (err)
403
+ return err;
347404 return count;
348405 }
349406
@@ -368,68 +425,121 @@
368425 .read = limit_read,
369426 };
370427
371
-static int someone_adding(struct mlx5_mr_cache *cache)
428
+static bool someone_adding(struct mlx5_mr_cache *cache)
372429 {
373
- int i;
430
+ unsigned int i;
374431
375432 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
376
- if (cache->ent[i].cur < cache->ent[i].limit)
377
- return 1;
378
- }
433
+ struct mlx5_cache_ent *ent = &cache->ent[i];
434
+ bool ret;
379435
380
- return 0;
436
+ spin_lock_irq(&ent->lock);
437
+ ret = ent->available_mrs < ent->limit;
438
+ spin_unlock_irq(&ent->lock);
439
+ if (ret)
440
+ return true;
441
+ }
442
+ return false;
443
+}
444
+
445
+/*
446
+ * Check if the bucket is outside the high/low water mark and schedule an async
447
+ * update. The cache refill has hysteresis, once the low water mark is hit it is
448
+ * refilled up to the high mark.
449
+ */
450
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
451
+{
452
+ lockdep_assert_held(&ent->lock);
453
+
454
+ if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
455
+ return;
456
+ if (ent->available_mrs < ent->limit) {
457
+ ent->fill_to_high_water = true;
458
+ queue_work(ent->dev->cache.wq, &ent->work);
459
+ } else if (ent->fill_to_high_water &&
460
+ ent->available_mrs + ent->pending < 2 * ent->limit) {
461
+ /*
462
+ * Once we start populating due to hitting a low water mark
463
+ * continue until we pass the high water mark.
464
+ */
465
+ queue_work(ent->dev->cache.wq, &ent->work);
466
+ } else if (ent->available_mrs == 2 * ent->limit) {
467
+ ent->fill_to_high_water = false;
468
+ } else if (ent->available_mrs > 2 * ent->limit) {
469
+ /* Queue deletion of excess entries */
470
+ ent->fill_to_high_water = false;
471
+ if (ent->pending)
472
+ queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
473
+ msecs_to_jiffies(1000));
474
+ else
475
+ queue_work(ent->dev->cache.wq, &ent->work);
476
+ }
381477 }
382478
383479 static void __cache_work_func(struct mlx5_cache_ent *ent)
384480 {
385481 struct mlx5_ib_dev *dev = ent->dev;
386482 struct mlx5_mr_cache *cache = &dev->cache;
387
- int i = order2idx(dev, ent->order);
388483 int err;
389484
390
- if (cache->stopped)
391
- return;
485
+ spin_lock_irq(&ent->lock);
486
+ if (ent->disabled)
487
+ goto out;
392488
393
- ent = &dev->cache.ent[i];
394
- if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
395
- err = add_keys(dev, i, 1);
396
- if (ent->cur < 2 * ent->limit) {
397
- if (err == -EAGAIN) {
398
- mlx5_ib_dbg(dev, "returned eagain, order %d\n",
399
- i + 2);
400
- queue_delayed_work(cache->wq, &ent->dwork,
401
- msecs_to_jiffies(3));
402
- } else if (err) {
403
- mlx5_ib_warn(dev, "command failed order %d, err %d\n",
404
- i + 2, err);
489
+ if (ent->fill_to_high_water &&
490
+ ent->available_mrs + ent->pending < 2 * ent->limit &&
491
+ !READ_ONCE(dev->fill_delay)) {
492
+ spin_unlock_irq(&ent->lock);
493
+ err = add_keys(ent, 1);
494
+ spin_lock_irq(&ent->lock);
495
+ if (ent->disabled)
496
+ goto out;
497
+ if (err) {
498
+ /*
499
+ * EAGAIN only happens if pending is positive, so we
500
+ * will be rescheduled from reg_mr_callback(). The only
501
+ * failure path here is ENOMEM.
502
+ */
503
+ if (err != -EAGAIN) {
504
+ mlx5_ib_warn(
505
+ dev,
506
+ "command failed order %d, err %d\n",
507
+ ent->order, err);
405508 queue_delayed_work(cache->wq, &ent->dwork,
406509 msecs_to_jiffies(1000));
407
- } else {
408
- queue_work(cache->wq, &ent->work);
409510 }
410511 }
411
- } else if (ent->cur > 2 * ent->limit) {
512
+ } else if (ent->available_mrs > 2 * ent->limit) {
513
+ bool need_delay;
514
+
412515 /*
413
- * The remove_keys() logic is performed as garbage collection
414
- * task. Such task is intended to be run when no other active
415
- * processes are running.
516
+ * The remove_cache_mr() logic is performed as garbage
517
+ * collection task. Such task is intended to be run when no
518
+ * other active processes are running.
416519 *
417520 * The need_resched() will return TRUE if there are user tasks
418521 * to be activated in near future.
419522 *
420
- * In such case, we don't execute remove_keys() and postpone
421
- * the garbage collection work to try to run in next cycle,
422
- * in order to free CPU resources to other tasks.
523
+ * In such case, we don't execute remove_cache_mr() and postpone
524
+ * the garbage collection work to try to run in next cycle, in
525
+ * order to free CPU resources to other tasks.
423526 */
424
- if (!need_resched() && !someone_adding(cache) &&
425
- time_after(jiffies, cache->last_add + 300 * HZ)) {
426
- remove_keys(dev, i, 1);
427
- if (ent->cur > ent->limit)
428
- queue_work(cache->wq, &ent->work);
429
- } else {
527
+ spin_unlock_irq(&ent->lock);
528
+ need_delay = need_resched() || someone_adding(cache) ||
529
+ !time_after(jiffies,
530
+ READ_ONCE(cache->last_add) + 300 * HZ);
531
+ spin_lock_irq(&ent->lock);
532
+ if (ent->disabled)
533
+ goto out;
534
+ if (need_delay) {
430535 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
536
+ goto out;
431537 }
538
+ remove_cache_mr_locked(ent);
539
+ queue_adjust_cache_locked(ent);
432540 }
541
+out:
542
+ spin_unlock_irq(&ent->lock);
433543 }
434544
435545 static void delayed_cache_work_func(struct work_struct *work)
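Illustrative sketch (not part of the patch): queue_adjust_cache_locked() and __cache_work_func() added in the hunk above give each cache bucket refill hysteresis — filling starts when available_mrs drops below ent->limit, stays latched until available_mrs + pending reaches 2 * limit, and a shrink is queued once the bucket exceeds 2 * limit. The self-contained model below covers only that decision logic, with hypothetical simplified names; the disabled/fill_delay checks and the pending-based delayed-work choice are omitted.

/* Sketch only: models the high/low water-mark decision per cache bucket. */
#include <stdbool.h>
#include <stdio.h>

struct ent_state {
	unsigned int available;   /* MRs sitting in the bucket (available_mrs) */
	unsigned int pending;     /* async creations still in flight */
	unsigned int limit;       /* low water mark (ent->limit) */
	bool fill_to_high_water;  /* latched once the low mark is hit */
};

enum action { DO_NOTHING, QUEUE_FILL, QUEUE_SHRINK };

static enum action adjust(struct ent_state *e)
{
	if (e->available < e->limit) {
		e->fill_to_high_water = true;   /* hit the low mark: start filling */
		return QUEUE_FILL;
	}
	if (e->fill_to_high_water &&
	    e->available + e->pending < 2 * e->limit)
		return QUEUE_FILL;              /* keep going until the high mark */
	if (e->available == 2 * e->limit) {
		e->fill_to_high_water = false;  /* high mark reached, stop */
		return DO_NOTHING;
	}
	if (e->available > 2 * e->limit) {
		e->fill_to_high_water = false;
		return QUEUE_SHRINK;            /* garbage-collect the excess */
	}
	return DO_NOTHING;
}

int main(void)
{
	struct ent_state e = { .available = 3, .pending = 0, .limit = 8 };

	printf("below low mark  -> %d\n", adjust(&e)); /* QUEUE_FILL */
	e.available = 12;
	printf("between marks   -> %d\n", adjust(&e)); /* QUEUE_FILL (latched) */
	e.available = 16;
	printf("at high mark    -> %d\n", adjust(&e)); /* DO_NOTHING */
	e.available = 20;
	printf("above high mark -> %d\n", adjust(&e)); /* QUEUE_SHRINK */
	return 0;
}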
@@ -448,117 +558,103 @@
448558 __cache_work_func(ent);
449559 }
450560
451
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
561
+/* Allocate a special entry from the cache */
562
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
563
+ unsigned int entry, int access_flags)
452564 {
453565 struct mlx5_mr_cache *cache = &dev->cache;
454566 struct mlx5_cache_ent *ent;
455567 struct mlx5_ib_mr *mr;
456
- int err;
457568
458
- if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
459
- mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
569
+ if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
570
+ entry >= ARRAY_SIZE(cache->ent)))
460571 return ERR_PTR(-EINVAL);
461
- }
572
+
573
+ /* Matches access in alloc_cache_mr() */
574
+ if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
575
+ return ERR_PTR(-EOPNOTSUPP);
462576
463577 ent = &cache->ent[entry];
464
- while (1) {
465
- spin_lock_irq(&ent->lock);
466
- if (list_empty(&ent->head)) {
467
- spin_unlock_irq(&ent->lock);
468
-
469
- err = add_keys(dev, entry, 1);
470
- if (err && err != -EAGAIN)
471
- return ERR_PTR(err);
472
-
473
- wait_for_completion(&ent->compl);
474
- } else {
475
- mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
476
- list);
477
- list_del(&mr->list);
478
- ent->cur--;
479
- spin_unlock_irq(&ent->lock);
480
- if (ent->cur < ent->limit)
481
- queue_work(cache->wq, &ent->work);
578
+ spin_lock_irq(&ent->lock);
579
+ if (list_empty(&ent->head)) {
580
+ queue_adjust_cache_locked(ent);
581
+ ent->miss++;
582
+ spin_unlock_irq(&ent->lock);
583
+ mr = create_cache_mr(ent);
584
+ if (IS_ERR(mr))
482585 return mr;
483
- }
586
+ } else {
587
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
588
+ list_del(&mr->list);
589
+ ent->available_mrs--;
590
+ queue_adjust_cache_locked(ent);
591
+ spin_unlock_irq(&ent->lock);
484592 }
593
+ mr->access_flags = access_flags;
594
+ return mr;
485595 }
486596
487
-static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
597
+/* Return a MR already available in the cache */
598
+static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
488599 {
489
- struct mlx5_mr_cache *cache = &dev->cache;
600
+ struct mlx5_ib_dev *dev = req_ent->dev;
490601 struct mlx5_ib_mr *mr = NULL;
491
- struct mlx5_cache_ent *ent;
492
- int last_umr_cache_entry;
493
- int c;
494
- int i;
602
+ struct mlx5_cache_ent *ent = req_ent;
495603
496
- c = order2idx(dev, order);
497
- last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev));
498
- if (c < 0 || c > last_umr_cache_entry) {
499
- mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
500
- return NULL;
501
- }
502
-
503
- for (i = c; i <= last_umr_cache_entry; i++) {
504
- ent = &cache->ent[i];
505
-
506
- mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
604
+ /* Try larger MR pools from the cache to satisfy the allocation */
605
+ for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
606
+ mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
607
+ ent - dev->cache.ent);
507608
508609 spin_lock_irq(&ent->lock);
509610 if (!list_empty(&ent->head)) {
510611 mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
511612 list);
512613 list_del(&mr->list);
513
- ent->cur--;
614
+ ent->available_mrs--;
615
+ queue_adjust_cache_locked(ent);
514616 spin_unlock_irq(&ent->lock);
515
- if (ent->cur < ent->limit)
516
- queue_work(cache->wq, &ent->work);
517617 break;
518618 }
619
+ queue_adjust_cache_locked(ent);
519620 spin_unlock_irq(&ent->lock);
520
-
521
- queue_work(cache->wq, &ent->work);
522621 }
523622
524623 if (!mr)
525
- cache->ent[c].miss++;
624
+ req_ent->miss++;
526625
527626 return mr;
528627 }
529628
629
+static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
630
+{
631
+ struct mlx5_cache_ent *ent = mr->cache_ent;
632
+
633
+ mr->cache_ent = NULL;
634
+ spin_lock_irq(&ent->lock);
635
+ ent->total_mrs--;
636
+ spin_unlock_irq(&ent->lock);
637
+}
638
+
530639 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
531640 {
532
- struct mlx5_mr_cache *cache = &dev->cache;
533
- struct mlx5_cache_ent *ent;
534
- int shrink = 0;
535
- int c;
641
+ struct mlx5_cache_ent *ent = mr->cache_ent;
536642
537
- if (!mr->allocated_from_cache)
643
+ if (!ent)
538644 return;
539645
540
- c = order2idx(dev, mr->order);
541
- WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
542
-
543
- if (unreg_umr(dev, mr)) {
544
- mr->allocated_from_cache = false;
646
+ if (mlx5_mr_cache_invalidate(mr)) {
647
+ detach_mr_from_cache(mr);
545648 destroy_mkey(dev, mr);
546
- ent = &cache->ent[c];
547
- if (ent->cur < ent->limit)
548
- queue_work(cache->wq, &ent->work);
649
+ kfree(mr);
549650 return;
550651 }
551652
552
- ent = &cache->ent[c];
553653 spin_lock_irq(&ent->lock);
554654 list_add_tail(&mr->list, &ent->head);
555
- ent->cur++;
556
- if (ent->cur > 2 * ent->limit)
557
- shrink = 1;
655
+ ent->available_mrs++;
656
+ queue_adjust_cache_locked(ent);
558657 spin_unlock_irq(&ent->lock);
559
-
560
- if (shrink)
561
- queue_work(cache->wq, &ent->work);
562658 }
563659
564660 static void clean_keys(struct mlx5_ib_dev *dev, int c)
@@ -578,15 +674,11 @@
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_move(&mr->list, &del_list);
-		ent->cur--;
-		ent->size--;
+		ent->available_mrs--;
+		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
-
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	synchronize_srcu(&dev->mr_srcu);
-#endif
 
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
@@ -596,73 +688,47 @@
596688
597689 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
598690 {
599
- if (!mlx5_debugfs_root || dev->rep)
691
+ if (!mlx5_debugfs_root || dev->is_rep)
600692 return;
601693
602694 debugfs_remove_recursive(dev->cache.root);
603695 dev->cache.root = NULL;
604696 }
605697
606
-static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
698
+static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
607699 {
608700 struct mlx5_mr_cache *cache = &dev->cache;
609701 struct mlx5_cache_ent *ent;
702
+ struct dentry *dir;
610703 int i;
611704
612
- if (!mlx5_debugfs_root || dev->rep)
613
- return 0;
705
+ if (!mlx5_debugfs_root || dev->is_rep)
706
+ return;
614707
615708 cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
616
- if (!cache->root)
617
- return -ENOMEM;
618709
619710 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
620711 ent = &cache->ent[i];
621712 sprintf(ent->name, "%d", ent->order);
622
- ent->dir = debugfs_create_dir(ent->name, cache->root);
623
- if (!ent->dir)
624
- goto err;
625
-
626
- ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
627
- &size_fops);
628
- if (!ent->fsize)
629
- goto err;
630
-
631
- ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
632
- &limit_fops);
633
- if (!ent->flimit)
634
- goto err;
635
-
636
- ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
637
- &ent->cur);
638
- if (!ent->fcur)
639
- goto err;
640
-
641
- ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
642
- &ent->miss);
643
- if (!ent->fmiss)
644
- goto err;
713
+ dir = debugfs_create_dir(ent->name, cache->root);
714
+ debugfs_create_file("size", 0600, dir, ent, &size_fops);
715
+ debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
716
+ debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
717
+ debugfs_create_u32("miss", 0600, dir, &ent->miss);
645718 }
646
-
647
- return 0;
648
-err:
649
- mlx5_mr_cache_debugfs_cleanup(dev);
650
-
651
- return -ENOMEM;
652719 }
653720
654721 static void delay_time_func(struct timer_list *t)
655722 {
656723 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
657724
658
- dev->fill_delay = 0;
725
+ WRITE_ONCE(dev->fill_delay, 0);
659726 }
660727
661728 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
662729 {
663730 struct mlx5_mr_cache *cache = &dev->cache;
664731 struct mlx5_cache_ent *ent;
665
- int err;
666732 int i;
667733
668734 mutex_init(&dev->slow_path_mutex);
@@ -672,6 +738,7 @@
 		return -ENOMEM;
 	}
 
+	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
@@ -681,7 +748,6 @@
 		ent->dev = dev;
 		ent->limit = 0;
 
-		init_completion(&ent->compl);
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
@@ -698,70 +764,45 @@
698764 MLX5_IB_UMR_OCTOWORD;
699765 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
700766 if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
701
- !dev->rep &&
702
- mlx5_core_is_pf(dev->mdev))
767
+ !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
768
+ mlx5_ib_can_load_pas_with_umr(dev, 0))
703769 ent->limit = dev->mdev->profile->mr_cache[i].limit;
704770 else
705771 ent->limit = 0;
706
- queue_work(cache->wq, &ent->work);
772
+ spin_lock_irq(&ent->lock);
773
+ queue_adjust_cache_locked(ent);
774
+ spin_unlock_irq(&ent->lock);
707775 }
708776
709
- err = mlx5_mr_cache_debugfs_init(dev);
710
- if (err)
711
- mlx5_ib_warn(dev, "cache debugfs failure\n");
712
-
713
- /*
714
- * We don't want to fail driver if debugfs failed to initialize,
715
- * so we are not forwarding error to the user.
716
- */
777
+ mlx5_mr_cache_debugfs_init(dev);
717778
718779 return 0;
719780 }
720781
721
-static void wait_for_async_commands(struct mlx5_ib_dev *dev)
722
-{
723
- struct mlx5_mr_cache *cache = &dev->cache;
724
- struct mlx5_cache_ent *ent;
725
- int total = 0;
726
- int i;
727
- int j;
728
-
729
- for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
730
- ent = &cache->ent[i];
731
- for (j = 0 ; j < 1000; j++) {
732
- if (!ent->pending)
733
- break;
734
- msleep(50);
735
- }
736
- }
737
- for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
738
- ent = &cache->ent[i];
739
- total += ent->pending;
740
- }
741
-
742
- if (total)
743
- mlx5_ib_warn(dev, "aborted while there are %d pending mr requests\n", total);
744
- else
745
- mlx5_ib_warn(dev, "done with all pending requests\n");
746
-}
747
-
748782 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
749783 {
750
- int i;
784
+ unsigned int i;
751785
752786 if (!dev->cache.wq)
753787 return 0;
754788
755
- dev->cache.stopped = 1;
756
- flush_workqueue(dev->cache.wq);
789
+ for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
790
+ struct mlx5_cache_ent *ent = &dev->cache.ent[i];
791
+
792
+ spin_lock_irq(&ent->lock);
793
+ ent->disabled = true;
794
+ spin_unlock_irq(&ent->lock);
795
+ cancel_work_sync(&ent->work);
796
+ cancel_delayed_work_sync(&ent->dwork);
797
+ }
757798
758799 mlx5_mr_cache_debugfs_cleanup(dev);
800
+ mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
759801
760802 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
761803 clean_keys(dev, i);
762804
763805 destroy_workqueue(dev->cache.wq);
764
- wait_for_async_commands(dev);
765806 del_timer_sync(&dev->delay_timer);
766807
767808 return 0;
@@ -771,7 +812,6 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -790,18 +830,10 @@
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
-	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
-	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
-	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
-	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
-	MLX5_SET(mkc, mkc, lr, 1);
-
 	MLX5_SET(mkc, mkc, length64, 1);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET64(mkc, mkc, start_addr, 0);
+	set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
 
-	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 	if (err)
 		goto err_in;
 
@@ -840,26 +872,43 @@
840872 return MLX5_MAX_UMR_SHIFT;
841873 }
842874
843
-static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
844
- int access_flags, struct ib_umem **umem,
845
- int *npages, int *page_shift, int *ncont,
846
- int *order)
875
+static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
876
+ int access_flags, struct ib_umem **umem, int *npages,
877
+ int *page_shift, int *ncont, int *order)
847878 {
848
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
849879 struct ib_umem *u;
850
- int err;
851880
852881 *umem = NULL;
853882
854
- u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
855
- err = PTR_ERR_OR_ZERO(u);
856
- if (err) {
857
- mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
858
- return err;
883
+ if (access_flags & IB_ACCESS_ON_DEMAND) {
884
+ struct ib_umem_odp *odp;
885
+
886
+ odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
887
+ &mlx5_mn_ops);
888
+ if (IS_ERR(odp)) {
889
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
890
+ PTR_ERR(odp));
891
+ return PTR_ERR(odp);
892
+ }
893
+
894
+ u = &odp->umem;
895
+
896
+ *page_shift = odp->page_shift;
897
+ *ncont = ib_umem_odp_num_pages(odp);
898
+ *npages = *ncont << (*page_shift - PAGE_SHIFT);
899
+ if (order)
900
+ *order = ilog2(roundup_pow_of_two(*ncont));
901
+ } else {
902
+ u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
903
+ if (IS_ERR(u)) {
904
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
905
+ return PTR_ERR(u);
906
+ }
907
+
908
+ mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
909
+ page_shift, ncont, order);
859910 }
860911
861
- mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
862
- page_shift, ncont, order);
863912 if (!*npages) {
864913 mlx5_ib_warn(dev, "avoid zero region\n");
865914 ib_umem_release(u);
@@ -917,30 +966,41 @@
917966 return err;
918967 }
919968
920
-static struct mlx5_ib_mr *alloc_mr_from_cache(
921
- struct ib_pd *pd, struct ib_umem *umem,
922
- u64 virt_addr, u64 len, int npages,
923
- int page_shift, int order, int access_flags)
969
+static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
970
+ unsigned int order)
971
+{
972
+ struct mlx5_mr_cache *cache = &dev->cache;
973
+
974
+ if (order < cache->ent[0].order)
975
+ return &cache->ent[0];
976
+ order = order - cache->ent[0].order;
977
+ if (order > MR_CACHE_LAST_STD_ENTRY)
978
+ return NULL;
979
+ return &cache->ent[order];
980
+}
981
+
982
+static struct mlx5_ib_mr *
983
+alloc_mr_from_cache(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr,
984
+ u64 len, int npages, int page_shift, unsigned int order,
985
+ int access_flags)
924986 {
925987 struct mlx5_ib_dev *dev = to_mdev(pd->device);
988
+ struct mlx5_cache_ent *ent = mr_cache_ent_from_order(dev, order);
926989 struct mlx5_ib_mr *mr;
927
- int err = 0;
928
- int i;
929990
930
- for (i = 0; i < 1; i++) {
931
- mr = alloc_cached_mr(dev, order);
932
- if (mr)
933
- break;
991
+ if (!ent)
992
+ return ERR_PTR(-E2BIG);
934993
935
- err = add_keys(dev, order2idx(dev, order), 1);
936
- if (err && err != -EAGAIN) {
937
- mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
938
- break;
939
- }
994
+ /* Matches access in alloc_cache_mr() */
995
+ if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
996
+ return ERR_PTR(-EOPNOTSUPP);
997
+
998
+ mr = get_cache_mr(ent);
999
+ if (!mr) {
1000
+ mr = create_cache_mr(ent);
1001
+ if (IS_ERR(mr))
1002
+ return mr;
9401003 }
941
-
942
- if (!mr)
943
- return ERR_PTR(-EAGAIN);
9441004
9451005 mr->ibmr.pd = pd;
9461006 mr->umem = umem;
@@ -951,36 +1011,6 @@
9511011 mr->mmkey.pd = to_mpd(pd)->pdn;
9521012
9531013 return mr;
954
-}
955
-
956
-static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
957
- void *xlt, int page_shift, size_t size,
958
- int flags)
959
-{
960
- struct mlx5_ib_dev *dev = mr->dev;
961
- struct ib_umem *umem = mr->umem;
962
-
963
- if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
964
- if (!umr_can_use_indirect_mkey(dev))
965
- return -EPERM;
966
- mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
967
- return npages;
968
- }
969
-
970
- npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
971
-
972
- if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
973
- __mlx5_ib_populate_pas(dev, umem, page_shift,
974
- idx, npages, xlt,
975
- MLX5_IB_MTT_PRESENT);
976
- /* Clear padding after the pages
977
- * brought from the umem.
978
- */
979
- memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
980
- size - npages * sizeof(struct mlx5_mtt));
981
- }
982
-
983
- return npages;
9841014 }
9851015
9861016 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
@@ -1006,6 +1036,7 @@
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
 	size_t pages_iter = 0;
+	size_t size_to_map = 0;
 	gfp_t gfp;
 	bool use_emergency_page = false;
 
@@ -1052,6 +1083,15 @@
10521083 goto free_xlt;
10531084 }
10541085
1086
+ if (mr->umem->is_odp) {
1087
+ if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1088
+ struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1089
+ size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1090
+
1091
+ pages_to_map = min_t(size_t, pages_to_map, max_pages);
1092
+ }
1093
+ }
1094
+
10551095 sg.addr = dma;
10561096 sg.lkey = dev->umrc.pd->local_dma_lkey;
10571097
@@ -1074,14 +1114,22 @@
10741114 pages_mapped < pages_to_map && !err;
10751115 pages_mapped += pages_iter, idx += pages_iter) {
10761116 npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1117
+ size_to_map = npages * desc_size;
10771118 dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
1078
- npages = populate_xlt(mr, idx, npages, xlt,
1079
- page_shift, size, flags);
1080
-
1119
+ if (mr->umem->is_odp) {
1120
+ mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1121
+ } else {
1122
+ __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
1123
+ npages, xlt,
1124
+ MLX5_IB_MTT_PRESENT);
1125
+ /* Clear padding after the pages
1126
+ * brought from the umem.
1127
+ */
1128
+ memset(xlt + size_to_map, 0, size - size_to_map);
1129
+ }
10811130 dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
10821131
1083
- sg.length = ALIGN(npages * desc_size,
1084
- MLX5_UMR_MTT_ALIGNMENT);
1132
+ sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
10851133
10861134 if (pages_mapped + pages_iter >= pages_to_map) {
10871135 if (flags & MLX5_IB_UPD_XLT_ENABLE)
@@ -1149,38 +1197,37 @@
11491197 goto err_1;
11501198 }
11511199 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1152
- if (populate && !(access_flags & IB_ACCESS_ON_DEMAND))
1200
+ if (populate) {
1201
+ if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1202
+ err = -EINVAL;
1203
+ goto err_2;
1204
+ }
11531205 mlx5_ib_populate_pas(dev, umem, page_shift, pas,
11541206 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1207
+ }
11551208
11561209 /* The pg_access bit allows setting the access flags
11571210 * in the page list submitted with the command. */
11581211 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
11591212
11601213 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1214
+ set_mkc_access_pd_addr_fields(mkc, access_flags, virt_addr,
1215
+ populate ? pd : dev->umrc.pd);
11611216 MLX5_SET(mkc, mkc, free, !populate);
11621217 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1163
- MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
1164
- MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
1165
- MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
1166
- MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
1167
- MLX5_SET(mkc, mkc, lr, 1);
11681218 MLX5_SET(mkc, mkc, umr_en, 1);
11691219
1170
- MLX5_SET64(mkc, mkc, start_addr, virt_addr);
11711220 MLX5_SET64(mkc, mkc, len, length);
1172
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
11731221 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
11741222 MLX5_SET(mkc, mkc, translations_octword_size,
11751223 get_octo_len(virt_addr, length, page_shift));
11761224 MLX5_SET(mkc, mkc, log_page_size, page_shift);
1177
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
11781225 if (populate) {
11791226 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
11801227 get_octo_len(virt_addr, length, page_shift));
11811228 }
11821229
1183
- err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1230
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
11841231 if (err) {
11851232 mlx5_ib_warn(dev, "create mkey failed\n");
11861233 goto err_2;
@@ -1204,23 +1251,20 @@
12041251 return ERR_PTR(err);
12051252 }
12061253
1207
-static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1208
- int npages, u64 length, int access_flags)
1254
+static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1255
+ u64 length, int access_flags)
12091256 {
1210
- mr->npages = npages;
1211
- atomic_add(npages, &dev->mdev->priv.reg_pages);
12121257 mr->ibmr.lkey = mr->mmkey.key;
12131258 mr->ibmr.rkey = mr->mmkey.key;
12141259 mr->ibmr.length = length;
12151260 mr->access_flags = access_flags;
12161261 }
12171262
1218
-static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
1219
- u64 length, int acc)
1263
+static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1264
+ u64 length, int acc, int mode)
12201265 {
12211266 struct mlx5_ib_dev *dev = to_mdev(pd->device);
12221267 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1223
- struct mlx5_core_dev *mdev = dev->mdev;
12241268 struct mlx5_ib_mr *mr;
12251269 void *mkc;
12261270 u32 *in;
@@ -1238,29 +1282,18 @@
12381282
12391283 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
12401284
1241
- MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3);
1242
- MLX5_SET(mkc, mkc, access_mode_4_2,
1243
- (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7);
1244
- MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
1245
- MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
1246
- MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
1247
- MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
1248
- MLX5_SET(mkc, mkc, lr, 1);
1249
-
1285
+ MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1286
+ MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
12501287 MLX5_SET64(mkc, mkc, len, length);
1251
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1252
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
1253
- MLX5_SET64(mkc, mkc, start_addr,
1254
- memic_addr - pci_resource_start(dev->mdev->pdev, 0));
1288
+ set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
12551289
1256
- err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
1290
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
12571291 if (err)
12581292 goto err_in;
12591293
12601294 kfree(in);
12611295
1262
- mr->umem = NULL;
1263
- set_mr_fileds(dev, mr, 0, length, acc);
1296
+ set_mr_fields(dev, mr, length, acc);
12641297
12651298 return &mr->ibmr;
12661299
@@ -1273,20 +1306,52 @@
12731306 return ERR_PTR(err);
12741307 }
12751308
1309
+int mlx5_ib_advise_mr(struct ib_pd *pd,
1310
+ enum ib_uverbs_advise_mr_advice advice,
1311
+ u32 flags,
1312
+ struct ib_sge *sg_list,
1313
+ u32 num_sge,
1314
+ struct uverbs_attr_bundle *attrs)
1315
+{
1316
+ if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1317
+ advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1318
+ advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1319
+ return -EOPNOTSUPP;
1320
+
1321
+ return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1322
+ sg_list, num_sge);
1323
+}
1324
+
12761325 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
12771326 struct ib_dm_mr_attr *attr,
12781327 struct uverbs_attr_bundle *attrs)
12791328 {
12801329 struct mlx5_ib_dm *mdm = to_mdm(dm);
1281
- u64 memic_addr;
1330
+ struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1331
+ u64 start_addr = mdm->dev_addr + attr->offset;
1332
+ int mode;
12821333
1283
- if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS)
1334
+ switch (mdm->type) {
1335
+ case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1336
+ if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1337
+ return ERR_PTR(-EINVAL);
1338
+
1339
+ mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1340
+ start_addr -= pci_resource_start(dev->pdev, 0);
1341
+ break;
1342
+ case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1343
+ case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1344
+ if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1345
+ return ERR_PTR(-EINVAL);
1346
+
1347
+ mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1348
+ break;
1349
+ default:
12841350 return ERR_PTR(-EINVAL);
1351
+ }
12851352
1286
- memic_addr = mdm->dev_addr + attr->offset;
1287
-
1288
- return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length,
1289
- attr->access_flags);
1353
+ return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1354
+ attr->access_flags, mode);
12901355 }
12911356
12921357 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1295,7 +1360,7 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_mr *mr = NULL;
-	bool use_umr;
+	bool xlt_with_umr;
 	struct ib_umem *umem;
 	int page_shift;
 	int npages;
@@ -1309,49 +1374,42 @@
13091374 mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
13101375 start, virt_addr, length, access_flags);
13111376
1312
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1313
- if (!start && length == U64_MAX) {
1377
+ xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, length);
1378
+ /* ODP requires xlt update via umr to work. */
1379
+ if (!xlt_with_umr && (access_flags & IB_ACCESS_ON_DEMAND))
1380
+ return ERR_PTR(-EINVAL);
1381
+
1382
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
1383
+ length == U64_MAX) {
1384
+ if (virt_addr != start)
1385
+ return ERR_PTR(-EINVAL);
13141386 if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
13151387 !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
13161388 return ERR_PTR(-EINVAL);
13171389
1318
- mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1390
+ mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
13191391 if (IS_ERR(mr))
13201392 return ERR_CAST(mr);
13211393 return &mr->ibmr;
13221394 }
1323
-#endif
13241395
1325
- err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
1326
- &page_shift, &ncont, &order);
1396
+ err = mr_umem_get(dev, start, length, access_flags, &umem,
1397
+ &npages, &page_shift, &ncont, &order);
13271398
13281399 if (err < 0)
13291400 return ERR_PTR(err);
13301401
1331
- use_umr = !MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled) &&
1332
- (!MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled) ||
1333
- !MLX5_CAP_GEN(dev->mdev, atomic));
1334
-
1335
- if (order <= mr_cache_max_order(dev) && use_umr) {
1402
+ if (xlt_with_umr) {
13361403 mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
13371404 page_shift, order, access_flags);
1338
- if (PTR_ERR(mr) == -EAGAIN) {
1339
- mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
1405
+ if (IS_ERR(mr))
13401406 mr = NULL;
1341
- }
1342
- } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
1343
- if (access_flags & IB_ACCESS_ON_DEMAND) {
1344
- err = -EINVAL;
1345
- pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
1346
- goto error;
1347
- }
1348
- use_umr = false;
13491407 }
13501408
13511409 if (!mr) {
13521410 mutex_lock(&dev->slow_path_mutex);
13531411 mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
1354
- page_shift, access_flags, !use_umr);
1412
+ page_shift, access_flags, !xlt_with_umr);
13551413 mutex_unlock(&dev->slow_path_mutex);
13561414 }
13571415
@@ -1363,52 +1421,74 @@
13631421 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
13641422
13651423 mr->umem = umem;
1366
- set_mr_fileds(dev, mr, npages, length, access_flags);
1424
+ mr->npages = npages;
1425
+ atomic_add(mr->npages, &dev->mdev->priv.reg_pages);
1426
+ set_mr_fields(dev, mr, length, access_flags);
13671427
1368
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1369
- update_odp_mr(mr);
1370
-#endif
1371
-
1372
- if (use_umr) {
1428
+ if (xlt_with_umr && !(access_flags & IB_ACCESS_ON_DEMAND)) {
1429
+ /*
1430
+ * If the MR was created with reg_create then it will be
1431
+ * configured properly but left disabled. It is safe to go ahead
1432
+ * and configure it again via UMR while enabling it.
1433
+ */
13731434 int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
1374
-
1375
- if (access_flags & IB_ACCESS_ON_DEMAND)
1376
- update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP;
13771435
13781436 err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift,
13791437 update_xlt_flags);
1380
-
13811438 if (err) {
13821439 dereg_mr(dev, mr);
13831440 return ERR_PTR(err);
13841441 }
13851442 }
13861443
1387
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1388
- mr->live = 1;
1389
-#endif
1444
+ if (is_odp_mr(mr)) {
1445
+ to_ib_umem_odp(mr->umem)->private = mr;
1446
+ init_waitqueue_head(&mr->q_deferred_work);
1447
+ atomic_set(&mr->num_deferred_work, 0);
1448
+ err = xa_err(xa_store(&dev->odp_mkeys,
1449
+ mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
1450
+ GFP_KERNEL));
1451
+ if (err) {
1452
+ dereg_mr(dev, mr);
1453
+ return ERR_PTR(err);
1454
+ }
1455
+
1456
+ err = mlx5_ib_init_odp_mr(mr, xlt_with_umr);
1457
+ if (err) {
1458
+ dereg_mr(dev, mr);
1459
+ return ERR_PTR(err);
1460
+ }
1461
+ }
1462
+
13901463 return &mr->ibmr;
13911464 error:
13921465 ib_umem_release(umem);
13931466 return ERR_PTR(err);
13941467 }
13951468
1396
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1469
+/**
1470
+ * mlx5_mr_cache_invalidate - Fence all DMA on the MR
1471
+ * @mr: The MR to fence
1472
+ *
1473
+ * Upon return the NIC will not be doing any DMA to the pages under the MR,
1474
+ * and any DMA inprogress will be completed. Failure of this function
1475
+ * indicates the HW has failed catastrophically.
1476
+ */
1477
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
13971478 {
1398
- struct mlx5_core_dev *mdev = dev->mdev;
13991479 struct mlx5_umr_wr umrwr = {};
14001480
1401
- if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1481
+ if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
14021482 return 0;
14031483
14041484 umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
14051485 MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
14061486 umrwr.wr.opcode = MLX5_IB_WR_UMR;
1407
- umrwr.pd = dev->umrc.pd;
1487
+ umrwr.pd = mr->dev->umrc.pd;
14081488 umrwr.mkey = mr->mmkey.key;
14091489 umrwr.ignore_free_state = 1;
14101490
1411
- return mlx5_ib_post_send_wait(dev, &umrwr);
1491
+ return mlx5_ib_post_send_wait(mr->dev, &umrwr);
14121492 }
14131493
14141494 static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
@@ -1455,10 +1535,11 @@
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
-	atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
-
 	if (!mr->umem)
 		return -EINVAL;
+
+	if (is_odp_mr(mr))
+		return -EOPNOTSUPP;
 
 	if (flags & IB_MR_REREG_TRANS) {
 		addr = virt_addr;
@@ -1474,22 +1555,30 @@
14741555 * used.
14751556 */
14761557 flags |= IB_MR_REREG_TRANS;
1558
+ atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1559
+ mr->npages = 0;
14771560 ib_umem_release(mr->umem);
14781561 mr->umem = NULL;
1479
- err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
1562
+
1563
+ err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
14801564 &npages, &page_shift, &ncont, &order);
14811565 if (err)
14821566 goto err;
1567
+ mr->npages = ncont;
1568
+ atomic_add(mr->npages, &dev->mdev->priv.reg_pages);
14831569 }
14841570
1485
- if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
1571
+ if (!mlx5_ib_can_reconfig_with_umr(dev, mr->access_flags,
1572
+ access_flags) ||
1573
+ !mlx5_ib_can_load_pas_with_umr(dev, len) ||
1574
+ (flags & IB_MR_REREG_TRANS &&
1575
+ !mlx5_ib_pas_fits_in_mr(mr, addr, len))) {
14861576 /*
14871577 * UMR can't be used - MKey needs to be replaced.
14881578 */
1489
- if (mr->allocated_from_cache)
1490
- err = unreg_umr(dev, mr);
1491
- else
1492
- err = destroy_mkey(dev, mr);
1579
+ if (mr->cache_ent)
1580
+ detach_mr_from_cache(mr);
1581
+ err = destroy_mkey(dev, mr);
14931582 if (err)
14941583 goto err;
14951584
@@ -1501,11 +1590,6 @@
 			mr = to_mmr(ib_mr);
 			goto err;
 		}
-
-		mr->allocated_from_cache = 0;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-		mr->live = 1;
-#endif
 	} else {
 		/*
 		 * Send a UMR WQE
@@ -1532,18 +1616,14 @@
15321616 goto err;
15331617 }
15341618
1535
- set_mr_fileds(dev, mr, npages, len, access_flags);
1619
+ set_mr_fields(dev, mr, len, access_flags);
15361620
1537
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1538
- update_odp_mr(mr);
1539
-#endif
15401621 return 0;
15411622
15421623 err:
1543
- if (mr->umem) {
1544
- ib_umem_release(mr->umem);
1545
- mr->umem = NULL;
1546
- }
1624
+ ib_umem_release(mr->umem);
1625
+ mr->umem = NULL;
1626
+
15471627 clean_mr(dev, mr);
15481628 return err;
15491629 }
@@ -1596,8 +1676,6 @@
 
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	int allocated_from_cache = mr->allocated_from_cache;
-
 	if (mr->sig) {
 		if (mlx5_core_destroy_psv(dev->mdev,
 					  mr->sig->psv_memory.psv_idx))
@@ -1607,11 +1685,12 @@
 					  mr->sig->psv_wire.psv_idx))
 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
 				     mr->sig->psv_wire.psv_idx);
+		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
 		kfree(mr->sig);
 		mr->sig = NULL;
 	}
 
-	if (!allocated_from_cache) {
+	if (!mr->cache_ent) {
 		destroy_mkey(dev, mr);
 		mlx5_free_priv_descs(mr);
 	}
@@ -1622,60 +1701,235 @@
16221701 int npages = mr->npages;
16231702 struct ib_umem *umem = mr->umem;
16241703
1625
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1626
- if (umem && umem->odp_data) {
1627
- /* Prevent new page faults from succeeding */
1628
- mr->live = 0;
1629
- /* Wait for all running page-fault handlers to finish. */
1630
- synchronize_srcu(&dev->mr_srcu);
1631
- /* Destroy all page mappings */
1632
- if (umem->odp_data->page_list)
1633
- mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1634
- ib_umem_end(umem));
1635
- else
1636
- mlx5_ib_free_implicit_mr(mr);
1637
- /*
1638
- * We kill the umem before the MR for ODP,
1639
- * so that there will not be any invalidations in
1640
- * flight, looking at the *mr struct.
1641
- */
1642
- ib_umem_release(umem);
1643
- atomic_sub(npages, &dev->mdev->priv.reg_pages);
1704
+ /* Stop all DMA */
1705
+ if (is_odp_mr(mr))
1706
+ mlx5_ib_fence_odp_mr(mr);
1707
+ else
1708
+ clean_mr(dev, mr);
16441709
1645
- /* Avoid double-freeing the umem. */
1646
- umem = NULL;
1647
- }
1648
-#endif
1649
- clean_mr(dev, mr);
1650
-
1651
- /*
1652
- * We should unregister the DMA address from the HCA before
1653
- * remove the DMA mapping.
1654
- */
1655
- mlx5_mr_cache_free(dev, mr);
1656
- if (umem) {
1657
- ib_umem_release(umem);
1658
- atomic_sub(npages, &dev->mdev->priv.reg_pages);
1659
- }
1660
- if (!mr->allocated_from_cache)
1710
+ if (mr->cache_ent)
1711
+ mlx5_mr_cache_free(dev, mr);
1712
+ else
16611713 kfree(mr);
1714
+
1715
+ ib_umem_release(umem);
1716
+ atomic_sub(npages, &dev->mdev->priv.reg_pages);
1717
+
16621718 }
16631719
1664
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1720
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
16651721 {
1666
- dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
1722
+ struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1723
+
1724
+ if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1725
+ dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
1726
+ dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
1727
+ }
1728
+
1729
+ if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
1730
+ mlx5_ib_free_implicit_mr(mmr);
1731
+ return 0;
1732
+ }
1733
+
1734
+ dereg_mr(to_mdev(ibmr->device), mmr);
1735
+
16671736 return 0;
16681737 }
16691738
1670
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1671
- enum ib_mr_type mr_type,
1672
- u32 max_num_sg)
1739
+static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1740
+ int access_mode, int page_shift)
1741
+{
1742
+ void *mkc;
1743
+
1744
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1745
+
1746
+ /* This is only used from the kernel, so setting the PD is OK. */
1747
+ set_mkc_access_pd_addr_fields(mkc, 0, 0, pd);
1748
+ MLX5_SET(mkc, mkc, free, 1);
1749
+ MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1750
+ MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1751
+ MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1752
+ MLX5_SET(mkc, mkc, umr_en, 1);
1753
+ MLX5_SET(mkc, mkc, log_page_size, page_shift);
1754
+}
1755
+
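A note on the two access_mode fields set just above: the five-bit access mode is split into access_mode_1_0 (bits 1:0) and access_mode_4_2 (bits 4:2) of the mkey context. The userspace sketch below only demonstrates that split and its reassembly with plain integers; it does not touch any mlx5 API.

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        /* every 5-bit access mode survives the split into the two fields */
        for (unsigned int mode = 0; mode < 32; mode++) {
            unsigned int lo = mode & 0x3;        /* access_mode_1_0 */
            unsigned int hi = (mode >> 2) & 0x7; /* access_mode_4_2 */

            assert(((hi << 2) | lo) == mode);
        }
        printf("access mode round-trips through the split fields\n");
        return 0;
    }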
1756
+static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1757
+ int ndescs, int desc_size, int page_shift,
1758
+ int access_mode, u32 *in, int inlen)
1759
+{
1760
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
1761
+ int err;
1762
+
1763
+ mr->access_mode = access_mode;
1764
+ mr->desc_size = desc_size;
1765
+ mr->max_descs = ndescs;
1766
+
1767
+ err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1768
+ if (err)
1769
+ return err;
1770
+
1771
+ mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1772
+
1773
+ err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1774
+ if (err)
1775
+ goto err_free_descs;
1776
+
1777
+ mr->mmkey.type = MLX5_MKEY_MR;
1778
+ mr->ibmr.lkey = mr->mmkey.key;
1779
+ mr->ibmr.rkey = mr->mmkey.key;
1780
+
1781
+ return 0;
1782
+
1783
+err_free_descs:
1784
+ mlx5_free_priv_descs(mr);
1785
+ return err;
1786
+}
1787
+
1788
+static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1789
+ u32 max_num_sg, u32 max_num_meta_sg,
1790
+ int desc_size, int access_mode)
1791
+{
1792
+ int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1793
+ int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1794
+ int page_shift = 0;
1795
+ struct mlx5_ib_mr *mr;
1796
+ u32 *in;
1797
+ int err;
1798
+
1799
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1800
+ if (!mr)
1801
+ return ERR_PTR(-ENOMEM);
1802
+
1803
+ mr->ibmr.pd = pd;
1804
+ mr->ibmr.device = pd->device;
1805
+
1806
+ in = kzalloc(inlen, GFP_KERNEL);
1807
+ if (!in) {
1808
+ err = -ENOMEM;
1809
+ goto err_free;
1810
+ }
1811
+
1812
+ if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1813
+ page_shift = PAGE_SHIFT;
1814
+
1815
+ err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1816
+ access_mode, in, inlen);
1817
+ if (err)
1818
+ goto err_free_in;
1819
+
1820
+ mr->umem = NULL;
1821
+ kfree(in);
1822
+
1823
+ return mr;
1824
+
1825
+err_free_in:
1826
+ kfree(in);
1827
+err_free:
1828
+ kfree(mr);
1829
+ return ERR_PTR(err);
1830
+}
1831
+
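The descriptor count used above is ALIGN(max_num_sg + max_num_meta_sg, 4), i.e. rounded up to a multiple of four entries, presumably so the translation table occupies whole allocation units. A small sketch of that rounding; the ALIGN macro here is a local illustration, not the kernel header.

    #include <stdio.h>

    /* power-of-two round-up, mirroring what the kernel macro does */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned int max_num_sg = 17, max_num_meta_sg = 2; /* example sizes */
        unsigned int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);

        printf("%u data + %u metadata entries -> %u descriptors\n",
               max_num_sg, max_num_meta_sg, ndescs); /* prints 20 */
        return 0;
    }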
1832
+static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1833
+ int ndescs, u32 *in, int inlen)
1834
+{
1835
+ return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
1836
+ PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
1837
+ inlen);
1838
+}
1839
+
1840
+static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1841
+ int ndescs, u32 *in, int inlen)
1842
+{
1843
+ return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
1844
+ 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1845
+}
1846
+
1847
+static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1848
+ int max_num_sg, int max_num_meta_sg,
1849
+ u32 *in, int inlen)
1850
+{
1851
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
1852
+ u32 psv_index[2];
1853
+ void *mkc;
1854
+ int err;
1855
+
1856
+ mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1857
+ if (!mr->sig)
1858
+ return -ENOMEM;
1859
+
1860
+ /* create mem & wire PSVs */
1861
+ err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
1862
+ if (err)
1863
+ goto err_free_sig;
1864
+
1865
+ mr->sig->psv_memory.psv_idx = psv_index[0];
1866
+ mr->sig->psv_wire.psv_idx = psv_index[1];
1867
+
1868
+ mr->sig->sig_status_checked = true;
1869
+ mr->sig->sig_err_exists = false;
1870
+ /* Next UMR, Arm SIGERR */
1871
+ ++mr->sig->sigerr_count;
1872
+ mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1873
+ sizeof(struct mlx5_klm),
1874
+ MLX5_MKC_ACCESS_MODE_KLMS);
1875
+ if (IS_ERR(mr->klm_mr)) {
1876
+ err = PTR_ERR(mr->klm_mr);
1877
+ goto err_destroy_psv;
1878
+ }
1879
+ mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1880
+ sizeof(struct mlx5_mtt),
1881
+ MLX5_MKC_ACCESS_MODE_MTT);
1882
+ if (IS_ERR(mr->mtt_mr)) {
1883
+ err = PTR_ERR(mr->mtt_mr);
1884
+ goto err_free_klm_mr;
1885
+ }
1886
+
1887
+ /* Set bsf descriptors for mkey */
1888
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1889
+ MLX5_SET(mkc, mkc, bsf_en, 1);
1890
+ MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1891
+
1892
+ err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
1893
+ MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1894
+ if (err)
1895
+ goto err_free_mtt_mr;
1896
+
1897
+ err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1898
+ mr->sig, GFP_KERNEL));
1899
+ if (err)
1900
+ goto err_free_descs;
1901
+ return 0;
1902
+
1903
+err_free_descs:
1904
+ destroy_mkey(dev, mr);
1905
+ mlx5_free_priv_descs(mr);
1906
+err_free_mtt_mr:
1907
+ dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
1908
+ mr->mtt_mr = NULL;
1909
+err_free_klm_mr:
1910
+ dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
1911
+ mr->klm_mr = NULL;
1912
+err_destroy_psv:
1913
+ if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
1914
+ mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1915
+ mr->sig->psv_memory.psv_idx);
1916
+ if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1917
+ mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1918
+ mr->sig->psv_wire.psv_idx);
1919
+err_free_sig:
1920
+ kfree(mr->sig);
1921
+
1922
+ return err;
1923
+}
1924
+
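mlx5_alloc_integrity_descs() above builds the integrity MR out of several parts (sig context, a PSV pair, a KLM pi MR, an MTT pi MR, then the outer mkey) and unwinds them in reverse order through the chain of goto labels when any step fails. The sketch below shows only that unwind idiom with stub acquire/release helpers; none of the names are mlx5 calls.

    #include <stdbool.h>
    #include <stdio.h>

    static bool acquire(const char *name, bool ok)
    {
        printf("acquire %s: %s\n", name, ok ? "ok" : "failed");
        return ok;
    }

    static void release(const char *name)
    {
        printf("release %s\n", name);
    }

    static int setup(void)
    {
        if (!acquire("sig context", true))
            return -1;
        if (!acquire("PSV pair", true))
            goto err_free_sig;
        if (!acquire("KLM pi MR", true))
            goto err_destroy_psv;
        if (!acquire("MTT pi MR", false))   /* simulate a late failure */
            goto err_free_klm_mr;
        return 0;

    err_free_klm_mr:
        release("KLM pi MR");
    err_destroy_psv:
        release("PSV pair");
    err_free_sig:
        release("sig context");
        return -1;
    }

    int main(void)
    {
        printf("setup %s\n", setup() ? "failed and unwound" : "succeeded");
        return 0;
    }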
1925
+static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
1926
+ enum ib_mr_type mr_type, u32 max_num_sg,
1927
+ u32 max_num_meta_sg)
16731928 {
16741929 struct mlx5_ib_dev *dev = to_mdev(pd->device);
16751930 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
16761931 int ndescs = ALIGN(max_num_sg, 4);
16771932 struct mlx5_ib_mr *mr;
1678
- void *mkc;
16791933 u32 *in;
16801934 int err;
16811935
....@@ -1689,93 +1943,32 @@
16891943 goto err_free;
16901944 }
16911945
1692
- mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1693
- MLX5_SET(mkc, mkc, free, 1);
1694
- MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1695
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
1696
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1946
+ mr->ibmr.device = pd->device;
1947
+ mr->umem = NULL;
16971948
1698
- if (mr_type == IB_MR_TYPE_MEM_REG) {
1699
- mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
1700
- MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
1701
- err = mlx5_alloc_priv_descs(pd->device, mr,
1702
- ndescs, sizeof(struct mlx5_mtt));
1703
- if (err)
1704
- goto err_free_in;
1705
-
1706
- mr->desc_size = sizeof(struct mlx5_mtt);
1707
- mr->max_descs = ndescs;
1708
- } else if (mr_type == IB_MR_TYPE_SG_GAPS) {
1709
- mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
1710
-
1711
- err = mlx5_alloc_priv_descs(pd->device, mr,
1712
- ndescs, sizeof(struct mlx5_klm));
1713
- if (err)
1714
- goto err_free_in;
1715
- mr->desc_size = sizeof(struct mlx5_klm);
1716
- mr->max_descs = ndescs;
1717
- } else if (mr_type == IB_MR_TYPE_SIGNATURE) {
1718
- u32 psv_index[2];
1719
-
1720
- MLX5_SET(mkc, mkc, bsf_en, 1);
1721
- MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1722
- mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1723
- if (!mr->sig) {
1724
- err = -ENOMEM;
1725
- goto err_free_in;
1726
- }
1727
-
1728
- /* create mem & wire PSVs */
1729
- err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
1730
- 2, psv_index);
1731
- if (err)
1732
- goto err_free_sig;
1733
-
1734
- mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
1735
- mr->sig->psv_memory.psv_idx = psv_index[0];
1736
- mr->sig->psv_wire.psv_idx = psv_index[1];
1737
-
1738
- mr->sig->sig_status_checked = true;
1739
- mr->sig->sig_err_exists = false;
1740
- /* Next UMR, Arm SIGERR */
1741
- ++mr->sig->sigerr_count;
1742
- } else {
1949
+ switch (mr_type) {
1950
+ case IB_MR_TYPE_MEM_REG:
1951
+ err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
1952
+ break;
1953
+ case IB_MR_TYPE_SG_GAPS:
1954
+ err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
1955
+ break;
1956
+ case IB_MR_TYPE_INTEGRITY:
1957
+ err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
1958
+ max_num_meta_sg, in, inlen);
1959
+ break;
1960
+ default:
17431961 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
17441962 err = -EINVAL;
1745
- goto err_free_in;
17461963 }
17471964
1748
- MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3);
1749
- MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7);
1750
- MLX5_SET(mkc, mkc, umr_en, 1);
1751
-
1752
- mr->ibmr.device = pd->device;
1753
- err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
17541965 if (err)
1755
- goto err_destroy_psv;
1966
+ goto err_free_in;
17561967
1757
- mr->mmkey.type = MLX5_MKEY_MR;
1758
- mr->ibmr.lkey = mr->mmkey.key;
1759
- mr->ibmr.rkey = mr->mmkey.key;
1760
- mr->umem = NULL;
17611968 kfree(in);
17621969
17631970 return &mr->ibmr;
17641971
1765
-err_destroy_psv:
1766
- if (mr->sig) {
1767
- if (mlx5_core_destroy_psv(dev->mdev,
1768
- mr->sig->psv_memory.psv_idx))
1769
- mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1770
- mr->sig->psv_memory.psv_idx);
1771
- if (mlx5_core_destroy_psv(dev->mdev,
1772
- mr->sig->psv_wire.psv_idx))
1773
- mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1774
- mr->sig->psv_wire.psv_idx);
1775
- }
1776
- mlx5_free_priv_descs(mr);
1777
-err_free_sig:
1778
- kfree(mr->sig);
17791972 err_free_in:
17801973 kfree(in);
17811974 err_free:
....@@ -1783,12 +1976,24 @@
17831976 return ERR_PTR(err);
17841977 }
17851978
1786
-struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1787
- struct ib_udata *udata)
1979
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1980
+ u32 max_num_sg)
17881981 {
1789
- struct mlx5_ib_dev *dev = to_mdev(pd->device);
1982
+ return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
1983
+}
1984
+
1985
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
1986
+ u32 max_num_sg, u32 max_num_meta_sg)
1987
+{
1988
+ return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
1989
+ max_num_meta_sg);
1990
+}
1991
+
1992
+int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
1993
+{
1994
+ struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
17901995 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1791
- struct mlx5_ib_mw *mw = NULL;
1996
+ struct mlx5_ib_mw *mw = to_mmw(ibmw);
17921997 u32 *in = NULL;
17931998 void *mkc;
17941999 int ndescs;
....@@ -1801,21 +2006,20 @@
18012006
18022007 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
18032008 if (err)
1804
- return ERR_PTR(err);
2009
+ return err;
18052010
18062011 if (req.comp_mask || req.reserved1 || req.reserved2)
1807
- return ERR_PTR(-EOPNOTSUPP);
2012
+ return -EOPNOTSUPP;
18082013
18092014 if (udata->inlen > sizeof(req) &&
18102015 !ib_is_udata_cleared(udata, sizeof(req),
18112016 udata->inlen - sizeof(req)))
1812
- return ERR_PTR(-EOPNOTSUPP);
2017
+ return -EOPNOTSUPP;
18132018
18142019 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
18152020
1816
- mw = kzalloc(sizeof(*mw), GFP_KERNEL);
18172021 in = kzalloc(inlen, GFP_KERNEL);
1818
- if (!mw || !in) {
2022
+ if (!in) {
18192023 err = -ENOMEM;
18202024 goto free;
18212025 }
....@@ -1824,50 +2028,62 @@
18242028
18252029 MLX5_SET(mkc, mkc, free, 1);
18262030 MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1827
- MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
2031
+ MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
18282032 MLX5_SET(mkc, mkc, umr_en, 1);
18292033 MLX5_SET(mkc, mkc, lr, 1);
18302034 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
1831
- MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
2035
+ MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
18322036 MLX5_SET(mkc, mkc, qpn, 0xffffff);
18332037
1834
- err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
2038
+ err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
18352039 if (err)
18362040 goto free;
18372041
18382042 mw->mmkey.type = MLX5_MKEY_MW;
1839
- mw->ibmw.rkey = mw->mmkey.key;
2043
+ ibmw->rkey = mw->mmkey.key;
18402044 mw->ndescs = ndescs;
18412045
1842
- resp.response_length = min(offsetof(typeof(resp), response_length) +
1843
- sizeof(resp.response_length), udata->outlen);
2046
+ resp.response_length =
2047
+ min(offsetofend(typeof(resp), response_length), udata->outlen);
18442048 if (resp.response_length) {
18452049 err = ib_copy_to_udata(udata, &resp, resp.response_length);
1846
- if (err) {
1847
- mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1848
- goto free;
1849
- }
2050
+ if (err)
2051
+ goto free_mkey;
2052
+ }
2053
+
2054
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2055
+ err = xa_err(xa_store(&dev->odp_mkeys,
2056
+ mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
2057
+ GFP_KERNEL));
2058
+ if (err)
2059
+ goto free_mkey;
18502060 }
18512061
18522062 kfree(in);
1853
- return &mw->ibmw;
2063
+ return 0;
18542064
2065
+free_mkey:
2066
+ mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
18552067 free:
1856
- kfree(mw);
18572068 kfree(in);
1858
- return ERR_PTR(err);
2069
+ return err;
18592070 }
18602071
18612072 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
18622073 {
2074
+ struct mlx5_ib_dev *dev = to_mdev(mw->device);
18632075 struct mlx5_ib_mw *mmw = to_mmw(mw);
1864
- int err;
18652076
1866
- err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
1867
- &mmw->mmkey);
1868
- if (!err)
1869
- kfree(mmw);
1870
- return err;
2077
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2078
+ xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
2079
+ /*
2080
+ * pagefault_single_data_segment() may be accessing mmw under
2081
+ * SRCU if the user bound an ODP MR to this MW.
2082
+ */
2083
+ synchronize_srcu(&dev->odp_srcu);
2084
+ }
2085
+
2086
+ return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
18712087 }
18722088
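Both the alloc path above (xa_store into dev->odp_mkeys) and the dealloc path (xa_erase followed by synchronize_srcu before the mkey is destroyed) index the entry by mlx5_base_mkey(), which as far as I can tell masks off the low byte of the key so that lookups stay stable across key variants; treat the exact mask below as an assumption rather than the driver's definition.

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* assumed behaviour of mlx5_base_mkey(): drop the per-mkey variant byte */
    static uint32_t base_mkey(uint32_t key)
    {
        return key & 0xffffff00u;
    }

    int main(void)
    {
        uint32_t key_a = 0x00123401; /* hypothetical mkey, variant 0x01 */
        uint32_t key_b = 0x00123407; /* same index, different variant */

        assert(base_mkey(key_a) == base_mkey(key_b));
        printf("both variants map to slot 0x%" PRIx32 "\n", base_mkey(key_a));
        return 0;
    }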
18732089 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
....@@ -1912,16 +2128,53 @@
19122128 }
19132129
19142130 static int
2131
+mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2132
+ int data_sg_nents, unsigned int *data_sg_offset,
2133
+ struct scatterlist *meta_sg, int meta_sg_nents,
2134
+ unsigned int *meta_sg_offset)
2135
+{
2136
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
2137
+ unsigned int sg_offset = 0;
2138
+ int n = 0;
2139
+
2140
+ mr->meta_length = 0;
2141
+ if (data_sg_nents == 1) {
2142
+ n++;
2143
+ mr->ndescs = 1;
2144
+ if (data_sg_offset)
2145
+ sg_offset = *data_sg_offset;
2146
+ mr->data_length = sg_dma_len(data_sg) - sg_offset;
2147
+ mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2148
+ if (meta_sg_nents == 1) {
2149
+ n++;
2150
+ mr->meta_ndescs = 1;
2151
+ if (meta_sg_offset)
2152
+ sg_offset = *meta_sg_offset;
2153
+ else
2154
+ sg_offset = 0;
2155
+ mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2156
+ mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2157
+ }
2158
+ ibmr->length = mr->data_length + mr->meta_length;
2159
+ }
2160
+
2161
+ return n;
2162
+}
2163
+
2164
+static int
19152165 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
19162166 struct scatterlist *sgl,
19172167 unsigned short sg_nents,
1918
- unsigned int *sg_offset_p)
2168
+ unsigned int *sg_offset_p,
2169
+ struct scatterlist *meta_sgl,
2170
+ unsigned short meta_sg_nents,
2171
+ unsigned int *meta_sg_offset_p)
19192172 {
19202173 struct scatterlist *sg = sgl;
19212174 struct mlx5_klm *klms = mr->descs;
19222175 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
19232176 u32 lkey = mr->ibmr.pd->local_dma_lkey;
1924
- int i;
2177
+ int i, j = 0;
19252178
19262179 mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
19272180 mr->ibmr.length = 0;
....@@ -1936,12 +2189,36 @@
19362189
19372190 sg_offset = 0;
19382191 }
1939
- mr->ndescs = i;
19402192
19412193 if (sg_offset_p)
19422194 *sg_offset_p = sg_offset;
19432195
1944
- return i;
2196
+ mr->ndescs = i;
2197
+ mr->data_length = mr->ibmr.length;
2198
+
2199
+ if (meta_sg_nents) {
2200
+ sg = meta_sgl;
2201
+ sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2202
+ for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2203
+ if (unlikely(i + j >= mr->max_descs))
2204
+ break;
2205
+ klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2206
+ sg_offset);
2207
+ klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2208
+ sg_offset);
2209
+ klms[i + j].key = cpu_to_be32(lkey);
2210
+ mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2211
+
2212
+ sg_offset = 0;
2213
+ }
2214
+ if (meta_sg_offset_p)
2215
+ *meta_sg_offset_p = sg_offset;
2216
+
2217
+ mr->meta_ndescs = j;
2218
+ mr->meta_length = mr->ibmr.length - mr->data_length;
2219
+ }
2220
+
2221
+ return i + j;
19452222 }
19462223
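mlx5_ib_sg_to_klms() above fills one descriptor per SG entry, each holding a big-endian address, byte count and lkey, with the metadata entries appended after the data entries in the same array. The sketch below mirrors that packing with a locally defined KLM-like struct and flat arrays instead of scatterlists; it is illustrative, not the kernel's struct mlx5_klm.

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    struct klm_entry {          /* stand-in for a KLM descriptor */
        uint32_t bcount;        /* byte count, big-endian on the wire */
        uint32_t key;           /* lkey covering the range */
        uint64_t va;            /* DMA address, big-endian on the wire */
    };

    static unsigned int pack(struct klm_entry *klms, unsigned int max,
                             const uint64_t *addrs, const uint32_t *lens,
                             unsigned int n, uint32_t lkey, unsigned int start)
    {
        unsigned int i;

        for (i = 0; i < n && start + i < max; i++) {
            klms[start + i].va = htobe64(addrs[i]);
            klms[start + i].bcount = htobe32(lens[i]);
            klms[start + i].key = htobe32(lkey);
        }
        return i;
    }

    int main(void)
    {
        struct klm_entry klms[8];
        uint64_t data_addr[] = { 0x10000, 0x24000 };
        uint32_t data_len[]  = { 8192, 4096 };
        uint64_t meta_addr[] = { 0x30000 };
        uint32_t meta_len[]  = { 512 };
        unsigned int ndescs, meta_ndescs;

        ndescs = pack(klms, 8, data_addr, data_len, 2, 0x1234, 0);
        meta_ndescs = pack(klms, 8, meta_addr, meta_len, 1, 0x1234, ndescs);

        printf("%u data + %u metadata KLM entries packed\n",
               ndescs, meta_ndescs);
        return 0;
    }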
19472224 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
....@@ -1954,6 +2231,181 @@
19542231
19552232 descs = mr->descs;
19562233 descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2234
+
2235
+ return 0;
2236
+}
2237
+
2238
+static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2239
+{
2240
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
2241
+ __be64 *descs;
2242
+
2243
+ if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2244
+ return -ENOMEM;
2245
+
2246
+ descs = mr->descs;
2247
+ descs[mr->ndescs + mr->meta_ndescs++] =
2248
+ cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2249
+
2250
+ return 0;
2251
+}
2252
+
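mlx5_set_page() and mlx5_set_page_pi() above share one descriptor array: data pages land in slots [0, ndescs) and metadata pages are appended at ndescs + meta_ndescs. A minimal userspace mirror of that indexing, with made-up addresses and a fixed-size array:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_DESCS 8

    static uint64_t descs[MAX_DESCS];
    static unsigned int ndescs, meta_ndescs;

    static int set_data_page(uint64_t addr)
    {
        if (ndescs == MAX_DESCS)
            return -1;
        descs[ndescs++] = addr;
        return 0;
    }

    static int set_meta_page(uint64_t addr)
    {
        if (ndescs + meta_ndescs == MAX_DESCS)
            return -1;
        descs[ndescs + meta_ndescs++] = addr;
        return 0;
    }

    int main(void)
    {
        set_data_page(0x10000);
        set_data_page(0x11000);
        set_meta_page(0x20000);
        printf("%u data pages, %u metadata pages, metadata starts at slot %u\n",
               ndescs, meta_ndescs, ndescs);
        return 0;
    }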
2253
+static int
2254
+mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2255
+ int data_sg_nents, unsigned int *data_sg_offset,
2256
+ struct scatterlist *meta_sg, int meta_sg_nents,
2257
+ unsigned int *meta_sg_offset)
2258
+{
2259
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
2260
+ struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2261
+ int n;
2262
+
2263
+ pi_mr->ndescs = 0;
2264
+ pi_mr->meta_ndescs = 0;
2265
+ pi_mr->meta_length = 0;
2266
+
2267
+ ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2268
+ pi_mr->desc_size * pi_mr->max_descs,
2269
+ DMA_TO_DEVICE);
2270
+
2271
+ pi_mr->ibmr.page_size = ibmr->page_size;
2272
+ n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2273
+ mlx5_set_page);
2274
+ if (n != data_sg_nents)
2275
+ return n;
2276
+
2277
+ pi_mr->data_iova = pi_mr->ibmr.iova;
2278
+ pi_mr->data_length = pi_mr->ibmr.length;
2279
+ pi_mr->ibmr.length = pi_mr->data_length;
2280
+ ibmr->length = pi_mr->data_length;
2281
+
2282
+ if (meta_sg_nents) {
2283
+ u64 page_mask = ~((u64)ibmr->page_size - 1);
2284
+ u64 iova = pi_mr->data_iova;
2285
+
2286
+ n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2287
+ meta_sg_offset, mlx5_set_page_pi);
2288
+
2289
+ pi_mr->meta_length = pi_mr->ibmr.length;
2290
+ /*
2291
+ * PI address for the HW is the offset of the metadata address
2292
+ * relative to the first data page address.
2293
+ * It is equal to: first data page address + size of data pages +
2294
+ * metadata offset within the first metadata page.
2295
+ */
2296
+ pi_mr->pi_iova = (iova & page_mask) +
2297
+ pi_mr->ndescs * ibmr->page_size +
2298
+ (pi_mr->ibmr.iova & ~page_mask);
2299
+ /*
2300
+ * In order to use one MTT MR for data and metadata, we register
2301
+ * also the gaps between the end of the data and the start of
2302
+ * the metadata (the sig MR will verify that the HW will access
2303
+ * the right addresses). This mapping is safe because we use an
2304
+ * internal mkey for the registration.
2305
+ */
2306
+ pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2307
+ pi_mr->ibmr.iova = iova;
2308
+ ibmr->length += pi_mr->meta_length;
2309
+ }
2310
+
2311
+ ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2312
+ pi_mr->desc_size * pi_mr->max_descs,
2313
+ DMA_TO_DEVICE);
2314
+
2315
+ return n;
2316
+}
2317
+
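The pi_iova computed above is: start of the first data page, plus ndescs whole pages of data, plus the metadata's offset within its own first page, and the registered length is then stretched to reach the end of the metadata. A worked example with made-up values (4 KiB pages, three data pages, 512 bytes of metadata):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t page_size = 4096;
        uint64_t page_mask = ~(page_size - 1);

        uint64_t data_iova   = 0x10100; /* data starts 0x100 into its page */
        uint64_t ndescs      = 3;       /* data pages mapped into the MTT */
        uint64_t meta_iova   = 0x20040; /* metadata starts 0x40 into its page */
        uint64_t meta_length = 512;

        /* metadata address as the HW will see it */
        uint64_t pi_iova = (data_iova & page_mask) +
                           ndescs * page_size +
                           (meta_iova & ~page_mask);

        /* the mapping must cover everything up to the end of the metadata */
        uint64_t total_len = pi_iova + meta_length - data_iova;

        /* prints pi_iova = 0x13040, length = 12608 with these numbers */
        printf("pi_iova = 0x%" PRIx64 ", mapped length = %" PRIu64 " bytes\n",
               pi_iova, total_len);
        return 0;
    }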
2318
+static int
2319
+mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2320
+ int data_sg_nents, unsigned int *data_sg_offset,
2321
+ struct scatterlist *meta_sg, int meta_sg_nents,
2322
+ unsigned int *meta_sg_offset)
2323
+{
2324
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
2325
+ struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2326
+ int n;
2327
+
2328
+ pi_mr->ndescs = 0;
2329
+ pi_mr->meta_ndescs = 0;
2330
+ pi_mr->meta_length = 0;
2331
+
2332
+ ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2333
+ pi_mr->desc_size * pi_mr->max_descs,
2334
+ DMA_TO_DEVICE);
2335
+
2336
+ n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2337
+ meta_sg, meta_sg_nents, meta_sg_offset);
2338
+
2339
+ ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2340
+ pi_mr->desc_size * pi_mr->max_descs,
2341
+ DMA_TO_DEVICE);
2342
+
2343
+ /* This is a zero-based memory region */
2344
+ pi_mr->data_iova = 0;
2345
+ pi_mr->ibmr.iova = 0;
2346
+ pi_mr->pi_iova = pi_mr->data_length;
2347
+ ibmr->length = pi_mr->ibmr.length;
2348
+
2349
+ return n;
2350
+}
2351
+
2352
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2353
+ int data_sg_nents, unsigned int *data_sg_offset,
2354
+ struct scatterlist *meta_sg, int meta_sg_nents,
2355
+ unsigned int *meta_sg_offset)
2356
+{
2357
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
2358
+ struct mlx5_ib_mr *pi_mr = NULL;
2359
+ int n;
2360
+
2361
+ WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2362
+
2363
+ mr->ndescs = 0;
2364
+ mr->data_length = 0;
2365
+ mr->data_iova = 0;
2366
+ mr->meta_ndescs = 0;
2367
+ mr->pi_iova = 0;
2368
+ /*
2369
+ * As a performance optimization, if possible, there is no need to
2370
+ * perform a UMR operation to register the data/metadata buffers.
2371
+ * First try to map the sg lists to PA descriptors with local_dma_lkey.
2372
+ * Fall back to UMR only in case of a failure.
2373
+ */
2374
+ n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2375
+ data_sg_offset, meta_sg, meta_sg_nents,
2376
+ meta_sg_offset);
2377
+ if (n == data_sg_nents + meta_sg_nents)
2378
+ goto out;
2379
+ /*
2380
+ * As a performance optimization, if possible, there is no need to map
2381
+ * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2382
+ * descriptors and fall back to KLM only in case of a failure.
2383
+ * It's more efficient for the HW to work with MTT descriptors
2384
+ * (especially under high load).
2385
+ * Use KLM (indirect access) only if it's mandatory.
2386
+ */
2387
+ pi_mr = mr->mtt_mr;
2388
+ n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2389
+ data_sg_offset, meta_sg, meta_sg_nents,
2390
+ meta_sg_offset);
2391
+ if (n == data_sg_nents + meta_sg_nents)
2392
+ goto out;
2393
+
2394
+ pi_mr = mr->klm_mr;
2395
+ n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2396
+ data_sg_offset, meta_sg, meta_sg_nents,
2397
+ meta_sg_offset);
2398
+ if (unlikely(n != data_sg_nents + meta_sg_nents))
2399
+ return -ENOMEM;
2400
+
2401
+out:
2402
+ /* This is a zero-based memory region */
2403
+ ibmr->iova = 0;
2404
+ mr->pi_mr = pi_mr;
2405
+ if (pi_mr)
2406
+ ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2407
+ else
2408
+ ibmr->sig_attrs->meta_length = mr->meta_length;
19572409
19582410 return 0;
19592411 }
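mlx5_ib_map_mr_sg_pi() above tries the cheapest description first and escalates only when a strategy cannot cover every SG entry: PA (no UMR at all), then MTT, then KLM. The sketch below shows that try-in-order dispatch with stub mappers whose success criteria are invented purely for illustration.

    #include <stdio.h>

    typedef int (*map_fn)(int nents);

    /* stubs: each returns how many entries it could map */
    static int map_pa(int nents)  { return nents == 1 ? 1 : 0; }
    static int map_mtt(int nents) { return nents <= 4 ? nents : 0; }
    static int map_klm(int nents) { return nents; }

    int main(void)
    {
        map_fn strategies[] = { map_pa, map_mtt, map_klm };
        const char *names[] = { "PA", "MTT", "KLM" };
        int nents = 6;

        for (unsigned int i = 0; i < 3; i++) {
            if (strategies[i](nents) == nents) {
                printf("mapped %d entries with %s descriptors\n",
                       nents, names[i]);
                break;
            }
        }
        return 0;
    }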
....@@ -1971,7 +2423,8 @@
19712423 DMA_TO_DEVICE);
19722424
19732425 if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
1974
- n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
2426
+ n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2427
+ NULL);
19752428 else
19762429 n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
19772430 mlx5_set_page);