2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
--- a/kernel/net/rds/rdma.c
+++ b/kernel/net/rds/rdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -84,7 +84,7 @@
 	if (insert) {
 		rb_link_node(&insert->r_rb_node, parent, p);
 		rb_insert_color(&insert->r_rb_node, root);
-		refcount_inc(&insert->r_refcount);
+		kref_get(&insert->r_kref);
 	}
 	return NULL;
 }
@@ -99,10 +99,7 @@
 	unsigned long flags;

 	rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
-		 mr->r_key, refcount_read(&mr->r_refcount));
-
-	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
-		return;
+		 mr->r_key, kref_read(&mr->r_kref));

 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	if (!RB_EMPTY_NODE(&mr->r_rb_node))
@@ -115,8 +112,10 @@
 		mr->r_trans->free_mr(trans_private, mr->r_invalidate);
 }

-void __rds_put_mr_final(struct rds_mr *mr)
+void __rds_put_mr_final(struct kref *kref)
 {
+	struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
+
 	rds_destroy_mr(mr);
 	kfree(mr);
 }
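
Reviewer note: the conversion above follows the standard kref idiom, in which the release callback receives the embedded struct kref and recovers the outer object with container_of(). A minimal self-contained sketch of that idiom (struct foo and its functions are hypothetical; only the embedded-kref shape mirrors mr->r_kref):

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct foo {
		struct kref f_kref;	/* embedded refcount, like mr->r_kref */
	};

	static void foo_release(struct kref *kref)
	{
		/* container_of() maps the kref back to its enclosing object */
		struct foo *f = container_of(kref, struct foo, f_kref);

		kfree(f);
	}

	static void foo_demo(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return;
		kref_init(&f->f_kref);			/* count = 1 */
		kref_get(&f->f_kref);			/* count = 2 */
		kref_put(&f->f_kref, foo_release);	/* count = 1 */
		kref_put(&f->f_kref, foo_release);	/* 0: foo_release() runs */
	}

This is why __rds_put_mr_final() now takes a struct kref * instead of the rds_mr itself: kref_put() hands the release function the embedded kref once the count reaches zero.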
@@ -140,8 +139,7 @@
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
 		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
-		rds_destroy_mr(mr);
-		rds_mr_put(mr);
+		kref_put(&mr->r_kref, __rds_put_mr_final);
 		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	}
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
@@ -156,13 +154,15 @@
 static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
 			 struct page **pages, int write)
 {
+	unsigned int gup_flags = FOLL_LONGTERM;
 	int ret;

-	ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+	if (write)
+		gup_flags |= FOLL_WRITE;

+	ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
 	if (ret >= 0 && ret < nr_pages) {
-		while (ret--)
-			put_page(pages[ret]);
+		unpin_user_pages(pages, ret);
 		ret = -EFAULT;
 	}

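
Reviewer note: pin_user_pages_fast() with FOLL_LONGTERM replaces get_user_pages_fast() here because these pages back long-lived DMA; pages pinned this way must be released with the unpin_user_page*() family, never plain put_page(). A sketch of the pairing under the same partial-pin policy as rds_pin_pages() above (demo_pin is hypothetical; the mm calls are real):

	/* Pin n user pages for long-term DMA; all-or-nothing. */
	static int demo_pin(unsigned long uaddr, unsigned int n,
			    struct page **pages, bool write)
	{
		unsigned int gup_flags = FOLL_LONGTERM | (write ? FOLL_WRITE : 0);
		int pinned = pin_user_pages_fast(uaddr, n, gup_flags, pages);

		if (pinned < 0)
			return pinned;		/* hard failure */
		if (pinned < n) {
			/* partial pin: release what we got and fail */
			unpin_user_pages(pages, pinned);
			return -EFAULT;
		}
		return n;	/* caller later calls unpin_user_pages(pages, n) */
	}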
@@ -174,13 +174,14 @@
 			  struct rds_conn_path *cp)
 {
 	struct rds_mr *mr = NULL, *found;
+	struct scatterlist *sg = NULL;
 	unsigned int nr_pages;
 	struct page **pages = NULL;
-	struct scatterlist *sg;
 	void *trans_private;
 	unsigned long flags;
 	rds_rdma_cookie_t cookie;
-	unsigned int nents;
+	unsigned int nents = 0;
+	int need_odp = 0;
 	long i;
 	int ret;

@@ -191,6 +192,21 @@

 	if (!rs->rs_transport->get_mr) {
 		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* If the combination of the addr and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
+	    PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
+	    (args->vec.addr + args->vec.bytes)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!can_do_mlock()) {
+		ret = -EPERM;
 		goto out;
 	}

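
Reviewer note: the new bounds check rejects two distinct wraparounds: the 64-bit sum addr + bytes itself, and the rounding of that sum up to a page boundary. A hypothetical standalone version with a concrete example:

	/* Sketch only; rds_vec_overflows() does not exist in the patch. */
	static bool rds_vec_overflows(u64 addr, u64 bytes)
	{
		u64 end = addr + bytes;

		if (end < addr)		/* the sum itself wrapped */
			return true;
		/* e.g. addr = 0xffffffffffffe000, bytes = 0x1800: end =
		 * 0xfffffffffffff800 does not wrap, but rounding it up to
		 * the next page boundary does, so PAGE_ALIGN(end) < end.
		 */
		return PAGE_ALIGN(end) < end;
	}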
@@ -224,7 +240,7 @@
 		goto out;
 	}

-	refcount_set(&mr->r_refcount, 1);
+	kref_init(&mr->r_kref);
 	RB_CLEAR_NODE(&mr->r_rb_node);
 	mr->r_trans = rs->rs_transport;
 	mr->r_sock = rs;
@@ -247,36 +263,43 @@
 	 * the zero page.
 	 */
 	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
-	if (ret < 0)
+	if (ret == -EOPNOTSUPP) {
+		need_odp = 1;
+	} else if (ret <= 0) {
 		goto out;
+	} else {
+		nents = ret;
+		sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
+		if (!sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		WARN_ON(!nents);
+		sg_init_table(sg, nents);

-	nents = ret;
-	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (!sg) {
-		ret = -ENOMEM;
-		goto out;
+		/* Stick all pages into the scatterlist */
+		for (i = 0 ; i < nents; i++)
+			sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+		rdsdebug("RDS: trans_private nents is %u\n", nents);
 	}
-	WARN_ON(!nents);
-	sg_init_table(sg, nents);
-
-	/* Stick all pages into the scatterlist */
-	for (i = 0 ; i < nents; i++)
-		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
-
-	rdsdebug("RDS: trans_private nents is %u\n", nents);
-
 	/* Obtain a transport specific MR. If this succeeds, the
 	 * s/g list is now owned by the MR.
 	 * Note that dma_map() implies that pending writes are
 	 * flushed to RAM, so no dma_sync is needed here. */
-	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
-						 &mr->r_key,
-						 cp ? cp->cp_conn : NULL);
+	trans_private = rs->rs_transport->get_mr(
+		sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
+		args->vec.addr, args->vec.bytes,
+		need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);

 	if (IS_ERR(trans_private)) {
-		for (i = 0 ; i < nents; i++)
-			put_page(sg_page(&sg[i]));
-		kfree(sg);
+		/* In ODP case, we don't GUP pages, so don't need
+		 * to release anything.
+		 */
+		if (!need_odp) {
+			unpin_user_pages(pages, nr_pages);
+			kfree(sg);
+		}
 		ret = PTR_ERR(trans_private);
 		goto out;
 	}
@@ -290,11 +313,20 @@
 	 * map page aligned regions. So we keep the offset, and build
 	 * a 64bit cookie containing <R_Key, offset> and pass that
 	 * around. */
-	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+	if (need_odp)
+		cookie = rds_rdma_make_cookie(mr->r_key, 0);
+	else
+		cookie = rds_rdma_make_cookie(mr->r_key,
+					      args->vec.addr & ~PAGE_MASK);
 	if (cookie_ret)
 		*cookie_ret = cookie;

-	if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+	if (args->cookie_addr &&
+	    put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
+		if (!need_odp) {
+			unpin_user_pages(pages, nr_pages);
+			kfree(sg);
+		}
 		ret = -EFAULT;
 		goto out;
 	}
@@ -309,7 +341,7 @@

 	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
 	if (mr_ret) {
-		refcount_inc(&mr->r_refcount);
+		kref_get(&mr->r_kref);
 		*mr_ret = mr;
 	}

@@ -317,25 +349,24 @@
 out:
 	kfree(pages);
 	if (mr)
-		rds_mr_put(mr);
+		kref_put(&mr->r_kref, __rds_put_mr_final);
 	return ret;
 }

-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
 {
 	struct rds_get_mr_args args;

 	if (optlen != sizeof(struct rds_get_mr_args))
 		return -EINVAL;

-	if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
-			   sizeof(struct rds_get_mr_args)))
+	if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args)))
 		return -EFAULT;

 	return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
 }

-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen)
 {
 	struct rds_get_mr_for_dest_args args;
 	struct rds_get_mr_args new_args;
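
Reviewer note: the sockptr_t conversion lets these setsockopt handlers accept either a kernel or a user pointer transparently; copy_from_sockptr() dispatches on which kind the sockptr_t wraps. A minimal sketch (struct demo_args and demo_setsockopt are hypothetical):

	#include <linux/sockptr.h>

	struct demo_args {
		u64 cookie;
	};

	static int demo_setsockopt(sockptr_t optval, int optlen)
	{
		struct demo_args args;

		if (optlen != sizeof(args))
			return -EINVAL;
		/* copies via copy_from_user() or memcpy() as appropriate */
		if (copy_from_sockptr(&args, optval, sizeof(args)))
			return -EFAULT;
		return 0;
	}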
@@ -343,7 +374,7 @@
 	if (optlen != sizeof(struct rds_get_mr_for_dest_args))
 		return -EINVAL;

-	if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+	if (copy_from_sockptr(&args, optval,
 			   sizeof(struct rds_get_mr_for_dest_args)))
 		return -EFAULT;

@@ -362,7 +393,7 @@
 /*
  * Free the MR indicated by the given R_Key
  */
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
 {
 	struct rds_free_mr_args args;
 	struct rds_mr *mr;
@@ -371,8 +402,7 @@
 	if (optlen != sizeof(struct rds_free_mr_args))
 		return -EINVAL;

-	if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
-			   sizeof(struct rds_free_mr_args)))
+	if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args)))
 		return -EFAULT;

 	/* Special case - a null cookie means flush all unused MRs */
@@ -400,13 +430,7 @@
 	if (!mr)
 		return -EINVAL;

-	/*
-	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
-	 * we return. If we let rds_mr_put() do it it might not happen until
-	 * someone else drops their ref.
-	 */
-	rds_destroy_mr(mr);
-	rds_mr_put(mr);
+	kref_put(&mr->r_kref, __rds_put_mr_final);
 	return 0;
 }

@@ -430,6 +454,14 @@
 		return;
 	}

+	/* Get a reference so that the MR won't go away before calling
+	 * sync_mr() below.
+	 */
+	kref_get(&mr->r_kref);
+
+	/* If it is going to be freed, remove it from the tree now so
+	 * that no other thread can find it and free it.
+	 */
 	if (mr->r_use_once || force) {
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
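
Reviewer note: the added kref_get() closes a use-after-free window: once the tree lock is dropped, another thread could find and free the MR before sync_mr() runs. The general shape of the lookup-pin-use pattern, as a sketch using this file's lock and field names:

	/* Sketch only: take the reference while the tree lock is held,
	 * use the object outside the lock, then drop the reference.
	 */
	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);	/* lookup */
	if (mr)
		kref_get(&mr->r_kref);	/* pin while still under the lock */
	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

	if (mr) {
		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
		kref_put(&mr->r_kref, __rds_put_mr_final);	/* done */
	}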
@@ -443,34 +475,37 @@
 	if (mr->r_trans->sync_mr)
 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);

+	/* Release the reference held above. */
+	kref_put(&mr->r_kref, __rds_put_mr_final);
+
 	/* If the MR was marked as invalidate, this will
 	 * trigger an async flush. */
-	if (zot_me) {
-		rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	if (zot_me)
+		kref_put(&mr->r_kref, __rds_put_mr_final);
 }

 void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;

-	for (i = 0; i < ro->op_nents; i++) {
-		struct page *page = sg_page(&ro->op_sg[i]);
+	if (ro->op_odp_mr) {
+		kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);
+	} else {
+		for (i = 0; i < ro->op_nents; i++) {
+			struct page *page = sg_page(&ro->op_sg[i]);

-		/* Mark page dirty if it was possibly modified, which
-		 * is the case for a RDMA_READ which copies from remote
-		 * to local memory */
-		if (!ro->op_write) {
-			WARN_ON(!page->mapping && irqs_disabled());
-			set_page_dirty(page);
+			/* Mark page dirty if it was possibly modified, which
+			 * is the case for a RDMA_READ which copies from remote
+			 * to local memory
+			 */
+			unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
 		}
-		put_page(page);
 	}

 	kfree(ro->op_notifier);
 	ro->op_notifier = NULL;
 	ro->op_active = 0;
+	ro->op_odp_mr = NULL;
 }
@@ -480,8 +515,7 @@
 	/* Mark page dirty if it was possibly modified, which
 	 * is the case for a RDMA_READ which copies from remote
 	 * to local memory */
-	set_page_dirty(page);
-	put_page(page);
+	unpin_user_pages_dirty_lock(&page, 1, true);

 	kfree(ao->op_notifier);
 	ao->op_notifier = NULL;
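
Reviewer note: unpin_user_pages_dirty_lock() folds the old set_page_dirty() + put_page() pair into one call that dirties the page under the proper locking only when make_dirty is true, then drops the pin. A sketch of the shape shared by both call sites above (release_one_page is hypothetical):

	#include <linux/mm.h>

	/* 'device_wrote' is true when the hardware may have written the
	 * page, e.g. the target of an RDMA READ or a fetched atomic result.
	 */
	static void release_one_page(struct page *page, bool device_wrote)
	{
		unpin_user_pages_dirty_lock(&page, 1, device_wrote);
	}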
@@ -583,6 +617,7 @@
 	struct rds_iovec *iovs;
 	unsigned int i, j;
 	int ret = 0;
+	bool odp_supported = true;

 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
 	    || rm->rdma.op_active)
@@ -604,6 +639,9 @@
 		ret = -EINVAL;
 		goto out_ret;
 	}
+	/* odp-mr is not supported for multiple requests within one message */
+	if (args->nr_local != 1)
+		odp_supported = false;

 	iovs = vec->iov;

@@ -625,10 +663,12 @@
 	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
 	op->op_active = 1;
 	op->op_recverr = rs->rs_recverr;
+	op->op_odp_mr = NULL;
+
 	WARN_ON(!nr_pages);
 	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
-	if (!op->op_sg) {
-		ret = -ENOMEM;
+	if (IS_ERR(op->op_sg)) {
+		ret = PTR_ERR(op->op_sg);
 		goto out_pages;
 	}

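
Reviewer note: rds_message_alloc_sgs() now reports failure through the ERR_PTR convention rather than returning NULL, so callers test with IS_ERR() and decode with PTR_ERR(), as this hunk and the atomic-op hunk below do. A minimal sketch of the convention (demo_alloc and demo_use are hypothetical):

	#include <linux/err.h>
	#include <linux/slab.h>

	static void *demo_alloc(size_t len)
	{
		void *p = kzalloc(len, GFP_KERNEL);

		if (!p)
			return ERR_PTR(-ENOMEM);   /* errno encoded in pointer */
		return p;
	}

	static int demo_use(void)
	{
		void *p = demo_alloc(32);

		if (IS_ERR(p))
			return PTR_ERR(p);	/* recover -ENOMEM */
		kfree(p);
		return 0;
	}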
@@ -645,16 +685,6 @@
 		}
 		op->op_notifier->n_user_token = args->user_token;
 		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
-
-		/* Enable rmda notification on data operation for composite
-		 * rds messages and make sure notification is enabled only
-		 * for the data operation which follows it so that application
-		 * gets notified only after full message gets delivered.
-		 */
-		if (rm->data.op_sg) {
-			rm->rdma.op_notify = 0;
-			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-		}
 	}

 	/* The cookie contains the R_Key of the remote memory region, and
@@ -686,10 +716,44 @@
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
 		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
-		if (ret < 0)
+		if ((!odp_supported && ret <= 0) ||
+		    (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
 			goto out_pages;
-		else
-			ret = 0;
+
+		if (ret == -EOPNOTSUPP) {
+			struct rds_mr *local_odp_mr;
+
+			if (!rs->rs_transport->get_mr) {
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			local_odp_mr =
+				kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
+			if (!local_odp_mr) {
+				ret = -ENOMEM;
+				goto out_pages;
+			}
+			RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
+			kref_init(&local_odp_mr->r_kref);
+			local_odp_mr->r_trans = rs->rs_transport;
+			local_odp_mr->r_sock = rs;
+			local_odp_mr->r_trans_private =
+				rs->rs_transport->get_mr(
+					NULL, 0, rs, &local_odp_mr->r_key, NULL,
+					iov->addr, iov->bytes, ODP_VIRTUAL);
+			if (IS_ERR(local_odp_mr->r_trans_private)) {
+				ret = IS_ERR(local_odp_mr->r_trans_private);
+				rdsdebug("get_mr ret %d %p\"", ret,
+					 local_odp_mr->r_trans_private);
+				kfree(local_odp_mr);
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
+				 local_odp_mr, local_odp_mr->r_trans_private);
+			op->op_odp_mr = local_odp_mr;
+			op->op_odp_addr = iov->addr;
+		}

 		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
 			 nr_bytes, nr, iov->bytes, iov->addr);
@@ -705,6 +769,7 @@
 				min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 				offset);

+			sg_dma_len(sg) = sg->length;
 			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
 				 sg->offset, sg->length, iov->addr, iov->bytes);

@@ -723,6 +788,7 @@
 			goto out_pages;
 		}
 		op->op_bytes = nr_bytes;
+		ret = 0;

 out_pages:
 	kfree(pages);
@@ -765,11 +831,12 @@
 	if (!mr)
 		err = -EINVAL;	/* invalid r_key */
 	else
-		refcount_inc(&mr->r_refcount);
+		kref_get(&mr->r_kref);
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

 	if (mr) {
-		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+		mr->r_trans->sync_mr(mr->r_trans_private,
+				     DMA_TO_DEVICE);
 		rm->rdma.op_rdma_mr = mr;
 	}
 	return err;
@@ -843,8 +910,8 @@
 	rm->atomic.op_active = 1;
 	rm->atomic.op_recverr = rs->rs_recverr;
 	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
-	if (!rm->atomic.op_sg) {
-		ret = -ENOMEM;
+	if (IS_ERR(rm->atomic.op_sg)) {
+		ret = PTR_ERR(rm->atomic.op_sg);
 		goto err;
 	}

@@ -883,7 +950,7 @@
 	return ret;
 err:
 	if (page)
-		put_page(page);
+		unpin_user_page(page);
 	rm->atomic.op_active = 0;
 	kfree(rm->atomic.op_notifier);
