2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/mremap.c
@@ -30,12 +30,11 @@
 
 #include "internal.h"
 
-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
-	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
@@ -49,6 +48,18 @@
 	if (pud_none_or_clear_bad(pud))
 		return NULL;
 
+	return pud;
+}
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = get_old_pud(mm, addr);
+	if (!pud)
+		return NULL;
+
 	pmd = pmd_offset(pud, addr);
 	if (pmd_none(*pmd))
 		return NULL;
@@ -56,19 +67,27 @@
 	return pmd;
 }
 
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
 			    unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
 
 	pgd = pgd_offset(mm, addr);
 	p4d = p4d_alloc(mm, pgd, addr);
 	if (!p4d)
 		return NULL;
-	pud = pud_alloc(mm, p4d, addr);
+
+	return pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			    unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = alloc_new_pud(mm, vma, addr);
 	if (!pud)
 		return NULL;
 
@@ -133,7 +152,7 @@
 	 * such races:
 	 *
 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
-	 *   which rmap call sites look for using is_vma_temporary_stack().
+	 *   which rmap call sites look for using vma_is_temporary_stack().
 	 *
 	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
@@ -146,7 +165,7 @@
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
-	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 * pte locks because exclusive mmap_lock prevents deadlock.
 	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
 	new_pte = pte_offset_map(new_pmd, new_addr);
@@ -191,63 +210,327 @@
 		drop_rmap_locks(vma);
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * If we have the only reference, swap the refcount to -1. This
+	 * will prevent other concurrent references by get_vma() for SPFs.
+	 */
+	return atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1;
+}
+
+/*
+ * Restore the VMA reference count to 1 after a fast mremap.
+ */
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * This should only be called after a corresponding,
+	 * successful trylock_vma_ref_count().
+	 */
+	VM_BUG_ON_VMA(atomic_cmpxchg(&vma->vm_ref_count, -1, 1) != -1,
+		      vma);
+}
+#else /* !CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline bool trylock_vma_ref_count(struct vm_area_struct *vma)
+{
+	return true;
+}
+static inline void unlock_vma_ref_count(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t pmd;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have released it.
+	 *
+	 * However, there's a case during execve() where we use mremap
+	 * to move the initial stack, and in that case the target area
+	 * may overlap the source area (always moving down).
+	 *
+	 * If everything is PMD-aligned, that works fine, as moving
+	 * each pmd down will clear the source pmd. But if we first
+	 * have a few 4kB-only pages that get moved down, and then
+	 * hit the "now the rest is PMD-aligned, let's do everything
+	 * one pmd at a time", we will still have the old (now empty
+	 * of any 4kB pages, but still there) PMD in the page table
+	 * tree.
+	 *
+	 * Warn on it once - because we really should try to figure
+	 * out how to do this better - but then say "I won't move
+	 * this pmd".
+	 *
+	 * One alternative might be to just unmap the target pmd at
+	 * this point, and verify that it really is empty. We'll see.
+	 */
+	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA fallback to the move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+	new_ptl = pmd_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pmd */
+	pmd = *old_pmd;
+	pmd_clear(old_pmd);
+
+	VM_BUG_ON(!pmd_none(*new_pmd));
+
+	/* Set the new pmd */
+	set_pmd_at(mm, new_addr, new_pmd, pmd);
+	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pmd(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
+		pmd_t *new_pmd)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_HAVE_MOVE_PUD
+static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We hold both exclusive mmap_lock and rmap_lock at this point and
+	 * cannot block. If we cannot immediately take exclusive ownership
+	 * of the VMA fallback to the move_ptes().
+	 */
+	if (!trylock_vma_ref_count(vma))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	unlock_vma_ref_count(vma);
+	return true;
+}
+#else
+static inline bool move_normal_pud(struct vm_area_struct *vma,
+		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
+		pud_t *new_pud)
+{
+	return false;
+}
+#endif
+
+enum pgt_entry {
+	NORMAL_PMD,
+	HPAGE_PMD,
+	NORMAL_PUD,
+};
+
+/*
+ * Returns an extent of the corresponding size for the pgt_entry specified if
+ * valid. Else returns a smaller extent bounded by the end of the source and
+ * destination pgt_entry.
+ */
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+			unsigned long old_addr, unsigned long old_end,
+			unsigned long new_addr)
+{
+	unsigned long next, extent, mask, size;
+
+	switch (entry) {
+	case HPAGE_PMD:
+	case NORMAL_PMD:
+		mask = PMD_MASK;
+		size = PMD_SIZE;
+		break;
+	case NORMAL_PUD:
+		mask = PUD_MASK;
+		size = PUD_SIZE;
+		break;
+	default:
+		BUILD_BUG();
+		break;
+	}
+
+	next = (old_addr + size) & mask;
+	/* even if next overflowed, extent below will be ok */
+	extent = next - old_addr;
+	if (extent > old_end - old_addr)
+		extent = old_end - old_addr;
+	next = (new_addr + size) & mask;
+	if (extent > next - new_addr)
+		extent = next - new_addr;
+	return extent;
+}
+
+/*
+ * Attempts to speedup the move by moving entry at the level corresponding to
+ * pgt_entry. Returns true if the move was successful, else false.
+ */
+static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
+			unsigned long old_addr, unsigned long new_addr,
+			void *old_entry, void *new_entry, bool need_rmap_locks)
+{
+	bool moved = false;
+
+	/* See comment in move_ptes() */
+	if (need_rmap_locks)
+		take_rmap_locks(vma);
+
+	switch (entry) {
+	case NORMAL_PMD:
+		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case NORMAL_PUD:
+		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
+					new_entry);
+		break;
+	case HPAGE_PMD:
+		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+			move_huge_pmd(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (need_rmap_locks)
+		drop_rmap_locks(vma);
+
+	return moved;
+}
+
 unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
 		bool need_rmap_locks)
 {
-	unsigned long extent, next, old_end;
+	unsigned long extent, old_end;
+	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	if (!len)
+		return 0;
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
-	mmun_start = old_addr;
-	mmun_end = old_end;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+				old_addr, old_end);
+	mmu_notifier_invalidate_range_start(&range);
 
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
-		next = (old_addr + PMD_SIZE) & PMD_MASK;
-		/* even if next overflowed, extent below will be ok */
-		extent = next - old_addr;
-		if (extent > old_end - old_addr)
-			extent = old_end - old_addr;
+		/*
+		 * If extent is PUD-sized try to speed up the move by moving at the
+		 * PUD level if possible.
+		 */
+		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+			pud_t *old_pud, *new_pud;
+
+			old_pud = get_old_pud(vma->vm_mm, old_addr);
+			if (!old_pud)
+				continue;
+			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+			if (!new_pud)
+				break;
+			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
+					   old_pud, new_pud, true))
+				continue;
+		}
+
+		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
 		if (!old_pmd)
			continue;
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
-		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
-			if (extent == HPAGE_PMD_SIZE) {
-				bool moved;
-				/* See comment in move_ptes() */
-				if (need_rmap_locks)
-					take_rmap_locks(vma);
-				moved = move_huge_pmd(vma, old_addr, new_addr,
-						    old_end, old_pmd, new_pmd);
-				if (need_rmap_locks)
-					drop_rmap_locks(vma);
-				if (moved)
-					continue;
-			}
+		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
+		    pmd_devmap(*old_pmd)) {
+			if (extent == HPAGE_PMD_SIZE &&
+			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, need_rmap_locks))
+				continue;
 			split_huge_pmd(vma, old_pmd, old_addr);
 			if (pmd_trans_unstable(old_pmd))
 				continue;
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
+			   extent == PMD_SIZE) {
+			/*
+			 * If the extent is PMD-sized, try to speed the move by
+			 * moving at the PMD level if possible.
+			 */
+			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
+					   old_pmd, new_pmd, true))
+				continue;
 		}
-		if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
+
+		if (pte_alloc(new_vma->vm_mm, new_pmd))
 			break;
-		next = (new_addr + PMD_SIZE) & PMD_MASK;
-		if (extent > next - new_addr)
-			extent = next - new_addr;
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
 			  new_pmd, new_addr, need_rmap_locks);
 	}
 
-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return len + old_addr - old_end;	/* how much done */
 }
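
Note: the new get_extent() helper above clamps each iteration of the copy loop so that a single step never crosses a PMD (or PUD) boundary on either the source or the destination side. The standalone userspace sketch below mirrors the same masking arithmetic for the PMD case; the 2 MiB PMD_SIZE constant and the demo addresses are assumptions for illustration (x86-64 with 4 KiB pages), not kernel code.

#include <stdio.h>

#define PMD_SIZE 0x200000UL		/* assumed: 2 MiB, x86-64 with 4 KiB pages */
#define PMD_MASK (~(PMD_SIZE - 1))

/* Mirrors the clamping done by get_extent(NORMAL_PMD, ...) in the patch above. */
static unsigned long pmd_extent(unsigned long old_addr, unsigned long old_end,
				unsigned long new_addr)
{
	unsigned long next, extent;

	next = (old_addr + PMD_SIZE) & PMD_MASK;	/* next PMD boundary after old_addr */
	extent = next - old_addr;
	if (extent > old_end - old_addr)		/* don't copy past the request */
		extent = old_end - old_addr;
	next = (new_addr + PMD_SIZE) & PMD_MASK;	/* nor across a boundary at the destination */
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

int main(void)
{
	/* source is 4 KiB short of a PMD boundary: only one page this round */
	printf("%#lx\n", pmd_extent(0x1ff000, 0x600000, 0x400000));	/* prints 0x1000 */
	/* both sides PMD-aligned: a full 2 MiB step is possible */
	printf("%#lx\n", pmd_extent(0x200000, 0x600000, 0x400000));	/* prints 0x200000 */
	return 0;
}

Once both addresses are aligned the loop can take full PMD-sized (or PUD-sized) steps, which is what makes the move_normal_pmd()/move_normal_pud() fast paths reachable.
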
@@ -255,8 +538,8 @@
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -294,6 +577,14 @@
 	if (!new_vma)
 		return -ENOMEM;
 
+	/* new_vma is returned protected by copy_vma, to prevent speculative
+	 * page fault to be done in the destination area before we move the pte.
+	 * Now, we must also protect the source VMA since we don't want pages
+	 * to be mapped in our back while we are copying the PTEs.
+	 */
+	if (vma != new_vma)
+		vm_write_begin(vma);
+
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
 	if (moved_len < old_len) {
@@ -310,6 +601,8 @@
 		 */
 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
 				 true);
+		if (vma != new_vma)
+			vm_write_end(vma);
 		vma = new_vma;
 		old_len = new_len;
 		old_addr = new_addr;
@@ -318,7 +611,10 @@
 		mremap_userfaultfd_prep(new_vma, uf);
 		arch_remap(mm, old_addr, old_addr + old_len,
 			   new_addr, new_addr + new_len);
+		if (vma != new_vma)
+			vm_write_end(vma);
 	}
+	vm_write_end(new_vma);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
@@ -345,11 +641,43 @@
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);
 
+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT) {
+			/* Always put back VM_ACCOUNT since we won't unmap */
+			vma->vm_flags |= VM_ACCOUNT;
+
+			vm_acct_memory(new_len >> PAGE_SHIFT);
+		}
+
+		/*
+		 * VMAs can actually be merged back together in copy_vma
+		 * calling merge_vma. This can happen with anonymous vmas
+		 * which have not yet been faulted, so if we were to consider
+		 * this VMA split we'll end up adding VM_ACCOUNT on the
+		 * next VMA, which is completely unrelated if this VMA
+		 * was re-merged.
+		 */
+		if (split && new_vma == vma)
+			split = 0;
+
+		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
+		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+		/* Because we won't unmap we don't need to touch locked_vm */
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		*locked = true;
+	}
+out:
 	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -359,16 +687,12 @@
 			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	return new_addr;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, unsigned long flags,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
@@ -389,6 +713,10 @@
 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
 		return ERR_PTR(-EINVAL);
 	}
+
+	if ((flags & MREMAP_DONTUNMAP) &&
+			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
+		return ERR_PTR(-EINVAL);
 
 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
@@ -434,7 +762,7 @@
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
@@ -442,7 +770,7 @@
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
-	unsigned long map_flags;
+	unsigned long map_flags = 0;
 
 	if (offset_in_page(new_addr))
 		goto out;
@@ -454,9 +782,28 @@
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;
 
-	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
-	if (ret)
-		goto out;
+	/*
+	 * move_vma() need us to stay 4 maps below the threshold, otherwise
+	 * it will bail out at the very beginning.
+	 * That is a problem if we have already unmaped the regions here
+	 * (new_addr, and old_addr), because userspace will not know the
+	 * state of the vma's after it gets -ENOMEM.
+	 * So, to avoid such scenario we can pre-compute if the whole
+	 * operation has high chances to success map-wise.
+	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
+	 * split in 3 before unmaping it.
+	 * That means 2 more maps (1 for each) to the ones we already hold.
+	 * Check whether current map count plus 2 still leads us to 4 maps below
+	 * the threshold, otherwise return -ENOMEM here to be more safe.
+	 */
+	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
+	if (flags & MREMAP_FIXED) {
+		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+		if (ret)
+			goto out;
+	}
 
 	if (old_len >= new_len) {
 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
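
Note: to make the pre-check above concrete — each of the two VMAs involved (old_addr and new_addr) can be split in three before its middle piece is unmapped, a net of one extra map per VMA, hence the "+ 2". Assuming the common default sysctl_max_map_count of 65530 (a tunable, so treat the number as illustrative), a process already holding 65525 maps gets

	65525 + 2 = 65527  >=  65530 - 3 = 65527

and mremap_to() now fails early with -ENOMEM instead of discovering the problem after new_addr/old_addr have already been partially unmapped.
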
@@ -465,26 +812,41 @@
 		old_len = new_len;
 	}
 
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
 	}
 
-	map_flags = MAP_FIXED;
+	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+	if (flags & MREMAP_DONTUNMAP &&
+		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (flags & MREMAP_FIXED)
+		map_flags |= MAP_FIXED;
+
 	if (vma->vm_flags & VM_MAYSHARE)
 		map_flags |= MAP_SHARED;
 
 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
 				((addr - vma->vm_start) >> PAGE_SHIFT),
 				map_flags);
-	if (offset_in_page(ret))
+	if (IS_ERR_VALUE(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	/* We got a new mapping */
+	if (!(flags & MREMAP_FIXED))
+		new_addr = ret;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
+
 	if (!(offset_in_page(ret)))
 		goto out;
+
 out1:
 	vm_unacct_memory(charged);
 
@@ -521,17 +883,37 @@
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	bool downgraded = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
 
+	/*
+	 * There is a deliberate asymmetry here: we strip the pointer tag
+	 * from the old address but leave the new address alone. This is
+	 * for consistency with mmap(), where we prevent the creation of
+	 * aliasing mappings in userspace by leaving the tag bits of the
+	 * mapping address intact. A non-zero tag will cause the subsequent
+	 * range checks to reject the address as invalid.
+	 *
+	 * See Documentation/arm64/tagged-address-abi.rst for more information.
+	 */
 	addr = untagged_addr(addr);
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
+
+	/*
+	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+	 * in the process.
+	 */
+	if (flags & MREMAP_DONTUNMAP &&
+		(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+		return ret;
+
 
 	if (offset_in_page(addr))
 		return ret;
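
Note: the validation above fixes the user-visible contract of the new MREMAP_DONTUNMAP flag — it must be paired with MREMAP_MAYMOVE, the length may not change, and (per vma_to_resize() above) it is refused for VM_DONTEXPAND/VM_PFNMAP mappings. A minimal userspace sketch of how the flag might be exercised follows; it is illustrative only, and the #define fallback assumes the UAPI value in case the libc headers predate the flag.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4	/* assumed UAPI value; present in recent headers */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	unsigned char *old, *new;

	old = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED)
		return 1;
	memset(old, 0xaa, len);

	/* MREMAP_MAYMOVE is mandatory and old_len must equal new_len. */
	new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (new == MAP_FAILED)
		return 1;

	/* Pages moved to 'new'; 'old' is still mapped but now empty. */
	printf("old=%p new=%p old[0]=%#x new[0]=%#x\n",
	       (void *)old, (void *)new, old[0], new[0]);
	return 0;
}

After the call the contents are reachable only through new; old stays mapped but, being private anonymous memory, faults in fresh zero pages on the next access instead of being torn down.
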
@@ -547,24 +929,33 @@
 	if (!new_len)
 		return ret;
 
-	if (down_write_killable(&current->mm->mmap_sem))
+	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
 
-	if (flags & MREMAP_FIXED) {
+	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}
 
 	/*
 	 * Always allow a shrinking remap: that just unmaps
 	 * the unnecessary pages..
-	 * do_munmap does all the needed commit accounting
+	 * __do_munmap does all the needed commit accounting, and
+	 * downgrades mmap_lock to read if so directed.
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
-		if (ret && old_len != new_len)
+		int retval;
+
+		retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+				     &uf_unmap, true);
+		if (retval < 0 && old_len != new_len) {
+			ret = retval;
 			goto out;
+		/* Returning 1 indicates mmap_lock is downgraded to read. */
+		} else if (retval == 1)
+			downgraded = true;
 		ret = addr;
 		goto out;
 	}
@@ -572,7 +963,7 @@
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
@@ -616,24 +1007,27 @@
 					vma->vm_pgoff +
 					((addr - vma->vm_start) >> PAGE_SHIFT),
 					map_flags);
-		if (offset_in_page(new_addr)) {
+		if (IS_ERR_VALUE(new_addr)) {
 			ret = new_addr;
 			goto out;
 		}
 
 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-			       &locked, &uf, &uf_unmap);
+			       &locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {
 		vm_unacct_memory(charged);
-		locked = 0;
+		locked = false;
 	}
-	up_write(&current->mm->mmap_sem);
+	if (downgraded)
+		mmap_read_unlock(current->mm);
+	else
+		mmap_write_unlock(current->mm);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
-	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;
 }