From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/mmap.c | 837 +++++++++++++++++++++++++++++++++++++++-------------------- 1 files changed, 553 insertions(+), 284 deletions(-) diff --git a/kernel/mm/mmap.c b/kernel/mm/mmap.c index 4bf28d1..8ec92cc 100644 --- a/kernel/mm/mmap.c +++ b/kernel/mm/mmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * @@ -52,6 +53,10 @@ #include <asm/tlb.h> #include <asm/mmu_context.h> +#define CREATE_TRACE_POINTS +#include <trace/events/mmap.h> +#undef CREATE_TRACE_POINTS +#include <trace/hooks/mm.h> #include "internal.h" #ifndef arch_mmap_check @@ -128,7 +133,7 @@ vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } - /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } @@ -139,7 +144,7 @@ struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file_inode(file)->i_writecount); + allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -164,6 +169,27 @@ } } +static void __free_vma(struct vm_area_struct *vma) +{ + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); + vm_area_free(vma); +} + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +void put_vma(struct vm_area_struct *vma) +{ + if (atomic_dec_and_test(&vma->vm_ref_count)) + __free_vma(vma); +} +#else +static inline void put_vma(struct vm_area_struct *vma) +{ + __free_vma(vma); +} +#endif + /* * Close a vm structure and free it, returning the next. */ @@ -174,10 +200,7 @@ might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); - vm_area_free(vma); + put_vma(vma); return next; } @@ -186,15 +209,18 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; - unsigned long newbrk, oldbrk; + unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; unsigned long min_brk; bool populate; + bool downgraded = false; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; + + origbrk = mm->brk; #ifdef CONFIG_COMPAT_BRK /* @@ -224,14 +250,32 @@ newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } - /* Always allow shrinking brk. */ + /* + * Always allow shrinking brk. + * __do_munmap() may downgrade mmap_lock to read. + */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) - goto set_brk; - goto out; + int ret; + + /* + * mm->brk must to be protected by write mmap_lock so update it + * before downgrading mmap_lock. When __do_munmap() fails, + * mm->brk will be restored from origbrk. + */ + mm->brk = brk; + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); + if (ret < 0) { + mm->brk = origbrk; + goto out; + } else if (ret == 1) { + downgraded = true; + } + goto success; } /* Check against existing mmap mappings. */ @@ -242,25 +286,28 @@ /* Ok, looks good - let it rip. 
*/ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; - -set_brk: mm->brk = brk; + +success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; - up_write(&mm->mmap_sem); + if (downgraded) + mmap_read_unlock(mm); + else + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: - retval = mm->brk; - up_write(&mm->mmap_sem); + retval = origbrk; + mmap_write_unlock(mm); return retval; } -static long vma_compute_subtree_gap(struct vm_area_struct *vma) +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) { - unsigned long max, prev_end, subtree_gap; + unsigned long gap, prev_end; /* * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we @@ -268,14 +315,21 @@ * an unmapped area; whereas when expanding we only require one. * That's a little inconsistent, but keeps the code here simpler. */ - max = vm_start_gap(vma); + gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev); - if (max > prev_end) - max -= prev_end; + if (gap > prev_end) + gap -= prev_end; else - max = 0; + gap = 0; } + return gap; +} + +#ifdef CONFIG_DEBUG_VM_RB +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max = vma_compute_gap(vma), subtree_gap; if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -291,7 +345,6 @@ return max; } -#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct mm_struct *mm) { struct rb_root *root = &mm->mm_rb; @@ -397,8 +450,16 @@ #define validate_mm(mm) do { } while (0) #endif -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, + struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_gap) +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock) +#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock) +#else +#define mm_rb_write_lock(mm) do { } while (0) +#define mm_rb_write_unlock(mm) do { } while (0) +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ /* * Update augmented rbtree rb_subtree_gap values after vma->vm_start or @@ -408,55 +469,64 @@ static void vma_gap_update(struct vm_area_struct *vma) { /* - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exacltly what we want. + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created + * a callback function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) + struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; + /* All rb_subtree_gap values must be consistent prior to insertion */ validate_mm_rb(root, NULL); rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; /* * Note rb_erase_augmented is a fairly large inline function, * so make sure we instantiate it only once with our desired * augmented rbtree callbacks. */ + mm_rb_write_lock(mm); rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); + mm_rb_write_unlock(mm); /* wmb */ + + /* + * Ensure the removal is complete before clearing the node. 
+ * Matched by vma_has_changed()/handle_speculative_fault(). + */ + RB_CLEAR_NODE(&vma->vm_rb); } static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, + struct mm_struct *mm, struct vm_area_struct *ignore) { /* * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the "next" vma being erased if - * next->vm_start was reduced. + * with the possible exception of + * + * a. the "next" vma being erased if next->vm_start was reduced in + * __vma_adjust() -> __vma_unlink() + * b. the vma being erased in detach_vmas_to_be_unmapped() -> + * vma_rb_erase() */ - validate_mm_rb(root, ignore); + validate_mm_rb(&mm->mm_rb, ignore); - __vma_rb_erase(vma, root); + __vma_rb_erase(vma, mm); } static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) + struct mm_struct *mm) { - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the vma being erased. - */ - validate_mm_rb(root, vma); - - __vma_rb_erase(vma, root); + vma_rb_erase_ignore(vma, mm, vma); } /* @@ -470,7 +540,7 @@ * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * - * The entire update must be protected by exclusive mmap_sem and by + * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static inline void @@ -525,6 +595,50 @@ return 0; } +/* + * vma_next() - Get the next VMA. + * @mm: The mm_struct. + * @vma: The current vma. + * + * If @vma is NULL, return the first vma in the mm. + * + * Returns: The next VMA after @vma. + */ +static inline struct vm_area_struct *vma_next(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + if (!vma) + return mm->mmap; + + return vma->vm_next; +} + +/* + * munmap_vma_range() - munmap VMAs that overlap a range. + * @mm: The mm struct + * @start: The start of the range. + * @len: The length of the range. + * @pprev: pointer to the pointer that will be set to previous vm_area_struct + * @rb_link: the rb_node + * @rb_parent: the parent rb_node + * + * Find all the vm_area_struct that overlap from @start to + * @end and munmap them. Set @pprev to the previous vm_area_struct. + * + * Returns: -ENOMEM on munmap failure or 0 on success. + */ +static inline int +munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, + struct vm_area_struct **pprev, struct rb_node ***link, + struct rb_node **parent, struct list_head *uf) +{ + + while (find_vma_links(mm, start, start + len, pprev, link, parent)) + if (do_munmap(mm, start, len, uf)) + return -ENOMEM; + + return 0; +} static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -571,10 +685,12 @@ * immediately update the gap to the correct value. Finally we * rebalance the rbtree after all augmented values have been set. 
*/ + mm_rb_write_lock(mm); rb_link_node(&vma->vm_rb, rb_parent, rb_link); vma->rb_subtree_gap = 0; vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); + vma_rb_insert(vma, mm); + mm_rb_write_unlock(mm); } static void __vma_link_file(struct vm_area_struct *vma) @@ -586,9 +702,9 @@ struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file_inode(file)->i_writecount); + put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); @@ -601,7 +717,7 @@ struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { - __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_rb(mm, vma, rb_link, rb_parent); } @@ -642,37 +758,14 @@ mm->map_count++; } -static __always_inline void __vma_unlink_common(struct mm_struct *mm, +static __always_inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, - bool has_prev, struct vm_area_struct *ignore) { - struct vm_area_struct *next; - - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - next = vma->vm_next; - if (has_prev) - prev->vm_next = next; - else { - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - } - if (next) - next->vm_prev = prev; - + vma_rb_erase_ignore(vma, mm, ignore); + __vma_unlink_list(mm, vma); /* Kill the cache */ vmacache_invalidate(mm); -} - -static inline void __vma_unlink_prev(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - __vma_unlink_common(mm, vma, prev, true, vma); } /* @@ -684,7 +777,7 @@ */ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) + struct vm_area_struct *expand, bool keep_locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; @@ -695,6 +788,10 @@ bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; + + vm_write_begin(vma); + if (next) + vm_write_begin(next); if (next && !insert) { struct vm_area_struct *exporter = NULL, *importer = NULL; @@ -729,8 +826,6 @@ remove_next = 1 + (end > next->vm_end); VM_WARN_ON(remove_next == 2 && end != next->vm_next->vm_end); - VM_WARN_ON(remove_next == 1 && - end != next->vm_end); /* trim end to next, for case 6 first pass */ end = next->vm_end; } @@ -750,7 +845,7 @@ * vma expands, overlapping part of the next: * mprotect case 5 shifting the boundary up. */ - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + adjust_next = (end - next->vm_start); exporter = next; importer = vma; VM_WARN_ON(expand != importer); @@ -760,7 +855,7 @@ * split_vma inserting another: so it must be * mprotect case 4 shifting the boundary down. 
*/ - adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); + adjust_next = -(vma->vm_end - end); exporter = vma; importer = next; VM_WARN_ON(expand != importer); @@ -776,8 +871,12 @@ importer->anon_vma = exporter->anon_vma; error = anon_vma_clone(importer, exporter); - if (error) + if (error) { + if (next && next != vma) + vm_write_end(next); + vm_write_end(vma); return error; + } } } again: @@ -815,7 +914,7 @@ anon_vma_interval_tree_pre_update_vma(next); } - if (root) { + if (file) { flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, root); if (adjust_next) @@ -823,20 +922,22 @@ } if (start != vma->vm_start) { - vma->vm_start = start; + WRITE_ONCE(vma->vm_start, start); start_changed = true; } if (end != vma->vm_end) { - vma->vm_end = end; + WRITE_ONCE(vma->vm_end, end); end_changed = true; } - vma->vm_pgoff = pgoff; + WRITE_ONCE(vma->vm_pgoff, pgoff); if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; + WRITE_ONCE(next->vm_start, + next->vm_start + adjust_next); + WRITE_ONCE(next->vm_pgoff, + next->vm_pgoff + (adjust_next >> PAGE_SHIFT)); } - if (root) { + if (file) { if (adjust_next) vma_interval_tree_insert(next, root); vma_interval_tree_insert(vma, root); @@ -849,7 +950,7 @@ * us to remove next before dropping the locks. */ if (remove_next != 3) - __vma_unlink_prev(mm, next, vma); + __vma_unlink(mm, next, next); else /* * vma is not before next if they've been @@ -860,7 +961,7 @@ * "next" (which is stored in post-swap() * "vma"). */ - __vma_unlink_common(mm, next, NULL, false, vma); + __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -887,10 +988,9 @@ anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock_write(anon_vma); } - if (mapping) - i_mmap_unlock_write(mapping); - if (root) { + if (file) { + i_mmap_unlock_write(mapping); uprobe_mmap(vma); if (adjust_next) @@ -898,15 +998,13 @@ } if (remove_next) { - if (file) { + if (file) uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; - mpol_put(vma_policy(next)); - vm_area_free(next); + vm_write_end(next); + put_vma(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -920,6 +1018,8 @@ * "vma->vm_next" gap must be updated. */ next = vma->vm_next; + if (next) + vm_write_begin(next); } else { /* * For the scope of the comment "next" and @@ -966,6 +1066,11 @@ if (insert && file) uprobe_mmap(insert); + if (next && next != vma) + vm_write_end(next); + if (!keep_locked) + vm_write_end(vma); + validate_mm(mm); return 0; @@ -984,7 +1089,7 @@ * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressue on the memory system forcing + * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ @@ -1023,7 +1128,7 @@ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which + * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. 
*/ static int @@ -1081,17 +1186,20 @@ * the area passed down from mprotect_fixup, never extending beyond one * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: * - * AAAA AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX - * cannot merge might become might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPXXXXXXXX 8 - * AAAA - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN - * might become case 1 below case 2 below case 3 below + * AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN + * cannot merge might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN + * mmap, brk or case 4 below case 5 below + * mremap move: + * AAAA AAAA + * PPPP NNNN PPPPNNNNXXXX + * might become might become + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or + * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or + * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8 * - * It is important for case 8 that the the vma NNNN overlapping the + * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in * all cases where vma_merge succeeds, the moment vma_adjust drops the @@ -1105,13 +1213,13 @@ * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *__vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - const char __user *anon_name) + const char __user *anon_name, bool keep_locked) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1124,10 +1232,7 @@ if (vm_flags & VM_SPECIAL) return NULL; - if (prev) - next = prev->vm_next; - else - next = mm->mmap; + next = vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; @@ -1161,10 +1266,11 @@ /* cases 1, 6 */ err = __vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, - prev); + prev, keep_locked); } else /* cases 2, 5, 7 */ err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); + end, prev->vm_pgoff, NULL, prev, + keep_locked); if (err) return NULL; khugepaged_enter_vma_merge(prev, vm_flags); @@ -1182,10 +1288,12 @@ anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); + addr, prev->vm_pgoff, NULL, next, + keep_locked); else { /* cases 3, 8 */ err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); + next->vm_pgoff - pglen, NULL, next, + keep_locked); /* * In case 3 area is already equal to next and * this is a noop, but in case 8 "area" has @@ -1203,7 +1311,7 @@ } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. 
* * They need to have the same vm_file, and the flags can only differ @@ -1220,7 +1328,7 @@ return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } @@ -1267,26 +1375,22 @@ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { - struct anon_vma *anon_vma; - struct vm_area_struct *near; + struct anon_vma *anon_vma = NULL; - near = vma->vm_next; - if (!near) - goto try_prev; + /* Try next first. */ + if (vma->vm_next) { + anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + if (anon_vma) + return anon_vma; + } - anon_vma = reusable_anon_vma(near, vma, near); - if (anon_vma) - return anon_vma; -try_prev: - near = vma->vm_prev; - if (!near) - goto none; + /* Try prev next. */ + if (vma->vm_prev) + anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); - anon_vma = reusable_anon_vma(near, near, vma); - if (anon_vma) - return anon_vma; -none: /* + * We might reach here with anon_vma == NULL if we can't find + * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, @@ -1294,7 +1398,7 @@ * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ - return NULL; + return anon_vma; } /* @@ -1336,6 +1440,9 @@ if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; + if (S_ISSOCK(inode->i_mode)) + return MAX_LFS_FILESIZE; + /* Special "we do even unsigned file positions" case */ if (file->f_mode & FMODE_UNSIGNED_OFFSET) return 0; @@ -1358,15 +1465,15 @@ } /* - * The caller must hold down_write(¤t->mm->mmap_sem). + * The caller must write-lock current->mm->mmap_lock. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + unsigned long flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; int pkey = 0; *populate = 0; @@ -1408,7 +1515,7 @@ * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (offset_in_page(addr)) + if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { @@ -1428,7 +1535,7 @@ * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -1457,7 +1564,7 @@ * with MAP_SHARED to preserve backward compatibility. 
*/ flags &= LEGACY_MAP_MASK; - /* fall through */ + fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; @@ -1484,8 +1591,7 @@ vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - - /* fall through */ + fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; @@ -1560,11 +1666,12 @@ file = fget(fd); if (!file) return -EBADF; - if (is_file_hugepages(file)) + if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); - retval = -EINVAL; - if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + } else if (unlikely(flags & MAP_HUGETLB)) { + retval = -EINVAL; goto out_fput; + } } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; struct hstate *hs; @@ -1629,7 +1736,7 @@ #endif /* __ARCH_WANT_SYS_OLD_MMAP */ /* - * Some shared mappigns will want the pages marked read-only + * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). @@ -1653,8 +1760,12 @@ pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; - /* Do we need to track softdirty? */ - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) && + !is_vm_hugetlb_page(vma)) return 1; /* Specialty mapping? */ @@ -1663,7 +1774,7 @@ /* Can the mapping track the dirty pages? */ return vma->vm_file && vma->vm_file->f_mapping && - mapping_cap_account_dirty(vma->vm_file->f_mapping); + mapping_can_writeback(vma->vm_file->f_mapping); } /* @@ -1687,7 +1798,7 @@ struct list_head *uf) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; + struct vm_area_struct *vma, *prev, *merge; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; @@ -1707,13 +1818,9 @@ return -ENOMEM; } - /* Clear old maps */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } - + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* * Private writable mapping: check memory availability */ @@ -1781,6 +1888,28 @@ WARN_ON_ONCE(addr != vma->vm_start); addr = vma->vm_start; + + /* If vm_flags changed after call_mmap(), we should try merge vma again + * as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, + vma_get_anon_name(vma)); + if (merge) { + /* ->mmap() can change vma->vm_file and fput the original file. So + * fput the vma->vm_file here or we would add an extra fput for file + * and cause general protection fault ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. 
*/ + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); @@ -1790,9 +1919,19 @@ vma_set_anonymous(vma); } + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto close_and_free_vma; + else + goto free_vma; + } + vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { +unmap_writable: if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); if (vm_flags & VM_DENYWRITE) @@ -1802,12 +1941,14 @@ out: perf_event_mmap(vma); + vm_write_begin(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & VM_LOCKED_CLEAR_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -1822,19 +1963,24 @@ * then new mapped in-place (which must be aimed as * a completely new data area). */ - vma->vm_flags |= VM_SOFTDIRTY; + WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY); vma_set_page_prot(vma); + vm_write_end(vma); + + trace_android_vh_mmap_region(vma, addr); return addr; +close_and_free_vma: + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); unmap_and_free_vma: vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); allow_write_and_free_vma: @@ -1848,7 +1994,7 @@ return error; } -unsigned long unmapped_area(struct vm_unmapped_area_info *info) +static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* * We implement the search by looking for an rbtree node that @@ -1951,16 +2097,21 @@ return gap_start; } -unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long addr = 0; /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + + trace_android_vh_get_from_fragment_pool(mm, info, &addr); + if (addr) + return addr; /* * Adjust search limits by the desired length. @@ -2049,7 +2200,29 @@ VM_BUG_ON(gap_end < gap_start); return gap_end; } -EXPORT_SYMBOL_GPL(unmapped_area_topdown); + +/* + * Search for an unmapped address range. + * + * We are looking for a range that: + * - does not intersect with any VMA; + * - is contained within the [low_limit, high_limit) interval; + * - is at least the desired size. + * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) + */ +unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long addr; + + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) + addr = unmapped_area_topdown(info); + else + addr = unmapped_area(info); + + trace_vm_unmapped_area(addr, info); + return addr; +} +EXPORT_SYMBOL_GPL(vm_unmapped_area); /* Get an address range which is currently unmapped. * For shmat() with addr=0. 
@@ -2070,8 +2243,9 @@ struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr); - if (len > TASK_SIZE - mmap_min_addr) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -2080,7 +2254,7 @@ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2089,7 +2263,7 @@ info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = mmap_end; info.align_mask = 0; info.align_offset = 0; return vm_unmapped_area(&info); @@ -2102,17 +2276,17 @@ */ #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr); /* requested length too big for entire address space */ - if (len > TASK_SIZE - mmap_min_addr) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -2122,7 +2296,7 @@ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; @@ -2131,9 +2305,10 @@ info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; info.align_offset = 0; + trace_android_vh_exclude_reserved_zone(mm, &info); addr = vm_unmapped_area(&info); /* @@ -2146,9 +2321,11 @@ VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } + + trace_android_vh_include_reserved_zone(mm, &info, &addr); return addr; } @@ -2177,7 +2354,7 @@ /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. - * do_mmap_pgoff() will clear pgoff, so match alignment. + * do_mmap() will clear pgoff, so match alignment. */ pgoff = 0; get_area = shmem_get_unmapped_area; @@ -2199,15 +2376,11 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static struct vm_area_struct *__find_vma(struct mm_struct *mm, + unsigned long addr) { struct rb_node *rb_node; - struct vm_area_struct *vma; - - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; + struct vm_area_struct *vma = NULL; rb_node = mm->mm_rb.rb_node; @@ -2225,12 +2398,53 @@ rb_node = rb_node->rb_right; } + return vma; +} + +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + /* Check the cache first. 
*/ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + vma = __find_vma(mm, addr); if (vma) vmacache_update(addr, vma); return vma; } - EXPORT_SYMBOL(find_vma); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + read_lock(&mm->mm_rb_lock); + vma = __find_vma(mm, addr); + + /* + * If there is a concurrent fast mremap, bail out since the entire + * PMD/PUD subtree may have been remapped. + * + * This is usually safe for conventional mremap since it takes the + * PTE locks as does SPF. However fast mremap only takes the lock + * at the PMD/PUD level which is ok as it is done with the mmap + * write lock held. But SPF, as the term implies, forgoes + * taking the mmap read lock and also cannot take the PTL lock at the + * larger PMD/PUD granularity, since that would introduce huge + * contention in the page fault path; so fall back to regular fault + * handling. + */ + if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count)) + vma = NULL; + read_unlock(&mm->mm_rb_lock); + + return vma; +} +#endif /* * Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ @@ -2245,12 +2459,9 @@ if (vma) { *pprev = vma->vm_prev; } else { - struct rb_node *rb_node = mm->mm_rb.rb_node; - *pprev = NULL; - while (rb_node) { - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); - rb_node = rb_node->rb_right; - } + struct rb_node *rb_node = rb_last(&mm->mm_rb); + + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; } return vma; } @@ -2330,8 +2541,7 @@ gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr && - (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2343,7 +2553,7 @@ /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2361,7 +2571,7 @@ if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions. * anon_vma_lock_write() doesn't help here, as @@ -2412,7 +2622,7 @@ prev = vma->vm_prev; /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + vma_is_accessible(prev)) { if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; } @@ -2423,7 +2633,7 @@ /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2441,7 +2651,7 @@ if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as @@ -2455,8 +2665,8 @@ mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; + WRITE_ONCE(vma->vm_start, address); + WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow); anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); spin_unlock(&mm->page_table_lock); @@ -2483,7 +2693,7 @@ if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; - return 0; + return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); @@ -2503,7 +2713,7 @@ if (vma && (vma->vm_start <= addr)) return vma; /* don't alter vm_end if the coredump is running */ - if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2528,9 +2738,6 @@ if (vma->vm_start <= addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; - /* don't alter vm_start if the coredump is running */ - if (!mmget_still_valid(mm)) return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) @@ -2576,13 +2783,30 @@ struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; + struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; + struct vm_area_struct *cur_vma; lru_add_drain(); tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); + + /* + * Ensure we have no stale TLB entries by the time this mapping is + * removed from the rmap. + * Note that we don't have to worry about nested flushes here because + * we're holding the mm semaphore for removing the mapping - so any + * concurrent flush in this region has to be coming through the rmap, + * and we synchronize against that using the rmap lock. + */ + for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) { + if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) { + tlb_flush_mmu(&tlb); + break; + } + } + free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); @@ -2592,7 +2816,7 @@ * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. */ -static void +static bool detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end) { @@ -2602,7 +2826,7 @@ insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { - vma_rb_erase(vma, &mm->mm_rb); + vma_rb_erase(vma, mm); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2617,6 +2841,17 @@ /* Kill the cache */ vmacache_invalidate(mm); + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (vma && (vma->vm_flags & VM_GROWSDOWN)) + return false; + if (prev && (prev->vm_flags & VM_GROWSUP)) + return false; + return true; } /* @@ -2701,8 +2936,8 @@ * work. This now handles partial unmappings. 
* Jeremy Fitzhardinge <jeremy@goop.org> */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf) +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf, bool downgrade) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2711,8 +2946,16 @@ return -EINVAL; len = PAGE_ALIGN(len); + end = start + len; if (len == 0) return -EINVAL; + + /* + * arch_unmap() might do unmaps itself. It must be called + * and finish any rbtree manipulation before this code + * runs and also starts to manipulate the rbtree. + */ + arch_unmap(mm, start, end); /* Find the first overlapping VMA */ vma = find_vma(mm, start); @@ -2722,7 +2965,6 @@ /* we have start < vma->vm_end */ /* if it doesn't overlap, we have nothing.. */ - end = start + len; if (vma->vm_start >= end) return 0; @@ -2757,7 +2999,7 @@ if (error) return error; } - vma = prev ? prev->vm_next : mm->mmap; + vma = vma_next(mm, prev); if (unlikely(uf)) { /* @@ -2784,37 +3026,60 @@ mm->locked_vm -= vma_pages(tmp); munlock_vma_pages_all(tmp); } + tmp = tmp->vm_next; } } - /* - * Remove the vma's, and unmap the actual pages - */ - detach_vmas_to_be_unmapped(mm, vma, prev, end); - unmap_region(mm, vma, prev, start, end); + /* Detach vmas from rbtree */ + if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + downgrade = false; - arch_unmap(mm, vma, start, end); + if (downgrade) + mmap_write_downgrade(mm); + + unmap_region(mm, vma, prev, start, end); /* Fix up all other VM information */ remove_vma_list(mm, vma); - return 0; + return downgrade ? 1 : 0; } -int vm_munmap(unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) +{ + return __do_munmap(mm, start, len, uf, false); +} + +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_munmap(mm, start, len, &uf); - up_write(&mm->mmap_sem); + ret = __do_munmap(mm, start, len, &uf, downgrade); + /* + * Returning 1 indicates mmap_lock is downgraded. + * But 1 is not legal return value of vm_munmap() and munmap(), reset + * it to 0 before return. 
+ */ + if (ret == 1) { + mmap_read_unlock(mm); + ret = 0; + } else + mmap_write_unlock(mm); + userfaultfd_unmap_complete(mm, &uf); return ret; +} + +int vm_munmap(unsigned long start, size_t len) +{ + return __vm_munmap(start, len, false); } EXPORT_SYMBOL(vm_munmap); @@ -2822,7 +3087,7 @@ { addr = untagged_addr(addr); profile_munmap(addr); - return vm_munmap(addr, len); + return __vm_munmap(addr, len, true); } @@ -2854,7 +3119,7 @@ if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; vma = find_vma(mm, start); @@ -2913,26 +3178,16 @@ } file = get_file(vma->vm_file); - ret = do_mmap_pgoff(vma->vm_file, start, size, + ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); fput(file); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) ret = 0; return ret; -} - -static inline void verify_mm_writelocked(struct mm_struct *mm) -{ -#ifdef CONFIG_DEBUG_VM - if (unlikely(down_read_trylock(&mm->mmap_sem))) { - WARN_ON(1); - up_read(&mm->mmap_sem); - } -#endif } /* @@ -2947,34 +3202,24 @@ struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; + unsigned long mapped_addr; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (offset_in_page(error)) - return error; + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; error = mlock_future_check(mm, mm->def_flags, len); if (error) return error; - /* - * mm->mmap_sem is required to protect against another thread - * changing the mappings in case we sleep. - */ - verify_mm_writelocked(mm); - - /* - * Clear old maps. this also does some error checking for us - */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* Check against address space limits *after* clearing old maps... */ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) @@ -3032,12 +3277,12 @@ if (!len) return 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); @@ -3065,12 +3310,12 @@ /* * Manually reap the mm to free as much memory as possible. * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_sem for + * this mm from further consideration. Taking mm->mmap_lock for * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_sem is + * reaper will not run on this mm again after mmap_lock is * dropped. * - * Nothing can be holding mm->mmap_sem here and the above call + * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. 
* @@ -3081,10 +3326,9 @@ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); } + mmap_write_lock(mm); if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -3097,8 +3341,11 @@ arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3109,16 +3356,15 @@ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. */ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mm->mmap = NULL; + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3148,7 +3394,7 @@ * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap_pgoff and in do_brk. + * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); @@ -3185,9 +3431,21 @@ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); + + /* There are 3 cases to manage here: + * AAAA AAAA AAAA AAAA + * PPPP.... PPPP......NNNN PPPP....NNNN PP........NN + * PPPPPPPP(A) PPPP..NNNNNNNN(B) PPPPPPPPPPPP(1) NULL + * PPPPPPPPNNNN(2) + * PPPPNNNNNNNN(3) + * + * new_vma == prev in case A,1,2 + * new_vma == next in case B,3 + */ + new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + vma_get_anon_name(vma), true); if (new_vma) { /* * Source vma may have been merged into new_vma @@ -3225,6 +3483,15 @@ get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); + /* + * As the VMA is linked right now, it may be hit by the + * speculative page fault handler. But we don't want it + * to start mapping pages in this area until the caller has + * potentially moved the pte from the moved VMA. To prevent + * that, we protect it right now, and let the caller unprotect + * it once the move is done. + */ + vm_write_begin(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; } @@ -3311,6 +3578,8 @@ .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, + /* vDSO code relies on VVAR not being accessible remotely */ + .access = NULL, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { @@ -3394,7 +3663,7 @@ } /* - * Called with mm->mmap_sem held for writing. + * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. @@ -3431,7 +3700,7 @@ * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex.
*/ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares @@ -3461,7 +3730,7 @@ */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } @@ -3470,11 +3739,11 @@ * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * - * The caller must take the mmap_sem in write mode before calling + * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the - * mmap_sem until mm_drop_all_locks() returns. + * mmap_lock until mm_drop_all_locks() returns. * - * mmap_sem in write mode is required in order to block all operations + * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. @@ -3507,7 +3776,7 @@ struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); mutex_lock(&mm_all_locks_mutex); @@ -3579,7 +3848,7 @@ } /* - * The mmap_sem cannot be released by the caller until + * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) @@ -3587,7 +3856,7 @@ struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { -- Gitblit v1.6.2
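
For context on how the CONFIG_SPECULATIVE_PAGE_FAULT helpers added above are meant to be used: get_vma() looks the VMA up under mm_rb_lock and takes a reference on vm_ref_count so a fault can be attempted without holding mmap_lock, and put_vma() drops that reference (freeing the VMA if it was the last one). The sketch below is illustrative only and is not part of this patch: the function name spf_try_fault() and its body are hypothetical, it assumes CONFIG_SPECULATIVE_PAGE_FAULT is enabled and that get_vma()/put_vma() are declared in a shared header as in the full speculative page fault series, and the real consumer of these helpers is the architecture fault handler rather than code like this.

/* Illustrative sketch, not part of this patch: pairing get_vma()/put_vma(). */
#include <linux/mm.h>
#include <linux/mm_types.h>

static vm_fault_t spf_try_fault(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	vm_fault_t ret = VM_FAULT_RETRY;	/* tell the caller to retry under mmap_lock */

	vma = get_vma(mm, addr);	/* refcounted lookup, no mmap_lock held */
	if (!vma)
		return VM_FAULT_RETRY;

	/*
	 * The reference only keeps the VMA from being freed; its fields can
	 * still change, so a real handler re-validates them against the VMA
	 * sequence count before installing a PTE.
	 */
	if (addr >= READ_ONCE(vma->vm_start) && addr < READ_ONCE(vma->vm_end))
		ret = 0;	/* a real handler would build the PTE here */

	put_vma(vma);	/* drop the reference; may free the VMA */
	return ret;
}

If the speculative attempt returns VM_FAULT_RETRY, the caller simply falls back to the conventional fault path that takes mmap_lock for read.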