From 08f87f769b595151be1afeff53e144f543faa614 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 06 Dec 2023 09:51:13 +0000
Subject: [PATCH] mm/mmap: update mmap locking, speculative page fault support and vendor hooks
---
kernel/mm/mmap.c | 837 +++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 553 insertions(+), 284 deletions(-)
diff --git a/kernel/mm/mmap.c b/kernel/mm/mmap.c
index 4bf28d1..8ec92cc 100644
--- a/kernel/mm/mmap.c
+++ b/kernel/mm/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/mmap.c
*
@@ -52,6 +53,10 @@
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
#include "internal.h"
#ifndef arch_mmap_check
@@ -128,7 +133,7 @@
vm_flags &= ~VM_SHARED;
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
}
- /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}
@@ -139,7 +144,7 @@
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file_inode(file)->i_writecount);
+ allow_write_access(file);
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
@@ -164,6 +169,27 @@
}
}
+static void __free_vma(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+ vm_area_free(vma);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+void put_vma(struct vm_area_struct *vma)
+{
+ if (atomic_dec_and_test(&vma->vm_ref_count))
+ __free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+ __free_vma(vma);
+}
+#endif
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -174,10 +200,7 @@
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
- fput(vma->vm_file);
- mpol_put(vma_policy(vma));
- vm_area_free(vma);
+ put_vma(vma);
return next;
}
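The hunk above routes all VMA freeing through put_vma(): with CONFIG_SPECULATIVE_PAGE_FAULT the VMA carries a reference count (vm_ref_count) and __free_vma() only runs when the last reference is dropped; without it, put_vma() frees immediately as before. A minimal user-space sketch of that pattern, assuming nothing beyond C11 atomics (the toy_* names are invented for illustration and are not kernel APIs):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_vma {
        atomic_int ref_count;           /* plays the role of vm_ref_count */
};

static void toy_free_vma(struct toy_vma *vma)
{
        free(vma);
}

static void toy_put_vma(struct toy_vma *vma)
{
        /* like atomic_dec_and_test(): free only when the count reaches zero */
        if (atomic_fetch_sub(&vma->ref_count, 1) == 1)
                toy_free_vma(vma);
}

int main(void)
{
        struct toy_vma *vma = calloc(1, sizeof(*vma));

        atomic_store(&vma->ref_count, 2);  /* e.g. rbtree ref + speculative walker */
        toy_put_vma(vma);                  /* still referenced, not freed */
        toy_put_vma(vma);                  /* last reference dropped, freed here */
        puts("done");
        return 0;
}

The indirection lets a reader that found the VMA without holding mmap_lock keep it alive briefly until it is done with it.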
@@ -186,15 +209,18 @@
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
- unsigned long newbrk, oldbrk;
+ unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
+ bool downgraded = false;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
+
+ origbrk = mm->brk;
#ifdef CONFIG_COMPAT_BRK
/*
@@ -224,14 +250,32 @@
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
- if (oldbrk == newbrk)
- goto set_brk;
+ if (oldbrk == newbrk) {
+ mm->brk = brk;
+ goto success;
+ }
- /* Always allow shrinking brk. */
+ /*
+ * Always allow shrinking brk.
+ * __do_munmap() may downgrade mmap_lock to read.
+ */
if (brk <= mm->brk) {
- if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
- goto set_brk;
- goto out;
+ int ret;
+
+ /*
+ * mm->brk must be protected by write mmap_lock so update it
+ * before downgrading mmap_lock. When __do_munmap() fails,
+ * mm->brk will be restored from origbrk.
+ */
+ mm->brk = brk;
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
+ if (ret < 0) {
+ mm->brk = origbrk;
+ goto out;
+ } else if (ret == 1) {
+ downgraded = true;
+ }
+ goto success;
}
/* Check against existing mmap mappings. */
@@ -242,25 +286,28 @@
/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
-
-set_brk:
mm->brk = brk;
+
+success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- up_write(&mm->mmap_sem);
+ if (downgraded)
+ mmap_read_unlock(mm);
+ else
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
- retval = mm->brk;
- up_write(&mm->mmap_sem);
+ retval = origbrk;
+ mmap_write_unlock(mm);
return retval;
}
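Because __do_munmap() may downgrade mmap_lock from write to read, the shrinking-brk path above commits mm->brk while the lock is still held exclusively and restores origbrk only if the unmap fails; after the downgrade it would be too late to update it safely. A compact user-space sketch of that save/update/rollback shape, with shrink_brk() and toy_shrink() as illustrative stand-ins rather than kernel functions:

#include <stdio.h>

/* Illustrative only: commit the new value up front, roll back on failure. */
static int toy_shrink(unsigned long from, unsigned long to)
{
        (void)from;
        (void)to;
        return 0;                       /* pretend the unmap succeeded */
}

static int shrink_brk(unsigned long *brk, unsigned long new_brk)
{
        unsigned long orig = *brk;      /* like origbrk */
        int ret;

        *brk = new_brk;                 /* update before a possible downgrade */
        ret = toy_shrink(new_brk, orig);
        if (ret < 0)
                *brk = orig;            /* failure: restore the original value */
        return ret;
}

int main(void)
{
        unsigned long brk = 0x9000;

        shrink_brk(&brk, 0x8000);
        printf("brk=%#lx\n", brk);      /* 0x8000 */
        return 0;
}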
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
- unsigned long max, prev_end, subtree_gap;
+ unsigned long gap, prev_end;
/*
* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -268,14 +315,21 @@
* an unmapped area; whereas when expanding we only require one.
* That's a little inconsistent, but keeps the code here simpler.
*/
- max = vm_start_gap(vma);
+ gap = vm_start_gap(vma);
if (vma->vm_prev) {
prev_end = vm_end_gap(vma->vm_prev);
- if (max > prev_end)
- max -= prev_end;
+ if (gap > prev_end)
+ gap -= prev_end;
else
- max = 0;
+ gap = 0;
}
+ return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -291,7 +345,6 @@
return max;
}
-#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
@@ -397,8 +450,16 @@
#define validate_mm(mm) do { } while (0)
#endif
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+ struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_gap)
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
+#else
+#define mm_rb_write_lock(mm) do { } while (0)
+#define mm_rb_write_unlock(mm) do { } while (0)
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
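RB_DECLARE_CALLBACKS_MAX() generates the augmented-rbtree propagation callbacks directly from vma_compute_gap(): each node caches the largest gap found anywhere in its subtree, so the unmapped-area search can skip subtrees whose cached maximum is already too small. A simplified model of how that cached value is derived, using toy types (toy_vma, toy_subtree_gap) that are not the kernel's:

#include <stdio.h>

struct toy_vma {
        unsigned long gap;              /* own gap, as from vma_compute_gap() */
        struct toy_vma *rb_left, *rb_right;
        unsigned long rb_subtree_gap;   /* cached max over this subtree */
};

static unsigned long toy_subtree_gap(struct toy_vma *v)
{
        unsigned long max = v->gap;

        if (v->rb_left && v->rb_left->rb_subtree_gap > max)
                max = v->rb_left->rb_subtree_gap;
        if (v->rb_right && v->rb_right->rb_subtree_gap > max)
                max = v->rb_right->rb_subtree_gap;
        return max;
}

int main(void)
{
        struct toy_vma left  = { .gap = 0x2000, .rb_subtree_gap = 0x2000 };
        struct toy_vma right = { .gap = 0x0,    .rb_subtree_gap = 0x0 };
        struct toy_vma root  = { .gap = 0x1000, .rb_left = &left, .rb_right = &right };

        root.rb_subtree_gap = toy_subtree_gap(&root);
        printf("subtree max gap: %#lx\n", root.rb_subtree_gap);  /* 0x2000 */
        return 0;
}

vma_gap_update() then simply invokes the generated propagate callback to refresh these cached values up to the root.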
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -408,55 +469,64 @@
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
- * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exacltly what we want.
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+ * a callback function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
+ struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
+
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
/*
* Note rb_erase_augmented is a fairly large inline function,
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
+ mm_rb_write_lock(mm);
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+ mm_rb_write_unlock(mm); /* wmb */
+
+ /*
+ * Ensure the removal is complete before clearing the node.
+ * Matched by vma_has_changed()/handle_speculative_fault().
+ */
+ RB_CLEAR_NODE(&vma->vm_rb);
}
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
+ struct mm_struct *mm,
struct vm_area_struct *ignore)
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the "next" vma being erased if
- * next->vm_start was reduced.
+ * with the possible exception of
+ *
+ * a. the "next" vma being erased if next->vm_start was reduced in
+ * __vma_adjust() -> __vma_unlink()
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
+ * vma_rb_erase()
*/
- validate_mm_rb(root, ignore);
+ validate_mm_rb(&mm->mm_rb, ignore);
- __vma_rb_erase(vma, root);
+ __vma_rb_erase(vma, mm);
}
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
+ struct mm_struct *mm)
{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the vma being erased.
- */
- validate_mm_rb(root, vma);
-
- __vma_rb_erase(vma, root);
+ vma_rb_erase_ignore(vma, mm, vma);
}
/*
@@ -470,7 +540,7 @@
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
- * The entire update must be protected by exclusive mmap_sem and by
+ * The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
static inline void
@@ -525,6 +595,50 @@
return 0;
}
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ if (!vma)
+ return mm->mmap;
+
+ return vma->vm_next;
+}
+
+/*
+ * munmap_vma_range() - munmap VMAs that overlap a range.
+ * @mm: The mm struct
+ * @start: The start of the range.
+ * @len: The length of the range.
+ * @pprev: pointer to the pointer that will be set to previous vm_area_struct
+ * @rb_link: the rb_node
+ * @rb_parent: the parent rb_node
+ *
+ * Find all the vm_area_struct that overlap from @start to
+ * @start + @len and munmap them. Set @pprev to the previous vm_area_struct.
+ *
+ * Returns: -ENOMEM on munmap failure or 0 on success.
+ */
+static inline int
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
+ struct vm_area_struct **pprev, struct rb_node ***link,
+ struct rb_node **parent, struct list_head *uf)
+{
+
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ if (do_munmap(mm, start, len, uf))
+ return -ENOMEM;
+
+ return 0;
+}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
@@ -571,10 +685,12 @@
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
+ mm_rb_write_lock(mm);
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
+ vma_rb_insert(vma, mm);
+ mm_rb_write_unlock(mm);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -586,9 +702,9 @@
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ put_write_access(file_inode(file));
if (vma->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
@@ -601,7 +717,7 @@
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
- __vma_link_list(mm, vma, prev, rb_parent);
+ __vma_link_list(mm, vma, prev);
__vma_link_rb(mm, vma, rb_link, rb_parent);
}
@@ -642,37 +758,14 @@
mm->map_count++;
}
-static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+static __always_inline void __vma_unlink(struct mm_struct *mm,
struct vm_area_struct *vma,
- struct vm_area_struct *prev,
- bool has_prev,
struct vm_area_struct *ignore)
{
- struct vm_area_struct *next;
-
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- next = vma->vm_next;
- if (has_prev)
- prev->vm_next = next;
- else {
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- }
- if (next)
- next->vm_prev = prev;
-
+ vma_rb_erase_ignore(vma, mm, ignore);
+ __vma_unlink_list(mm, vma);
/* Kill the cache */
vmacache_invalidate(mm);
-}
-
-static inline void __vma_unlink_prev(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- __vma_unlink_common(mm, vma, prev, true, vma);
}
/*
@@ -684,7 +777,7 @@
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand)
+ struct vm_area_struct *expand, bool keep_locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -695,6 +788,10 @@
bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
+
+ vm_write_begin(vma);
+ if (next)
+ vm_write_begin(next);
if (next && !insert) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
@@ -729,8 +826,6 @@
remove_next = 1 + (end > next->vm_end);
VM_WARN_ON(remove_next == 2 &&
end != next->vm_next->vm_end);
- VM_WARN_ON(remove_next == 1 &&
- end != next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
@@ -750,7 +845,7 @@
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
- adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+ adjust_next = (end - next->vm_start);
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
@@ -760,7 +855,7 @@
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -(vma->vm_end - end);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
@@ -776,8 +871,12 @@
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
+ if (next && next != vma)
+ vm_write_end(next);
+ vm_write_end(vma);
return error;
+ }
}
}
again:
@@ -815,7 +914,7 @@
anon_vma_interval_tree_pre_update_vma(next);
}
- if (root) {
+ if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
@@ -823,20 +922,22 @@
}
if (start != vma->vm_start) {
- vma->vm_start = start;
+ WRITE_ONCE(vma->vm_start, start);
start_changed = true;
}
if (end != vma->vm_end) {
- vma->vm_end = end;
+ WRITE_ONCE(vma->vm_end, end);
end_changed = true;
}
- vma->vm_pgoff = pgoff;
+ WRITE_ONCE(vma->vm_pgoff, pgoff);
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ WRITE_ONCE(next->vm_start,
+ next->vm_start + adjust_next);
+ WRITE_ONCE(next->vm_pgoff,
+ next->vm_pgoff + (adjust_next >> PAGE_SHIFT));
}
- if (root) {
+ if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
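Note that adjust_next is now carried in bytes rather than pages: vm_start is moved by the raw byte amount while vm_pgoff still advances in page units, hence the new ">> PAGE_SHIFT" above. A tiny worked example of that conversion, assuming a 4 KiB page size (TOY_PAGE_SHIFT) purely for illustration:

#include <assert.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12       /* assumption: 4 KiB pages for the example */

int main(void)
{
        unsigned long vm_start = 0x7f0000003000UL;
        unsigned long vm_pgoff = 3;     /* file offset, in pages */
        long adjust_next = 0x5000;      /* boundary moves up by 5 pages, in bytes */

        vm_start += adjust_next;
        vm_pgoff += adjust_next >> TOY_PAGE_SHIFT;  /* same shift as in __vma_adjust() */

        assert(vm_start == 0x7f0000008000UL);
        assert(vm_pgoff == 8);
        printf("vm_start=%#lx vm_pgoff=%lu\n", vm_start, vm_pgoff);
        return 0;
}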
@@ -849,7 +950,7 @@
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_prev(mm, next, vma);
+ __vma_unlink(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -860,7 +961,7 @@
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, NULL, false, vma);
+ __vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -887,10 +988,9 @@
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
- if (mapping)
- i_mmap_unlock_write(mapping);
- if (root) {
+ if (file) {
+ i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
if (adjust_next)
@@ -898,15 +998,13 @@
}
if (remove_next) {
- if (file) {
+ if (file)
uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
- }
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
- vm_area_free(next);
+ vm_write_end(next);
+ put_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -920,6 +1018,8 @@
* "vma->vm_next" gap must be updated.
*/
next = vma->vm_next;
+ if (next)
+ vm_write_begin(next);
} else {
/*
* For the scope of the comment "next" and
@@ -966,6 +1066,11 @@
if (insert && file)
uprobe_mmap(insert);
+ if (next && next != vma)
+ vm_write_end(next);
+ if (!keep_locked)
+ vm_write_end(vma);
+
validate_mm(mm);
return 0;
@@ -984,7 +1089,7 @@
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
* merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressue on the memory system forcing
+ * comparison, we increase pressure on the memory system forcing
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
@@ -1023,7 +1128,7 @@
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
@@ -1081,17 +1186,20 @@
* the area passed down from mprotect_fixup, never extending beyond one
* vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
*
- * AAAA AAAA AAAA AAAA
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
- * cannot merge might become might become might become
- * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
- * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
- * mremap move: PPPPXXXXXXXX 8
- * AAAA
- * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
- * might become case 1 below case 2 below case 3 below
+ * AAAA AAAA AAAA
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
+ * cannot merge might become might become
+ * PPNNNNNNNNNN PPPPPPPPPPNN
+ * mmap, brk or case 4 below case 5 below
+ * mremap move:
+ * AAAA AAAA
+ * PPPP NNNN PPPPNNNNXXXX
+ * might become might become
+ * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
+ * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
+ * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
*
- * It is important for case 8 that the the vma NNNN overlapping the
+ * It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_adjust drops the
@@ -1105,13 +1213,13 @@
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char __user *anon_name)
+ const char __user *anon_name, bool keep_locked)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1124,10 +1232,7 @@
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
@@ -1161,10 +1266,11 @@
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
- prev);
+ prev, keep_locked);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
+ end, prev->vm_pgoff, NULL, prev,
+ keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1182,10 +1288,12 @@
anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
+ addr, prev->vm_pgoff, NULL, next,
+ keep_locked);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
+ next->vm_pgoff - pglen, NULL, next,
+ keep_locked);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
@@ -1203,7 +1311,7 @@
}
/*
- * Rough compatbility check to quickly see if it's even worth looking
+ * Rough compatibility check to quickly see if it's even worth looking
* at sharing an anon_vma.
*
* They need to have the same vm_file, and the flags can only differ
@@ -1220,7 +1328,7 @@
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -1267,26 +1375,22 @@
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma;
- struct vm_area_struct *near;
+ struct anon_vma *anon_vma = NULL;
- near = vma->vm_next;
- if (!near)
- goto try_prev;
+ /* Try next first. */
+ if (vma->vm_next) {
+ anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ if (anon_vma)
+ return anon_vma;
+ }
- anon_vma = reusable_anon_vma(near, vma, near);
- if (anon_vma)
- return anon_vma;
-try_prev:
- near = vma->vm_prev;
- if (!near)
- goto none;
+ /* Try prev next. */
+ if (vma->vm_prev)
+ anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
- anon_vma = reusable_anon_vma(near, near, vma);
- if (anon_vma)
- return anon_vma;
-none:
/*
+ * We might reach here with anon_vma == NULL if we can't find
+ * any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
@@ -1294,7 +1398,7 @@
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
- return NULL;
+ return anon_vma;
}
/*
@@ -1336,6 +1440,9 @@
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
@@ -1358,15 +1465,15 @@
}
/*
- * The caller must hold down_write(¤t->mm->mmap_sem).
+ * The caller must write-lock current->mm->mmap_lock.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, vm_flags_t vm_flags,
- unsigned long pgoff, unsigned long *populate,
- struct list_head *uf)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
@@ -1408,7 +1515,7 @@
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (offset_in_page(addr))
+ if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
@@ -1428,7 +1535,7 @@
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -1457,7 +1564,7 @@
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
- /* fall through */
+ fallthrough;
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
@@ -1484,8 +1591,7 @@
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
- /* fall through */
+ fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
@@ -1560,11 +1666,12 @@
file = fget(fd);
if (!file)
return -EBADF;
- if (is_file_hugepages(file))
+ if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
- retval = -EINVAL;
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ } else if (unlikely(flags & MAP_HUGETLB)) {
+ retval = -EINVAL;
goto out_fput;
+ }
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
@@ -1629,7 +1736,7 @@
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
/*
- * Some shared mappigns will want the pages marked read-only
+ * Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
* to the private version (using protection_map[] without the
* VM_SHARED bit).
@@ -1653,8 +1760,12 @@
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) &&
+ !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
@@ -1663,7 +1774,7 @@
/* Can the mapping track the dirty pages? */
return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_cap_account_dirty(vma->vm_file->f_mapping);
+ mapping_can_writeback(vma->vm_file->f_mapping);
}
/*
@@ -1687,7 +1798,7 @@
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
+ struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
@@ -1707,13 +1818,9 @@
return -ENOMEM;
}
- /* Clear old maps */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
@@ -1781,6 +1888,28 @@
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
+
+ /* If vm_flags changed after call_mmap(), we should try to merge the
+ * vma again, as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX,
+ vma_get_anon_name(vma));
+ if (merge) {
+ /* ->mmap() can change vma->vm_file and fput the original file. So
+ * fput the vma->vm_file here, or we would add an extra fput for the
+ * file and ultimately cause a general protection fault.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
@@ -1790,9 +1919,19 @@
vma_set_anonymous(vma);
}
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto close_and_free_vma;
+ else
+ goto free_vma;
+ }
+
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
+unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
@@ -1802,12 +1941,14 @@
out:
perf_event_mmap(vma);
+ vm_write_begin(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ WRITE_ONCE(vma->vm_flags,
+ vma->vm_flags & VM_LOCKED_CLEAR_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
@@ -1822,19 +1963,24 @@
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
- vma->vm_flags |= VM_SOFTDIRTY;
+ WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
vma_set_page_prot(vma);
+ vm_write_end(vma);
+
+ trace_android_vh_mmap_region(vma, addr);
return addr;
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
@@ -1848,7 +1994,7 @@
return error;
}
-unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
* We implement the search by looking for an rbtree node that
@@ -1951,16 +2097,21 @@
return gap_start;
}
-unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long addr = 0;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
+
+ trace_android_vh_get_from_fragment_pool(mm, info, &addr);
+ if (addr)
+ return addr;
/*
* Adjust search limits by the desired length.
@@ -2049,7 +2200,29 @@
VM_BUG_ON(gap_end < gap_start);
return gap_end;
}
-EXPORT_SYMBOL_GPL(unmapped_area_topdown);
+
+/*
+ * Search for an unmapped address range.
+ *
+ * We are looking for a range that:
+ * - does not intersect with any VMA;
+ * - is contained within the [low_limit, high_limit) interval;
+ * - is at least the desired size.
+ * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
+ */
+unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long addr;
+
+ if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
+ addr = unmapped_area_topdown(info);
+ else
+ addr = unmapped_area(info);
+
+ trace_vm_unmapped_area(addr, info);
+ return addr;
+}
+EXPORT_SYMBOL_GPL(vm_unmapped_area);
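The last bullet above is the alignment/colouring constraint: a candidate address must have the same low bits as align_offset under align_mask. A small sketch of rounding a candidate up to satisfy it, in the spirit of the adjustment unmapped_area() applies to a found gap (align_up_to_offset() is an invented helper name):

#include <assert.h>
#include <stdio.h>

/*
 * Round addr up to the next address satisfying
 * (addr & align_mask) == (align_offset & align_mask).
 */
static unsigned long align_up_to_offset(unsigned long addr,
                                        unsigned long align_mask,
                                        unsigned long align_offset)
{
        return addr + ((align_offset - addr) & align_mask);
}

int main(void)
{
        /* e.g. a 2 MiB alignment request with a 4 KiB colour offset */
        unsigned long mask = (1UL << 21) - 1;
        unsigned long offset = 0x1000;
        unsigned long addr = align_up_to_offset(0x7f1234567890UL, mask, offset);

        assert((addr & mask) == (offset & mask));
        printf("candidate: %#lx\n", addr);
        return 0;
}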
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
@@ -2070,8 +2243,9 @@
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2080,7 +2254,7 @@
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2089,7 +2263,7 @@
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
info.align_mask = 0;
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -2102,17 +2276,17 @@
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2122,7 +2296,7 @@
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2131,9 +2305,10 @@
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
info.align_offset = 0;
+ trace_android_vh_exclude_reserved_zone(mm, &info);
addr = vm_unmapped_area(&info);
/*
@@ -2146,9 +2321,11 @@
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
addr = vm_unmapped_area(&info);
}
+
+ trace_android_vh_include_reserved_zone(mm, &info, &addr);
return addr;
}
@@ -2177,7 +2354,7 @@
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap_pgoff() will clear pgoff, so match alignment.
+ * do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
@@ -2199,15 +2376,11 @@
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *__find_vma(struct mm_struct *mm,
+ unsigned long addr)
{
struct rb_node *rb_node;
- struct vm_area_struct *vma;
-
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
+ struct vm_area_struct *vma = NULL;
rb_node = mm->mm_rb.rb_node;
@@ -2225,12 +2398,53 @@
rb_node = rb_node->rb_right;
}
+ return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ vma = __find_vma(mm, addr);
if (vma)
vmacache_update(addr, vma);
return vma;
}
-
EXPORT_SYMBOL(find_vma);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL;
+
+ read_lock(&mm->mm_rb_lock);
+ vma = __find_vma(mm, addr);
+
+ /*
+ * If there is a concurrent fast mremap, bail out since the entire
+ * PMD/PUD subtree may have been remapped.
+ *
+ * This is usually safe for conventional mremap since it takes the
+ * PTE locks as does SPF. However fast mremap only takes the lock
+ * at the PMD/PUD level which is ok as it is done with the mmap
+ * write lock held. But SPF, as the term implies, forgoes taking
+ * the mmap read lock and also cannot take the PTL at the larger
+ * PMD/PUD granularity, since that would introduce huge contention
+ * in the page fault path; so fall back to regular fault handling.
+ */
+ if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count))
+ vma = NULL;
+ read_unlock(&mm->mm_rb_lock);
+
+ return vma;
+}
+#endif
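get_vma() only hands out a VMA after atomic_inc_unless_negative() succeeds, i.e. while the reference count is still non-negative (presumably the convention for a VMA that remains safe to use speculatively). A user-space model of those increment-unless-negative semantics using C11 atomics (inc_unless_negative() is an illustrative re-implementation, not the kernel primitive):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_unless_negative(atomic_int *v)
{
        int cur = atomic_load(v);

        do {
                if (cur < 0)
                        return false;   /* already negative: do not take a ref */
        } while (!atomic_compare_exchange_weak(v, &cur, cur + 1));
        return true;
}

int main(void)
{
        atomic_int live = 1, dying = -1;

        printf("live:  %s\n", inc_unless_negative(&live)  ? "got ref" : "refused");
        printf("dying: %s\n", inc_unless_negative(&dying) ? "got ref" : "refused");
        return 0;
}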
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
@@ -2245,12 +2459,9 @@
if (vma) {
*pprev = vma->vm_prev;
} else {
- struct rb_node *rb_node = mm->mm_rb.rb_node;
- *pprev = NULL;
- while (rb_node) {
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
- rb_node = rb_node->rb_right;
- }
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
@@ -2330,8 +2541,7 @@
gap_addr = TASK_SIZE;
next = vma->vm_next;
- if (next && next->vm_start < gap_addr &&
- (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
@@ -2343,7 +2553,7 @@
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2361,7 +2571,7 @@
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2412,7 +2622,7 @@
prev = vma->vm_prev;
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
- (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+ vma_is_accessible(prev)) {
if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
}
@@ -2423,7 +2633,7 @@
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2441,7 +2651,7 @@
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2455,8 +2665,8 @@
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
+ WRITE_ONCE(vma->vm_start, address);
+ WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
@@ -2483,7 +2693,7 @@
if (!*endptr)
stack_guard_gap = val << PAGE_SHIFT;
- return 0;
+ return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
@@ -2503,7 +2713,7 @@
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2528,9 +2738,6 @@
if (vma->vm_start <= addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
- return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
@@ -2576,13 +2783,30 @@
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
+ struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
+ struct vm_area_struct *cur_vma;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
+ if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
+ tlb_flush_mmu(&tlb);
+ break;
+ }
+ }
+
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
@@ -2592,7 +2816,7 @@
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
-static void
+static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
@@ -2602,7 +2826,7 @@
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
- vma_rb_erase(vma, &mm->mm_rb);
+ vma_rb_erase(vma, mm);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@@ -2617,6 +2841,17 @@
/* Kill the cache */
vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
}
/*
@@ -2701,8 +2936,8 @@
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf)
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf, bool downgrade)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
@@ -2711,8 +2946,16 @@
return -EINVAL;
len = PAGE_ALIGN(len);
+ end = start + len;
if (len == 0)
return -EINVAL;
+
+ /*
+ * arch_unmap() might do unmaps itself. It must be called
+ * and finish any rbtree manipulation before this code
+ * runs and also starts to manipulate the rbtree.
+ */
+ arch_unmap(mm, start, end);
/* Find the first overlapping VMA */
vma = find_vma(mm, start);
@@ -2722,7 +2965,6 @@
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
- end = start + len;
if (vma->vm_start >= end)
return 0;
@@ -2757,7 +2999,7 @@
if (error)
return error;
}
- vma = prev ? prev->vm_next : mm->mmap;
+ vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
@@ -2784,37 +3026,60 @@
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
+
tmp = tmp->vm_next;
}
}
- /*
- * Remove the vma's, and unmap the actual pages
- */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
+ /* Detach vmas from rbtree */
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
- arch_unmap(mm, vma, start, end);
+ if (downgrade)
+ mmap_write_downgrade(mm);
+
+ unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
remove_vma_list(mm, vma);
- return 0;
+ return downgrade ? 1 : 0;
}
-int vm_munmap(unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
+{
+ return __do_munmap(mm, start, len, uf, false);
+}
+
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_munmap(mm, start, len, &uf);
- up_write(&mm->mmap_sem);
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
+ /*
+ * Returning 1 indicates mmap_lock is downgraded.
+ * But 1 is not a legal return value of vm_munmap() and munmap(), so
+ * reset it to 0 before returning.
+ */
+ if (ret == 1) {
+ mmap_read_unlock(mm);
+ ret = 0;
+ } else
+ mmap_write_unlock(mm);
+
userfaultfd_unmap_complete(mm, &uf);
return ret;
+}
+
+int vm_munmap(unsigned long start, size_t len)
+{
+ return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);
@@ -2822,7 +3087,7 @@
{
addr = untagged_addr(addr);
profile_munmap(addr);
- return vm_munmap(addr, len);
+ return __vm_munmap(addr, len, true);
}
@@ -2854,7 +3119,7 @@
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
vma = find_vma(mm, start);
@@ -2913,26 +3178,16 @@
}
file = get_file(vma->vm_file);
- ret = do_mmap_pgoff(vma->vm_file, start, size,
+ ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
ret = 0;
return ret;
-}
-
-static inline void verify_mm_writelocked(struct mm_struct *mm)
-{
-#ifdef CONFIG_DEBUG_VM
- if (unlikely(down_read_trylock(&mm->mmap_sem))) {
- WARN_ON(1);
- up_read(&mm->mmap_sem);
- }
-#endif
}
/*
@@ -2947,34 +3202,24 @@
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
+ unsigned long mapped_addr;
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (offset_in_page(error))
- return error;
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
error = mlock_future_check(mm, mm->def_flags, len);
if (error)
return error;
- /*
- * mm->mmap_sem is required to protect against another thread
- * changing the mappings in case we sleep.
- */
- verify_mm_writelocked(mm);
-
- /*
- * Clear old maps. this also does some error checking for us
- */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
@@ -3032,12 +3277,12 @@
if (!len)
return 0;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_brk_flags(addr, len, flags, &uf);
populate = ((mm->def_flags & VM_LOCKED) != 0);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
@@ -3065,12 +3310,12 @@
/*
* Manually reap the mm to free as much memory as possible.
* Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_sem for
+ * this mm from further consideration. Taking mm->mmap_lock for
* write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_sem is
+ * reaper will not run on this mm again after mmap_lock is
* dropped.
*
- * Nothing can be holding mm->mmap_sem here and the above call
+ * Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
@@ -3081,10 +3326,9 @@
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
}
+ mmap_write_lock(mm);
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -3097,8 +3341,11 @@
arch_exit_mmap(mm);
vma = mm->mmap;
- if (!vma) /* Can happen if dup_mmap() received an OOM */
+ if (!vma) {
+ /* Can happen if dup_mmap() received an OOM */
+ mmap_write_unlock(mm);
return;
+ }
lru_add_drain();
flush_cache_mm(mm);
@@ -3109,16 +3356,15 @@
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
- /*
- * Walk the list again, actually closing and freeing it,
- * with preemption enabled, without holding any MM locks.
- */
+ /* Walk the list again, actually closing and freeing it. */
while (vma) {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
cond_resched();
}
+ mm->mmap = NULL;
+ mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3148,7 +3394,7 @@
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap_pgoff and in do_brk.
+ * Similarly in do_mmap and in do_brk_flags.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
@@ -3185,9 +3431,21 @@
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
+
+ /* There are 3 cases to manage here:
+ * AAAA AAAA AAAA AAAA
+ * PPPP.... PPPP......NNNN PPPP....NNNN PP........NN
+ * PPPPPPPP(A) PPPP..NNNNNNNN(B) PPPPPPPPPPPP(1) NULL
+ * PPPPPPPPNNNN(2)
+ * PPPPNNNNNNNN(3)
+ *
+ * new_vma == prev in case A,1,2
+ * new_vma == next in case B,3
+ */
+ new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
+ vma_get_anon_name(vma), true);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3225,6 +3483,15 @@
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+ /*
+ * As the VMA is linked right now, it may be hit by the
+ * speculative page fault handler. But we don't want it to
+ * start mapping pages in this area until the caller has
+ * potentially moved the ptes from the VMA being moved. To prevent
+ * that we protect it right now, and let the caller unprotect
+ * it once the move is done.
+ */
+ vm_write_begin(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}
@@ -3311,6 +3578,8 @@
.fault = special_mapping_fault,
.mremap = special_mapping_mremap,
.name = special_mapping_name,
+ /* vDSO code relies on VVAR not being accessible remotely */
+ .access = NULL,
};
static const struct vm_operations_struct legacy_special_mapping_vmops = {
@@ -3394,7 +3663,7 @@
}
/*
- * Called with mm->mmap_sem held for writing.
+ * Called with mm->mmap_lock held for writing.
* Insert a new vma covering the given region, with the given flags.
* Its pages are supplied by the given array of struct page *.
* The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
@@ -3431,7 +3700,7 @@
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
/*
* We can safely modify head.next after taking the
* anon_vma->root->rwsem. If some other vma in this mm shares
@@ -3461,7 +3730,7 @@
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
}
}
@@ -3470,11 +3739,11 @@
* operations that could ever happen on a certain mm. This includes
* vmtruncate, try_to_unmap, and all page faults.
*
- * The caller must take the mmap_sem in write mode before calling
+ * The caller must take the mmap_lock in write mode before calling
* mm_take_all_locks(). The caller isn't allowed to release the
- * mmap_sem until mm_drop_all_locks() returns.
+ * mmap_lock until mm_drop_all_locks() returns.
*
- * mmap_sem in write mode is required in order to block all operations
+ * mmap_lock in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
* altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
@@ -3507,7 +3776,7 @@
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
mutex_lock(&mm_all_locks_mutex);
@@ -3579,7 +3848,7 @@
}
/*
- * The mmap_sem cannot be released by the caller until
+ * The mmap_lock cannot be released by the caller until
* mm_drop_all_locks() returns.
*/
void mm_drop_all_locks(struct mm_struct *mm)
@@ -3587,7 +3856,7 @@
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
--
Gitblit v1.6.2