2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/mm/mmap.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * mm/mmap.c
34 *
....@@ -52,6 +53,10 @@
5253 #include <asm/tlb.h>
5354 #include <asm/mmu_context.h>
5455
56
+#define CREATE_TRACE_POINTS
57
+#include <trace/events/mmap.h>
58
+#undef CREATE_TRACE_POINTS
59
+#include <trace/hooks/mm.h>
5560 #include "internal.h"
5661
5762 #ifndef arch_mmap_check
....@@ -128,7 +133,7 @@
128133 vm_flags &= ~VM_SHARED;
129134 vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
130135 }
131
- /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
136
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
132137 WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
133138 }
134139
....@@ -139,7 +144,7 @@
139144 struct file *file, struct address_space *mapping)
140145 {
141146 if (vma->vm_flags & VM_DENYWRITE)
142
- atomic_inc(&file_inode(file)->i_writecount);
147
+ allow_write_access(file);
143148 if (vma->vm_flags & VM_SHARED)
144149 mapping_unmap_writable(mapping);
145150
....@@ -164,6 +169,27 @@
164169 }
165170 }
166171
172
+static void __free_vma(struct vm_area_struct *vma)
173
+{
174
+ if (vma->vm_file)
175
+ fput(vma->vm_file);
176
+ mpol_put(vma_policy(vma));
177
+ vm_area_free(vma);
178
+}
179
+
180
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
181
+void put_vma(struct vm_area_struct *vma)
182
+{
183
+ if (atomic_dec_and_test(&vma->vm_ref_count))
184
+ __free_vma(vma);
185
+}
186
+#else
187
+static inline void put_vma(struct vm_area_struct *vma)
188
+{
189
+ __free_vma(vma);
190
+}
191
+#endif
192
+
167193 /*
168194 * Close a vm structure and free it, returning the next.
169195 */
....@@ -174,10 +200,7 @@
174200 might_sleep();
175201 if (vma->vm_ops && vma->vm_ops->close)
176202 vma->vm_ops->close(vma);
177
- if (vma->vm_file)
178
- fput(vma->vm_file);
179
- mpol_put(vma_policy(vma));
180
- vm_area_free(vma);
203
+ put_vma(vma);
181204 return next;
182205 }
183206
....@@ -186,15 +209,18 @@
186209 SYSCALL_DEFINE1(brk, unsigned long, brk)
187210 {
188211 unsigned long retval;
189
- unsigned long newbrk, oldbrk;
212
+ unsigned long newbrk, oldbrk, origbrk;
190213 struct mm_struct *mm = current->mm;
191214 struct vm_area_struct *next;
192215 unsigned long min_brk;
193216 bool populate;
217
+ bool downgraded = false;
194218 LIST_HEAD(uf);
195219
196
- if (down_write_killable(&mm->mmap_sem))
220
+ if (mmap_write_lock_killable(mm))
197221 return -EINTR;
222
+
223
+ origbrk = mm->brk;
198224
199225 #ifdef CONFIG_COMPAT_BRK
200226 /*
....@@ -224,14 +250,32 @@
224250
225251 newbrk = PAGE_ALIGN(brk);
226252 oldbrk = PAGE_ALIGN(mm->brk);
227
- if (oldbrk == newbrk)
228
- goto set_brk;
253
+ if (oldbrk == newbrk) {
254
+ mm->brk = brk;
255
+ goto success;
256
+ }
229257
230
- /* Always allow shrinking brk. */
258
+ /*
259
+ * Always allow shrinking brk.
260
+ * __do_munmap() may downgrade mmap_lock to read.
261
+ */
231262 if (brk <= mm->brk) {
232
- if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
233
- goto set_brk;
234
- goto out;
263
+ int ret;
264
+
265
+ /*
266
+ * mm->brk must be protected by write mmap_lock so update it
267
+ * before downgrading mmap_lock. When __do_munmap() fails,
268
+ * mm->brk will be restored from origbrk.
269
+ */
270
+ mm->brk = brk;
271
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
272
+ if (ret < 0) {
273
+ mm->brk = origbrk;
274
+ goto out;
275
+ } else if (ret == 1) {
276
+ downgraded = true;
277
+ }
278
+ goto success;
235279 }
236280
237281 /* Check against existing mmap mappings. */
....@@ -242,25 +286,28 @@
242286 /* Ok, looks good - let it rip. */
243287 if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
244288 goto out;
245
-
246
-set_brk:
247289 mm->brk = brk;
290
+
291
+success:
248292 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
249
- up_write(&mm->mmap_sem);
293
+ if (downgraded)
294
+ mmap_read_unlock(mm);
295
+ else
296
+ mmap_write_unlock(mm);
250297 userfaultfd_unmap_complete(mm, &uf);
251298 if (populate)
252299 mm_populate(oldbrk, newbrk - oldbrk);
253300 return brk;
254301
255302 out:
256
- retval = mm->brk;
257
- up_write(&mm->mmap_sem);
303
+ retval = origbrk;
304
+ mmap_write_unlock(mm);
258305 return retval;
259306 }
260307
261
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
308
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
262309 {
263
- unsigned long max, prev_end, subtree_gap;
310
+ unsigned long gap, prev_end;
264311
265312 /*
266313 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
....@@ -268,14 +315,21 @@
268315 * an unmapped area; whereas when expanding we only require one.
269316 * That's a little inconsistent, but keeps the code here simpler.
270317 */
271
- max = vm_start_gap(vma);
318
+ gap = vm_start_gap(vma);
272319 if (vma->vm_prev) {
273320 prev_end = vm_end_gap(vma->vm_prev);
274
- if (max > prev_end)
275
- max -= prev_end;
321
+ if (gap > prev_end)
322
+ gap -= prev_end;
276323 else
277
- max = 0;
324
+ gap = 0;
278325 }
326
+ return gap;
327
+}
328
+
329
+#ifdef CONFIG_DEBUG_VM_RB
330
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
331
+{
332
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
279333 if (vma->vm_rb.rb_left) {
280334 subtree_gap = rb_entry(vma->vm_rb.rb_left,
281335 struct vm_area_struct, vm_rb)->rb_subtree_gap;
....@@ -291,7 +345,6 @@
291345 return max;
292346 }
293347
294
-#ifdef CONFIG_DEBUG_VM_RB
295348 static int browse_rb(struct mm_struct *mm)
296349 {
297350 struct rb_root *root = &mm->mm_rb;
....@@ -397,8 +450,16 @@
397450 #define validate_mm(mm) do { } while (0)
398451 #endif
399452
400
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
401
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
453
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
454
+ struct vm_area_struct, vm_rb,
455
+ unsigned long, rb_subtree_gap, vma_compute_gap)
456
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
457
+#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
458
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
459
+#else
460
+#define mm_rb_write_lock(mm) do { } while (0)
461
+#define mm_rb_write_unlock(mm) do { } while (0)
462
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
402463
403464 /*
404465 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
....@@ -408,55 +469,64 @@
408469 static void vma_gap_update(struct vm_area_struct *vma)
409470 {
410471 /*
411
- * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
412
- * function that does exacltly what we want.
472
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
473
+ * a callback function that does exactly what we want.
413474 */
414475 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
415476 }
416477
417478 static inline void vma_rb_insert(struct vm_area_struct *vma,
418
- struct rb_root *root)
479
+ struct mm_struct *mm)
419480 {
481
+ struct rb_root *root = &mm->mm_rb;
482
+
420483 /* All rb_subtree_gap values must be consistent prior to insertion */
421484 validate_mm_rb(root, NULL);
422485
423486 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
424487 }
425488
426
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
489
+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
427490 {
491
+ struct rb_root *root = &mm->mm_rb;
428492 /*
429493 * Note rb_erase_augmented is a fairly large inline function,
430494 * so make sure we instantiate it only once with our desired
431495 * augmented rbtree callbacks.
432496 */
497
+ mm_rb_write_lock(mm);
433498 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
499
+ mm_rb_write_unlock(mm); /* wmb */
500
+
501
+ /*
502
+ * Ensure the removal is complete before clearing the node.
503
+ * Matched by vma_has_changed()/handle_speculative_fault().
504
+ */
505
+ RB_CLEAR_NODE(&vma->vm_rb);
434506 }
435507
436508 static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
437
- struct rb_root *root,
509
+ struct mm_struct *mm,
438510 struct vm_area_struct *ignore)
439511 {
440512 /*
441513 * All rb_subtree_gap values must be consistent prior to erase,
442
- * with the possible exception of the "next" vma being erased if
443
- * next->vm_start was reduced.
514
+ * with the possible exception of
515
+ *
516
+ * a. the "next" vma being erased if next->vm_start was reduced in
517
+ * __vma_adjust() -> __vma_unlink()
518
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
519
+ * vma_rb_erase()
444520 */
445
- validate_mm_rb(root, ignore);
521
+ validate_mm_rb(&mm->mm_rb, ignore);
446522
447
- __vma_rb_erase(vma, root);
523
+ __vma_rb_erase(vma, mm);
448524 }
449525
450526 static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
451
- struct rb_root *root)
527
+ struct mm_struct *mm)
452528 {
453
- /*
454
- * All rb_subtree_gap values must be consistent prior to erase,
455
- * with the possible exception of the vma being erased.
456
- */
457
- validate_mm_rb(root, vma);
458
-
459
- __vma_rb_erase(vma, root);
529
+ vma_rb_erase_ignore(vma, mm, vma);
460530 }
461531
462532 /*
....@@ -470,7 +540,7 @@
470540 * After the update, the vma will be reinserted using
471541 * anon_vma_interval_tree_post_update_vma().
472542 *
473
- * The entire update must be protected by exclusive mmap_sem and by
543
+ * The entire update must be protected by exclusive mmap_lock and by
474544 * the root anon_vma's mutex.
475545 */
476546 static inline void
....@@ -525,6 +595,50 @@
525595 return 0;
526596 }
527597
598
+/*
599
+ * vma_next() - Get the next VMA.
600
+ * @mm: The mm_struct.
601
+ * @vma: The current vma.
602
+ *
603
+ * If @vma is NULL, return the first vma in the mm.
604
+ *
605
+ * Returns: The next VMA after @vma.
606
+ */
607
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
608
+ struct vm_area_struct *vma)
609
+{
610
+ if (!vma)
611
+ return mm->mmap;
612
+
613
+ return vma->vm_next;
614
+}
615
+
616
+/*
617
+ * munmap_vma_range() - munmap VMAs that overlap a range.
618
+ * @mm: The mm struct
619
+ * @start: The start of the range.
620
+ * @len: The length of the range.
621
+ * @pprev: pointer to the pointer that will be set to previous vm_area_struct
622
+ * @rb_link: the rb_node
623
+ * @rb_parent: the parent rb_node
624
+ *
625
+ * Find all the vm_area_struct that overlap from @start to
626
+ * @end and munmap them. Set @pprev to the previous vm_area_struct.
627
+ *
628
+ * Returns: -ENOMEM on munmap failure or 0 on success.
629
+ */
630
+static inline int
631
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
632
+ struct vm_area_struct **pprev, struct rb_node ***link,
633
+ struct rb_node **parent, struct list_head *uf)
634
+{
635
+
636
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
637
+ if (do_munmap(mm, start, len, uf))
638
+ return -ENOMEM;
639
+
640
+ return 0;
641
+}
528642 static unsigned long count_vma_pages_range(struct mm_struct *mm,
529643 unsigned long addr, unsigned long end)
530644 {
....@@ -571,10 +685,12 @@
571685 * immediately update the gap to the correct value. Finally we
572686 * rebalance the rbtree after all augmented values have been set.
573687 */
688
+ mm_rb_write_lock(mm);
574689 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
575690 vma->rb_subtree_gap = 0;
576691 vma_gap_update(vma);
577
- vma_rb_insert(vma, &mm->mm_rb);
692
+ vma_rb_insert(vma, mm);
693
+ mm_rb_write_unlock(mm);
578694 }
579695
580696 static void __vma_link_file(struct vm_area_struct *vma)
....@@ -586,9 +702,9 @@
586702 struct address_space *mapping = file->f_mapping;
587703
588704 if (vma->vm_flags & VM_DENYWRITE)
589
- atomic_dec(&file_inode(file)->i_writecount);
705
+ put_write_access(file_inode(file));
590706 if (vma->vm_flags & VM_SHARED)
591
- atomic_inc(&mapping->i_mmap_writable);
707
+ mapping_allow_writable(mapping);
592708
593709 flush_dcache_mmap_lock(mapping);
594710 vma_interval_tree_insert(vma, &mapping->i_mmap);
....@@ -601,7 +717,7 @@
601717 struct vm_area_struct *prev, struct rb_node **rb_link,
602718 struct rb_node *rb_parent)
603719 {
604
- __vma_link_list(mm, vma, prev, rb_parent);
720
+ __vma_link_list(mm, vma, prev);
605721 __vma_link_rb(mm, vma, rb_link, rb_parent);
606722 }
607723
....@@ -642,37 +758,14 @@
642758 mm->map_count++;
643759 }
644760
645
-static __always_inline void __vma_unlink_common(struct mm_struct *mm,
761
+static __always_inline void __vma_unlink(struct mm_struct *mm,
646762 struct vm_area_struct *vma,
647
- struct vm_area_struct *prev,
648
- bool has_prev,
649763 struct vm_area_struct *ignore)
650764 {
651
- struct vm_area_struct *next;
652
-
653
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
654
- next = vma->vm_next;
655
- if (has_prev)
656
- prev->vm_next = next;
657
- else {
658
- prev = vma->vm_prev;
659
- if (prev)
660
- prev->vm_next = next;
661
- else
662
- mm->mmap = next;
663
- }
664
- if (next)
665
- next->vm_prev = prev;
666
-
765
+ vma_rb_erase_ignore(vma, mm, ignore);
766
+ __vma_unlink_list(mm, vma);
667767 /* Kill the cache */
668768 vmacache_invalidate(mm);
669
-}
670
-
671
-static inline void __vma_unlink_prev(struct mm_struct *mm,
672
- struct vm_area_struct *vma,
673
- struct vm_area_struct *prev)
674
-{
675
- __vma_unlink_common(mm, vma, prev, true, vma);
676769 }
677770
678771 /*
....@@ -684,7 +777,7 @@
684777 */
685778 int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
686779 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
687
- struct vm_area_struct *expand)
780
+ struct vm_area_struct *expand, bool keep_locked)
688781 {
689782 struct mm_struct *mm = vma->vm_mm;
690783 struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
....@@ -695,6 +788,10 @@
695788 bool start_changed = false, end_changed = false;
696789 long adjust_next = 0;
697790 int remove_next = 0;
791
+
792
+ vm_write_begin(vma);
793
+ if (next)
794
+ vm_write_begin(next);
698795
699796 if (next && !insert) {
700797 struct vm_area_struct *exporter = NULL, *importer = NULL;
....@@ -729,8 +826,6 @@
729826 remove_next = 1 + (end > next->vm_end);
730827 VM_WARN_ON(remove_next == 2 &&
731828 end != next->vm_next->vm_end);
732
- VM_WARN_ON(remove_next == 1 &&
733
- end != next->vm_end);
734829 /* trim end to next, for case 6 first pass */
735830 end = next->vm_end;
736831 }
....@@ -750,7 +845,7 @@
750845 * vma expands, overlapping part of the next:
751846 * mprotect case 5 shifting the boundary up.
752847 */
753
- adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
848
+ adjust_next = (end - next->vm_start);
754849 exporter = next;
755850 importer = vma;
756851 VM_WARN_ON(expand != importer);
....@@ -760,7 +855,7 @@
760855 * split_vma inserting another: so it must be
761856 * mprotect case 4 shifting the boundary down.
762857 */
763
- adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
858
+ adjust_next = -(vma->vm_end - end);
764859 exporter = vma;
765860 importer = next;
766861 VM_WARN_ON(expand != importer);
....@@ -776,8 +871,12 @@
776871
777872 importer->anon_vma = exporter->anon_vma;
778873 error = anon_vma_clone(importer, exporter);
779
- if (error)
874
+ if (error) {
875
+ if (next && next != vma)
876
+ vm_write_end(next);
877
+ vm_write_end(vma);
780878 return error;
879
+ }
781880 }
782881 }
783882 again:
....@@ -815,7 +914,7 @@
815914 anon_vma_interval_tree_pre_update_vma(next);
816915 }
817916
818
- if (root) {
917
+ if (file) {
819918 flush_dcache_mmap_lock(mapping);
820919 vma_interval_tree_remove(vma, root);
821920 if (adjust_next)
....@@ -823,20 +922,22 @@
823922 }
824923
825924 if (start != vma->vm_start) {
826
- vma->vm_start = start;
925
+ WRITE_ONCE(vma->vm_start, start);
827926 start_changed = true;
828927 }
829928 if (end != vma->vm_end) {
830
- vma->vm_end = end;
929
+ WRITE_ONCE(vma->vm_end, end);
831930 end_changed = true;
832931 }
833
- vma->vm_pgoff = pgoff;
932
+ WRITE_ONCE(vma->vm_pgoff, pgoff);
834933 if (adjust_next) {
835
- next->vm_start += adjust_next << PAGE_SHIFT;
836
- next->vm_pgoff += adjust_next;
934
+ WRITE_ONCE(next->vm_start,
935
+ next->vm_start + adjust_next);
936
+ WRITE_ONCE(next->vm_pgoff,
937
+ next->vm_pgoff + (adjust_next >> PAGE_SHIFT));
837938 }
838939
839
- if (root) {
940
+ if (file) {
840941 if (adjust_next)
841942 vma_interval_tree_insert(next, root);
842943 vma_interval_tree_insert(vma, root);
....@@ -849,7 +950,7 @@
849950 * us to remove next before dropping the locks.
850951 */
851952 if (remove_next != 3)
852
- __vma_unlink_prev(mm, next, vma);
953
+ __vma_unlink(mm, next, next);
853954 else
854955 /*
855956 * vma is not before next if they've been
....@@ -860,7 +961,7 @@
860961 * "next" (which is stored in post-swap()
861962 * "vma").
862963 */
863
- __vma_unlink_common(mm, next, NULL, false, vma);
964
+ __vma_unlink(mm, next, vma);
864965 if (file)
865966 __remove_shared_vm_struct(next, file, mapping);
866967 } else if (insert) {
....@@ -887,10 +988,9 @@
887988 anon_vma_interval_tree_post_update_vma(next);
888989 anon_vma_unlock_write(anon_vma);
889990 }
890
- if (mapping)
891
- i_mmap_unlock_write(mapping);
892991
893
- if (root) {
992
+ if (file) {
993
+ i_mmap_unlock_write(mapping);
894994 uprobe_mmap(vma);
895995
896996 if (adjust_next)
....@@ -898,15 +998,13 @@
898998 }
899999
9001000 if (remove_next) {
901
- if (file) {
1001
+ if (file)
9021002 uprobe_munmap(next, next->vm_start, next->vm_end);
903
- fput(file);
904
- }
9051003 if (next->anon_vma)
9061004 anon_vma_merge(vma, next);
9071005 mm->map_count--;
908
- mpol_put(vma_policy(next));
909
- vm_area_free(next);
1006
+ vm_write_end(next);
1007
+ put_vma(next);
9101008 /*
9111009 * In mprotect's case 6 (see comments on vma_merge),
9121010 * we must remove another next too. It would clutter
....@@ -920,6 +1018,8 @@
9201018 * "vma->vm_next" gap must be updated.
9211019 */
9221020 next = vma->vm_next;
1021
+ if (next)
1022
+ vm_write_begin(next);
9231023 } else {
9241024 /*
9251025 * For the scope of the comment "next" and
....@@ -966,6 +1066,11 @@
9661066 if (insert && file)
9671067 uprobe_mmap(insert);
9681068
1069
+ if (next && next != vma)
1070
+ vm_write_end(next);
1071
+ if (!keep_locked)
1072
+ vm_write_end(vma);
1073
+
9691074 validate_mm(mm);
9701075
9711076 return 0;
....@@ -984,7 +1089,7 @@
9841089 * VM_SOFTDIRTY should not prevent from VMA merging, if we
9851090 * match the flags but dirty bit -- the caller should mark
9861091 * merged VMA as dirty. If dirty bit won't be excluded from
987
- * comparison, we increase pressue on the memory system forcing
1092
+ * comparison, we increase pressure on the memory system forcing
9881093 * the kernel to generate new VMAs when old one could be
9891094 * extended instead.
9901095 */
....@@ -1023,7 +1128,7 @@
10231128 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
10241129 *
10251130 * We don't check here for the merged mmap wrapping around the end of pagecache
1026
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
1131
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
10271132 * wrap, nor mmaps which cover the final page at index -1UL.
10281133 */
10291134 static int
....@@ -1081,17 +1186,20 @@
10811186 * the area passed down from mprotect_fixup, never extending beyond one
10821187 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
10831188 *
1084
- * AAAA AAAA AAAA AAAA
1085
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
1086
- * cannot merge might become might become might become
1087
- * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
1088
- * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
1089
- * mremap move: PPPPXXXXXXXX 8
1090
- * AAAA
1091
- * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
1092
- * might become case 1 below case 2 below case 3 below
1189
+ * AAAA AAAA AAAA
1190
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
1191
+ * cannot merge might become might become
1192
+ * PPNNNNNNNNNN PPPPPPPPPPNN
1193
+ * mmap, brk or case 4 below case 5 below
1194
+ * mremap move:
1195
+ * AAAA AAAA
1196
+ * PPPP NNNN PPPPNNNNXXXX
1197
+ * might become might become
1198
+ * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
1199
+ * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
1200
+ * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
10931201 *
1094
- * It is important for case 8 that the the vma NNNN overlapping the
1202
+ * It is important for case 8 that the vma NNNN overlapping the
10951203 * region AAAA is never going to extended over XXXX. Instead XXXX must
10961204 * be extended in region AAAA and NNNN must be removed. This way in
10971205 * all cases where vma_merge succeeds, the moment vma_adjust drops the
....@@ -1105,13 +1213,13 @@
11051213 * parameter) may establish ptes with the wrong permissions of NNNN
11061214 * instead of the right permissions of XXXX.
11071215 */
1108
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
1216
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
11091217 struct vm_area_struct *prev, unsigned long addr,
11101218 unsigned long end, unsigned long vm_flags,
11111219 struct anon_vma *anon_vma, struct file *file,
11121220 pgoff_t pgoff, struct mempolicy *policy,
11131221 struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
1114
- const char __user *anon_name)
1222
+ const char __user *anon_name, bool keep_locked)
11151223 {
11161224 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
11171225 struct vm_area_struct *area, *next;
....@@ -1124,10 +1232,7 @@
11241232 if (vm_flags & VM_SPECIAL)
11251233 return NULL;
11261234
1127
- if (prev)
1128
- next = prev->vm_next;
1129
- else
1130
- next = mm->mmap;
1235
+ next = vma_next(mm, prev);
11311236 area = next;
11321237 if (area && area->vm_end == end) /* cases 6, 7, 8 */
11331238 next = next->vm_next;
....@@ -1161,10 +1266,11 @@
11611266 /* cases 1, 6 */
11621267 err = __vma_adjust(prev, prev->vm_start,
11631268 next->vm_end, prev->vm_pgoff, NULL,
1164
- prev);
1269
+ prev, keep_locked);
11651270 } else /* cases 2, 5, 7 */
11661271 err = __vma_adjust(prev, prev->vm_start,
1167
- end, prev->vm_pgoff, NULL, prev);
1272
+ end, prev->vm_pgoff, NULL, prev,
1273
+ keep_locked);
11681274 if (err)
11691275 return NULL;
11701276 khugepaged_enter_vma_merge(prev, vm_flags);
....@@ -1182,10 +1288,12 @@
11821288 anon_name)) {
11831289 if (prev && addr < prev->vm_end) /* case 4 */
11841290 err = __vma_adjust(prev, prev->vm_start,
1185
- addr, prev->vm_pgoff, NULL, next);
1291
+ addr, prev->vm_pgoff, NULL, next,
1292
+ keep_locked);
11861293 else { /* cases 3, 8 */
11871294 err = __vma_adjust(area, addr, next->vm_end,
1188
- next->vm_pgoff - pglen, NULL, next);
1295
+ next->vm_pgoff - pglen, NULL, next,
1296
+ keep_locked);
11891297 /*
11901298 * In case 3 area is already equal to next and
11911299 * this is a noop, but in case 8 "area" has
....@@ -1203,7 +1311,7 @@
12031311 }
12041312
12051313 /*
1206
- * Rough compatbility check to quickly see if it's even worth looking
1314
+ * Rough compatibility check to quickly see if it's even worth looking
12071315 * at sharing an anon_vma.
12081316 *
12091317 * They need to have the same vm_file, and the flags can only differ
....@@ -1220,7 +1328,7 @@
12201328 return a->vm_end == b->vm_start &&
12211329 mpol_equal(vma_policy(a), vma_policy(b)) &&
12221330 a->vm_file == b->vm_file &&
1223
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1331
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
12241332 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
12251333 }
12261334
....@@ -1267,26 +1375,22 @@
12671375 */
12681376 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
12691377 {
1270
- struct anon_vma *anon_vma;
1271
- struct vm_area_struct *near;
1378
+ struct anon_vma *anon_vma = NULL;
12721379
1273
- near = vma->vm_next;
1274
- if (!near)
1275
- goto try_prev;
1380
+ /* Try next first. */
1381
+ if (vma->vm_next) {
1382
+ anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
1383
+ if (anon_vma)
1384
+ return anon_vma;
1385
+ }
12761386
1277
- anon_vma = reusable_anon_vma(near, vma, near);
1278
- if (anon_vma)
1279
- return anon_vma;
1280
-try_prev:
1281
- near = vma->vm_prev;
1282
- if (!near)
1283
- goto none;
1387
+ /* Try prev next. */
1388
+ if (vma->vm_prev)
1389
+ anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
12841390
1285
- anon_vma = reusable_anon_vma(near, near, vma);
1286
- if (anon_vma)
1287
- return anon_vma;
1288
-none:
12891391 /*
1392
+ * We might reach here with anon_vma == NULL if we can't find
1393
+ * any reusable anon_vma.
12901394 * There's no absolute need to look only at touching neighbours:
12911395 * we could search further afield for "compatible" anon_vmas.
12921396 * But it would probably just be a waste of time searching,
....@@ -1294,7 +1398,7 @@
12941398 * We're trying to allow mprotect remerging later on,
12951399 * not trying to minimize memory used for anon_vmas.
12961400 */
1297
- return NULL;
1401
+ return anon_vma;
12981402 }
12991403
13001404 /*
....@@ -1336,6 +1440,9 @@
13361440 if (S_ISBLK(inode->i_mode))
13371441 return MAX_LFS_FILESIZE;
13381442
1443
+ if (S_ISSOCK(inode->i_mode))
1444
+ return MAX_LFS_FILESIZE;
1445
+
13391446 /* Special "we do even unsigned file positions" case */
13401447 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
13411448 return 0;
....@@ -1358,15 +1465,15 @@
13581465 }
13591466
13601467 /*
1361
- * The caller must hold down_write(&current->mm->mmap_sem).
1468
+ * The caller must write-lock current->mm->mmap_lock.
13621469 */
13631470 unsigned long do_mmap(struct file *file, unsigned long addr,
13641471 unsigned long len, unsigned long prot,
1365
- unsigned long flags, vm_flags_t vm_flags,
1366
- unsigned long pgoff, unsigned long *populate,
1367
- struct list_head *uf)
1472
+ unsigned long flags, unsigned long pgoff,
1473
+ unsigned long *populate, struct list_head *uf)
13681474 {
13691475 struct mm_struct *mm = current->mm;
1476
+ vm_flags_t vm_flags;
13701477 int pkey = 0;
13711478
13721479 *populate = 0;
....@@ -1408,7 +1515,7 @@
14081515 * that it represents a valid section of the address space.
14091516 */
14101517 addr = get_unmapped_area(file, addr, len, pgoff, flags);
1411
- if (offset_in_page(addr))
1518
+ if (IS_ERR_VALUE(addr))
14121519 return addr;
14131520
14141521 if (flags & MAP_FIXED_NOREPLACE) {
....@@ -1428,7 +1535,7 @@
14281535 * to. we assume access permissions have been handled by the open
14291536 * of the memory object, so we don't do any here.
14301537 */
1431
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1538
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
14321539 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
14331540
14341541 if (flags & MAP_LOCKED)
....@@ -1457,7 +1564,7 @@
14571564 * with MAP_SHARED to preserve backward compatibility.
14581565 */
14591566 flags &= LEGACY_MAP_MASK;
1460
- /* fall through */
1567
+ fallthrough;
14611568 case MAP_SHARED_VALIDATE:
14621569 if (flags & ~flags_mask)
14631570 return -EOPNOTSUPP;
....@@ -1484,8 +1591,7 @@
14841591 vm_flags |= VM_SHARED | VM_MAYSHARE;
14851592 if (!(file->f_mode & FMODE_WRITE))
14861593 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1487
-
1488
- /* fall through */
1594
+ fallthrough;
14891595 case MAP_PRIVATE:
14901596 if (!(file->f_mode & FMODE_READ))
14911597 return -EACCES;
....@@ -1560,11 +1666,12 @@
15601666 file = fget(fd);
15611667 if (!file)
15621668 return -EBADF;
1563
- if (is_file_hugepages(file))
1669
+ if (is_file_hugepages(file)) {
15641670 len = ALIGN(len, huge_page_size(hstate_file(file)));
1565
- retval = -EINVAL;
1566
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1671
+ } else if (unlikely(flags & MAP_HUGETLB)) {
1672
+ retval = -EINVAL;
15671673 goto out_fput;
1674
+ }
15681675 } else if (flags & MAP_HUGETLB) {
15691676 struct user_struct *user = NULL;
15701677 struct hstate *hs;
....@@ -1629,7 +1736,7 @@
16291736 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
16301737
16311738 /*
1632
- * Some shared mappigns will want the pages marked read-only
1739
+ * Some shared mappings will want the pages marked read-only
16331740 * to track write events. If so, we'll downgrade vm_page_prot
16341741 * to the private version (using protection_map[] without the
16351742 * VM_SHARED bit).
....@@ -1653,8 +1760,12 @@
16531760 pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
16541761 return 0;
16551762
1656
- /* Do we need to track softdirty? */
1657
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1763
+ /*
1764
+ * Do we need to track softdirty? hugetlb does not support softdirty
1765
+ * tracking yet.
1766
+ */
1767
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) &&
1768
+ !is_vm_hugetlb_page(vma))
16581769 return 1;
16591770
16601771 /* Specialty mapping? */
....@@ -1663,7 +1774,7 @@
16631774
16641775 /* Can the mapping track the dirty pages? */
16651776 return vma->vm_file && vma->vm_file->f_mapping &&
1666
- mapping_cap_account_dirty(vma->vm_file->f_mapping);
1777
+ mapping_can_writeback(vma->vm_file->f_mapping);
16671778 }
16681779
16691780 /*
....@@ -1687,7 +1798,7 @@
16871798 struct list_head *uf)
16881799 {
16891800 struct mm_struct *mm = current->mm;
1690
- struct vm_area_struct *vma, *prev;
1801
+ struct vm_area_struct *vma, *prev, *merge;
16911802 int error;
16921803 struct rb_node **rb_link, *rb_parent;
16931804 unsigned long charged = 0;
....@@ -1707,13 +1818,9 @@
17071818 return -ENOMEM;
17081819 }
17091820
1710
- /* Clear old maps */
1711
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1712
- &rb_parent)) {
1713
- if (do_munmap(mm, addr, len, uf))
1714
- return -ENOMEM;
1715
- }
1716
-
1821
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
1822
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
1823
+ return -ENOMEM;
17171824 /*
17181825 * Private writable mapping: check memory availability
17191826 */
....@@ -1781,6 +1888,28 @@
17811888 WARN_ON_ONCE(addr != vma->vm_start);
17821889
17831890 addr = vma->vm_start;
1891
+
1892
+ /* If vm_flags changed after call_mmap(), we should try to merge the vma again
1893
+ * as we may succeed this time.
1894
+ */
1895
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
1896
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
1897
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX,
1898
+ vma_get_anon_name(vma));
1899
+ if (merge) {
1900
+ /* ->mmap() can change vma->vm_file and fput the original file. So
1901
+ * fput the vma->vm_file here or we would add an extra fput for file
1902
+ * and cause general protection fault ultimately.
1903
+ */
1904
+ fput(vma->vm_file);
1905
+ vm_area_free(vma);
1906
+ vma = merge;
1907
+ /* Update vm_flags to pick up the change. */
1908
+ vm_flags = vma->vm_flags;
1909
+ goto unmap_writable;
1910
+ }
1911
+ }
1912
+
17841913 vm_flags = vma->vm_flags;
17851914 } else if (vm_flags & VM_SHARED) {
17861915 error = shmem_zero_setup(vma);
....@@ -1790,9 +1919,19 @@
17901919 vma_set_anonymous(vma);
17911920 }
17921921
1922
+ /* Allow architectures to sanity-check the vm_flags */
1923
+ if (!arch_validate_flags(vma->vm_flags)) {
1924
+ error = -EINVAL;
1925
+ if (file)
1926
+ goto close_and_free_vma;
1927
+ else
1928
+ goto free_vma;
1929
+ }
1930
+
17931931 vma_link(mm, vma, prev, rb_link, rb_parent);
17941932 /* Once vma denies write, undo our temporary denial count */
17951933 if (file) {
1934
+unmap_writable:
17961935 if (vm_flags & VM_SHARED)
17971936 mapping_unmap_writable(file->f_mapping);
17981937 if (vm_flags & VM_DENYWRITE)
....@@ -1802,12 +1941,14 @@
18021941 out:
18031942 perf_event_mmap(vma);
18041943
1944
+ vm_write_begin(vma);
18051945 vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
18061946 if (vm_flags & VM_LOCKED) {
18071947 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
18081948 is_vm_hugetlb_page(vma) ||
18091949 vma == get_gate_vma(current->mm))
1810
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1950
+ WRITE_ONCE(vma->vm_flags,
1951
+ vma->vm_flags & VM_LOCKED_CLEAR_MASK);
18111952 else
18121953 mm->locked_vm += (len >> PAGE_SHIFT);
18131954 }
....@@ -1822,19 +1963,24 @@
18221963 * then new mapped in-place (which must be aimed as
18231964 * a completely new data area).
18241965 */
1825
- vma->vm_flags |= VM_SOFTDIRTY;
1966
+ WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
18261967
18271968 vma_set_page_prot(vma);
1969
+ vm_write_end(vma);
1970
+
1971
+ trace_android_vh_mmap_region(vma, addr);
18281972
18291973 return addr;
18301974
1975
+close_and_free_vma:
1976
+ if (vma->vm_ops && vma->vm_ops->close)
1977
+ vma->vm_ops->close(vma);
18311978 unmap_and_free_vma:
18321979 vma->vm_file = NULL;
18331980 fput(file);
18341981
18351982 /* Undo any partial mapping done by a device driver. */
18361983 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1837
- charged = 0;
18381984 if (vm_flags & VM_SHARED)
18391985 mapping_unmap_writable(file->f_mapping);
18401986 allow_write_and_free_vma:
....@@ -1848,7 +1994,7 @@
18481994 return error;
18491995 }
18501996
1851
-unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1997
+static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
18521998 {
18531999 /*
18542000 * We implement the search by looking for an rbtree node that
....@@ -1951,16 +2097,21 @@
19512097 return gap_start;
19522098 }
19532099
1954
-unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
2100
+static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
19552101 {
19562102 struct mm_struct *mm = current->mm;
19572103 struct vm_area_struct *vma;
19582104 unsigned long length, low_limit, high_limit, gap_start, gap_end;
2105
+ unsigned long addr = 0;
19592106
19602107 /* Adjust search length to account for worst case alignment overhead */
19612108 length = info->length + info->align_mask;
19622109 if (length < info->length)
19632110 return -ENOMEM;
2111
+
2112
+ trace_android_vh_get_from_fragment_pool(mm, info, &addr);
2113
+ if (addr)
2114
+ return addr;
19642115
19652116 /*
19662117 * Adjust search limits by the desired length.
....@@ -2049,7 +2200,29 @@
20492200 VM_BUG_ON(gap_end < gap_start);
20502201 return gap_end;
20512202 }
2052
-EXPORT_SYMBOL_GPL(unmapped_area_topdown);
2203
+
2204
+/*
2205
+ * Search for an unmapped address range.
2206
+ *
2207
+ * We are looking for a range that:
2208
+ * - does not intersect with any VMA;
2209
+ * - is contained within the [low_limit, high_limit) interval;
2210
+ * - is at least the desired size.
2211
+ * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
2212
+ */
2213
+unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
2214
+{
2215
+ unsigned long addr;
2216
+
2217
+ if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
2218
+ addr = unmapped_area_topdown(info);
2219
+ else
2220
+ addr = unmapped_area(info);
2221
+
2222
+ trace_vm_unmapped_area(addr, info);
2223
+ return addr;
2224
+}
2225
+EXPORT_SYMBOL_GPL(vm_unmapped_area);
20532226
20542227 /* Get an address range which is currently unmapped.
20552228 * For shmat() with addr=0.
....@@ -2070,8 +2243,9 @@
20702243 struct mm_struct *mm = current->mm;
20712244 struct vm_area_struct *vma, *prev;
20722245 struct vm_unmapped_area_info info;
2246
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
20732247
2074
- if (len > TASK_SIZE - mmap_min_addr)
2248
+ if (len > mmap_end - mmap_min_addr)
20752249 return -ENOMEM;
20762250
20772251 if (flags & MAP_FIXED)
....@@ -2080,7 +2254,7 @@
20802254 if (addr) {
20812255 addr = PAGE_ALIGN(addr);
20822256 vma = find_vma_prev(mm, addr, &prev);
2083
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
2257
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
20842258 (!vma || addr + len <= vm_start_gap(vma)) &&
20852259 (!prev || addr >= vm_end_gap(prev)))
20862260 return addr;
....@@ -2089,7 +2263,7 @@
20892263 info.flags = 0;
20902264 info.length = len;
20912265 info.low_limit = mm->mmap_base;
2092
- info.high_limit = TASK_SIZE;
2266
+ info.high_limit = mmap_end;
20932267 info.align_mask = 0;
20942268 info.align_offset = 0;
20952269 return vm_unmapped_area(&info);
....@@ -2102,17 +2276,17 @@
21022276 */
21032277 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
21042278 unsigned long
2105
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
2106
- const unsigned long len, const unsigned long pgoff,
2107
- const unsigned long flags)
2279
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
2280
+ unsigned long len, unsigned long pgoff,
2281
+ unsigned long flags)
21082282 {
21092283 struct vm_area_struct *vma, *prev;
21102284 struct mm_struct *mm = current->mm;
2111
- unsigned long addr = addr0;
21122285 struct vm_unmapped_area_info info;
2286
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
21132287
21142288 /* requested length too big for entire address space */
2115
- if (len > TASK_SIZE - mmap_min_addr)
2289
+ if (len > mmap_end - mmap_min_addr)
21162290 return -ENOMEM;
21172291
21182292 if (flags & MAP_FIXED)
....@@ -2122,7 +2296,7 @@
21222296 if (addr) {
21232297 addr = PAGE_ALIGN(addr);
21242298 vma = find_vma_prev(mm, addr, &prev);
2125
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
2299
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
21262300 (!vma || addr + len <= vm_start_gap(vma)) &&
21272301 (!prev || addr >= vm_end_gap(prev)))
21282302 return addr;
....@@ -2131,9 +2305,10 @@
21312305 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
21322306 info.length = len;
21332307 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
2134
- info.high_limit = mm->mmap_base;
2308
+ info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
21352309 info.align_mask = 0;
21362310 info.align_offset = 0;
2311
+ trace_android_vh_exclude_reserved_zone(mm, &info);
21372312 addr = vm_unmapped_area(&info);
21382313
21392314 /*
....@@ -2146,9 +2321,11 @@
21462321 VM_BUG_ON(addr != -ENOMEM);
21472322 info.flags = 0;
21482323 info.low_limit = TASK_UNMAPPED_BASE;
2149
- info.high_limit = TASK_SIZE;
2324
+ info.high_limit = mmap_end;
21502325 addr = vm_unmapped_area(&info);
21512326 }
2327
+
2328
+ trace_android_vh_include_reserved_zone(mm, &info, &addr);
21522329
21532330 return addr;
21542331 }
....@@ -2177,7 +2354,7 @@
21772354 /*
21782355 * mmap_region() will call shmem_zero_setup() to create a file,
21792356 * so use shmem's get_unmapped_area in case it can be huge.
2180
- * do_mmap_pgoff() will clear pgoff, so match alignment.
2357
+ * do_mmap() will clear pgoff, so match alignment.
21812358 */
21822359 pgoff = 0;
21832360 get_area = shmem_get_unmapped_area;
....@@ -2199,15 +2376,11 @@
21992376 EXPORT_SYMBOL(get_unmapped_area);
22002377
22012378 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
2202
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2379
+static struct vm_area_struct *__find_vma(struct mm_struct *mm,
2380
+ unsigned long addr)
22032381 {
22042382 struct rb_node *rb_node;
2205
- struct vm_area_struct *vma;
2206
-
2207
- /* Check the cache first. */
2208
- vma = vmacache_find(mm, addr);
2209
- if (likely(vma))
2210
- return vma;
2383
+ struct vm_area_struct *vma = NULL;
22112384
22122385 rb_node = mm->mm_rb.rb_node;
22132386
....@@ -2225,12 +2398,53 @@
22252398 rb_node = rb_node->rb_right;
22262399 }
22272400
2401
+ return vma;
2402
+}
2403
+
2404
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2405
+{
2406
+ struct vm_area_struct *vma;
2407
+
2408
+ /* Check the cache first. */
2409
+ vma = vmacache_find(mm, addr);
2410
+ if (likely(vma))
2411
+ return vma;
2412
+
2413
+ vma = __find_vma(mm, addr);
22282414 if (vma)
22292415 vmacache_update(addr, vma);
22302416 return vma;
22312417 }
2232
-
22332418 EXPORT_SYMBOL(find_vma);
2419
+
2420
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
2421
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
2422
+{
2423
+ struct vm_area_struct *vma = NULL;
2424
+
2425
+ read_lock(&mm->mm_rb_lock);
2426
+ vma = __find_vma(mm, addr);
2427
+
2428
+ /*
2429
+ * If there is a concurrent fast mremap, bail out since the entire
2430
+ * PMD/PUD subtree may have been remapped.
2431
+ *
2432
+ * This is usually safe for conventional mremap since it takes the
2433
+ * PTE locks as does SPF. However fast mremap only takes the lock
2434
+ * at the PMD/PUD level which is ok as it is done with the mmap
2435
+ * write lock held. But since SPF, as the term implies forgoes,
2436
+ * taking the mmap read lock and also cannot take PTL lock at the
2437
+ * larger PMD/PUD granualrity, since it would introduce huge
2438
+ * contention in the page fault path; fall back to regular fault
2439
+ * handling.
2440
+ */
2441
+ if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count))
2442
+ vma = NULL;
2443
+ read_unlock(&mm->mm_rb_lock);
2444
+
2445
+ return vma;
2446
+}
2447
+#endif
22342448
22352449 /*
22362450 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
....@@ -2245,12 +2459,9 @@
22452459 if (vma) {
22462460 *pprev = vma->vm_prev;
22472461 } else {
2248
- struct rb_node *rb_node = mm->mm_rb.rb_node;
2249
- *pprev = NULL;
2250
- while (rb_node) {
2251
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2252
- rb_node = rb_node->rb_right;
2253
- }
2462
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
2463
+
2464
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
22542465 }
22552466 return vma;
22562467 }
....@@ -2330,8 +2541,7 @@
23302541 gap_addr = TASK_SIZE;
23312542
23322543 next = vma->vm_next;
2333
- if (next && next->vm_start < gap_addr &&
2334
- (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
2544
+ if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
23352545 if (!(next->vm_flags & VM_GROWSUP))
23362546 return -ENOMEM;
23372547 /* Check that both stack segments have the same anon_vma? */
....@@ -2343,7 +2553,7 @@
23432553
23442554 /*
23452555 * vma->vm_start/vm_end cannot change under us because the caller
2346
- * is required to hold the mmap_sem in read mode. We need the
2556
+ * is required to hold the mmap_lock in read mode. We need the
23472557 * anon_vma lock to serialize against concurrent expand_stacks.
23482558 */
23492559 anon_vma_lock_write(vma->anon_vma);
....@@ -2361,7 +2571,7 @@
23612571 if (!error) {
23622572 /*
23632573 * vma_gap_update() doesn't support concurrent
2364
- * updates, but we only hold a shared mmap_sem
2574
+ * updates, but we only hold a shared mmap_lock
23652575 * lock here, so we need to protect against
23662576 * concurrent vma expansions.
23672577 * anon_vma_lock_write() doesn't help here, as
....@@ -2412,7 +2622,7 @@
24122622 prev = vma->vm_prev;
24132623 /* Check that both stack segments have the same anon_vma? */
24142624 if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
2415
- (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
2625
+ vma_is_accessible(prev)) {
24162626 if (address - prev->vm_end < stack_guard_gap)
24172627 return -ENOMEM;
24182628 }
....@@ -2423,7 +2633,7 @@
24232633
24242634 /*
24252635 * vma->vm_start/vm_end cannot change under us because the caller
2426
- * is required to hold the mmap_sem in read mode. We need the
2636
+ * is required to hold the mmap_lock in read mode. We need the
24272637 * anon_vma lock to serialize against concurrent expand_stacks.
24282638 */
24292639 anon_vma_lock_write(vma->anon_vma);
....@@ -2441,7 +2651,7 @@
24412651 if (!error) {
24422652 /*
24432653 * vma_gap_update() doesn't support concurrent
2444
- * updates, but we only hold a shared mmap_sem
2654
+ * updates, but we only hold a shared mmap_lock
24452655 * lock here, so we need to protect against
24462656 * concurrent vma expansions.
24472657 * anon_vma_lock_write() doesn't help here, as
....@@ -2455,8 +2665,8 @@
24552665 mm->locked_vm += grow;
24562666 vm_stat_account(mm, vma->vm_flags, grow);
24572667 anon_vma_interval_tree_pre_update_vma(vma);
2458
- vma->vm_start = address;
2459
- vma->vm_pgoff -= grow;
2668
+ WRITE_ONCE(vma->vm_start, address);
2669
+ WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
24602670 anon_vma_interval_tree_post_update_vma(vma);
24612671 vma_gap_update(vma);
24622672 spin_unlock(&mm->page_table_lock);
....@@ -2483,7 +2693,7 @@
24832693 if (!*endptr)
24842694 stack_guard_gap = val << PAGE_SHIFT;
24852695
2486
- return 0;
2696
+ return 1;
24872697 }
24882698 __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
24892699
....@@ -2503,7 +2713,7 @@
25032713 if (vma && (vma->vm_start <= addr))
25042714 return vma;
25052715 /* don't alter vm_end if the coredump is running */
2506
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
2716
+ if (!prev || expand_stack(prev, addr))
25072717 return NULL;
25082718 if (prev->vm_flags & VM_LOCKED)
25092719 populate_vma_page_range(prev, addr, prev->vm_end, NULL);
....@@ -2528,9 +2738,6 @@
25282738 if (vma->vm_start <= addr)
25292739 return vma;
25302740 if (!(vma->vm_flags & VM_GROWSDOWN))
2531
- return NULL;
2532
- /* don't alter vm_start if the coredump is running */
2533
- if (!mmget_still_valid(mm))
25342741 return NULL;
25352742 start = vma->vm_start;
25362743 if (expand_stack(vma, addr))
....@@ -2576,13 +2783,30 @@
25762783 struct vm_area_struct *vma, struct vm_area_struct *prev,
25772784 unsigned long start, unsigned long end)
25782785 {
2579
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2786
+ struct vm_area_struct *next = vma_next(mm, prev);
25802787 struct mmu_gather tlb;
2788
+ struct vm_area_struct *cur_vma;
25812789
25822790 lru_add_drain();
25832791 tlb_gather_mmu(&tlb, mm, start, end);
25842792 update_hiwater_rss(mm);
25852793 unmap_vmas(&tlb, vma, start, end);
2794
+
2795
+ /*
2796
+ * Ensure we have no stale TLB entries by the time this mapping is
2797
+ * removed from the rmap.
2798
+ * Note that we don't have to worry about nested flushes here because
2799
+ * we're holding the mm semaphore for removing the mapping - so any
2800
+ * concurrent flush in this region has to be coming through the rmap,
2801
+ * and we synchronize against that using the rmap lock.
2802
+ */
2803
+ for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
2804
+ if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
2805
+ tlb_flush_mmu(&tlb);
2806
+ break;
2807
+ }
2808
+ }
2809
+
25862810 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
25872811 next ? next->vm_start : USER_PGTABLES_CEILING);
25882812 tlb_finish_mmu(&tlb, start, end);
....@@ -2592,7 +2816,7 @@
25922816 * Create a list of vma's touched by the unmap, removing them from the mm's
25932817 * vma list as we go..
25942818 */
2595
-static void
2819
+static bool
25962820 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
25972821 struct vm_area_struct *prev, unsigned long end)
25982822 {
....@@ -2602,7 +2826,7 @@
26022826 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
26032827 vma->vm_prev = NULL;
26042828 do {
2605
- vma_rb_erase(vma, &mm->mm_rb);
2829
+ vma_rb_erase(vma, mm);
26062830 mm->map_count--;
26072831 tail_vma = vma;
26082832 vma = vma->vm_next;
....@@ -2617,6 +2841,17 @@
26172841
26182842 /* Kill the cache */
26192843 vmacache_invalidate(mm);
2844
+
2845
+ /*
2846
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
2847
+ * VM_GROWSUP VMA. Such VMAs can change their size under
2848
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
2849
+ */
2850
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
2851
+ return false;
2852
+ if (prev && (prev->vm_flags & VM_GROWSUP))
2853
+ return false;
2854
+ return true;
26202855 }
26212856
26222857 /*
....@@ -2701,8 +2936,8 @@
27012936 * work. This now handles partial unmappings.
27022937 * Jeremy Fitzhardinge <jeremy@goop.org>
27032938 */
2704
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2705
- struct list_head *uf)
2939
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
2940
+ struct list_head *uf, bool downgrade)
27062941 {
27072942 unsigned long end;
27082943 struct vm_area_struct *vma, *prev, *last;
....@@ -2711,8 +2946,16 @@
27112946 return -EINVAL;
27122947
27132948 len = PAGE_ALIGN(len);
2949
+ end = start + len;
27142950 if (len == 0)
27152951 return -EINVAL;
2952
+
2953
+ /*
2954
+ * arch_unmap() might do unmaps itself. It must be called
2955
+ * and finish any rbtree manipulation before this code
2956
+ * runs and also starts to manipulate the rbtree.
2957
+ */
2958
+ arch_unmap(mm, start, end);
27162959
27172960 /* Find the first overlapping VMA */
27182961 vma = find_vma(mm, start);
....@@ -2722,7 +2965,6 @@
27222965 /* we have start < vma->vm_end */
27232966
27242967 /* if it doesn't overlap, we have nothing.. */
2725
- end = start + len;
27262968 if (vma->vm_start >= end)
27272969 return 0;
27282970
....@@ -2757,7 +2999,7 @@
27572999 if (error)
27583000 return error;
27593001 }
2760
- vma = prev ? prev->vm_next : mm->mmap;
3002
+ vma = vma_next(mm, prev);
27613003
27623004 if (unlikely(uf)) {
27633005 /*
....@@ -2784,37 +3026,60 @@
27843026 mm->locked_vm -= vma_pages(tmp);
27853027 munlock_vma_pages_all(tmp);
27863028 }
3029
+
27873030 tmp = tmp->vm_next;
27883031 }
27893032 }
27903033
2791
- /*
2792
- * Remove the vma's, and unmap the actual pages
2793
- */
2794
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
2795
- unmap_region(mm, vma, prev, start, end);
3034
+ /* Detach vmas from rbtree */
3035
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
3036
+ downgrade = false;
27963037
2797
- arch_unmap(mm, vma, start, end);
3038
+ if (downgrade)
3039
+ mmap_write_downgrade(mm);
3040
+
3041
+ unmap_region(mm, vma, prev, start, end);
27983042
27993043 /* Fix up all other VM information */
28003044 remove_vma_list(mm, vma);
28013045
2802
- return 0;
3046
+ return downgrade ? 1 : 0;
28033047 }
28043048
2805
-int vm_munmap(unsigned long start, size_t len)
3049
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
3050
+ struct list_head *uf)
3051
+{
3052
+ return __do_munmap(mm, start, len, uf, false);
3053
+}
3054
+
3055
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
28063056 {
28073057 int ret;
28083058 struct mm_struct *mm = current->mm;
28093059 LIST_HEAD(uf);
28103060
2811
- if (down_write_killable(&mm->mmap_sem))
3061
+ if (mmap_write_lock_killable(mm))
28123062 return -EINTR;
28133063
2814
- ret = do_munmap(mm, start, len, &uf);
2815
- up_write(&mm->mmap_sem);
3064
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
3065
+ /*
3066
+ * Returning 1 indicates mmap_lock is downgraded.
3067
+ * But 1 is not legal return value of vm_munmap() and munmap(), reset
3068
+ * it to 0 before return.
3069
+ */
3070
+ if (ret == 1) {
3071
+ mmap_read_unlock(mm);
3072
+ ret = 0;
3073
+ } else
3074
+ mmap_write_unlock(mm);
3075
+
28163076 userfaultfd_unmap_complete(mm, &uf);
28173077 return ret;
3078
+}
3079
+
3080
+int vm_munmap(unsigned long start, size_t len)
3081
+{
3082
+ return __vm_munmap(start, len, false);
28183083 }
28193084 EXPORT_SYMBOL(vm_munmap);
28203085
....@@ -2822,7 +3087,7 @@
28223087 {
28233088 addr = untagged_addr(addr);
28243089 profile_munmap(addr);
2825
- return vm_munmap(addr, len);
3090
+ return __vm_munmap(addr, len, true);
28263091 }
28273092
28283093
....@@ -2854,7 +3119,7 @@
28543119 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
28553120 return ret;
28563121
2857
- if (down_write_killable(&mm->mmap_sem))
3122
+ if (mmap_write_lock_killable(mm))
28583123 return -EINTR;
28593124
28603125 vma = find_vma(mm, start);
....@@ -2913,26 +3178,16 @@
29133178 }
29143179
29153180 file = get_file(vma->vm_file);
2916
- ret = do_mmap_pgoff(vma->vm_file, start, size,
3181
+ ret = do_mmap(vma->vm_file, start, size,
29173182 prot, flags, pgoff, &populate, NULL);
29183183 fput(file);
29193184 out:
2920
- up_write(&mm->mmap_sem);
3185
+ mmap_write_unlock(mm);
29213186 if (populate)
29223187 mm_populate(ret, populate);
29233188 if (!IS_ERR_VALUE(ret))
29243189 ret = 0;
29253190 return ret;
2926
-}
2927
-
2928
-static inline void verify_mm_writelocked(struct mm_struct *mm)
2929
-{
2930
-#ifdef CONFIG_DEBUG_VM
2931
- if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2932
- WARN_ON(1);
2933
- up_read(&mm->mmap_sem);
2934
- }
2935
-#endif
29363191 }
29373192
29383193 /*
....@@ -2947,34 +3202,24 @@
29473202 struct rb_node **rb_link, *rb_parent;
29483203 pgoff_t pgoff = addr >> PAGE_SHIFT;
29493204 int error;
3205
+ unsigned long mapped_addr;
29503206
29513207 /* Until we need other flags, refuse anything except VM_EXEC. */
29523208 if ((flags & (~VM_EXEC)) != 0)
29533209 return -EINVAL;
29543210 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
29553211
2956
- error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2957
- if (offset_in_page(error))
2958
- return error;
3212
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
3213
+ if (IS_ERR_VALUE(mapped_addr))
3214
+ return mapped_addr;
29593215
29603216 error = mlock_future_check(mm, mm->def_flags, len);
29613217 if (error)
29623218 return error;
29633219
2964
- /*
2965
- * mm->mmap_sem is required to protect against another thread
2966
- * changing the mappings in case we sleep.
2967
- */
2968
- verify_mm_writelocked(mm);
2969
-
2970
- /*
2971
- * Clear old maps. this also does some error checking for us
2972
- */
2973
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2974
- &rb_parent)) {
2975
- if (do_munmap(mm, addr, len, uf))
2976
- return -ENOMEM;
2977
- }
3220
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
3221
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
3222
+ return -ENOMEM;
29783223
29793224 /* Check against address space limits *after* clearing old maps... */
29803225 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
....@@ -3032,12 +3277,12 @@
30323277 if (!len)
30333278 return 0;
30343279
3035
- if (down_write_killable(&mm->mmap_sem))
3280
+ if (mmap_write_lock_killable(mm))
30363281 return -EINTR;
30373282
30383283 ret = do_brk_flags(addr, len, flags, &uf);
30393284 populate = ((mm->def_flags & VM_LOCKED) != 0);
3040
- up_write(&mm->mmap_sem);
3285
+ mmap_write_unlock(mm);
30413286 userfaultfd_unmap_complete(mm, &uf);
30423287 if (populate && !ret)
30433288 mm_populate(addr, len);
....@@ -3065,12 +3310,12 @@
30653310 /*
30663311 * Manually reap the mm to free as much memory as possible.
30673312 * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
3068
- * this mm from further consideration. Taking mm->mmap_sem for
3313
+ * this mm from further consideration. Taking mm->mmap_lock for
30693314 * write after setting MMF_OOM_SKIP will guarantee that the oom
3070
- * reaper will not run on this mm again after mmap_sem is
3315
+ * reaper will not run on this mm again after mmap_lock is
30713316 * dropped.
30723317 *
3073
- * Nothing can be holding mm->mmap_sem here and the above call
3318
+ * Nothing can be holding mm->mmap_lock here and the above call
30743319 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
30753320 * __oom_reap_task_mm() will not block.
30763321 *
....@@ -3081,10 +3326,9 @@
30813326 (void)__oom_reap_task_mm(mm);
30823327
30833328 set_bit(MMF_OOM_SKIP, &mm->flags);
3084
- down_write(&mm->mmap_sem);
3085
- up_write(&mm->mmap_sem);
30863329 }
30873330
3331
+ mmap_write_lock(mm);
30883332 if (mm->locked_vm) {
30893333 vma = mm->mmap;
30903334 while (vma) {
....@@ -3097,8 +3341,11 @@
30973341 arch_exit_mmap(mm);
30983342
30993343 vma = mm->mmap;
3100
- if (!vma) /* Can happen if dup_mmap() received an OOM */
3344
+ if (!vma) {
3345
+ /* Can happen if dup_mmap() received an OOM */
3346
+ mmap_write_unlock(mm);
31013347 return;
3348
+ }
31023349
31033350 lru_add_drain();
31043351 flush_cache_mm(mm);
....@@ -3109,16 +3356,15 @@
31093356 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
31103357 tlb_finish_mmu(&tlb, 0, -1);
31113358
3112
- /*
3113
- * Walk the list again, actually closing and freeing it,
3114
- * with preemption enabled, without holding any MM locks.
3115
- */
3359
+ /* Walk the list again, actually closing and freeing it. */
31163360 while (vma) {
31173361 if (vma->vm_flags & VM_ACCOUNT)
31183362 nr_accounted += vma_pages(vma);
31193363 vma = remove_vma(vma);
31203364 cond_resched();
31213365 }
3366
+ mm->mmap = NULL;
3367
+ mmap_write_unlock(mm);
31223368 vm_unacct_memory(nr_accounted);
31233369 }
31243370
....@@ -3148,7 +3394,7 @@
31483394 * By setting it to reflect the virtual start address of the
31493395 * vma, merges and splits can happen in a seamless way, just
31503396 * using the existing file pgoff checks and manipulations.
3151
- * Similarly in do_mmap_pgoff and in do_brk.
3397
+ * Similarly in do_mmap and in do_brk_flags.
31523398 */
31533399 if (vma_is_anonymous(vma)) {
31543400 BUG_ON(vma->anon_vma);
....@@ -3185,9 +3431,21 @@
31853431
31863432 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
31873433 return NULL; /* should never get here */
3188
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
3189
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
3190
- vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
3434
+
3435
+ /* There is 3 cases to manage here in
3436
+ * AAAA AAAA AAAA AAAA
3437
+ * PPPP.... PPPP......NNNN PPPP....NNNN PP........NN
3438
+ * PPPPPPPP(A) PPPP..NNNNNNNN(B) PPPPPPPPPPPP(1) NULL
3439
+ * PPPPPPPPNNNN(2)
3440
+ * PPPPNNNNNNNN(3)
3441
+ *
3442
+ * new_vma == prev in case A,1,2
3443
+ * new_vma == next in case B,3
3444
+ */
3445
+ new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
3446
+ vma->anon_vma, vma->vm_file, pgoff,
3447
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
3448
+ vma_get_anon_name(vma), true);
31913449 if (new_vma) {
31923450 /*
31933451 * Source vma may have been merged into new_vma
....@@ -3225,6 +3483,15 @@
32253483 get_file(new_vma->vm_file);
32263484 if (new_vma->vm_ops && new_vma->vm_ops->open)
32273485 new_vma->vm_ops->open(new_vma);
3486
+ /*
3487
+ * As the VMA is linked right now, it may be hit by the
3488
+ * speculative page fault handler. But we don't want it to
3489
+ * to start mapping page in this area until the caller has
3490
+ * potentially move the pte from the moved VMA. To prevent
3491
+ * that we protect it right now, and let the caller unprotect
3492
+ * it once the move is done.
3493
+ */
3494
+ vm_write_begin(new_vma);
32283495 vma_link(mm, new_vma, prev, rb_link, rb_parent);
32293496 *need_rmap_locks = false;
32303497 }
....@@ -3311,6 +3578,8 @@
33113578 .fault = special_mapping_fault,
33123579 .mremap = special_mapping_mremap,
33133580 .name = special_mapping_name,
3581
+ /* vDSO code relies on VVAR not being accessible remotely */
3582
+ .access = NULL,
33143583 };
33153584
33163585 static const struct vm_operations_struct legacy_special_mapping_vmops = {
....@@ -3394,7 +3663,7 @@
33943663 }
33953664
33963665 /*
3397
- * Called with mm->mmap_sem held for writing.
3666
+ * Called with mm->mmap_lock held for writing.
33983667 * Insert a new vma covering the given region, with the given flags.
33993668 * Its pages are supplied by the given array of struct page *.
34003669 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
....@@ -3431,7 +3700,7 @@
34313700 * The LSB of head.next can't change from under us
34323701 * because we hold the mm_all_locks_mutex.
34333702 */
3434
- down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
3703
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
34353704 /*
34363705 * We can safely modify head.next after taking the
34373706 * anon_vma->root->rwsem. If some other vma in this mm shares
....@@ -3461,7 +3730,7 @@
34613730 */
34623731 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
34633732 BUG();
3464
- down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
3733
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
34653734 }
34663735 }
34673736
....@@ -3470,11 +3739,11 @@
34703739 * operations that could ever happen on a certain mm. This includes
34713740 * vmtruncate, try_to_unmap, and all page faults.
34723741 *
3473
- * The caller must take the mmap_sem in write mode before calling
3742
+ * The caller must take the mmap_lock in write mode before calling
34743743 * mm_take_all_locks(). The caller isn't allowed to release the
3475
- * mmap_sem until mm_drop_all_locks() returns.
3744
+ * mmap_lock until mm_drop_all_locks() returns.
34763745 *
3477
- * mmap_sem in write mode is required in order to block all operations
3746
+ * mmap_lock in write mode is required in order to block all operations
34783747 * that could modify pagetables and free pages without need of
34793748 * altering the vma layout. It's also needed in write mode to avoid new
34803749 * anon_vmas to be associated with existing vmas.
....@@ -3507,7 +3776,7 @@
35073776 struct vm_area_struct *vma;
35083777 struct anon_vma_chain *avc;
35093778
3510
- BUG_ON(down_read_trylock(&mm->mmap_sem));
3779
+ BUG_ON(mmap_read_trylock(mm));
35113780
35123781 mutex_lock(&mm_all_locks_mutex);
35133782
....@@ -3579,7 +3848,7 @@
35793848 }
35803849
35813850 /*
3582
- * The mmap_sem cannot be released by the caller until
3851
+ * The mmap_lock cannot be released by the caller until
35833852 * mm_drop_all_locks() returns.
35843853 */
35853854 void mm_drop_all_locks(struct mm_struct *mm)
....@@ -3587,7 +3856,7 @@
35873856 struct vm_area_struct *vma;
35883857 struct anon_vma_chain *avc;
35893858
3590
- BUG_ON(down_read_trylock(&mm->mmap_sem));
3859
+ BUG_ON(mmap_read_trylock(mm));
35913860 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
35923861
35933862 for (vma = mm->mmap; vma; vma = vma->vm_next) {