.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * mm/mmap.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
52 | 53 | #include <asm/tlb.h> |
---|
53 | 54 | #include <asm/mmu_context.h> |
---|
54 | 55 | |
---|
| 56 | +#define CREATE_TRACE_POINTS |
---|
| 57 | +#include <trace/events/mmap.h> |
---|
| 58 | +#undef CREATE_TRACE_POINTS |
---|
| 59 | +#include <trace/hooks/mm.h> |
---|
55 | 60 | #include "internal.h" |
---|
56 | 61 | |
---|
57 | 62 | #ifndef arch_mmap_check |
---|
.. | .. |
---|
128 | 133 | vm_flags &= ~VM_SHARED; |
---|
129 | 134 | vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); |
---|
130 | 135 | } |
---|
131 | | - /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ |
---|
| 136 | + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ |
---|
132 | 137 | WRITE_ONCE(vma->vm_page_prot, vm_page_prot); |
---|
133 | 138 | } |
---|
134 | 139 | |
---|
.. | .. |
---|
139 | 144 | struct file *file, struct address_space *mapping) |
---|
140 | 145 | { |
---|
141 | 146 | if (vma->vm_flags & VM_DENYWRITE) |
---|
142 | | - atomic_inc(&file_inode(file)->i_writecount); |
---|
| 147 | + allow_write_access(file); |
---|
143 | 148 | if (vma->vm_flags & VM_SHARED) |
---|
144 | 149 | mapping_unmap_writable(mapping); |
---|
145 | 150 | |
---|
.. | .. |
---|
164 | 169 | } |
---|
165 | 170 | } |
---|
166 | 171 | |
---|
| 172 | +static void __free_vma(struct vm_area_struct *vma) |
---|
| 173 | +{ |
---|
| 174 | + if (vma->vm_file) |
---|
| 175 | + fput(vma->vm_file); |
---|
| 176 | + mpol_put(vma_policy(vma)); |
---|
| 177 | + vm_area_free(vma); |
---|
| 178 | +} |
---|
| 179 | + |
---|
| 180 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 181 | +void put_vma(struct vm_area_struct *vma) |
---|
| 182 | +{ |
---|
| 183 | + if (atomic_dec_and_test(&vma->vm_ref_count)) |
---|
| 184 | + __free_vma(vma); |
---|
| 185 | +} |
---|
| 186 | +#else |
---|
| 187 | +static inline void put_vma(struct vm_area_struct *vma) |
---|
| 188 | +{ |
---|
| 189 | + __free_vma(vma); |
---|
| 190 | +} |
---|
| 191 | +#endif |
---|
| 192 | + |
---|
167 | 193 | /* |
---|
168 | 194 | * Close a vm structure and free it, returning the next. |
---|
169 | 195 | */ |
---|
.. | .. |
---|
174 | 200 | might_sleep(); |
---|
175 | 201 | if (vma->vm_ops && vma->vm_ops->close) |
---|
176 | 202 | vma->vm_ops->close(vma); |
---|
177 | | - if (vma->vm_file) |
---|
178 | | - fput(vma->vm_file); |
---|
179 | | - mpol_put(vma_policy(vma)); |
---|
180 | | - vm_area_free(vma); |
---|
| 203 | + put_vma(vma); |
---|
181 | 204 | return next; |
---|
182 | 205 | } |
---|
183 | 206 | |
---|
.. | .. |
---|
186 | 209 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
---|
187 | 210 | { |
---|
188 | 211 | unsigned long retval; |
---|
189 | | - unsigned long newbrk, oldbrk; |
---|
| 212 | + unsigned long newbrk, oldbrk, origbrk; |
---|
190 | 213 | struct mm_struct *mm = current->mm; |
---|
191 | 214 | struct vm_area_struct *next; |
---|
192 | 215 | unsigned long min_brk; |
---|
193 | 216 | bool populate; |
---|
| 217 | + bool downgraded = false; |
---|
194 | 218 | LIST_HEAD(uf); |
---|
195 | 219 | |
---|
196 | | - if (down_write_killable(&mm->mmap_sem)) |
---|
| 220 | + if (mmap_write_lock_killable(mm)) |
---|
197 | 221 | return -EINTR; |
---|
| 222 | + |
---|
| 223 | + origbrk = mm->brk; |
---|
198 | 224 | |
---|
199 | 225 | #ifdef CONFIG_COMPAT_BRK |
---|
200 | 226 | /* |
---|
.. | .. |
---|
224 | 250 | |
---|
225 | 251 | newbrk = PAGE_ALIGN(brk); |
---|
226 | 252 | oldbrk = PAGE_ALIGN(mm->brk); |
---|
227 | | - if (oldbrk == newbrk) |
---|
228 | | - goto set_brk; |
---|
| 253 | + if (oldbrk == newbrk) { |
---|
| 254 | + mm->brk = brk; |
---|
| 255 | + goto success; |
---|
| 256 | + } |
---|
229 | 257 | |
---|
230 | | - /* Always allow shrinking brk. */ |
---|
| 258 | + /* |
---|
| 259 | + * Always allow shrinking brk. |
---|
| 260 | + * __do_munmap() may downgrade mmap_lock to read. |
---|
| 261 | + */ |
---|
231 | 262 | if (brk <= mm->brk) { |
---|
232 | | - if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) |
---|
233 | | - goto set_brk; |
---|
234 | | - goto out; |
---|
| 263 | + int ret; |
---|
| 264 | + |
---|
| 265 | + /* |
---|
| 266 | + * mm->brk must be protected by write mmap_lock so update it |
---|
| 267 | + * before downgrading mmap_lock. When __do_munmap() fails, |
---|
| 268 | + * mm->brk will be restored from origbrk. |
---|
| 269 | + */ |
---|
| 270 | + mm->brk = brk; |
---|
| 271 | + ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); |
---|
| 272 | + if (ret < 0) { |
---|
| 273 | + mm->brk = origbrk; |
---|
| 274 | + goto out; |
---|
| 275 | + } else if (ret == 1) { |
---|
| 276 | + downgraded = true; |
---|
| 277 | + } |
---|
| 278 | + goto success; |
---|
235 | 279 | } |
---|
236 | 280 | |
---|
237 | 281 | /* Check against existing mmap mappings. */ |
---|
.. | .. |
---|
242 | 286 | /* Ok, looks good - let it rip. */ |
---|
243 | 287 | if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) |
---|
244 | 288 | goto out; |
---|
245 | | - |
---|
246 | | -set_brk: |
---|
247 | 289 | mm->brk = brk; |
---|
| 290 | + |
---|
| 291 | +success: |
---|
248 | 292 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; |
---|
249 | | - up_write(&mm->mmap_sem); |
---|
| 293 | + if (downgraded) |
---|
| 294 | + mmap_read_unlock(mm); |
---|
| 295 | + else |
---|
| 296 | + mmap_write_unlock(mm); |
---|
250 | 297 | userfaultfd_unmap_complete(mm, &uf); |
---|
251 | 298 | if (populate) |
---|
252 | 299 | mm_populate(oldbrk, newbrk - oldbrk); |
---|
253 | 300 | return brk; |
---|
254 | 301 | |
---|
255 | 302 | out: |
---|
256 | | - retval = mm->brk; |
---|
257 | | - up_write(&mm->mmap_sem); |
---|
| 303 | + retval = origbrk; |
---|
| 304 | + mmap_write_unlock(mm); |
---|
258 | 305 | return retval; |
---|
259 | 306 | } |
---|
260 | 307 | |
---|
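The brk() hunk above introduces the mmap_lock downgrade pattern: the write lock is taken, mm->brk is updated while still write-locked (and restored from origbrk on failure), and __do_munmap() may downgrade the lock to read mode, signalling that by returning 1, so the bulk of the unmap and the final unlock run under the cheaper read lock. A minimal sketch of the shrink path only, using just the helpers visible in this hunk (not a substitute for the full syscall):

    /*
     * Sketch of the brk shrink path: mm->brk is written under the write
     * lock, __do_munmap() may hand back a read lock (return value 1),
     * and the matching unlock is chosen accordingly.
     */
    static long brk_shrink_sketch(struct mm_struct *mm, unsigned long brk,
                                  struct list_head *uf)
    {
            unsigned long newbrk, oldbrk, origbrk;
            bool downgraded = false;
            int ret;

            if (mmap_write_lock_killable(mm))
                    return -EINTR;

            origbrk = mm->brk;
            newbrk = PAGE_ALIGN(brk);
            oldbrk = PAGE_ALIGN(mm->brk);

            mm->brk = brk;                  /* updated while still write-locked */
            ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true /* downgrade */);
            if (ret < 0) {
                    mm->brk = origbrk;      /* munmap failed: restore */
                    mmap_write_unlock(mm);
                    return ret;
            }
            if (ret == 1)                   /* __do_munmap() left us holding the read lock */
                    downgraded = true;

            if (downgraded)
                    mmap_read_unlock(mm);
            else
                    mmap_write_unlock(mm);
            return 0;
    }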
261 | | -static long vma_compute_subtree_gap(struct vm_area_struct *vma) |
---|
| 308 | +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) |
---|
262 | 309 | { |
---|
263 | | - unsigned long max, prev_end, subtree_gap; |
---|
| 310 | + unsigned long gap, prev_end; |
---|
264 | 311 | |
---|
265 | 312 | /* |
---|
266 | 313 | * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we |
---|
.. | .. |
---|
268 | 315 | * an unmapped area; whereas when expanding we only require one. |
---|
269 | 316 | * That's a little inconsistent, but keeps the code here simpler. |
---|
270 | 317 | */ |
---|
271 | | - max = vm_start_gap(vma); |
---|
| 318 | + gap = vm_start_gap(vma); |
---|
272 | 319 | if (vma->vm_prev) { |
---|
273 | 320 | prev_end = vm_end_gap(vma->vm_prev); |
---|
274 | | - if (max > prev_end) |
---|
275 | | - max -= prev_end; |
---|
| 321 | + if (gap > prev_end) |
---|
| 322 | + gap -= prev_end; |
---|
276 | 323 | else |
---|
277 | | - max = 0; |
---|
| 324 | + gap = 0; |
---|
278 | 325 | } |
---|
| 326 | + return gap; |
---|
| 327 | +} |
---|
| 328 | + |
---|
| 329 | +#ifdef CONFIG_DEBUG_VM_RB |
---|
| 330 | +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) |
---|
| 331 | +{ |
---|
| 332 | + unsigned long max = vma_compute_gap(vma), subtree_gap; |
---|
279 | 333 | if (vma->vm_rb.rb_left) { |
---|
280 | 334 | subtree_gap = rb_entry(vma->vm_rb.rb_left, |
---|
281 | 335 | struct vm_area_struct, vm_rb)->rb_subtree_gap; |
---|
.. | .. |
---|
291 | 345 | return max; |
---|
292 | 346 | } |
---|
293 | 347 | |
---|
294 | | -#ifdef CONFIG_DEBUG_VM_RB |
---|
295 | 348 | static int browse_rb(struct mm_struct *mm) |
---|
296 | 349 | { |
---|
297 | 350 | struct rb_root *root = &mm->mm_rb; |
---|
.. | .. |
---|
397 | 450 | #define validate_mm(mm) do { } while (0) |
---|
398 | 451 | #endif |
---|
399 | 452 | |
---|
400 | | -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, |
---|
401 | | - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) |
---|
| 453 | +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, |
---|
| 454 | + struct vm_area_struct, vm_rb, |
---|
| 455 | + unsigned long, rb_subtree_gap, vma_compute_gap) |
---|
| 456 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 457 | +#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock) |
---|
| 458 | +#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock) |
---|
| 459 | +#else |
---|
| 460 | +#define mm_rb_write_lock(mm) do { } while (0) |
---|
| 461 | +#define mm_rb_write_unlock(mm) do { } while (0) |
---|
| 462 | +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ |
---|
402 | 463 | |
---|
403 | 464 | /* |
---|
404 | 465 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or |
---|
.. | .. |
---|
408 | 469 | static void vma_gap_update(struct vm_area_struct *vma) |
---|
409 | 470 | { |
---|
410 | 471 | /* |
---|
411 | | - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback |
---|
412 | | - * function that does exacltly what we want. |
---|
| 472 | + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created |
---|
| 473 | + * a callback function that does exactly what we want. |
---|
413 | 474 | */ |
---|
414 | 475 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); |
---|
415 | 476 | } |
---|
416 | 477 | |
---|
417 | 478 | static inline void vma_rb_insert(struct vm_area_struct *vma, |
---|
418 | | - struct rb_root *root) |
---|
| 479 | + struct mm_struct *mm) |
---|
419 | 480 | { |
---|
| 481 | + struct rb_root *root = &mm->mm_rb; |
---|
| 482 | + |
---|
420 | 483 | /* All rb_subtree_gap values must be consistent prior to insertion */ |
---|
421 | 484 | validate_mm_rb(root, NULL); |
---|
422 | 485 | |
---|
423 | 486 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
---|
424 | 487 | } |
---|
425 | 488 | |
---|
426 | | -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) |
---|
| 489 | +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm) |
---|
427 | 490 | { |
---|
| 491 | + struct rb_root *root = &mm->mm_rb; |
---|
428 | 492 | /* |
---|
429 | 493 | * Note rb_erase_augmented is a fairly large inline function, |
---|
430 | 494 | * so make sure we instantiate it only once with our desired |
---|
431 | 495 | * augmented rbtree callbacks. |
---|
432 | 496 | */ |
---|
| 497 | + mm_rb_write_lock(mm); |
---|
433 | 498 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); |
---|
| 499 | + mm_rb_write_unlock(mm); /* wmb */ |
---|
| 500 | + |
---|
| 501 | + /* |
---|
| 502 | + * Ensure the removal is complete before clearing the node. |
---|
| 503 | + * Matched by vma_has_changed()/handle_speculative_fault(). |
---|
| 504 | + */ |
---|
| 505 | + RB_CLEAR_NODE(&vma->vm_rb); |
---|
434 | 506 | } |
---|
435 | 507 | |
---|
436 | 508 | static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, |
---|
437 | | - struct rb_root *root, |
---|
| 509 | + struct mm_struct *mm, |
---|
438 | 510 | struct vm_area_struct *ignore) |
---|
439 | 511 | { |
---|
440 | 512 | /* |
---|
441 | 513 | * All rb_subtree_gap values must be consistent prior to erase, |
---|
442 | | - * with the possible exception of the "next" vma being erased if |
---|
443 | | - * next->vm_start was reduced. |
---|
| 514 | + * with the possible exception of |
---|
| 515 | + * |
---|
| 516 | + * a. the "next" vma being erased if next->vm_start was reduced in |
---|
| 517 | + * __vma_adjust() -> __vma_unlink() |
---|
| 518 | + * b. the vma being erased in detach_vmas_to_be_unmapped() -> |
---|
| 519 | + * vma_rb_erase() |
---|
444 | 520 | */ |
---|
445 | | - validate_mm_rb(root, ignore); |
---|
| 521 | + validate_mm_rb(&mm->mm_rb, ignore); |
---|
446 | 522 | |
---|
447 | | - __vma_rb_erase(vma, root); |
---|
| 523 | + __vma_rb_erase(vma, mm); |
---|
448 | 524 | } |
---|
449 | 525 | |
---|
450 | 526 | static __always_inline void vma_rb_erase(struct vm_area_struct *vma, |
---|
451 | | - struct rb_root *root) |
---|
| 527 | + struct mm_struct *mm) |
---|
452 | 528 | { |
---|
453 | | - /* |
---|
454 | | - * All rb_subtree_gap values must be consistent prior to erase, |
---|
455 | | - * with the possible exception of the vma being erased. |
---|
456 | | - */ |
---|
457 | | - validate_mm_rb(root, vma); |
---|
458 | | - |
---|
459 | | - __vma_rb_erase(vma, root); |
---|
| 529 | + vma_rb_erase_ignore(vma, mm, vma); |
---|
460 | 530 | } |
---|
461 | 531 | |
---|
462 | 532 | /* |
---|
.. | .. |
---|
470 | 540 | * After the update, the vma will be reinserted using |
---|
471 | 541 | * anon_vma_interval_tree_post_update_vma(). |
---|
472 | 542 | * |
---|
473 | | - * The entire update must be protected by exclusive mmap_sem and by |
---|
| 543 | + * The entire update must be protected by exclusive mmap_lock and by |
---|
474 | 544 | * the root anon_vma's mutex. |
---|
475 | 545 | */ |
---|
476 | 546 | static inline void |
---|
.. | .. |
---|
525 | 595 | return 0; |
---|
526 | 596 | } |
---|
527 | 597 | |
---|
| 598 | +/* |
---|
| 599 | + * vma_next() - Get the next VMA. |
---|
| 600 | + * @mm: The mm_struct. |
---|
| 601 | + * @vma: The current vma. |
---|
| 602 | + * |
---|
| 603 | + * If @vma is NULL, return the first vma in the mm. |
---|
| 604 | + * |
---|
| 605 | + * Returns: The next VMA after @vma. |
---|
| 606 | + */ |
---|
| 607 | +static inline struct vm_area_struct *vma_next(struct mm_struct *mm, |
---|
| 608 | + struct vm_area_struct *vma) |
---|
| 609 | +{ |
---|
| 610 | + if (!vma) |
---|
| 611 | + return mm->mmap; |
---|
| 612 | + |
---|
| 613 | + return vma->vm_next; |
---|
| 614 | +} |
---|
| 615 | + |
---|
| 616 | +/* |
---|
| 617 | + * munmap_vma_range() - munmap VMAs that overlap a range. |
---|
| 618 | + * @mm: The mm struct |
---|
| 619 | + * @start: The start of the range. |
---|
| 620 | + * @len: The length of the range. |
---|
| 621 | + * @pprev: pointer to the pointer that will be set to previous vm_area_struct |
---|
| 622 | + * @rb_link: the rb_node |
---|
| 623 | + * @rb_parent: the parent rb_node |
---|
| 624 | + * |
---|
| 625 | + * Find all the vm_area_struct that overlap from @start to |
---|
| 626 | + * @end and munmap them. Set @pprev to the previous vm_area_struct. |
---|
| 627 | + * |
---|
| 628 | + * Returns: -ENOMEM on munmap failure or 0 on success. |
---|
| 629 | + */ |
---|
| 630 | +static inline int |
---|
| 631 | +munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, |
---|
| 632 | + struct vm_area_struct **pprev, struct rb_node ***link, |
---|
| 633 | + struct rb_node **parent, struct list_head *uf) |
---|
| 634 | +{ |
---|
| 635 | + |
---|
| 636 | + while (find_vma_links(mm, start, start + len, pprev, link, parent)) |
---|
| 637 | + if (do_munmap(mm, start, len, uf)) |
---|
| 638 | + return -ENOMEM; |
---|
| 639 | + |
---|
| 640 | + return 0; |
---|
| 641 | +} |
---|
528 | 642 | static unsigned long count_vma_pages_range(struct mm_struct *mm, |
---|
529 | 643 | unsigned long addr, unsigned long end) |
---|
530 | 644 | { |
---|
.. | .. |
---|
571 | 685 | * immediately update the gap to the correct value. Finally we |
---|
572 | 686 | * rebalance the rbtree after all augmented values have been set. |
---|
573 | 687 | */ |
---|
| 688 | + mm_rb_write_lock(mm); |
---|
574 | 689 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
---|
575 | 690 | vma->rb_subtree_gap = 0; |
---|
576 | 691 | vma_gap_update(vma); |
---|
577 | | - vma_rb_insert(vma, &mm->mm_rb); |
---|
| 692 | + vma_rb_insert(vma, mm); |
---|
| 693 | + mm_rb_write_unlock(mm); |
---|
578 | 694 | } |
---|
579 | 695 | |
---|
580 | 696 | static void __vma_link_file(struct vm_area_struct *vma) |
---|
.. | .. |
---|
586 | 702 | struct address_space *mapping = file->f_mapping; |
---|
587 | 703 | |
---|
588 | 704 | if (vma->vm_flags & VM_DENYWRITE) |
---|
589 | | - atomic_dec(&file_inode(file)->i_writecount); |
---|
| 705 | + put_write_access(file_inode(file)); |
---|
590 | 706 | if (vma->vm_flags & VM_SHARED) |
---|
591 | | - atomic_inc(&mapping->i_mmap_writable); |
---|
| 707 | + mapping_allow_writable(mapping); |
---|
592 | 708 | |
---|
593 | 709 | flush_dcache_mmap_lock(mapping); |
---|
594 | 710 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
---|
.. | .. |
---|
601 | 717 | struct vm_area_struct *prev, struct rb_node **rb_link, |
---|
602 | 718 | struct rb_node *rb_parent) |
---|
603 | 719 | { |
---|
604 | | - __vma_link_list(mm, vma, prev, rb_parent); |
---|
| 720 | + __vma_link_list(mm, vma, prev); |
---|
605 | 721 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
---|
606 | 722 | } |
---|
607 | 723 | |
---|
.. | .. |
---|
642 | 758 | mm->map_count++; |
---|
643 | 759 | } |
---|
644 | 760 | |
---|
645 | | -static __always_inline void __vma_unlink_common(struct mm_struct *mm, |
---|
| 761 | +static __always_inline void __vma_unlink(struct mm_struct *mm, |
---|
646 | 762 | struct vm_area_struct *vma, |
---|
647 | | - struct vm_area_struct *prev, |
---|
648 | | - bool has_prev, |
---|
649 | 763 | struct vm_area_struct *ignore) |
---|
650 | 764 | { |
---|
651 | | - struct vm_area_struct *next; |
---|
652 | | - |
---|
653 | | - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); |
---|
654 | | - next = vma->vm_next; |
---|
655 | | - if (has_prev) |
---|
656 | | - prev->vm_next = next; |
---|
657 | | - else { |
---|
658 | | - prev = vma->vm_prev; |
---|
659 | | - if (prev) |
---|
660 | | - prev->vm_next = next; |
---|
661 | | - else |
---|
662 | | - mm->mmap = next; |
---|
663 | | - } |
---|
664 | | - if (next) |
---|
665 | | - next->vm_prev = prev; |
---|
666 | | - |
---|
| 765 | + vma_rb_erase_ignore(vma, mm, ignore); |
---|
| 766 | + __vma_unlink_list(mm, vma); |
---|
667 | 767 | /* Kill the cache */ |
---|
668 | 768 | vmacache_invalidate(mm); |
---|
669 | | -} |
---|
670 | | - |
---|
671 | | -static inline void __vma_unlink_prev(struct mm_struct *mm, |
---|
672 | | - struct vm_area_struct *vma, |
---|
673 | | - struct vm_area_struct *prev) |
---|
674 | | -{ |
---|
675 | | - __vma_unlink_common(mm, vma, prev, true, vma); |
---|
676 | 769 | } |
---|
677 | 770 | |
---|
678 | 771 | /* |
---|
.. | .. |
---|
684 | 777 | */ |
---|
685 | 778 | int __vma_adjust(struct vm_area_struct *vma, unsigned long start, |
---|
686 | 779 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, |
---|
687 | | - struct vm_area_struct *expand) |
---|
| 780 | + struct vm_area_struct *expand, bool keep_locked) |
---|
688 | 781 | { |
---|
689 | 782 | struct mm_struct *mm = vma->vm_mm; |
---|
690 | 783 | struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; |
---|
.. | .. |
---|
695 | 788 | bool start_changed = false, end_changed = false; |
---|
696 | 789 | long adjust_next = 0; |
---|
697 | 790 | int remove_next = 0; |
---|
| 791 | + |
---|
| 792 | + vm_write_begin(vma); |
---|
| 793 | + if (next) |
---|
| 794 | + vm_write_begin(next); |
---|
698 | 795 | |
---|
699 | 796 | if (next && !insert) { |
---|
700 | 797 | struct vm_area_struct *exporter = NULL, *importer = NULL; |
---|
.. | .. |
---|
729 | 826 | remove_next = 1 + (end > next->vm_end); |
---|
730 | 827 | VM_WARN_ON(remove_next == 2 && |
---|
731 | 828 | end != next->vm_next->vm_end); |
---|
732 | | - VM_WARN_ON(remove_next == 1 && |
---|
733 | | - end != next->vm_end); |
---|
734 | 829 | /* trim end to next, for case 6 first pass */ |
---|
735 | 830 | end = next->vm_end; |
---|
736 | 831 | } |
---|
.. | .. |
---|
750 | 845 | * vma expands, overlapping part of the next: |
---|
751 | 846 | * mprotect case 5 shifting the boundary up. |
---|
752 | 847 | */ |
---|
753 | | - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
---|
| 848 | + adjust_next = (end - next->vm_start); |
---|
754 | 849 | exporter = next; |
---|
755 | 850 | importer = vma; |
---|
756 | 851 | VM_WARN_ON(expand != importer); |
---|
.. | .. |
---|
760 | 855 | * split_vma inserting another: so it must be |
---|
761 | 856 | * mprotect case 4 shifting the boundary down. |
---|
762 | 857 | */ |
---|
763 | | - adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); |
---|
| 858 | + adjust_next = -(vma->vm_end - end); |
---|
764 | 859 | exporter = vma; |
---|
765 | 860 | importer = next; |
---|
766 | 861 | VM_WARN_ON(expand != importer); |
---|
.. | .. |
---|
776 | 871 | |
---|
777 | 872 | importer->anon_vma = exporter->anon_vma; |
---|
778 | 873 | error = anon_vma_clone(importer, exporter); |
---|
779 | | - if (error) |
---|
| 874 | + if (error) { |
---|
| 875 | + if (next && next != vma) |
---|
| 876 | + vm_write_end(next); |
---|
| 877 | + vm_write_end(vma); |
---|
780 | 878 | return error; |
---|
| 879 | + } |
---|
781 | 880 | } |
---|
782 | 881 | } |
---|
783 | 882 | again: |
---|
.. | .. |
---|
815 | 914 | anon_vma_interval_tree_pre_update_vma(next); |
---|
816 | 915 | } |
---|
817 | 916 | |
---|
818 | | - if (root) { |
---|
| 917 | + if (file) { |
---|
819 | 918 | flush_dcache_mmap_lock(mapping); |
---|
820 | 919 | vma_interval_tree_remove(vma, root); |
---|
821 | 920 | if (adjust_next) |
---|
.. | .. |
---|
823 | 922 | } |
---|
824 | 923 | |
---|
825 | 924 | if (start != vma->vm_start) { |
---|
826 | | - vma->vm_start = start; |
---|
| 925 | + WRITE_ONCE(vma->vm_start, start); |
---|
827 | 926 | start_changed = true; |
---|
828 | 927 | } |
---|
829 | 928 | if (end != vma->vm_end) { |
---|
830 | | - vma->vm_end = end; |
---|
| 929 | + WRITE_ONCE(vma->vm_end, end); |
---|
831 | 930 | end_changed = true; |
---|
832 | 931 | } |
---|
833 | | - vma->vm_pgoff = pgoff; |
---|
| 932 | + WRITE_ONCE(vma->vm_pgoff, pgoff); |
---|
834 | 933 | if (adjust_next) { |
---|
835 | | - next->vm_start += adjust_next << PAGE_SHIFT; |
---|
836 | | - next->vm_pgoff += adjust_next; |
---|
| 934 | + WRITE_ONCE(next->vm_start, |
---|
| 935 | + next->vm_start + adjust_next); |
---|
| 936 | + WRITE_ONCE(next->vm_pgoff, |
---|
| 937 | + next->vm_pgoff + (adjust_next >> PAGE_SHIFT)); |
---|
837 | 938 | } |
---|
838 | 939 | |
---|
839 | | - if (root) { |
---|
| 940 | + if (file) { |
---|
840 | 941 | if (adjust_next) |
---|
841 | 942 | vma_interval_tree_insert(next, root); |
---|
842 | 943 | vma_interval_tree_insert(vma, root); |
---|
.. | .. |
---|
849 | 950 | * us to remove next before dropping the locks. |
---|
850 | 951 | */ |
---|
851 | 952 | if (remove_next != 3) |
---|
852 | | - __vma_unlink_prev(mm, next, vma); |
---|
| 953 | + __vma_unlink(mm, next, next); |
---|
853 | 954 | else |
---|
854 | 955 | /* |
---|
855 | 956 | * vma is not before next if they've been |
---|
.. | .. |
---|
860 | 961 | * "next" (which is stored in post-swap() |
---|
861 | 962 | * "vma"). |
---|
862 | 963 | */ |
---|
863 | | - __vma_unlink_common(mm, next, NULL, false, vma); |
---|
| 964 | + __vma_unlink(mm, next, vma); |
---|
864 | 965 | if (file) |
---|
865 | 966 | __remove_shared_vm_struct(next, file, mapping); |
---|
866 | 967 | } else if (insert) { |
---|
.. | .. |
---|
887 | 988 | anon_vma_interval_tree_post_update_vma(next); |
---|
888 | 989 | anon_vma_unlock_write(anon_vma); |
---|
889 | 990 | } |
---|
890 | | - if (mapping) |
---|
891 | | - i_mmap_unlock_write(mapping); |
---|
892 | 991 | |
---|
893 | | - if (root) { |
---|
| 992 | + if (file) { |
---|
| 993 | + i_mmap_unlock_write(mapping); |
---|
894 | 994 | uprobe_mmap(vma); |
---|
895 | 995 | |
---|
896 | 996 | if (adjust_next) |
---|
.. | .. |
---|
898 | 998 | } |
---|
899 | 999 | |
---|
900 | 1000 | if (remove_next) { |
---|
901 | | - if (file) { |
---|
| 1001 | + if (file) |
---|
902 | 1002 | uprobe_munmap(next, next->vm_start, next->vm_end); |
---|
903 | | - fput(file); |
---|
904 | | - } |
---|
905 | 1003 | if (next->anon_vma) |
---|
906 | 1004 | anon_vma_merge(vma, next); |
---|
907 | 1005 | mm->map_count--; |
---|
908 | | - mpol_put(vma_policy(next)); |
---|
909 | | - vm_area_free(next); |
---|
| 1006 | + vm_write_end(next); |
---|
| 1007 | + put_vma(next); |
---|
910 | 1008 | /* |
---|
911 | 1009 | * In mprotect's case 6 (see comments on vma_merge), |
---|
912 | 1010 | * we must remove another next too. It would clutter |
---|
.. | .. |
---|
920 | 1018 | * "vma->vm_next" gap must be updated. |
---|
921 | 1019 | */ |
---|
922 | 1020 | next = vma->vm_next; |
---|
| 1021 | + if (next) |
---|
| 1022 | + vm_write_begin(next); |
---|
923 | 1023 | } else { |
---|
924 | 1024 | /* |
---|
925 | 1025 | * For the scope of the comment "next" and |
---|
.. | .. |
---|
966 | 1066 | if (insert && file) |
---|
967 | 1067 | uprobe_mmap(insert); |
---|
968 | 1068 | |
---|
| 1069 | + if (next && next != vma) |
---|
| 1070 | + vm_write_end(next); |
---|
| 1071 | + if (!keep_locked) |
---|
| 1072 | + vm_write_end(vma); |
---|
| 1073 | + |
---|
969 | 1074 | validate_mm(mm); |
---|
970 | 1075 | |
---|
971 | 1076 | return 0; |
---|
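In the __vma_adjust() changes above, every update to vm_start, vm_end and vm_pgoff is bracketed by vm_write_begin()/vm_write_end() and converted to WRITE_ONCE() so that CONFIG_SPECULATIVE_PAGE_FAULT readers, which walk VMAs without the mmap_lock, never observe a torn or mid-update VMA. The implementation of those helpers is not part of this hunk; a common SPF approach is a per-VMA sequence counter, sketched below with an assumed field name (vm_sequence) and a stand-in struct rather than the real vm_area_struct:

    #include <linux/seqlock.h>

    /*
     * Illustrative only: SPF-style trees typically add a per-VMA sequence
     * counter so lockless readers can detect a concurrent __vma_adjust().
     * The vm_sequence field name is an assumption, not taken from this hunk.
     */
    struct spf_vma_sketch {
            unsigned long vm_start, vm_end;
            seqcount_t vm_sequence;
    };

    static void adjust_sketch(struct spf_vma_sketch *vma,
                              unsigned long start, unsigned long end)
    {
            write_seqcount_begin(&vma->vm_sequence);        /* vm_write_begin() */
            WRITE_ONCE(vma->vm_start, start);       /* no torn reads for spec. readers */
            WRITE_ONCE(vma->vm_end, end);
            write_seqcount_end(&vma->vm_sequence);          /* vm_write_end() */
    }

    static bool read_sketch(struct spf_vma_sketch *vma,
                            unsigned long *start, unsigned long *end)
    {
            unsigned int seq = read_seqcount_begin(&vma->vm_sequence);

            *start = READ_ONCE(vma->vm_start);
            *end = READ_ONCE(vma->vm_end);
            /* Caller retries (or falls back to the mmap_lock path) if a writer raced. */
            return !read_seqcount_retry(&vma->vm_sequence, seq);
    }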
.. | .. |
---|
984 | 1089 | * VM_SOFTDIRTY should not prevent from VMA merging, if we |
---|
985 | 1090 | * match the flags but dirty bit -- the caller should mark |
---|
986 | 1091 | * merged VMA as dirty. If dirty bit won't be excluded from |
---|
987 | | - * comparison, we increase pressue on the memory system forcing |
---|
| 1092 | + * comparison, we increase pressure on the memory system forcing |
---|
988 | 1093 | * the kernel to generate new VMAs when old one could be |
---|
989 | 1094 | * extended instead. |
---|
990 | 1095 | */ |
---|
.. | .. |
---|
1023 | 1128 | * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. |
---|
1024 | 1129 | * |
---|
1025 | 1130 | * We don't check here for the merged mmap wrapping around the end of pagecache |
---|
1026 | | - * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which |
---|
| 1131 | + * indices (16TB on ia32) because do_mmap() does not permit mmap's which |
---|
1027 | 1132 | * wrap, nor mmaps which cover the final page at index -1UL. |
---|
1028 | 1133 | */ |
---|
1029 | 1134 | static int |
---|
.. | .. |
---|
1081 | 1186 | * the area passed down from mprotect_fixup, never extending beyond one |
---|
1082 | 1187 | * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: |
---|
1083 | 1188 | * |
---|
1084 | | - * AAAA AAAA AAAA AAAA |
---|
1085 | | - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX |
---|
1086 | | - * cannot merge might become might become might become |
---|
1087 | | - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or |
---|
1088 | | - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or |
---|
1089 | | - * mremap move: PPPPXXXXXXXX 8 |
---|
1090 | | - * AAAA |
---|
1091 | | - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN |
---|
1092 | | - * might become case 1 below case 2 below case 3 below |
---|
| 1189 | + * AAAA AAAA AAAA |
---|
| 1190 | + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN |
---|
| 1191 | + * cannot merge might become might become |
---|
| 1192 | + * PPNNNNNNNNNN PPPPPPPPPPNN |
---|
| 1193 | + * mmap, brk or case 4 below case 5 below |
---|
| 1194 | + * mremap move: |
---|
| 1195 | + * AAAA AAAA |
---|
| 1196 | + * PPPP NNNN PPPPNNNNXXXX |
---|
| 1197 | + * might become might become |
---|
| 1198 | + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or |
---|
| 1199 | + * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or |
---|
| 1200 | + * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8 |
---|
1093 | 1201 | * |
---|
1094 | | - * It is important for case 8 that the the vma NNNN overlapping the |
---|
| 1202 | + * It is important for case 8 that the vma NNNN overlapping the |
---|
1095 | 1203 | * region AAAA is never going to extended over XXXX. Instead XXXX must |
---|
1096 | 1204 | * be extended in region AAAA and NNNN must be removed. This way in |
---|
1097 | 1205 | * all cases where vma_merge succeeds, the moment vma_adjust drops the |
---|
.. | .. |
---|
1105 | 1213 | * parameter) may establish ptes with the wrong permissions of NNNN |
---|
1106 | 1214 | * instead of the right permissions of XXXX. |
---|
1107 | 1215 | */ |
---|
1108 | | -struct vm_area_struct *vma_merge(struct mm_struct *mm, |
---|
| 1216 | +struct vm_area_struct *__vma_merge(struct mm_struct *mm, |
---|
1109 | 1217 | struct vm_area_struct *prev, unsigned long addr, |
---|
1110 | 1218 | unsigned long end, unsigned long vm_flags, |
---|
1111 | 1219 | struct anon_vma *anon_vma, struct file *file, |
---|
1112 | 1220 | pgoff_t pgoff, struct mempolicy *policy, |
---|
1113 | 1221 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx, |
---|
1114 | | - const char __user *anon_name) |
---|
| 1222 | + const char __user *anon_name, bool keep_locked) |
---|
1115 | 1223 | { |
---|
1116 | 1224 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
---|
1117 | 1225 | struct vm_area_struct *area, *next; |
---|
.. | .. |
---|
1124 | 1232 | if (vm_flags & VM_SPECIAL) |
---|
1125 | 1233 | return NULL; |
---|
1126 | 1234 | |
---|
1127 | | - if (prev) |
---|
1128 | | - next = prev->vm_next; |
---|
1129 | | - else |
---|
1130 | | - next = mm->mmap; |
---|
| 1235 | + next = vma_next(mm, prev); |
---|
1131 | 1236 | area = next; |
---|
1132 | 1237 | if (area && area->vm_end == end) /* cases 6, 7, 8 */ |
---|
1133 | 1238 | next = next->vm_next; |
---|
.. | .. |
---|
1161 | 1266 | /* cases 1, 6 */ |
---|
1162 | 1267 | err = __vma_adjust(prev, prev->vm_start, |
---|
1163 | 1268 | next->vm_end, prev->vm_pgoff, NULL, |
---|
1164 | | - prev); |
---|
| 1269 | + prev, keep_locked); |
---|
1165 | 1270 | } else /* cases 2, 5, 7 */ |
---|
1166 | 1271 | err = __vma_adjust(prev, prev->vm_start, |
---|
1167 | | - end, prev->vm_pgoff, NULL, prev); |
---|
| 1272 | + end, prev->vm_pgoff, NULL, prev, |
---|
| 1273 | + keep_locked); |
---|
1168 | 1274 | if (err) |
---|
1169 | 1275 | return NULL; |
---|
1170 | 1276 | khugepaged_enter_vma_merge(prev, vm_flags); |
---|
.. | .. |
---|
1182 | 1288 | anon_name)) { |
---|
1183 | 1289 | if (prev && addr < prev->vm_end) /* case 4 */ |
---|
1184 | 1290 | err = __vma_adjust(prev, prev->vm_start, |
---|
1185 | | - addr, prev->vm_pgoff, NULL, next); |
---|
| 1291 | + addr, prev->vm_pgoff, NULL, next, |
---|
| 1292 | + keep_locked); |
---|
1186 | 1293 | else { /* cases 3, 8 */ |
---|
1187 | 1294 | err = __vma_adjust(area, addr, next->vm_end, |
---|
1188 | | - next->vm_pgoff - pglen, NULL, next); |
---|
| 1295 | + next->vm_pgoff - pglen, NULL, next, |
---|
| 1296 | + keep_locked); |
---|
1189 | 1297 | /* |
---|
1190 | 1298 | * In case 3 area is already equal to next and |
---|
1191 | 1299 | * this is a noop, but in case 8 "area" has |
---|
.. | .. |
---|
1203 | 1311 | } |
---|
1204 | 1312 | |
---|
1205 | 1313 | /* |
---|
1206 | | - * Rough compatbility check to quickly see if it's even worth looking |
---|
| 1314 | + * Rough compatibility check to quickly see if it's even worth looking |
---|
1207 | 1315 | * at sharing an anon_vma. |
---|
1208 | 1316 | * |
---|
1209 | 1317 | * They need to have the same vm_file, and the flags can only differ |
---|
.. | .. |
---|
1220 | 1328 | return a->vm_end == b->vm_start && |
---|
1221 | 1329 | mpol_equal(vma_policy(a), vma_policy(b)) && |
---|
1222 | 1330 | a->vm_file == b->vm_file && |
---|
1223 | | - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && |
---|
| 1331 | + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && |
---|
1224 | 1332 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); |
---|
1225 | 1333 | } |
---|
1226 | 1334 | |
---|
.. | .. |
---|
1267 | 1375 | */ |
---|
1268 | 1376 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
---|
1269 | 1377 | { |
---|
1270 | | - struct anon_vma *anon_vma; |
---|
1271 | | - struct vm_area_struct *near; |
---|
| 1378 | + struct anon_vma *anon_vma = NULL; |
---|
1272 | 1379 | |
---|
1273 | | - near = vma->vm_next; |
---|
1274 | | - if (!near) |
---|
1275 | | - goto try_prev; |
---|
| 1380 | + /* Try next first. */ |
---|
| 1381 | + if (vma->vm_next) { |
---|
| 1382 | + anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); |
---|
| 1383 | + if (anon_vma) |
---|
| 1384 | + return anon_vma; |
---|
| 1385 | + } |
---|
1276 | 1386 | |
---|
1277 | | - anon_vma = reusable_anon_vma(near, vma, near); |
---|
1278 | | - if (anon_vma) |
---|
1279 | | - return anon_vma; |
---|
1280 | | -try_prev: |
---|
1281 | | - near = vma->vm_prev; |
---|
1282 | | - if (!near) |
---|
1283 | | - goto none; |
---|
| 1387 | + /* Try prev next. */ |
---|
| 1388 | + if (vma->vm_prev) |
---|
| 1389 | + anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); |
---|
1284 | 1390 | |
---|
1285 | | - anon_vma = reusable_anon_vma(near, near, vma); |
---|
1286 | | - if (anon_vma) |
---|
1287 | | - return anon_vma; |
---|
1288 | | -none: |
---|
1289 | 1391 | /* |
---|
| 1392 | + * We might reach here with anon_vma == NULL if we can't find |
---|
| 1393 | + * any reusable anon_vma. |
---|
1290 | 1394 | * There's no absolute need to look only at touching neighbours: |
---|
1291 | 1395 | * we could search further afield for "compatible" anon_vmas. |
---|
1292 | 1396 | * But it would probably just be a waste of time searching, |
---|
.. | .. |
---|
1294 | 1398 | * We're trying to allow mprotect remerging later on, |
---|
1295 | 1399 | * not trying to minimize memory used for anon_vmas. |
---|
1296 | 1400 | */ |
---|
1297 | | - return NULL; |
---|
| 1401 | + return anon_vma; |
---|
1298 | 1402 | } |
---|
1299 | 1403 | |
---|
1300 | 1404 | /* |
---|
.. | .. |
---|
1336 | 1440 | if (S_ISBLK(inode->i_mode)) |
---|
1337 | 1441 | return MAX_LFS_FILESIZE; |
---|
1338 | 1442 | |
---|
| 1443 | + if (S_ISSOCK(inode->i_mode)) |
---|
| 1444 | + return MAX_LFS_FILESIZE; |
---|
| 1445 | + |
---|
1339 | 1446 | /* Special "we do even unsigned file positions" case */ |
---|
1340 | 1447 | if (file->f_mode & FMODE_UNSIGNED_OFFSET) |
---|
1341 | 1448 | return 0; |
---|
.. | .. |
---|
1358 | 1465 | } |
---|
1359 | 1466 | |
---|
1360 | 1467 | /* |
---|
1361 | | - * The caller must hold down_write(¤t->mm->mmap_sem). |
---|
| 1468 | + * The caller must write-lock current->mm->mmap_lock. |
---|
1362 | 1469 | */ |
---|
1363 | 1470 | unsigned long do_mmap(struct file *file, unsigned long addr, |
---|
1364 | 1471 | unsigned long len, unsigned long prot, |
---|
1365 | | - unsigned long flags, vm_flags_t vm_flags, |
---|
1366 | | - unsigned long pgoff, unsigned long *populate, |
---|
1367 | | - struct list_head *uf) |
---|
| 1472 | + unsigned long flags, unsigned long pgoff, |
---|
| 1473 | + unsigned long *populate, struct list_head *uf) |
---|
1368 | 1474 | { |
---|
1369 | 1475 | struct mm_struct *mm = current->mm; |
---|
| 1476 | + vm_flags_t vm_flags; |
---|
1370 | 1477 | int pkey = 0; |
---|
1371 | 1478 | |
---|
1372 | 1479 | *populate = 0; |
---|
.. | .. |
---|
1408 | 1515 | * that it represents a valid section of the address space. |
---|
1409 | 1516 | */ |
---|
1410 | 1517 | addr = get_unmapped_area(file, addr, len, pgoff, flags); |
---|
1411 | | - if (offset_in_page(addr)) |
---|
| 1518 | + if (IS_ERR_VALUE(addr)) |
---|
1412 | 1519 | return addr; |
---|
1413 | 1520 | |
---|
1414 | 1521 | if (flags & MAP_FIXED_NOREPLACE) { |
---|
.. | .. |
---|
1428 | 1535 | * to. we assume access permissions have been handled by the open |
---|
1429 | 1536 | * of the memory object, so we don't do any here. |
---|
1430 | 1537 | */ |
---|
1431 | | - vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | |
---|
| 1538 | + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | |
---|
1432 | 1539 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
---|
1433 | 1540 | |
---|
1434 | 1541 | if (flags & MAP_LOCKED) |
---|
.. | .. |
---|
1457 | 1564 | * with MAP_SHARED to preserve backward compatibility. |
---|
1458 | 1565 | */ |
---|
1459 | 1566 | flags &= LEGACY_MAP_MASK; |
---|
1460 | | - /* fall through */ |
---|
| 1567 | + fallthrough; |
---|
1461 | 1568 | case MAP_SHARED_VALIDATE: |
---|
1462 | 1569 | if (flags & ~flags_mask) |
---|
1463 | 1570 | return -EOPNOTSUPP; |
---|
.. | .. |
---|
1484 | 1591 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
---|
1485 | 1592 | if (!(file->f_mode & FMODE_WRITE)) |
---|
1486 | 1593 | vm_flags &= ~(VM_MAYWRITE | VM_SHARED); |
---|
1487 | | - |
---|
1488 | | - /* fall through */ |
---|
| 1594 | + fallthrough; |
---|
1489 | 1595 | case MAP_PRIVATE: |
---|
1490 | 1596 | if (!(file->f_mode & FMODE_READ)) |
---|
1491 | 1597 | return -EACCES; |
---|
.. | .. |
---|
1560 | 1666 | file = fget(fd); |
---|
1561 | 1667 | if (!file) |
---|
1562 | 1668 | return -EBADF; |
---|
1563 | | - if (is_file_hugepages(file)) |
---|
| 1669 | + if (is_file_hugepages(file)) { |
---|
1564 | 1670 | len = ALIGN(len, huge_page_size(hstate_file(file))); |
---|
1565 | | - retval = -EINVAL; |
---|
1566 | | - if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) |
---|
| 1671 | + } else if (unlikely(flags & MAP_HUGETLB)) { |
---|
| 1672 | + retval = -EINVAL; |
---|
1567 | 1673 | goto out_fput; |
---|
| 1674 | + } |
---|
1568 | 1675 | } else if (flags & MAP_HUGETLB) { |
---|
1569 | 1676 | struct user_struct *user = NULL; |
---|
1570 | 1677 | struct hstate *hs; |
---|
.. | .. |
---|
1629 | 1736 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ |
---|
1630 | 1737 | |
---|
1631 | 1738 | /* |
---|
1632 | | - * Some shared mappigns will want the pages marked read-only |
---|
| 1739 | + * Some shared mappings will want the pages marked read-only |
---|
1633 | 1740 | * to track write events. If so, we'll downgrade vm_page_prot |
---|
1634 | 1741 | * to the private version (using protection_map[] without the |
---|
1635 | 1742 | * VM_SHARED bit). |
---|
.. | .. |
---|
1653 | 1760 | pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) |
---|
1654 | 1761 | return 0; |
---|
1655 | 1762 | |
---|
1656 | | - /* Do we need to track softdirty? */ |
---|
1657 | | - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) |
---|
| 1763 | + /* |
---|
| 1764 | + * Do we need to track softdirty? hugetlb does not support softdirty |
---|
| 1765 | + * tracking yet. |
---|
| 1766 | + */ |
---|
| 1767 | + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) && |
---|
| 1768 | + !is_vm_hugetlb_page(vma)) |
---|
1658 | 1769 | return 1; |
---|
1659 | 1770 | |
---|
1660 | 1771 | /* Specialty mapping? */ |
---|
.. | .. |
---|
1663 | 1774 | |
---|
1664 | 1775 | /* Can the mapping track the dirty pages? */ |
---|
1665 | 1776 | return vma->vm_file && vma->vm_file->f_mapping && |
---|
1666 | | - mapping_cap_account_dirty(vma->vm_file->f_mapping); |
---|
| 1777 | + mapping_can_writeback(vma->vm_file->f_mapping); |
---|
1667 | 1778 | } |
---|
1668 | 1779 | |
---|
1669 | 1780 | /* |
---|
.. | .. |
---|
1687 | 1798 | struct list_head *uf) |
---|
1688 | 1799 | { |
---|
1689 | 1800 | struct mm_struct *mm = current->mm; |
---|
1690 | | - struct vm_area_struct *vma, *prev; |
---|
| 1801 | + struct vm_area_struct *vma, *prev, *merge; |
---|
1691 | 1802 | int error; |
---|
1692 | 1803 | struct rb_node **rb_link, *rb_parent; |
---|
1693 | 1804 | unsigned long charged = 0; |
---|
.. | .. |
---|
1707 | 1818 | return -ENOMEM; |
---|
1708 | 1819 | } |
---|
1709 | 1820 | |
---|
1710 | | - /* Clear old maps */ |
---|
1711 | | - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
---|
1712 | | - &rb_parent)) { |
---|
1713 | | - if (do_munmap(mm, addr, len, uf)) |
---|
1714 | | - return -ENOMEM; |
---|
1715 | | - } |
---|
1716 | | - |
---|
| 1821 | + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ |
---|
| 1822 | + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) |
---|
| 1823 | + return -ENOMEM; |
---|
1717 | 1824 | /* |
---|
1718 | 1825 | * Private writable mapping: check memory availability |
---|
1719 | 1826 | */ |
---|
.. | .. |
---|
1781 | 1888 | WARN_ON_ONCE(addr != vma->vm_start); |
---|
1782 | 1889 | |
---|
1783 | 1890 | addr = vma->vm_start; |
---|
| 1891 | + |
---|
| 1892 | + /* If vm_flags changed after call_mmap(), we should try merge vma again |
---|
| 1893 | + * as we may succeed this time. |
---|
| 1894 | + */ |
---|
| 1895 | + if (unlikely(vm_flags != vma->vm_flags && prev)) { |
---|
| 1896 | + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, |
---|
| 1897 | + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, |
---|
| 1898 | + vma_get_anon_name(vma)); |
---|
| 1899 | + if (merge) { |
---|
| 1900 | + /* ->mmap() can change vma->vm_file and fput the original file. So |
---|
| 1901 | + * fput the vma->vm_file here or we would add an extra fput for file |
---|
| 1902 | + * and cause general protection fault ultimately. |
---|
| 1903 | + */ |
---|
| 1904 | + fput(vma->vm_file); |
---|
| 1905 | + vm_area_free(vma); |
---|
| 1906 | + vma = merge; |
---|
| 1907 | + /* Update vm_flags to pick up the change. */ |
---|
| 1908 | + vm_flags = vma->vm_flags; |
---|
| 1909 | + goto unmap_writable; |
---|
| 1910 | + } |
---|
| 1911 | + } |
---|
| 1912 | + |
---|
1784 | 1913 | vm_flags = vma->vm_flags; |
---|
1785 | 1914 | } else if (vm_flags & VM_SHARED) { |
---|
1786 | 1915 | error = shmem_zero_setup(vma); |
---|
.. | .. |
---|
1790 | 1919 | vma_set_anonymous(vma); |
---|
1791 | 1920 | } |
---|
1792 | 1921 | |
---|
| 1922 | + /* Allow architectures to sanity-check the vm_flags */ |
---|
| 1923 | + if (!arch_validate_flags(vma->vm_flags)) { |
---|
| 1924 | + error = -EINVAL; |
---|
| 1925 | + if (file) |
---|
| 1926 | + goto close_and_free_vma; |
---|
| 1927 | + else |
---|
| 1928 | + goto free_vma; |
---|
| 1929 | + } |
---|
| 1930 | + |
---|
1793 | 1931 | vma_link(mm, vma, prev, rb_link, rb_parent); |
---|
1794 | 1932 | /* Once vma denies write, undo our temporary denial count */ |
---|
1795 | 1933 | if (file) { |
---|
| 1934 | +unmap_writable: |
---|
1796 | 1935 | if (vm_flags & VM_SHARED) |
---|
1797 | 1936 | mapping_unmap_writable(file->f_mapping); |
---|
1798 | 1937 | if (vm_flags & VM_DENYWRITE) |
---|
.. | .. |
---|
1802 | 1941 | out: |
---|
1803 | 1942 | perf_event_mmap(vma); |
---|
1804 | 1943 | |
---|
| 1944 | + vm_write_begin(vma); |
---|
1805 | 1945 | vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); |
---|
1806 | 1946 | if (vm_flags & VM_LOCKED) { |
---|
1807 | 1947 | if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || |
---|
1808 | 1948 | is_vm_hugetlb_page(vma) || |
---|
1809 | 1949 | vma == get_gate_vma(current->mm)) |
---|
1810 | | - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; |
---|
| 1950 | + WRITE_ONCE(vma->vm_flags, |
---|
| 1951 | + vma->vm_flags & VM_LOCKED_CLEAR_MASK); |
---|
1811 | 1952 | else |
---|
1812 | 1953 | mm->locked_vm += (len >> PAGE_SHIFT); |
---|
1813 | 1954 | } |
---|
.. | .. |
---|
1822 | 1963 | * then new mapped in-place (which must be aimed as |
---|
1823 | 1964 | * a completely new data area). |
---|
1824 | 1965 | */ |
---|
1825 | | - vma->vm_flags |= VM_SOFTDIRTY; |
---|
| 1966 | + WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY); |
---|
1826 | 1967 | |
---|
1827 | 1968 | vma_set_page_prot(vma); |
---|
| 1969 | + vm_write_end(vma); |
---|
| 1970 | + |
---|
| 1971 | + trace_android_vh_mmap_region(vma, addr); |
---|
1828 | 1972 | |
---|
1829 | 1973 | return addr; |
---|
1830 | 1974 | |
---|
| 1975 | +close_and_free_vma: |
---|
| 1976 | + if (vma->vm_ops && vma->vm_ops->close) |
---|
| 1977 | + vma->vm_ops->close(vma); |
---|
1831 | 1978 | unmap_and_free_vma: |
---|
1832 | 1979 | vma->vm_file = NULL; |
---|
1833 | 1980 | fput(file); |
---|
1834 | 1981 | |
---|
1835 | 1982 | /* Undo any partial mapping done by a device driver. */ |
---|
1836 | 1983 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); |
---|
1837 | | - charged = 0; |
---|
1838 | 1984 | if (vm_flags & VM_SHARED) |
---|
1839 | 1985 | mapping_unmap_writable(file->f_mapping); |
---|
1840 | 1986 | allow_write_and_free_vma: |
---|
.. | .. |
---|
1848 | 1994 | return error; |
---|
1849 | 1995 | } |
---|
1850 | 1996 | |
---|
1851 | | -unsigned long unmapped_area(struct vm_unmapped_area_info *info) |
---|
| 1997 | +static unsigned long unmapped_area(struct vm_unmapped_area_info *info) |
---|
1852 | 1998 | { |
---|
1853 | 1999 | /* |
---|
1854 | 2000 | * We implement the search by looking for an rbtree node that |
---|
.. | .. |
---|
1951 | 2097 | return gap_start; |
---|
1952 | 2098 | } |
---|
1953 | 2099 | |
---|
1954 | | -unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) |
---|
| 2100 | +static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) |
---|
1955 | 2101 | { |
---|
1956 | 2102 | struct mm_struct *mm = current->mm; |
---|
1957 | 2103 | struct vm_area_struct *vma; |
---|
1958 | 2104 | unsigned long length, low_limit, high_limit, gap_start, gap_end; |
---|
| 2105 | + unsigned long addr = 0; |
---|
1959 | 2106 | |
---|
1960 | 2107 | /* Adjust search length to account for worst case alignment overhead */ |
---|
1961 | 2108 | length = info->length + info->align_mask; |
---|
1962 | 2109 | if (length < info->length) |
---|
1963 | 2110 | return -ENOMEM; |
---|
| 2111 | + |
---|
| 2112 | + trace_android_vh_get_from_fragment_pool(mm, info, &addr); |
---|
| 2113 | + if (addr) |
---|
| 2114 | + return addr; |
---|
1964 | 2115 | |
---|
1965 | 2116 | /* |
---|
1966 | 2117 | * Adjust search limits by the desired length. |
---|
.. | .. |
---|
2049 | 2200 | VM_BUG_ON(gap_end < gap_start); |
---|
2050 | 2201 | return gap_end; |
---|
2051 | 2202 | } |
---|
2052 | | -EXPORT_SYMBOL_GPL(unmapped_area_topdown); |
---|
| 2203 | + |
---|
| 2204 | +/* |
---|
| 2205 | + * Search for an unmapped address range. |
---|
| 2206 | + * |
---|
| 2207 | + * We are looking for a range that: |
---|
| 2208 | + * - does not intersect with any VMA; |
---|
| 2209 | + * - is contained within the [low_limit, high_limit) interval; |
---|
| 2210 | + * - is at least the desired size. |
---|
| 2211 | + * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) |
---|
| 2212 | + */ |
---|
| 2213 | +unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) |
---|
| 2214 | +{ |
---|
| 2215 | + unsigned long addr; |
---|
| 2216 | + |
---|
| 2217 | + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) |
---|
| 2218 | + addr = unmapped_area_topdown(info); |
---|
| 2219 | + else |
---|
| 2220 | + addr = unmapped_area(info); |
---|
| 2221 | + |
---|
| 2222 | + trace_vm_unmapped_area(addr, info); |
---|
| 2223 | + return addr; |
---|
| 2224 | +} |
---|
| 2225 | +EXPORT_SYMBOL_GPL(vm_unmapped_area); |
---|
2053 | 2226 | |
---|
2054 | 2227 | /* Get an address range which is currently unmapped. |
---|
2055 | 2228 | * For shmat() with addr=0. |
---|
.. | .. |
---|
2070 | 2243 | struct mm_struct *mm = current->mm; |
---|
2071 | 2244 | struct vm_area_struct *vma, *prev; |
---|
2072 | 2245 | struct vm_unmapped_area_info info; |
---|
| 2246 | + const unsigned long mmap_end = arch_get_mmap_end(addr); |
---|
2073 | 2247 | |
---|
2074 | | - if (len > TASK_SIZE - mmap_min_addr) |
---|
| 2248 | + if (len > mmap_end - mmap_min_addr) |
---|
2075 | 2249 | return -ENOMEM; |
---|
2076 | 2250 | |
---|
2077 | 2251 | if (flags & MAP_FIXED) |
---|
.. | .. |
---|
2080 | 2254 | if (addr) { |
---|
2081 | 2255 | addr = PAGE_ALIGN(addr); |
---|
2082 | 2256 | vma = find_vma_prev(mm, addr, &prev); |
---|
2083 | | - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
---|
| 2257 | + if (mmap_end - len >= addr && addr >= mmap_min_addr && |
---|
2084 | 2258 | (!vma || addr + len <= vm_start_gap(vma)) && |
---|
2085 | 2259 | (!prev || addr >= vm_end_gap(prev))) |
---|
2086 | 2260 | return addr; |
---|
.. | .. |
---|
2089 | 2263 | info.flags = 0; |
---|
2090 | 2264 | info.length = len; |
---|
2091 | 2265 | info.low_limit = mm->mmap_base; |
---|
2092 | | - info.high_limit = TASK_SIZE; |
---|
| 2266 | + info.high_limit = mmap_end; |
---|
2093 | 2267 | info.align_mask = 0; |
---|
2094 | 2268 | info.align_offset = 0; |
---|
2095 | 2269 | return vm_unmapped_area(&info); |
---|
.. | .. |
---|
2102 | 2276 | */ |
---|
2103 | 2277 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
---|
2104 | 2278 | unsigned long |
---|
2105 | | -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, |
---|
2106 | | - const unsigned long len, const unsigned long pgoff, |
---|
2107 | | - const unsigned long flags) |
---|
| 2279 | +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, |
---|
| 2280 | + unsigned long len, unsigned long pgoff, |
---|
| 2281 | + unsigned long flags) |
---|
2108 | 2282 | { |
---|
2109 | 2283 | struct vm_area_struct *vma, *prev; |
---|
2110 | 2284 | struct mm_struct *mm = current->mm; |
---|
2111 | | - unsigned long addr = addr0; |
---|
2112 | 2285 | struct vm_unmapped_area_info info; |
---|
| 2286 | + const unsigned long mmap_end = arch_get_mmap_end(addr); |
---|
2113 | 2287 | |
---|
2114 | 2288 | /* requested length too big for entire address space */ |
---|
2115 | | - if (len > TASK_SIZE - mmap_min_addr) |
---|
| 2289 | + if (len > mmap_end - mmap_min_addr) |
---|
2116 | 2290 | return -ENOMEM; |
---|
2117 | 2291 | |
---|
2118 | 2292 | if (flags & MAP_FIXED) |
---|
.. | .. |
---|
2122 | 2296 | if (addr) { |
---|
2123 | 2297 | addr = PAGE_ALIGN(addr); |
---|
2124 | 2298 | vma = find_vma_prev(mm, addr, &prev); |
---|
2125 | | - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && |
---|
| 2299 | + if (mmap_end - len >= addr && addr >= mmap_min_addr && |
---|
2126 | 2300 | (!vma || addr + len <= vm_start_gap(vma)) && |
---|
2127 | 2301 | (!prev || addr >= vm_end_gap(prev))) |
---|
2128 | 2302 | return addr; |
---|
.. | .. |
---|
2131 | 2305 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
---|
2132 | 2306 | info.length = len; |
---|
2133 | 2307 | info.low_limit = max(PAGE_SIZE, mmap_min_addr); |
---|
2134 | | - info.high_limit = mm->mmap_base; |
---|
| 2308 | + info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); |
---|
2135 | 2309 | info.align_mask = 0; |
---|
2136 | 2310 | info.align_offset = 0; |
---|
| 2311 | + trace_android_vh_exclude_reserved_zone(mm, &info); |
---|
2137 | 2312 | addr = vm_unmapped_area(&info); |
---|
2138 | 2313 | |
---|
2139 | 2314 | /* |
---|
.. | .. |
---|
2146 | 2321 | VM_BUG_ON(addr != -ENOMEM); |
---|
2147 | 2322 | info.flags = 0; |
---|
2148 | 2323 | info.low_limit = TASK_UNMAPPED_BASE; |
---|
2149 | | - info.high_limit = TASK_SIZE; |
---|
| 2324 | + info.high_limit = mmap_end; |
---|
2150 | 2325 | addr = vm_unmapped_area(&info); |
---|
2151 | 2326 | } |
---|
| 2327 | + |
---|
| 2328 | + trace_android_vh_include_reserved_zone(mm, &info, &addr); |
---|
2152 | 2329 | |
---|
2153 | 2330 | return addr; |
---|
2154 | 2331 | } |
---|
.. | .. |
---|
2177 | 2354 | /* |
---|
2178 | 2355 | * mmap_region() will call shmem_zero_setup() to create a file, |
---|
2179 | 2356 | * so use shmem's get_unmapped_area in case it can be huge. |
---|
2180 | | - * do_mmap_pgoff() will clear pgoff, so match alignment. |
---|
| 2357 | + * do_mmap() will clear pgoff, so match alignment. |
---|
2181 | 2358 | */ |
---|
2182 | 2359 | pgoff = 0; |
---|
2183 | 2360 | get_area = shmem_get_unmapped_area; |
---|
.. | .. |
---|
2199 | 2376 | EXPORT_SYMBOL(get_unmapped_area); |
---|
2200 | 2377 | |
---|
2201 | 2378 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
---|
2202 | | -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
---|
| 2379 | +static struct vm_area_struct *__find_vma(struct mm_struct *mm, |
---|
| 2380 | + unsigned long addr) |
---|
2203 | 2381 | { |
---|
2204 | 2382 | struct rb_node *rb_node; |
---|
2205 | | - struct vm_area_struct *vma; |
---|
2206 | | - |
---|
2207 | | - /* Check the cache first. */ |
---|
2208 | | - vma = vmacache_find(mm, addr); |
---|
2209 | | - if (likely(vma)) |
---|
2210 | | - return vma; |
---|
| 2383 | + struct vm_area_struct *vma = NULL; |
---|
2211 | 2384 | |
---|
2212 | 2385 | rb_node = mm->mm_rb.rb_node; |
---|
2213 | 2386 | |
---|
.. | .. |
---|
2225 | 2398 | rb_node = rb_node->rb_right; |
---|
2226 | 2399 | } |
---|
2227 | 2400 | |
---|
| 2401 | + return vma; |
---|
| 2402 | +} |
---|
| 2403 | + |
---|
| 2404 | +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
---|
| 2405 | +{ |
---|
| 2406 | + struct vm_area_struct *vma; |
---|
| 2407 | + |
---|
| 2408 | + /* Check the cache first. */ |
---|
| 2409 | + vma = vmacache_find(mm, addr); |
---|
| 2410 | + if (likely(vma)) |
---|
| 2411 | + return vma; |
---|
| 2412 | + |
---|
| 2413 | + vma = __find_vma(mm, addr); |
---|
2228 | 2414 | if (vma) |
---|
2229 | 2415 | vmacache_update(addr, vma); |
---|
2230 | 2416 | return vma; |
---|
2231 | 2417 | } |
---|
2232 | | - |
---|
2233 | 2418 | EXPORT_SYMBOL(find_vma); |
---|
| 2419 | + |
---|
| 2420 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 2421 | +struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr) |
---|
| 2422 | +{ |
---|
| 2423 | + struct vm_area_struct *vma = NULL; |
---|
| 2424 | + |
---|
| 2425 | + read_lock(&mm->mm_rb_lock); |
---|
| 2426 | + vma = __find_vma(mm, addr); |
---|
| 2427 | + |
---|
| 2428 | + /* |
---|
| 2429 | + * If there is a concurrent fast mremap, bail out since the entire |
---|
| 2430 | + * PMD/PUD subtree may have been remapped. |
---|
| 2431 | + * |
---|
| 2432 | + * This is usually safe for conventional mremap since it takes the |
---|
| 2433 | + * PTE locks as does SPF. However fast mremap only takes the lock |
---|
| 2434 | + * at the PMD/PUD level which is ok as it is done with the mmap |
---|
| 2435 | + * write lock held. But since SPF, as the term implies, forgoes |
---|
| 2436 | + * taking the mmap read lock and also cannot take the PTL lock at the |
---|
| 2437 | + * larger PMD/PUD granularity, since it would introduce huge |
---|
| 2438 | + * contention in the page fault path; fall back to regular fault |
---|
| 2439 | + * handling. |
---|
| 2440 | + */ |
---|
| 2441 | + if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count)) |
---|
| 2442 | + vma = NULL; |
---|
| 2443 | + read_unlock(&mm->mm_rb_lock); |
---|
| 2444 | + |
---|
| 2445 | + return vma; |
---|
| 2446 | +} |
---|
| 2447 | +#endif |
---|
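get_vma() above is the lookup half of the reference-counting scheme introduced earlier with put_vma()/__free_vma(): under read_lock(&mm->mm_rb_lock) it finds the VMA in the rbtree and pins it by bumping vm_ref_count, bailing out if the count has gone negative (a VMA being torn down or hit by a fast mremap); put_vma() drops the pin and frees the VMA once the last reference is gone. A sketch of how a speculative-fault caller is expected to pair the two helpers added in this patch (the fault handling itself is omitted):

    /*
     * Sketch of the caller-side get_vma()/put_vma() pairing used by the
     * speculative page fault path; not the actual fault handler.
     */
    static vm_fault_t speculative_lookup_sketch(struct mm_struct *mm,
                                                unsigned long address)
    {
            struct vm_area_struct *vma;
            vm_fault_t ret = VM_FAULT_RETRY;        /* fall back to the mmap_lock path */

            vma = get_vma(mm, address);     /* takes a vm_ref_count reference */
            if (!vma)
                    return ret;             /* no VMA, or concurrent fast mremap */

            /* ... speculative fault handling against the pinned vma ... */

            put_vma(vma);                   /* drops the reference; may free the vma */
            return ret;
    }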
2234 | 2448 | |
---|
2235 | 2449 | /* |
---|
2236 | 2450 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. |
---|
.. | .. |
---|
2245 | 2459 | if (vma) { |
---|
2246 | 2460 | *pprev = vma->vm_prev; |
---|
2247 | 2461 | } else { |
---|
2248 | | - struct rb_node *rb_node = mm->mm_rb.rb_node; |
---|
2249 | | - *pprev = NULL; |
---|
2250 | | - while (rb_node) { |
---|
2251 | | - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
---|
2252 | | - rb_node = rb_node->rb_right; |
---|
2253 | | - } |
---|
| 2462 | + struct rb_node *rb_node = rb_last(&mm->mm_rb); |
---|
| 2463 | + |
---|
| 2464 | + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; |
---|
2254 | 2465 | } |
---|
2255 | 2466 | return vma; |
---|
2256 | 2467 | } |
---|
.. | .. |
---|
2330 | 2541 | gap_addr = TASK_SIZE; |
---|
2331 | 2542 | |
---|
2332 | 2543 | next = vma->vm_next; |
---|
2333 | | - if (next && next->vm_start < gap_addr && |
---|
2334 | | - (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { |
---|
| 2544 | + if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { |
---|
2335 | 2545 | if (!(next->vm_flags & VM_GROWSUP)) |
---|
2336 | 2546 | return -ENOMEM; |
---|
2337 | 2547 | /* Check that both stack segments have the same anon_vma? */ |
---|
.. | .. |
---|
2343 | 2553 | |
---|
2344 | 2554 | /* |
---|
2345 | 2555 | * vma->vm_start/vm_end cannot change under us because the caller |
---|
2346 | | - * is required to hold the mmap_sem in read mode. We need the |
---|
| 2556 | + * is required to hold the mmap_lock in read mode. We need the |
---|
2347 | 2557 | * anon_vma lock to serialize against concurrent expand_stacks. |
---|
2348 | 2558 | */ |
---|
2349 | 2559 | anon_vma_lock_write(vma->anon_vma); |
---|
.. | .. |
---|
2361 | 2571 | if (!error) { |
---|
2362 | 2572 | /* |
---|
2363 | 2573 | * vma_gap_update() doesn't support concurrent |
---|
2364 | | - * updates, but we only hold a shared mmap_sem |
---|
| 2574 | + * updates, but we only hold a shared mmap_lock |
---|
2365 | 2575 | * lock here, so we need to protect against |
---|
2366 | 2576 | * concurrent vma expansions. |
---|
2367 | 2577 | * anon_vma_lock_write() doesn't help here, as |
---|
.. | .. |
---|
2412 | 2622 | prev = vma->vm_prev; |
---|
2413 | 2623 | /* Check that both stack segments have the same anon_vma? */ |
---|
2414 | 2624 | if (prev && !(prev->vm_flags & VM_GROWSDOWN) && |
---|
2415 | | - (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { |
---|
| 2625 | + vma_is_accessible(prev)) { |
---|
2416 | 2626 | if (address - prev->vm_end < stack_guard_gap) |
---|
2417 | 2627 | return -ENOMEM; |
---|
2418 | 2628 | } |
---|
.. | .. |
---|
2423 | 2633 | |
---|
2424 | 2634 | /* |
---|
2425 | 2635 | * vma->vm_start/vm_end cannot change under us because the caller |
---|
2426 | | - * is required to hold the mmap_sem in read mode. We need the |
---|
| 2636 | + * is required to hold the mmap_lock in read mode. We need the |
---|
2427 | 2637 | * anon_vma lock to serialize against concurrent expand_stacks. |
---|
2428 | 2638 | */ |
---|
2429 | 2639 | anon_vma_lock_write(vma->anon_vma); |
---|
.. | .. |
---|
2441 | 2651 | if (!error) { |
---|
2442 | 2652 | /* |
---|
2443 | 2653 | * vma_gap_update() doesn't support concurrent |
---|
2444 | | - * updates, but we only hold a shared mmap_sem |
---|
| 2654 | + * updates, but we only hold a shared mmap_lock |
---|
2445 | 2655 | * lock here, so we need to protect against |
---|
2446 | 2656 | * concurrent vma expansions. |
---|
2447 | 2657 | * anon_vma_lock_write() doesn't help here, as |
---|
.. | .. |
---|
2455 | 2665 | mm->locked_vm += grow; |
---|
2456 | 2666 | vm_stat_account(mm, vma->vm_flags, grow); |
---|
2457 | 2667 | anon_vma_interval_tree_pre_update_vma(vma); |
---|
2458 | | - vma->vm_start = address; |
---|
2459 | | - vma->vm_pgoff -= grow; |
---|
| 2668 | + WRITE_ONCE(vma->vm_start, address); |
---|
| 2669 | + WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow); |
---|
2460 | 2670 | anon_vma_interval_tree_post_update_vma(vma); |
---|
2461 | 2671 | vma_gap_update(vma); |
---|
2462 | 2672 | spin_unlock(&mm->page_table_lock); |
---|
.. | .. |
---|
2483 | 2693 | if (!*endptr) |
---|
2484 | 2694 | stack_guard_gap = val << PAGE_SHIFT; |
---|
2485 | 2695 | |
---|
2486 | | - return 0; |
---|
| 2696 | + return 1; |
---|
2487 | 2697 | } |
---|
2488 | 2698 | __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); |
---|
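/*
 * Worked example for the handler above: booting with "stack_guard_gap=256"
 * sets a guard gap of 256 pages, i.e. 256 << PAGE_SHIFT = 1 MiB with 4 KiB
 * pages.  A __setup() handler returns 1 to signal that the option was
 * consumed; returning 0 would cause the string to be handed on to init.
 */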
2489 | 2699 | |
---|
.. | .. |
---|
2503 | 2713 | if (vma && (vma->vm_start <= addr)) |
---|
2504 | 2714 | return vma; |
---|
2505 | 2715 | /* don't alter vm_end if the coredump is running */ |
---|
2506 | | - if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) |
---|
| 2716 | + if (!prev || expand_stack(prev, addr)) |
---|
2507 | 2717 | return NULL; |
---|
2508 | 2718 | if (prev->vm_flags & VM_LOCKED) |
---|
2509 | 2719 | populate_vma_page_range(prev, addr, prev->vm_end, NULL); |
---|
.. | .. |
---|
2528 | 2738 | if (vma->vm_start <= addr) |
---|
2529 | 2739 | return vma; |
---|
2530 | 2740 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
---|
2531 | | - return NULL; |
---|
2532 | | - /* don't alter vm_start if the coredump is running */ |
---|
2533 | | - if (!mmget_still_valid(mm)) |
---|
2534 | 2741 | return NULL; |
---|
2535 | 2742 | start = vma->vm_start; |
---|
2536 | 2743 | if (expand_stack(vma, addr)) |
---|
.. | .. |
---|
2576 | 2783 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
---|
2577 | 2784 | unsigned long start, unsigned long end) |
---|
2578 | 2785 | { |
---|
2579 | | - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; |
---|
| 2786 | + struct vm_area_struct *next = vma_next(mm, prev); |
---|
2580 | 2787 | struct mmu_gather tlb; |
---|
| 2788 | + struct vm_area_struct *cur_vma; |
---|
2581 | 2789 | |
---|
2582 | 2790 | lru_add_drain(); |
---|
2583 | 2791 | tlb_gather_mmu(&tlb, mm, start, end); |
---|
2584 | 2792 | update_hiwater_rss(mm); |
---|
2585 | 2793 | unmap_vmas(&tlb, vma, start, end); |
---|
| 2794 | + |
---|
| 2795 | + /* |
---|
| 2796 | + * Ensure we have no stale TLB entries by the time this mapping is |
---|
| 2797 | + * removed from the rmap. |
---|
| 2798 | + * Note that we don't have to worry about nested flushes here because |
---|
| 2799 | + * we're holding the mm semaphore for removing the mapping - so any |
---|
| 2800 | + * concurrent flush in this region has to be coming through the rmap, |
---|
| 2801 | + * and we synchronize against that using the rmap lock. |
---|
| 2802 | + */ |
---|
| 2803 | + for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) { |
---|
| 2804 | + if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) { |
---|
| 2805 | + tlb_flush_mmu(&tlb); |
---|
| 2806 | + break; |
---|
| 2807 | + } |
---|
| 2808 | + } |
---|
| 2809 | + |
---|
2586 | 2810 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
---|
2587 | 2811 | next ? next->vm_start : USER_PGTABLES_CEILING); |
---|
2588 | 2812 | tlb_finish_mmu(&tlb, start, end); |
---|
.. | .. |
---|
2592 | 2816 | * Create a list of vma's touched by the unmap, removing them from the mm's |
---|
2593 | 2817 | * vma list as we go.. |
---|
2594 | 2818 | */ |
---|
2595 | | -static void |
---|
| 2819 | +static bool |
---|
2596 | 2820 | detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, |
---|
2597 | 2821 | struct vm_area_struct *prev, unsigned long end) |
---|
2598 | 2822 | { |
---|
.. | .. |
---|
2602 | 2826 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
---|
2603 | 2827 | vma->vm_prev = NULL; |
---|
2604 | 2828 | do { |
---|
2605 | | - vma_rb_erase(vma, &mm->mm_rb); |
---|
| 2829 | + vma_rb_erase(vma, mm); |
---|
2606 | 2830 | mm->map_count--; |
---|
2607 | 2831 | tail_vma = vma; |
---|
2608 | 2832 | vma = vma->vm_next; |
---|
.. | .. |
---|
2617 | 2841 | |
---|
2618 | 2842 | /* Kill the cache */ |
---|
2619 | 2843 | vmacache_invalidate(mm); |
---|
| 2844 | + |
---|
| 2845 | + /* |
---|
| 2846 | + * Do not downgrade mmap_lock if we are next to a VM_GROWSDOWN or
---|
| 2847 | + * VM_GROWSUP VMA. Such VMAs can change their size under
---|
| 2848 | + * down_read(mmap_lock) and collide with the VMA we are about to unmap. |
---|
| 2849 | + */ |
---|
| 2850 | + if (vma && (vma->vm_flags & VM_GROWSDOWN)) |
---|
| 2851 | + return false; |
---|
| 2852 | + if (prev && (prev->vm_flags & VM_GROWSUP)) |
---|
| 2853 | + return false; |
---|
| 2854 | + return true; |
---|
2620 | 2855 | } |
---|
2621 | 2856 | |
---|
2622 | 2857 | /* |
---|
.. | .. |
---|
2701 | 2936 | * work. This now handles partial unmappings. |
---|
2702 | 2937 | * Jeremy Fitzhardinge <jeremy@goop.org> |
---|
2703 | 2938 | */ |
---|
2704 | | -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, |
---|
2705 | | - struct list_head *uf) |
---|
| 2939 | +int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, |
---|
| 2940 | + struct list_head *uf, bool downgrade) |
---|
2706 | 2941 | { |
---|
2707 | 2942 | unsigned long end; |
---|
2708 | 2943 | struct vm_area_struct *vma, *prev, *last; |
---|
.. | .. |
---|
2711 | 2946 | return -EINVAL; |
---|
2712 | 2947 | |
---|
2713 | 2948 | len = PAGE_ALIGN(len); |
---|
| 2949 | + end = start + len; |
---|
2714 | 2950 | if (len == 0) |
---|
2715 | 2951 | return -EINVAL; |
---|
| 2952 | + |
---|
| 2953 | + /* |
---|
| 2954 | + * arch_unmap() might do unmaps itself. It must be called |
---|
| 2955 | + * and finish any rbtree manipulation before this code |
---|
| 2956 | + * runs and also starts to manipulate the rbtree. |
---|
| 2957 | + */ |
---|
| 2958 | + arch_unmap(mm, start, end); |
---|
2716 | 2959 | |
---|
2717 | 2960 | /* Find the first overlapping VMA */ |
---|
2718 | 2961 | vma = find_vma(mm, start); |
---|
.. | .. |
---|
2722 | 2965 | /* we have start < vma->vm_end */ |
---|
2723 | 2966 | |
---|
2724 | 2967 | /* if it doesn't overlap, we have nothing.. */ |
---|
2725 | | - end = start + len; |
---|
2726 | 2968 | if (vma->vm_start >= end) |
---|
2727 | 2969 | return 0; |
---|
2728 | 2970 | |
---|
.. | .. |
---|
2757 | 2999 | if (error) |
---|
2758 | 3000 | return error; |
---|
2759 | 3001 | } |
---|
2760 | | - vma = prev ? prev->vm_next : mm->mmap; |
---|
| 3002 | + vma = vma_next(mm, prev); |
---|
2761 | 3003 | |
---|
2762 | 3004 | if (unlikely(uf)) { |
---|
2763 | 3005 | /* |
---|
.. | .. |
---|
2784 | 3026 | mm->locked_vm -= vma_pages(tmp); |
---|
2785 | 3027 | munlock_vma_pages_all(tmp); |
---|
2786 | 3028 | } |
---|
| 3029 | + |
---|
2787 | 3030 | tmp = tmp->vm_next; |
---|
2788 | 3031 | } |
---|
2789 | 3032 | } |
---|
2790 | 3033 | |
---|
2791 | | - /* |
---|
2792 | | - * Remove the vma's, and unmap the actual pages |
---|
2793 | | - */ |
---|
2794 | | - detach_vmas_to_be_unmapped(mm, vma, prev, end); |
---|
2795 | | - unmap_region(mm, vma, prev, start, end); |
---|
| 3034 | + /* Detach vmas from rbtree */ |
---|
| 3035 | + if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) |
---|
| 3036 | + downgrade = false; |
---|
2796 | 3037 | |
---|
2797 | | - arch_unmap(mm, vma, start, end); |
---|
| 3038 | + if (downgrade) |
---|
| 3039 | + mmap_write_downgrade(mm); |
---|
| 3040 | + |
---|
| 3041 | + unmap_region(mm, vma, prev, start, end); |
---|
2798 | 3042 | |
---|
2799 | 3043 | /* Fix up all other VM information */ |
---|
2800 | 3044 | remove_vma_list(mm, vma); |
---|
2801 | 3045 | |
---|
2802 | | - return 0; |
---|
| 3046 | + return downgrade ? 1 : 0; |
---|
2803 | 3047 | } |
---|
2804 | 3048 | |
---|
2805 | | -int vm_munmap(unsigned long start, size_t len) |
---|
| 3049 | +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, |
---|
| 3050 | + struct list_head *uf) |
---|
| 3051 | +{ |
---|
| 3052 | + return __do_munmap(mm, start, len, uf, false); |
---|
| 3053 | +} |
---|
| 3054 | + |
---|
| 3055 | +static int __vm_munmap(unsigned long start, size_t len, bool downgrade) |
---|
2806 | 3056 | { |
---|
2807 | 3057 | int ret; |
---|
2808 | 3058 | struct mm_struct *mm = current->mm; |
---|
2809 | 3059 | LIST_HEAD(uf); |
---|
2810 | 3060 | |
---|
2811 | | - if (down_write_killable(&mm->mmap_sem)) |
---|
| 3061 | + if (mmap_write_lock_killable(mm)) |
---|
2812 | 3062 | return -EINTR; |
---|
2813 | 3063 | |
---|
2814 | | - ret = do_munmap(mm, start, len, &uf); |
---|
2815 | | - up_write(&mm->mmap_sem); |
---|
| 3064 | + ret = __do_munmap(mm, start, len, &uf, downgrade); |
---|
| 3065 | + /* |
---|
| 3066 | + * Returning 1 indicates mmap_lock is downgraded. |
---|
| 3067 | + * But 1 is not a legal return value of vm_munmap() and munmap(), so reset
---|
| 3068 | + * it to 0 before returning.
---|
| 3069 | + */ |
---|
| 3070 | + if (ret == 1) { |
---|
| 3071 | + mmap_read_unlock(mm); |
---|
| 3072 | + ret = 0; |
---|
| 3073 | + } else |
---|
| 3074 | + mmap_write_unlock(mm); |
---|
| 3075 | + |
---|
2816 | 3076 | userfaultfd_unmap_complete(mm, &uf); |
---|
2817 | 3077 | return ret; |
---|
| 3078 | +} |
---|
| 3079 | + |
---|
| 3080 | +int vm_munmap(unsigned long start, size_t len) |
---|
| 3081 | +{ |
---|
| 3082 | + return __vm_munmap(start, len, false); |
---|
2818 | 3083 | } |
---|
2819 | 3084 | EXPORT_SYMBOL(vm_munmap); |
---|
2820 | 3085 | |
---|
.. | .. |
---|
2822 | 3087 | { |
---|
2823 | 3088 | addr = untagged_addr(addr); |
---|
2824 | 3089 | profile_munmap(addr); |
---|
2825 | | - return vm_munmap(addr, len); |
---|
| 3090 | + return __vm_munmap(addr, len, true); |
---|
2826 | 3091 | } |
---|
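/*
 * Userspace view of the munmap() path above, as a minimal sketch (a
 * hypothetical test program, not part of the kernel tree): the kernel
 * page-aligns the length upward and rejects a zero length with -EINVAL.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* len is rounded up to a page boundary: this unmaps both pages. */
	if (munmap(p, page + 1))
		perror("munmap");

	/* A zero length is rejected outright. */
	if (munmap(p, 0))
		perror("munmap(len=0)");	/* expected: Invalid argument */

	return 0;
}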
2827 | 3092 | |
---|
2828 | 3093 | |
---|
.. | .. |
---|
2854 | 3119 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) |
---|
2855 | 3120 | return ret; |
---|
2856 | 3121 | |
---|
2857 | | - if (down_write_killable(&mm->mmap_sem)) |
---|
| 3122 | + if (mmap_write_lock_killable(mm)) |
---|
2858 | 3123 | return -EINTR; |
---|
2859 | 3124 | |
---|
2860 | 3125 | vma = find_vma(mm, start); |
---|
.. | .. |
---|
2913 | 3178 | } |
---|
2914 | 3179 | |
---|
2915 | 3180 | file = get_file(vma->vm_file); |
---|
2916 | | - ret = do_mmap_pgoff(vma->vm_file, start, size, |
---|
| 3181 | + ret = do_mmap(vma->vm_file, start, size, |
---|
2917 | 3182 | prot, flags, pgoff, &populate, NULL); |
---|
2918 | 3183 | fput(file); |
---|
2919 | 3184 | out: |
---|
2920 | | - up_write(&mm->mmap_sem); |
---|
| 3185 | + mmap_write_unlock(mm); |
---|
2921 | 3186 | if (populate) |
---|
2922 | 3187 | mm_populate(ret, populate); |
---|
2923 | 3188 | if (!IS_ERR_VALUE(ret)) |
---|
2924 | 3189 | ret = 0; |
---|
2925 | 3190 | return ret; |
---|
2926 | | -} |
---|
2927 | | - |
---|
2928 | | -static inline void verify_mm_writelocked(struct mm_struct *mm) |
---|
2929 | | -{ |
---|
2930 | | -#ifdef CONFIG_DEBUG_VM |
---|
2931 | | - if (unlikely(down_read_trylock(&mm->mmap_sem))) { |
---|
2932 | | - WARN_ON(1); |
---|
2933 | | - up_read(&mm->mmap_sem); |
---|
2934 | | - } |
---|
2935 | | -#endif |
---|
2936 | 3191 | } |
---|
2937 | 3192 | |
---|
2938 | 3193 | /* |
---|
.. | .. |
---|
2947 | 3202 | struct rb_node **rb_link, *rb_parent; |
---|
2948 | 3203 | pgoff_t pgoff = addr >> PAGE_SHIFT; |
---|
2949 | 3204 | int error; |
---|
| 3205 | + unsigned long mapped_addr; |
---|
2950 | 3206 | |
---|
2951 | 3207 | /* Until we need other flags, refuse anything except VM_EXEC. */ |
---|
2952 | 3208 | if ((flags & (~VM_EXEC)) != 0) |
---|
2953 | 3209 | return -EINVAL; |
---|
2954 | 3210 | flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
---|
2955 | 3211 | |
---|
2956 | | - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
---|
2957 | | - if (offset_in_page(error)) |
---|
2958 | | - return error; |
---|
| 3212 | + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
---|
| 3213 | + if (IS_ERR_VALUE(mapped_addr)) |
---|
| 3214 | + return mapped_addr; |
---|
2959 | 3215 | |
---|
2960 | 3216 | error = mlock_future_check(mm, mm->def_flags, len); |
---|
2961 | 3217 | if (error) |
---|
2962 | 3218 | return error; |
---|
2963 | 3219 | |
---|
2964 | | - /* |
---|
2965 | | - * mm->mmap_sem is required to protect against another thread |
---|
2966 | | - * changing the mappings in case we sleep. |
---|
2967 | | - */ |
---|
2968 | | - verify_mm_writelocked(mm); |
---|
2969 | | - |
---|
2970 | | - /* |
---|
2971 | | - * Clear old maps. this also does some error checking for us |
---|
2972 | | - */ |
---|
2973 | | - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
---|
2974 | | - &rb_parent)) { |
---|
2975 | | - if (do_munmap(mm, addr, len, uf)) |
---|
2976 | | - return -ENOMEM; |
---|
2977 | | - } |
---|
| 3220 | + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ |
---|
| 3221 | + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) |
---|
| 3222 | + return -ENOMEM; |
---|
2978 | 3223 | |
---|
2979 | 3224 | /* Check against address space limits *after* clearing old maps... */ |
---|
2980 | 3225 | if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) |
---|
.. | .. |
---|
3032 | 3277 | if (!len) |
---|
3033 | 3278 | return 0; |
---|
3034 | 3279 | |
---|
3035 | | - if (down_write_killable(&mm->mmap_sem)) |
---|
| 3280 | + if (mmap_write_lock_killable(mm)) |
---|
3036 | 3281 | return -EINTR; |
---|
3037 | 3282 | |
---|
3038 | 3283 | ret = do_brk_flags(addr, len, flags, &uf); |
---|
3039 | 3284 | populate = ((mm->def_flags & VM_LOCKED) != 0); |
---|
3040 | | - up_write(&mm->mmap_sem); |
---|
| 3285 | + mmap_write_unlock(mm); |
---|
3041 | 3286 | userfaultfd_unmap_complete(mm, &uf); |
---|
3042 | 3287 | if (populate && !ret) |
---|
3043 | 3288 | mm_populate(addr, len); |
---|
.. | .. |
---|
3065 | 3310 | /* |
---|
3066 | 3311 | * Manually reap the mm to free as much memory as possible. |
---|
3067 | 3312 | * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard |
---|
3068 | | - * this mm from further consideration. Taking mm->mmap_sem for |
---|
| 3313 | + * this mm from further consideration. Taking mm->mmap_lock for |
---|
3069 | 3314 | * write after setting MMF_OOM_SKIP will guarantee that the oom |
---|
3070 | | - * reaper will not run on this mm again after mmap_sem is |
---|
| 3315 | + * reaper will not run on this mm again after mmap_lock is |
---|
3071 | 3316 | * dropped. |
---|
3072 | 3317 | * |
---|
3073 | | - * Nothing can be holding mm->mmap_sem here and the above call |
---|
| 3318 | + * Nothing can be holding mm->mmap_lock here and the above call |
---|
3074 | 3319 | * to mmu_notifier_release(mm) ensures mmu notifier callbacks in |
---|
3075 | 3320 | * __oom_reap_task_mm() will not block. |
---|
3076 | 3321 | * |
---|
.. | .. |
---|
3081 | 3326 | (void)__oom_reap_task_mm(mm); |
---|
3082 | 3327 | |
---|
3083 | 3328 | set_bit(MMF_OOM_SKIP, &mm->flags); |
---|
3084 | | - down_write(&mm->mmap_sem); |
---|
3085 | | - up_write(&mm->mmap_sem); |
---|
3086 | 3329 | } |
---|
3087 | 3330 | |
---|
| 3331 | + mmap_write_lock(mm); |
---|
3088 | 3332 | if (mm->locked_vm) { |
---|
3089 | 3333 | vma = mm->mmap; |
---|
3090 | 3334 | while (vma) { |
---|
.. | .. |
---|
3097 | 3341 | arch_exit_mmap(mm); |
---|
3098 | 3342 | |
---|
3099 | 3343 | vma = mm->mmap; |
---|
3100 | | - if (!vma) /* Can happen if dup_mmap() received an OOM */ |
---|
| 3344 | + if (!vma) { |
---|
| 3345 | + /* Can happen if dup_mmap() received an OOM */ |
---|
| 3346 | + mmap_write_unlock(mm); |
---|
3101 | 3347 | return; |
---|
| 3348 | + } |
---|
3102 | 3349 | |
---|
3103 | 3350 | lru_add_drain(); |
---|
3104 | 3351 | flush_cache_mm(mm); |
---|
.. | .. |
---|
3109 | 3356 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); |
---|
3110 | 3357 | tlb_finish_mmu(&tlb, 0, -1); |
---|
3111 | 3358 | |
---|
3112 | | - /* |
---|
3113 | | - * Walk the list again, actually closing and freeing it, |
---|
3114 | | - * with preemption enabled, without holding any MM locks. |
---|
3115 | | - */ |
---|
| 3359 | + /* Walk the list again, actually closing and freeing it. */ |
---|
3116 | 3360 | while (vma) { |
---|
3117 | 3361 | if (vma->vm_flags & VM_ACCOUNT) |
---|
3118 | 3362 | nr_accounted += vma_pages(vma); |
---|
3119 | 3363 | vma = remove_vma(vma); |
---|
3120 | 3364 | cond_resched(); |
---|
3121 | 3365 | } |
---|
| 3366 | + mm->mmap = NULL; |
---|
| 3367 | + mmap_write_unlock(mm); |
---|
3122 | 3368 | vm_unacct_memory(nr_accounted); |
---|
3123 | 3369 | } |
---|
3124 | 3370 | |
---|
.. | .. |
---|
3148 | 3394 | * By setting it to reflect the virtual start address of the |
---|
3149 | 3395 | * vma, merges and splits can happen in a seamless way, just |
---|
3150 | 3396 | * using the existing file pgoff checks and manipulations. |
---|
3151 | | - * Similarly in do_mmap_pgoff and in do_brk. |
---|
| 3397 | + * Similarly in do_mmap and in do_brk_flags. |
---|
3152 | 3398 | */ |
---|
3153 | 3399 | if (vma_is_anonymous(vma)) { |
---|
3154 | 3400 | BUG_ON(vma->anon_vma); |
---|
.. | .. |
---|
3185 | 3431 | |
---|
3186 | 3432 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
---|
3187 | 3433 | return NULL; /* should never get here */ |
---|
3188 | | - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
---|
3189 | | - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), |
---|
3190 | | - vma->vm_userfaultfd_ctx, vma_get_anon_name(vma)); |
---|
| 3434 | + |
---|
| 3435 | + /* There are 3 cases to manage here:
---|
| 3436 | + *     AAAA            AAAA                AAAA              AAAA
---|
| 3437 | + * PPPP....        PPPP......NNNN      PPPP....NNNN      PP........NN
---|
| 3438 | + * PPPPPPPP(A)     PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
---|
| 3439 | + *                                     PPPPPPPPNNNN(2)
---|
| 3440 | + *                                     PPPPNNNNNNNN(3)
---|
| 3441 | + * |
---|
| 3442 | + * new_vma == prev in case A,1,2 |
---|
| 3443 | + * new_vma == next in case B,3 |
---|
| 3444 | + */ |
---|
| 3445 | + new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
---|
| 3446 | + vma->anon_vma, vma->vm_file, pgoff, |
---|
| 3447 | + vma_policy(vma), vma->vm_userfaultfd_ctx, |
---|
| 3448 | + vma_get_anon_name(vma), true); |
---|
3191 | 3449 | if (new_vma) { |
---|
3192 | 3450 | /* |
---|
3193 | 3451 | * Source vma may have been merged into new_vma |
---|
.. | .. |
---|
3225 | 3483 | get_file(new_vma->vm_file); |
---|
3226 | 3484 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
---|
3227 | 3485 | new_vma->vm_ops->open(new_vma); |
---|
| 3486 | + /* |
---|
| 3487 | + * As the VMA is linked right now, it may be hit by the |
---|
| 3488 | + * speculative page fault handler. But we don't want it
---|
| 3489 | + * to start mapping pages in this area until the caller has
---|
| 3490 | + * potentially moved the ptes from the moved VMA. To prevent
---|
| 3491 | + * that we protect it right now, and let the caller unprotect |
---|
| 3492 | + * it once the move is done. |
---|
| 3493 | + */ |
---|
| 3494 | + vm_write_begin(new_vma); |
---|
3228 | 3495 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
---|
3229 | 3496 | *need_rmap_locks = false; |
---|
3230 | 3497 | } |
---|
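/*
 * Pairing assumption for the vm_write_begin() above: copy_vma()'s only
 * caller is move_vma() in mm/mremap.c, which is expected to issue the
 * matching vm_write_end(new_vma) once the page tables have been moved,
 * re-enabling speculative faults on the new mapping.
 */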
.. | .. |
---|
3311 | 3578 | .fault = special_mapping_fault, |
---|
3312 | 3579 | .mremap = special_mapping_mremap, |
---|
3313 | 3580 | .name = special_mapping_name, |
---|
| 3581 | + /* vDSO code relies on VVAR not being accessible remotely */
---|
| 3582 | + .access = NULL, |
---|
3314 | 3583 | }; |
---|
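/*
 * Background for the explicit ".access = NULL" above: when
 * get_user_pages() cannot pin a page of a mapping, access_remote_vm()
 * (ptrace, /proc/<pid>/mem) falls back to vma->vm_ops->access.  Leaving
 * the hook NULL means there is no such fallback for these special
 * mappings, which is what the vDSO/VVAR code relies on.
 */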
3315 | 3584 | |
---|
3316 | 3585 | static const struct vm_operations_struct legacy_special_mapping_vmops = { |
---|
.. | .. |
---|
3394 | 3663 | } |
---|
3395 | 3664 | |
---|
3396 | 3665 | /* |
---|
3397 | | - * Called with mm->mmap_sem held for writing. |
---|
| 3666 | + * Called with mm->mmap_lock held for writing. |
---|
3398 | 3667 | * Insert a new vma covering the given region, with the given flags. |
---|
3399 | 3668 | * Its pages are supplied by the given array of struct page *. |
---|
3400 | 3669 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. |
---|
.. | .. |
---|
3431 | 3700 | * The LSB of head.next can't change from under us |
---|
3432 | 3701 | * because we hold the mm_all_locks_mutex. |
---|
3433 | 3702 | */ |
---|
3434 | | - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); |
---|
| 3703 | + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); |
---|
3435 | 3704 | /* |
---|
3436 | 3705 | * We can safely modify head.next after taking the |
---|
3437 | 3706 | * anon_vma->root->rwsem. If some other vma in this mm shares |
---|
.. | .. |
---|
3461 | 3730 | */ |
---|
3462 | 3731 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
---|
3463 | 3732 | BUG(); |
---|
3464 | | - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); |
---|
| 3733 | + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); |
---|
3465 | 3734 | } |
---|
3466 | 3735 | } |
---|
3467 | 3736 | |
---|
.. | .. |
---|
3470 | 3739 | * operations that could ever happen on a certain mm. This includes |
---|
3471 | 3740 | * vmtruncate, try_to_unmap, and all page faults. |
---|
3472 | 3741 | * |
---|
3473 | | - * The caller must take the mmap_sem in write mode before calling |
---|
| 3742 | + * The caller must take the mmap_lock in write mode before calling |
---|
3474 | 3743 | * mm_take_all_locks(). The caller isn't allowed to release the |
---|
3475 | | - * mmap_sem until mm_drop_all_locks() returns. |
---|
| 3744 | + * mmap_lock until mm_drop_all_locks() returns. |
---|
3476 | 3745 | * |
---|
3477 | | - * mmap_sem in write mode is required in order to block all operations |
---|
| 3746 | + * mmap_lock in write mode is required in order to block all operations |
---|
3478 | 3747 | * that could modify pagetables and free pages without need of |
---|
3479 | 3748 | * altering the vma layout. It's also needed in write mode to avoid new |
---|
3480 | 3749 | * anon_vmas to be associated with existing vmas. |
---|
.. | .. |
---|
3507 | 3776 | struct vm_area_struct *vma; |
---|
3508 | 3777 | struct anon_vma_chain *avc; |
---|
3509 | 3778 | |
---|
3510 | | - BUG_ON(down_read_trylock(&mm->mmap_sem)); |
---|
| 3779 | + BUG_ON(mmap_read_trylock(mm)); |
---|
3511 | 3780 | |
---|
3512 | 3781 | mutex_lock(&mm_all_locks_mutex); |
---|
3513 | 3782 | |
---|
.. | .. |
---|
3579 | 3848 | } |
---|
3580 | 3849 | |
---|
3581 | 3850 | /* |
---|
3582 | | - * The mmap_sem cannot be released by the caller until |
---|
| 3851 | + * The mmap_lock cannot be released by the caller until |
---|
3583 | 3852 | * mm_drop_all_locks() returns. |
---|
3584 | 3853 | */ |
---|
3585 | 3854 | void mm_drop_all_locks(struct mm_struct *mm) |
---|
.. | .. |
---|
3587 | 3856 | struct vm_area_struct *vma; |
---|
3588 | 3857 | struct anon_vma_chain *avc; |
---|
3589 | 3858 | |
---|
3590 | | - BUG_ON(down_read_trylock(&mm->mmap_sem)); |
---|
| 3859 | + BUG_ON(mmap_read_trylock(mm)); |
---|
3591 | 3860 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
---|
3592 | 3861 | |
---|
3593 | 3862 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
---|