| .. | .. |
| 17 | 17 | #include <linux/mempolicy.h> |
| 18 | 18 | #include <linux/syscalls.h> |
| 19 | 19 | #include <linux/sched.h> |
| 20 | +#include <linux/page_pinner.h> |
| 20 | 21 | #include <linux/export.h> |
| 21 | 22 | #include <linux/rmap.h> |
| 22 | 23 | #include <linux/mmzone.h> |
| .. | .. |
| 49 | 50 | * When lazy mlocking via vmscan, it is important to ensure that the |
| 50 | 51 | * vma's VM_LOCKED status is not concurrently being modified, otherwise we |
| 51 | 52 | * may have mlocked a page that is being munlocked. So lazy mlock must take |
| 52 | | - * the mmap_sem for read, and verify that the vma really is locked |
| 53 | + * the mmap_lock for read, and verify that the vma really is locked |
| 53 | 54 | * (see mm/rmap.c). |
| 54 | 55 | */ |
| 55 | 56 | |
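
Throughout this file the diff replaces direct rwsem operations on `mm->mmap_sem` with the mmap_lock wrapper API (the underlying field was renamed to `mmap_lock`). The wrappers are thin renames over the same rwsem; roughly, and trimmed to the two calls used below (a sketch for orientation, not the full `include/linux/mmap_lock.h`):

```c
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
	return down_write_killable(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	up_write(&mm->mmap_lock);
}
```
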
| .. | .. |
| 58 | 59 | */ |
| 59 | 60 | void clear_page_mlock(struct page *page) |
| 60 | 61 | { |
| 62 | + int nr_pages; |
| 63 | + |
| 61 | 64 | if (!TestClearPageMlocked(page)) |
| 62 | 65 | return; |
| 63 | 66 | |
| 64 | | - mod_zone_page_state(page_zone(page), NR_MLOCK, |
| 65 | | - -hpage_nr_pages(page)); |
| 66 | | - count_vm_event(UNEVICTABLE_PGCLEARED); |
| 67 | + nr_pages = thp_nr_pages(page); |
| 68 | + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
| 69 | + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); |
| 67 | 70 | /* |
| 68 | 71 | * The previous TestClearPageMlocked() corresponds to the smp_mb() |
| 69 | 72 | * in __pagevec_lru_add_fn(). |
| .. | .. |
| 77 | 80 | * We lost the race. the page already moved to evictable list. |
| 78 | 81 | */ |
| 79 | 82 | if (PageUnevictable(page)) |
| 80 | | - count_vm_event(UNEVICTABLE_PGSTRANDED); |
| 83 | + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); |
| 81 | 84 | } |
| 82 | 85 | } |
| 83 | 86 | |
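
The statistics hunks in this file all follow one pattern: `hpage_nr_pages()` became `thp_nr_pages()`, its result is read once into `nr_pages`, and the single-event helpers (`count_vm_event()`) become their batched counterparts (`count_vm_events()`), so a THP bumps each counter by its full page count in one call. The renamed helper itself is simple; roughly (a sketch of the `include/linux/huge_mm.h` definition, minus its debug assertion):

```c
/* 1 for a base page; HPAGE_PMD_NR (512 with 4K pages and 2M THPs)
 * for the head page of a PMD-mapped transparent huge page. */
static inline int thp_nr_pages(struct page *page)
{
	if (PageHead(page))
		return HPAGE_PMD_NR;
	return 1;
}
```
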
| .. | .. |
| 94 | 97 | VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); |
| 95 | 98 | |
| 96 | 99 | if (!TestSetPageMlocked(page)) { |
| 97 | | - mod_zone_page_state(page_zone(page), NR_MLOCK, |
| 98 | | - hpage_nr_pages(page)); |
| 99 | | - count_vm_event(UNEVICTABLE_PGMLOCKED); |
| 100 | + int nr_pages = thp_nr_pages(page); |
| 101 | + |
| 102 | + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); |
| 103 | + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); |
| 100 | 104 | if (!isolate_lru_page(page)) |
| 101 | 105 | putback_lru_page(page); |
| 102 | 106 | } |
| .. | .. |
| 139 | 143 | |
| 140 | 144 | /* Did try_to_unlock() succeed or punt? */ |
| 141 | 145 | if (!PageMlocked(page)) |
| 142 | | - count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
| 146 | + count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); |
| 143 | 147 | |
| 144 | 148 | putback_lru_page(page); |
| 145 | 149 | } |
| .. | .. |
| 155 | 159 | */ |
| 156 | 160 | static void __munlock_isolation_failed(struct page *page) |
| 157 | 161 | { |
| 162 | + int nr_pages = thp_nr_pages(page); |
| 163 | + |
| 158 | 164 | if (PageUnevictable(page)) |
| 159 | | - __count_vm_event(UNEVICTABLE_PGSTRANDED); |
| 165 | + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); |
| 160 | 166 | else |
| 161 | | - __count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
| 167 | + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); |
| 162 | 168 | } |
| 163 | 169 | |
| 164 | 170 | /** |
| .. | .. |
| 182 | 188 | unsigned int munlock_vma_page(struct page *page) |
| 183 | 189 | { |
| 184 | 190 | int nr_pages; |
| 185 | | - struct zone *zone = page_zone(page); |
| 191 | + pg_data_t *pgdat = page_pgdat(page); |
| 186 | 192 | |
| 187 | 193 | /* For try_to_munlock() and to serialize with page migration */ |
| 188 | 194 | BUG_ON(!PageLocked(page)); |
| .. | .. |
| 192 | 198 | /* |
| 193 | 199 | * Serialize with any parallel __split_huge_page_refcount() which |
| 194 | 200 | * might otherwise copy PageMlocked to part of the tail pages before |
| 195 | | - * we clear it in the head page. It also stabilizes hpage_nr_pages(). |
| 201 | + * we clear it in the head page. It also stabilizes thp_nr_pages(). |
| 196 | 202 | */ |
| 197 | | - spin_lock_irq(zone_lru_lock(zone)); |
| 203 | + spin_lock_irq(&pgdat->lru_lock); |
| 198 | 204 | |
| 199 | 205 | if (!TestClearPageMlocked(page)) { |
| 200 | 206 | /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ |
| .. | .. |
| 202 | 208 | goto unlock_out; |
| 203 | 209 | } |
| 204 | 210 | |
| 205 | | - nr_pages = hpage_nr_pages(page); |
| 206 | | - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); |
| 211 | + nr_pages = thp_nr_pages(page); |
| 212 | + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
| 207 | 213 | |
| 208 | 214 | if (__munlock_isolate_lru_page(page, true)) { |
| 209 | | - spin_unlock_irq(zone_lru_lock(zone)); |
| 215 | + spin_unlock_irq(&pgdat->lru_lock); |
| 210 | 216 | __munlock_isolated_page(page); |
| 211 | 217 | goto out; |
| 212 | 218 | } |
| 213 | 219 | __munlock_isolation_failed(page); |
| 214 | 220 | |
| 215 | 221 | unlock_out: |
| 216 | | - spin_unlock_irq(zone_lru_lock(zone)); |
| 222 | + spin_unlock_irq(&pgdat->lru_lock); |
| 217 | 223 | |
| 218 | 224 | out: |
| 219 | 225 | return nr_pages - 1; |
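
Here, and in the pagevec hunk below, the LRU lock is taken from the page's NUMA node (`pg_data_t`) instead of through the removed `zone_lru_lock()` helper; the lock lives in `struct pglist_data` since the LRU lists became per-node, and every zone reaches it through its owning node. Trimmed to the fields these hunks touch (a sketch, not the full definitions):

```c
typedef struct pglist_data {
	spinlock_t lru_lock;	/* one LRU lock per NUMA node */
	/* ... per-node LRU lists, zones, etc. ... */
} pg_data_t;

struct zone {
	struct pglist_data *zone_pgdat;	/* owning node */
	/* ... */
};

/* The removed helper was just this indirection: */
static inline spinlock_t *zone_lru_lock(struct zone *zone)
{
	return &zone->zone_pgdat->lru_lock;
}
```
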
| .. | .. |
| 298 | 304 | pagevec_init(&pvec_putback); |
| 299 | 305 | |
| 300 | 306 | /* Phase 1: page isolation */ |
| 301 | | - spin_lock_irq(zone_lru_lock(zone)); |
| 307 | + spin_lock_irq(&zone->zone_pgdat->lru_lock); |
| 302 | 308 | for (i = 0; i < nr; i++) { |
| 303 | 309 | struct page *page = pvec->pages[i]; |
| 304 | 310 | |
| .. | .. |
| 325 | 331 | pvec->pages[i] = NULL; |
| 326 | 332 | } |
| 327 | 333 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); |
| 328 | | - spin_unlock_irq(zone_lru_lock(zone)); |
| 334 | + spin_unlock_irq(&zone->zone_pgdat->lru_lock); |
| 329 | 335 | |
| 330 | 336 | /* Now we can release pins of pages that we are not munlocking */ |
| 331 | 337 | pagevec_release(&pvec_putback); |
| .. | .. |
| 381 | 387 | /* |
| 382 | 388 | * Initialize pte walk starting at the already pinned page where we |
| 383 | 389 | * are sure that there is a pte, as it was pinned under the same |
| 384 | | - * mmap_sem write op. |
| 390 | + * mmap_lock write op. |
| 385 | 391 | */ |
| 386 | 392 | pte = get_locked_pte(vma->vm_mm, start, &ptl); |
| 387 | 393 | /* Make sure we do not cross the page table boundary */ |
| .. | .. |
| 445 | 451 | void munlock_vma_pages_range(struct vm_area_struct *vma, |
| 446 | 452 | unsigned long start, unsigned long end) |
| 447 | 453 | { |
| 448 | | - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; |
| 454 | + vm_write_begin(vma); |
| 455 | + WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK); |
| 456 | + vm_write_end(vma); |
| 449 | 457 | |
| 450 | 458 | while (start < end) { |
| 451 | 459 | struct page *page; |
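
The `vm_write_begin()`/`vm_write_end()` pair (here and in mlock_fixup() below) comes from the speculative-page-fault patches carried in this tree: writers bracket VMA updates with a per-VMA sequence count, and `WRITE_ONCE()` keeps the `vm_flags` store from tearing, so lock-free readers can detect a concurrent update and retry. A minimal userspace analogue of that seqcount pattern, with hypothetical names (not the kernel's implementation):

```c
#include <stdatomic.h>

struct vma_sketch {
	atomic_uint seq;		/* odd while an update is in flight */
	_Atomic unsigned long vm_flags;
};

static void vm_write_begin_sketch(struct vma_sketch *v)
{
	atomic_fetch_add_explicit(&v->seq, 1, memory_order_acq_rel);
}

static void vm_write_end_sketch(struct vma_sketch *v)
{
	atomic_fetch_add_explicit(&v->seq, 1, memory_order_release);
}

/* A speculative reader retries until it sees a stable, even sequence. */
static unsigned long read_vm_flags_sketch(struct vma_sketch *v)
{
	unsigned int s1, s2;
	unsigned long flags;

	do {
		s1 = atomic_load_explicit(&v->seq, memory_order_acquire);
		flags = atomic_load_explicit(&v->vm_flags,
					     memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&v->seq, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);

	return flags;
}
```

The writer's two increments leave the count odd only inside the critical section, which is what the reader's `s1 & 1` test keys on.
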
| .. | .. |
| 463 | 471 | * has sneaked into the range, we won't oops here: great). |
| 464 | 472 | */ |
| 465 | 473 | page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); |
| 466 | | - |
| 467 | 474 | if (page && !IS_ERR(page)) { |
| 475 | + /* |
| 476 | + * munlock_vma_pages_range uses follow_page(FOLL_GET), |
| 477 | + * so it needs to use put_user_page, but the munlock |
| 478 | + * path is too complicated to handle each put site |
| 479 | + * correctly, so just unattribute the pages to avoid |
| 480 | + * false positives for now. |
| 481 | + */ |
| 482 | + reset_page_pinner(page, compound_order(page)); |
| 468 | 483 | if (PageTransTail(page)) { |
| 469 | 484 | VM_BUG_ON_PAGE(PageMlocked(page), page); |
| 470 | 485 | put_page(page); /* follow_page_mask() */ |
| .. | .. |
| 565 | 580 | mm->locked_vm += nr_pages; |
| 566 | 581 | |
| 567 | 582 | /* |
| 568 | | - * vm_flags is protected by the mmap_sem held in write mode. |
| 583 | + * vm_flags is protected by the mmap_lock held in write mode. |
| 569 | 584 | * It's okay if try_to_unmap_one unmaps a page just after we |
| 570 | 585 | * set VM_LOCKED, populate_vma_page_range will bring it back. |
| 571 | 586 | */ |
| 572 | | - |
| 573 | | - if (lock) |
| 574 | | - vma->vm_flags = newflags; |
| 575 | | - else |
| 587 | + if (lock) { |
| 588 | + vm_write_begin(vma); |
| 589 | + WRITE_ONCE(vma->vm_flags, newflags); |
| 590 | + vm_write_end(vma); |
| 591 | + } else |
| 576 | 592 | munlock_vma_pages_range(vma, start, end); |
| 577 | 593 | |
| 578 | 594 | out: |
| .. | .. |
| 686 | 702 | lock_limit >>= PAGE_SHIFT; |
| 687 | 703 | locked = len >> PAGE_SHIFT; |
| 688 | 704 | |
| 689 | | - if (down_write_killable(&current->mm->mmap_sem)) |
| 705 | + if (mmap_write_lock_killable(current->mm)) |
| 690 | 706 | return -EINTR; |
| 691 | 707 | |
| 692 | 708 | locked += current->mm->locked_vm; |
| .. | .. |
| 705 | 721 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
| 706 | 722 | error = apply_vma_lock_flags(start, len, flags); |
| 707 | 723 | |
| 708 | | - up_write(&current->mm->mmap_sem); |
| 724 | + mmap_write_unlock(current->mm); |
| 709 | 725 | if (error) |
| 710 | 726 | return error; |
| 711 | 727 | |
| .. | .. |
| 742 | 758 | len = PAGE_ALIGN(len + (offset_in_page(start))); |
| 743 | 759 | start &= PAGE_MASK; |
| 744 | 760 | |
| 745 | | - if (down_write_killable(&current->mm->mmap_sem)) |
| 761 | + if (mmap_write_lock_killable(current->mm)) |
| 746 | 762 | return -EINTR; |
| 747 | 763 | ret = apply_vma_lock_flags(start, len, 0); |
| 748 | | - up_write(&current->mm->mmap_sem); |
| 764 | + mmap_write_unlock(current->mm); |
| 749 | 765 | |
| 750 | 766 | return ret; |
| 751 | 767 | } |
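
These two hunks cover the mlock(2) and munlock(2) entry points, which now take the mmap lock through the wrapper API. Nothing changes for userspace; for reference, a small self-contained caller that exercises both paths (plain POSIX, nothing here is specific to this patch):

```c
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 4 * page;
	void *buf;

	if (posix_memalign(&buf, page, len))
		return 1;

	/* Enters the sys_mlock path above under the mmap write lock. */
	if (mlock(buf, len) == -1) {
		perror("mlock");	/* ENOMEM if RLIMIT_MEMLOCK is too low */
		return 1;
	}

	/* ... buf stays resident until unlocked ... */

	munlock(buf, len);		/* the sys_munlock hunk above */
	free(buf);
	return 0;
}
```
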
| .. | .. |
| 801 | 817 | unsigned long lock_limit; |
| 802 | 818 | int ret; |
| 803 | 819 | |
| 804 | | - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) |
| 820 | + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || |
| 821 | + flags == MCL_ONFAULT) |
| 805 | 822 | return -EINVAL; |
| 806 | 823 | |
| 807 | 824 | if (!can_do_mlock()) |
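
The added `flags == MCL_ONFAULT` test closes a gap in mlockall(2) flag validation: MCL_ONFAULT only qualifies MCL_CURRENT or MCL_FUTURE, so passing it alone is now rejected with EINVAL, matching the documented behavior. Observable from userspace:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* MCL_ONFAULT alone hits the new check and fails. */
	if (mlockall(MCL_ONFAULT) == -1)
		printf("mlockall(MCL_ONFAULT): %s (expect EINVAL)\n",
		       strerror(errno));

	/* Combined with MCL_FUTURE it is valid: future mappings are
	 * locked, but their pages are only pinned once faulted in. */
	if (mlockall(MCL_FUTURE | MCL_ONFAULT) == 0) {
		puts("MCL_FUTURE | MCL_ONFAULT: ok");
		munlockall();
	} else {
		perror("mlockall");	/* EPERM if RLIMIT_MEMLOCK is 0
					 * and we lack CAP_IPC_LOCK */
	}
	return 0;
}
```
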
| .. | .. |
| 810 | 827 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 811 | 828 | lock_limit >>= PAGE_SHIFT; |
| 812 | 829 | |
| 813 | | - if (down_write_killable(&current->mm->mmap_sem)) |
| 830 | + if (mmap_write_lock_killable(current->mm)) |
| 814 | 831 | return -EINTR; |
| 815 | 832 | |
| 816 | 833 | ret = -ENOMEM; |
| 817 | 834 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
| 818 | 835 | capable(CAP_IPC_LOCK)) |
| 819 | 836 | ret = apply_mlockall_flags(flags); |
| 820 | | - up_write(&current->mm->mmap_sem); |
| 837 | + mmap_write_unlock(current->mm); |
| 821 | 838 | if (!ret && (flags & MCL_CURRENT)) |
| 822 | 839 | mm_populate(0, TASK_SIZE); |
| 823 | 840 | |
| .. | .. |
| 828 | 845 | { |
| 829 | 846 | int ret; |
| 830 | 847 | |
| 831 | | - if (down_write_killable(&current->mm->mmap_sem)) |
| 848 | + if (mmap_write_lock_killable(current->mm)) |
| 832 | 849 | return -EINTR; |
| 833 | 850 | ret = apply_mlockall_flags(0); |
| 834 | | - up_write(&current->mm->mmap_sem); |
| 851 | + mmap_write_unlock(current->mm); |
| 835 | 852 | return ret; |
| 836 | 853 | } |
| 837 | 854 | |
|---|