.. | ..
17 | 17 | #include <linux/mempolicy.h>
18 | 18 | #include <linux/syscalls.h>
19 | 19 | #include <linux/sched.h>
 | 20 | +#include <linux/page_pinner.h>
20 | 21 | #include <linux/export.h>
21 | 22 | #include <linux/rmap.h>
22 | 23 | #include <linux/mmzone.h>
.. | ..
49 | 50 | * When lazy mlocking via vmscan, it is important to ensure that the
50 | 51 | * vma's VM_LOCKED status is not concurrently being modified, otherwise we
51 | 52 | * may have mlocked a page that is being munlocked. So lazy mlock must take
52 | | - * the mmap_sem for read, and verify that the vma really is locked
 | 53 | + * the mmap_lock for read, and verify that the vma really is locked
53 | 54 | * (see mm/rmap.c).
54 | 55 | */
55 | 56 |
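The comment in this hunk states the locking rule being renamed: lazy mlock from reclaim must hold the mmap_lock for read and re-check VM_LOCKED before treating a page as mlocked. A minimal sketch of that read-side pattern, using the mmap_lock wrappers this series switches to (illustrative only; the real check lives in mm/rmap.c and the helper name below is mine):

```c
#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Illustrative only: re-check VM_LOCKED with the mmap_lock held for read. */
static bool vma_still_locked(struct mm_struct *mm, struct vm_area_struct *vma)
{
	bool locked;

	if (!mmap_read_trylock(mm))	/* reclaim context: never block here */
		return false;
	locked = !!(vma->vm_flags & VM_LOCKED);
	mmap_read_unlock(mm);
	return locked;
}
```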
.. | ..
58 | 59 | */
59 | 60 | void clear_page_mlock(struct page *page)
60 | 61 | {
 | 62 | + int nr_pages;
 | 63 | +
61 | 64 | if (!TestClearPageMlocked(page))
62 | 65 | return;
63 | 66 |
64 | | - mod_zone_page_state(page_zone(page), NR_MLOCK,
65 | | - -hpage_nr_pages(page));
66 | | - count_vm_event(UNEVICTABLE_PGCLEARED);
 | 67 | + nr_pages = thp_nr_pages(page);
 | 68 | + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 | 69 | + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
67 | 70 | /*
68 | 71 | * The previous TestClearPageMlocked() corresponds to the smp_mb()
69 | 72 | * in __pagevec_lru_add_fn().
.. | ..
77 | 80 | * We lost the race. the page already moved to evictable list.
78 | 81 | */
79 | 82 | if (PageUnevictable(page))
80 | | - count_vm_event(UNEVICTABLE_PGSTRANDED);
 | 83 | + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
81 | 84 | }
82 | 85 | }
83 | 86 |
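The two hunks above (and several below) replace hpage_nr_pages()/count_vm_event() with thp_nr_pages()/count_vm_events(), so a THP contributes one count per base page rather than a single event. Since count_vm_event(e) is just count_vm_events(e, 1), the change scales the counters without altering their meaning. A minimal sketch of the batched pattern; the helper name mlock_account_clear is hypothetical:

```c
#include <linux/mm.h>
#include <linux/vmstat.h>

/*
 * Illustrative sketch: account NR_MLOCK and the vm event by the number of
 * base pages backing @page (1 for an order-0 page, HPAGE_PMD_NR for a THP).
 */
static void mlock_account_clear(struct page *page)	/* hypothetical helper */
{
	int nr_pages = thp_nr_pages(page);

	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
```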
.. | ..
94 | 97 | VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
95 | 98 |
96 | 99 | if (!TestSetPageMlocked(page)) {
97 | | - mod_zone_page_state(page_zone(page), NR_MLOCK,
98 | | - hpage_nr_pages(page));
99 | | - count_vm_event(UNEVICTABLE_PGMLOCKED);
 | 100 | + int nr_pages = thp_nr_pages(page);
 | 101 | +
 | 102 | + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
 | 103 | + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
100 | 104 | if (!isolate_lru_page(page))
101 | 105 | putback_lru_page(page);
102 | 106 | }
.. | ..
139 | 143 |
140 | 144 | /* Did try_to_unlock() succeed or punt? */
141 | 145 | if (!PageMlocked(page))
142 | | - count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 | 146 | + count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
143 | 147 |
144 | 148 | putback_lru_page(page);
145 | 149 | }
.. | ..
155 | 159 | */
156 | 160 | static void __munlock_isolation_failed(struct page *page)
157 | 161 | {
 | 162 | + int nr_pages = thp_nr_pages(page);
 | 163 | +
158 | 164 | if (PageUnevictable(page))
159 | | - __count_vm_event(UNEVICTABLE_PGSTRANDED);
 | 165 | + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
160 | 166 | else
161 | | - __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 | 167 | + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
162 | 168 | }
163 | 169 |
164 | 170 | /**
.. | ..
182 | 188 | unsigned int munlock_vma_page(struct page *page)
183 | 189 | {
184 | 190 | int nr_pages;
185 | | - struct zone *zone = page_zone(page);
 | 191 | + pg_data_t *pgdat = page_pgdat(page);
186 | 192 |
187 | 193 | /* For try_to_munlock() and to serialize with page migration */
188 | 194 | BUG_ON(!PageLocked(page));
.. | ..
192 | 198 | /*
193 | 199 | * Serialize with any parallel __split_huge_page_refcount() which
194 | 200 | * might otherwise copy PageMlocked to part of the tail pages before
195 | | - * we clear it in the head page. It also stabilizes hpage_nr_pages().
 | 201 | + * we clear it in the head page. It also stabilizes thp_nr_pages().
196 | 202 | */
197 | | - spin_lock_irq(zone_lru_lock(zone));
 | 203 | + spin_lock_irq(&pgdat->lru_lock);
198 | 204 |
199 | 205 | if (!TestClearPageMlocked(page)) {
200 | 206 | /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
.. | ..
202 | 208 | goto unlock_out;
203 | 209 | }
204 | 210 |
205 | | - nr_pages = hpage_nr_pages(page);
206 | | - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
 | 211 | + nr_pages = thp_nr_pages(page);
 | 212 | + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
207 | 213 |
208 | 214 | if (__munlock_isolate_lru_page(page, true)) {
209 | | - spin_unlock_irq(zone_lru_lock(zone));
 | 215 | + spin_unlock_irq(&pgdat->lru_lock);
210 | 216 | __munlock_isolated_page(page);
211 | 217 | goto out;
212 | 218 | }
213 | 219 | __munlock_isolation_failed(page);
214 | 220 |
215 | 221 | unlock_out:
216 | | - spin_unlock_irq(zone_lru_lock(zone));
 | 222 | + spin_unlock_irq(&pgdat->lru_lock);
217 | 223 |
218 | 224 | out:
219 | 225 | return nr_pages - 1;
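The munlock_vma_page() hunks above switch from the per-zone LRU lock accessor to the per-node lock embedded in pg_data_t. A minimal sketch of the lock scope these hunks converge on, assuming a kernel generation where the LRU lock still lives in pg_data_t (it later moved again, into the lruvec):

```c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

/* Illustrative only: take the per-node LRU lock that covers @page. */
static void with_lru_lock(struct page *page)
{
	pg_data_t *pgdat = page_pgdat(page);

	spin_lock_irq(&pgdat->lru_lock);
	/* ... operate on the page's LRU state, e.g. isolate it ... */
	spin_unlock_irq(&pgdat->lru_lock);
}
```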
.. | ..
298 | 304 | pagevec_init(&pvec_putback);
299 | 305 |
300 | 306 | /* Phase 1: page isolation */
301 | | - spin_lock_irq(zone_lru_lock(zone));
 | 307 | + spin_lock_irq(&zone->zone_pgdat->lru_lock);
302 | 308 | for (i = 0; i < nr; i++) {
303 | 309 | struct page *page = pvec->pages[i];
304 | 310 |
.. | ..
325 | 331 | pvec->pages[i] = NULL;
326 | 332 | }
327 | 333 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
328 | | - spin_unlock_irq(zone_lru_lock(zone));
 | 334 | + spin_unlock_irq(&zone->zone_pgdat->lru_lock);
329 | 335 |
330 | 336 | /* Now we can release pins of pages that we are not munlocking */
331 | 337 | pagevec_release(&pvec_putback);
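In this __munlock_pagevec() hunk the NR_MLOCK adjustment is accumulated in delta_munlocked and applied once while the LRU lock is still held, instead of once per page. A small sketch of that batching idea (generic, not the function's exact control flow; munlock_batch is a name of mine):

```c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagevec.h>

/* Illustrative only: batch a per-zone counter update under one lock hold. */
static void munlock_batch(struct zone *zone, struct pagevec *pvec)
{
	int delta = 0;
	int i;

	spin_lock_irq(&zone->zone_pgdat->lru_lock);
	for (i = 0; i < pagevec_count(pvec); i++) {
		if (TestClearPageMlocked(pvec->pages[i]))
			delta--;		/* one fewer mlocked page */
	}
	__mod_zone_page_state(zone, NR_MLOCK, delta);
	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
}
```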
.. | ..
381 | 387 | /*
382 | 388 | * Initialize pte walk starting at the already pinned page where we
383 | 389 | * are sure that there is a pte, as it was pinned under the same
384 | | - * mmap_sem write op.
 | 390 | + * mmap_lock write op.
385 | 391 | */
386 | 392 | pte = get_locked_pte(vma->vm_mm, start, &ptl);
387 | 393 | /* Make sure we do not cross the page table boundary */
.. | ..
445 | 451 | void munlock_vma_pages_range(struct vm_area_struct *vma,
446 | 452 | unsigned long start, unsigned long end)
447 | 453 | {
448 | | - vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 | 454 | + vm_write_begin(vma);
 | 455 | + WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
 | 456 | + vm_write_end(vma);
449 | 457 |
450 | 458 | while (start < end) {
451 | 459 | struct page *page;
.. | ..
463 | 471 | * has sneaked into the range, we won't oops here: great).
464 | 472 | */
465 | 473 | page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
466 | | -
467 | 474 | if (page && !IS_ERR(page)) {
 | 475 | + /*
 | 476 | + * munlock_vma_pages_range uses follow_page(FOLL_GET)
 | 477 | + * so it need to use put_user_page but the munlock
 | 478 | + * path is quite complicated to deal with each put
 | 479 | + * sites correctly so just unattribute them to avoid
 | 480 | + * false positive at this moment.
 | 481 | + */
 | 482 | + reset_page_pinner(page, compound_order(page));
468 | 483 | if (PageTransTail(page)) {
469 | 484 | VM_BUG_ON_PAGE(PageMlocked(page), page);
470 | 485 | put_page(page); /* follow_page_mask() */
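The added comment and reset_page_pinner() call tie in with the <linux/page_pinner.h> include added at the top: page_pinner is a pin-tracking debug feature carried in this tree (not mainline), and since the munlock path releases follow_page(FOLL_GET) references with plain put_page(), the pin is "unattributed" to avoid false positives. A minimal sketch of the underlying get/put pattern, treating the page_pinner call as an opaque hook whose exact semantics are an assumption here:

```c
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/page_pinner.h>

/* Illustrative only: look up one page with a reference and drop it again. */
static void peek_at_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
	if (!page || IS_ERR(page))
		return;

	/* Debug hook from this tree: stop attributing this pin to us. */
	reset_page_pinner(page, compound_order(page));

	/* ... inspect the page ... */

	put_page(page);		/* drop the FOLL_GET reference */
}
```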
.. | ..
565 | 580 | mm->locked_vm += nr_pages;
566 | 581 |
567 | 582 | /*
568 | | - * vm_flags is protected by the mmap_sem held in write mode.
 | 583 | + * vm_flags is protected by the mmap_lock held in write mode.
569 | 584 | * It's okay if try_to_unmap_one unmaps a page just after we
570 | 585 | * set VM_LOCKED, populate_vma_page_range will bring it back.
571 | 586 | */
572 | | -
573 | | - if (lock)
574 | | - vma->vm_flags = newflags;
575 | | - else
 | 587 | + if (lock) {
 | 588 | + vm_write_begin(vma);
 | 589 | + WRITE_ONCE(vma->vm_flags, newflags);
 | 590 | + vm_write_end(vma);
 | 591 | + } else
576 | 592 | munlock_vma_pages_range(vma, start, end);
577 | 593 |
578 | 594 | out:
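Both mlock_fixup() here and munlock_vma_pages_range() earlier now wrap vm_flags updates in vm_write_begin()/vm_write_end() and store through WRITE_ONCE(). These helpers are not in mainline; they appear to come from the speculative page fault series carried in this tree, where they mark a per-VMA update window so lockless readers can detect a racing modification and retry. A minimal sketch of the writer-side pattern, with the helpers treated as opaque markers (the function name vma_set_flags is mine):

```c
#include <linux/mm.h>

/*
 * Illustrative only: publish a vm_flags update so a speculative (lockless)
 * reader either sees the old value, the new value, or notices the update
 * window and falls back to the locked path. Assumes the vm_write_begin()/
 * vm_write_end() helpers provided by this tree.
 */
static void vma_set_flags(struct vm_area_struct *vma, unsigned long newflags)
{
	vm_write_begin(vma);			/* open the vma update window */
	WRITE_ONCE(vma->vm_flags, newflags);	/* single, tear-free store */
	vm_write_end(vma);			/* readers may trust the value again */
}
```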
.. | ..
686 | 702 | lock_limit >>= PAGE_SHIFT;
687 | 703 | locked = len >> PAGE_SHIFT;
688 | 704 |
689 | | - if (down_write_killable(&current->mm->mmap_sem))
 | 705 | + if (mmap_write_lock_killable(current->mm))
690 | 706 | return -EINTR;
691 | 707 |
692 | 708 | locked += current->mm->locked_vm;
.. | ..
705 | 721 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
706 | 722 | error = apply_vma_lock_flags(start, len, flags);
707 | 723 |
708 | | - up_write(&current->mm->mmap_sem);
 | 724 | + mmap_write_unlock(current->mm);
709 | 725 | if (error)
710 | 726 | return error;
711 | 727 |
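The syscall paths above, and the remaining hunks below, replace raw rwsem calls on mm->mmap_sem with the mmap_lock wrapper API (mmap_write_lock_killable(), mmap_write_unlock(), and friends) introduced by the mmap_sem to mmap_lock conversion. A minimal sketch of the lock-scope pattern these hunks converge on, with the locked work elided:

```c
#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Illustrative only: take the mmap_lock for write around an mm update. */
static int update_mm(struct mm_struct *mm)
{
	int ret;

	if (mmap_write_lock_killable(mm))	/* interruptible by fatal signals */
		return -EINTR;

	ret = 0;	/* ... modify VMAs / counters while the lock is held ... */

	mmap_write_unlock(mm);
	return ret;
}
```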
.. | ..
742 | 758 | len = PAGE_ALIGN(len + (offset_in_page(start)));
743 | 759 | start &= PAGE_MASK;
744 | 760 |
745 | | - if (down_write_killable(&current->mm->mmap_sem))
 | 761 | + if (mmap_write_lock_killable(current->mm))
746 | 762 | return -EINTR;
747 | 763 | ret = apply_vma_lock_flags(start, len, 0);
748 | | - up_write(&current->mm->mmap_sem);
 | 764 | + mmap_write_unlock(current->mm);
749 | 765 |
750 | 766 | return ret;
751 | 767 | }
.. | ..
801 | 817 | unsigned long lock_limit;
802 | 818 | int ret;
803 | 819 |
804 | | - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
 | 820 | + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
 | 821 | + flags == MCL_ONFAULT)
805 | 822 | return -EINVAL;
806 | 823 |
807 | 824 | if (!can_do_mlock())
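This hunk tightens mlockall() flag validation: MCL_ONFAULT on its own is now rejected with -EINVAL, since it only modifies MCL_CURRENT/MCL_FUTURE and means nothing by itself. A small userspace check of that behaviour (a sketch; requires headers that define MCL_ONFAULT, i.e. Linux 4.4+):

```c
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* MCL_ONFAULT alone is invalid: expect -1 with errno == EINVAL. */
	if (mlockall(MCL_ONFAULT) == -1 && errno == EINVAL)
		printf("MCL_ONFAULT alone rejected as expected\n");

	/*
	 * Combined with MCL_CURRENT it is a valid request (it may still fail
	 * with ENOMEM or EPERM depending on RLIMIT_MEMLOCK and privileges).
	 */
	if (mlockall(MCL_CURRENT | MCL_ONFAULT) == 0) {
		printf("mlockall(MCL_CURRENT | MCL_ONFAULT) succeeded\n");
		munlockall();
	} else {
		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
	}
	return 0;
}
```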
.. | ..
810 | 827 | lock_limit = rlimit(RLIMIT_MEMLOCK);
811 | 828 | lock_limit >>= PAGE_SHIFT;
812 | 829 |
813 | | - if (down_write_killable(&current->mm->mmap_sem))
 | 830 | + if (mmap_write_lock_killable(current->mm))
814 | 831 | return -EINTR;
815 | 832 |
816 | 833 | ret = -ENOMEM;
817 | 834 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
818 | 835 | capable(CAP_IPC_LOCK))
819 | 836 | ret = apply_mlockall_flags(flags);
820 | | - up_write(&current->mm->mmap_sem);
 | 837 | + mmap_write_unlock(current->mm);
821 | 838 | if (!ret && (flags & MCL_CURRENT))
822 | 839 | mm_populate(0, TASK_SIZE);
823 | 840 |
.. | ..
828 | 845 | {
829 | 846 | int ret;
830 | 847 |
831 | | - if (down_write_killable(&current->mm->mmap_sem))
 | 848 | + if (mmap_write_lock_killable(current->mm))
832 | 849 | return -EINTR;
833 | 850 | ret = apply_mlockall_flags(0);
834 | | - up_write(&current->mm->mmap_sem);
 | 851 | + mmap_write_unlock(current->mm);
835 | 852 | return ret;
836 | 853 | }
837 | 854 |
---|