2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/mlock.c
@@ -17,6 +17,7 @@
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
+#include <linux/page_pinner.h>
 #include <linux/export.h>
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
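
Note: <linux/page_pinner.h> is not a mainline header; as far as I can tell it comes from the page pinner debug facility carried in the Android common kernel (CONFIG_PAGE_PINNER), which records who pinned a page so that long-held pins can be reported. The include is what lets the reset_page_pinner() call added further down in this patch build.
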
@@ -49,7 +50,7 @@
  * When lazy mlocking via vmscan, it is important to ensure that the
  * vma's VM_LOCKED status is not concurrently being modified, otherwise we
  * may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_sem for read, and verify that the vma really is locked
+ * the mmap_lock for read, and verify that the vma really is locked
  * (see mm/rmap.c).
  */

@@ -58,12 +59,14 @@
  */
 void clear_page_mlock(struct page *page)
 {
+        int nr_pages;
+
         if (!TestClearPageMlocked(page))
                 return;

-        mod_zone_page_state(page_zone(page), NR_MLOCK,
-                            -hpage_nr_pages(page));
-        count_vm_event(UNEVICTABLE_PGCLEARED);
+        nr_pages = thp_nr_pages(page);
+        mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+        count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
         /*
          * The previous TestClearPageMlocked() corresponds to the smp_mb()
          * in __pagevec_lru_add_fn().
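
The hpage_nr_pages() -> thp_nr_pages() change is the upstream rename of the same helper; the functional change in this hunk is that the UNEVICTABLE_* events are now bumped by the number of base pages of a (possibly huge) page via count_vm_events() instead of once per call. For orientation, on kernels of this vintage thp_nr_pages() is roughly the following sketch (simplified from <linux/huge_mm.h>; with CONFIG_TRANSPARENT_HUGEPAGE disabled it is just a stub returning 1):

        static inline int thp_nr_pages(struct page *page)
        {
                VM_BUG_ON_PGFLAGS(PageTail(page), page);
                if (PageHead(page))
                        return HPAGE_PMD_NR;    /* 512 with 4K pages and 2M PMDs */
                return 1;
        }

So a 2M THP that loses PG_mlocked now decrements NR_MLOCK and bumps UNEVICTABLE_PGCLEARED by 512, matching what the NR_MLOCK accounting already did.
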
@@ -77,7 +80,7 @@
                  * We lost the race. the page already moved to evictable list.
                  */
                 if (PageUnevictable(page))
-                        count_vm_event(UNEVICTABLE_PGSTRANDED);
+                        count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
         }
 }

@@ -94,9 +97,10 @@
         VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

         if (!TestSetPageMlocked(page)) {
-                mod_zone_page_state(page_zone(page), NR_MLOCK,
-                                    hpage_nr_pages(page));
-                count_vm_event(UNEVICTABLE_PGMLOCKED);
+                int nr_pages = thp_nr_pages(page);
+
+                mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+                count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
                 if (!isolate_lru_page(page))
                         putback_lru_page(page);
         }
@@ -139,7 +143,7 @@

         /* Did try_to_unlock() succeed or punt? */
         if (!PageMlocked(page))
-                count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+                count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));

         putback_lru_page(page);
 }
@@ -155,10 +159,12 @@
  */
 static void __munlock_isolation_failed(struct page *page)
 {
+        int nr_pages = thp_nr_pages(page);
+
         if (PageUnevictable(page))
-                __count_vm_event(UNEVICTABLE_PGSTRANDED);
+                __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
         else
-                __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+                __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
 }

 /**
@@ -182,7 +188,7 @@
 unsigned int munlock_vma_page(struct page *page)
 {
         int nr_pages;
-        struct zone *zone = page_zone(page);
+        pg_data_t *pgdat = page_pgdat(page);

         /* For try_to_munlock() and to serialize with page migration */
         BUG_ON(!PageLocked(page));
@@ -192,9 +198,9 @@
         /*
          * Serialize with any parallel __split_huge_page_refcount() which
          * might otherwise copy PageMlocked to part of the tail pages before
-         * we clear it in the head page. It also stabilizes hpage_nr_pages().
+         * we clear it in the head page. It also stabilizes thp_nr_pages().
          */
-        spin_lock_irq(zone_lru_lock(zone));
+        spin_lock_irq(&pgdat->lru_lock);

         if (!TestClearPageMlocked(page)) {
                 /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -202,18 +208,18 @@
                 goto unlock_out;
         }

-        nr_pages = hpage_nr_pages(page);
-        __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+        nr_pages = thp_nr_pages(page);
+        __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);

         if (__munlock_isolate_lru_page(page, true)) {
-                spin_unlock_irq(zone_lru_lock(zone));
+                spin_unlock_irq(&pgdat->lru_lock);
                 __munlock_isolated_page(page);
                 goto out;
         }
         __munlock_isolation_failed(page);

 unlock_out:
-        spin_unlock_irq(zone_lru_lock(zone));
+        spin_unlock_irq(&pgdat->lru_lock);

 out:
         return nr_pages - 1;
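
The zone -> pgdat switch tracks the mainline move of the LRU lock from struct zone to the node's pg_data_t; the removed zone_lru_lock() helper was only an accessor for that per-node lock, roughly:

        static inline spinlock_t *zone_lru_lock(struct zone *zone)
        {
                return &zone->zone_pgdat->lru_lock;
        }

So spin_lock_irq(&pgdat->lru_lock) with pgdat = page_pgdat(page) takes exactly the same lock as the old spin_lock_irq(zone_lru_lock(zone)); the hunks below in __munlock_pagevec() simply open-code the same access through zone->zone_pgdat.
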
@@ -298,7 +304,7 @@
         pagevec_init(&pvec_putback);

         /* Phase 1: page isolation */
-        spin_lock_irq(zone_lru_lock(zone));
+        spin_lock_irq(&zone->zone_pgdat->lru_lock);
         for (i = 0; i < nr; i++) {
                 struct page *page = pvec->pages[i];

@@ -325,7 +331,7 @@
                 pvec->pages[i] = NULL;
         }
         __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-        spin_unlock_irq(zone_lru_lock(zone));
+        spin_unlock_irq(&zone->zone_pgdat->lru_lock);

         /* Now we can release pins of pages that we are not munlocking */
         pagevec_release(&pvec_putback);
@@ -381,7 +387,7 @@
         /*
          * Initialize pte walk starting at the already pinned page where we
          * are sure that there is a pte, as it was pinned under the same
-         * mmap_sem write op.
+         * mmap_lock write op.
          */
         pte = get_locked_pte(vma->vm_mm, start, &ptl);
         /* Make sure we do not cross the page table boundary */
@@ -445,7 +451,9 @@
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
 {
-        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+        vm_write_begin(vma);
+        WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
+        vm_write_end(vma);

         while (start < end) {
                 struct page *page;
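
The vm_write_begin()/vm_write_end() pair and the WRITE_ONCE() store come from the speculative page fault (SPF) support carried in this tree: every vm_flags update is bracketed by a per-VMA sequence count so a lockless speculative fault can notice that the VMA changed underneath it and fall back to the mmap_lock path. A minimal sketch of that protocol, assuming the SPF series' vma->vm_sequence seqcount (the field name is an assumption here, it is not visible in this diff):

        /* Writer side, holding mmap_lock for write, as in the hunk above. */
        static inline void vm_write_begin(struct vm_area_struct *vma)
        {
                write_seqcount_begin(&vma->vm_sequence);        /* assumed field */
        }

        static inline void vm_write_end(struct vm_area_struct *vma)
        {
                write_seqcount_end(&vma->vm_sequence);          /* assumed field */
        }

The WRITE_ONCE() matters because the speculative reader samples vm_flags with READ_ONCE() without holding mmap_lock; a plain store could legally be torn or reordered by the compiler.
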
@@ -463,8 +471,15 @@
                  * has sneaked into the range, we won't oops here: great).
                  */
                 page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
-
                 if (page && !IS_ERR(page)) {
+                        /*
+                         * munlock_vma_pages_range() uses follow_page(FOLL_GET),
+                         * so it should use put_user_page(), but the munlock
+                         * path is too convoluted to convert each put site
+                         * correctly, so just unattribute these pages to avoid
+                         * false positives for now.
+                         */
+                        reset_page_pinner(page, compound_order(page));
                         if (PageTransTail(page)) {
                                 VM_BUG_ON_PAGE(PageMlocked(page), page);
                                 put_page(page); /* follow_page_mask() */
565580 mm->locked_vm += nr_pages;
566581
567582 /*
568
- * vm_flags is protected by the mmap_sem held in write mode.
583
+ * vm_flags is protected by the mmap_lock held in write mode.
569584 * It's okay if try_to_unmap_one unmaps a page just after we
570585 * set VM_LOCKED, populate_vma_page_range will bring it back.
571586 */
572
-
573
- if (lock)
574
- vma->vm_flags = newflags;
575
- else
587
+ if (lock) {
588
+ vm_write_begin(vma);
589
+ WRITE_ONCE(vma->vm_flags, newflags);
590
+ vm_write_end(vma);
591
+ } else
576592 munlock_vma_pages_range(vma, start, end);
577593
578594 out:
@@ -686,7 +702,7 @@
         lock_limit >>= PAGE_SHIFT;
         locked = len >> PAGE_SHIFT;

-        if (down_write_killable(&current->mm->mmap_sem))
+        if (mmap_write_lock_killable(current->mm))
                 return -EINTR;

         locked += current->mm->locked_vm;
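
The down_write_killable()/up_write() changes here and in the hunks below are the mechanical mmap_sem -> mmap_lock conversion: the rw_semaphore in struct mm_struct was renamed to mmap_lock and callers now go through wrapper helpers instead of using the rwsem API directly. Ignoring the lock-instrumentation hooks later kernels add, the wrappers used here reduce to this simplified sketch of <linux/mmap_lock.h>:

        static inline int mmap_write_lock_killable(struct mm_struct *mm)
        {
                return down_write_killable(&mm->mmap_lock);
        }

        static inline void mmap_write_unlock(struct mm_struct *mm)
        {
                up_write(&mm->mmap_lock);
        }

So the error handling is unchanged: a fatal signal received while waiting for the lock still makes the syscall return -EINTR.
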
@@ -705,7 +721,7 @@
         if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                 error = apply_vma_lock_flags(start, len, flags);

-        up_write(&current->mm->mmap_sem);
+        mmap_write_unlock(current->mm);
         if (error)
                 return error;

@@ -742,10 +758,10 @@
         len = PAGE_ALIGN(len + (offset_in_page(start)));
         start &= PAGE_MASK;

-        if (down_write_killable(&current->mm->mmap_sem))
+        if (mmap_write_lock_killable(current->mm))
                 return -EINTR;
         ret = apply_vma_lock_flags(start, len, 0);
-        up_write(&current->mm->mmap_sem);
+        mmap_write_unlock(current->mm);

         return ret;
 }
@@ -801,7 +817,8 @@
         unsigned long lock_limit;
         int ret;

-        if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
+        if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
+            flags == MCL_ONFAULT)
                 return -EINVAL;

         if (!can_do_mlock())
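
The added flags == MCL_ONFAULT test makes mlockall() reject MCL_ONFAULT on its own with -EINVAL, since on-fault locking is only meaningful in combination with MCL_CURRENT and/or MCL_FUTURE; this mirrors the equivalent upstream fix. A small user-space check of the behaviour (needs MCL_ONFAULT support, i.e. Linux 4.4+ and a libc that defines it; the second call can still fail with ENOMEM or EPERM if RLIMIT_MEMLOCK is small):

        #include <errno.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        int main(void)
        {
                /* MCL_ONFAULT alone: rejected by the check above. */
                if (mlockall(MCL_ONFAULT) == -1)
                        printf("MCL_ONFAULT alone -> %s (EINVAL expected)\n",
                               strerror(errno));

                /* Valid combination: lock current mappings, but fault the
                 * pages in lazily as they are first touched. */
                if (mlockall(MCL_CURRENT | MCL_ONFAULT) == 0) {
                        printf("MCL_CURRENT | MCL_ONFAULT -> ok\n");
                        munlockall();
                } else {
                        printf("MCL_CURRENT | MCL_ONFAULT -> %s\n", strerror(errno));
                }
                return 0;
        }
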
@@ -810,14 +827,14 @@
         lock_limit = rlimit(RLIMIT_MEMLOCK);
         lock_limit >>= PAGE_SHIFT;

-        if (down_write_killable(&current->mm->mmap_sem))
+        if (mmap_write_lock_killable(current->mm))
                 return -EINTR;

         ret = -ENOMEM;
         if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
             capable(CAP_IPC_LOCK))
                 ret = apply_mlockall_flags(flags);
-        up_write(&current->mm->mmap_sem);
+        mmap_write_unlock(current->mm);
         if (!ret && (flags & MCL_CURRENT))
                 mm_populate(0, TASK_SIZE);

@@ -828,10 +845,10 @@
 {
         int ret;

-        if (down_write_killable(&current->mm->mmap_sem))
+        if (mmap_write_lock_killable(current->mm))
                 return -EINTR;
         ret = apply_mlockall_flags(0);
-        up_write(&current->mm->mmap_sem);
+        mmap_write_unlock(current->mm);
         return ret;
 }
