2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
--- a/kernel/fs/proc/task_mmu.c
+++ b/kernel/fs/proc/task_mmu.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
@@ -59,7 +59,7 @@
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
-	SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
 	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
 	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
 	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -147,8 +147,8 @@
 		long pages_pinned;
 		struct page *page;
 
-		pages_pinned = get_user_pages_remote(current, mm,
-				page_start_vaddr, 1, 0, &page, NULL, NULL);
+		pages_pinned = get_user_pages_remote(mm, page_start_vaddr, 1, 0,
+						     &page, NULL, NULL);
 		if (pages_pinned < 1) {
 			seq_puts(m, "<fault>]");
 			return;
@@ -159,7 +159,7 @@
 		write_len = strnlen(kaddr + page_offset, len);
 		seq_write(m, kaddr + page_offset, write_len);
 		kunmap(page);
-		put_page(page);
+		put_user_page(page);
 
 		/* if strnlen hit a null terminator then we're done */
 		if (write_len != len)
@@ -173,38 +173,14 @@
 	seq_putc(m, ']');
 }
 
-static void vma_stop(struct proc_maps_private *priv)
-{
-	struct mm_struct *mm = priv->mm;
-
-	release_task_mempolicy(priv);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-}
-
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
-	if (vma == priv->tail_vma)
-		return NULL;
-	return vma->vm_next ?: priv->tail_vma;
-}
-
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
-{
-	if (m->count < m->size) /* vma is copied successfully */
-		m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
-}
-
 static void *m_start(struct seq_file *m, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
-	unsigned long last_addr = m->version;
+	unsigned long last_addr = *ppos;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	unsigned int pos = *ppos;
 
-	/* See m_cache_vma(). Zero at the start or after lseek. */
+	/* See m_next(). Zero at the start or after lseek. */
 	if (last_addr == -1UL)
 		return NULL;
 
@@ -213,64 +189,59 @@
 		return ERR_PTR(-ESRCH);
 
 	mm = priv->mm;
-	if (!mm || !mmget_not_zero(mm))
+	if (!mm || !mmget_not_zero(mm)) {
+		put_task_struct(priv->task);
+		priv->task = NULL;
 		return NULL;
+	}
 
-	if (down_read_killable(&mm->mmap_sem)) {
+	if (mmap_read_lock_killable(mm)) {
 		mmput(mm);
+		put_task_struct(priv->task);
+		priv->task = NULL;
 		return ERR_PTR(-EINTR);
 	}
 
 	hold_task_mempolicy(priv);
 	priv->tail_vma = get_gate_vma(mm);
 
-	if (last_addr) {
-		vma = find_vma(mm, last_addr - 1);
-		if (vma && vma->vm_start <= last_addr)
-			vma = m_next_vma(priv, vma);
-		if (vma)
-			return vma;
-	}
-
-	m->version = 0;
-	if (pos < mm->map_count) {
-		for (vma = mm->mmap; pos; pos--) {
-			m->version = vma->vm_start;
-			vma = vma->vm_next;
-		}
+	vma = find_vma(mm, last_addr);
+	if (vma)
 		return vma;
-	}
 
-	/* we do not bother to update m->version in this case */
-	if (pos == mm->map_count && priv->tail_vma)
-		return priv->tail_vma;
-
-	vma_stop(priv);
-	return NULL;
+	return priv->tail_vma;
 }
 
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *next;
+	struct vm_area_struct *next, *vma = v;
 
-	(*pos)++;
-	next = m_next_vma(priv, v);
-	if (!next)
-		vma_stop(priv);
+	if (vma == priv->tail_vma)
+		next = NULL;
+	else if (vma->vm_next)
+		next = vma->vm_next;
+	else
+		next = priv->tail_vma;
+
+	*ppos = next ? next->vm_start : -1UL;
+
 	return next;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
+	struct mm_struct *mm = priv->mm;
 
-	if (!IS_ERR_OR_NULL(v))
-		vma_stop(priv);
-	if (priv->task) {
-		put_task_struct(priv->task);
-		priv->task = NULL;
-	}
+	if (!priv->task)
+		return;
+
+	release_task_mempolicy(priv);
+	mmap_read_unlock(mm);
+	mmput(mm);
+	put_task_struct(priv->task);
+	priv->task = NULL;
 }
 
 static int proc_maps_open(struct inode *inode, struct file *file,
@@ -420,7 +391,6 @@
 static int show_map(struct seq_file *m, void *v)
 {
 	show_map_vma(m, v);
-	m_cache_vma(m, v);
 	return 0;
 }
 
@@ -474,21 +444,59 @@
 	unsigned long lazyfree;
 	unsigned long anonymous_thp;
 	unsigned long shmem_thp;
+	unsigned long file_thp;
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
 	u64 pss;
+	u64 pss_anon;
+	u64 pss_file;
+	u64 pss_shmem;
 	u64 pss_locked;
 	u64 swap_pss;
 	bool check_shmem_swap;
 };
 
-static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked)
+static void smaps_page_accumulate(struct mem_size_stats *mss,
+		struct page *page, unsigned long size, unsigned long pss,
+		bool dirty, bool locked, bool private)
 {
-	int i, nr = compound ? 1 << compound_order(page) : 1;
+	mss->pss += pss;
+
+	if (PageAnon(page))
+		mss->pss_anon += pss;
+	else if (PageSwapBacked(page))
+		mss->pss_shmem += pss;
+	else
+		mss->pss_file += pss;
+
+	if (locked)
+		mss->pss_locked += pss;
+
+	if (dirty || PageDirty(page)) {
+		if (private)
+			mss->private_dirty += size;
+		else
+			mss->shared_dirty += size;
+	} else {
+		if (private)
+			mss->private_clean += size;
+		else
+			mss->shared_clean += size;
+	}
+}
+
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+		bool compound, bool young, bool dirty, bool locked,
+		bool migration)
+{
+	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
 
+	/*
+	 * First accumulate quantities that depend only on |size| and the type
+	 * of the compound page.
+	 */
 	if (PageAnon(page)) {
 		mss->anonymous += size;
 		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
@@ -501,48 +509,38 @@
 		mss->referenced += size;
 
 	/*
+	 * Then accumulate quantities that may depend on sharing, or that may
+	 * differ page-by-page.
+	 *
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
+	 *
+	 * The page_mapcount() is called to get a snapshot of the mapcount.
+	 * Without holding the page lock this snapshot can be slightly wrong as
+	 * we cannot always read the mapcount atomically. It is not safe to
+	 * call page_mapcount() even with PTL held if the page is not mapped,
+	 * especially for migration entries. Treat regular migration entries
+	 * as mapcount == 1.
 	 */
-	if (page_count(page) == 1) {
-		if (dirty || PageDirty(page))
-			mss->private_dirty += size;
-		else
-			mss->private_clean += size;
-		mss->pss += (u64)size << PSS_SHIFT;
-		if (locked)
-			mss->pss_locked += (u64)size << PSS_SHIFT;
+	if ((page_count(page) == 1) || migration) {
+		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
+			locked, true);
 		return;
 	}
-
 	for (i = 0; i < nr; i++, page++) {
 		int mapcount = page_mapcount(page);
-		unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
-
-		if (mapcount >= 2) {
-			if (dirty || PageDirty(page))
-				mss->shared_dirty += PAGE_SIZE;
-			else
-				mss->shared_clean += PAGE_SIZE;
-			mss->pss += pss / mapcount;
-			if (locked)
-				mss->pss_locked += pss / mapcount;
-		} else {
-			if (dirty || PageDirty(page))
-				mss->private_dirty += PAGE_SIZE;
-			else
-				mss->private_clean += PAGE_SIZE;
-			mss->pss += pss;
-			if (locked)
-				mss->pss_locked += pss;
-		}
+		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+		if (mapcount >= 2)
+			pss /= mapcount;
+		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
+			mapcount < 2);
 	}
 }
 
 #ifdef CONFIG_SHMEM
 static int smaps_pte_hole(unsigned long addr, unsigned long end,
-		struct mm_walk *walk)
+		__always_unused int depth, struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
 
@@ -551,7 +549,9 @@
 
 	return 0;
 }
-#endif
+#else
+#define smaps_pte_hole NULL
+#endif /* CONFIG_SHMEM */
 
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
@@ -560,9 +560,12 @@
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false, young = false, dirty = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
+		young = pte_young(*pte);
+		dirty = pte_dirty(*pte);
 	} else if (is_swap_pte(*pte)) {
 		swp_entry_t swpent = pte_to_swp_entry(*pte);
 
@@ -579,29 +582,24 @@
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_migration_entry(swpent))
+		} else if (is_migration_entry(swpent)) {
+			migration = true;
 			page = migration_entry_to_page(swpent);
-		else if (is_device_private_entry(swpent))
+		} else if (is_device_private_entry(swpent))
 			page = device_private_entry_to_page(swpent);
 	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 							&& pte_none(*pte))) {
-		page = find_get_entry(vma->vm_file->f_mapping,
+		page = xa_load(&vma->vm_file->f_mapping->i_pages,
 						linear_page_index(vma, addr));
-		if (!page)
-			return;
-
-		if (radix_tree_exceptional_entry(page))
+		if (xa_is_value(page))
 			mss->swap += PAGE_SIZE;
-		else
-			put_page(page);
-
 		return;
 	}
 
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+	smaps_account(mss, page, false, young, dirty, locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -611,10 +609,20 @@
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
-	struct page *page;
+	struct page *page = NULL;
+	bool migration = false;
 
-	/* FOLL_DUMP will return -EFAULT on huge zero page */
-	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+	if (pmd_present(*pmd)) {
+		/* FOLL_DUMP will return -EFAULT on huge zero page */
+		page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+		swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+		if (is_migration_entry(entry)) {
+			migration = true;
+			page = migration_entry_to_page(entry);
+		}
+	}
 	if (IS_ERR_OR_NULL(page))
 		return;
 	if (PageAnon(page))
@@ -624,8 +632,10 @@
 	else if (is_zone_device_page(page))
 		/* pass */;
 	else
-		VM_BUG_ON_PAGE(1, page);
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+		mss->file_thp += HPAGE_PMD_SIZE;
+
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+		      locked, migration);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -643,8 +653,7 @@
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		if (pmd_present(*pmd))
-			smaps_pmd_entry(pmd, addr, walk);
+		smaps_pmd_entry(pmd, addr, walk);
 		spin_unlock(ptl);
 		goto out;
 	}
@@ -652,7 +661,7 @@
 	if (pmd_trans_unstable(pmd))
 		goto out;
 	/*
-	 * The mmap_sem held all the way back in m_start() is what
+	 * The mmap_lock held all the way back in m_start() is what
 	 * keeps khugepaged out of here and from collapsing things
 	 * in here.
 	 */
@@ -687,9 +696,6 @@
 		[ilog2(VM_GROWSDOWN)] = "gd",
 		[ilog2(VM_PFNMAP)] = "pf",
 		[ilog2(VM_DENYWRITE)] = "dw",
-#ifdef CONFIG_X86_INTEL_MPX
-		[ilog2(VM_MPX)] = "mp",
-#endif
 		[ilog2(VM_LOCKED)] = "lo",
 		[ilog2(VM_IO)] = "io",
 		[ilog2(VM_SEQ_READ)] = "sr",
@@ -703,6 +709,9 @@
 		[ilog2(VM_ARCH_1)] = "ar",
 		[ilog2(VM_WIPEONFORK)] = "wf",
 		[ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_ARM64_BTI
+		[ilog2(VM_ARM64_BTI)] = "bt",
+#endif
 #ifdef CONFIG_MEM_SOFT_DIRTY
 		[ilog2(VM_SOFTDIRTY)] = "sd",
 #endif
@@ -712,6 +721,10 @@
 		[ilog2(VM_MERGEABLE)] = "mg",
 		[ilog2(VM_UFFD_MISSING)]= "um",
 		[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+		[ilog2(VM_MTE)] = "mt",
+		[ilog2(VM_MTE_ALLOWED)] = "",
+#endif
 #ifdef CONFIG_ARCH_HAS_PKEYS
 		/* These come out via ProtectionKey: */
 		[ilog2(VM_PKEY_BIT0)] = "",
@@ -722,6 +735,9 @@
 		[ilog2(VM_PKEY_BIT4)] = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+		[ilog2(VM_UFFD_MINOR)] = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 	};
 	size_t i;
 
@@ -758,29 +774,42 @@
 			page = device_private_entry_to_page(swpent);
 	}
 	if (page) {
-		int mapcount = page_mapcount(page);
-
-		if (mapcount >= 2)
+		if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
 			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
 		else
 			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
 	}
 	return 0;
 }
+#else
+#define smaps_hugetlb_range NULL
 #endif /* HUGETLB_PAGE */
 
-static void smap_gather_stats(struct vm_area_struct *vma,
-		struct mem_size_stats *mss)
-{
-	struct mm_walk smaps_walk = {
-		.pmd_entry = smaps_pte_range,
-#ifdef CONFIG_HUGETLB_PAGE
-		.hugetlb_entry = smaps_hugetlb_range,
-#endif
-		.mm = vma->vm_mm,
-	};
+static const struct mm_walk_ops smaps_walk_ops = {
+	.pmd_entry = smaps_pte_range,
+	.hugetlb_entry = smaps_hugetlb_range,
+};
 
-	smaps_walk.private = mss;
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+	.pmd_entry = smaps_pte_range,
+	.hugetlb_entry = smaps_hugetlb_range,
+	.pte_hole = smaps_pte_hole,
+};
+
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
+static void smap_gather_stats(struct vm_area_struct *vma,
+		struct mem_size_stats *mss, unsigned long start)
+{
+	const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+	/* Invalid start */
+	if (start >= vma->vm_end)
+		return;
 
 #ifdef CONFIG_SHMEM
 	/* In case of smaps_rollup, reset the value from previous vma */
@@ -798,27 +827,43 @@
 		 */
 		unsigned long shmem_swapped = shmem_swap_usage(vma);
 
-		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
-					!(vma->vm_flags & VM_WRITE)) {
+		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+					!(vma->vm_flags & VM_WRITE))) {
 			mss->swap += shmem_swapped;
 		} else {
 			mss->check_shmem_swap = true;
-			smaps_walk.pte_hole = smaps_pte_hole;
+			ops = &smaps_shmem_walk_ops;
 		}
 	}
 #endif
-	/* mmap_sem is held in m_start */
-	walk_page_vma(vma, &smaps_walk);
+	/* mmap_lock is held in m_start */
+	if (!start)
+		walk_page_vma(vma, ops, mss);
+	else
+		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
 }
 
 #define SEQ_PUT_DEC(str, val) \
 	seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
 
 /* Show the contents common for smaps and smaps_rollup */
-static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+	bool rollup_mode)
 {
 	SEQ_PUT_DEC("Rss: ", mss->resident);
 	SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+	if (rollup_mode) {
+		/*
+		 * These are meaningful only for smaps_rollup, otherwise two of
+		 * them are zero, and the other one is the same as Pss.
+		 */
+		SEQ_PUT_DEC(" kB\nPss_Anon: ",
+			mss->pss_anon >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_File: ",
+			mss->pss_file >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_Shmem: ",
+			mss->pss_shmem >> PSS_SHIFT);
+	}
 	SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
 	SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
 	SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
@@ -828,6 +873,7 @@
 	SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
 	SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
 	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
 	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
 			mss->private_hugetlb >> 10, 7);
@@ -846,7 +892,7 @@
 
 	memset(&mss, 0, sizeof(mss));
 
-	smap_gather_stats(vma, &mss);
+	smap_gather_stats(vma, &mss, 0);
 
 	show_map_vma(m, vma);
 	if (vma_get_anon_name(vma)) {
@@ -860,15 +906,14 @@
 	SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
 	seq_puts(m, " kB\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, false);
 
-	seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma));
+	seq_printf(m, "THPeligible: %d\n",
+		   transparent_hugepage_active(vma));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
 	show_smap_vma_flags(m, vma);
-
-	m_cache_vma(m, vma);
 
 	return 0;
 }
@@ -894,26 +939,90 @@
 
 	memset(&mss, 0, sizeof(mss));
 
-	ret = down_read_killable(&mm->mmap_sem);
+	ret = mmap_read_lock_killable(mm);
 	if (ret)
 		goto out_put_mm;
 
 	hold_task_mempolicy(priv);
 
-	for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
-		smap_gather_stats(vma, &mss);
+	for (vma = priv->mm->mmap; vma;) {
+		smap_gather_stats(vma, &mss, 0);
 		last_vma_end = vma->vm_end;
+
+		/*
+		 * Release mmap_lock temporarily if someone wants to
+		 * access it for write request.
+		 */
+		if (mmap_lock_is_contended(mm)) {
+			mmap_read_unlock(mm);
+			ret = mmap_read_lock_killable(mm);
+			if (ret) {
+				release_task_mempolicy(priv);
+				goto out_put_mm;
+			}
+
+			/*
+			 * After dropping the lock, there are four cases to
+			 * consider. See the following example for explanation.
+			 *
+			 *   +------+------+-----------+
+			 *   | VMA1 | VMA2 | VMA3      |
+			 *   +------+------+-----------+
+			 *   |      |      |           |
+			 *  4k     8k     16k         400k
+			 *
+			 * Suppose we drop the lock after reading VMA2 due to
+			 * contention, then we get:
+			 *
+			 *	last_vma_end = 16k
+			 *
+			 * 1) VMA2 is freed, but VMA3 exists:
+			 *
+			 *    find_vma(mm, 16k - 1) will return VMA3.
+			 *    In this case, just continue from VMA3.
+			 *
+			 * 2) VMA2 still exists:
+			 *
+			 *    find_vma(mm, 16k - 1) will return VMA2.
+			 *    Iterate the loop like the original one.
+			 *
+			 * 3) No more VMAs can be found:
+			 *
+			 *    find_vma(mm, 16k - 1) will return NULL.
+			 *    No more things to do, just break.
+			 *
			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+			 *
+			 *    find_vma(mm, 16k - 1) will return VMA' whose range
+			 *    contains last_vma_end.
+			 *    Iterate VMA' from last_vma_end.
+			 */
+			vma = find_vma(mm, last_vma_end - 1);
+			/* Case 3 above */
+			if (!vma)
+				break;
+
+			/* Case 1 above */
+			if (vma->vm_start >= last_vma_end)
+				continue;
+
+			/* Case 4 above */
+			if (vma->vm_end > last_vma_end)
+				smap_gather_stats(vma, &mss, last_vma_end);
+		}
+		/* Case 2 above */
+		vma = vma->vm_next;
 	}
 
-	show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+	show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0,
 			last_vma_end, 0, 0, 0, 0);
 	seq_pad(m, ' ');
 	seq_puts(m, "[rollup]\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, true);
 
 	release_task_mempolicy(priv);
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 
 out_put_mm:
 	mmput(mm);
@@ -1006,6 +1115,25 @@
 };
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
+
+#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
+
+static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+	struct page *page;
+
+	if (!pte_write(pte))
+		return false;
+	if (!is_cow_mapping(vma->vm_flags))
+		return false;
+	if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+		return false;
+	page = vm_normal_page(vma, addr, pte);
+	if (!page)
+		return false;
+	return page_maybe_dma_pinned(page);
+}
+
 static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *pte)
 {
@@ -1018,10 +1146,14 @@
 	pte_t ptent = *pte;
 
 	if (pte_present(ptent)) {
-		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
-		ptent = pte_wrprotect(ptent);
+		pte_t old_pte;
+
+		if (pte_is_pinned(vma, addr, ptent))
+			return;
+		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
-		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -1145,6 +1277,11 @@
 	return 0;
 }
 
+static const struct mm_walk_ops clear_refs_walk_ops = {
+	.pmd_entry = clear_refs_pte_range,
+	.test_walk = clear_refs_test_walk,
+};
+
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -1153,7 +1290,6 @@
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	enum clear_refs_types type;
-	struct mmu_gather tlb;
 	int itype;
 	int rv;
 
@@ -1174,77 +1310,49 @@
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct mmu_notifier_range range;
 		struct clear_refs_private cp = {
 			.type = type,
 		};
-		struct mm_walk clear_refs_walk = {
-			.pmd_entry = clear_refs_pte_range,
-			.test_walk = clear_refs_test_walk,
-			.mm = mm,
-			.private = &cp,
-		};
 
+		if (mmap_write_lock_killable(mm)) {
+			count = -EINTR;
+			goto out_mm;
+		}
 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
-			if (down_write_killable(&mm->mmap_sem)) {
-				count = -EINTR;
-				goto out_mm;
-			}
-
 			/*
 			 * Writing 5 to /proc/pid/clear_refs resets the peak
 			 * resident set size to this mm's current rss value.
 			 */
 			reset_mm_hiwater_rss(mm);
-			up_write(&mm->mmap_sem);
-			goto out_mm;
+			goto out_unlock;
 		}
 
-		if (down_read_killable(&mm->mmap_sem)) {
-			count = -EINTR;
-			goto out_mm;
-		}
-		tlb_gather_mmu(&tlb, mm, 0, -1);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
 				if (!(vma->vm_flags & VM_SOFTDIRTY))
 					continue;
-				up_read(&mm->mmap_sem);
-				if (down_write_killable(&mm->mmap_sem)) {
-					count = -EINTR;
-					goto out_mm;
-				}
-				/*
-				 * Avoid to modify vma->vm_flags
-				 * without locked ops while the
-				 * coredump reads the vm_flags.
-				 */
-				if (!mmget_still_valid(mm)) {
-					/*
-					 * Silently return "count"
-					 * like if get_task_mm()
-					 * failed. FIXME: should this
-					 * function have returned
-					 * -ESRCH if get_task_mm()
-					 * failed like if
-					 * get_proc_task() fails?
-					 */
-					up_write(&mm->mmap_sem);
-					goto out_mm;
-				}
-				for (vma = mm->mmap; vma; vma = vma->vm_next) {
-					vma->vm_flags &= ~VM_SOFTDIRTY;
-					vma_set_page_prot(vma);
-				}
-				downgrade_write(&mm->mmap_sem);
-				break;
+				vm_write_begin(vma);
+				WRITE_ONCE(vma->vm_flags,
+					vma->vm_flags & ~VM_SOFTDIRTY);
+				vma_set_page_prot(vma);
+				vm_write_end(vma);
 			}
-			mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+			inc_tlb_flush_pending(mm);
+			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+						0, NULL, mm, 0, -1UL);
+			mmu_notifier_invalidate_range_start(&range);
 		}
-		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
-		if (type == CLEAR_REFS_SOFT_DIRTY)
-			mmu_notifier_invalidate_range_end(mm, 0, -1);
-		tlb_finish_mmu(&tlb, 0, -1);
-		up_read(&mm->mmap_sem);
+		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+				&cp);
+		if (type == CLEAR_REFS_SOFT_DIRTY) {
+			mmu_notifier_invalidate_range_end(&range);
+			flush_tlb_mm(mm);
+			dec_tlb_flush_pending(mm);
+		}
+out_unlock:
+		mmap_write_unlock(mm);
 out_mm:
 		mmput(mm);
 	}
@@ -1297,7 +1405,7 @@
 }
 
 static int pagemap_pte_hole(unsigned long start, unsigned long end,
-				struct mm_walk *walk)
+				__always_unused int depth, struct mm_walk *walk)
 {
 	struct pagemapread *pm = walk->private;
 	unsigned long addr = start;
@@ -1341,12 +1449,13 @@
 {
 	u64 frame = 0, flags = 0;
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(pte)) {
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
 		flags |= PM_PRESENT;
-		page = _vm_normal_page(vma, addr, pte, true);
+		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
@@ -1358,8 +1467,10 @@
 		frame = swp_type(entry) |
 			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
 		flags |= PM_SWAP;
-		if (is_migration_entry(entry))
+		if (is_migration_entry(entry)) {
+			migration = true;
 			page = migration_entry_to_page(entry);
+		}
 
 		if (is_device_private_entry(entry))
 			page = device_private_entry_to_page(entry);
@@ -1367,7 +1478,7 @@
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (page && page_mapcount(page) == 1)
+	if (page && !migration && page_mapcount(page) == 1)
 		flags |= PM_MMAP_EXCLUSIVE;
 	if (vma->vm_flags & VM_SOFTDIRTY)
 		flags |= PM_SOFT_DIRTY;
@@ -1383,8 +1494,9 @@
 	spinlock_t *ptl;
 	pte_t *pte, *orig_pte;
 	int err = 0;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	bool migration = false;
+
 	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
 		u64 flags = 0, frame = 0;
@@ -1419,11 +1531,12 @@
 			if (pmd_swp_soft_dirty(pmd))
 				flags |= PM_SOFT_DIRTY;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
+			migration = is_migration_entry(entry);
 			page = migration_entry_to_page(entry);
 		}
 #endif
 
-		if (page && page_mapcount(page) == 1)
+		if (page && !migration && page_mapcount(page) == 1)
 			flags |= PM_MMAP_EXCLUSIVE;
 
 		for (; addr != end; addr += PAGE_SIZE) {
@@ -1512,7 +1625,15 @@
 
 	return err;
 }
+#else
+#define pagemap_hugetlb_range NULL
 #endif /* HUGETLB_PAGE */
+
+static const struct mm_walk_ops pagemap_ops = {
+	.pmd_entry = pagemap_pmd_range,
+	.pte_hole = pagemap_pte_hole,
+	.hugetlb_entry = pagemap_hugetlb_range,
+};
 
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -1545,7 +1666,6 @@
 {
 	struct mm_struct *mm = file->private_data;
 	struct pagemapread pm;
-	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
@@ -1573,21 +1693,17 @@
 	if (!pm.buffer)
 		goto out_mm;
 
-	pagemap_walk.pmd_entry = pagemap_pmd_range;
-	pagemap_walk.pte_hole = pagemap_pte_hole;
-#ifdef CONFIG_HUGETLB_PAGE
-	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
-#endif
-	pagemap_walk.mm = mm;
-	pagemap_walk.private = &pm;
-
 	src = *ppos;
 	svpfn = src / PM_ENTRY_BYTES;
-	start_vaddr = svpfn << PAGE_SHIFT;
 	end_vaddr = mm->task_size;
 
 	/* watch out for wraparound */
-	if (svpfn > mm->task_size >> PAGE_SHIFT)
+	start_vaddr = end_vaddr;
+	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
+		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+
+	/* Ensure the address is inside the task */
+	if (start_vaddr > mm->task_size)
 		start_vaddr = end_vaddr;
 
 	/*
@@ -1606,11 +1722,11 @@
 		/* overflow ? */
 		if (end < start_vaddr || end > end_vaddr)
 			end = end_vaddr;
-		ret = down_read_killable(&mm->mmap_sem);
+		ret = mmap_read_lock_killable(mm);
 		if (ret)
 			goto out_free;
-		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
-		up_read(&mm->mmap_sem);
+		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+		mmap_read_unlock(mm);
 		start_vaddr = end;
 
 		len = min(count, PM_ENTRY_BYTES * pm.pos);
@@ -1821,6 +1937,11 @@
 }
 #endif
 
+static const struct mm_walk_ops show_numa_ops = {
+	.hugetlb_entry = gather_hugetlb_stats,
+	.pmd_entry = gather_pte_stats,
+};
+
 /*
  * Display pages allocated per node and memory policy via /proc.
 */
@@ -1832,12 +1953,6 @@
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct mm_walk walk = {
-		.hugetlb_entry = gather_hugetlb_stats,
-		.pmd_entry = gather_pte_stats,
-		.private = md,
-		.mm = mm,
-	};
 	struct mempolicy *pol;
 	char buffer[64];
 	int nid;
@@ -1870,8 +1985,8 @@
 	if (is_vm_hugetlb_page(vma))
 		seq_puts(m, " huge");
 
-	/* mmap_sem is held by m_start */
-	walk_page_vma(vma, &walk);
+	/* mmap_lock is held by m_start */
+	walk_page_vma(vma, &show_numa_ops, md);
 
 	if (!md->pages)
 		goto out;
@@ -1904,7 +2019,6 @@
 	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
 out:
 	seq_putc(m, '\n');
-	m_cache_vma(m, vma);
 	return 0;
 }
 
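Note on the recurring API change above: the file moves from the old on-stack struct mm_walk (which bundled the callbacks with .mm and .private) to a const struct mm_walk_ops table passed to walk_page_vma()/walk_page_range() together with the mm and a private pointer. The sketch below only illustrates that calling convention and is not code from this patch; count_present_pages(), count_walk_ops and struct present_count are made-up names, and it assumes the caller already holds mmap_read_lock(mm), just as m_start() does in this file.

#include <linux/pagewalk.h>
#include <linux/mm.h>

/* Private state threaded through the walk via walk->private. */
struct present_count {
	unsigned long present;
};

/* Invoked for each PTE in the walked range. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	struct present_count *pc = walk->private;

	if (pte_present(*pte))
		pc->present++;
	return 0;
}

/* Callbacks now live in a const ops table instead of a per-caller struct mm_walk. */
static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte_entry,
};

/* Caller must hold mmap_read_lock(mm). */
static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	struct present_count pc = { 0 };

	walk_page_range(mm, start, end, &count_walk_ops, &pc);
	return pc.present;
}

Keeping the callback table const and passing mm plus the private pointer per call is what lets this patch delete the per-caller struct mm_walk setup blocks seen in the removed lines above (smaps, clear_refs, pagemap and numa_maps all share the same pattern).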