forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/proc/task_mmu.c
@@ -1,5 +1,5 @@
  // SPDX-License-Identifier: GPL-2.0
- #include <linux/mm.h>
+ #include <linux/pagewalk.h>
  #include <linux/vmacache.h>
  #include <linux/hugetlb.h>
  #include <linux/huge_mm.h>
@@ -59,7 +59,7 @@
  SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
  SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
  SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
- SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
  SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
  SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
  SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
@@ -147,8 +147,8 @@
  long pages_pinned;
  struct page *page;

- pages_pinned = get_user_pages_remote(current, mm,
- page_start_vaddr, 1, 0, &page, NULL, NULL);
+ pages_pinned = get_user_pages_remote(mm, page_start_vaddr, 1, 0,
+ &page, NULL, NULL);
  if (pages_pinned < 1) {
  seq_puts(m, "<fault>]");
  return;
@@ -159,7 +159,7 @@
  write_len = strnlen(kaddr + page_offset, len);
  seq_write(m, kaddr + page_offset, write_len);
  kunmap(page);
- put_page(page);
+ put_user_page(page);

  /* if strnlen hit a null terminator then we're done */
  if (write_len != len)
@@ -173,38 +173,14 @@
  seq_putc(m, ']');
  }

- static void vma_stop(struct proc_maps_private *priv)
- {
- struct mm_struct *mm = priv->mm;
-
- release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
- mmput(mm);
- }
-
- static struct vm_area_struct *
- m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
- {
- if (vma == priv->tail_vma)
- return NULL;
- return vma->vm_next ?: priv->tail_vma;
- }
-
- static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
- {
- if (m->count < m->size) /* vma is copied successfully */
- m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
- }
-
  static void *m_start(struct seq_file *m, loff_t *ppos)
  {
  struct proc_maps_private *priv = m->private;
- unsigned long last_addr = m->version;
+ unsigned long last_addr = *ppos;
  struct mm_struct *mm;
  struct vm_area_struct *vma;
- unsigned int pos = *ppos;

- /* See m_cache_vma(). Zero at the start or after lseek. */
+ /* See m_next(). Zero at the start or after lseek. */
  if (last_addr == -1UL)
  return NULL;

@@ -213,64 +189,59 @@
  return ERR_PTR(-ESRCH);

  mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
  return NULL;
+ }

- if (down_read_killable(&mm->mmap_sem)) {
+ if (mmap_read_lock_killable(mm)) {
  mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
  return ERR_PTR(-EINTR);
  }

  hold_task_mempolicy(priv);
  priv->tail_vma = get_gate_vma(mm);

- if (last_addr) {
- vma = find_vma(mm, last_addr - 1);
- if (vma && vma->vm_start <= last_addr)
- vma = m_next_vma(priv, vma);
- if (vma)
- return vma;
- }
-
- m->version = 0;
- if (pos < mm->map_count) {
- for (vma = mm->mmap; pos; pos--) {
- m->version = vma->vm_start;
- vma = vma->vm_next;
- }
+ vma = find_vma(mm, last_addr);
+ if (vma)
  return vma;
- }

- /* we do not bother to update m->version in this case */
- if (pos == mm->map_count && priv->tail_vma)
- return priv->tail_vma;
-
- vma_stop(priv);
- return NULL;
+ return priv->tail_vma;
  }

- static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
  {
  struct proc_maps_private *priv = m->private;
- struct vm_area_struct *next;
+ struct vm_area_struct *next, *vma = v;

- (*pos)++;
- next = m_next_vma(priv, v);
- if (!next)
- vma_stop(priv);
+ if (vma == priv->tail_vma)
+ next = NULL;
+ else if (vma->vm_next)
+ next = vma->vm_next;
+ else
+ next = priv->tail_vma;
+
+ *ppos = next ? next->vm_start : -1UL;
+
  return next;
  }

  static void m_stop(struct seq_file *m, void *v)
  {
  struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;

- if (!IS_ERR_OR_NULL(v))
- vma_stop(priv);
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
  }

  static int proc_maps_open(struct inode *inode, struct file *file,
@@ -420,7 +391,6 @@
  static int show_map(struct seq_file *m, void *v)
  {
  show_map_vma(m, v);
- m_cache_vma(m, v);
  return 0;
  }

@@ -474,21 +444,59 @@
  unsigned long lazyfree;
  unsigned long anonymous_thp;
  unsigned long shmem_thp;
+ unsigned long file_thp;
  unsigned long swap;
  unsigned long shared_hugetlb;
  unsigned long private_hugetlb;
  u64 pss;
+ u64 pss_anon;
+ u64 pss_file;
+ u64 pss_shmem;
  u64 pss_locked;
  u64 swap_pss;
  bool check_shmem_swap;
  };

- static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ static void smaps_page_accumulate(struct mem_size_stats *mss,
+ struct page *page, unsigned long size, unsigned long pss,
+ bool dirty, bool locked, bool private)
  {
- int i, nr = compound ? 1 << compound_order(page) : 1;
+ mss->pss += pss;
+
+ if (PageAnon(page))
+ mss->pss_anon += pss;
+ else if (PageSwapBacked(page))
+ mss->pss_shmem += pss;
+ else
+ mss->pss_file += pss;
+
+ if (locked)
+ mss->pss_locked += pss;
+
+ if (dirty || PageDirty(page)) {
+ if (private)
+ mss->private_dirty += size;
+ else
+ mss->shared_dirty += size;
+ } else {
+ if (private)
+ mss->private_clean += size;
+ else
+ mss->shared_clean += size;
+ }
+ }
+
+ static void smaps_account(struct mem_size_stats *mss, struct page *page,
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
+ {
+ int i, nr = compound ? compound_nr(page) : 1;
  unsigned long size = nr * PAGE_SIZE;

+ /*
+ * First accumulate quantities that depend only on |size| and the type
+ * of the compound page.
+ */
  if (PageAnon(page)) {
  mss->anonymous += size;
  if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
@@ -501,48 +509,38 @@
  mss->referenced += size;

  /*
+ * Then accumulate quantities that may depend on sharing, or that may
+ * differ page-by-page.
+ *
  * page_count(page) == 1 guarantees the page is mapped exactly once.
  * If any subpage of the compound page mapped with PTE it would elevate
  * page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
  */
- if (page_count(page) == 1) {
- if (dirty || PageDirty(page))
- mss->private_dirty += size;
- else
- mss->private_clean += size;
- mss->pss += (u64)size << PSS_SHIFT;
- if (locked)
- mss->pss_locked += (u64)size << PSS_SHIFT;
+ if ((page_count(page) == 1) || migration) {
+ smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
+ locked, true);
  return;
  }
-
  for (i = 0; i < nr; i++, page++) {
  int mapcount = page_mapcount(page);
- unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
-
- if (mapcount >= 2) {
- if (dirty || PageDirty(page))
- mss->shared_dirty += PAGE_SIZE;
- else
- mss->shared_clean += PAGE_SIZE;
- mss->pss += pss / mapcount;
- if (locked)
- mss->pss_locked += pss / mapcount;
- } else {
- if (dirty || PageDirty(page))
- mss->private_dirty += PAGE_SIZE;
- else
- mss->private_clean += PAGE_SIZE;
- mss->pss += pss;
- if (locked)
- mss->pss_locked += pss;
- }
+ unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+ if (mapcount >= 2)
+ pss /= mapcount;
+ smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
+ mapcount < 2);
  }
  }

  #ifdef CONFIG_SHMEM
  static int smaps_pte_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
  {
  struct mem_size_stats *mss = walk->private;

@@ -551,7 +549,9 @@

  return 0;
  }
- #endif
+ #else
+ #define smaps_pte_hole NULL
+ #endif /* CONFIG_SHMEM */

  static void smaps_pte_entry(pte_t *pte, unsigned long addr,
  struct mm_walk *walk)
@@ -560,9 +560,12 @@
  struct vm_area_struct *vma = walk->vma;
  bool locked = !!(vma->vm_flags & VM_LOCKED);
  struct page *page = NULL;
+ bool migration = false, young = false, dirty = false;

  if (pte_present(*pte)) {
  page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
  } else if (is_swap_pte(*pte)) {
  swp_entry_t swpent = pte_to_swp_entry(*pte);

@@ -579,29 +582,24 @@
  } else {
  mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
  }
- } else if (is_migration_entry(swpent))
+ } else if (is_migration_entry(swpent)) {
+ migration = true;
  page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
+ } else if (is_device_private_entry(swpent))
  page = device_private_entry_to_page(swpent);
  } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
  && pte_none(*pte))) {
- page = find_get_entry(vma->vm_file->f_mapping,
+ page = xa_load(&vma->vm_file->f_mapping->i_pages,
  linear_page_index(vma, addr));
- if (!page)
- return;
-
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
  mss->swap += PAGE_SIZE;
- else
- put_page(page);
-
  return;
  }

  if (!page)
  return;

- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, young, dirty, locked, migration);
  }

  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -611,10 +609,20 @@
  struct mem_size_stats *mss = walk->private;
  struct vm_area_struct *vma = walk->vma;
  bool locked = !!(vma->vm_flags & VM_LOCKED);
- struct page *page;
+ struct page *page = NULL;
+ bool migration = false;

- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ if (pmd_present(*pmd)) {
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ if (is_migration_entry(entry)) {
+ migration = true;
+ page = migration_entry_to_page(entry);
+ }
+ }
  if (IS_ERR_OR_NULL(page))
  return;
  if (PageAnon(page))
@@ -624,8 +632,10 @@
  else if (is_zone_device_page(page))
  /* pass */;
  else
- VM_BUG_ON_PAGE(1, page);
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+ mss->file_thp += HPAGE_PMD_SIZE;
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
  }
  #else
  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -643,8 +653,7 @@

  ptl = pmd_trans_huge_lock(pmd, vma);
  if (ptl) {
- if (pmd_present(*pmd))
- smaps_pmd_entry(pmd, addr, walk);
+ smaps_pmd_entry(pmd, addr, walk);
  spin_unlock(ptl);
  goto out;
  }
@@ -652,7 +661,7 @@
  if (pmd_trans_unstable(pmd))
  goto out;
  /*
- * The mmap_sem held all the way back in m_start() is what
+ * The mmap_lock held all the way back in m_start() is what
  * keeps khugepaged out of here and from collapsing things
  * in here.
  */
@@ -687,9 +696,6 @@
  [ilog2(VM_GROWSDOWN)] = "gd",
  [ilog2(VM_PFNMAP)] = "pf",
  [ilog2(VM_DENYWRITE)] = "dw",
- #ifdef CONFIG_X86_INTEL_MPX
- [ilog2(VM_MPX)] = "mp",
- #endif
  [ilog2(VM_LOCKED)] = "lo",
  [ilog2(VM_IO)] = "io",
  [ilog2(VM_SEQ_READ)] = "sr",
@@ -703,6 +709,9 @@
  [ilog2(VM_ARCH_1)] = "ar",
  [ilog2(VM_WIPEONFORK)] = "wf",
  [ilog2(VM_DONTDUMP)] = "dd",
+ #ifdef CONFIG_ARM64_BTI
+ [ilog2(VM_ARM64_BTI)] = "bt",
+ #endif
  #ifdef CONFIG_MEM_SOFT_DIRTY
  [ilog2(VM_SOFTDIRTY)] = "sd",
  #endif
@@ -712,6 +721,10 @@
  [ilog2(VM_MERGEABLE)] = "mg",
  [ilog2(VM_UFFD_MISSING)]= "um",
  [ilog2(VM_UFFD_WP)] = "uw",
+ #ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+ #endif
  #ifdef CONFIG_ARCH_HAS_PKEYS
  /* These come out via ProtectionKey: */
  [ilog2(VM_PKEY_BIT0)] = "",
@@ -722,6 +735,9 @@
  [ilog2(VM_PKEY_BIT4)] = "",
  #endif
  #endif /* CONFIG_ARCH_HAS_PKEYS */
+ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ [ilog2(VM_UFFD_MINOR)] = "ui",
+ #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
  };
  size_t i;

@@ -767,20 +783,35 @@
  }
  return 0;
  }
+ #else
+ #define smaps_hugetlb_range NULL
  #endif /* HUGETLB_PAGE */

- static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss)
- {
- struct mm_walk smaps_walk = {
- .pmd_entry = smaps_pte_range,
- #ifdef CONFIG_HUGETLB_PAGE
- .hugetlb_entry = smaps_hugetlb_range,
- #endif
- .mm = vma->vm_mm,
- };
+ static const struct mm_walk_ops smaps_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ };

- smaps_walk.private = mss;
+ static const struct mm_walk_ops smaps_shmem_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+ };
+
+ /*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
+ static void smap_gather_stats(struct vm_area_struct *vma,
+ struct mem_size_stats *mss, unsigned long start)
+ {
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;

  #ifdef CONFIG_SHMEM
  /* In case of smaps_rollup, reset the value from previous vma */
@@ -798,27 +829,43 @@
  */
  unsigned long shmem_swapped = shmem_swap_usage(vma);

- if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE)) {
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
  mss->swap += shmem_swapped;
  } else {
  mss->check_shmem_swap = true;
- smaps_walk.pte_hole = smaps_pte_hole;
+ ops = &smaps_shmem_walk_ops;
  }
  }
  #endif
- /* mmap_sem is held in m_start */
- walk_page_vma(vma, &smaps_walk);
+ /* mmap_lock is held in m_start */
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
  }

  #define SEQ_PUT_DEC(str, val) \
  seq_put_decimal_ull_width(m, str, (val) >> 10, 8)

  /* Show the contents common for smaps and smaps_rollup */
- static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+ bool rollup_mode)
  {
  SEQ_PUT_DEC("Rss: ", mss->resident);
  SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ if (rollup_mode) {
+ /*
+ * These are meaningful only for smaps_rollup, otherwise two of
+ * them are zero, and the other one is the same as Pss.
+ */
+ SEQ_PUT_DEC(" kB\nPss_Anon: ",
+ mss->pss_anon >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_File: ",
+ mss->pss_file >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Shmem: ",
+ mss->pss_shmem >> PSS_SHIFT);
+ }
  SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
  SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
  SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
@@ -828,6 +875,7 @@
  SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
  SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
  SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
  SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
  seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
  mss->private_hugetlb >> 10, 7);
@@ -846,7 +894,7 @@

  memset(&mss, 0, sizeof(mss));

- smap_gather_stats(vma, &mss);
+ smap_gather_stats(vma, &mss, 0);

  show_map_vma(m, vma);
  if (vma_get_anon_name(vma)) {
@@ -860,15 +908,14 @@
  SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
  seq_puts(m, " kB\n");

- __show_smap(m, &mss);
+ __show_smap(m, &mss, false);

- seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma));
+ seq_printf(m, "THPeligible: %d\n",
+ transparent_hugepage_active(vma));

  if (arch_pkeys_enabled())
  seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
  show_smap_vma_flags(m, vma);
-
- m_cache_vma(m, vma);

  return 0;
  }
@@ -894,26 +941,90 @@

  memset(&mss, 0, sizeof(mss));

- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
  if (ret)
  goto out_put_mm;

  hold_task_mempolicy(priv);

- for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
- smap_gather_stats(vma, &mss);
+ for (vma = priv->mm->mmap; vma;) {
+ smap_gather_stats(vma, &mss, 0);
  last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = find_vma(mm, last_vma_end - 1);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ vma = vma->vm_next;
  }

- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+ show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0,
  last_vma_end, 0, 0, 0, 0);
  seq_pad(m, ' ');
  seq_puts(m, "[rollup]\n");

- __show_smap(m, &mss);
+ __show_smap(m, &mss, true);

  release_task_mempolicy(priv);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);

  out_put_mm:
  mmput(mm);
@@ -1006,6 +1117,25 @@
  };

  #ifdef CONFIG_MEM_SOFT_DIRTY
+
+ #define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
+
+ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+ {
+ struct page *page;
+
+ if (!pte_write(pte))
+ return false;
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+ if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+ return false;
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ return false;
+ return page_maybe_dma_pinned(page);
+ }
+
  static inline void clear_soft_dirty(struct vm_area_struct *vma,
  unsigned long addr, pte_t *pte)
  {
@@ -1018,10 +1148,14 @@
  pte_t ptent = *pte;

  if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ if (pte_is_pinned(vma, addr, ptent))
+ return;
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
  ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
  } else if (is_swap_pte(ptent)) {
  ptent = pte_swp_clear_soft_dirty(ptent);
  set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -1145,6 +1279,11 @@
  return 0;
  }

+ static const struct mm_walk_ops clear_refs_walk_ops = {
+ .pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
+ };
+
  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
  size_t count, loff_t *ppos)
  {
@@ -1153,7 +1292,6 @@
  struct mm_struct *mm;
  struct vm_area_struct *vma;
  enum clear_refs_types type;
- struct mmu_gather tlb;
  int itype;
  int rv;

@@ -1174,77 +1312,49 @@
  return -ESRCH;
  mm = get_task_mm(task);
  if (mm) {
+ struct mmu_notifier_range range;
  struct clear_refs_private cp = {
  .type = type,
  };
- struct mm_walk clear_refs_walk = {
- .pmd_entry = clear_refs_pte_range,
- .test_walk = clear_refs_test_walk,
- .mm = mm,
- .private = &cp,
- };

+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
  if (type == CLEAR_REFS_MM_HIWATER_RSS) {
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
-
  /*
  * Writing 5 to /proc/pid/clear_refs resets the peak
  * resident set size to this mm's current rss value.
  */
  reset_mm_hiwater_rss(mm);
- up_write(&mm->mmap_sem);
- goto out_mm;
+ goto out_unlock;
  }

- if (down_read_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
- tlb_gather_mmu(&tlb, mm, 0, -1);
  if (type == CLEAR_REFS_SOFT_DIRTY) {
  for (vma = mm->mmap; vma; vma = vma->vm_next) {
  if (!(vma->vm_flags & VM_SOFTDIRTY))
  continue;
- up_read(&mm->mmap_sem);
- if (down_write_killable(&mm->mmap_sem)) {
- count = -EINTR;
- goto out_mm;
- }
- /*
- * Avoid to modify vma->vm_flags
- * without locked ops while the
- * coredump reads the vm_flags.
- */
- if (!mmget_still_valid(mm)) {
- /*
- * Silently return "count"
- * like if get_task_mm()
- * failed. FIXME: should this
- * function have returned
- * -ESRCH if get_task_mm()
- * failed like if
- * get_proc_task() fails?
- */
- up_write(&mm->mmap_sem);
- goto out_mm;
- }
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- vma->vm_flags &= ~VM_SOFTDIRTY;
- vma_set_page_prot(vma);
- }
- downgrade_write(&mm->mmap_sem);
- break;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags,
+ vma->vm_flags & ~VM_SOFTDIRTY);
+ vma_set_page_prot(vma);
+ vm_write_end(vma);
  }
- mmu_notifier_invalidate_range_start(mm, 0, -1);
+
+ inc_tlb_flush_pending(mm);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, NULL, mm, 0, -1UL);
+ mmu_notifier_invalidate_range_start(&range);
  }
- walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
- if (type == CLEAR_REFS_SOFT_DIRTY)
- mmu_notifier_invalidate_range_end(mm, 0, -1);
- tlb_finish_mmu(&tlb, 0, -1);
- up_read(&mm->mmap_sem);
+ walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+ &cp);
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ mmu_notifier_invalidate_range_end(&range);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
+ out_unlock:
+ mmap_write_unlock(mm);
  out_mm:
  mmput(mm);
  }
@@ -1297,7 +1407,7 @@
  }

  static int pagemap_pte_hole(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
  {
  struct pagemapread *pm = walk->private;
  unsigned long addr = start;
@@ -1341,12 +1451,13 @@
  {
  u64 frame = 0, flags = 0;
  struct page *page = NULL;
+ bool migration = false;

  if (pte_present(pte)) {
  if (pm->show_pfn)
  frame = pte_pfn(pte);
  flags |= PM_PRESENT;
- page = _vm_normal_page(vma, addr, pte, true);
+ page = vm_normal_page(vma, addr, pte);
  if (pte_soft_dirty(pte))
  flags |= PM_SOFT_DIRTY;
  } else if (is_swap_pte(pte)) {
@@ -1358,8 +1469,10 @@
  frame = swp_type(entry) |
  (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
  flags |= PM_SWAP;
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
  page = migration_entry_to_page(entry);
+ }

  if (is_device_private_entry(entry))
  page = device_private_entry_to_page(entry);
@@ -1367,7 +1480,7 @@

  if (page && !PageAnon(page))
  flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
  flags |= PM_MMAP_EXCLUSIVE;
  if (vma->vm_flags & VM_SOFTDIRTY)
  flags |= PM_SOFT_DIRTY;
@@ -1383,8 +1496,9 @@
  spinlock_t *ptl;
  pte_t *pte, *orig_pte;
  int err = 0;
-
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
  ptl = pmd_trans_huge_lock(pmdp, vma);
  if (ptl) {
  u64 flags = 0, frame = 0;
@@ -1419,11 +1533,12 @@
  if (pmd_swp_soft_dirty(pmd))
  flags |= PM_SOFT_DIRTY;
  VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
  page = migration_entry_to_page(entry);
  }
  #endif

- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
  flags |= PM_MMAP_EXCLUSIVE;

  for (; addr != end; addr += PAGE_SIZE) {
@@ -1512,7 +1627,15 @@

  return err;
  }
+ #else
+ #define pagemap_hugetlb_range NULL
  #endif /* HUGETLB_PAGE */
+
+ static const struct mm_walk_ops pagemap_ops = {
+ .pmd_entry = pagemap_pmd_range,
+ .pte_hole = pagemap_pte_hole,
+ .hugetlb_entry = pagemap_hugetlb_range,
+ };

  /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -1545,7 +1668,6 @@
  {
  struct mm_struct *mm = file->private_data;
  struct pagemapread pm;
- struct mm_walk pagemap_walk = {};
  unsigned long src;
  unsigned long svpfn;
  unsigned long start_vaddr;
@@ -1573,21 +1695,17 @@
  if (!pm.buffer)
  goto out_mm;

- pagemap_walk.pmd_entry = pagemap_pmd_range;
- pagemap_walk.pte_hole = pagemap_pte_hole;
- #ifdef CONFIG_HUGETLB_PAGE
- pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
- #endif
- pagemap_walk.mm = mm;
- pagemap_walk.private = &pm;
-
  src = *ppos;
  svpfn = src / PM_ENTRY_BYTES;
- start_vaddr = svpfn << PAGE_SHIFT;
  end_vaddr = mm->task_size;

  /* watch out for wraparound */
- if (svpfn > mm->task_size >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
+ start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+
+ /* Ensure the address is inside the task */
+ if (start_vaddr > mm->task_size)
  start_vaddr = end_vaddr;

  /*
@@ -1606,11 +1724,11 @@
  /* overflow ? */
  if (end < start_vaddr || end > end_vaddr)
  end = end_vaddr;
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
  if (ret)
  goto out_free;
- ret = walk_page_range(start_vaddr, end, &pagemap_walk);
- up_read(&mm->mmap_sem);
+ ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+ mmap_read_unlock(mm);
  start_vaddr = end;

  len = min(count, PM_ENTRY_BYTES * pm.pos);
@@ -1821,6 +1939,11 @@
  }
  #endif

+ static const struct mm_walk_ops show_numa_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ };
+
  /*
  * Display pages allocated per node and memory policy via /proc.
  */
@@ -1832,12 +1955,6 @@
  struct numa_maps *md = &numa_priv->md;
  struct file *file = vma->vm_file;
  struct mm_struct *mm = vma->vm_mm;
- struct mm_walk walk = {
- .hugetlb_entry = gather_hugetlb_stats,
- .pmd_entry = gather_pte_stats,
- .private = md,
- .mm = mm,
- };
  struct mempolicy *pol;
  char buffer[64];
  int nid;
@@ -1870,8 +1987,8 @@
  if (is_vm_hugetlb_page(vma))
  seq_puts(m, " huge");

- /* mmap_sem is held by m_start */
- walk_page_vma(vma, &walk);
+ /* mmap_lock is held by m_start */
+ walk_page_vma(vma, &show_numa_ops, md);

  if (!md->pages)
  goto out;
@@ -1904,7 +2021,6 @@
  seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
  out:
  seq_putc(m, '\n');
- m_cache_vma(m, vma);
  return 0;
  }
