2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/huge_memory.c
@@ -1,8 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2009 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,8 +31,9 @@
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
 #include <linux/oom.h>
+#include <linux/numa.h>
 #include <linux/page_owner.h>
-
+#include <trace/hooks/mm.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -64,12 +63,26 @@
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
+	return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+	       !inode_is_open_for_write(vma->vm_file->f_inode) &&
+	       (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
+{
+	/* The addr is used to check if the vma size fits */
+	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+	if (!transhuge_vma_suitable(vma, addr))
+		return false;
 	if (vma_is_anonymous(vma))
 		return __transparent_hugepage_enabled(vma);
-	if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
-		return __transparent_hugepage_enabled(vma);
+	if (vma_is_shmem(vma))
+		return shmem_huge_enabled(vma);
+	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+		return file_thp_enabled(vma);
 
 	return false;
 }
@@ -302,34 +315,13 @@
 static struct kobj_attribute hpage_pmd_size_attr =
 	__ATTR_RO(hpage_pmd_size);
 
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buf)
-{
-	return single_hugepage_flag_show(kobj, attr, buf,
-				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
-			       struct kobj_attribute *attr,
-			       const char *buf, size_t count)
-{
-	return single_hugepage_flag_store(kobj, attr, buf, count,
-				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
-	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
 	&use_zero_page_attr.attr,
 	&hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
 	&shmem_enabled_attr.attr,
-#endif
-#ifdef CONFIG_DEBUG_VM
-	&debug_cow_attr.attr,
 #endif
 	NULL,
 };
@@ -392,7 +384,11 @@
 	struct kobject *hugepage_kobj;
 
 	if (!has_transparent_hugepage()) {
-		transparent_hugepage_flags = 0;
+		/*
+		 * Hardware doesn't support hugepages, hence disable
+		 * DAX PMD support.
+		 */
+		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
 		return -EINVAL;
 	}
 
@@ -426,7 +422,7 @@
 	 * where the extra memory used could hurt more than TLB overhead
 	 * is likely to save. The admin can still enable it through /sys.
 	 */
-	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
 		transparent_hugepage_flags = 0;
 		return 0;
 	}
@@ -487,11 +483,25 @@
 	return pmd;
 }
 
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 {
-	/* ->lru in the tail pages is occupied by compound_head. */
-	return &page[2].deferred_list;
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	if (memcg)
+		return &memcg->deferred_split_queue;
+	else
+		return &pgdat->deferred_split_queue;
 }
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	return &pgdat->deferred_split_queue;
+}
+#endif
 
 void prep_transhuge_page(struct page *page)
 {
@@ -503,6 +513,17 @@
 	INIT_LIST_HEAD(page_deferred_list(page));
 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
+
+bool is_transparent_hugepage(struct page *page)
+{
+	if (!PageCompound(page))
+		return false;
+
+	page = compound_head(page);
+	return is_huge_zero_page(page) ||
+	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
 
 static unsigned long __thp_get_unmapped_area(struct file *filp,
 		unsigned long addr, unsigned long len,
@@ -561,20 +582,21 @@
 			struct page *page, gfp_t gfp)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	vm_fault_t ret = 0;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
+		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
 		return VM_FAULT_FALLBACK;
 	}
+	cgroup_throttle_swaprate(page, gfp);
 
-	pgtable = pte_alloc_one(vma->vm_mm, haddr);
+	pgtable = pte_alloc_one(vma->vm_mm);
 	if (unlikely(!pgtable)) {
 		ret = VM_FAULT_OOM;
 		goto release;
@@ -603,7 +625,6 @@
 			vm_fault_t ret2;
 
 			spin_unlock(vmf->ptl);
-			mem_cgroup_cancel_charge(page, memcg, true);
 			put_page(page);
 			pte_free(vma->vm_mm, pgtable);
 			ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -614,14 +635,14 @@
 		entry = mk_huge_pmd(page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr, true);
-		mem_cgroup_commit_charge(page, memcg, false, true);
-		lru_cache_add_active_or_unevictable(page, vma);
+		lru_cache_add_inactive_or_unevictable(page, vma);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
+		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 	}
 
 	return 0;
@@ -630,7 +651,6 @@
 release:
 	if (pgtable)
 		pte_free(vma->vm_mm, pgtable);
-	mem_cgroup_cancel_charge(page, memcg, true);
 	put_page(page);
 	return ret;
 
@@ -649,16 +669,25 @@
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 
+	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+	/* Kick kcompactd and fail quickly */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+	/* Synchronous compaction if madvised, otherwise kick kcompactd */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM :
+					__GFP_KSWAPD_RECLAIM);
+
+	/* Only do synchronous compaction if madvised */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     0);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
 	return GFP_TRANSHUGE_LIGHT;
 }
 
@@ -686,7 +715,7 @@
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 
-	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+	if (!transhuge_vma_suitable(vma, haddr))
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
@@ -698,7 +727,7 @@
 		pgtable_t pgtable;
 		struct page *zero_page;
 		vm_fault_t ret;
-		pgtable = pte_alloc_one(vma->vm_mm, haddr);
+		pgtable = pte_alloc_one(vma->vm_mm);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
 		zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -787,11 +816,24 @@
 		pte_free(mm, pgtable);
 }
 
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
+				   pgprot_t pgprot, bool write)
 {
 	unsigned long addr = vmf->address & PMD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
-	pgprot_t pgprot = vma->vm_page_prot;
 	pgtable_t pgtable = NULL;
 
 	/*
@@ -809,7 +851,7 @@
 		return VM_FAULT_SIGBUS;
 
 	if (arch_needs_pgtable_deposit()) {
-		pgtable = pte_alloc_one(vma->vm_mm, addr);
+		pgtable = pte_alloc_one(vma->vm_mm);
 		if (!pgtable)
 			return VM_FAULT_OOM;
 	}
@@ -819,7 +861,7 @@
 	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
 	return VM_FAULT_NOPAGE;
 }
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -865,11 +907,24 @@
 	spin_unlock(ptl);
 }
 
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+				   pgprot_t pgprot, bool write)
 {
 	unsigned long addr = vmf->address & PUD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
-	pgprot_t pgprot = vma->vm_page_prot;
 
 	/*
 	 * If we had pud_special, we could avoid all these restrictions,
@@ -890,7 +945,7 @@
 	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
 	return VM_FAULT_NOPAGE;
 }
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -907,11 +962,10 @@
 }
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags)
+		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
 {
 	unsigned long pfn = pmd_pfn(*pmd);
 	struct mm_struct *mm = vma->vm_mm;
-	struct dev_pagemap *pgmap;
 	struct page *page;
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -921,6 +975,11 @@
 	 * not be in this function with `flags & FOLL_COW` set.
 	 */
 	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+			 (FOLL_PIN | FOLL_GET)))
+		return NULL;
 
 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
 		return NULL;
@@ -937,23 +996,23 @@
 	 * device mapped pages can only be returned if the
 	 * caller will manage the page reference count.
 	 */
-	if (!(flags & FOLL_GET))
+	if (!(flags & (FOLL_GET | FOLL_PIN)))
 		return ERR_PTR(-EEXIST);
 
 	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
-	pgmap = get_dev_pagemap(pfn, NULL);
-	if (!pgmap)
+	*pgmap = get_dev_pagemap(pfn, *pgmap);
+	if (!*pgmap)
 		return ERR_PTR(-EFAULT);
 	page = pfn_to_page(pfn);
-	get_page(page);
-	put_dev_pagemap(pgmap);
+	if (!try_grab_page(page, flags))
+		page = ERR_PTR(-ENOMEM);
 
 	return page;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
-		  struct vm_area_struct *vma)
+		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	spinlock_t *dst_ptl, *src_ptl;
 	struct page *src_page;
@@ -962,10 +1021,10 @@
 	int ret = -ENOMEM;
 
 	/* Skip if can be re-fill on fault */
-	if (!vma_is_anonymous(vma))
+	if (!vma_is_anonymous(dst_vma))
 		return 0;
 
-	pgtable = pte_alloc_one(dst_mm, addr);
+	pgtable = pte_alloc_one(dst_mm);
 	if (unlikely(!pgtable))
 		goto out;
 
@@ -986,11 +1045,15 @@
 			pmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*src_pmd))
 				pmd = pmd_swp_mksoft_dirty(pmd);
+			if (pmd_swp_uffd_wp(*src_pmd))
+				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+		if (!userfaultfd_wp(dst_vma))
+			pmd = pmd_swp_clear_uffd_wp(pmd);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
 		goto out_unlock;
@@ -1007,28 +1070,44 @@
 	 * a page table.
 	 */
 	if (is_huge_zero_pmd(pmd)) {
-		struct page *zero_page;
 		/*
 		 * get_huge_zero_page() will never allocate a new page here,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = mm_get_huge_zero_page(dst_mm);
-		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
-				zero_page);
-		ret = 0;
-		goto out_unlock;
+		mm_get_huge_zero_page(dst_mm);
+		goto out_zero_page;
 	}
 
 	src_page = pmd_page(pmd);
 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+	/*
+	 * If this page is a potentially pinned page, split and retry the fault
+	 * with smaller page size. Normally this should not happen because the
+	 * userspace should use MADV_DONTFORK upon pinned regions. This is a
+	 * best effort that the pinned pages won't be replaced by another
+	 * random page during the coming copy-on-write.
+	 */
+	if (unlikely(is_cow_mapping(src_vma->vm_flags) &&
+		     atomic_read(&src_mm->has_pinned) &&
+		     page_maybe_dma_pinned(src_page))) {
+		pte_free(dst_mm, pgtable);
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
+		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+		return -EAGAIN;
+	}
+
 	get_page(src_page);
 	page_dup_rmap(src_page, true);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
+	if (!userfaultfd_wp(dst_vma))
+		pmd = pmd_clear_uffd_wp(pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
@@ -1055,16 +1134,20 @@
 }
 
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags)
+		pud_t *pud, int flags, struct dev_pagemap **pgmap)
 {
 	unsigned long pfn = pud_pfn(*pud);
 	struct mm_struct *mm = vma->vm_mm;
-	struct dev_pagemap *pgmap;
 	struct page *page;
 
 	assert_spin_locked(pud_lockptr(mm, pud));
 
 	if (flags & FOLL_WRITE && !pud_write(*pud))
+		return NULL;
+
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+			 (FOLL_PIN | FOLL_GET)))
 		return NULL;
 
 	if (pud_present(*pud) && pud_devmap(*pud))
@@ -1078,17 +1161,19 @@
 	/*
 	 * device mapped pages can only be returned if the
 	 * caller will manage the page reference count.
+	 *
+	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
 	 */
-	if (!(flags & FOLL_GET))
+	if (!(flags & (FOLL_GET | FOLL_PIN)))
 		return ERR_PTR(-EEXIST);
 
 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-	pgmap = get_dev_pagemap(pfn, NULL);
-	if (!pgmap)
+	*pgmap = get_dev_pagemap(pfn, *pgmap);
+	if (!*pgmap)
 		return ERR_PTR(-EFAULT);
 	page = pfn_to_page(pfn);
-	get_page(page);
-	put_dev_pagemap(pgmap);
+	if (!try_grab_page(page, flags))
+		page = ERR_PTR(-ENOMEM);
 
 	return page;
 }
@@ -1117,6 +1202,16 @@
 	 */
 	if (is_huge_zero_pud(pud)) {
 		/* No huge zero pud yet */
+	}
+
+	/* Please refer to comments in copy_huge_pmd() */
+	if (unlikely(is_cow_mapping(vma->vm_flags) &&
+		     atomic_read(&src_mm->has_pinned) &&
+		     page_maybe_dma_pinned(pud_page(pud)))) {
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
+		__split_huge_pud(vma, src_pud, addr);
+		return -EAGAIN;
 	}
 
 	pudp_set_wrprotect(src_mm, addr, src_pud);
@@ -1173,274 +1268,73 @@
 	spin_unlock(vmf->ptl);
 }
 
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
-			pmd_t orig_pmd, struct page *page)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mem_cgroup *memcg;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-	vm_fault_t ret = 0;
-	struct page **pages;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-
-	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
-			      GFP_KERNEL);
-	if (unlikely(!pages)) {
-		ret |= VM_FAULT_OOM;
-		goto out;
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
-					       vmf->address, page_to_nid(page));
-		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
-				     GFP_KERNEL, &memcg, false))) {
-			if (pages[i])
-				put_page(pages[i]);
-			while (--i >= 0) {
-				memcg = (void *)page_private(pages[i]);
-				set_page_private(pages[i], 0);
-				mem_cgroup_cancel_charge(pages[i], memcg,
-						false);
-				put_page(pages[i]);
-			}
-			kfree(pages);
-			ret |= VM_FAULT_OOM;
-			goto out;
-		}
-		set_page_private(pages[i], (unsigned long)memcg);
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SIZE * i, vma);
-		__SetPageUptodate(pages[i]);
-		cond_resched();
-	}
-
-	mmun_start = haddr;
-	mmun_end = haddr + HPAGE_PMD_SIZE;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
-
-	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_free_pages;
-	VM_BUG_ON_PAGE(!PageHead(page), page);
-
-	/*
-	 * Leave pmd empty until pte is filled note we must notify here as
-	 * concurrent CPU thread might write to new page before the call to
-	 * mmu_notifier_invalidate_range_end() happens which can lead to a
-	 * device seeing memory write in different order than CPU.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
-	pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t entry;
-		entry = mk_pte(pages[i], vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
-		mem_cgroup_commit_charge(pages[i], memcg, false, false);
-		lru_cache_add_active_or_unevictable(pages[i], vma);
-		vmf->pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*vmf->pte));
-		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
-		pte_unmap(vmf->pte);
-	}
-	kfree(pages);
-
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
-	page_remove_rmap(page, true);
-	spin_unlock(vmf->ptl);
-
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-						mmun_end);
-
-	ret |= VM_FAULT_WRITE;
-	put_page(page);
-
-out:
-	return ret;
-
-out_free_pages:
-	spin_unlock(vmf->ptl);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		mem_cgroup_cancel_charge(pages[i], memcg, false);
-		put_page(pages[i]);
-	}
-	kfree(pages);
-	goto out;
-}
-
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct page *page = NULL, *new_page;
-	struct mem_cgroup *memcg;
+	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	gfp_t huge_gfp;			/* for allocation and charge */
-	vm_fault_t ret = 0;
 
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
 	if (is_huge_zero_pmd(orig_pmd))
-		goto alloc;
+		goto fallback;
+
 	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_unlock;
+
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-	/*
-	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part.
-	 */
+
+	/* Lock page for reuse_swap_page() */
 	if (!trylock_page(page)) {
 		get_page(page);
 		spin_unlock(vmf->ptl);
 		lock_page(page);
 		spin_lock(vmf->ptl);
 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			spin_unlock(vmf->ptl);
 			unlock_page(page);
 			put_page(page);
-			goto out_unlock;
+			return 0;
 		}
 		put_page(page);
 	}
+
+	/*
+	 * We can only reuse the page if nobody else maps the huge page or it's
+	 * part.
+	 */
 	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		ret |= VM_FAULT_WRITE;
 		unlock_page(page);
-		goto out_unlock;
-	}
-	unlock_page(page);
-	get_page(page);
-	spin_unlock(vmf->ptl);
-alloc:
-	if (__transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
-	} else
-		new_page = NULL;
-
-	if (likely(new_page)) {
-		prep_transhuge_page(new_page);
-	} else {
-		if (!page) {
-			split_huge_pmd(vma, vmf->pmd, vmf->address);
-			ret |= VM_FAULT_FALLBACK;
-		} else {
-			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
-			if (ret & VM_FAULT_OOM) {
-				split_huge_pmd(vma, vmf->pmd, vmf->address);
-				ret |= VM_FAULT_FALLBACK;
-			}
-			put_page(page);
-		}
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
-		put_page(new_page);
-		split_huge_pmd(vma, vmf->pmd, vmf->address);
-		if (page)
-			put_page(page);
-		ret |= VM_FAULT_FALLBACK;
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	count_vm_event(THP_FAULT_ALLOC);
-
-	if (!page)
-		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
-	else
-		copy_user_huge_page(new_page, page, vmf->address,
-				    vma, HPAGE_PMD_NR);
-	__SetPageUptodate(new_page);
-
-	mmun_start = haddr;
-	mmun_end = haddr + HPAGE_PMD_SIZE;
-	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
-
-	spin_lock(vmf->ptl);
-	if (page)
-		put_page(page);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
 		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(new_page, memcg, true);
-		put_page(new_page);
-		goto out_mn;
-	} else {
-		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-		page_add_new_anon_rmap(new_page, vma, haddr, true);
-		mem_cgroup_commit_charge(new_page, memcg, false, true);
-		lru_cache_add_active_or_unevictable(new_page, vma);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		if (!page) {
-			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		} else {
-			VM_BUG_ON_PAGE(!PageHead(page), page);
-			page_remove_rmap(page, true);
-			put_page(page);
-		}
-		ret |= VM_FAULT_WRITE;
+		return VM_FAULT_WRITE;
 	}
+
+	unlock_page(page);
 	spin_unlock(vmf->ptl);
-out_mn:
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
-					       mmun_end);
-out:
-	return ret;
-out_unlock:
-	spin_unlock(vmf->ptl);
-	return ret;
+fallback:
+	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+	return VM_FAULT_FALLBACK;
 }
 
 /*
- * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
  */
 static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
 {
-	return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
+	return pmd_write(pmd) ||
+	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
 }
 
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1466,8 +1360,13 @@
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+	if (!try_grab_page(page, flags))
+		return ERR_PTR(-ENOMEM);
+
 	if (flags & FOLL_TOUCH)
 		touch_pmd(vma, addr, pmd, flags);
+
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1496,7 +1395,6 @@
 			goto skip_mlock;
 		if (!trylock_page(page))
 			goto skip_mlock;
-		lru_add_drain();
 		if (page->mapping && !PageDoubleMap(page))
 			mlock_vma_page(page);
 		unlock_page(page);
@@ -1504,8 +1402,6 @@
 skip_mlock:
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-	if (flags & FOLL_GET)
-		get_page(page);
 
 out:
 	return page;
@@ -1518,7 +1414,7 @@
 	struct anon_vma *anon_vma = NULL;
 	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	int page_nid = -1, this_nid = numa_node_id();
+	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
 	int target_nid, last_cpupid = -1;
 	bool page_locked;
 	bool migrated = false;
@@ -1539,8 +1435,7 @@
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		wait_on_page_locked(page);
-		put_page(page);
+		put_and_wait_on_page_locked(page);
 		goto out;
 	}
 
@@ -1564,7 +1459,7 @@
 	 */
 	page_locked = trylock_page(page);
 	target_nid = mpol_misplaced(page, vma, haddr);
-	if (target_nid == -1) {
+	if (target_nid == NUMA_NO_NODE) {
 		/* If the page was locked, there are no parallel migrations */
 		if (page_locked)
 			goto clear_pmdnuma;
@@ -1572,12 +1467,11 @@
 
 	/* Migration could have started since the pmd_trans_migrating check */
 	if (!page_locked) {
-		page_nid = -1;
+		page_nid = NUMA_NO_NODE;
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		wait_on_page_locked(page);
-		put_page(page);
+		put_and_wait_on_page_locked(page);
 		goto out;
 	}
 
@@ -1587,21 +1481,21 @@
 	 */
 	get_page(page);
 	spin_unlock(vmf->ptl);
-	anon_vma = page_lock_anon_vma_read(page);
+	anon_vma = page_lock_anon_vma_read(page, NULL);
 
 	/* Confirm the PMD did not change while page_table_lock was released */
 	spin_lock(vmf->ptl);
 	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
 		unlock_page(page);
 		put_page(page);
-		page_nid = -1;
+		page_nid = NUMA_NO_NODE;
 		goto out_unlock;
 	}
 
 	/* Bail if we fail to protect against THP splits for any reason */
 	if (unlikely(!anon_vma)) {
 		put_page(page);
-		page_nid = -1;
+		page_nid = NUMA_NO_NODE;
 		goto clear_pmdnuma;
 	}
 
@@ -1616,8 +1510,20 @@
 	 * We are not sure a pending tlb flush here is for a huge page
 	 * mapping or not. Hence use the tlb range variant
 	 */
-	if (mm_tlb_flush_pending(vma->vm_mm))
+	if (mm_tlb_flush_pending(vma->vm_mm)) {
 		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+		/*
+		 * change_huge_pmd() released the pmd lock before
+		 * invalidating the secondary MMUs sharing the primary
+		 * MMU pagetables (with ->invalidate_range()). The
+		 * mmu_notifier_invalidate_range_end() (which
+		 * internally calls ->invalidate_range()) in
+		 * change_pmd_range() will run after us, so we can't
+		 * rely on it here and we need an explicit invalidate.
+		 */
+		mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+					      haddr + HPAGE_PMD_SIZE);
+	}
 
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
@@ -1651,7 +1557,7 @@
 	if (anon_vma)
 		page_unlock_anon_vma_read(anon_vma);
 
-	if (page_nid != -1)
+	if (page_nid != NUMA_NO_NODE)
 		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
 				flags);
 
@@ -1671,7 +1577,7 @@
 	struct mm_struct *mm = tlb->mm;
 	bool ret = false;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1747,7 +1653,7 @@
 	pmd_t orig_pmd;
 	spinlock_t *ptl;
 
-	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1758,10 +1664,10 @@
 	 * pgtable_trans_huge_withdraw after finishing pmdp related
 	 * operations.
 	 */
-	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-			tlb->fullmm);
+	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+						tlb->fullmm);
 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-	if (vma_is_dax(vma)) {
+	if (vma_is_special_huge(vma)) {
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
@@ -1785,7 +1691,7 @@
 
 		VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 		entry = pmd_to_swp_entry(orig_pmd);
-		page = pfn_to_page(swp_offset(entry));
+		page = migration_entry_to_page(entry);
 		flush_needed = 0;
 	} else
 		WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1833,18 +1739,12 @@
 }
 
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
-		  unsigned long new_addr, unsigned long old_end,
-		  pmd_t *old_pmd, pmd_t *new_pmd)
+		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
 	struct mm_struct *mm = vma->vm_mm;
 	bool force_flush = false;
-
-	if ((old_addr & ~HPAGE_PMD_MASK) ||
-	    (new_addr & ~HPAGE_PMD_MASK) ||
-	    old_end - old_addr < HPAGE_PMD_SIZE)
-		return false;
 
 	/*
 	 * The destination pmd shouldn't be established, free_pgtables()
@@ -1857,7 +1757,7 @@
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
-	 * ptlocks because exclusive mmap_sem prevents deadlock.
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
 	 */
 	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
 	if (old_ptl) {
@@ -1893,13 +1793,16 @@
  * - HPAGE_PMD_NR is protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, pgprot_t newprot, int prot_numa)
+		unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
 	pmd_t entry;
 	bool preserve_write;
 	int ret;
+	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
@@ -1923,6 +1826,8 @@
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
+			if (pmd_swp_uffd_wp(*pmd))
+				newpmd = pmd_swp_mkuffd_wp(newpmd);
 			set_pmd_at(mm, addr, pmd, newpmd);
 		}
 		goto unlock;
@@ -1941,9 +1846,9 @@
 		goto unlock;
 
 	/*
-	 * In case prot_numa, we are under down_read(mmap_sem). It's critical
+	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
 	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
-	 * which is also under down_read(mmap_sem):
+	 * which is also under mmap_read_lock(mm):
 	 *
 	 *	CPU0:				CPU1:
 	 *				change_huge_pmd(prot_numa=1)
@@ -1966,6 +1871,17 @@
 	entry = pmd_modify(entry, newprot);
 	if (preserve_write)
 		entry = pmd_mk_savedwrite(entry);
+	if (uffd_wp) {
+		entry = pmd_wrprotect(entry);
+		entry = pmd_mkuffd_wp(entry);
+	} else if (uffd_wp_resolve) {
+		/*
+		 * Leave the write bit to be handled by PF interrupt
+		 * handler, then things like COW could be properly
+		 * handled.
+		 */
+		entry = pmd_clear_uffd_wp(entry);
+	}
 	ret = HPAGE_PMD_NR;
 	set_pmd_at(mm, addr, pmd, entry);
 	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2012,7 +1928,6 @@
 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pud_t *pud, unsigned long addr)
 {
-	pud_t orig_pud;
 	spinlock_t *ptl;
 
 	ptl = __pud_trans_huge_lock(pud, vma);
@@ -2024,10 +1939,9 @@
 	 * pgtable_trans_huge_withdraw after finishing pudp related
 	 * operations.
 	 */
-	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
-			tlb->fullmm);
+	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
 	tlb_remove_pud_tlb_entry(tlb, pud, addr);
-	if (vma_is_dax(vma)) {
+	if (vma_is_special_huge(vma)) {
 		spin_unlock(ptl);
 		/* No zero page support yet */
 	} else {
@@ -2054,14 +1968,16 @@
 		 unsigned long address)
 {
 	spinlock_t *ptl;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long haddr = address & HPAGE_PUD_MASK;
+	struct mmu_notifier_range range;
 
-	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
-	ptl = pud_lock(mm, pud);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				address & HPAGE_PUD_MASK,
+				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	ptl = pud_lock(vma->vm_mm, pud);
 	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
 		goto out;
-	__split_huge_pud_locked(vma, pud, haddr);
+	__split_huge_pud_locked(vma, pud, range.start);
 
 out:
 	spin_unlock(ptl);
@@ -2069,8 +1985,7 @@
 	 * No need to double call mmu_notifier->invalidate_range() callback as
 	 * the above pudp_huge_clear_flush_notify() did already call it.
 	 */
-	mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-					       HPAGE_PUD_SIZE);
+	mmu_notifier_invalidate_range_only_end(&range);
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
@@ -2079,7 +1994,7 @@
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgtable_t pgtable;
-	pmd_t _pmd;
+	pmd_t _pmd, old_pmd;
 	int i;
 
 	/*
@@ -2090,7 +2005,7 @@
 	 *
 	 * See Documentation/vm/mmu_notifier.rst
 	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
+	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
@@ -2099,6 +2014,8 @@
 		pte_t *pte, entry;
 		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
 		entry = pte_mkspecial(entry);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pte_mkuffd_wp(entry);
 		pte = pte_offset_map(&_pmd, haddr);
 		VM_BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);
@@ -2115,9 +2032,10 @@
 	struct page *page;
 	pgtable_t pgtable;
 	pmd_t old_pmd, _pmd;
-	bool young, write, soft_dirty, pmd_migration = false;
+	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
 	unsigned long addr;
 	int i;
+	bool success = false;
 
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
@@ -2135,7 +2053,7 @@
 		 */
 		if (arch_needs_pgtable_deposit())
 			zap_deposited_table(mm, pmd);
-		if (vma_is_dax(vma))
+		if (vma_is_special_huge(vma))
 			return;
 		if (unlikely(is_pmd_migration_entry(old_pmd))) {
 			swp_entry_t entry;
@@ -2176,8 +2094,8 @@
 	 * free), userland could trigger a small page size TLB miss on the
 	 * small sized TLB while the hugepage TLB entry is still established in
 	 * the huge TLB. Some CPU doesn't like that.
-	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
-	 * 383 on page 93. Intel should be safe but is also warns that it's
+	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+	 * 383 on page 105. Intel should be safe but is also warns that it's
 	 * only safe if the permission and cache attributes of the two entries
 	 * loaded in the two TLB is identical (which should be the case here).
 	 * But it is generally safer to never allow small and huge TLB entries
@@ -2195,10 +2113,11 @@
 		swp_entry_t entry;
 
 		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_to_page(swp_offset(entry));
+		page = migration_entry_to_page(entry);
 		write = is_write_migration_entry(entry);
 		young = false;
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
+		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
 		page = pmd_page(old_pmd);
 		if (pmd_dirty(old_pmd))
@@ -2206,6 +2125,7 @@
 		write = pmd_write(old_pmd);
 		young = pmd_young(old_pmd);
 		soft_dirty = pmd_soft_dirty(old_pmd);
+		uffd_wp = pmd_uffd_wp(old_pmd);
 	}
 	VM_BUG_ON_PAGE(!page_count(page), page);
 	page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2230,21 +2150,29 @@
 			entry = swp_entry_to_pte(swp_entry);
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
+			if (uffd_wp)
+				entry = pte_swp_mkuffd_wp(entry);
 		} else {
 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
-			entry = maybe_mkwrite(entry, vma);
+			entry = maybe_mkwrite(entry, vma->vm_flags);
 			if (!write)
 				entry = pte_wrprotect(entry);
 			if (!young)
 				entry = pte_mkold(entry);
 			if (soft_dirty)
 				entry = pte_mksoft_dirty(entry);
+			if (uffd_wp)
+				entry = pte_mkuffd_wp(entry);
 		}
 		pte = pte_offset_map(&_pmd, addr);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, entry);
-		if (!pmd_migration)
-			atomic_inc(&page[i]._mapcount);
+		if (!pmd_migration) {
+			trace_android_vh_update_page_mapcount(&page[i], true,
+						false, NULL, &success);
+			if (!success)
+				atomic_inc(&page[i]._mapcount);
+		}
 		pte_unmap(pte);
 	}
 
@@ -2255,8 +2183,12 @@
 	 */
 	if (compound_mapcount(page) > 1 &&
 	    !TestSetPageDoubleMap(page)) {
-		for (i = 0; i < HPAGE_PMD_NR; i++)
-			atomic_inc(&page[i]._mapcount);
+		for (i = 0; i < HPAGE_PMD_NR; i++) {
+			trace_android_vh_update_page_mapcount(&page[i], true,
+						false, NULL, &success);
+			if (!success)
+				atomic_inc(&page[i]._mapcount);
+		}
 	}
 
 	lock_page_memcg(page);
@@ -2265,8 +2197,12 @@
 		__dec_lruvec_page_state(page, NR_ANON_THPS);
 		if (TestClearPageDoubleMap(page)) {
 			/* No need in mapcount reference anymore */
-			for (i = 0; i < HPAGE_PMD_NR; i++)
-				atomic_dec(&page[i]._mapcount);
+			for (i = 0; i < HPAGE_PMD_NR; i++) {
+				trace_android_vh_update_page_mapcount(&page[i],
+						false, false, NULL, &success);
+				if (!success)
+					atomic_dec(&page[i]._mapcount);
+			}
 		}
 	}
 	unlock_page_memcg(page);
@@ -2287,13 +2223,15 @@
 		unsigned long address, bool freeze, struct page *page)
 {
 	spinlock_t *ptl;
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
+	struct mmu_notifier_range range;
 	bool do_unlock_page = false;
 	pmd_t _pmd;
 
-	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
-	ptl = pmd_lock(mm, pmd);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				address & HPAGE_PMD_MASK,
+				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	ptl = pmd_lock(vma->vm_mm, pmd);
 
 	/*
 	 * If caller asks to setup a migration entries, we need a page to check
@@ -2339,7 +2277,7 @@
 			clear_page_mlock(page);
 	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
 		goto out;
-	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
+	__split_huge_pmd_locked(vma, pmd, range.start, freeze);
 out:
 	spin_unlock(ptl);
 	if (do_unlock_page)
@@ -2357,8 +2295,7 @@
 	 * any further changes to individual pte will notify. So no need
 	 * to call mmu_notifier->invalidate_range()
 	 */
-	mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
-					       HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_only_end(&range);
 }
 
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2413,13 +2350,13 @@
 
 	/*
 	 * If we're also updating the vma->vm_next->vm_start, if the new
-	 * vm_next->vm_start isn't page aligned and it could previously
+	 * vm_next->vm_start isn't hpage aligned and it could previously
 	 * contain an hugepage: check if we need to split an huge pmd.
 	 */
 	if (adjust_next > 0) {
 		struct vm_area_struct *next = vma->vm_next;
 		unsigned long nstart = next->vm_start;
-		nstart += adjust_next << PAGE_SHIFT;
+		nstart += adjust_next;
 		if (nstart & ~HPAGE_PMD_MASK &&
 		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
 		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
@@ -2429,8 +2366,8 @@
 
 static void unmap_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
-		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
+	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
+		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
@@ -2442,13 +2379,13 @@
 	VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
 }
 
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
 {
 	int i;
 	if (PageTransHuge(page)) {
 		remove_migration_ptes(page, page, true);
 	} else {
-		for (i = 0; i < HPAGE_PMD_NR; i++)
+		for (i = 0; i < nr; i++)
 			remove_migration_ptes(page + i, page + i, true);
 	}
 }
@@ -2477,6 +2414,9 @@
 			 (1L << PG_workingset) |
 			 (1L << PG_locked) |
 			 (1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+			 (1L << PG_arch_2) |
+#endif
 			 (1L << PG_dirty)));
 
 	/* ->mapping in first tail page is compound_mapcount */
@@ -2519,16 +2459,27 @@
 		pgoff_t end, unsigned long flags)
 {
 	struct page *head = compound_head(page);
-	struct zone *zone = page_zone(head);
+	pg_data_t *pgdat = page_pgdat(head);
 	struct lruvec *lruvec;
+	struct address_space *swap_cache = NULL;
+	unsigned long offset = 0;
+	unsigned int nr = thp_nr_pages(head);
 	int i;
 
-	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
+	lruvec = mem_cgroup_page_lruvec(head, pgdat);
 
 	/* complete memcg works before add pages to LRU */
-	mem_cgroup_split_huge_fixup(head);
+	split_page_memcg(head, nr);
 
-	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+	if (PageAnon(head) && PageSwapCache(head)) {
+		swp_entry_t entry = { .val = page_private(head) };
+
+		offset = swp_offset(entry);
+		swap_cache = swap_address_space(entry);
+		xa_lock(&swap_cache->i_pages);
+	}
+
+	for (i = nr - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
 		/* Some pages can be beyond i_size: drop them from page cache */
 		if (head[i].index >= end) {
@@ -2537,31 +2488,45 @@
 			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
 				shmem_uncharge(head->mapping->host, 1);
 			put_page(head + i);
+		} else if (!PageAnon(page)) {
+			__xa_store(&head->mapping->i_pages, head[i].index,
+					head + i, 0);
+		} else if (swap_cache) {
+			__xa_store(&swap_cache->i_pages, offset + i,
+					head + i, 0);
 		}
 	}
 
 	ClearPageCompound(head);
 
-	split_page_owner(head, HPAGE_PMD_ORDER);
+	split_page_owner(head, nr);
 
 	/* See comment in __split_huge_page_tail() */
 	if (PageAnon(head)) {
-		/* Additional pin to radix tree of swap cache */
-		if (PageSwapCache(head))
+		/* Additional pin to swap cache */
+		if (PageSwapCache(head)) {
 			page_ref_add(head, 2);
-		else
+			xa_unlock(&swap_cache->i_pages);
+		} else {
 			page_ref_inc(head);
+		}
 	} else {
-		/* Additional pin to radix tree */
+		/* Additional pin to page cache */
 		page_ref_add(head, 2);
 		xa_unlock(&head->mapping->i_pages);
 	}
 
-	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
 
-	remap_page(head);
+	remap_page(head, nr);
 
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
+	if (PageSwapCache(head)) {
+		swp_entry_t entry = { .val = page_private(head) };
+
+		split_swap_cluster(entry);
+	}
+
+	for (i = 0; i < nr; i++) {
 		struct page *subpage = head + i;
 		if (subpage == page)
 			continue;
@@ -2580,7 +2545,7 @@
 
 int total_mapcount(struct page *page)
 {
-	int i, compound, ret;
+	int i, compound, nr, ret;
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -2588,16 +2553,17 @@
 		return atomic_read(&page->_mapcount) + 1;
 
 	compound = compound_mapcount(page);
+	nr = compound_nr(page);
 	if (PageHuge(page))
 		return compound;
 	ret = compound;
-	for (i = 0; i < HPAGE_PMD_NR; i++)
+	for (i = 0; i < nr; i++)
 		ret += atomic_read(&page[i]._mapcount) + 1;
 	/* File pages has compound_mapcount included in _mapcount */
 	if (!PageAnon(page))
-		return ret - compound * HPAGE_PMD_NR;
+		return ret - compound * nr;
 	if (PageDoubleMap(page))
-		ret -= HPAGE_PMD_NR;
+		ret -= nr;
 	return ret;
 }
 
@@ -2642,14 +2608,14 @@
 	page = compound_head(page);
 
 	_total_mapcount = ret = 0;
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
+	for (i = 0; i < thp_nr_pages(page); i++) {
 		mapcount = atomic_read(&page[i]._mapcount) + 1;
 		ret = max(ret, mapcount);
 		_total_mapcount += mapcount;
 	}
 	if (PageDoubleMap(page)) {
 		ret -= 1;
-		_total_mapcount -= HPAGE_PMD_NR;
+		_total_mapcount -= thp_nr_pages(page);
 	}
 	mapcount = compound_mapcount(page);
 	ret += mapcount;
@@ -2664,11 +2630,11 @@
 {
 	int extra_pins;
 
-	/* Additional pins from radix tree */
+	/* Additional pins from page cache */
 	if (PageAnon(page))
-		extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+		extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
 	else
-		extra_pins = HPAGE_PMD_NR;
+		extra_pins = thp_nr_pages(page);
 	if (pextra_pins)
 		*pextra_pins = extra_pins;
 	return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2697,23 +2663,23 @@
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+	struct deferred_split *ds_queue = get_deferred_split_queue(head);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int extra_pins, ret;
-	bool mlocked;
 	unsigned long flags;
 	pgoff_t end;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	VM_BUG_ON_PAGE(!PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(head), head);
+	VM_BUG_ON_PAGE(!PageCompound(head), head);
 
-	if (PageWriteback(page))
+	if (PageWriteback(head))
 		return -EBUSY;
 
 	if (PageAnon(head)) {
 		/*
-		 * The caller does not necessarily hold an mmap_sem that would
+		 * The caller does not necessarily hold an mmap_lock that would
 		 * prevent the anon_vma disappearing so we first we take a
 		 * reference to it and then lock the anon_vma for write. This
 		 * is similar to page_lock_anon_vma_read except the write lock
@@ -2759,55 +2725,47 @@
 		goto out_unlock;
 	}
 
-	mlocked = PageMlocked(page);
 	unmap_page(head);
 
-	/* Make sure the page is not on per-CPU pagevec as it takes pin */
-	if (mlocked)
-		lru_add_drain();
-
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
+	spin_lock_irqsave(&pgdata->lru_lock, flags);
 
 	if (mapping) {
-		void **pslot;
+		XA_STATE(xas, &mapping->i_pages, page_index(head));
 
-		xa_lock(&mapping->i_pages);
-		pslot = radix_tree_lookup_slot(&mapping->i_pages,
-				page_index(head));
 		/*
-		 * Check if the head page is present in radix tree.
+		 * Check if the head page is present in page cache.
 		 * We assume all tail are present too, if head is there.
 		 */
-		if (radix_tree_deref_slot_protected(pslot,
-					&mapping->i_pages.xa_lock) != head)
+		xa_lock(&mapping->i_pages);
+		if (xas_load(&xas) != head)
 			goto fail;
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock(&pgdata->split_queue_lock);
+	spin_lock(&ds_queue->split_queue_lock);
 	if (page_ref_freeze(head, 1 + extra_pins)) {
 		if (!list_empty(page_deferred_list(head))) {
-			pgdata->split_queue_len--;
+			ds_queue->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		if (mapping)
-			__dec_node_page_state(page, NR_SHMEM_THPS);
-		spin_unlock(&pgdata->split_queue_lock);
-		__split_huge_page(page, list, end, flags);
-		if (PageSwapCache(head)) {
-			swp_entry_t entry = { .val = page_private(head) };
+		spin_unlock(&ds_queue->split_queue_lock);
+		if (mapping) {
+			if (PageSwapBacked(head))
+				__dec_node_page_state(head, NR_SHMEM_THPS);
+			else
+				__dec_node_page_state(head, NR_FILE_THPS);
+		}
 
-			ret = split_swap_cluster(entry);
-		} else
-			ret = 0;
+		__split_huge_page(page, list, end, flags);
+		ret = 0;
 	} else {
-		spin_unlock(&pgdata->split_queue_lock);
+		spin_unlock(&ds_queue->split_queue_lock);
 fail:
 		if (mapping)
 			xa_unlock(&mapping->i_pages);
-		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-		remap_page(head);
+		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+		remap_page(head, thp_nr_pages(head));
 		ret = -EBUSY;
 	}
 
@@ -2825,53 +2783,89 @@
 
 void free_transhuge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
 	unsigned long flags;
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (!list_empty(page_deferred_list(page))) {
-		pgdata->split_queue_len--;
+		ds_queue->split_queue_len--;
 		list_del(page_deferred_list(page));
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 	free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	/*
+	 * The try_to_unmap() in page reclaim path might reach here too,
+	 * this may cause a race condition to corrupt deferred split queue.
+	 * And, if page reclaim is already handling the same page, it is
+	 * unnecessary to handle it again in shrinker.
+	 *
+	 * Check PageSwapCache to determine if the page is being
+	 * handled by page reclaim since THP swap would add the page into
+	 * swap cache before calling try_to_unmap().
+	 */
+	if (PageSwapCache(page))
+		return;
+
+	if (!list_empty(page_deferred_list(page)))
+		return;
+
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (list_empty(page_deferred_list(page))) {
 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
-		pgdata->split_queue_len++;
+		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+		ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+		if (memcg)
+			memcg_set_shrinker_bit(memcg, page_to_nid(page),
+					       deferred_split_shrinker.id);
+#endif
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	return READ_ONCE(pgdata->split_queue_len);
+	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+	return READ_ONCE(ds_queue->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
+	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 	unsigned long flags;
 	LIST_HEAD(list), *pos, *next;
 	struct page *page;
 	int split = 0;
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_safe(pos, next, &pgdata->split_queue) {
+	list_for_each_safe(pos, next, &ds_queue->split_queue) {
 		page = list_entry((void *)pos, struct page, mapping);
 		page = compound_head(page);
 		if (get_page_unless_zero(page)) {
@@ -2879,12 +2873,12 @@
 		} else {
 			/* We lost race with put_compound_page() */
 			list_del_init(page_deferred_list(page));
-			pgdata->split_queue_len--;
+			ds_queue->split_queue_len--;
 		}
 		if (!--sc->nr_to_scan)
 			break;
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	list_for_each_safe(pos, next, &list) {
 		page = list_entry((void *)pos, struct page, mapping);
@@ -2898,15 +2892,15 @@
 		put_page(page);
 	}
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
-	list_splice_tail(&list, &pgdata->split_queue);
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	list_splice_tail(&list, &ds_queue->split_queue);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	/*
 	 * Stop shrinker if we didn't split any page, but the queue is empty.
 	 * This can happen if pages were freed under us.
 	 */
-	if (!split && list_empty(&pgdata->split_queue))
+	if (!split && list_empty(&ds_queue->split_queue))
 		return SHRINK_STOP;
 	return split;
 }
@@ -2915,7 +2909,8 @@
 	.count_objects = deferred_split_count,
 	.scan_objects = deferred_split_scan,
 	.seeks = DEFAULT_SEEKS,
-	.flags = SHRINKER_NUMA_AWARE,
+	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+		 SHRINKER_NONSLAB,
 };
 
 #ifdef CONFIG_DEBUG_FS
@@ -2959,17 +2954,13 @@
 
 	return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
 		"%llu\n");
 
 static int __init split_huge_pages_debugfs(void)
 {
-	void *ret;
-
-	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
-			&split_huge_pages_fops);
-	if (!ret)
-		pr_warn("Failed to create split_huge_pages in debugfs");
+	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+			    &split_huge_pages_fops);
 	return 0;
 }
 late_initcall(split_huge_pages_debugfs);
@@ -3021,6 +3012,8 @@
 		pmde = pmd_mksoft_dirty(pmde);
 	if (is_write_migration_entry(entry))
 		pmde = maybe_pmd_mkwrite(pmde, vma);
+	if (pmd_swp_uffd_wp(*pvmw->pmd))
+		pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
 
 	flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
 	if (PageAnon(new))