2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
@@ -68,7 +68,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
@@ -126,6 +126,32 @@
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @node is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (node == NUMA_NO_NODE || node_online(node))
+		return node;
+
+	min_node = node;
+	for_each_online_node(n) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
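/*
 * A minimal usage sketch (not part of the diff above): the new helper is
 * useful when a caller holds a possibly-offline node id, e.g. one taken
 * from a device's firmware-reported proximity domain. The caller below is
 * hypothetical, and it assumes the declaration is reachable via
 * <linux/numa.h>; NUMA_NO_NODE and already-online nodes pass through
 * unchanged, anything else maps to the nearest online node by
 * node_distance().
 */
#include <linux/numa.h>
#include <linux/slab.h>

static void *alloc_near_node(int maybe_offline_nid, size_t size)
{
	int nid = numa_map_to_online_node(maybe_offline_nid);

	return kmalloc_node(size, GFP_KERNEL, nid);
}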
@@ -198,7 +224,7 @@
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,13 +368,13 @@
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -372,17 +398,20 @@
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
  */
 
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
 
-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	mmap_write_lock(mm);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		vm_write_begin(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
-	up_write(&mm->mmap_sem);
+		vm_write_end(vma);
+	}
+	mmap_write_unlock(mm);
 }
 
 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
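/*
 * Throughout this patch, raw rwsem calls on mm->mmap_sem are replaced by
 * the mmap_lock wrapper API. A sketch of the correspondence (only the
 * wrappers are real; the helper below is hypothetical):
 *
 *	down_read(&mm->mmap_sem)   ->  mmap_read_lock(mm)
 *	up_read(&mm->mmap_sem)     ->  mmap_read_unlock(mm)
 *	down_write(&mm->mmap_sem)  ->  mmap_write_lock(mm)
 *	up_write(&mm->mmap_sem)    ->  mmap_write_unlock(mm)
 */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static void for_each_vma_locked(struct mm_struct *mm,
				void (*fn)(struct vm_area_struct *vma))
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);		/* was: down_read(&mm->mmap_sem) */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		fn(vma);
	mmap_read_unlock(mm);		/* was: up_read(&mm->mmap_sem) */
}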
@@ -410,7 +439,9 @@
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
 };
 
 /*
416447 /*
....@@ -440,6 +471,7 @@
440471 */
441472 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
442473 unsigned long end, struct mm_walk *walk)
474
+ __releases(ptl)
443475 {
444476 int ret = 0;
445477 struct page *page;
@@ -555,9 +587,10 @@
 			       unsigned long addr, unsigned long end,
 			       struct mm_walk *walk)
 {
+	int ret = 0;
 #ifdef CONFIG_HUGETLB_PAGE
 	struct queue_pages *qp = walk->private;
-	unsigned long flags = qp->flags;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -569,16 +602,44 @@
 	page = pte_page(entry);
 	if (!queue_pages_required(page, qp))
 		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced page and no
+		 * need to further check other vma.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() have
+		 * stopped walking current vma.
+		 * Detecting misplaced page but allow migrating pages which
+		 * have been queued.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
-	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, qp->pagelist);
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+		if (!isolate_huge_page(page, qp->pagelist) &&
+			(flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate page but allow migrating pages
+			 * which have been queued.
+			 */
+			ret = 1;
+	}
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-	return 0;
+	return ret;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +657,7 @@
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
@@ -618,6 +679,22 @@
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	/* range check first */
+	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ -628,22 +705,10 @@
 
 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (!is_vm_hugetlb_page(vma) &&
-			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 			!(vma->vm_flags & VM_MIXEDMAP))
 			change_prot_numa(vma, start, endvma);
 		return 1;
@@ -654,6 +719,12 @@
 		return 0;
 	return 1;
 }
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry		= queue_pages_hugetlb,
+	.pmd_entry		= queue_pages_pte_range,
+	.test_walk		= queue_pages_test_walk,
+};
 
 /*
  * Walk through page tables and collect pages to be migrated.
@@ -675,26 +746,28 @@
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
-	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
+		.start = start,
+		.end = end,
+		.first = NULL,
 	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
 }
 
 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
 						struct mempolicy *pol)
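/*
 * A minimal sketch (not from this patch) of the pagewalk interface that
 * queue_pages_range() is converted to above: callbacks now live in a
 * const struct mm_walk_ops, walk_page_range() takes the mm and the private
 * pointer directly, and the caller must hold the mmap_lock. The counting
 * callback and helper below are hypothetical.
 */
#include <linux/pagewalk.h>
#include <linux/mm.h>

static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	(*nr)++;		/* called for each populated PMD in the range */
	return 0;		/* 0 continues the walk, a negative value aborts it */
}

static const struct mm_walk_ops count_walk_ops = {
	.pmd_entry	= count_pmd_entry,
};

static unsigned long count_pmds_in_range(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long nr = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &count_walk_ops, &nr);
	mmap_read_unlock(mm);

	return nr;
}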
@@ -712,6 +785,7 @@
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	vm_write_begin(vma);
 	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
 		if (err)
@@ -719,11 +793,17 @@
 	}
 
 	old = vma->vm_policy;
-	vma->vm_policy = new; /* protected by mmap_sem */
+	/*
+	 * The speculative page fault handler accesses this field without
+	 * holding the mmap_sem.
+	 */
+	WRITE_ONCE(vma->vm_policy, new);
+	vm_write_end(vma);
 	mpol_put(old);
 
 	return 0;
 err_out:
+	vm_write_end(vma);
 	mpol_put(new);
 	return err;
 }
@@ -732,7 +812,6 @@
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
@@ -741,15 +820,13 @@
 	unsigned long vmend;
 
 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);
 
 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
+	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 		vmstart = max(start, vma->vm_start);
 		vmend = min(end, vma->vm_end);
 
@@ -764,10 +841,6 @@
 				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
-			next = vma->vm_next;
-			if (mpol_equal(vma_policy(vma), new_pol))
-				continue;
-			/* vma_merge() joined vma && vma->next, case 8 */
 			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
@@ -807,13 +880,12 @@
 		goto out;
 	}
 
-	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
 		mpol_put(new);
 		goto out;
 	}
+	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && new->mode == MPOL_INTERLEAVE)
@@ -839,7 +911,6 @@
 
 	switch (p->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -853,16 +924,19 @@
 	}
 }
 
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
+	struct page *p = NULL;
 	int err;
 
-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
-	if (err >= 0) {
+	int locked = 1;
+	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+	if (err > 0) {
 		err = page_to_nid(p);
 		put_page(p);
 	}
+	if (locked)
+		mmap_read_unlock(mm);
 	return err;
 }
 
@@ -873,7 +947,7 @@
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 
 	if (flags &
 	    ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -895,10 +969,10 @@
 		 * vma/shared policy at addr is NULL. We
 		 * want to return MPOL_DEFAULT in this case.
 		 */
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
-			up_read(&mm->mmap_sem);
+			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -913,7 +987,16 @@
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(addr);
+			/*
+			 * Take a refcount on the mpol, lookup_node()
+			 * will drop the mmap_lock, so after calling
+			 * lookup_node() only "pol" remains valid, "vma"
+			 * is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
 			*policy = err;
@@ -948,7 +1031,9 @@
  out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }
 
@@ -967,8 +1052,8 @@
 	if (!isolate_lru_page(head)) {
 		list_add_tail(&head->lru, pagelist);
 		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
 	} else if (flags & MPOL_MF_STRICT) {
 		/*
 		 * Non-movable page may reach here. And, there may be
@@ -984,27 +1069,6 @@
 	return 0;
 }
 
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page))
-		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					node);
-	else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-						    __GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -1015,6 +1079,10 @@
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ -1029,8 +1097,8 @@
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-					MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
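/*
 * Sketch of the allocation-callback contract the two hunks above move to
 * (the wrapper function itself is hypothetical): instead of a file-local
 * callback, the generic alloc_migration_target() is used, and the target
 * node plus gfp flags travel through the "private" argument as a pointer
 * to struct migration_target_control cast to unsigned long. Note that the
 * control structure is mm-internal (mm/internal.h in trees that carry it),
 * so a helper like this would live inside mm/.
 */
#include <linux/migrate.h>
#include <linux/gfp.h>
#include "internal.h"		/* struct migration_target_control */

static int migrate_list_to_node(struct list_head *pagelist, int nid)
{
	struct migration_target_control mtc = {
		.nid = nid,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	/* Pages that could not be migrated stay on the list for the caller. */
	return migrate_pages(pagelist, alloc_migration_target, NULL,
			     (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
}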
@@ -1048,14 +1116,12 @@
 			   const nodemask_t *to, int flags)
 {
 	int busy = 0;
-	int err;
+	int err = 0;
 	nodemask_t tmp;
 
-	err = migrate_prep();
-	if (err)
-		return err;
+	lru_cache_disable();
 
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1136,7 +1202,9 @@
 		if (err < 0)
 			break;
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1153,7 +1221,7 @@
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;
 
 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ -1252,19 +1320,15 @@
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
 			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
13011365 putback_movable_pages(&pagelist);
13021366 }
13031367
1304
- up_write(&mm->mmap_sem);
1368
+ mmap_write_unlock(mm);
13051369 mpol_out:
13061370 mpol_put(new);
1371
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1372
+ lru_cache_enable();
13071373 return err;
13081374 }
13091375
....@@ -1505,10 +1571,6 @@
15051571 if (nodes_empty(*new))
15061572 goto out_put;
15071573
1508
- nodes_and(*new, *new, node_states[N_MEMORY]);
1509
- if (nodes_empty(*new))
1510
- goto out_put;
1511
-
15121574 err = security_task_movememory(task);
15131575 if (err)
15141576 goto out_put;
....@@ -1552,13 +1614,13 @@
15521614 unsigned long flags)
15531615 {
15541616 int err;
1555
- int uninitialized_var(pval);
1617
+ int pval;
15561618 nodemask_t nodes;
1557
-
1558
- addr = untagged_addr(addr);
15591619
15601620 if (nmask != NULL && maxnode < nr_node_ids)
15611621 return -EINVAL;
1622
+
1623
+ addr = untagged_addr(addr);
15621624
15631625 err = do_get_mempolicy(&pval, &nodes, addr, flags);
15641626
....@@ -1691,26 +1753,59 @@
16911753
16921754 #endif /* CONFIG_COMPAT */
16931755
1756
+bool vma_migratable(struct vm_area_struct *vma)
1757
+{
1758
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1759
+ return false;
1760
+
1761
+ /*
1762
+ * DAX device mappings require predictable access latency, so avoid
1763
+ * incurring periodic faults.
1764
+ */
1765
+ if (vma_is_dax(vma))
1766
+ return false;
1767
+
1768
+ if (is_vm_hugetlb_page(vma) &&
1769
+ !hugepage_migration_supported(hstate_vma(vma)))
1770
+ return false;
1771
+
1772
+ /*
1773
+ * Migration allocates pages in the highest zone. If we cannot
1774
+ * do so then migration (at least from node to node) is not
1775
+ * possible.
1776
+ */
1777
+ if (vma->vm_file &&
1778
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1779
+ < policy_zone)
1780
+ return false;
1781
+ return true;
1782
+}
1783
+
16941784 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
16951785 unsigned long addr)
16961786 {
1697
- struct mempolicy *pol = NULL;
1787
+ struct mempolicy *pol;
16981788
1699
- if (vma) {
1700
- if (vma->vm_ops && vma->vm_ops->get_policy) {
1701
- pol = vma->vm_ops->get_policy(vma, addr);
1702
- } else if (vma->vm_policy) {
1703
- pol = vma->vm_policy;
1789
+ if (!vma)
1790
+ return NULL;
17041791
1705
- /*
1706
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
1707
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1708
- * count on these policies which will be dropped by
1709
- * mpol_cond_put() later
1710
- */
1711
- if (mpol_needs_cond_ref(pol))
1712
- mpol_get(pol);
1713
- }
1792
+ if (vma->vm_ops && vma->vm_ops->get_policy)
1793
+ return vma->vm_ops->get_policy(vma, addr);
1794
+
1795
+ /*
1796
+ * This could be called without holding the mmap_sem in the
1797
+ * speculative page fault handler's path.
1798
+ */
1799
+ pol = READ_ONCE(vma->vm_policy);
1800
+ if (pol) {
1801
+ /*
1802
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
1803
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1804
+ * count on these policies which will be dropped by
1805
+ * mpol_cond_put() later
1806
+ */
1807
+ if (mpol_needs_cond_ref(pol))
1808
+ mpol_get(pol);
17141809 }
17151810
17161811 return pol;
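/*
 * The WRITE_ONCE() in vma_replace_policy() and the READ_ONCE() here form a
 * publish/consume pair for the speculative-page-fault path: the writer
 * still updates vma->vm_policy under the mmap_lock (inside the
 * vm_write_begin()/vm_write_end() window), but a lockless reader may load
 * the pointer concurrently. A generic sketch of that pattern with
 * hypothetical names, not code from this patch; ordering of the pointee's
 * initialisation and its lifetime are the writer's problem (the mempolicy
 * case relies on the vma sequence count and refcounting for that).
 */
#include <linux/compiler.h>

struct cfg { int value; };

static struct cfg *current_cfg;			/* shared pointer, never freed here */

static void publish_cfg(struct cfg *new)	/* writer holds its usual lock */
{
	WRITE_ONCE(current_cfg, new);		/* single, untorn pointer store */
}

static int read_cfg_value(void)			/* reader may hold no lock */
{
	struct cfg *c = READ_ONCE(current_cfg);	/* single, untorn pointer load */

	return c ? c->value : -1;
}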
@@ -1785,7 +1880,7 @@
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
  */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1797,8 +1892,7 @@
 }
 
 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-								int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
 		nd = policy->v.preferred_node;
@@ -1986,7 +2080,6 @@
 		break;
 
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->v.nodes;
 		break;
@@ -2081,7 +2174,7 @@
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
  * all allocations for pages that will be mapped into user space. Returns
  * NULL when no page can be allocated.
@@ -2119,43 +2212,29 @@
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED &&
-						!(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = pol->v.preferred_node;
 
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
 			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
+			 * First, try to allocate THP only on local node, but
+			 * don't reclaim unnecessarily, just compact.
 			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
+			page = __alloc_pages_node(hpage_node,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * synchronous compact or the vma has been madvised
+			 * to prefer hugepage backing, retry allowing remote
+			 * memory with both reclaim and compact as well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_nodemask(gfp, order,
+						hpage_node, nmask);
+
 			goto out;
 		}
 	}
@@ -2167,6 +2246,7 @@
 out:
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
  * alloc_pages_current - Allocate pages.
@@ -2266,7 +2346,6 @@
 
 	switch (a->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
@@ -2399,7 +2478,7 @@
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
-	int polnid = -1;
+	int polnid = NUMA_NO_NODE;
 	int ret = -1;
 
 	pol = get_vma_policy(vma, addr);
@@ -2573,6 +2652,7 @@
 		mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 		if (!mpol_new)
 			goto err_out;
+		atomic_set(&mpol_new->refcnt, 1);
 		goto restart;
 	}
 
@@ -2805,12 +2885,11 @@
 int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
-	unsigned short mode;
 	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int err = 1;
+	int err = 1, mode;
 
 	if (flags)
 		*flags++ = '\0';	/* terminate mode string */
@@ -2825,12 +2904,8 @@
 	} else
 		nodes_clear(nodes);
 
-	for (mode = 0; mode < MPOL_MAX; mode++) {
-		if (!strcmp(str, policy_modes[mode])) {
-			break;
-		}
-	}
-	if (mode >= MPOL_MAX)
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
 		goto out;
 
 	switch (mode) {
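/*
 * The last hunk replaces an open-coded scan with match_string() from
 * <linux/string.h>, which returns the index of the matching array entry or
 * a negative errno (-EINVAL when nothing matches); that is also why "mode"
 * changes from unsigned short to a plain int earlier in the function. A
 * small free-standing sketch with a hypothetical table:
 */
#include <linux/string.h>
#include <linux/kernel.h>

static int parse_policy_name(const char *name)
{
	static const char * const names[] = {
		"default", "prefer", "bind", "interleave", "local",
	};

	/* Index into names[] on success, negative errno on failure. */
	return match_string(names, ARRAY_SIZE(names), name);
}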