2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
@@ -68,7 +68,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
@@ -126,6 +126,32 @@
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (node == NUMA_NO_NODE || node_online(node))
+		return node;
+
+	min_node = node;
+	for_each_online_node(n) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
@@ -198,7 +224,7 @@
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,13 +368,13 @@
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 		nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -372,17 +398,20 @@
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
  */
 
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
 
-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	mmap_write_lock(mm);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		vm_write_begin(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
-	up_write(&mm->mmap_sem);
+		vm_write_end(vma);
+	}
+	mmap_write_unlock(mm);
 }
 
 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -410,7 +439,9 @@
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
 };
 
 /*
@@ -440,6 +471,7 @@
  */
 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
+	__releases(ptl)
 {
 	int ret = 0;
 	struct page *page;
@@ -555,9 +587,10 @@
			unsigned long addr, unsigned long end,
			struct mm_walk *walk)
 {
+	int ret = 0;
 #ifdef CONFIG_HUGETLB_PAGE
 	struct queue_pages *qp = walk->private;
-	unsigned long flags = qp->flags;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -569,16 +602,45 @@
 	page = pte_page(entry);
 	if (!queue_pages_required(page, qp))
 		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced page and no
+		 * need to further check other vma.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() have
+		 * stopped walking current vma.
+		 * Detecting misplaced page but allow migrating pages which
+		 * have been queued.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
-	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, qp->pagelist);
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+	     !hugetlb_pmd_shared(pte))) {
+		if (isolate_hugetlb(page, qp->pagelist) &&
+		    (flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate page but allow migrating pages
+			 * which have been queued.
+			 */
+			ret = 1;
+	}
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-	return 0;
+	return ret;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +658,7 @@
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
@@ -618,6 +680,22 @@
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	/* range check first */
+	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ -628,22 +706,10 @@
 
 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (!is_vm_hugetlb_page(vma) &&
-			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
@@ -654,6 +720,12 @@
 		return 0;
 	return 1;
 }
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry = queue_pages_hugetlb,
+	.pmd_entry = queue_pages_pte_range,
+	.test_walk = queue_pages_test_walk,
+};
 
 /*
  * Walk through page tables and collect pages to be migrated.
@@ -675,26 +747,28 @@
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
-	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
+		.start = start,
+		.end = end,
+		.first = NULL,
 	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
 }
 
 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
@@ -712,6 +786,7 @@
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	vm_write_begin(vma);
 	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
 		if (err)
@@ -719,11 +794,17 @@
 	}
 
 	old = vma->vm_policy;
-	vma->vm_policy = new; /* protected by mmap_sem */
+	/*
+	 * The speculative page fault handler accesses this field without
+	 * hodling the mmap_sem.
+	 */
+	WRITE_ONCE(vma->vm_policy, new);
+	vm_write_end(vma);
 	mpol_put(old);
 
 	return 0;
 err_out:
+	vm_write_end(vma);
 	mpol_put(new);
 	return err;
 }
@@ -732,7 +813,6 @@
 static int mbind_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
@@ -741,15 +821,13 @@
 	unsigned long vmend;
 
 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);
 
 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
+	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 		vmstart = max(start, vma->vm_start);
 		vmend = min(end, vma->vm_end);
 
@@ -764,10 +842,6 @@
				vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
-			next = vma->vm_next;
-			if (mpol_equal(vma_policy(vma), new_pol))
-				continue;
-			/* vma_merge() joined vma && vma->next, case 8 */
 			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
@@ -807,13 +881,12 @@
 		goto out;
 	}
 
-	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
 		mpol_put(new);
 		goto out;
 	}
+	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && new->mode == MPOL_INTERLEAVE)
@@ -839,7 +912,6 @@
 
 	switch (p->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -853,16 +925,19 @@
 	}
 }
 
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
+	struct page *p = NULL;
 	int err;
 
-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
-	if (err >= 0) {
+	int locked = 1;
+	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+	if (err > 0) {
 		err = page_to_nid(p);
 		put_page(p);
 	}
+	if (locked)
+		mmap_read_unlock(mm);
 	return err;
 }
 
@@ -873,7 +948,7 @@
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 
 	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -895,10 +970,10 @@
 		 * vma/shared policy at addr is NULL. We
 		 * want to return MPOL_DEFAULT in this case.
 		 */
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
-			up_read(&mm->mmap_sem);
+			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -913,7 +988,16 @@
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(addr);
+			/*
+			 * Take a refcount on the mpol, lookup_node()
+			 * wil drop the mmap_lock, so after calling
+			 * lookup_node() only "pol" remains valid, "vma"
+			 * is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
@@ -948,7 +1032,9 @@
 out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }
 
@@ -967,8 +1053,8 @@
 	if (!isolate_lru_page(head)) {
 		list_add_tail(&head->lru, pagelist);
 		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
 	} else if (flags & MPOL_MF_STRICT) {
 		/*
 		 * Non-movable page may reach here. And, there may be
@@ -984,27 +1070,6 @@
 	return 0;
 }
 
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page))
-		return alloc_huge_page_node(page_hstate(compound_head(page)),
-				node);
-	else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-				__GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -1015,6 +1080,10 @@
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ -1029,8 +1098,8 @@
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-				MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
@@ -1048,14 +1117,12 @@
			const nodemask_t *to, int flags)
 {
 	int busy = 0;
-	int err;
+	int err = 0;
 	nodemask_t tmp;
 
-	err = migrate_prep();
-	if (err)
-		return err;
+	lru_cache_disable();
 
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1136,7 +1203,9 @@
 		if (err < 0)
 			break;
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1153,7 +1222,7 @@
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;
 
 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ -1252,19 +1321,15 @@
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
@@ -1301,9 +1366,11 @@
			putback_movable_pages(&pagelist);
 	}
 
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_enable();
 	return err;
 }
 
@@ -1505,10 +1572,6 @@
 	if (nodes_empty(*new))
 		goto out_put;
 
-	nodes_and(*new, *new, node_states[N_MEMORY]);
-	if (nodes_empty(*new))
-		goto out_put;
-
 	err = security_task_movememory(task);
 	if (err)
 		goto out_put;
@@ -1552,13 +1615,13 @@
			unsigned long flags)
 {
 	int err;
-	int uninitialized_var(pval);
+	int pval;
 	nodemask_t nodes;
-
-	addr = untagged_addr(addr);
 
 	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
+
+	addr = untagged_addr(addr);
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
 
@@ -1691,26 +1754,59 @@
 
 #endif /* CONFIG_COMPAT */
 
+bool vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		return false;
+
+	/*
+	 * DAX device mappings require predictable access latency, so avoid
+	 * incurring periodic faults.
+	 */
+	if (vma_is_dax(vma))
+		return false;
+
+	if (is_vm_hugetlb_page(vma) &&
+		!hugepage_migration_supported(hstate_vma(vma)))
+		return false;
+
+	/*
+	 * Migration allocates pages in the highest zone. If we cannot
+	 * do so then migration (at least from node to node) is not
+	 * possible.
+	 */
+	if (vma->vm_file &&
+		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+			< policy_zone)
+		return false;
+	return true;
+}
+
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				unsigned long addr)
 {
-	struct mempolicy *pol = NULL;
+	struct mempolicy *pol;
 
-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy) {
-			pol = vma->vm_ops->get_policy(vma, addr);
-		} else if (vma->vm_policy) {
-			pol = vma->vm_policy;
+	if (!vma)
+		return NULL;
 
-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
-		}
+	if (vma->vm_ops && vma->vm_ops->get_policy)
+		return vma->vm_ops->get_policy(vma, addr);
+
+	/*
+	 * This could be called without holding the mmap_sem in the
+	 * speculative page fault handler's path.
+	 */
+	pol = READ_ONCE(vma->vm_policy);
+	if (pol) {
+		/*
+		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+		 * count on these policies which will be dropped by
+		 * mpol_cond_put() later
+		 */
+		if (mpol_needs_cond_ref(pol))
+			mpol_get(pol);
 	}
 
 	return pol;
@@ -1785,7 +1881,7 @@
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
  */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1797,8 +1893,7 @@
 }
 
 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-		int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
 		nd = policy->v.preferred_node;
@@ -1986,7 +2081,6 @@
 		break;
 
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->v.nodes;
 		break;
@@ -2081,7 +2175,7 @@
  *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into user space. Returns
 * NULL when no page can be allocated.
@@ -2119,43 +2213,29 @@
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED &&
-				!(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = pol->v.preferred_node;
 
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
+			 * First, try to allocate THP only on local node, but
+			 * don't reclaim unnecessarily, just compact.
			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
+			page = __alloc_pages_node(hpage_node,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * synchronous compact or the vma has been madvised
+			 * to prefer hugepage backing, retry allowing remote
+			 * memory with both reclaim and compact as well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_nodemask(gfp, order,
						hpage_node, nmask);
+
			goto out;
		}
	}
@@ -2167,6 +2247,7 @@
 out:
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
  * alloc_pages_current - Allocate pages.
@@ -2266,7 +2347,6 @@
 
 	switch (a->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
@@ -2399,7 +2479,7 @@
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
-	int polnid = -1;
+	int polnid = NUMA_NO_NODE;
 	int ret = -1;
 
 	pol = get_vma_policy(vma, addr);
@@ -2573,6 +2653,7 @@
 		mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 		if (!mpol_new)
			goto err_out;
+		atomic_set(&mpol_new->refcnt, 1);
		goto restart;
	}
 
@@ -2805,12 +2886,11 @@
 int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
-	unsigned short mode;
 	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int err = 1;
+	int err = 1, mode;
 
 	if (flags)
 		*flags++ = '\0'; /* terminate mode string */
@@ -2825,12 +2905,8 @@
 	} else
 		nodes_clear(nodes);
 
-	for (mode = 0; mode < MPOL_MAX; mode++) {
-		if (!strcmp(str, policy_modes[mode])) {
-			break;
-		}
-	}
-	if (mode >= MPOL_MAX)
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
 		goto out;
 
 	switch (mode) {
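
For reference, the search loop introduced by numa_map_to_online_node() in the first hunk can be exercised outside the kernel. The sketch below is illustrative only and is not part of the patch: it uses a hypothetical 4-node distance matrix and online mask as stand-ins for the kernel's node_distance() and for_each_online_node(), so the same nearest-node fallback can be compiled and run in user space.

/*
 * Illustrative sketch only -- not kernel code.  Re-creates the nearest
 * online node search of numa_map_to_online_node() with mocked topology.
 */
#include <limits.h>
#include <stdio.h>

#define NR_NODES	4
#define NO_NODE		(-1)	/* stand-in for NUMA_NO_NODE */

/* Hypothetical SLIT-style distances; the diagonal is local access. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Pretend node 2 is offline. */
static const int online[NR_NODES] = { 1, 1, 0, 1 };

static int map_to_online_node(int node)
{
	int min_dist = INT_MAX, min_node = node, n;

	if (node == NO_NODE || online[node])
		return node;		/* nothing to remap */

	for (n = 0; n < NR_NODES; n++) {
		if (!online[n])
			continue;	/* for_each_online_node() skips these */
		if (dist[node][n] < min_dist) {
			min_dist = dist[node][n];
			min_node = n;
		}
	}
	return min_node;
}

int main(void)
{
	/* Node 2 is offline and closest to node 3 (distance 20). */
	printf("node 2 -> node %d\n", map_to_online_node(2));
	return 0;
}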