+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
@@ ... @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
@@ ... @@
---|
 };

 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @node is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (node == NUMA_NO_NODE || node_online(node))
+		return node;
+
+	min_node = node;
+	for_each_online_node(n) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);

 struct mempolicy *get_task_policy(struct task_struct *p)
 {
@@ ... @@
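The new numa_map_to_online_node() helper above simply keeps the online node with the smallest node_distance() from the requested one. As a rough illustration of that selection loop only, here is a self-contained userspace sketch; the distance table, online[] mask and MAX_NODES are made-up stand-ins for the kernel's SLIT data, not real kernel interfaces:

```c
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/* Hypothetical SLIT-style distance table; 10 = local, bigger = farther. */
static const int distance[MAX_NODES][MAX_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};
static const bool online[MAX_NODES] = { true, true, false, true };

/* Same idea as numa_map_to_online_node(): keep the closest online node. */
static int map_to_online_node(int node)
{
	int min_dist = INT_MAX, min_node = node;

	if (node < 0 || online[node])
		return node;

	for (int n = 0; n < MAX_NODES; n++) {
		if (!online[n])
			continue;
		if (distance[node][n] < min_dist) {
			min_dist = distance[node][n];
			min_node = n;
		}
	}
	return min_node;
}

int main(void)
{
	printf("node 2 maps to %d\n", map_to_online_node(2)); /* prints 3 */
	return 0;
}
```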
---|
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ ... @@
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ ... @@
---|
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
  */

 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;

-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	mmap_write_lock(mm);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		vm_write_begin(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
-	up_write(&mm->mmap_sem);
+		vm_write_end(vma);
+	}
+	mmap_write_unlock(mm);
 }

 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ ... @@
---|
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
 };

 /*
@@ ... @@
---|
  */
 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
+	__releases(ptl)
 {
 	int ret = 0;
 	struct page *page;
@@ ... @@
---|
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
+	int ret = 0;
 #ifdef CONFIG_HUGETLB_PAGE
 	struct queue_pages *qp = walk->private;
-	unsigned long flags = qp->flags;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ ... @@
---|
 	page = pte_page(entry);
 	if (!queue_pages_required(page, qp))
 		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced pages and no
+		 * need to check other VMAs further.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() would
+		 * have stopped walking the current vma.
+		 * Detect misplaced pages but allow migrating pages which
+		 * have been queued.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
-	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, qp->pagelist);
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+	     !hugetlb_pmd_shared(pte))) {
+		if (isolate_hugetlb(page, qp->pagelist) &&
+		    (flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate the page but allow migrating
+			 * pages which have been queued.
+			 */
+			ret = 1;
+	}
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-	return 0;
+	return ret;
 }

 #ifdef CONFIG_NUMA_BALANCING
@@ ... @@
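The hugetlb hook above boils down to a small decision table over the MPOL_MF_* flags: STRICT alone reports misplacement with -EIO, a non-migratable VMA is only noted (return 1), and MOVE/MOVE_ALL try to isolate the page, treating an isolation failure under STRICT as non-fatal. A hedged userspace sketch of just that decision logic follows; the flag values and the boolean page/vma state are simplifications, not the kernel's definitions:

```c
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Flag values are illustrative, not the kernel's uapi encodings. */
#define MPOL_MF_STRICT   0x1
#define MPOL_MF_MOVE     0x2
#define MPOL_MF_MOVE_ALL 0x4

/*
 * Decision logic mirroring the hugetlb hook, with the page/vma state reduced
 * to booleans. Returns -EIO, 1 ("misplaced but keep walking"), or 0.
 */
static int hugetlb_queue_decision(unsigned long flags, bool vma_migratable,
				  bool page_shared, bool isolated_ok)
{
	if (flags == MPOL_MF_STRICT)
		return -EIO;		/* report misplacement only */

	if (!vma_migratable)
		return 1;		/* note it, keep migrating queued pages */

	if ((flags & MPOL_MF_MOVE_ALL) ||
	    ((flags & MPOL_MF_MOVE) && !page_shared)) {
		if (!isolated_ok && (flags & MPOL_MF_STRICT))
			return 1;	/* isolation failed, still not fatal */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", hugetlb_queue_decision(MPOL_MF_STRICT, true, false, true));
	printf("%d\n", hugetlb_queue_decision(MPOL_MF_MOVE | MPOL_MF_STRICT,
					      false, false, true));
	return 0;
}
```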
---|
 {
 	int nr_updated;

-	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

@@ ... @@
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;

+	/* range check first */
+	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ ... @@
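The new qp->first/qp->start/qp->end bookkeeping in queue_pages_test_walk(), together with the !qp.first check added to queue_pages_range() further down, rejects ranges with holes unless MPOL_MF_DISCONTIG_OK is set: a gap before the first VMA, a gap between VMAs, a gap after the last VMA, or a range that hits no VMA at all. A small userspace sketch of the same coverage check over sorted [start, end) intervals, purely illustrative:

```c
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* A toy stand-in for a VMA list: sorted, non-overlapping [start, end) ranges. */
struct range { unsigned long start, end; };

/*
 * Same hole checks as queue_pages_test_walk() when MPOL_MF_DISCONTIG_OK is
 * not set: fail if the queried [qstart, qend) is not fully covered.
 */
static int check_range_covered(const struct range *vmas, size_t n,
			       unsigned long qstart, unsigned long qend)
{
	int first = 1;

	for (size_t i = 0; i < n; i++) {
		const struct range *vma = &vmas[i];
		const struct range *next = (i + 1 < n) ? &vmas[i + 1] : NULL;

		if (vma->end <= qstart || vma->start >= qend)
			continue;	/* outside the queried range */

		if (first) {
			first = 0;
			if (qstart < vma->start)
				return -EFAULT;	/* hole at head of range */
		}
		if (vma->end < qend &&
		    (!next || vma->end < next->start))
			return -EFAULT;	/* hole at middle or tail of range */
	}
	return first ? -EFAULT : 0;	/* whole range in a hole */
}

int main(void)
{
	struct range vmas[] = { { 0x1000, 0x3000 }, { 0x3000, 0x5000 },
				{ 0x6000, 0x8000 } };

	printf("%d\n", check_range_covered(vmas, 3, 0x1000, 0x5000)); /* 0 */
	printf("%d\n", check_range_covered(vmas, 3, 0x2000, 0x7000)); /* -EFAULT */
	return 0;
}
```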
---|

 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;

 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (!is_vm_hugetlb_page(vma) &&
-			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 			!(vma->vm_flags & VM_MIXEDMAP))
 			change_prot_numa(vma, start, endvma);
 		return 1;
@@ ... @@
 		return 0;
 	return 1;
 }
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry = queue_pages_hugetlb,
+	.pmd_entry = queue_pages_pte_range,
+	.test_walk = queue_pages_test_walk,
+};

 /*
  * Walk through page tables and collect pages to be migrated.
@@ ... @@
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
-	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
+		.start = start,
+		.end = end,
+		.first = NULL,
 	};

-	return walk_page_range(start, end, &queue_pages_walk);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
 }

 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
 				struct mempolicy *pol)
@@ ... @@
---|
 	if (IS_ERR(new))
 		return PTR_ERR(new);

+	vm_write_begin(vma);
 	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
 		if (err)
@@ ... @@
 	}

 	old = vma->vm_policy;
-	vma->vm_policy = new;	/* protected by mmap_sem */
+	/*
+	 * The speculative page fault handler accesses this field without
+	 * holding the mmap_lock.
+	 */
+	WRITE_ONCE(vma->vm_policy, new);
+	vm_write_end(vma);
 	mpol_put(old);

 	return 0;
 err_out:
+	vm_write_end(vma);
 	mpol_put(new);
 	return err;
 }
@@ ... @@
---|
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 			unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
@@ ... @@
 	unsigned long vmend;

 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);

 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;

-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
+	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 		vmstart = max(start, vma->vm_start);
 		vmend = min(end, vma->vm_end);

@@ ... @@
 				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
-			next = vma->vm_next;
-			if (mpol_equal(vma_policy(vma), new_pol))
-				continue;
-			/* vma_merge() joined vma && vma->next, case 8 */
 			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
@@ ... @@
 		goto out;
 	}

-	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
 		mpol_put(new);
 		goto out;
 	}
+	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && new->mode == MPOL_INTERLEAVE)
@@ ... @@

 	switch (p->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ ... @@
---|
 	}
 }

-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
+	struct page *p = NULL;
 	int err;

-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
-	if (err >= 0) {
+	int locked = 1;
+	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+	if (err > 0) {
 		err = page_to_nid(p);
 		put_page(p);
 	}
+	if (locked)
+		mmap_read_unlock(mm);
 	return err;
 }

@@ ... @@
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

 	if (flags &
 	    ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ ... @@
 	 * vma/shared policy at addr is NULL. We
 	 * want to return MPOL_DEFAULT in this case.
 	 */
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 	vma = find_vma_intersection(mm, addr, addr+1);
 	if (!vma) {
-		up_read(&mm->mmap_sem);
+		mmap_read_unlock(mm);
 		return -EFAULT;
 	}
 	if (vma->vm_ops && vma->vm_ops->get_policy)
@@ ... @@
---|

 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(addr);
+			/*
+			 * Take a refcount on the mpol, lookup_node()
+			 * will drop the mmap_lock, so after calling
+			 * lookup_node() only "pol" remains valid, "vma"
+			 * is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
 			*policy = err;
@@ ... @@
 out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }

@@ ... @@
---|
 	if (!isolate_lru_page(head)) {
 		list_add_tail(&head->lru, pagelist);
 		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
 	} else if (flags & MPOL_MF_STRICT) {
 		/*
 		 * Non-movable page may reach here. And, there may be
@@ ... @@
 	return 0;
 }

-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page))
-		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					node);
-	else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-						__GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ ... @@
---|
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};

 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ ... @@
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-					MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
@@ ... @@
 		     const nodemask_t *to, int flags)
 {
 	int busy = 0;
-	int err;
+	int err = 0;
 	nodemask_t tmp;

-	err = migrate_prep();
-	if (err)
-		return err;
+	lru_cache_disable();

-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);

 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ ... @@
 		if (err < 0)
 			break;
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ ... @@
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;

 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ ... @@
---|

 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
 			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
@@ ... @@
 			putback_movable_pages(&pagelist);
 	}

-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_enable();
 	return err;
 }

@@ ... @@
 	if (nodes_empty(*new))
 		goto out_put;

-	nodes_and(*new, *new, node_states[N_MEMORY]);
-	if (nodes_empty(*new))
-		goto out_put;
-
 	err = security_task_movememory(task);
 	if (err)
 		goto out_put;
@@ ... @@
 			unsigned long flags)
 {
 	int err;
-	int uninitialized_var(pval);
+	int pval;
 	nodemask_t nodes;
-
-	addr = untagged_addr(addr);

 	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
+
+	addr = untagged_addr(addr);

 	err = do_get_mempolicy(&pval, &nodes, addr, flags);

@@ ... @@
---|

 #endif /* CONFIG_COMPAT */

+bool vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		return false;
+
+	/*
+	 * DAX device mappings require predictable access latency, so avoid
+	 * incurring periodic faults.
+	 */
+	if (vma_is_dax(vma))
+		return false;
+
+	if (is_vm_hugetlb_page(vma) &&
+		!hugepage_migration_supported(hstate_vma(vma)))
+		return false;
+
+	/*
+	 * Migration allocates pages in the highest zone. If we cannot
+	 * do so then migration (at least from node to node) is not
+	 * possible.
+	 */
+	if (vma->vm_file &&
+		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+			< policy_zone)
+		return false;
+	return true;
+}
+
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
 {
-	struct mempolicy *pol = NULL;
+	struct mempolicy *pol;

-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy) {
-			pol = vma->vm_ops->get_policy(vma, addr);
-		} else if (vma->vm_policy) {
-			pol = vma->vm_policy;
+	if (!vma)
+		return NULL;

-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
-		}
+	if (vma->vm_ops && vma->vm_ops->get_policy)
+		return vma->vm_ops->get_policy(vma, addr);
+
+	/*
+	 * This could be called without holding the mmap_lock in the
+	 * speculative page fault handler's path.
+	 */
+	pol = READ_ONCE(vma->vm_policy);
+	if (pol) {
+		/*
+		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+		 * count on these policies which will be dropped by
+		 * mpol_cond_put() later
+		 */
+		if (mpol_needs_cond_ref(pol))
+			mpol_get(pol);
 	}

 	return pol;
@@ ... @@
---|
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
  */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ ... @@
 }

 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-	int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
 		nd = policy->v.preferred_node;
@@ ... @@
 		break;

 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->v.nodes;
 		break;
@@ ... @@
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
  * all allocations for pages that will be mapped into user space. Returns
  * NULL when no page can be allocated.
@@ ... @@
---|
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED &&
-				!(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = pol->v.preferred_node;

 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
 			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
+			 * First, try to allocate THP only on the local node,
+			 * but don't reclaim unnecessarily, just compact.
 			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
+			page = __alloc_pages_node(hpage_node,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * use synchronous compaction, or the vma has been
+			 * madvised to prefer hugepage backing, retry allowing
+			 * remote memory with both reclaim and compaction as
+			 * well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_nodemask(gfp, order,
+							hpage_node, nmask);
+
 			goto out;
 		}
 	}
@@ ... @@
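The rewritten THP branch above makes two attempts: first the preferred node only, with __GFP_THISNODE | __GFP_NORETRY so the allocator may compact but will not fall into aggressive reclaim, and only if that fails and the caller allowed direct reclaim does it retry across the full nodemask. A sketch of that try-local-then-fall-back shape, with the allocators stubbed out (try_alloc_on_node() and alloc_anywhere() are placeholders, not kernel APIs, and the flag bits are illustrative):

```c
#include <stdio.h>

/* Illustrative flag bits; not the kernel's gfp encodings. */
#define GFP_DIRECT_RECLAIM	0x1
#define GFP_THISNODE		0x2
#define GFP_NORETRY		0x4

/* Placeholder allocators standing in for __alloc_pages_node()/..._nodemask(). */
static void *try_alloc_on_node(int node, unsigned int gfp)
{
	(void)node; (void)gfp;
	return NULL;			/* pretend the local node is full */
}

static void *alloc_anywhere(unsigned int gfp)
{
	static char page[4096];

	(void)gfp;
	return page;			/* pretend a remote node has memory */
}

static void *alloc_thp(int preferred_node, unsigned int gfp)
{
	/* First attempt: local node only, compact but don't retry hard. */
	void *page = try_alloc_on_node(preferred_node,
				       gfp | GFP_THISNODE | GFP_NORETRY);

	/* Fall back to any allowed node only if the caller permits reclaim. */
	if (!page && (gfp & GFP_DIRECT_RECLAIM))
		page = alloc_anywhere(gfp);

	return page;
}

int main(void)
{
	printf("%p\n", alloc_thp(0, GFP_DIRECT_RECLAIM));
	printf("%p\n", alloc_thp(0, 0));	/* NULL: no fallback allowed */
	return 0;
}
```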
---|
 out:
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);

 /**
  * alloc_pages_current - Allocate pages.
@@ ... @@

 	switch (a->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
@@ ... @@
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
-	int polnid = -1;
+	int polnid = NUMA_NO_NODE;
 	int ret = -1;

 	pol = get_vma_policy(vma, addr);
@@ ... @@
 		mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 		if (!mpol_new)
 			goto err_out;
+		atomic_set(&mpol_new->refcnt, 1);
 		goto restart;
 	}

@@ ... @@
---|
 int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
-	unsigned short mode;
 	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int err = 1;
+	int err = 1, mode;

 	if (flags)
 		*flags++ = '\0';	/* terminate mode string */
@@ ... @@
 	} else
 		nodes_clear(nodes);

-	for (mode = 0; mode < MPOL_MAX; mode++) {
-		if (!strcmp(str, policy_modes[mode])) {
-			break;
-		}
-	}
-	if (mode >= MPOL_MAX)
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
 		goto out;

 	switch (mode) {
---|