| .. | .. | 
|---|
 | 1 | +// SPDX-License-Identifier: GPL-2.0-only  | 
|---|
| 1 | 2 |  /* | 
|---|
| 2 | 3 |   * Simple NUMA memory policy for the Linux kernel. | 
|---|
| 3 | 4 |   * | 
|---|
| 4 | 5 |   * Copyright 2003,2004 Andi Kleen, SuSE Labs. | 
|---|
| 5 | 6 |   * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | 
|---|
| 6 |  | - * Subject to the GNU Public License, version 2.  | 
|---|
| 7 | 7 |   * | 
|---|
| 8 | 8 |   * NUMA policy allows the user to give hints in which node(s) memory should | 
|---|
| 9 | 9 |   * be allocated. | 
|---|
| .. | .. | 
|---|
| 68 | 68 |  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 
|---|
| 69 | 69 |   | 
|---|
| 70 | 70 |  #include <linux/mempolicy.h> | 
|---|
| 71 |  | -#include <linux/mm.h>  | 
|---|
 | 71 | +#include <linux/pagewalk.h>  | 
|---|
| 72 | 72 |  #include <linux/highmem.h> | 
|---|
| 73 | 73 |  #include <linux/hugetlb.h> | 
|---|
| 74 | 74 |  #include <linux/kernel.h> | 
|---|
| .. | .. | 
|---|
| 126 | 126 |  }; | 
|---|
| 127 | 127 |   | 
|---|
| 128 | 128 |  static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | 
|---|
 | 129 | +  | 
|---|
 | 130 | +/**  | 
|---|
 | 131 | + * numa_map_to_online_node - Find closest online node  | 
|---|
 | 132 | + * @node: Node id to start the search  | 
|---|
 | 133 | + *  | 
|---|
 | 134 | + * Look up the next closest node by distance if @node is not online.  | 
|---|
 | 135 | + */  | 
|---|
 | 136 | +int numa_map_to_online_node(int node)  | 
|---|
 | 137 | +{  | 
|---|
 | 138 | +	int min_dist = INT_MAX, dist, n, min_node;  | 
|---|
 | 139 | +  | 
|---|
 | 140 | +	if (node == NUMA_NO_NODE || node_online(node))  | 
|---|
 | 141 | +		return node;  | 
|---|
 | 142 | +  | 
|---|
 | 143 | +	min_node = node;  | 
|---|
 | 144 | +	for_each_online_node(n) {  | 
|---|
 | 145 | +		dist = node_distance(node, n);  | 
|---|
 | 146 | +		if (dist < min_dist) {  | 
|---|
 | 147 | +			min_dist = dist;  | 
|---|
 | 148 | +			min_node = n;  | 
|---|
 | 149 | +		}  | 
|---|
 | 150 | +	}  | 
|---|
 | 151 | +  | 
|---|
 | 152 | +	return min_node;  | 
|---|
 | 153 | +}  | 
|---|
 | 154 | +EXPORT_SYMBOL_GPL(numa_map_to_online_node);  | 
|---|
| 129 | 155 |   | 
|---|
| 130 | 156 |  struct mempolicy *get_task_policy(struct task_struct *p) | 
|---|
| 131 | 157 |  { | 
|---|
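The new numa_map_to_online_node() above picks the nearest online node by minimising node_distance(). For intuition, here is a small user-space sketch of the same selection loop against a hypothetical 4-node topology; NR_NODES, the distance matrix and the node_online[] array are invented for the example and are not kernel or libnuma interfaces.

```c
/* Illustrative only: a user-space model of the nearest-by-distance pick. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical SLIT-style distance table (smaller = closer). */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 40, 30 },
	{ 30, 40, 10, 20 },
	{ 40, 30, 20, 10 },
};

/* Pretend node 1 is offline. */
static const bool node_online[NR_NODES] = { true, false, true, true };

static int map_to_online_node(int node)
{
	int min_dist = INT_MAX, min_node = node;

	if (node < 0 || node_online[node])
		return node;

	for (int n = 0; n < NR_NODES; n++) {
		if (!node_online[n])
			continue;
		int dist = node_distance[node][n];
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}
	return min_node;
}

int main(void)
{
	printf("node 1 maps to node %d\n", map_to_online_node(1)); /* -> 0 */
	return 0;
}
```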
| .. | .. | 
|---|
| 198 | 224 |   * handle an empty nodemask with MPOL_PREFERRED here. | 
|---|
| 199 | 225 |   * | 
|---|
| 200 | 226 |   * Must be called holding task's alloc_lock to protect task's mems_allowed | 
|---|
| 201 |  | - * and mempolicy.  May also be called holding the mmap_semaphore for write.  | 
|---|
 | 227 | + * and mempolicy.  May also be called holding the mmap_lock for write.  | 
|---|
| 202 | 228 |   */ | 
|---|
| 203 | 229 |  static int mpol_set_nodemask(struct mempolicy *pol, | 
|---|
| 204 | 230 |  		     const nodemask_t *nodes, struct nodemask_scratch *nsc) | 
|---|
| .. | .. | 
|---|
| 342 | 368 |  /* | 
|---|
| 343 | 369 |   * mpol_rebind_policy - Migrate a policy to a different set of nodes | 
|---|
| 344 | 370 |   * | 
|---|
| 345 |  | - * Per-vma policies are protected by mmap_sem. Allocations using per-task  | 
|---|
 | 371 | + * Per-vma policies are protected by mmap_lock. Allocations using per-task  | 
|---|
| 346 | 372 |   * policies are protected by task->mems_allowed_seq to prevent a premature | 
|---|
| 347 | 373 |   * OOM/allocation failure due to parallel nodemask modification. | 
|---|
| 348 | 374 |   */ | 
|---|
| 349 | 375 |  static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | 
|---|
| 350 | 376 |  { | 
|---|
| 351 |  | -	if (!pol)  | 
|---|
 | 377 | +	if (!pol || pol->mode == MPOL_LOCAL)  | 
|---|
| 352 | 378 |  		return; | 
|---|
| 353 | 379 |  	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && | 
|---|
| 354 | 380 |  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 
|---|
| .. | .. | 
|---|
| 372 | 398 |  /* | 
|---|
| 373 | 399 |   * Rebind each vma in mm to new nodemask. | 
|---|
| 374 | 400 |   * | 
|---|
| 375 |  | - * Call holding a reference to mm.  Takes mm->mmap_sem during call.  | 
|---|
 | 401 | + * Call holding a reference to mm.  Takes mm->mmap_lock during call.  | 
|---|
| 376 | 402 |   */ | 
|---|
| 377 | 403 |   | 
|---|
| 378 | 404 |  void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | 
|---|
| 379 | 405 |  { | 
|---|
| 380 | 406 |  	struct vm_area_struct *vma; | 
|---|
| 381 | 407 |   | 
|---|
| 382 |  | -	down_write(&mm->mmap_sem);  | 
|---|
| 383 |  | -	for (vma = mm->mmap; vma; vma = vma->vm_next)  | 
|---|
 | 408 | +	mmap_write_lock(mm);  | 
|---|
 | 409 | +	for (vma = mm->mmap; vma; vma = vma->vm_next) {  | 
|---|
 | 410 | +		vm_write_begin(vma);  | 
|---|
| 384 | 411 |  		mpol_rebind_policy(vma->vm_policy, new); | 
|---|
| 385 |  | -	up_write(&mm->mmap_sem);  | 
|---|
 | 412 | +		vm_write_end(vma);  | 
|---|
 | 413 | +	}  | 
|---|
 | 414 | +	mmap_write_unlock(mm);  | 
|---|
| 386 | 415 |  } | 
|---|
| 387 | 416 |   | 
|---|
| 388 | 417 |  static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | 
|---|
| .. | .. | 
|---|
| 410 | 439 |  	struct list_head *pagelist; | 
|---|
| 411 | 440 |  	unsigned long flags; | 
|---|
| 412 | 441 |  	nodemask_t *nmask; | 
|---|
| 413 |  | -	struct vm_area_struct *prev;  | 
|---|
 | 442 | +	unsigned long start;  | 
|---|
 | 443 | +	unsigned long end;  | 
|---|
 | 444 | +	struct vm_area_struct *first;  | 
|---|
| 414 | 445 |  }; | 
|---|
| 415 | 446 |   | 
|---|
| 416 | 447 |  /* | 
|---|
| .. | .. | 
|---|
| 440 | 471 |   */ | 
|---|
| 441 | 472 |  static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, | 
|---|
| 442 | 473 |  				unsigned long end, struct mm_walk *walk) | 
|---|
 | 474 | +	__releases(ptl)  | 
|---|
| 443 | 475 |  { | 
|---|
| 444 | 476 |  	int ret = 0; | 
|---|
| 445 | 477 |  	struct page *page; | 
|---|
| .. | .. | 
|---|
| 555 | 587 |  			       unsigned long addr, unsigned long end, | 
|---|
| 556 | 588 |  			       struct mm_walk *walk) | 
|---|
| 557 | 589 |  { | 
|---|
 | 590 | +	int ret = 0;  | 
|---|
| 558 | 591 |  #ifdef CONFIG_HUGETLB_PAGE | 
|---|
| 559 | 592 |  	struct queue_pages *qp = walk->private; | 
|---|
| 560 |  | -	unsigned long flags = qp->flags;  | 
|---|
 | 593 | +	unsigned long flags = (qp->flags & MPOL_MF_VALID);  | 
|---|
| 561 | 594 |  	struct page *page; | 
|---|
| 562 | 595 |  	spinlock_t *ptl; | 
|---|
| 563 | 596 |  	pte_t entry; | 
|---|
| .. | .. | 
|---|
| 569 | 602 |  	page = pte_page(entry); | 
|---|
| 570 | 603 |  	if (!queue_pages_required(page, qp)) | 
|---|
| 571 | 604 |  		goto unlock; | 
|---|
 | 605 | +  | 
|---|
 | 606 | +	if (flags == MPOL_MF_STRICT) {  | 
|---|
 | 607 | +		/*  | 
|---|
 | 608 | +		 * STRICT alone means only detecting the misplaced page;  | 
|---|
 | 609 | +		 * no need to check any further VMAs.  | 
|---|
 | 610 | +		 */  | 
|---|
 | 611 | +		ret = -EIO;  | 
|---|
 | 612 | +		goto unlock;  | 
|---|
 | 613 | +	}  | 
|---|
 | 614 | +  | 
|---|
 | 615 | +	if (!vma_migratable(walk->vma)) {  | 
|---|
 | 616 | +		/*  | 
|---|
 | 617 | +		 * Must be STRICT with MOVE*, otherwise .test_walk() would  | 
|---|
 | 618 | +		 * have stopped walking the current vma.  | 
|---|
 | 619 | +		 * Detect the misplaced page, but allow migrating pages  | 
|---|
 | 620 | +		 * which have already been queued.  | 
|---|
 | 621 | +		 */  | 
|---|
 | 622 | +		ret = 1;  | 
|---|
 | 623 | +		goto unlock;  | 
|---|
 | 624 | +	}  | 
|---|
 | 625 | +  | 
|---|
| 572 | 626 |  	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 
|---|
| 573 | 627 |  	if (flags & (MPOL_MF_MOVE_ALL) || | 
|---|
| 574 |  | -	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))  | 
|---|
| 575 |  | -		isolate_huge_page(page, qp->pagelist);  | 
|---|
 | 628 | +	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&  | 
|---|
 | 629 | +	     !hugetlb_pmd_shared(pte))) {  | 
|---|
 | 630 | +		if (isolate_hugetlb(page, qp->pagelist) &&  | 
|---|
 | 631 | +			(flags & MPOL_MF_STRICT))  | 
|---|
 | 632 | +			/*  | 
|---|
 | 633 | +			 * Failed to isolate the page, but allow migrating  | 
|---|
 | 634 | +			 * pages which have already been queued.  | 
|---|
 | 635 | +			 */  | 
|---|
 | 636 | +			ret = 1;  | 
|---|
 | 637 | +	}  | 
|---|
| 576 | 638 |  unlock: | 
|---|
| 577 | 639 |  	spin_unlock(ptl); | 
|---|
| 578 | 640 |  #else | 
|---|
| 579 | 641 |  	BUG(); | 
|---|
| 580 | 642 |  #endif | 
|---|
| 581 |  | -	return 0;  | 
|---|
 | 643 | +	return ret;  | 
|---|
| 582 | 644 |  } | 
|---|
| 583 | 645 |   | 
|---|
| 584 | 646 |  #ifdef CONFIG_NUMA_BALANCING | 
|---|
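The queue_pages_hugetlb()/queue_pages_pmd() changes above adjust when the page-table walk fails outright with -EIO (MPOL_MF_STRICT alone) versus continuing and migrating whatever was already queued (STRICT combined with MOVE on a VMA that cannot be migrated). From user space this policy is exercised through mbind(2); below is a minimal sketch, assuming libnuma's <numaif.h>, an online node 0, and linking with -lnuma — the buffer size and node number are arbitrary for the example.

```c
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0, len);		/* fault the pages in somewhere first */

	/* Bind the existing range to node 0 and move misplaced pages;
	 * with MPOL_MF_STRICT the kernel reports EIO if some page could
	 * not be moved to the requested node. */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_STRICT | MPOL_MF_MOVE) != 0)
		perror("mbind");
	else
		printf("range is now bound to node 0\n");
	return 0;
}
```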
| .. | .. | 
|---|
| 596 | 658 |  { | 
|---|
| 597 | 659 |  	int nr_updated; | 
|---|
| 598 | 660 |   | 
|---|
| 599 |  | -	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);  | 
|---|
 | 661 | +	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);  | 
|---|
| 600 | 662 |  	if (nr_updated) | 
|---|
| 601 | 663 |  		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | 
|---|
| 602 | 664 |   | 
|---|
| .. | .. | 
|---|
| 618 | 680 |  	unsigned long endvma = vma->vm_end; | 
|---|
| 619 | 681 |  	unsigned long flags = qp->flags; | 
|---|
| 620 | 682 |   | 
|---|
 | 683 | +	/* range check first */  | 
|---|
 | 684 | +	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);  | 
|---|
 | 685 | +  | 
|---|
 | 686 | +	if (!qp->first) {  | 
|---|
 | 687 | +		qp->first = vma;  | 
|---|
 | 688 | +		if (!(flags & MPOL_MF_DISCONTIG_OK) &&  | 
|---|
 | 689 | +			(qp->start < vma->vm_start))  | 
|---|
 | 690 | +			/* hole at head side of range */  | 
|---|
 | 691 | +			return -EFAULT;  | 
|---|
 | 692 | +	}  | 
|---|
 | 693 | +	if (!(flags & MPOL_MF_DISCONTIG_OK) &&  | 
|---|
 | 694 | +		((vma->vm_end < qp->end) &&  | 
|---|
 | 695 | +		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))  | 
|---|
 | 696 | +		/* hole at middle or tail of range */  | 
|---|
 | 697 | +		return -EFAULT;  | 
|---|
 | 698 | +  | 
|---|
| 621 | 699 |  	/* | 
|---|
| 622 | 700 |  	 * Need check MPOL_MF_STRICT to return -EIO if possible | 
|---|
| 623 | 701 |  	 * regardless of vma_migratable | 
|---|
| .. | .. | 
|---|
| 628 | 706 |   | 
|---|
| 629 | 707 |  	if (endvma > end) | 
|---|
| 630 | 708 |  		endvma = end; | 
|---|
| 631 |  | -	if (vma->vm_start > start)  | 
|---|
| 632 |  | -		start = vma->vm_start;  | 
|---|
| 633 |  | -  | 
|---|
| 634 |  | -	if (!(flags & MPOL_MF_DISCONTIG_OK)) {  | 
|---|
| 635 |  | -		if (!vma->vm_next && vma->vm_end < end)  | 
|---|
| 636 |  | -			return -EFAULT;  | 
|---|
| 637 |  | -		if (qp->prev && qp->prev->vm_end < vma->vm_start)  | 
|---|
| 638 |  | -			return -EFAULT;  | 
|---|
| 639 |  | -	}  | 
|---|
| 640 |  | -  | 
|---|
| 641 |  | -	qp->prev = vma;  | 
|---|
| 642 | 709 |   | 
|---|
| 643 | 710 |  	if (flags & MPOL_MF_LAZY) { | 
|---|
| 644 | 711 |  		/* Similar to task_numa_work, skip inaccessible VMAs */ | 
|---|
| 645 |  | -		if (!is_vm_hugetlb_page(vma) &&  | 
|---|
| 646 |  | -			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&  | 
|---|
 | 712 | +		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&  | 
|---|
| 647 | 713 |  			!(vma->vm_flags & VM_MIXEDMAP)) | 
|---|
| 648 | 714 |  			change_prot_numa(vma, start, endvma); | 
|---|
| 649 | 715 |  		return 1; | 
|---|
| .. | .. | 
|---|
| 654 | 720 |  		return 0; | 
|---|
| 655 | 721 |  	return 1; | 
|---|
| 656 | 722 |  } | 
|---|
 | 723 | +  | 
|---|
 | 724 | +static const struct mm_walk_ops queue_pages_walk_ops = {  | 
|---|
 | 725 | +	.hugetlb_entry		= queue_pages_hugetlb,  | 
|---|
 | 726 | +	.pmd_entry		= queue_pages_pte_range,  | 
|---|
 | 727 | +	.test_walk		= queue_pages_test_walk,  | 
|---|
 | 728 | +};  | 
|---|
| 657 | 729 |   | 
|---|
| 658 | 730 |  /* | 
|---|
| 659 | 731 |   * Walk through page tables and collect pages to be migrated. | 
|---|
| .. | .. | 
|---|
| 675 | 747 |  		nodemask_t *nodes, unsigned long flags, | 
|---|
| 676 | 748 |  		struct list_head *pagelist) | 
|---|
| 677 | 749 |  { | 
|---|
 | 750 | +	int err;  | 
|---|
| 678 | 751 |  	struct queue_pages qp = { | 
|---|
| 679 | 752 |  		.pagelist = pagelist, | 
|---|
| 680 | 753 |  		.flags = flags, | 
|---|
| 681 | 754 |  		.nmask = nodes, | 
|---|
| 682 |  | -		.prev = NULL,  | 
|---|
| 683 |  | -	};  | 
|---|
| 684 |  | -	struct mm_walk queue_pages_walk = {  | 
|---|
| 685 |  | -		.hugetlb_entry = queue_pages_hugetlb,  | 
|---|
| 686 |  | -		.pmd_entry = queue_pages_pte_range,  | 
|---|
| 687 |  | -		.test_walk = queue_pages_test_walk,  | 
|---|
| 688 |  | -		.mm = mm,  | 
|---|
| 689 |  | -		.private = &qp,  | 
|---|
 | 755 | +		.start = start,  | 
|---|
 | 756 | +		.end = end,  | 
|---|
 | 757 | +		.first = NULL,  | 
|---|
| 690 | 758 |  	}; | 
|---|
| 691 | 759 |   | 
|---|
| 692 |  | -	return walk_page_range(start, end, &queue_pages_walk);  | 
|---|
 | 760 | +	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);  | 
|---|
 | 761 | +  | 
|---|
 | 762 | +	if (!qp.first)  | 
|---|
 | 763 | +		/* whole range in hole */  | 
|---|
 | 764 | +		err = -EFAULT;  | 
|---|
 | 765 | +  | 
|---|
 | 766 | +	return err;  | 
|---|
| 693 | 767 |  } | 
|---|
| 694 | 768 |   | 
|---|
| 695 | 769 |  /* | 
|---|
| 696 | 770 |   * Apply policy to a single VMA | 
|---|
| 697 |  | - * This must be called with the mmap_sem held for writing.  | 
|---|
 | 771 | + * This must be called with the mmap_lock held for writing.  | 
|---|
| 698 | 772 |   */ | 
|---|
| 699 | 773 |  static int vma_replace_policy(struct vm_area_struct *vma, | 
|---|
| 700 | 774 |  						struct mempolicy *pol) | 
|---|
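The reworked queue_pages_test_walk()/queue_pages_range() above report -EFAULT when the requested range begins, ends, or passes through an unmapped hole (unless MPOL_MF_DISCONTIG_OK is set), including the new "whole range in hole" case. A minimal user-space sketch of how that surfaces through mbind(2), assuming libnuma's <numaif.h>, an online node 0, and linking with -lnuma:

```c
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	unsigned long nodemask = 1UL << 0;
	char *buf = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	munmap(buf + page, page);	/* punch a hole in the middle */

	/* No MPOL_MF_DISCONTIG_OK, so the hole makes the call fail. */
	if (mbind(buf, 3 * page, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, 0) != 0)
		perror("mbind");	/* expected: EFAULT (hole in range) */
	return 0;
}
```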
| .. | .. | 
|---|
| 712 | 786 |  	if (IS_ERR(new)) | 
|---|
| 713 | 787 |  		return PTR_ERR(new); | 
|---|
| 714 | 788 |   | 
|---|
 | 789 | +	vm_write_begin(vma);  | 
|---|
| 715 | 790 |  	if (vma->vm_ops && vma->vm_ops->set_policy) { | 
|---|
| 716 | 791 |  		err = vma->vm_ops->set_policy(vma, new); | 
|---|
| 717 | 792 |  		if (err) | 
|---|
| .. | .. | 
|---|
| 719 | 794 |  	} | 
|---|
| 720 | 795 |   | 
|---|
| 721 | 796 |  	old = vma->vm_policy; | 
|---|
| 722 |  | -	vma->vm_policy = new; /* protected by mmap_sem */  | 
|---|
 | 797 | +	/*  | 
|---|
 | 798 | +	 * The speculative page fault handler accesses this field without  | 
|---|
 | 799 | +	 * holding the mmap_lock.  | 
|---|
 | 800 | +	 */  | 
|---|
 | 801 | +	WRITE_ONCE(vma->vm_policy, new);  | 
|---|
 | 802 | +	vm_write_end(vma);  | 
|---|
| 723 | 803 |  	mpol_put(old); | 
|---|
| 724 | 804 |   | 
|---|
| 725 | 805 |  	return 0; | 
|---|
| 726 | 806 |   err_out: | 
|---|
 | 807 | +	vm_write_end(vma);  | 
|---|
| 727 | 808 |  	mpol_put(new); | 
|---|
| 728 | 809 |  	return err; | 
|---|
| 729 | 810 |  } | 
|---|
| .. | .. | 
|---|
| 732 | 813 |  static int mbind_range(struct mm_struct *mm, unsigned long start, | 
|---|
| 733 | 814 |  		       unsigned long end, struct mempolicy *new_pol) | 
|---|
| 734 | 815 |  { | 
|---|
| 735 |  | -	struct vm_area_struct *next;  | 
|---|
| 736 | 816 |  	struct vm_area_struct *prev; | 
|---|
| 737 | 817 |  	struct vm_area_struct *vma; | 
|---|
| 738 | 818 |  	int err = 0; | 
|---|
| .. | .. | 
|---|
| 741 | 821 |  	unsigned long vmend; | 
|---|
| 742 | 822 |   | 
|---|
| 743 | 823 |  	vma = find_vma(mm, start); | 
|---|
| 744 |  | -	if (!vma || vma->vm_start > start)  | 
|---|
| 745 |  | -		return -EFAULT;  | 
|---|
 | 824 | +	VM_BUG_ON(!vma);  | 
|---|
| 746 | 825 |   | 
|---|
| 747 | 826 |  	prev = vma->vm_prev; | 
|---|
| 748 | 827 |  	if (start > vma->vm_start) | 
|---|
| 749 | 828 |  		prev = vma; | 
|---|
| 750 | 829 |   | 
|---|
| 751 |  | -	for (; vma && vma->vm_start < end; prev = vma, vma = next) {  | 
|---|
| 752 |  | -		next = vma->vm_next;  | 
|---|
 | 830 | +	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {  | 
|---|
| 753 | 831 |  		vmstart = max(start, vma->vm_start); | 
|---|
| 754 | 832 |  		vmend   = min(end, vma->vm_end); | 
|---|
| 755 | 833 |   | 
|---|
| .. | .. | 
|---|
| 764 | 842 |  				 vma_get_anon_name(vma)); | 
|---|
| 765 | 843 |  		if (prev) { | 
|---|
| 766 | 844 |  			vma = prev; | 
|---|
| 767 |  | -			next = vma->vm_next;  | 
|---|
| 768 |  | -			if (mpol_equal(vma_policy(vma), new_pol))  | 
|---|
| 769 |  | -				continue;  | 
|---|
| 770 |  | -			/* vma_merge() joined vma && vma->next, case 8 */  | 
|---|
| 771 | 845 |  			goto replace; | 
|---|
| 772 | 846 |  		} | 
|---|
| 773 | 847 |  		if (vma->vm_start != vmstart) { | 
|---|
| .. | .. | 
|---|
| 807 | 881 |  		goto out; | 
|---|
| 808 | 882 |  	} | 
|---|
| 809 | 883 |   | 
|---|
| 810 |  | -	task_lock(current);  | 
|---|
| 811 | 884 |  	ret = mpol_set_nodemask(new, nodes, scratch); | 
|---|
| 812 | 885 |  	if (ret) { | 
|---|
| 813 |  | -		task_unlock(current);  | 
|---|
| 814 | 886 |  		mpol_put(new); | 
|---|
| 815 | 887 |  		goto out; | 
|---|
| 816 | 888 |  	} | 
|---|
 | 889 | +	task_lock(current);  | 
|---|
| 817 | 890 |  	old = current->mempolicy; | 
|---|
| 818 | 891 |  	current->mempolicy = new; | 
|---|
| 819 | 892 |  	if (new && new->mode == MPOL_INTERLEAVE) | 
|---|
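do_set_mempolicy() above now calls mpol_set_nodemask() before taking task_lock(), but the user-visible entry point is unchanged: set_mempolicy(2). A minimal sketch, assuming nodes 0 and 1 exist, libnuma's <numaif.h>, and linking with -lnuma:

```c
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* Interleave this task's future allocations across nodes 0-1. */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask,
			  sizeof(nodemask) * 8) != 0) {
		perror("set_mempolicy");
		return 1;
	}
	/* Memory faulted in from here on goes through the
	 * do_set_mempolicy()/interleave path patched above. */
	return 0;
}
```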
| .. | .. | 
|---|
| 839 | 912 |   | 
|---|
| 840 | 913 |  	switch (p->mode) { | 
|---|
| 841 | 914 |  	case MPOL_BIND: | 
|---|
| 842 |  | -		/* Fall through */  | 
|---|
| 843 | 915 |  	case MPOL_INTERLEAVE: | 
|---|
| 844 | 916 |  		*nodes = p->v.nodes; | 
|---|
| 845 | 917 |  		break; | 
|---|
| .. | .. | 
|---|
| 853 | 925 |  	} | 
|---|
| 854 | 926 |  } | 
|---|
| 855 | 927 |   | 
|---|
| 856 |  | -static int lookup_node(unsigned long addr)  | 
|---|
 | 928 | +static int lookup_node(struct mm_struct *mm, unsigned long addr)  | 
|---|
| 857 | 929 |  { | 
|---|
| 858 |  | -	struct page *p;  | 
|---|
 | 930 | +	struct page *p = NULL;  | 
|---|
| 859 | 931 |  	int err; | 
|---|
| 860 | 932 |   | 
|---|
| 861 |  | -	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);  | 
|---|
| 862 |  | -	if (err >= 0) {  | 
|---|
 | 933 | +	int locked = 1;  | 
|---|
 | 934 | +	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);  | 
|---|
 | 935 | +	if (err > 0) {  | 
|---|
| 863 | 936 |  		err = page_to_nid(p); | 
|---|
| 864 | 937 |  		put_page(p); | 
|---|
| 865 | 938 |  	} | 
|---|
 | 939 | +	if (locked)  | 
|---|
 | 940 | +		mmap_read_unlock(mm);  | 
|---|
| 866 | 941 |  	return err; | 
|---|
| 867 | 942 |  } | 
|---|
| 868 | 943 |   | 
|---|
| .. | .. | 
|---|
| 873 | 948 |  	int err; | 
|---|
| 874 | 949 |  	struct mm_struct *mm = current->mm; | 
|---|
| 875 | 950 |  	struct vm_area_struct *vma = NULL; | 
|---|
| 876 |  | -	struct mempolicy *pol = current->mempolicy;  | 
|---|
 | 951 | +	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;  | 
|---|
| 877 | 952 |   | 
|---|
| 878 | 953 |  	if (flags & | 
|---|
| 879 | 954 |  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 
|---|
| .. | .. | 
|---|
| 895 | 970 |  		 * vma/shared policy at addr is NULL.  We | 
|---|
| 896 | 971 |  		 * want to return MPOL_DEFAULT in this case. | 
|---|
| 897 | 972 |  		 */ | 
|---|
| 898 |  | -		down_read(&mm->mmap_sem);  | 
|---|
 | 973 | +		mmap_read_lock(mm);  | 
|---|
| 899 | 974 |  		vma = find_vma_intersection(mm, addr, addr+1); | 
|---|
| 900 | 975 |  		if (!vma) { | 
|---|
| 901 |  | -			up_read(&mm->mmap_sem);  | 
|---|
 | 976 | +			mmap_read_unlock(mm);  | 
|---|
| 902 | 977 |  			return -EFAULT; | 
|---|
| 903 | 978 |  		} | 
|---|
| 904 | 979 |  		if (vma->vm_ops && vma->vm_ops->get_policy) | 
|---|
| .. | .. | 
|---|
| 913 | 988 |   | 
|---|
| 914 | 989 |  	if (flags & MPOL_F_NODE) { | 
|---|
| 915 | 990 |  		if (flags & MPOL_F_ADDR) { | 
|---|
| 916 |  | -			err = lookup_node(addr);  | 
|---|
 | 991 | +			/*  | 
|---|
 | 992 | +			 * Take a refcount on the mpol, lookup_node()  | 
|---|
 | 993 | +			 * will drop the mmap_lock, so after calling  | 
|---|
 | 994 | +			 * lookup_node() only "pol" remains valid, "vma"  | 
|---|
 | 995 | +			 * is stale.  | 
|---|
 | 996 | +			 */  | 
|---|
 | 997 | +			pol_refcount = pol;  | 
|---|
 | 998 | +			vma = NULL;  | 
|---|
 | 999 | +			mpol_get(pol);  | 
|---|
 | 1000 | +			err = lookup_node(mm, addr);  | 
|---|
| 917 | 1001 |  			if (err < 0) | 
|---|
| 918 | 1002 |  				goto out; | 
|---|
| 919 | 1003 |  			*policy = err; | 
|---|
| .. | .. | 
|---|
| 948 | 1032 |   out: | 
|---|
| 949 | 1033 |  	mpol_cond_put(pol); | 
|---|
| 950 | 1034 |  	if (vma) | 
|---|
| 951 |  | -		up_read(¤t->mm->mmap_sem);  | 
|---|
 | 1035 | +		mmap_read_unlock(mm);  | 
|---|
 | 1036 | +	if (pol_refcount)  | 
|---|
 | 1037 | +		mpol_put(pol_refcount);  | 
|---|
| 952 | 1038 |  	return err; | 
|---|
| 953 | 1039 |  } | 
|---|
| 954 | 1040 |   | 
|---|
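The lookup_node()/do_get_mempolicy() rework above changes how the mmap_lock and the mempolicy refcount are handled while resolving which node backs an address; the matching user-space query is get_mempolicy(2) with MPOL_F_NODE | MPOL_F_ADDR. A minimal sketch, assuming libnuma's <numaif.h> and linking with -lnuma:

```c
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int node = -1;
	char *p = malloc(4096);

	if (!p)
		return 1;
	p[0] = 1;			/* fault the page in */

	/* With MPOL_F_NODE | MPOL_F_ADDR the kernel returns the id of
	 * the node currently backing the page at this address. */
	if (get_mempolicy(&node, NULL, 0, p,
			  MPOL_F_NODE | MPOL_F_ADDR) != 0) {
		perror("get_mempolicy");
		free(p);
		return 1;
	}
	printf("page at %p is on node %d\n", (void *)p, node);
	free(p);
	return 0;
}
```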
| .. | .. | 
|---|
| 967 | 1053 |  		if (!isolate_lru_page(head)) { | 
|---|
| 968 | 1054 |  			list_add_tail(&head->lru, pagelist); | 
|---|
| 969 | 1055 |  			mod_node_page_state(page_pgdat(head), | 
|---|
| 970 |  | -				NR_ISOLATED_ANON + page_is_file_cache(head),  | 
|---|
| 971 |  | -				hpage_nr_pages(head));  | 
|---|
 | 1056 | +				NR_ISOLATED_ANON + page_is_file_lru(head),  | 
|---|
 | 1057 | +				thp_nr_pages(head));  | 
|---|
| 972 | 1058 |  		} else if (flags & MPOL_MF_STRICT) { | 
|---|
| 973 | 1059 |  			/* | 
|---|
| 974 | 1060 |  			 * Non-movable page may reach here.  And, there may be | 
|---|
| .. | .. | 
|---|
| 984 | 1070 |  	return 0; | 
|---|
| 985 | 1071 |  } | 
|---|
| 986 | 1072 |   | 
|---|
| 987 |  | -/* page allocation callback for NUMA node migration */  | 
|---|
| 988 |  | -struct page *alloc_new_node_page(struct page *page, unsigned long node)  | 
|---|
| 989 |  | -{  | 
|---|
| 990 |  | -	if (PageHuge(page))  | 
|---|
| 991 |  | -		return alloc_huge_page_node(page_hstate(compound_head(page)),  | 
|---|
| 992 |  | -					node);  | 
|---|
| 993 |  | -	else if (PageTransHuge(page)) {  | 
|---|
| 994 |  | -		struct page *thp;  | 
|---|
| 995 |  | -  | 
|---|
| 996 |  | -		thp = alloc_pages_node(node,  | 
|---|
| 997 |  | -			(GFP_TRANSHUGE | __GFP_THISNODE),  | 
|---|
| 998 |  | -			HPAGE_PMD_ORDER);  | 
|---|
| 999 |  | -		if (!thp)  | 
|---|
| 1000 |  | -			return NULL;  | 
|---|
| 1001 |  | -		prep_transhuge_page(thp);  | 
|---|
| 1002 |  | -		return thp;  | 
|---|
| 1003 |  | -	} else  | 
|---|
| 1004 |  | -		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |  | 
|---|
| 1005 |  | -						    __GFP_THISNODE, 0);  | 
|---|
| 1006 |  | -}  | 
|---|
| 1007 |  | -  | 
|---|
| 1008 | 1073 |  /* | 
|---|
| 1009 | 1074 |   * Migrate pages from one node to a target node. | 
|---|
| 1010 | 1075 |   * Returns error or the number of pages not migrated. | 
|---|
| .. | .. | 
|---|
| 1015 | 1080 |  	nodemask_t nmask; | 
|---|
| 1016 | 1081 |  	LIST_HEAD(pagelist); | 
|---|
| 1017 | 1082 |  	int err = 0; | 
|---|
 | 1083 | +	struct migration_target_control mtc = {  | 
|---|
 | 1084 | +		.nid = dest,  | 
|---|
 | 1085 | +		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,  | 
|---|
 | 1086 | +	};  | 
|---|
| 1018 | 1087 |   | 
|---|
| 1019 | 1088 |  	nodes_clear(nmask); | 
|---|
| 1020 | 1089 |  	node_set(source, nmask); | 
|---|
| .. | .. | 
|---|
| 1029 | 1098 |  			flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 
|---|
| 1030 | 1099 |   | 
|---|
| 1031 | 1100 |  	if (!list_empty(&pagelist)) { | 
|---|
| 1032 |  | -		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,  | 
|---|
| 1033 |  | -					MIGRATE_SYNC, MR_SYSCALL);  | 
|---|
 | 1101 | +		err = migrate_pages(&pagelist, alloc_migration_target, NULL,  | 
|---|
 | 1102 | +				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);  | 
|---|
| 1034 | 1103 |  		if (err) | 
|---|
| 1035 | 1104 |  			putback_movable_pages(&pagelist); | 
|---|
| 1036 | 1105 |  	} | 
|---|
| .. | .. | 
|---|
| 1048 | 1117 |  		     const nodemask_t *to, int flags) | 
|---|
| 1049 | 1118 |  { | 
|---|
| 1050 | 1119 |  	int busy = 0; | 
|---|
| 1051 |  | -	int err;  | 
|---|
 | 1120 | +	int err = 0;  | 
|---|
| 1052 | 1121 |  	nodemask_t tmp; | 
|---|
| 1053 | 1122 |   | 
|---|
| 1054 |  | -	err = migrate_prep();  | 
|---|
| 1055 |  | -	if (err)  | 
|---|
| 1056 |  | -		return err;  | 
|---|
 | 1123 | +	lru_cache_disable();  | 
|---|
| 1057 | 1124 |   | 
|---|
| 1058 |  | -	down_read(&mm->mmap_sem);  | 
|---|
 | 1125 | +	mmap_read_lock(mm);  | 
|---|
| 1059 | 1126 |   | 
|---|
| 1060 | 1127 |  	/* | 
|---|
| 1061 | 1128 |  	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 
|---|
| .. | .. | 
|---|
| 1136 | 1203 |  		if (err < 0) | 
|---|
| 1137 | 1204 |  			break; | 
|---|
| 1138 | 1205 |  	} | 
|---|
| 1139 |  | -	up_read(&mm->mmap_sem);  | 
|---|
 | 1206 | +	mmap_read_unlock(mm);  | 
|---|
 | 1207 | +  | 
|---|
 | 1208 | +	lru_cache_enable();  | 
|---|
| 1140 | 1209 |  	if (err < 0) | 
|---|
| 1141 | 1210 |  		return err; | 
|---|
| 1142 | 1211 |  	return busy; | 
|---|
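do_migrate_pages()/migrate_to_node() above now bracket the walk with lru_cache_disable()/lru_cache_enable() and use the generic alloc_migration_target() callback; from user space this path is driven by migrate_pages(2). A minimal sketch, assuming nodes 0 and 1 exist, libnuma's <numaif.h>, and linking with -lnuma:

```c
#include <numaif.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move from node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... to node 1 */
	long left;

	/* Returns the number of pages that could not be moved,
	 * or a negative error. */
	left = migrate_pages(getpid(), sizeof(old_nodes) * 8,
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}
```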
| .. | .. | 
|---|
| 1153 | 1222 |  static struct page *new_page(struct page *page, unsigned long start) | 
|---|
| 1154 | 1223 |  { | 
|---|
| 1155 | 1224 |  	struct vm_area_struct *vma; | 
|---|
| 1156 |  | -	unsigned long uninitialized_var(address);  | 
|---|
 | 1225 | +	unsigned long address;  | 
|---|
| 1157 | 1226 |   | 
|---|
| 1158 | 1227 |  	vma = find_vma(current->mm, start); | 
|---|
| 1159 | 1228 |  	while (vma) { | 
|---|
| .. | .. | 
|---|
| 1252 | 1321 |   | 
|---|
| 1253 | 1322 |  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 
|---|
| 1254 | 1323 |   | 
|---|
| 1255 |  | -		err = migrate_prep();  | 
|---|
| 1256 |  | -		if (err)  | 
|---|
| 1257 |  | -			goto mpol_out;  | 
|---|
 | 1324 | +		lru_cache_disable();  | 
|---|
| 1258 | 1325 |  	} | 
|---|
| 1259 | 1326 |  	{ | 
|---|
| 1260 | 1327 |  		NODEMASK_SCRATCH(scratch); | 
|---|
| 1261 | 1328 |  		if (scratch) { | 
|---|
| 1262 |  | -			down_write(&mm->mmap_sem);  | 
|---|
| 1263 |  | -			task_lock(current);  | 
|---|
 | 1329 | +			mmap_write_lock(mm);  | 
|---|
| 1264 | 1330 |  			err = mpol_set_nodemask(new, nmask, scratch); | 
|---|
| 1265 |  | -			task_unlock(current);  | 
|---|
| 1266 | 1331 |  			if (err) | 
|---|
| 1267 |  | -				up_write(&mm->mmap_sem);  | 
|---|
 | 1332 | +				mmap_write_unlock(mm);  | 
|---|
| 1268 | 1333 |  		} else | 
|---|
| 1269 | 1334 |  			err = -ENOMEM; | 
|---|
| 1270 | 1335 |  		NODEMASK_SCRATCH_FREE(scratch); | 
|---|
| .. | .. | 
|---|
| 1301 | 1366 |  			putback_movable_pages(&pagelist); | 
|---|
| 1302 | 1367 |  	} | 
|---|
| 1303 | 1368 |   | 
|---|
| 1304 |  | -	up_write(&mm->mmap_sem);  | 
|---|
 | 1369 | +	mmap_write_unlock(mm);  | 
|---|
| 1305 | 1370 |  mpol_out: | 
|---|
| 1306 | 1371 |  	mpol_put(new); | 
|---|
 | 1372 | +	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))  | 
|---|
 | 1373 | +		lru_cache_enable();  | 
|---|
| 1307 | 1374 |  	return err; | 
|---|
| 1308 | 1375 |  } | 
|---|
| 1309 | 1376 |   | 
|---|
| .. | .. | 
|---|
| 1505 | 1572 |  	if (nodes_empty(*new)) | 
|---|
| 1506 | 1573 |  		goto out_put; | 
|---|
| 1507 | 1574 |   | 
|---|
| 1508 |  | -	nodes_and(*new, *new, node_states[N_MEMORY]);  | 
|---|
| 1509 |  | -	if (nodes_empty(*new))  | 
|---|
| 1510 |  | -		goto out_put;  | 
|---|
| 1511 |  | -  | 
|---|
| 1512 | 1575 |  	err = security_task_movememory(task); | 
|---|
| 1513 | 1576 |  	if (err) | 
|---|
| 1514 | 1577 |  		goto out_put; | 
|---|
| .. | .. | 
|---|
| 1552 | 1615 |  				unsigned long flags) | 
|---|
| 1553 | 1616 |  { | 
|---|
| 1554 | 1617 |  	int err; | 
|---|
| 1555 |  | -	int uninitialized_var(pval);  | 
|---|
 | 1618 | +	int pval;  | 
|---|
| 1556 | 1619 |  	nodemask_t nodes; | 
|---|
| 1557 |  | -  | 
|---|
| 1558 |  | -	addr = untagged_addr(addr);  | 
|---|
| 1559 | 1620 |   | 
|---|
| 1560 | 1621 |  	if (nmask != NULL && maxnode < nr_node_ids) | 
|---|
| 1561 | 1622 |  		return -EINVAL; | 
|---|
 | 1623 | +  | 
|---|
 | 1624 | +	addr = untagged_addr(addr);  | 
|---|
| 1562 | 1625 |   | 
|---|
| 1563 | 1626 |  	err = do_get_mempolicy(&pval, &nodes, addr, flags); | 
|---|
| 1564 | 1627 |   | 
|---|
| .. | .. | 
|---|
| 1691 | 1754 |   | 
|---|
| 1692 | 1755 |  #endif /* CONFIG_COMPAT */ | 
|---|
| 1693 | 1756 |   | 
|---|
 | 1757 | +bool vma_migratable(struct vm_area_struct *vma)  | 
|---|
 | 1758 | +{  | 
|---|
 | 1759 | +	if (vma->vm_flags & (VM_IO | VM_PFNMAP))  | 
|---|
 | 1760 | +		return false;  | 
|---|
 | 1761 | +  | 
|---|
 | 1762 | +	/*  | 
|---|
 | 1763 | +	 * DAX device mappings require predictable access latency, so avoid  | 
|---|
 | 1764 | +	 * incurring periodic faults.  | 
|---|
 | 1765 | +	 */  | 
|---|
 | 1766 | +	if (vma_is_dax(vma))  | 
|---|
 | 1767 | +		return false;  | 
|---|
 | 1768 | +  | 
|---|
 | 1769 | +	if (is_vm_hugetlb_page(vma) &&  | 
|---|
 | 1770 | +		!hugepage_migration_supported(hstate_vma(vma)))  | 
|---|
 | 1771 | +		return false;  | 
|---|
 | 1772 | +  | 
|---|
 | 1773 | +	/*  | 
|---|
 | 1774 | +	 * Migration allocates pages in the highest zone. If we cannot  | 
|---|
 | 1775 | +	 * do so then migration (at least from node to node) is not  | 
|---|
 | 1776 | +	 * possible.  | 
|---|
 | 1777 | +	 */  | 
|---|
 | 1778 | +	if (vma->vm_file &&  | 
|---|
 | 1779 | +		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))  | 
|---|
 | 1780 | +			< policy_zone)  | 
|---|
 | 1781 | +		return false;  | 
|---|
 | 1782 | +	return true;  | 
|---|
 | 1783 | +}  | 
|---|
 | 1784 | +  | 
|---|
| 1694 | 1785 |  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, | 
|---|
| 1695 | 1786 |  						unsigned long addr) | 
|---|
| 1696 | 1787 |  { | 
|---|
| 1697 |  | -	struct mempolicy *pol = NULL;  | 
|---|
 | 1788 | +	struct mempolicy *pol;  | 
|---|
| 1698 | 1789 |   | 
|---|
| 1699 |  | -	if (vma) {  | 
|---|
| 1700 |  | -		if (vma->vm_ops && vma->vm_ops->get_policy) {  | 
|---|
| 1701 |  | -			pol = vma->vm_ops->get_policy(vma, addr);  | 
|---|
| 1702 |  | -		} else if (vma->vm_policy) {  | 
|---|
| 1703 |  | -			pol = vma->vm_policy;  | 
|---|
 | 1790 | +	if (!vma)  | 
|---|
 | 1791 | +		return NULL;  | 
|---|
| 1704 | 1792 |   | 
|---|
| 1705 |  | -			/*  | 
|---|
| 1706 |  | -			 * shmem_alloc_page() passes MPOL_F_SHARED policy with  | 
|---|
| 1707 |  | -			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference  | 
|---|
| 1708 |  | -			 * count on these policies which will be dropped by  | 
|---|
| 1709 |  | -			 * mpol_cond_put() later  | 
|---|
| 1710 |  | -			 */  | 
|---|
| 1711 |  | -			if (mpol_needs_cond_ref(pol))  | 
|---|
| 1712 |  | -				mpol_get(pol);  | 
|---|
| 1713 |  | -		}  | 
|---|
 | 1793 | +	if (vma->vm_ops && vma->vm_ops->get_policy)  | 
|---|
 | 1794 | +		return vma->vm_ops->get_policy(vma, addr);  | 
|---|
 | 1795 | +  | 
|---|
 | 1796 | +	/*  | 
|---|
 | 1797 | +	 * This could be called without holding the mmap_lock in the  | 
|---|
 | 1798 | +	 * speculative page fault handler's path.  | 
|---|
 | 1799 | +	 */  | 
|---|
 | 1800 | +	pol = READ_ONCE(vma->vm_policy);  | 
|---|
 | 1801 | +	if (pol) {  | 
|---|
 | 1802 | +		/*  | 
|---|
 | 1803 | +		 * shmem_alloc_page() passes MPOL_F_SHARED policy with  | 
|---|
 | 1804 | +		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference  | 
|---|
 | 1805 | +		 * count on these policies which will be dropped by  | 
|---|
 | 1806 | +		 * mpol_cond_put() later  | 
|---|
 | 1807 | +		 */  | 
|---|
 | 1808 | +		if (mpol_needs_cond_ref(pol))  | 
|---|
 | 1809 | +			mpol_get(pol);  | 
|---|
| 1714 | 1810 |  	} | 
|---|
| 1715 | 1811 |   | 
|---|
| 1716 | 1812 |  	return pol; | 
|---|
| .. | .. | 
|---|
| 1785 | 1881 |   * Return a nodemask representing a mempolicy for filtering nodes for | 
|---|
| 1786 | 1882 |   * page allocation | 
|---|
| 1787 | 1883 |   */ | 
|---|
| 1788 |  | -static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)  | 
|---|
 | 1884 | +nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)  | 
|---|
| 1789 | 1885 |  { | 
|---|
| 1790 | 1886 |  	/* Lower zones don't get a nodemask applied for MPOL_BIND */ | 
|---|
| 1791 | 1887 |  	if (unlikely(policy->mode == MPOL_BIND) && | 
|---|
| .. | .. | 
|---|
| 1797 | 1893 |  } | 
|---|
| 1798 | 1894 |   | 
|---|
| 1799 | 1895 |  /* Return the node id preferred by the given mempolicy, or the given id */ | 
|---|
| 1800 |  | -static int policy_node(gfp_t gfp, struct mempolicy *policy,  | 
|---|
| 1801 |  | -								int nd)  | 
|---|
 | 1896 | +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)  | 
|---|
| 1802 | 1897 |  { | 
|---|
| 1803 | 1898 |  	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) | 
|---|
| 1804 | 1899 |  		nd = policy->v.preferred_node; | 
|---|
| .. | .. | 
|---|
| 1986 | 2081 |  		break; | 
|---|
| 1987 | 2082 |   | 
|---|
| 1988 | 2083 |  	case MPOL_BIND: | 
|---|
| 1989 |  | -		/* Fall through */  | 
|---|
| 1990 | 2084 |  	case MPOL_INTERLEAVE: | 
|---|
| 1991 | 2085 |  		*mask =  mempolicy->v.nodes; | 
|---|
| 1992 | 2086 |  		break; | 
|---|
| .. | .. | 
|---|
| 2081 | 2175 |   * | 
|---|
| 2082 | 2176 |   * 	This function allocates a page from the kernel page pool and applies | 
|---|
| 2083 | 2177 |   *	a NUMA policy associated with the VMA or the current process. | 
|---|
| 2084 |  | - *	When VMA is not NULL caller must hold down_read on the mmap_sem of the  | 
|---|
 | 2178 | + *	When VMA is not NULL caller must read-lock the mmap_lock of the  | 
|---|
| 2085 | 2179 |   *	mm_struct of the VMA to prevent it from going away. Should be used for | 
|---|
| 2086 | 2180 |   *	all allocations for pages that will be mapped into user space. Returns | 
|---|
| 2087 | 2181 |   *	NULL when no page can be allocated. | 
|---|
| .. | .. | 
|---|
| 2119 | 2213 |  		 * If the policy is interleave, or does not allow the current | 
|---|
| 2120 | 2214 |  		 * node in its nodemask, we allocate the standard way. | 
|---|
| 2121 | 2215 |  		 */ | 
|---|
| 2122 |  | -		if (pol->mode == MPOL_PREFERRED &&  | 
|---|
| 2123 |  | -						!(pol->flags & MPOL_F_LOCAL))  | 
|---|
 | 2216 | +		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))  | 
|---|
| 2124 | 2217 |  			hpage_node = pol->v.preferred_node; | 
|---|
| 2125 | 2218 |   | 
|---|
| 2126 | 2219 |  		nmask = policy_nodemask(gfp, pol); | 
|---|
| 2127 | 2220 |  		if (!nmask || node_isset(hpage_node, *nmask)) { | 
|---|
| 2128 | 2221 |  			mpol_cond_put(pol); | 
|---|
| 2129 | 2222 |  			/* | 
|---|
| 2130 |  | -			 * We cannot invoke reclaim if __GFP_THISNODE  | 
|---|
| 2131 |  | -			 * is set. Invoking reclaim with  | 
|---|
| 2132 |  | -			 * __GFP_THISNODE set, would cause THP  | 
|---|
| 2133 |  | -			 * allocations to trigger heavy swapping  | 
|---|
| 2134 |  | -			 * despite there may be tons of free memory  | 
|---|
| 2135 |  | -			 * (including potentially plenty of THP  | 
|---|
| 2136 |  | -			 * already available in the buddy) on all the  | 
|---|
| 2137 |  | -			 * other NUMA nodes.  | 
|---|
| 2138 |  | -			 *  | 
|---|
| 2139 |  | -			 * At most we could invoke compaction when  | 
|---|
| 2140 |  | -			 * __GFP_THISNODE is set (but we would need to  | 
|---|
| 2141 |  | -			 * refrain from invoking reclaim even if  | 
|---|
| 2142 |  | -			 * compaction returned COMPACT_SKIPPED because  | 
|---|
| 2143 |  | -			 * there wasn't not enough memory to succeed  | 
|---|
| 2144 |  | -			 * compaction). For now just avoid  | 
|---|
| 2145 |  | -			 * __GFP_THISNODE instead of limiting the  | 
|---|
| 2146 |  | -			 * allocation path to a strict and single  | 
|---|
| 2147 |  | -			 * compaction invocation.  | 
|---|
| 2148 |  | -			 *  | 
|---|
| 2149 |  | -			 * Supposedly if direct reclaim was enabled by  | 
|---|
| 2150 |  | -			 * the caller, the app prefers THP regardless  | 
|---|
| 2151 |  | -			 * of the node it comes from so this would be  | 
|---|
| 2152 |  | -			 * more desiderable behavior than only  | 
|---|
| 2153 |  | -			 * providing THP originated from the local  | 
|---|
| 2154 |  | -			 * node in such case.  | 
|---|
 | 2223 | +			 * First, try to allocate THP only on local node, but  | 
|---|
 | 2224 | +			 * don't reclaim unnecessarily, just compact.  | 
|---|
| 2155 | 2225 |  			 */ | 
|---|
| 2156 |  | -			if (!(gfp & __GFP_DIRECT_RECLAIM))  | 
|---|
| 2157 |  | -				gfp |= __GFP_THISNODE;  | 
|---|
| 2158 |  | -			page = __alloc_pages_node(hpage_node, gfp, order);  | 
|---|
 | 2226 | +			page = __alloc_pages_node(hpage_node,  | 
|---|
 | 2227 | +				gfp | __GFP_THISNODE | __GFP_NORETRY, order);  | 
|---|
 | 2228 | +  | 
|---|
 | 2229 | +			/*  | 
|---|
 | 2230 | +			 * If hugepage allocations are configured to always  | 
|---|
 | 2231 | +			 * synchronous compact or the vma has been madvised  | 
|---|
 | 2232 | +			 * to prefer hugepage backing, retry allowing remote  | 
|---|
 | 2233 | +			 * memory with both reclaim and compact as well.  | 
|---|
 | 2234 | +			 */  | 
|---|
 | 2235 | +			if (!page && (gfp & __GFP_DIRECT_RECLAIM))  | 
|---|
 | 2236 | +				page = __alloc_pages_nodemask(gfp, order,  | 
|---|
 | 2237 | +							hpage_node, nmask);  | 
|---|
 | 2238 | +  | 
|---|
| 2159 | 2239 |  			goto out; | 
|---|
| 2160 | 2240 |  		} | 
|---|
| 2161 | 2241 |  	} | 
|---|
| .. | .. | 
|---|
| 2167 | 2247 |  out: | 
|---|
| 2168 | 2248 |  	return page; | 
|---|
| 2169 | 2249 |  } | 
|---|
 | 2250 | +EXPORT_SYMBOL(alloc_pages_vma);  | 
|---|
| 2170 | 2251 |   | 
|---|
| 2171 | 2252 |  /** | 
|---|
| 2172 | 2253 |   * 	alloc_pages_current - Allocate pages. | 
|---|
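The alloc_pages_vma() hunk above first attempts the THP on the preferred node with __GFP_THISNODE | __GFP_NORETRY and only falls back to the full nodemask when reclaim is allowed. One way this path is reached from user space is an MPOL_PREFERRED task policy plus madvise(MADV_HUGEPAGE); a minimal sketch, assuming an online node 0, a THP-enabled kernel, libnuma's <numaif.h>, and linking with -lnuma — the sizes and node number are arbitrary for the example.

```c
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8 * 1024 * 1024;
	unsigned long nodemask = 1UL << 0;	/* prefer node 0 */
	void *buf;

	if (set_mempolicy(MPOL_PREFERRED, &nodemask, sizeof(nodemask) * 8))
		perror("set_mempolicy");

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	madvise(buf, len, MADV_HUGEPAGE);	/* hint: back with THP */
	memset(buf, 0, len);			/* fault in, ideally as THP */
	return 0;
}
```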
| .. | .. | 
|---|
| 2266 | 2347 |   | 
|---|
| 2267 | 2348 |  	switch (a->mode) { | 
|---|
| 2268 | 2349 |  	case MPOL_BIND: | 
|---|
| 2269 |  | -		/* Fall through */  | 
|---|
| 2270 | 2350 |  	case MPOL_INTERLEAVE: | 
|---|
| 2271 | 2351 |  		return !!nodes_equal(a->v.nodes, b->v.nodes); | 
|---|
| 2272 | 2352 |  	case MPOL_PREFERRED: | 
|---|
| .. | .. | 
|---|
| 2399 | 2479 |  	unsigned long pgoff; | 
|---|
| 2400 | 2480 |  	int thiscpu = raw_smp_processor_id(); | 
|---|
| 2401 | 2481 |  	int thisnid = cpu_to_node(thiscpu); | 
|---|
| 2402 |  | -	int polnid = -1;  | 
|---|
 | 2482 | +	int polnid = NUMA_NO_NODE;  | 
|---|
| 2403 | 2483 |  	int ret = -1; | 
|---|
| 2404 | 2484 |   | 
|---|
| 2405 | 2485 |  	pol = get_vma_policy(vma, addr); | 
|---|
| .. | .. | 
|---|
| 2573 | 2653 |  	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 
|---|
| 2574 | 2654 |  	if (!mpol_new) | 
|---|
| 2575 | 2655 |  		goto err_out; | 
|---|
 | 2656 | +	atomic_set(&mpol_new->refcnt, 1);  | 
|---|
| 2576 | 2657 |  	goto restart; | 
|---|
| 2577 | 2658 |  } | 
|---|
| 2578 | 2659 |   | 
|---|
| .. | .. | 
|---|
| 2805 | 2886 |  int mpol_parse_str(char *str, struct mempolicy **mpol) | 
|---|
| 2806 | 2887 |  { | 
|---|
| 2807 | 2888 |  	struct mempolicy *new = NULL; | 
|---|
| 2808 |  | -	unsigned short mode;  | 
|---|
| 2809 | 2889 |  	unsigned short mode_flags; | 
|---|
| 2810 | 2890 |  	nodemask_t nodes; | 
|---|
| 2811 | 2891 |  	char *nodelist = strchr(str, ':'); | 
|---|
| 2812 | 2892 |  	char *flags = strchr(str, '='); | 
|---|
| 2813 |  | -	int err = 1;  | 
|---|
 | 2893 | +	int err = 1, mode;  | 
|---|
| 2814 | 2894 |   | 
|---|
| 2815 | 2895 |  	if (flags) | 
|---|
| 2816 | 2896 |  		*flags++ = '\0';	/* terminate mode string */ | 
|---|
| .. | .. | 
|---|
| 2825 | 2905 |  	} else | 
|---|
| 2826 | 2906 |  		nodes_clear(nodes); | 
|---|
| 2827 | 2907 |   | 
|---|
| 2828 |  | -	for (mode = 0; mode < MPOL_MAX; mode++) {  | 
|---|
| 2829 |  | -		if (!strcmp(str, policy_modes[mode])) {  | 
|---|
| 2830 |  | -			break;  | 
|---|
| 2831 |  | -		}  | 
|---|
| 2832 |  | -	}  | 
|---|
| 2833 |  | -	if (mode >= MPOL_MAX)  | 
|---|
 | 2908 | +	mode = match_string(policy_modes, MPOL_MAX, str);  | 
|---|
 | 2909 | +	if (mode < 0)  | 
|---|
| 2834 | 2910 |  		goto out; | 
|---|
| 2835 | 2911 |   | 
|---|
| 2836 | 2912 |  	switch (mode) { | 
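mpol_parse_str() above now uses match_string() to map the textual mode to an index. For reference, here is a small user-space model of the same "mode[=flags][:nodelist]" split; the trimmed policy_modes table and the match_mode() helper are invented for illustration, and match_string() itself is a kernel-only helper.

```c
#include <stdio.h>
#include <string.h>

static const char * const policy_modes[] = {
	"default", "prefer", "bind", "interleave", "local",
};

/* Linear lookup standing in for the kernel's match_string(). */
static int match_mode(const char *str)
{
	for (size_t i = 0; i < sizeof(policy_modes) / sizeof(policy_modes[0]); i++)
		if (!strcmp(str, policy_modes[i]))
			return (int)i;
	return -1;
}

int main(void)
{
	char str[] = "interleave=static:0-3";
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');

	if (nodelist)
		*nodelist++ = '\0';	/* terminate before the nodelist */
	if (flags)
		*flags++ = '\0';	/* terminate the mode string */

	printf("mode=%d flags=%s nodelist=%s\n",
	       match_mode(str), flags ? flags : "", nodelist ? nodelist : "");
	return 0;
}
```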
|---|