From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] change system file
---
kernel/mm/mempolicy.c | 410 ++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 243 insertions(+), 167 deletions(-)
diff --git a/kernel/mm/mempolicy.c b/kernel/mm/mempolicy.c
index 9b03bfa..67f75cf 100644
--- a/kernel/mm/mempolicy.c
+++ b/kernel/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
@@ -68,7 +68,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
@@ -126,6 +126,32 @@
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+ int min_dist = INT_MAX, dist, n, min_node;
+
+ if (node == NUMA_NO_NODE || node_online(node))
+ return node;
+
+ min_node = node;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{
@@ -198,7 +224,7 @@
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,13 +368,13 @@
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -372,17 +398,20 @@
/*
* Rebind each vma in mm to new nodemask.
*
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
- down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mmap_write_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vm_write_begin(vma);
mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
+ vm_write_end(vma);
+ }
+ mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -410,7 +439,9 @@
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
- struct vm_area_struct *prev;
+ unsigned long start;
+ unsigned long end;
+ struct vm_area_struct *first;
};
/*
@@ -440,6 +471,7 @@
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
+ __releases(ptl)
{
int ret = 0;
struct page *page;
@@ -555,9 +587,10 @@
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
- unsigned long flags = qp->flags;
+ unsigned long flags = (qp->flags & MPOL_MF_VALID);
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -569,16 +602,45 @@
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
+
+ if (flags == MPOL_MF_STRICT) {
+ /*
+ * STRICT alone means only detecting misplaced page and no
+ * need to further check other vma.
+ */
+ ret = -EIO;
+ goto unlock;
+ }
+
+ if (!vma_migratable(walk->vma)) {
+ /*
+ * Must be STRICT with MOVE*, otherwise .test_walk() have
+ * stopped walking current vma.
+ * Detecting misplaced page but allow migrating pages which
+ * have been queued.
+ */
+ ret = 1;
+ goto unlock;
+ }
+
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, qp->pagelist);
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+ !hugetlb_pmd_shared(pte))) {
+ if (isolate_hugetlb(page, qp->pagelist) &&
+ (flags & MPOL_MF_STRICT))
+ /*
+ * Failed to isolate page but allow migrating pages
+ * which have been queued.
+ */
+ ret = 1;
+ }
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
- return 0;
+ return ret;
}
#ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +658,7 @@
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -618,6 +680,22 @@
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
+ /* range check first */
+ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+ if (!qp->first) {
+ qp->first = vma;
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ (qp->start < vma->vm_start))
+ /* hole at head side of range */
+ return -EFAULT;
+ }
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ ((vma->vm_end < qp->end) &&
+ (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ /* hole at middle or tail of range */
+ return -EFAULT;
+
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
@@ -628,22 +706,10 @@
if (endvma > end)
endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
- if (qp->prev && qp->prev->vm_end < vma->vm_start)
- return -EFAULT;
- }
-
- qp->prev = vma;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (!is_vm_hugetlb_page(vma) &&
- (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
@@ -654,6 +720,12 @@
return 0;
return 1;
}
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+};
/*
* Walk through page tables and collect pages to be migrated.
@@ -675,26 +747,28 @@
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
+ int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
- .prev = NULL,
- };
- struct mm_walk queue_pages_walk = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
- .test_walk = queue_pages_test_walk,
- .mm = mm,
- .private = &qp,
+ .start = start,
+ .end = end,
+ .first = NULL,
};
- return walk_page_range(start, end, &queue_pages_walk);
+ err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+ if (!qp.first)
+ /* whole range in hole */
+ err = -EFAULT;
+
+ return err;
}
/*
* Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
@@ -712,6 +786,7 @@
if (IS_ERR(new))
return PTR_ERR(new);
+ vm_write_begin(vma);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
@@ -719,11 +794,17 @@
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ /*
+ * The speculative page fault handler accesses this field without
+ * hodling the mmap_sem.
+ */
+ WRITE_ONCE(vma->vm_policy, new);
+ vm_write_end(vma);
mpol_put(old);
return 0;
err_out:
+ vm_write_end(vma);
mpol_put(new);
return err;
}
@@ -732,7 +813,6 @@
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *next;
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
@@ -741,15 +821,13 @@
unsigned long vmend;
vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EFAULT;
+ VM_BUG_ON(!vma);
prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
- for (; vma && vma->vm_start < end; prev = vma, vma = next) {
- next = vma->vm_next;
+ for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
@@ -764,10 +842,6 @@
vma_get_anon_name(vma));
if (prev) {
vma = prev;
- next = vma->vm_next;
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
- /* vma_merge() joined vma && vma->next, case 8 */
goto replace;
}
if (vma->vm_start != vmstart) {
@@ -807,13 +881,12 @@
goto out;
}
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
mpol_put(new);
goto out;
}
+ task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -839,7 +912,6 @@
switch (p->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
@@ -853,16 +925,19 @@
}
}
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
- struct page *p;
+ struct page *p = NULL;
int err;
- err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
- if (err >= 0) {
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+ if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
+ if (locked)
+ mmap_read_unlock(mm);
return err;
}
@@ -873,7 +948,7 @@
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -895,10 +970,10 @@
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -913,7 +988,16 @@
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(addr);
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * wil drop the mmap_lock, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
@@ -948,7 +1032,9 @@
out:
mpol_cond_put(pol);
if (vma)
- up_read(¤t->mm->mmap_sem);
+ mmap_read_unlock(mm);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
@@ -967,8 +1053,8 @@
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* Non-movable page may reach here. And, there may be
@@ -984,27 +1070,6 @@
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1015,6 +1080,10 @@
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1029,8 +1098,8 @@
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1048,14 +1117,12 @@
const nodemask_t *to, int flags)
{
int busy = 0;
- int err;
+ int err = 0;
nodemask_t tmp;
- err = migrate_prep();
- if (err)
- return err;
+ lru_cache_disable();
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1136,7 +1203,9 @@
if (err < 0)
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
+
+ lru_cache_enable();
if (err < 0)
return err;
return busy;
@@ -1153,7 +1222,7 @@
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
- unsigned long uninitialized_var(address);
+ unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1252,19 +1321,15 @@
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- err = migrate_prep();
- if (err)
- goto mpol_out;
+ lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
- down_write(&mm->mmap_sem);
- task_lock(current);
+ mmap_write_lock(mm);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -1301,9 +1366,11 @@
putback_movable_pages(&pagelist);
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_enable();
return err;
}
@@ -1505,10 +1572,6 @@
if (nodes_empty(*new))
goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
err = security_task_movememory(task);
if (err)
goto out_put;
@@ -1552,13 +1615,13 @@
unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
-
- addr = untagged_addr(addr);
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+
+ addr = untagged_addr(addr);
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1691,26 +1754,59 @@
#endif /* CONFIG_COMPAT */
+bool vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ return false;
+
+ /*
+ * DAX device mappings require predictable access latency, so avoid
+ * incurring periodic faults.
+ */
+ if (vma_is_dax(vma))
+ return false;
+
+ if (is_vm_hugetlb_page(vma) &&
+ !hugepage_migration_supported(hstate_vma(vma)))
+ return false;
+
+ /*
+ * Migration allocates pages in the highest zone. If we cannot
+ * do so then migration (at least from node to node) is not
+ * possible.
+ */
+ if (vma->vm_file &&
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+ < policy_zone)
+ return false;
+ return true;
+}
+
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
- struct mempolicy *pol = NULL;
+ struct mempolicy *pol;
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- pol = vma->vm_ops->get_policy(vma, addr);
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
+ if (!vma)
+ return NULL;
- /*
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
- * count on these policies which will be dropped by
- * mpol_cond_put() later
- */
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
- }
+ if (vma->vm_ops && vma->vm_ops->get_policy)
+ return vma->vm_ops->get_policy(vma, addr);
+
+ /*
+ * This could be called without holding the mmap_sem in the
+ * speculative page fault handler's path.
+ */
+ pol = READ_ONCE(vma->vm_policy);
+ if (pol) {
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
}
return pol;
@@ -1785,7 +1881,7 @@
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1797,8 +1893,7 @@
}
/* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
- int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
@@ -1986,7 +2081,6 @@
break;
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*mask = mempolicy->v.nodes;
break;
@@ -2081,7 +2175,7 @@
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
@@ -2119,43 +2213,29 @@
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED &&
- !(pol->flags & MPOL_F_LOCAL))
+ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
/*
- * We cannot invoke reclaim if __GFP_THISNODE
- * is set. Invoking reclaim with
- * __GFP_THISNODE set, would cause THP
- * allocations to trigger heavy swapping
- * despite there may be tons of free memory
- * (including potentially plenty of THP
- * already available in the buddy) on all the
- * other NUMA nodes.
- *
- * At most we could invoke compaction when
- * __GFP_THISNODE is set (but we would need to
- * refrain from invoking reclaim even if
- * compaction returned COMPACT_SKIPPED because
- * there wasn't not enough memory to succeed
- * compaction). For now just avoid
- * __GFP_THISNODE instead of limiting the
- * allocation path to a strict and single
- * compaction invocation.
- *
- * Supposedly if direct reclaim was enabled by
- * the caller, the app prefers THP regardless
- * of the node it comes from so this would be
- * more desiderable behavior than only
- * providing THP originated from the local
- * node in such case.
+ * First, try to allocate THP only on local node, but
+ * don't reclaim unnecessarily, just compact.
*/
- if (!(gfp & __GFP_DIRECT_RECLAIM))
- gfp |= __GFP_THISNODE;
- page = __alloc_pages_node(hpage_node, gfp, order);
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+ /*
+ * If hugepage allocations are configured to always
+ * synchronous compact or the vma has been madvised
+ * to prefer hugepage backing, retry allowing remote
+ * memory with both reclaim and compact as well.
+ */
+ if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+ page = __alloc_pages_nodemask(gfp, order,
+ hpage_node, nmask);
+
goto out;
}
}
@@ -2167,6 +2247,7 @@
out:
return page;
}
+EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages_current - Allocate pages.
@@ -2266,7 +2347,6 @@
switch (a->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
@@ -2399,7 +2479,7 @@
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
- int polnid = -1;
+ int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
@@ -2573,6 +2653,7 @@
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
@@ -2805,12 +2886,11 @@
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
- unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int err = 1;
+ int err = 1, mode;
if (flags)
*flags++ = '\0'; /* terminate mode string */
@@ -2825,12 +2905,8 @@
} else
nodes_clear(nodes);
- for (mode = 0; mode < MPOL_MAX; mode++) {
- if (!strcmp(str, policy_modes[mode])) {
- break;
- }
- }
- if (mode >= MPOL_MAX)
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
goto out;
switch (mode) {
--
Gitblit v1.6.2