From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] change system file
---
kernel/mm/memory-failure.c | 646 +++++++++++++++++++++++++++++-----------------------------
1 files changed, 324 insertions(+), 322 deletions(-)
diff --git a/kernel/mm/memory-failure.c b/kernel/mm/memory-failure.c
index 3da3c63..9aa2a15 100644
--- a/kernel/mm/memory-failure.c
+++ b/kernel/mm/memory-failure.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008, 2009 Intel Corporation
* Authors: Andi Kleen, Fengguang Wu
- *
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 only as published by the
- * Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
@@ -67,6 +64,33 @@
int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to racy page allocaiton, but that's
+ * acceptable because soft-offlined page is not broken
+ * and if someone really want to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
@@ -213,14 +237,15 @@
{
struct task_struct *t = tk->tsk;
short addr_lsb = tk->size_shift;
- int ret;
+ int ret = 0;
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
- addr_lsb, current);
+ if (flags & MF_ACTION_REQUIRED) {
+ WARN_ON_ONCE(t != current);
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)tk->addr, addr_lsb);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -306,30 +331,24 @@
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma,
- struct list_head *to_kill,
- struct to_kill **tkc)
+ struct list_head *to_kill)
{
struct to_kill *tk;
- if (*tkc) {
- tk = *tkc;
- *tkc = NULL;
- } else {
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
- if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
- return;
- }
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+ if (!tk) {
+ pr_err("Memory failure: Out of memory while machine check handling\n");
+ return;
}
+
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
else
- tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+ tk->size_shift = page_shift(compound_head(p));
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -348,6 +367,7 @@
kfree(tk);
return;
}
+
get_task_struct(tsk);
tk->tsk = tsk;
list_add_tail(&tk->nd, to_kill);
@@ -407,9 +427,15 @@
{
struct task_struct *t;
- for_each_thread(tsk, t)
- if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
- return t;
+ for_each_thread(tsk, t) {
+ if (t->flags & PF_MCE_PROCESS) {
+ if (t->flags & PF_MCE_EARLY)
+ return t;
+ } else {
+ if (sysctl_memory_failure_early_kill)
+ return t;
+ }
+ }
return NULL;
}
@@ -418,35 +444,40 @@
* to be signaled when some page under the process is hwpoisoned.
* Return task_struct of the dedicated thread (main thread unless explicitly
* specified) if the process is "early kill," and otherwise returns NULL.
+ *
+ * Note that the above is true for Action Optional case, but not for Action
+ * Required case where SIGBUS should sent only to the current thread.
*/
static struct task_struct *task_early_kill(struct task_struct *tsk,
int force_early)
{
- struct task_struct *t;
if (!tsk->mm)
return NULL;
- if (force_early)
- return tsk;
- t = find_early_kill_thread(tsk);
- if (t)
- return t;
- if (sysctl_memory_failure_early_kill)
- return tsk;
- return NULL;
+ if (force_early) {
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (tsk->mm == current->mm)
+ return current;
+ else
+ return NULL;
+ }
+ return find_early_kill_thread(tsk);
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma_read(page);
+ av = page_lock_anon_vma_read(page, NULL);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -464,7 +495,7 @@
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -475,16 +506,17 @@
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
+ pgoff_t pgoff;
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
+ pgoff = page_to_pgoff(page);
for_each_process(tsk) {
- pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
if (!t)
@@ -499,7 +531,7 @@
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ add_to_kill(t, page, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -508,26 +540,17 @@
/*
* Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
*/
static void collect_procs(struct page *page, struct list_head *tokill,
int force_early)
{
- struct to_kill *tk;
-
if (!page->mapping)
return;
- tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
- if (!tk)
- return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk, force_early);
+ collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, &tk, force_early);
- kfree(tk);
+ collect_procs_file(page, tokill, force_early);
}
static const char *action_name[] = {
@@ -559,6 +582,7 @@
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -829,7 +853,6 @@
#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
-#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
@@ -878,7 +901,6 @@
#undef sc
#undef unevict
#undef mlock
-#undef writeback
#undef lru
#undef head
#undef slab
@@ -930,7 +952,7 @@
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -959,7 +981,6 @@
return 0;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -968,10 +989,10 @@
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success;
+ bool unmap_success = true;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1011,7 +1032,7 @@
*/
mapping = page_mapping(hpage);
if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
- mapping_cap_writeback_dirty(mapping)) {
+ mapping_can_writeback(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
@@ -1033,7 +1054,30 @@
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_success = try_to_unmap(hpage, ttu);
+ if (!PageHuge(hpage)) {
+ unmap_success = try_to_unmap(hpage, ttu);
+ } else {
+ if (!PageAnon(hpage)) {
+ /*
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higer level.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (mapping) {
+ unmap_success = try_to_unmap(hpage,
+ ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else {
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ unmap_success = false;
+ }
+ } else {
+ unmap_success = try_to_unmap(hpage, ttu);
+ }
+ }
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -1084,6 +1128,25 @@
return page_action(ps, p, pfn);
}
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1125,7 +1188,7 @@
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
- put_hwpoison_page(head);
+ put_page(head);
return 0;
}
@@ -1166,6 +1229,19 @@
LIST_HEAD(tokill);
int rc = -EBUSY;
loff_t start;
+ dax_entry_t cookie;
+
+ if (flags & MF_COUNT_INCREASED)
+ /*
+ * Drop the extra refcount in case we come from madvise().
+ */
+ put_page(page);
+
+ /* device metadata space is not recoverable */
+ if (!pgmap_pfn_valid(pgmap, pfn)) {
+ rc = -ENXIO;
+ goto out;
+ }
/*
* Prevent the inode from being freed while we are interrogating
@@ -1174,7 +1250,8 @@
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- if (!dax_lock_mapping_entry(page))
+ cookie = dax_lock_page(page);
+ if (!cookie)
goto out;
if (hwpoison_filter(page)) {
@@ -1182,16 +1259,12 @@
goto unlock;
}
- switch (pgmap->type) {
- case MEMORY_DEVICE_PRIVATE:
- case MEMORY_DEVICE_PUBLIC:
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
/*
* TODO: Handle HMM pages which may need coordination
* with device-side memory.
*/
goto unlock;
- default:
- break;
}
/*
@@ -1225,7 +1298,7 @@
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_mapping_entry(page);
+ dax_unlock_page(page, cookie);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -1308,23 +1381,11 @@
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
return -EBUSY;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1364,10 +1425,7 @@
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1376,14 +1434,14 @@
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
@@ -1404,11 +1462,8 @@
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1492,7 +1547,7 @@
unsigned long proc_flags;
int gotten;
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1500,10 +1555,23 @@
if (!gotten)
break;
if (entry.flags & MF_SOFT_OFFLINE)
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ soft_offline_page(entry.pfn, entry.flags);
else
memory_failure(entry.pfn, entry.flags);
}
+}
+
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+ struct memory_failure_cpu *mf_cpu;
+
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ cancel_work_sync(&mf_cpu->work);
+ memory_failure_work_func(&mf_cpu->work);
}
static int __init memory_failure_init(void)
@@ -1612,147 +1680,113 @@
}
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
+ put_page(page);
return 0;
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we
+ * cannot handle and -EBUSY if we raced with an allocation.
+ * We only incremented refcount in case the page was already in-use and it is
+ * a known type we can handle.
+ */
+static int get_any_page(struct page *p, int flags)
{
- int nid = page_to_nid(p);
+ int ret = 0, pass = 0;
+ bool count_increased = false;
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
+ if (flags & MF_COUNT_INCREASED)
+ count_increased = true;
+
+try_again:
+ if (!count_increased && !get_hwpoison_page(p)) {
+ if (page_count(p)) {
+ /* We raced with an allocation, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EBUSY;
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+ /* We raced with put_page, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EIO;
+ }
+ } else {
+ if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ ret = 1;
+ } else {
+ /*
+ * A page we cannot handle. Check whether we can turn
+ * it into something we can handle.
+ */
+ if (pass++ < 3) {
+ put_page(p);
+ shake_page(p, 1);
+ count_increased = false;
+ goto try_again;
+ }
+ put_page(p);
+ ret = -EIO;
+ }
+ }
+
+ return ret;
+}
+
+static bool isolate_page(struct page *page, struct list_head *pagelist)
+{
+ bool isolated = false;
+ bool lru = PageLRU(page);
+
+ if (PageHuge(page)) {
+ isolated = !isolate_hugetlb(page, pagelist);
+ } else {
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
+ }
+
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
+ /*
+ * If we succeed to isolate the page, we grabbed another refcount on
+ * the page, so we can safely drop the one we got from get_any_pages().
+ * If we failed to isolate the page, it means that we cannot go further
+ * and we will return an error, so drop the reference we got from
+ * get_any_pages() as well.
+ */
+ put_page(page);
+ return isolated;
}
/*
- * Safely get reference count of an arbitrary page.
- * Returns 0 for a free page, -EIO for a zero refcount page
- * that is not free, and 1 for any other page type.
- * For 1 the page is returned with increased page count, otherwise not.
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
*/
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __soft_offline_page(struct page *page)
{
- int ret;
-
- if (flags & MF_COUNT_INCREASED)
- return 1;
-
- /*
- * When the target page is a free hugepage, just remove it
- * from free hugepage list.
- */
- if (!get_hwpoison_page(p)) {
- if (PageHuge(p)) {
- pr_info("%s: %#lx free huge page\n", __func__, pfn);
- ret = 0;
- } else if (is_free_buddy_page(p)) {
- pr_info("%s: %#lx free buddy page\n", __func__, pfn);
- ret = 0;
- } else {
- pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
- __func__, pfn, p->flags);
- ret = -EIO;
- }
- } else {
- /* Not a free page */
- ret = 1;
- }
- return ret;
-}
-
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
-{
- int ret = __get_any_page(page, pfn, flags);
-
- if (ret == 1 && !PageHuge(page) &&
- !PageLRU(page) && !__PageMovable(page)) {
- /*
- * Try to free it.
- */
- put_hwpoison_page(page);
- shake_page(page, 1);
-
- /*
- * Did it turn free?
- */
- ret = __get_any_page(page, pfn, 0);
- if (ret == 1 && !PageLRU(page)) {
- /* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
- pfn, page->flags, &page->flags);
- return -EIO;
- }
- }
- return ret;
-}
-
-static int soft_offline_huge_page(struct page *page, int flags)
-{
- int ret;
+ int ret = 0;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
LIST_HEAD(pagelist);
-
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
- }
- unlock_page(hpage);
-
- ret = isolate_huge_page(hpage, &pagelist);
- /*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
- */
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
- } else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
- }
- }
- return ret;
-}
-
-static int __soft_offline_page(struct page *page, int flags)
-{
- int ret;
- unsigned long pfn = page_to_pfn(page);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1761,127 +1795,77 @@
* so there's no race between soft_offline_page() and memory_failure().
*/
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
+
/*
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- if (ret == 1) {
- put_hwpoison_page(page);
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (isolate_page(hpage, &pagelist)) {
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
- ret = -EIO;
+ ret = -EBUSY;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
- int mt;
struct page *hpage = compound_head(page);
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
+ return __soft_offline_page(page);
}
-static int soft_offline_free_page(struct page *page)
+static void put_ref_page(struct page *page)
{
- int rc = dissolve_free_huge_page(page);
-
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
- return rc;
+ if (page)
+ put_page(page);
}
/**
* soft_offline_page - Soft offline a page.
- * @page: page to offline
+ * @pfn: pfn to soft-offline
* @flags: flags. Same as memory_failure().
*
* Returns 0 on success, otherwise negated errno.
@@ -1901,34 +1885,52 @@
* This is not a 100% solution for all memory, but tries to be
* ``good enough'' for the majority of memory.
*/
-int soft_offline_page(struct page *page, int flags)
+int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
- unsigned long pfn = page_to_pfn(page);
+ bool try_again = true;
+ struct page *page, *ref_page = NULL;
- if (is_zone_device_page(page)) {
- pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_page(page);
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ if (flags & MF_COUNT_INCREASED)
+ ref_page = pfn_to_page(pfn);
+
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+ page = pfn_to_online_page(pfn);
+ if (!page) {
+ put_ref_page(ref_page);
return -EIO;
}
if (PageHWPoison(page)) {
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
+ put_ref_page(ref_page);
+ return 0;
}
+retry:
get_online_mems();
- ret = get_any_page(page, pfn, flags);
+ ret = get_any_page(page, flags);
put_online_mems();
- if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
- else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (ret > 0) {
+ ret = soft_offline_in_use_page(page);
+ } else if (ret == 0) {
+ if (!page_handle_poison(page, true, false)) {
+ if (try_again) {
+ try_again = false;
+ flags &= ~MF_COUNT_INCREASED;
+ goto retry;
+ }
+ ret = -EBUSY;
+ }
+ } else if (ret == -EIO) {
+ pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
+ __func__, pfn, page->flags, &page->flags);
+ }
return ret;
}
--
Gitblit v1.6.2