From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/mm/memory-failure.c | 646 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 324 insertions(+), 322 deletions(-)

diff --git a/kernel/mm/memory-failure.c b/kernel/mm/memory-failure.c
index 3da3c63..9aa2a15 100644
--- a/kernel/mm/memory-failure.c
+++ b/kernel/mm/memory-failure.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2008, 2009 Intel Corporation
  * Authors: Andi Kleen, Fengguang Wu
- *
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 only as published by the
- * Free Software Foundation.
  *
  * High level machine check handler. Handles pages reported by the
  * hardware as being corrupted usually due to a multi-bit ECC memory or cache
@@ -67,6 +64,33 @@
 int sysctl_memory_failure_recovery __read_mostly = 1;
 
 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+	if (hugepage_or_freepage) {
+		/*
+		 * Doing this check for free pages is also fine since dissolve_free_huge_page
+		 * returns 0 for non-hugetlb pages as well.
+		 */
+		if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+			/*
+			 * We could fail to take off the target page from buddy
+			 * for example due to racy page allocation, but that's
+			 * acceptable because soft-offlined page is not broken
+			 * and if someone really wants to use it, they should
+			 * take it.
+			 */
+			return false;
+	}
+
+	SetPageHWPoison(page);
+	if (release)
+		put_page(page);
+	page_ref_inc(page);
+	num_poisoned_pages_inc();
+
+	return true;
+}
 
 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
 
@@ -213,14 +237,15 @@
 {
 	struct task_struct *t = tk->tsk;
 	short addr_lsb = tk->size_shift;
-	int ret;
+	int ret = 0;
 
-	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
-		pfn, t->comm, t->pid);
+	pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+		pfn, t->comm, t->pid);
 
-	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
-		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
-				       addr_lsb, current);
+	if (flags & MF_ACTION_REQUIRED) {
+		WARN_ON_ONCE(t != current);
+		ret = force_sig_mceerr(BUS_MCEERR_AR,
+				       (void __user *)tk->addr, addr_lsb);
 	} else {
 		/*
 		 * Don't use force here, it's convenient if the signal
@@ -306,30 +331,24 @@
 /*
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
 		       struct vm_area_struct *vma,
-		       struct list_head *to_kill,
-		       struct to_kill **tkc)
+		       struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
-	if (*tkc) {
-		tk = *tkc;
-		*tkc = NULL;
-	} else {
-		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
-		if (!tk) {
-			pr_err("Memory failure: Out of memory while machine check handling\n");
-			return;
-		}
+	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+	if (!tk) {
+		pr_err("Memory failure: Out of memory while machine check handling\n");
+		return;
 	}
+
 	tk->addr = page_address_in_vma(p, vma);
 	if (is_zone_device_page(p))
 		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
 	else
-		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+		tk->size_shift = page_shift(compound_head(p));
 
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -348,6 +367,7 @@
 		kfree(tk);
 		return;
 	}
+
 	get_task_struct(tsk);
 	tk->tsk = tsk;
 	list_add_tail(&tk->nd, to_kill);
@@ -407,9 +427,15 @@
 {
 	struct task_struct *t;
 
-	for_each_thread(tsk, t)
-		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
-			return t;
+	for_each_thread(tsk, t) {
+		if (t->flags & PF_MCE_PROCESS) {
+			if (t->flags & PF_MCE_EARLY)
+				return t;
+		} else {
+			if (sysctl_memory_failure_early_kill)
+				return t;
+		}
+	}
 	return NULL;
 }
 
@@ -418,35 +444,40 @@
  * to be signaled when some page under the process is hwpoisoned.
  * Return task_struct of the dedicated thread (main thread unless explicitly
  * specified) if the process is "early kill," and otherwise returns NULL.
+ *
+ * Note that the above is true for Action Optional case, but not for Action
+ * Required case where SIGBUS should be sent only to the current thread.
  */
 static struct task_struct *task_early_kill(struct task_struct *tsk,
 					   int force_early)
 {
-	struct task_struct *t;
 	if (!tsk->mm)
 		return NULL;
-	if (force_early)
-		return tsk;
-	t = find_early_kill_thread(tsk);
-	if (t)
-		return t;
-	if (sysctl_memory_failure_early_kill)
-		return tsk;
-	return NULL;
+	if (force_early) {
+		/*
+		 * Comparing ->mm here because current task might represent
+		 * a subthread, while tsk always points to the main thread.
+		 */
+		if (tsk->mm == current->mm)
+			return current;
+		else
+			return NULL;
+	}
+	return find_early_kill_thread(tsk);
 }
 
 /*
  * Collect processes when the error hit an anonymous page.
 */
 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
-				struct to_kill **tkc, int force_early)
+				int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct anon_vma *av;
 	pgoff_t pgoff;
 
-	av = page_lock_anon_vma_read(page);
+	av = page_lock_anon_vma_read(page, NULL);
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
@@ -464,7 +495,7 @@
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill, tkc);
+				add_to_kill(t, page, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -475,16 +506,17 @@
  * Collect processes when the error hit a file mapped page.
 */
 static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				struct to_kill **tkc, int force_early)
+				int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff;
 
 	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
+	pgoff = page_to_pgoff(page);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
@@ -499,7 +531,7 @@
 			 * to be informed of all such data corruptions.
 			 */
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill, tkc);
+				add_to_kill(t, page, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -508,26 +540,17 @@
 
 /*
  * Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
 */
 static void collect_procs(struct page *page, struct list_head *tokill,
 				int force_early)
 {
-	struct to_kill *tk;
-
 	if (!page->mapping)
 		return;
 
-	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
-	if (!tk)
-		return;
 	if (PageAnon(page))
-		collect_procs_anon(page, tokill, &tk, force_early);
+		collect_procs_anon(page, tokill, force_early);
 	else
-		collect_procs_file(page, tokill, &tk, force_early);
-	kfree(tk);
+		collect_procs_file(page, tokill, force_early);
 }
 
 static const char *action_name[] = {
@@ -559,6 +582,7 @@
 	[MF_MSG_BUDDY]			= "free buddy page",
 	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
 	[MF_MSG_DAX]			= "dax page",
+	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
 	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
@@ -829,7 +853,6 @@
 #define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
 #define unevict		(1UL << PG_unevictable)
 #define mlock		(1UL << PG_mlocked)
-#define writeback	(1UL << PG_writeback)
 #define lru		(1UL << PG_lru)
 #define head		(1UL << PG_head)
 #define slab		(1UL << PG_slab)
@@ -878,7 +901,6 @@
 #undef sc
 #undef unevict
 #undef mlock
-#undef writeback
 #undef lru
 #undef head
 #undef slab
@@ -930,7 +952,7 @@
  * Return: return 0 if failed to grab the refcount, otherwise true (some
  * non-zero value.)
 */
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
 {
 	struct page *head = compound_head(page);
 
@@ -959,7 +981,6 @@
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
 
 /*
  * Do all that is necessary to remove user space mappings. Unmap
@@ -968,10 +989,10 @@
 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 				  int flags, struct page **hpagep)
 {
-	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	enum ttu_flags ttu = TTU_IGNORE_MLOCK;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	bool unmap_success;
+	bool unmap_success = true;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
 	bool mlocked = PageMlocked(hpage);
@@ -1011,7 +1032,7 @@
 	 */
 	mapping = page_mapping(hpage);
 	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
-	    mapping_cap_writeback_dirty(mapping)) {
+	    mapping_can_writeback(mapping)) {
 		if (page_mkclean(hpage)) {
 			SetPageDirty(hpage);
 		} else {
@@ -1033,7 +1054,30 @@
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	unmap_success = try_to_unmap(hpage, ttu);
+	if (!PageHuge(hpage)) {
+		unmap_success = try_to_unmap(hpage, ttu);
+	} else {
+		if (!PageAnon(hpage)) {
+			/*
+			 * For hugetlb pages in shared mappings, try_to_unmap
+			 * could potentially call huge_pmd_unshare. Because of
+			 * this, take semaphore in write mode here and set
+			 * TTU_RMAP_LOCKED to indicate we have taken the lock
+			 * at this higher level.
+			 */
+			mapping = hugetlb_page_mapping_lock_write(hpage);
+			if (mapping) {
+				unmap_success = try_to_unmap(hpage,
+						ttu|TTU_RMAP_LOCKED);
+				i_mmap_unlock_write(mapping);
+			} else {
+				pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+				unmap_success = false;
+			}
+		} else {
+			unmap_success = try_to_unmap(hpage, ttu);
+		}
+	}
 	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
@@ -1084,6 +1128,25 @@
 	return page_action(ps, p, pfn);
 }
 
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+	lock_page(page);
+	if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+		unsigned long pfn = page_to_pfn(page);
+
+		unlock_page(page);
+		if (!PageAnon(page))
+			pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+		else
+			pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+		put_page(page);
+		return -EBUSY;
+	}
+	unlock_page(page);
+
+	return 0;
+}
+
 static int memory_failure_hugetlb(unsigned long pfn, int flags)
 {
 	struct page *p = pfn_to_page(pfn);
@@ -1125,7 +1188,7 @@
 		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
 		num_poisoned_pages_dec();
 		unlock_page(head);
-		put_hwpoison_page(head);
+		put_page(head);
 		return 0;
 	}
 
@@ -1166,6 +1229,19 @@
 	LIST_HEAD(tokill);
 	int rc = -EBUSY;
 	loff_t start;
+	dax_entry_t cookie;
+
+	if (flags & MF_COUNT_INCREASED)
+		/*
+		 * Drop the extra refcount in case we come from madvise().
+		 */
+		put_page(page);
+
+	/* device metadata space is not recoverable */
+	if (!pgmap_pfn_valid(pgmap, pfn)) {
+		rc = -ENXIO;
+		goto out;
+	}
 
 	/*
 	 * Prevent the inode from being freed while we are interrogating
@@ -1174,7 +1250,8 @@
 	 * also prevents changes to the mapping of this pfn until
 	 * poison signaling is complete.
 	 */
-	if (!dax_lock_mapping_entry(page))
+	cookie = dax_lock_page(page);
+	if (!cookie)
 		goto out;
 
 	if (hwpoison_filter(page)) {
@@ -1182,16 +1259,12 @@
 		goto unlock;
 	}
 
-	switch (pgmap->type) {
-	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		/*
 		 * TODO: Handle HMM pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
-	default:
-		break;
 	}
 
 	/*
@@ -1225,7 +1298,7 @@
 	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
 	rc = 0;
 unlock:
-	dax_unlock_mapping_entry(page);
+	dax_unlock_page(page, cookie);
 out:
 	/* drop pgmap ref acquired in caller */
 	put_dev_pagemap(pgmap);
@@ -1308,23 +1381,11 @@
 	}
 
 	if (PageTransHuge(hpage)) {
-		lock_page(p);
-		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
-			unlock_page(p);
-			if (!PageAnon(p))
-				pr_err("Memory failure: %#lx: non anonymous thp\n",
-					pfn);
-			else
-				pr_err("Memory failure: %#lx: thp split failed\n",
-					pfn);
-			if (TestClearPageHWPoison(p))
-				num_poisoned_pages_dec();
-			put_hwpoison_page(p);
+		if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
 			return -EBUSY;
 		}
-		unlock_page(p);
 		VM_BUG_ON_PAGE(!page_count(p), p);
-		hpage = compound_head(p);
 	}
 
 	/*
@@ -1364,10 +1425,7 @@
 	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
 	 * correctly, we save a copy of the page flags at this time.
 	 */
-	if (PageHuge(p))
-		page_flags = hpage->flags;
-	else
-		page_flags = p->flags;
+	page_flags = p->flags;
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
@@ -1376,14 +1434,14 @@
 		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
 		num_poisoned_pages_dec();
 		unlock_page(p);
-		put_hwpoison_page(p);
+		put_page(p);
 		return 0;
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
 			num_poisoned_pages_dec();
 		unlock_page(p);
-		put_hwpoison_page(p);
+		put_page(p);
 		return 0;
 	}
 
@@ -1404,11 +1462,8 @@
 	/*
 	 * Now take care of user space mappings.
 	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
-	 *
-	 * When the raw error page is thp tail page, hpage points to the raw
-	 * page after thp split.
 	 */
-	if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+	if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
@@ -1492,7 +1547,7 @@
 	unsigned long proc_flags;
 	int gotten;
 
-	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+	mf_cpu = container_of(work, struct memory_failure_cpu, work);
 	for (;;) {
 		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
 		gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1500,10 +1555,23 @@
 		if (!gotten)
 			break;
 		if (entry.flags & MF_SOFT_OFFLINE)
-			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+			soft_offline_page(entry.pfn, entry.flags);
 		else
 			memory_failure(entry.pfn, entry.flags);
 	}
+}
+
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+	struct memory_failure_cpu *mf_cpu;
+
+	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+	cancel_work_sync(&mf_cpu->work);
+	memory_failure_work_func(&mf_cpu->work);
 }
 
 static int __init memory_failure_init(void)
@@ -1612,147 +1680,113 @@
 	}
 	unlock_page(page);
 
-	put_hwpoison_page(page);
+	put_page(page);
 	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
-		put_hwpoison_page(page);
+		put_page(page);
 
 	return 0;
 }
 EXPORT_SYMBOL(unpoison_memory);
 
-static struct page *new_page(struct page *p, unsigned long private)
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we
+ * cannot handle and -EBUSY if we raced with an allocation.
+ * We only increment the refcount if the page is already in use and it is
+ * a known type we can handle.
+ */
+static int get_any_page(struct page *p, int flags)
 {
-	int nid = page_to_nid(p);
+	int ret = 0, pass = 0;
+	bool count_increased = false;
 
-	return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
+	if (flags & MF_COUNT_INCREASED)
+		count_increased = true;
+
+try_again:
+	if (!count_increased && !get_hwpoison_page(p)) {
+		if (page_count(p)) {
+			/* We raced with an allocation, retry. */
+			if (pass++ < 3)
+				goto try_again;
+			ret = -EBUSY;
+		} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+			/* We raced with put_page, retry. */
+			if (pass++ < 3)
+				goto try_again;
+			ret = -EIO;
+		}
+	} else {
+		if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+			ret = 1;
+		} else {
+			/*
+			 * A page we cannot handle. Check whether we can turn
+			 * it into something we can handle.
+			 */
+			if (pass++ < 3) {
+				put_page(p);
+				shake_page(p, 1);
+				count_increased = false;
+				goto try_again;
+			}
+			put_page(p);
+			ret = -EIO;
+		}
+	}
+
+	return ret;
+}
+
+static bool isolate_page(struct page *page, struct list_head *pagelist)
+{
+	bool isolated = false;
+	bool lru = PageLRU(page);
+
+	if (PageHuge(page)) {
+		isolated = !isolate_hugetlb(page, pagelist);
+	} else {
+		if (lru)
+			isolated = !isolate_lru_page(page);
+		else
+			isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+		if (isolated)
+			list_add(&page->lru, pagelist);
+	}
+
+	if (isolated && lru)
+		inc_node_page_state(page, NR_ISOLATED_ANON +
+				    page_is_file_lru(page));
+
+	/*
+	 * If we succeed in isolating the page, we grabbed another refcount on
+	 * the page, so we can safely drop the one we got from get_any_page().
+	 * If we failed to isolate the page, it means that we cannot go further
+	 * and we will return an error, so drop the reference we got from
+	 * get_any_page() as well.
+	 */
+	put_page(page);
+	return isolated;
 }
 
 /*
- * Safely get reference count of an arbitrary page.
- * Returns 0 for a free page, -EIO for a zero refcount page
- * that is not free, and 1 for any other page type.
- * For 1 the page is returned with increased page count, otherwise not.
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates it.
+ * If the page is mapped, it migrates the contents over.
 */
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __soft_offline_page(struct page *page)
 {
-	int ret;
-
-	if (flags & MF_COUNT_INCREASED)
-		return 1;
-
-	/*
-	 * When the target page is a free hugepage, just remove it
-	 * from free hugepage list.
-	 */
-	if (!get_hwpoison_page(p)) {
-		if (PageHuge(p)) {
-			pr_info("%s: %#lx free huge page\n", __func__, pfn);
-			ret = 0;
-		} else if (is_free_buddy_page(p)) {
-			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
-			ret = 0;
-		} else {
-			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
-				__func__, pfn, p->flags);
-			ret = -EIO;
-		}
-	} else {
-		/* Not a free page */
-		ret = 1;
-	}
-	return ret;
-}
-
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
-{
-	int ret = __get_any_page(page, pfn, flags);
-
-	if (ret == 1 && !PageHuge(page) &&
-	    !PageLRU(page) && !__PageMovable(page)) {
-		/*
-		 * Try to free it.
-		 */
-		put_hwpoison_page(page);
-		shake_page(page, 1);
-
-		/*
-		 * Did it turn free?
-		 */
-		ret = __get_any_page(page, pfn, 0);
-		if (ret == 1 && !PageLRU(page)) {
-			/* Drop page reference which is from __get_any_page() */
-			put_hwpoison_page(page);
-			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
-				pfn, page->flags, &page->flags);
-			return -EIO;
-		}
-	}
-	return ret;
-}
-
-static int soft_offline_huge_page(struct page *page, int flags)
-{
-	int ret;
+	int ret = 0;
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_head(page);
+	char const *msg_page[] = {"page", "hugepage"};
+	bool huge = PageHuge(page);
 	LIST_HEAD(pagelist);
-
-	/*
-	 * This double-check of PageHWPoison is to avoid the race with
-	 * memory_failure(). See also comment in __soft_offline_page().
-	 */
-	lock_page(hpage);
-	if (PageHWPoison(hpage)) {
-		unlock_page(hpage);
-		put_hwpoison_page(hpage);
-		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
-		return -EBUSY;
-	}
-	unlock_page(hpage);
-
-	ret = isolate_huge_page(hpage, &pagelist);
-	/*
-	 * get_any_page() and isolate_huge_page() takes a refcount each,
-	 * so need to drop one here.
-	 */
-	put_hwpoison_page(hpage);
-	if (!ret) {
-		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
-		return -EBUSY;
-	}
-
-	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
-				MIGRATE_SYNC, MR_MEMORY_FAILURE);
-	if (ret) {
-		pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
-			pfn, ret, page->flags, &page->flags);
-		if (!list_empty(&pagelist))
-			putback_movable_pages(&pagelist);
-		if (ret > 0)
-			ret = -EIO;
-	} else {
-		/*
-		 * We set PG_hwpoison only when the migration source hugepage
-		 * was successfully dissolved, because otherwise hwpoisoned
-		 * hugepage remains on free hugepage list, then userspace will
-		 * find it as SIGBUS by allocation failure. That's not expected
-		 * in soft-offlining.
-		 */
-		ret = dissolve_free_huge_page(page);
-		if (!ret) {
-			if (set_hwpoison_free_buddy_page(page))
-				num_poisoned_pages_inc();
-			else
-				ret = -EBUSY;
-		}
-	}
-	return ret;
-}
-
-static int __soft_offline_page(struct page *page, int flags)
-{
-	int ret;
-	unsigned long pfn = page_to_pfn(page);
+	struct migration_target_control mtc = {
+		.nid = NUMA_NO_NODE,
+		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+	};
 
 	/*
 	 * Check PageHWPoison again inside page lock because PageHWPoison
@@ -1761,127 +1795,77 @@
 	 * so there's no race between soft_offline_page() and memory_failure().
 	 */
 	lock_page(page);
-	wait_on_page_writeback(page);
+	if (!PageHuge(page))
+		wait_on_page_writeback(page);
 	if (PageHWPoison(page)) {
 		unlock_page(page);
-		put_hwpoison_page(page);
+		put_page(page);
 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		return -EBUSY;
+		return 0;
 	}
-	/*
-	 * Try to invalidate first. This should work for
-	 * non dirty unmapped page cache pages.
-	 */
-	ret = invalidate_inode_page(page);
+
+	if (!PageHuge(page))
+		/*
+		 * Try to invalidate first. This should work for
+		 * non dirty unmapped page cache pages.
		 */
+		ret = invalidate_inode_page(page);
 	unlock_page(page);
+
 	/*
 	 * RED-PEN would be better to keep it isolated here, but we
 	 * would need to fix isolation locking first.
 	 */
-	if (ret == 1) {
-		put_hwpoison_page(page);
+	if (ret) {
 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
-		SetPageHWPoison(page);
-		num_poisoned_pages_inc();
+		page_handle_poison(page, false, true);
 		return 0;
 	}
 
-	/*
-	 * Simple invalidation didn't work.
-	 * Try to migrate to a new page instead. migrate.c
-	 * handles a large number of cases for us.
-	 */
-	if (PageLRU(page))
-		ret = isolate_lru_page(page);
-	else
-		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
-	/*
-	 * Drop page reference which is came from get_any_page()
-	 * successful isolate_lru_page() already took another one.
-	 */
-	put_hwpoison_page(page);
-	if (!ret) {
-		LIST_HEAD(pagelist);
-		/*
-		 * After isolated lru page, the PageLRU will be cleared,
-		 * so use !__PageMovable instead for LRU page's mapping
-		 * cannot have PAGE_MAPPING_MOVABLE.
-		 */
-		if (!__PageMovable(page))
-			inc_node_page_state(page, NR_ISOLATED_ANON +
-						page_is_file_cache(page));
-		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
-					MIGRATE_SYNC, MR_MEMORY_FAILURE);
-		if (ret) {
+	if (isolate_page(hpage, &pagelist)) {
+		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+		if (!ret) {
+			bool release = !huge;
+
+			if (!page_handle_poison(page, huge, release))
+				ret = -EBUSY;
+		} else {
 			if (!list_empty(&pagelist))
 				putback_movable_pages(&pagelist);
-			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
-				pfn, ret, page->flags, &page->flags);
+			pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+				pfn, msg_page[huge], ret, page->flags, &page->flags);
 			if (ret > 0)
-				ret = -EIO;
+				ret = -EBUSY;
 		}
 	} else {
-		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
-			pfn, ret, page_count(page), page->flags, &page->flags);
+		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
+			pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
+		ret = -EBUSY;
 	}
 	return ret;
 }
 
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
 {
-	int ret;
-	int mt;
 	struct page *hpage = compound_head(page);
 
-	if (!PageHuge(page) && PageTransHuge(hpage)) {
-		lock_page(page);
-		if (!PageAnon(page) || unlikely(split_huge_page(page))) {
-			unlock_page(page);
-			if (!PageAnon(page))
-				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
-			else
-				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
-			put_hwpoison_page(page);
+	if (!PageHuge(page) && PageTransHuge(hpage))
+		if (try_to_split_thp_page(page, "soft offline") < 0)
 			return -EBUSY;
-		}
-		unlock_page(page);
-	}
-
-	/*
-	 * Setting MIGRATE_ISOLATE here ensures that the page will be linked
-	 * to free list immediately (not via pcplist) when released after
-	 * successful page migration. Otherwise we can't guarantee that the
-	 * page is really free after put_page() returns, so
-	 * set_hwpoison_free_buddy_page() highly likely fails.
-	 */
-	mt = get_pageblock_migratetype(page);
-	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	if (PageHuge(page))
-		ret = soft_offline_huge_page(page, flags);
-	else
-		ret = __soft_offline_page(page, flags);
-	set_pageblock_migratetype(page, mt);
-	return ret;
+	return __soft_offline_page(page);
 }
 
-static int soft_offline_free_page(struct page *page)
+static void put_ref_page(struct page *page)
 {
-	int rc = dissolve_free_huge_page(page);
-
-	if (!rc) {
-		if (set_hwpoison_free_buddy_page(page))
-			num_poisoned_pages_inc();
-		else
-			rc = -EBUSY;
-	}
-	return rc;
+	if (page)
+		put_page(page);
 }
 
 /**
 * soft_offline_page - Soft offline a page.
- * @page: page to offline
+ * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
@@ -1901,34 +1885,52 @@
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
-int soft_offline_page(struct page *page, int flags)
+int soft_offline_page(unsigned long pfn, int flags)
 {
 	int ret;
-	unsigned long pfn = page_to_pfn(page);
+	bool try_again = true;
+	struct page *page, *ref_page = NULL;
 
-	if (is_zone_device_page(page)) {
-		pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
-				pfn);
-		if (flags & MF_COUNT_INCREASED)
-			put_page(page);
+	WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+
+	if (!pfn_valid(pfn))
+		return -ENXIO;
+	if (flags & MF_COUNT_INCREASED)
+		ref_page = pfn_to_page(pfn);
+
+	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+	page = pfn_to_online_page(pfn);
+	if (!page) {
+		put_ref_page(ref_page);
 		return -EIO;
 	}
 
 	if (PageHWPoison(page)) {
-		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		if (flags & MF_COUNT_INCREASED)
-			put_hwpoison_page(page);
-		return -EBUSY;
+		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
+		put_ref_page(ref_page);
+		return 0;
 	}
 
+retry:
 	get_online_mems();
-	ret = get_any_page(page, pfn, flags);
+	ret = get_any_page(page, flags);
 	put_online_mems();
 
-	if (ret > 0)
-		ret = soft_offline_in_use_page(page, flags);
-	else if (ret == 0)
-		ret = soft_offline_free_page(page);
+	if (ret > 0) {
+		ret = soft_offline_in_use_page(page);
+	} else if (ret == 0) {
+		if (!page_handle_poison(page, true, false)) {
+			if (try_again) {
+				try_again = false;
+				flags &= ~MF_COUNT_INCREASED;
+				goto retry;
+			}
+			ret = -EBUSY;
+		}
+	} else if (ret == -EIO) {
+		pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
+			__func__, pfn, page->flags, &page->flags);
	}
 
 	return ret;
 }
-- 
Gitblit v1.6.2
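
Reviewer note, not part of the patch: the reworked entry point now takes a pfn
(soft_offline_page(unsigned long pfn, int flags)) and returns 0 rather than
-EBUSY when the target is already poisoned. A minimal user-space sketch for
exercising this path is below. It is illustrative only: it assumes a kernel
built with CONFIG_MEMORY_FAILURE=y and root privileges, since
madvise(MADV_SOFT_OFFLINE) is the standard injection hook that ends up in
soft_offline_page() with MF_COUNT_INCREASED set. The file name and expected
log lines are assumptions, not guarantees.

/*
 * soft_offline_demo.c - exercise the soft-offline path from user space.
 * Build: cc -o soft_offline_demo soft_offline_demo.c
 * Run as root on a kernel with CONFIG_MEMORY_FAILURE=y.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from the kernel UAPI headers */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *buf = NULL;

	if (posix_memalign(&buf, psz, psz))
		return 1;
	/* Touch the page so it is backed by a real, in-use physical page. */
	memset(buf, 0xa5, psz);

	/*
	 * The kernel migrates the contents to a new physical page and marks
	 * the old one HWPoison; the virtual mapping stays usable throughout.
	 */
	if (madvise(buf, psz, MADV_SOFT_OFFLINE)) {
		perror("madvise(MADV_SOFT_OFFLINE)");
		return 1;
	}

	/* The data must have survived the migration. */
	printf("first byte after soft offline: 0x%02x\n",
	       *(unsigned char *)buf);
	free(buf);
	return 0;
}

On success, HardwareCorrupted in /proc/meminfo should grow by one page worth
of kB, and with this patch a repeated soft-offline of the same, already
poisoned pfn (for instance through the soft_offline_page sysfs hook) reports
success instead of failing with EBUSY.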