2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memory-failure.c
@@ -1,10 +1,7 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Copyright (C) 2008, 2009 Intel Corporation
34 * Authors: Andi Kleen, Fengguang Wu
4
- *
5
- * This software may be redistributed and/or modified under the terms of
6
- * the GNU General Public License ("GPL") version 2 only as published by the
7
- * Free Software Foundation.
85 *
96 * High level machine check handler. Handles pages reported by the
107 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
@@ -67,6 +64,33 @@
6764 int sysctl_memory_failure_recovery __read_mostly = 1;
6865
6966 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
67
+
68
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
69
+{
70
+ if (hugepage_or_freepage) {
71
+ /*
72
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
73
+ * returns 0 for non-hugetlb pages as well.
74
+ */
75
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
76
+ /*
77
+ * We could fail to take off the target page from buddy
78
+ * for example due to racy page allocation, but that's
79
+ * acceptable because soft-offlined page is not broken
80
+ * and if someone really wants to use it, they should
81
+ * take it.
82
+ */
83
+ return false;
84
+ }
85
+
86
+ SetPageHWPoison(page);
87
+ if (release)
88
+ put_page(page);
89
+ page_ref_inc(page);
90
+ num_poisoned_pages_inc();
91
+
92
+ return true;
93
+}
7094
7195 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
7296
@@ -213,14 +237,15 @@
213237 {
214238 struct task_struct *t = tk->tsk;
215239 short addr_lsb = tk->size_shift;
216
- int ret;
240
+ int ret = 0;
217241
218
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
219
- pfn, t->comm, t->pid);
242
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
243
+ pfn, t->comm, t->pid);
220244
221
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
222
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
223
- addr_lsb, current);
245
+ if (flags & MF_ACTION_REQUIRED) {
246
+ WARN_ON_ONCE(t != current);
247
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
248
+ (void __user *)tk->addr, addr_lsb);
224249 } else {
225250 /*
226251 * Don't use force here, it's convenient if the signal
@@ -306,30 +331,24 @@
306331 /*
307332 * Schedule a process for later kill.
308333 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
309
- * TBD would GFP_NOIO be enough?
310334 */
311335 static void add_to_kill(struct task_struct *tsk, struct page *p,
312336 struct vm_area_struct *vma,
313
- struct list_head *to_kill,
314
- struct to_kill **tkc)
337
+ struct list_head *to_kill)
315338 {
316339 struct to_kill *tk;
317340
318
- if (*tkc) {
319
- tk = *tkc;
320
- *tkc = NULL;
321
- } else {
322
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
323
- if (!tk) {
324
- pr_err("Memory failure: Out of memory while machine check handling\n");
325
- return;
326
- }
341
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
342
+ if (!tk) {
343
+ pr_err("Memory failure: Out of memory while machine check handling\n");
344
+ return;
327345 }
346
+
328347 tk->addr = page_address_in_vma(p, vma);
329348 if (is_zone_device_page(p))
330349 tk->size_shift = dev_pagemap_mapping_shift(p, vma);
331350 else
332
- tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
351
+ tk->size_shift = page_shift(compound_head(p));
333352
334353 /*
335354 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -348,6 +367,7 @@
348367 kfree(tk);
349368 return;
350369 }
370
+
351371 get_task_struct(tsk);
352372 tk->tsk = tsk;
353373 list_add_tail(&tk->nd, to_kill);
@@ -407,9 +427,15 @@
407427 {
408428 struct task_struct *t;
409429
410
- for_each_thread(tsk, t)
411
- if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
412
- return t;
430
+ for_each_thread(tsk, t) {
431
+ if (t->flags & PF_MCE_PROCESS) {
432
+ if (t->flags & PF_MCE_EARLY)
433
+ return t;
434
+ } else {
435
+ if (sysctl_memory_failure_early_kill)
436
+ return t;
437
+ }
438
+ }
413439 return NULL;
414440 }
415441
@@ -418,35 +444,40 @@
418444 * to be signaled when some page under the process is hwpoisoned.
419445 * Return task_struct of the dedicated thread (main thread unless explicitly
420446 * specified) if the process is "early kill," and otherwise returns NULL.
447
+ *
448
+ * Note that the above is true for Action Optional case, but not for Action
449
+ * Required case where SIGBUS should be sent only to the current thread.
421450 */
422451 static struct task_struct *task_early_kill(struct task_struct *tsk,
423452 int force_early)
424453 {
425
- struct task_struct *t;
426454 if (!tsk->mm)
427455 return NULL;
428
- if (force_early)
429
- return tsk;
430
- t = find_early_kill_thread(tsk);
431
- if (t)
432
- return t;
433
- if (sysctl_memory_failure_early_kill)
434
- return tsk;
435
- return NULL;
456
+ if (force_early) {
457
+ /*
458
+ * Comparing ->mm here because current task might represent
459
+ * a subthread, while tsk always points to the main thread.
460
+ */
461
+ if (tsk->mm == current->mm)
462
+ return current;
463
+ else
464
+ return NULL;
465
+ }
466
+ return find_early_kill_thread(tsk);
436467 }
437468
438469 /*
439470 * Collect processes when the error hit an anonymous page.
440471 */
441472 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
442
- struct to_kill **tkc, int force_early)
473
+ int force_early)
443474 {
444475 struct vm_area_struct *vma;
445476 struct task_struct *tsk;
446477 struct anon_vma *av;
447478 pgoff_t pgoff;
448479
449
- av = page_lock_anon_vma_read(page);
480
+ av = page_lock_anon_vma_read(page, NULL);
450481 if (av == NULL) /* Not actually mapped anymore */
451482 return;
452483
@@ -464,7 +495,7 @@
464495 if (!page_mapped_in_vma(page, vma))
465496 continue;
466497 if (vma->vm_mm == t->mm)
467
- add_to_kill(t, page, vma, to_kill, tkc);
498
+ add_to_kill(t, page, vma, to_kill);
468499 }
469500 }
470501 read_unlock(&tasklist_lock);
@@ -475,16 +506,17 @@
475506 * Collect processes when the error hit a file mapped page.
476507 */
477508 static void collect_procs_file(struct page *page, struct list_head *to_kill,
478
- struct to_kill **tkc, int force_early)
509
+ int force_early)
479510 {
480511 struct vm_area_struct *vma;
481512 struct task_struct *tsk;
482513 struct address_space *mapping = page->mapping;
514
+ pgoff_t pgoff;
483515
484516 i_mmap_lock_read(mapping);
485517 read_lock(&tasklist_lock);
518
+ pgoff = page_to_pgoff(page);
486519 for_each_process(tsk) {
487
- pgoff_t pgoff = page_to_pgoff(page);
488520 struct task_struct *t = task_early_kill(tsk, force_early);
489521
490522 if (!t)
@@ -499,7 +531,7 @@
499531 * to be informed of all such data corruptions.
500532 */
501533 if (vma->vm_mm == t->mm)
502
- add_to_kill(t, page, vma, to_kill, tkc);
534
+ add_to_kill(t, page, vma, to_kill);
503535 }
504536 }
505537 read_unlock(&tasklist_lock);
@@ -508,26 +540,17 @@
508540
509541 /*
510542 * Collect the processes who have the corrupted page mapped to kill.
511
- * This is done in two steps for locking reasons.
512
- * First preallocate one tokill structure outside the spin locks,
513
- * so that we can kill at least one process reasonably reliable.
514543 */
515544 static void collect_procs(struct page *page, struct list_head *tokill,
516545 int force_early)
517546 {
518
- struct to_kill *tk;
519
-
520547 if (!page->mapping)
521548 return;
522549
523
- tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
524
- if (!tk)
525
- return;
526550 if (PageAnon(page))
527
- collect_procs_anon(page, tokill, &tk, force_early);
551
+ collect_procs_anon(page, tokill, force_early);
528552 else
529
- collect_procs_file(page, tokill, &tk, force_early);
530
- kfree(tk);
553
+ collect_procs_file(page, tokill, force_early);
531554 }
532555
533556 static const char *action_name[] = {
@@ -559,6 +582,7 @@
559582 [MF_MSG_BUDDY] = "free buddy page",
560583 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
561584 [MF_MSG_DAX] = "dax page",
585
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
562586 [MF_MSG_UNKNOWN] = "unknown page",
563587 };
564588
@@ -829,7 +853,6 @@
829853 #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
830854 #define unevict (1UL << PG_unevictable)
831855 #define mlock (1UL << PG_mlocked)
832
-#define writeback (1UL << PG_writeback)
833856 #define lru (1UL << PG_lru)
834857 #define head (1UL << PG_head)
835858 #define slab (1UL << PG_slab)
@@ -878,7 +901,6 @@
878901 #undef sc
879902 #undef unevict
880903 #undef mlock
881
-#undef writeback
882904 #undef lru
883905 #undef head
884906 #undef slab
@@ -930,7 +952,7 @@
930952 * Return: return 0 if failed to grab the refcount, otherwise true (some
931953 * non-zero value.)
932954 */
933
-int get_hwpoison_page(struct page *page)
955
+static int get_hwpoison_page(struct page *page)
934956 {
935957 struct page *head = compound_head(page);
936958
@@ -959,7 +981,6 @@
959981
960982 return 0;
961983 }
962
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
963984
964985 /*
965986 * Do all that is necessary to remove user space mappings. Unmap
....@@ -968,10 +989,10 @@
968989 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
969990 int flags, struct page **hpagep)
970991 {
971
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
992
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK;
972993 struct address_space *mapping;
973994 LIST_HEAD(tokill);
974
- bool unmap_success;
995
+ bool unmap_success = true;
975996 int kill = 1, forcekill;
976997 struct page *hpage = *hpagep;
977998 bool mlocked = PageMlocked(hpage);
@@ -1011,7 +1032,7 @@
10111032 */
10121033 mapping = page_mapping(hpage);
10131034 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
1014
- mapping_cap_writeback_dirty(mapping)) {
1035
+ mapping_can_writeback(mapping)) {
10151036 if (page_mkclean(hpage)) {
10161037 SetPageDirty(hpage);
10171038 } else {
@@ -1033,7 +1054,30 @@
10331054 if (kill)
10341055 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
10351056
1036
- unmap_success = try_to_unmap(hpage, ttu);
1057
+ if (!PageHuge(hpage)) {
1058
+ unmap_success = try_to_unmap(hpage, ttu);
1059
+ } else {
1060
+ if (!PageAnon(hpage)) {
1061
+ /*
1062
+ * For hugetlb pages in shared mappings, try_to_unmap
1063
+ * could potentially call huge_pmd_unshare. Because of
1064
+ * this, take semaphore in write mode here and set
1065
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
1066
+ * at this higher level.
1067
+ */
1068
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1069
+ if (mapping) {
1070
+ unmap_success = try_to_unmap(hpage,
1071
+ ttu|TTU_RMAP_LOCKED);
1072
+ i_mmap_unlock_write(mapping);
1073
+ } else {
1074
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
1075
+ unmap_success = false;
1076
+ }
1077
+ } else {
1078
+ unmap_success = try_to_unmap(hpage, ttu);
1079
+ }
1080
+ }
10371081 if (!unmap_success)
10381082 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
10391083 pfn, page_mapcount(hpage));
@@ -1084,6 +1128,25 @@
10841128 return page_action(ps, p, pfn);
10851129 }
10861130
1131
+static int try_to_split_thp_page(struct page *page, const char *msg)
1132
+{
1133
+ lock_page(page);
1134
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1135
+ unsigned long pfn = page_to_pfn(page);
1136
+
1137
+ unlock_page(page);
1138
+ if (!PageAnon(page))
1139
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
1140
+ else
1141
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
1142
+ put_page(page);
1143
+ return -EBUSY;
1144
+ }
1145
+ unlock_page(page);
1146
+
1147
+ return 0;
1148
+}
1149
+
10871150 static int memory_failure_hugetlb(unsigned long pfn, int flags)
10881151 {
10891152 struct page *p = pfn_to_page(pfn);
@@ -1125,7 +1188,7 @@
11251188 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
11261189 num_poisoned_pages_dec();
11271190 unlock_page(head);
1128
- put_hwpoison_page(head);
1191
+ put_page(head);
11291192 return 0;
11301193 }
11311194
@@ -1166,6 +1229,19 @@
11661229 LIST_HEAD(tokill);
11671230 int rc = -EBUSY;
11681231 loff_t start;
1232
+ dax_entry_t cookie;
1233
+
1234
+ if (flags & MF_COUNT_INCREASED)
1235
+ /*
1236
+ * Drop the extra refcount in case we come from madvise().
1237
+ */
1238
+ put_page(page);
1239
+
1240
+ /* device metadata space is not recoverable */
1241
+ if (!pgmap_pfn_valid(pgmap, pfn)) {
1242
+ rc = -ENXIO;
1243
+ goto out;
1244
+ }
11691245
11701246 /*
11711247 * Prevent the inode from being freed while we are interrogating
@@ -1174,7 +1250,8 @@
11741250 * also prevents changes to the mapping of this pfn until
11751251 * poison signaling is complete.
11761252 */
1177
- if (!dax_lock_mapping_entry(page))
1253
+ cookie = dax_lock_page(page);
1254
+ if (!cookie)
11781255 goto out;
11791256
11801257 if (hwpoison_filter(page)) {
@@ -1182,16 +1259,12 @@
11821259 goto unlock;
11831260 }
11841261
1185
- switch (pgmap->type) {
1186
- case MEMORY_DEVICE_PRIVATE:
1187
- case MEMORY_DEVICE_PUBLIC:
1262
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
11881263 /*
11891264 * TODO: Handle HMM pages which may need coordination
11901265 * with device-side memory.
11911266 */
11921267 goto unlock;
1193
- default:
1194
- break;
11951268 }
11961269
11971270 /*
@@ -1225,7 +1298,7 @@
12251298 kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
12261299 rc = 0;
12271300 unlock:
1228
- dax_unlock_mapping_entry(page);
1301
+ dax_unlock_page(page, cookie);
12291302 out:
12301303 /* drop pgmap ref acquired in caller */
12311304 put_dev_pagemap(pgmap);
@@ -1308,23 +1381,11 @@
13081381 }
13091382
13101383 if (PageTransHuge(hpage)) {
1311
- lock_page(p);
1312
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
1313
- unlock_page(p);
1314
- if (!PageAnon(p))
1315
- pr_err("Memory failure: %#lx: non anonymous thp\n",
1316
- pfn);
1317
- else
1318
- pr_err("Memory failure: %#lx: thp split failed\n",
1319
- pfn);
1320
- if (TestClearPageHWPoison(p))
1321
- num_poisoned_pages_dec();
1322
- put_hwpoison_page(p);
1384
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
1385
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
13231386 return -EBUSY;
13241387 }
1325
- unlock_page(p);
13261388 VM_BUG_ON_PAGE(!page_count(p), p);
1327
- hpage = compound_head(p);
13281389 }
13291390
13301391 /*
@@ -1364,10 +1425,7 @@
13641425 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
13651426 * correctly, we save a copy of the page flags at this time.
13661427 */
1367
- if (PageHuge(p))
1368
- page_flags = hpage->flags;
1369
- else
1370
- page_flags = p->flags;
1428
+ page_flags = p->flags;
13711429
13721430 /*
13731431 * unpoison always clear PG_hwpoison inside page lock
@@ -1376,14 +1434,14 @@
13761434 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
13771435 num_poisoned_pages_dec();
13781436 unlock_page(p);
1379
- put_hwpoison_page(p);
1437
+ put_page(p);
13801438 return 0;
13811439 }
13821440 if (hwpoison_filter(p)) {
13831441 if (TestClearPageHWPoison(p))
13841442 num_poisoned_pages_dec();
13851443 unlock_page(p);
1386
- put_hwpoison_page(p);
1444
+ put_page(p);
13871445 return 0;
13881446 }
13891447
@@ -1404,11 +1462,8 @@
14041462 /*
14051463 * Now take care of user space mappings.
14061464 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1407
- *
1408
- * When the raw error page is thp tail page, hpage points to the raw
1409
- * page after thp split.
14101465 */
1411
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1466
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
14121467 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
14131468 res = -EBUSY;
14141469 goto out;
@@ -1492,7 +1547,7 @@
14921547 unsigned long proc_flags;
14931548 int gotten;
14941549
1495
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1550
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
14961551 for (;;) {
14971552 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
14981553 gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1500,10 +1555,23 @@
15001555 if (!gotten)
15011556 break;
15021557 if (entry.flags & MF_SOFT_OFFLINE)
1503
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1558
+ soft_offline_page(entry.pfn, entry.flags);
15041559 else
15051560 memory_failure(entry.pfn, entry.flags);
15061561 }
1562
+}
1563
+
1564
+/*
1565
+ * Process memory_failure work queued on the specified CPU.
1566
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
1567
+ */
1568
+void memory_failure_queue_kick(int cpu)
1569
+{
1570
+ struct memory_failure_cpu *mf_cpu;
1571
+
1572
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1573
+ cancel_work_sync(&mf_cpu->work);
1574
+ memory_failure_work_func(&mf_cpu->work);
15071575 }
15081576
15091577 static int __init memory_failure_init(void)
@@ -1612,147 +1680,113 @@
16121680 }
16131681 unlock_page(page);
16141682
1615
- put_hwpoison_page(page);
1683
+ put_page(page);
16161684 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1617
- put_hwpoison_page(page);
1685
+ put_page(page);
16181686
16191687 return 0;
16201688 }
16211689 EXPORT_SYMBOL(unpoison_memory);
16221690
1623
-static struct page *new_page(struct page *p, unsigned long private)
1691
+/*
1692
+ * Safely get reference count of an arbitrary page.
1693
+ * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we
1694
+ * cannot handle and -EBUSY if we raced with an allocation.
1695
+ * The refcount is only incremented if the page was already in use and is of
1696
+ * a known type we can handle.
1697
+ */
1698
+static int get_any_page(struct page *p, int flags)
16241699 {
1625
- int nid = page_to_nid(p);
1700
+ int ret = 0, pass = 0;
1701
+ bool count_increased = false;
16261702
1627
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
1703
+ if (flags & MF_COUNT_INCREASED)
1704
+ count_increased = true;
1705
+
1706
+try_again:
1707
+ if (!count_increased && !get_hwpoison_page(p)) {
1708
+ if (page_count(p)) {
1709
+ /* We raced with an allocation, retry. */
1710
+ if (pass++ < 3)
1711
+ goto try_again;
1712
+ ret = -EBUSY;
1713
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1714
+ /* We raced with put_page, retry. */
1715
+ if (pass++ < 3)
1716
+ goto try_again;
1717
+ ret = -EIO;
1718
+ }
1719
+ } else {
1720
+ if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
1721
+ ret = 1;
1722
+ } else {
1723
+ /*
1724
+ * A page we cannot handle. Check whether we can turn
1725
+ * it into something we can handle.
1726
+ */
1727
+ if (pass++ < 3) {
1728
+ put_page(p);
1729
+ shake_page(p, 1);
1730
+ count_increased = false;
1731
+ goto try_again;
1732
+ }
1733
+ put_page(p);
1734
+ ret = -EIO;
1735
+ }
1736
+ }
1737
+
1738
+ return ret;
1739
+}
1740
+
1741
+static bool isolate_page(struct page *page, struct list_head *pagelist)
1742
+{
1743
+ bool isolated = false;
1744
+ bool lru = PageLRU(page);
1745
+
1746
+ if (PageHuge(page)) {
1747
+ isolated = !isolate_hugetlb(page, pagelist);
1748
+ } else {
1749
+ if (lru)
1750
+ isolated = !isolate_lru_page(page);
1751
+ else
1752
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1753
+
1754
+ if (isolated)
1755
+ list_add(&page->lru, pagelist);
1756
+ }
1757
+
1758
+ if (isolated && lru)
1759
+ inc_node_page_state(page, NR_ISOLATED_ANON +
1760
+ page_is_file_lru(page));
1761
+
1762
+ /*
1763
+ * If we succeed in isolating the page, we grabbed another refcount on
1764
+ * the page, so we can safely drop the one we got from get_any_page().
1765
+ * If we failed to isolate the page, it means that we cannot go further
1766
+ * and we will return an error, so drop the reference we got from
1767
+ * get_any_page() as well.
1768
+ */
1769
+ put_page(page);
1770
+ return isolated;
16281771 }
16291772
16301773 /*
1631
- * Safely get reference count of an arbitrary page.
1632
- * Returns 0 for a free page, -EIO for a zero refcount page
1633
- * that is not free, and 1 for any other page type.
1634
- * For 1 the page is returned with increased page count, otherwise not.
1774
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
1775
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
1776
+ * If the page is mapped, it migrates the contents over.
16351777 */
1636
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1778
+static int __soft_offline_page(struct page *page)
16371779 {
1638
- int ret;
1639
-
1640
- if (flags & MF_COUNT_INCREASED)
1641
- return 1;
1642
-
1643
- /*
1644
- * When the target page is a free hugepage, just remove it
1645
- * from free hugepage list.
1646
- */
1647
- if (!get_hwpoison_page(p)) {
1648
- if (PageHuge(p)) {
1649
- pr_info("%s: %#lx free huge page\n", __func__, pfn);
1650
- ret = 0;
1651
- } else if (is_free_buddy_page(p)) {
1652
- pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1653
- ret = 0;
1654
- } else {
1655
- pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1656
- __func__, pfn, p->flags);
1657
- ret = -EIO;
1658
- }
1659
- } else {
1660
- /* Not a free page */
1661
- ret = 1;
1662
- }
1663
- return ret;
1664
-}
1665
-
1666
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
1667
-{
1668
- int ret = __get_any_page(page, pfn, flags);
1669
-
1670
- if (ret == 1 && !PageHuge(page) &&
1671
- !PageLRU(page) && !__PageMovable(page)) {
1672
- /*
1673
- * Try to free it.
1674
- */
1675
- put_hwpoison_page(page);
1676
- shake_page(page, 1);
1677
-
1678
- /*
1679
- * Did it turn free?
1680
- */
1681
- ret = __get_any_page(page, pfn, 0);
1682
- if (ret == 1 && !PageLRU(page)) {
1683
- /* Drop page reference which is from __get_any_page() */
1684
- put_hwpoison_page(page);
1685
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
1686
- pfn, page->flags, &page->flags);
1687
- return -EIO;
1688
- }
1689
- }
1690
- return ret;
1691
-}
1692
-
1693
-static int soft_offline_huge_page(struct page *page, int flags)
1694
-{
1695
- int ret;
1780
+ int ret = 0;
16961781 unsigned long pfn = page_to_pfn(page);
16971782 struct page *hpage = compound_head(page);
1783
+ char const *msg_page[] = {"page", "hugepage"};
1784
+ bool huge = PageHuge(page);
16981785 LIST_HEAD(pagelist);
1699
-
1700
- /*
1701
- * This double-check of PageHWPoison is to avoid the race with
1702
- * memory_failure(). See also comment in __soft_offline_page().
1703
- */
1704
- lock_page(hpage);
1705
- if (PageHWPoison(hpage)) {
1706
- unlock_page(hpage);
1707
- put_hwpoison_page(hpage);
1708
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1709
- return -EBUSY;
1710
- }
1711
- unlock_page(hpage);
1712
-
1713
- ret = isolate_huge_page(hpage, &pagelist);
1714
- /*
1715
- * get_any_page() and isolate_huge_page() takes a refcount each,
1716
- * so need to drop one here.
1717
- */
1718
- put_hwpoison_page(hpage);
1719
- if (!ret) {
1720
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1721
- return -EBUSY;
1722
- }
1723
-
1724
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1725
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
1726
- if (ret) {
1727
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1728
- pfn, ret, page->flags, &page->flags);
1729
- if (!list_empty(&pagelist))
1730
- putback_movable_pages(&pagelist);
1731
- if (ret > 0)
1732
- ret = -EIO;
1733
- } else {
1734
- /*
1735
- * We set PG_hwpoison only when the migration source hugepage
1736
- * was successfully dissolved, because otherwise hwpoisoned
1737
- * hugepage remains on free hugepage list, then userspace will
1738
- * find it as SIGBUS by allocation failure. That's not expected
1739
- * in soft-offlining.
1740
- */
1741
- ret = dissolve_free_huge_page(page);
1742
- if (!ret) {
1743
- if (set_hwpoison_free_buddy_page(page))
1744
- num_poisoned_pages_inc();
1745
- else
1746
- ret = -EBUSY;
1747
- }
1748
- }
1749
- return ret;
1750
-}
1751
-
1752
-static int __soft_offline_page(struct page *page, int flags)
1753
-{
1754
- int ret;
1755
- unsigned long pfn = page_to_pfn(page);
1786
+ struct migration_target_control mtc = {
1787
+ .nid = NUMA_NO_NODE,
1788
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
1789
+ };
17561790
17571791 /*
17581792 * Check PageHWPoison again inside page lock because PageHWPoison
@@ -1761,127 +1795,77 @@
17611795 * so there's no race between soft_offline_page() and memory_failure().
17621796 */
17631797 lock_page(page);
1764
- wait_on_page_writeback(page);
1798
+ if (!PageHuge(page))
1799
+ wait_on_page_writeback(page);
17651800 if (PageHWPoison(page)) {
17661801 unlock_page(page);
1767
- put_hwpoison_page(page);
1802
+ put_page(page);
17681803 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1769
- return -EBUSY;
1804
+ return 0;
17701805 }
1771
- /*
1772
- * Try to invalidate first. This should work for
1773
- * non dirty unmapped page cache pages.
1774
- */
1775
- ret = invalidate_inode_page(page);
1806
+
1807
+ if (!PageHuge(page))
1808
+ /*
1809
+ * Try to invalidate first. This should work for
1810
+ * non dirty unmapped page cache pages.
1811
+ */
1812
+ ret = invalidate_inode_page(page);
17761813 unlock_page(page);
1814
+
17771815 /*
17781816 * RED-PEN would be better to keep it isolated here, but we
17791817 * would need to fix isolation locking first.
17801818 */
1781
- if (ret == 1) {
1782
- put_hwpoison_page(page);
1819
+ if (ret) {
17831820 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1784
- SetPageHWPoison(page);
1785
- num_poisoned_pages_inc();
1821
+ page_handle_poison(page, false, true);
17861822 return 0;
17871823 }
17881824
1789
- /*
1790
- * Simple invalidation didn't work.
1791
- * Try to migrate to a new page instead. migrate.c
1792
- * handles a large number of cases for us.
1793
- */
1794
- if (PageLRU(page))
1795
- ret = isolate_lru_page(page);
1796
- else
1797
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1798
- /*
1799
- * Drop page reference which is came from get_any_page()
1800
- * successful isolate_lru_page() already took another one.
1801
- */
1802
- put_hwpoison_page(page);
1803
- if (!ret) {
1804
- LIST_HEAD(pagelist);
1805
- /*
1806
- * After isolated lru page, the PageLRU will be cleared,
1807
- * so use !__PageMovable instead for LRU page's mapping
1808
- * cannot have PAGE_MAPPING_MOVABLE.
1809
- */
1810
- if (!__PageMovable(page))
1811
- inc_node_page_state(page, NR_ISOLATED_ANON +
1812
- page_is_file_cache(page));
1813
- list_add(&page->lru, &pagelist);
1814
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1815
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
1816
- if (ret) {
1825
+ if (isolate_page(hpage, &pagelist)) {
1826
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
1827
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
1828
+ if (!ret) {
1829
+ bool release = !huge;
1830
+
1831
+ if (!page_handle_poison(page, huge, release))
1832
+ ret = -EBUSY;
1833
+ } else {
18171834 if (!list_empty(&pagelist))
18181835 putback_movable_pages(&pagelist);
18191836
1820
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
1821
- pfn, ret, page->flags, &page->flags);
1837
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
1838
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
18221839 if (ret > 0)
1823
- ret = -EIO;
1840
+ ret = -EBUSY;
18241841 }
18251842 } else {
1826
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
1827
- pfn, ret, page_count(page), page->flags, &page->flags);
1843
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
1844
+ pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
1845
+ ret = -EBUSY;
18281846 }
18291847 return ret;
18301848 }
18311849
1832
-static int soft_offline_in_use_page(struct page *page, int flags)
1850
+static int soft_offline_in_use_page(struct page *page)
18331851 {
1834
- int ret;
1835
- int mt;
18361852 struct page *hpage = compound_head(page);
18371853
1838
- if (!PageHuge(page) && PageTransHuge(hpage)) {
1839
- lock_page(page);
1840
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1841
- unlock_page(page);
1842
- if (!PageAnon(page))
1843
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1844
- else
1845
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1846
- put_hwpoison_page(page);
1854
+ if (!PageHuge(page) && PageTransHuge(hpage))
1855
+ if (try_to_split_thp_page(page, "soft offline") < 0)
18471856 return -EBUSY;
1848
- }
1849
- unlock_page(page);
1850
- }
1851
-
1852
- /*
1853
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
1854
- * to free list immediately (not via pcplist) when released after
1855
- * successful page migration. Otherwise we can't guarantee that the
1856
- * page is really free after put_page() returns, so
1857
- * set_hwpoison_free_buddy_page() highly likely fails.
1858
- */
1859
- mt = get_pageblock_migratetype(page);
1860
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
1861
- if (PageHuge(page))
1862
- ret = soft_offline_huge_page(page, flags);
1863
- else
1864
- ret = __soft_offline_page(page, flags);
1865
- set_pageblock_migratetype(page, mt);
1866
- return ret;
1857
+ return __soft_offline_page(page);
18671858 }
18681859
1869
-static int soft_offline_free_page(struct page *page)
1860
+static void put_ref_page(struct page *page)
18701861 {
1871
- int rc = dissolve_free_huge_page(page);
1872
-
1873
- if (!rc) {
1874
- if (set_hwpoison_free_buddy_page(page))
1875
- num_poisoned_pages_inc();
1876
- else
1877
- rc = -EBUSY;
1878
- }
1879
- return rc;
1862
+ if (page)
1863
+ put_page(page);
18801864 }
18811865
18821866 /**
18831867 * soft_offline_page - Soft offline a page.
1884
- * @page: page to offline
1868
+ * @pfn: pfn to soft-offline
18851869 * @flags: flags. Same as memory_failure().
18861870 *
18871871 * Returns 0 on success, otherwise negated errno.
@@ -1901,34 +1885,52 @@
19011885 * This is not a 100% solution for all memory, but tries to be
19021886 * ``good enough'' for the majority of memory.
19031887 */
1904
-int soft_offline_page(struct page *page, int flags)
1888
+int soft_offline_page(unsigned long pfn, int flags)
19051889 {
19061890 int ret;
1907
- unsigned long pfn = page_to_pfn(page);
1891
+ bool try_again = true;
1892
+ struct page *page, *ref_page = NULL;
19081893
1909
- if (is_zone_device_page(page)) {
1910
- pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
1911
- pfn);
1912
- if (flags & MF_COUNT_INCREASED)
1913
- put_page(page);
1894
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
1895
+
1896
+ if (!pfn_valid(pfn))
1897
+ return -ENXIO;
1898
+ if (flags & MF_COUNT_INCREASED)
1899
+ ref_page = pfn_to_page(pfn);
1900
+
1901
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
1902
+ page = pfn_to_online_page(pfn);
1903
+ if (!page) {
1904
+ put_ref_page(ref_page);
19141905 return -EIO;
19151906 }
19161907
19171908 if (PageHWPoison(page)) {
1918
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
1919
- if (flags & MF_COUNT_INCREASED)
1920
- put_hwpoison_page(page);
1921
- return -EBUSY;
1909
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
1910
+ put_ref_page(ref_page);
1911
+ return 0;
19221912 }
19231913
1914
+retry:
19241915 get_online_mems();
1925
- ret = get_any_page(page, pfn, flags);
1916
+ ret = get_any_page(page, flags);
19261917 put_online_mems();
19271918
1928
- if (ret > 0)
1929
- ret = soft_offline_in_use_page(page, flags);
1930
- else if (ret == 0)
1931
- ret = soft_offline_free_page(page);
1919
+ if (ret > 0) {
1920
+ ret = soft_offline_in_use_page(page);
1921
+ } else if (ret == 0) {
1922
+ if (!page_handle_poison(page, true, false)) {
1923
+ if (try_again) {
1924
+ try_again = false;
1925
+ flags &= ~MF_COUNT_INCREASED;
1926
+ goto retry;
1927
+ }
1928
+ ret = -EBUSY;
1929
+ }
1930
+ } else if (ret == -EIO) {
1931
+ pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
1932
+ __func__, pfn, page->flags, &page->flags);
1933
+ }
19321934
19331935 return ret;
19341936 }