| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Copyright (C) 2008, 2009 Intel Corporation |
|---|
| 3 | 4 | * Authors: Andi Kleen, Fengguang Wu |
|---|
| 4 | | - * |
|---|
| 5 | | - * This software may be redistributed and/or modified under the terms of |
|---|
| 6 | | - * the GNU General Public License ("GPL") version 2 only as published by the |
|---|
| 7 | | - * Free Software Foundation. |
|---|
| 8 | 5 | * |
|---|
| 9 | 6 | * High level machine check handler. Handles pages reported by the |
|---|
| 10 | 7 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
|---|
| .. | .. |
|---|
| 67 | 64 | int sysctl_memory_failure_recovery __read_mostly = 1; |
|---|
| 68 | 65 | |
|---|
| 69 | 66 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
|---|
| 67 | + |
|---|
| 68 | +static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) |
|---|
| 69 | +{ |
|---|
| 70 | + if (hugepage_or_freepage) { |
|---|
| 71 | + /* |
|---|
| 72 | + * Doing this check for free pages is also fine since dissolve_free_huge_page |
|---|
| 73 | + * returns 0 for non-hugetlb pages as well. |
|---|
| 74 | + */ |
|---|
| 75 | + if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) |
|---|
| 76 | + /* |
|---|
| 77 | + * We could fail to take off the target page from buddy |
|---|
| 78 | + * for example due to racy page allocation, but that's |
|---|
| 79 | + * acceptable because soft-offlined page is not broken |
|---|
| 80 | + * and if someone really wants to use it, they should |
|---|
| 81 | + * take it. |
|---|
| 82 | + */ |
|---|
| 83 | + return false; |
|---|
| 84 | + } |
|---|
| 85 | + |
|---|
| 86 | + SetPageHWPoison(page); |
|---|
| 87 | + if (release) |
|---|
| 88 | + put_page(page); |
|---|
| 89 | + page_ref_inc(page); |
|---|
| 90 | + num_poisoned_pages_inc(); |
|---|
| 91 | + |
|---|
| 92 | + return true; |
|---|
| 93 | +} |
|---|
| 70 | 94 | |
|---|
| 71 | 95 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) |
|---|
| 72 | 96 | |
|---|
| .. | .. |
|---|
| 213 | 237 | { |
|---|
| 214 | 238 | struct task_struct *t = tk->tsk; |
|---|
| 215 | 239 | short addr_lsb = tk->size_shift; |
|---|
| 216 | | - int ret; |
|---|
| 240 | + int ret = 0; |
|---|
| 217 | 241 | |
|---|
| 218 | | - pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", |
|---|
| 219 | | - pfn, t->comm, t->pid); |
|---|
| 242 | + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", |
|---|
| 243 | + pfn, t->comm, t->pid); |
|---|
| 220 | 244 | |
|---|
| 221 | | - if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { |
|---|
| 222 | | - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, |
|---|
| 223 | | - addr_lsb, current); |
|---|
| 245 | + if (flags & MF_ACTION_REQUIRED) { |
|---|
| 246 | + WARN_ON_ONCE(t != current); |
|---|
| 247 | + ret = force_sig_mceerr(BUS_MCEERR_AR, |
|---|
| 248 | + (void __user *)tk->addr, addr_lsb); |
|---|
| 224 | 249 | } else { |
|---|
| 225 | 250 | /* |
|---|
| 226 | 251 | * Don't use force here, it's convenient if the signal |
|---|
| .. | .. |
|---|
| 306 | 331 | /* |
|---|
| 307 | 332 | * Schedule a process for later kill. |
|---|
| 308 | 333 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. |
|---|
| 309 | | - * TBD would GFP_NOIO be enough? |
|---|
| 310 | 334 | */ |
|---|
| 311 | 335 | static void add_to_kill(struct task_struct *tsk, struct page *p, |
|---|
| 312 | 336 | struct vm_area_struct *vma, |
|---|
| 313 | | - struct list_head *to_kill, |
|---|
| 314 | | - struct to_kill **tkc) |
|---|
| 337 | + struct list_head *to_kill) |
|---|
| 315 | 338 | { |
|---|
| 316 | 339 | struct to_kill *tk; |
|---|
| 317 | 340 | |
|---|
| 318 | | - if (*tkc) { |
|---|
| 319 | | - tk = *tkc; |
|---|
| 320 | | - *tkc = NULL; |
|---|
| 321 | | - } else { |
|---|
| 322 | | - tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); |
|---|
| 323 | | - if (!tk) { |
|---|
| 324 | | - pr_err("Memory failure: Out of memory while machine check handling\n"); |
|---|
| 325 | | - return; |
|---|
| 326 | | - } |
|---|
| 341 | + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); |
|---|
| 342 | + if (!tk) { |
|---|
| 343 | + pr_err("Memory failure: Out of memory while machine check handling\n"); |
|---|
| 344 | + return; |
|---|
| 327 | 345 | } |
|---|
| 346 | + |
|---|
| 328 | 347 | tk->addr = page_address_in_vma(p, vma); |
|---|
| 329 | 348 | if (is_zone_device_page(p)) |
|---|
| 330 | 349 | tk->size_shift = dev_pagemap_mapping_shift(p, vma); |
|---|
| 331 | 350 | else |
|---|
| 332 | | - tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT; |
|---|
| 351 | + tk->size_shift = page_shift(compound_head(p)); |
|---|
| 333 | 352 | |
|---|
| 334 | 353 | /* |
|---|
| 335 | 354 | * Send SIGKILL if "tk->addr == -EFAULT". Also, as |
|---|
| .. | .. |
|---|
| 348 | 367 | kfree(tk); |
|---|
| 349 | 368 | return; |
|---|
| 350 | 369 | } |
|---|
| 370 | + |
|---|
| 351 | 371 | get_task_struct(tsk); |
|---|
| 352 | 372 | tk->tsk = tsk; |
|---|
| 353 | 373 | list_add_tail(&tk->nd, to_kill); |
|---|
| .. | .. |
|---|
| 407 | 427 | { |
|---|
| 408 | 428 | struct task_struct *t; |
|---|
| 409 | 429 | |
|---|
| 410 | | - for_each_thread(tsk, t) |
|---|
| 411 | | - if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) |
|---|
| 412 | | - return t; |
|---|
| 430 | + for_each_thread(tsk, t) { |
|---|
| 431 | + if (t->flags & PF_MCE_PROCESS) { |
|---|
| 432 | + if (t->flags & PF_MCE_EARLY) |
|---|
| 433 | + return t; |
|---|
| 434 | + } else { |
|---|
| 435 | + if (sysctl_memory_failure_early_kill) |
|---|
| 436 | + return t; |
|---|
| 437 | + } |
|---|
| 438 | + } |
|---|
| 413 | 439 | return NULL; |
|---|
| 414 | 440 | } |
|---|
| 415 | 441 | |
|---|
| .. | .. |
|---|
| 418 | 444 | * to be signaled when some page under the process is hwpoisoned. |
|---|
| 419 | 445 | * Return task_struct of the dedicated thread (main thread unless explicitly |
|---|
| 420 | 446 | * specified) if the process is "early kill," and otherwise returns NULL. |
|---|
| 447 | + * |
|---|
| 448 | + * Note that the above is true for Action Optional case, but not for Action |
|---|
| 449 | + * Required case where SIGBUS should be sent only to the current thread. |
|---|
| 421 | 450 | */ |
|---|
| 422 | 451 | static struct task_struct *task_early_kill(struct task_struct *tsk, |
|---|
| 423 | 452 | int force_early) |
|---|
| 424 | 453 | { |
|---|
| 425 | | - struct task_struct *t; |
|---|
| 426 | 454 | if (!tsk->mm) |
|---|
| 427 | 455 | return NULL; |
|---|
| 428 | | - if (force_early) |
|---|
| 429 | | - return tsk; |
|---|
| 430 | | - t = find_early_kill_thread(tsk); |
|---|
| 431 | | - if (t) |
|---|
| 432 | | - return t; |
|---|
| 433 | | - if (sysctl_memory_failure_early_kill) |
|---|
| 434 | | - return tsk; |
|---|
| 435 | | - return NULL; |
|---|
| 456 | + if (force_early) { |
|---|
| 457 | + /* |
|---|
| 458 | + * Comparing ->mm here because current task might represent |
|---|
| 459 | + * a subthread, while tsk always points to the main thread. |
|---|
| 460 | + */ |
|---|
| 461 | + if (tsk->mm == current->mm) |
|---|
| 462 | + return current; |
|---|
| 463 | + else |
|---|
| 464 | + return NULL; |
|---|
| 465 | + } |
|---|
| 466 | + return find_early_kill_thread(tsk); |
|---|
| 436 | 467 | } |
|---|
| 437 | 468 | |
|---|
| 438 | 469 | /* |
|---|
| 439 | 470 | * Collect processes when the error hit an anonymous page. |
|---|
| 440 | 471 | */ |
|---|
| 441 | 472 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
|---|
| 442 | | - struct to_kill **tkc, int force_early) |
|---|
| 473 | + int force_early) |
|---|
| 443 | 474 | { |
|---|
| 444 | 475 | struct vm_area_struct *vma; |
|---|
| 445 | 476 | struct task_struct *tsk; |
|---|
| 446 | 477 | struct anon_vma *av; |
|---|
| 447 | 478 | pgoff_t pgoff; |
|---|
| 448 | 479 | |
|---|
| 449 | | - av = page_lock_anon_vma_read(page); |
|---|
| 480 | + av = page_lock_anon_vma_read(page, NULL); |
|---|
| 450 | 481 | if (av == NULL) /* Not actually mapped anymore */ |
|---|
| 451 | 482 | return; |
|---|
| 452 | 483 | |
|---|
| .. | .. |
|---|
| 464 | 495 | if (!page_mapped_in_vma(page, vma)) |
|---|
| 465 | 496 | continue; |
|---|
| 466 | 497 | if (vma->vm_mm == t->mm) |
|---|
| 467 | | - add_to_kill(t, page, vma, to_kill, tkc); |
|---|
| 498 | + add_to_kill(t, page, vma, to_kill); |
|---|
| 468 | 499 | } |
|---|
| 469 | 500 | } |
|---|
| 470 | 501 | read_unlock(&tasklist_lock); |
|---|
| .. | .. |
|---|
| 475 | 506 | * Collect processes when the error hit a file mapped page. |
|---|
| 476 | 507 | */ |
|---|
| 477 | 508 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
|---|
| 478 | | - struct to_kill **tkc, int force_early) |
|---|
| 509 | + int force_early) |
|---|
| 479 | 510 | { |
|---|
| 480 | 511 | struct vm_area_struct *vma; |
|---|
| 481 | 512 | struct task_struct *tsk; |
|---|
| 482 | 513 | struct address_space *mapping = page->mapping; |
|---|
| 514 | + pgoff_t pgoff; |
|---|
| 483 | 515 | |
|---|
| 484 | 516 | i_mmap_lock_read(mapping); |
|---|
| 485 | 517 | read_lock(&tasklist_lock); |
|---|
| 518 | + pgoff = page_to_pgoff(page); |
|---|
| 486 | 519 | for_each_process(tsk) { |
|---|
| 487 | | - pgoff_t pgoff = page_to_pgoff(page); |
|---|
| 488 | 520 | struct task_struct *t = task_early_kill(tsk, force_early); |
|---|
| 489 | 521 | |
|---|
| 490 | 522 | if (!t) |
|---|
| .. | .. |
|---|
| 499 | 531 | * to be informed of all such data corruptions. |
|---|
| 500 | 532 | */ |
|---|
| 501 | 533 | if (vma->vm_mm == t->mm) |
|---|
| 502 | | - add_to_kill(t, page, vma, to_kill, tkc); |
|---|
| 534 | + add_to_kill(t, page, vma, to_kill); |
|---|
| 503 | 535 | } |
|---|
| 504 | 536 | } |
|---|
| 505 | 537 | read_unlock(&tasklist_lock); |
|---|
| .. | .. |
|---|
| 508 | 540 | |
|---|
| 509 | 541 | /* |
|---|
| 510 | 542 | * Collect the processes who have the corrupted page mapped to kill. |
|---|
| 511 | | - * This is done in two steps for locking reasons. |
|---|
| 512 | | - * First preallocate one tokill structure outside the spin locks, |
|---|
| 513 | | - * so that we can kill at least one process reasonably reliable. |
|---|
| 514 | 543 | */ |
|---|
| 515 | 544 | static void collect_procs(struct page *page, struct list_head *tokill, |
|---|
| 516 | 545 | int force_early) |
|---|
| 517 | 546 | { |
|---|
| 518 | | - struct to_kill *tk; |
|---|
| 519 | | - |
|---|
| 520 | 547 | if (!page->mapping) |
|---|
| 521 | 548 | return; |
|---|
| 522 | 549 | |
|---|
| 523 | | - tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); |
|---|
| 524 | | - if (!tk) |
|---|
| 525 | | - return; |
|---|
| 526 | 550 | if (PageAnon(page)) |
|---|
| 527 | | - collect_procs_anon(page, tokill, &tk, force_early); |
|---|
| 551 | + collect_procs_anon(page, tokill, force_early); |
|---|
| 528 | 552 | else |
|---|
| 529 | | - collect_procs_file(page, tokill, &tk, force_early); |
|---|
| 530 | | - kfree(tk); |
|---|
| 553 | + collect_procs_file(page, tokill, force_early); |
|---|
| 531 | 554 | } |
|---|
| 532 | 555 | |
|---|
| 533 | 556 | static const char *action_name[] = { |
|---|
| .. | .. |
|---|
| 559 | 582 | [MF_MSG_BUDDY] = "free buddy page", |
|---|
| 560 | 583 | [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", |
|---|
| 561 | 584 | [MF_MSG_DAX] = "dax page", |
|---|
| 585 | + [MF_MSG_UNSPLIT_THP] = "unsplit thp", |
|---|
| 562 | 586 | [MF_MSG_UNKNOWN] = "unknown page", |
|---|
| 563 | 587 | }; |
|---|
| 564 | 588 | |
|---|
| .. | .. |
|---|
| 829 | 853 | #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) |
|---|
| 830 | 854 | #define unevict (1UL << PG_unevictable) |
|---|
| 831 | 855 | #define mlock (1UL << PG_mlocked) |
|---|
| 832 | | -#define writeback (1UL << PG_writeback) |
|---|
| 833 | 856 | #define lru (1UL << PG_lru) |
|---|
| 834 | 857 | #define head (1UL << PG_head) |
|---|
| 835 | 858 | #define slab (1UL << PG_slab) |
|---|
| .. | .. |
|---|
| 878 | 901 | #undef sc |
|---|
| 879 | 902 | #undef unevict |
|---|
| 880 | 903 | #undef mlock |
|---|
| 881 | | -#undef writeback |
|---|
| 882 | 904 | #undef lru |
|---|
| 883 | 905 | #undef head |
|---|
| 884 | 906 | #undef slab |
|---|
| .. | .. |
|---|
| 930 | 952 | * Return: return 0 if failed to grab the refcount, otherwise true (some |
|---|
| 931 | 953 | * non-zero value.) |
|---|
| 932 | 954 | */ |
|---|
| 933 | | -int get_hwpoison_page(struct page *page) |
|---|
| 955 | +static int get_hwpoison_page(struct page *page) |
|---|
| 934 | 956 | { |
|---|
| 935 | 957 | struct page *head = compound_head(page); |
|---|
| 936 | 958 | |
|---|
| .. | .. |
|---|
| 959 | 981 | |
|---|
| 960 | 982 | return 0; |
|---|
| 961 | 983 | } |
|---|
| 962 | | -EXPORT_SYMBOL_GPL(get_hwpoison_page); |
|---|
| 963 | 984 | |
|---|
| 964 | 985 | /* |
|---|
| 965 | 986 | * Do all that is necessary to remove user space mappings. Unmap |
|---|
| .. | .. |
|---|
| 968 | 989 | static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, |
|---|
| 969 | 990 | int flags, struct page **hpagep) |
|---|
| 970 | 991 | { |
|---|
| 971 | | - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
|---|
| 992 | + enum ttu_flags ttu = TTU_IGNORE_MLOCK; |
|---|
| 972 | 993 | struct address_space *mapping; |
|---|
| 973 | 994 | LIST_HEAD(tokill); |
|---|
| 974 | | - bool unmap_success; |
|---|
| 995 | + bool unmap_success = true; |
|---|
| 975 | 996 | int kill = 1, forcekill; |
|---|
| 976 | 997 | struct page *hpage = *hpagep; |
|---|
| 977 | 998 | bool mlocked = PageMlocked(hpage); |
|---|
| .. | .. |
|---|
| 1011 | 1032 | */ |
|---|
| 1012 | 1033 | mapping = page_mapping(hpage); |
|---|
| 1013 | 1034 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && |
|---|
| 1014 | | - mapping_cap_writeback_dirty(mapping)) { |
|---|
| 1035 | + mapping_can_writeback(mapping)) { |
|---|
| 1015 | 1036 | if (page_mkclean(hpage)) { |
|---|
| 1016 | 1037 | SetPageDirty(hpage); |
|---|
| 1017 | 1038 | } else { |
|---|
| .. | .. |
|---|
| 1033 | 1054 | if (kill) |
|---|
| 1034 | 1055 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); |
|---|
| 1035 | 1056 | |
|---|
| 1036 | | - unmap_success = try_to_unmap(hpage, ttu); |
|---|
| 1057 | + if (!PageHuge(hpage)) { |
|---|
| 1058 | + unmap_success = try_to_unmap(hpage, ttu); |
|---|
| 1059 | + } else { |
|---|
| 1060 | + if (!PageAnon(hpage)) { |
|---|
| 1061 | + /* |
|---|
| 1062 | + * For hugetlb pages in shared mappings, try_to_unmap |
|---|
| 1063 | + * could potentially call huge_pmd_unshare. Because of |
|---|
| 1064 | + * this, take semaphore in write mode here and set |
|---|
| 1065 | + * TTU_RMAP_LOCKED to indicate we have taken the lock |
|---|
| 1066 | + * at this higher level. |
|---|
| 1067 | + */ |
|---|
| 1068 | + mapping = hugetlb_page_mapping_lock_write(hpage); |
|---|
| 1069 | + if (mapping) { |
|---|
| 1070 | + unmap_success = try_to_unmap(hpage, |
|---|
| 1071 | + ttu|TTU_RMAP_LOCKED); |
|---|
| 1072 | + i_mmap_unlock_write(mapping); |
|---|
| 1073 | + } else { |
|---|
| 1074 | + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); |
|---|
| 1075 | + unmap_success = false; |
|---|
| 1076 | + } |
|---|
| 1077 | + } else { |
|---|
| 1078 | + unmap_success = try_to_unmap(hpage, ttu); |
|---|
| 1079 | + } |
|---|
| 1080 | + } |
|---|
| 1037 | 1081 | if (!unmap_success) |
|---|
| 1038 | 1082 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", |
|---|
| 1039 | 1083 | pfn, page_mapcount(hpage)); |
|---|
| .. | .. |
|---|
| 1084 | 1128 | return page_action(ps, p, pfn); |
|---|
| 1085 | 1129 | } |
|---|
| 1086 | 1130 | |
|---|
| 1131 | +static int try_to_split_thp_page(struct page *page, const char *msg) |
|---|
| 1132 | +{ |
|---|
| 1133 | + lock_page(page); |
|---|
| 1134 | + if (!PageAnon(page) || unlikely(split_huge_page(page))) { |
|---|
| 1135 | + unsigned long pfn = page_to_pfn(page); |
|---|
| 1136 | + |
|---|
| 1137 | + unlock_page(page); |
|---|
| 1138 | + if (!PageAnon(page)) |
|---|
| 1139 | + pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); |
|---|
| 1140 | + else |
|---|
| 1141 | + pr_info("%s: %#lx: thp split failed\n", msg, pfn); |
|---|
| 1142 | + put_page(page); |
|---|
| 1143 | + return -EBUSY; |
|---|
| 1144 | + } |
|---|
| 1145 | + unlock_page(page); |
|---|
| 1146 | + |
|---|
| 1147 | + return 0; |
|---|
| 1148 | +} |
|---|
| 1149 | + |
|---|
| 1087 | 1150 | static int memory_failure_hugetlb(unsigned long pfn, int flags) |
|---|
| 1088 | 1151 | { |
|---|
| 1089 | 1152 | struct page *p = pfn_to_page(pfn); |
|---|
| .. | .. |
|---|
| 1125 | 1188 | pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); |
|---|
| 1126 | 1189 | num_poisoned_pages_dec(); |
|---|
| 1127 | 1190 | unlock_page(head); |
|---|
| 1128 | | - put_hwpoison_page(head); |
|---|
| 1191 | + put_page(head); |
|---|
| 1129 | 1192 | return 0; |
|---|
| 1130 | 1193 | } |
|---|
| 1131 | 1194 | |
|---|
| .. | .. |
|---|
| 1166 | 1229 | LIST_HEAD(tokill); |
|---|
| 1167 | 1230 | int rc = -EBUSY; |
|---|
| 1168 | 1231 | loff_t start; |
|---|
| 1232 | + dax_entry_t cookie; |
|---|
| 1233 | + |
|---|
| 1234 | + if (flags & MF_COUNT_INCREASED) |
|---|
| 1235 | + /* |
|---|
| 1236 | + * Drop the extra refcount in case we come from madvise(). |
|---|
| 1237 | + */ |
|---|
| 1238 | + put_page(page); |
|---|
| 1239 | + |
|---|
| 1240 | + /* device metadata space is not recoverable */ |
|---|
| 1241 | + if (!pgmap_pfn_valid(pgmap, pfn)) { |
|---|
| 1242 | + rc = -ENXIO; |
|---|
| 1243 | + goto out; |
|---|
| 1244 | + } |
|---|
| 1169 | 1245 | |
|---|
| 1170 | 1246 | /* |
|---|
| 1171 | 1247 | * Prevent the inode from being freed while we are interrogating |
|---|
| .. | .. |
|---|
| 1174 | 1250 | * also prevents changes to the mapping of this pfn until |
|---|
| 1175 | 1251 | * poison signaling is complete. |
|---|
| 1176 | 1252 | */ |
|---|
| 1177 | | - if (!dax_lock_mapping_entry(page)) |
|---|
| 1253 | + cookie = dax_lock_page(page); |
|---|
| 1254 | + if (!cookie) |
|---|
| 1178 | 1255 | goto out; |
|---|
| 1179 | 1256 | |
|---|
| 1180 | 1257 | if (hwpoison_filter(page)) { |
|---|
| .. | .. |
|---|
| 1182 | 1259 | goto unlock; |
|---|
| 1183 | 1260 | } |
|---|
| 1184 | 1261 | |
|---|
| 1185 | | - switch (pgmap->type) { |
|---|
| 1186 | | - case MEMORY_DEVICE_PRIVATE: |
|---|
| 1187 | | - case MEMORY_DEVICE_PUBLIC: |
|---|
| 1262 | + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { |
|---|
| 1188 | 1263 | /* |
|---|
| 1189 | 1264 | * TODO: Handle HMM pages which may need coordination |
|---|
| 1190 | 1265 | * with device-side memory. |
|---|
| 1191 | 1266 | */ |
|---|
| 1192 | 1267 | goto unlock; |
|---|
| 1193 | | - default: |
|---|
| 1194 | | - break; |
|---|
| 1195 | 1268 | } |
|---|
| 1196 | 1269 | |
|---|
| 1197 | 1270 | /* |
|---|
| .. | .. |
|---|
| 1225 | 1298 | kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); |
|---|
| 1226 | 1299 | rc = 0; |
|---|
| 1227 | 1300 | unlock: |
|---|
| 1228 | | - dax_unlock_mapping_entry(page); |
|---|
| 1301 | + dax_unlock_page(page, cookie); |
|---|
| 1229 | 1302 | out: |
|---|
| 1230 | 1303 | /* drop pgmap ref acquired in caller */ |
|---|
| 1231 | 1304 | put_dev_pagemap(pgmap); |
|---|
| .. | .. |
|---|
| 1308 | 1381 | } |
|---|
| 1309 | 1382 | |
|---|
| 1310 | 1383 | if (PageTransHuge(hpage)) { |
|---|
| 1311 | | - lock_page(p); |
|---|
| 1312 | | - if (!PageAnon(p) || unlikely(split_huge_page(p))) { |
|---|
| 1313 | | - unlock_page(p); |
|---|
| 1314 | | - if (!PageAnon(p)) |
|---|
| 1315 | | - pr_err("Memory failure: %#lx: non anonymous thp\n", |
|---|
| 1316 | | - pfn); |
|---|
| 1317 | | - else |
|---|
| 1318 | | - pr_err("Memory failure: %#lx: thp split failed\n", |
|---|
| 1319 | | - pfn); |
|---|
| 1320 | | - if (TestClearPageHWPoison(p)) |
|---|
| 1321 | | - num_poisoned_pages_dec(); |
|---|
| 1322 | | - put_hwpoison_page(p); |
|---|
| 1384 | + if (try_to_split_thp_page(p, "Memory Failure") < 0) { |
|---|
| 1385 | + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); |
|---|
| 1323 | 1386 | return -EBUSY; |
|---|
| 1324 | 1387 | } |
|---|
| 1325 | | - unlock_page(p); |
|---|
| 1326 | 1388 | VM_BUG_ON_PAGE(!page_count(p), p); |
|---|
| 1327 | | - hpage = compound_head(p); |
|---|
| 1328 | 1389 | } |
|---|
| 1329 | 1390 | |
|---|
| 1330 | 1391 | /* |
|---|
| .. | .. |
|---|
| 1364 | 1425 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status |
|---|
| 1365 | 1426 | * correctly, we save a copy of the page flags at this time. |
|---|
| 1366 | 1427 | */ |
|---|
| 1367 | | - if (PageHuge(p)) |
|---|
| 1368 | | - page_flags = hpage->flags; |
|---|
| 1369 | | - else |
|---|
| 1370 | | - page_flags = p->flags; |
|---|
| 1428 | + page_flags = p->flags; |
|---|
| 1371 | 1429 | |
|---|
| 1372 | 1430 | /* |
|---|
| 1373 | 1431 | * unpoison always clear PG_hwpoison inside page lock |
|---|
| .. | .. |
|---|
| 1376 | 1434 | pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); |
|---|
| 1377 | 1435 | num_poisoned_pages_dec(); |
|---|
| 1378 | 1436 | unlock_page(p); |
|---|
| 1379 | | - put_hwpoison_page(p); |
|---|
| 1437 | + put_page(p); |
|---|
| 1380 | 1438 | return 0; |
|---|
| 1381 | 1439 | } |
|---|
| 1382 | 1440 | if (hwpoison_filter(p)) { |
|---|
| 1383 | 1441 | if (TestClearPageHWPoison(p)) |
|---|
| 1384 | 1442 | num_poisoned_pages_dec(); |
|---|
| 1385 | 1443 | unlock_page(p); |
|---|
| 1386 | | - put_hwpoison_page(p); |
|---|
| 1444 | + put_page(p); |
|---|
| 1387 | 1445 | return 0; |
|---|
| 1388 | 1446 | } |
|---|
| 1389 | 1447 | |
|---|
| .. | .. |
|---|
| 1404 | 1462 | /* |
|---|
| 1405 | 1463 | * Now take care of user space mappings. |
|---|
| 1406 | 1464 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. |
|---|
| 1407 | | - * |
|---|
| 1408 | | - * When the raw error page is thp tail page, hpage points to the raw |
|---|
| 1409 | | - * page after thp split. |
|---|
| 1410 | 1465 | */ |
|---|
| 1411 | | - if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) { |
|---|
| 1466 | + if (!hwpoison_user_mappings(p, pfn, flags, &p)) { |
|---|
| 1412 | 1467 | action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); |
|---|
| 1413 | 1468 | res = -EBUSY; |
|---|
| 1414 | 1469 | goto out; |
|---|
| .. | .. |
|---|
| 1492 | 1547 | unsigned long proc_flags; |
|---|
| 1493 | 1548 | int gotten; |
|---|
| 1494 | 1549 | |
|---|
| 1495 | | - mf_cpu = this_cpu_ptr(&memory_failure_cpu); |
|---|
| 1550 | + mf_cpu = container_of(work, struct memory_failure_cpu, work); |
|---|
| 1496 | 1551 | for (;;) { |
|---|
| 1497 | 1552 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
|---|
| 1498 | 1553 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
|---|
| .. | .. |
|---|
| 1500 | 1555 | if (!gotten) |
|---|
| 1501 | 1556 | break; |
|---|
| 1502 | 1557 | if (entry.flags & MF_SOFT_OFFLINE) |
|---|
| 1503 | | - soft_offline_page(pfn_to_page(entry.pfn), entry.flags); |
|---|
| 1558 | + soft_offline_page(entry.pfn, entry.flags); |
|---|
| 1504 | 1559 | else |
|---|
| 1505 | 1560 | memory_failure(entry.pfn, entry.flags); |
|---|
| 1506 | 1561 | } |
|---|
| 1562 | +} |
|---|
| 1563 | + |
|---|
| 1564 | +/* |
|---|
| 1565 | + * Process memory_failure work queued on the specified CPU. |
|---|
| 1566 | + * Used to avoid return-to-userspace racing with the memory_failure workqueue. |
|---|
| 1567 | + */ |
|---|
| 1568 | +void memory_failure_queue_kick(int cpu) |
|---|
| 1569 | +{ |
|---|
| 1570 | + struct memory_failure_cpu *mf_cpu; |
|---|
| 1571 | + |
|---|
| 1572 | + mf_cpu = &per_cpu(memory_failure_cpu, cpu); |
|---|
| 1573 | + cancel_work_sync(&mf_cpu->work); |
|---|
| 1574 | + memory_failure_work_func(&mf_cpu->work); |
|---|
| 1507 | 1575 | } |
|---|
| 1508 | 1576 | |
|---|
| 1509 | 1577 | static int __init memory_failure_init(void) |
|---|
| .. | .. |
|---|
| 1612 | 1680 | } |
|---|
| 1613 | 1681 | unlock_page(page); |
|---|
| 1614 | 1682 | |
|---|
| 1615 | | - put_hwpoison_page(page); |
|---|
| 1683 | + put_page(page); |
|---|
| 1616 | 1684 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) |
|---|
| 1617 | | - put_hwpoison_page(page); |
|---|
| 1685 | + put_page(page); |
|---|
| 1618 | 1686 | |
|---|
| 1619 | 1687 | return 0; |
|---|
| 1620 | 1688 | } |
|---|
| 1621 | 1689 | EXPORT_SYMBOL(unpoison_memory); |
|---|
| 1622 | 1690 | |
|---|
| 1623 | | -static struct page *new_page(struct page *p, unsigned long private) |
|---|
| 1691 | +/* |
|---|
| 1692 | + * Safely get reference count of an arbitrary page. |
|---|
| 1693 | + * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we |
|---|
| 1694 | + * cannot handle and -EBUSY if we raced with an allocation. |
|---|
| 1695 | + * We only increment the refcount if the page was already in-use and it is |
|---|
| 1696 | + * a known type we can handle. |
|---|
| 1697 | + */ |
|---|
| 1698 | +static int get_any_page(struct page *p, int flags) |
|---|
| 1624 | 1699 | { |
|---|
| 1625 | | - int nid = page_to_nid(p); |
|---|
| 1700 | + int ret = 0, pass = 0; |
|---|
| 1701 | + bool count_increased = false; |
|---|
| 1626 | 1702 | |
|---|
| 1627 | | - return new_page_nodemask(p, nid, &node_states[N_MEMORY]); |
|---|
| 1703 | + if (flags & MF_COUNT_INCREASED) |
|---|
| 1704 | + count_increased = true; |
|---|
| 1705 | + |
|---|
| 1706 | +try_again: |
|---|
| 1707 | + if (!count_increased && !get_hwpoison_page(p)) { |
|---|
| 1708 | + if (page_count(p)) { |
|---|
| 1709 | + /* We raced with an allocation, retry. */ |
|---|
| 1710 | + if (pass++ < 3) |
|---|
| 1711 | + goto try_again; |
|---|
| 1712 | + ret = -EBUSY; |
|---|
| 1713 | + } else if (!PageHuge(p) && !is_free_buddy_page(p)) { |
|---|
| 1714 | + /* We raced with put_page, retry. */ |
|---|
| 1715 | + if (pass++ < 3) |
|---|
| 1716 | + goto try_again; |
|---|
| 1717 | + ret = -EIO; |
|---|
| 1718 | + } |
|---|
| 1719 | + } else { |
|---|
| 1720 | + if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { |
|---|
| 1721 | + ret = 1; |
|---|
| 1722 | + } else { |
|---|
| 1723 | + /* |
|---|
| 1724 | + * A page we cannot handle. Check whether we can turn |
|---|
| 1725 | + * it into something we can handle. |
|---|
| 1726 | + */ |
|---|
| 1727 | + if (pass++ < 3) { |
|---|
| 1728 | + put_page(p); |
|---|
| 1729 | + shake_page(p, 1); |
|---|
| 1730 | + count_increased = false; |
|---|
| 1731 | + goto try_again; |
|---|
| 1732 | + } |
|---|
| 1733 | + put_page(p); |
|---|
| 1734 | + ret = -EIO; |
|---|
| 1735 | + } |
|---|
| 1736 | + } |
|---|
| 1737 | + |
|---|
| 1738 | + return ret; |
|---|
| 1739 | +} |
|---|
| 1740 | + |
|---|
| 1741 | +static bool isolate_page(struct page *page, struct list_head *pagelist) |
|---|
| 1742 | +{ |
|---|
| 1743 | + bool isolated = false; |
|---|
| 1744 | + bool lru = PageLRU(page); |
|---|
| 1745 | + |
|---|
| 1746 | + if (PageHuge(page)) { |
|---|
| 1747 | + isolated = !isolate_hugetlb(page, pagelist); |
|---|
| 1748 | + } else { |
|---|
| 1749 | + if (lru) |
|---|
| 1750 | + isolated = !isolate_lru_page(page); |
|---|
| 1751 | + else |
|---|
| 1752 | + isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); |
|---|
| 1753 | + |
|---|
| 1754 | + if (isolated) |
|---|
| 1755 | + list_add(&page->lru, pagelist); |
|---|
| 1756 | + } |
|---|
| 1757 | + |
|---|
| 1758 | + if (isolated && lru) |
|---|
| 1759 | + inc_node_page_state(page, NR_ISOLATED_ANON + |
|---|
| 1760 | + page_is_file_lru(page)); |
|---|
| 1761 | + |
|---|
| 1762 | + /* |
|---|
| 1763 | + * If we succeed to isolate the page, we grabbed another refcount on |
|---|
| 1764 | + * the page, so we can safely drop the one we got from get_any_page(). |
|---|
| 1765 | + * If we failed to isolate the page, it means that we cannot go further |
|---|
| 1766 | + * and we will return an error, so drop the reference we got from |
|---|
| 1767 | + * get_any_page() as well. |
|---|
| 1768 | + */ |
|---|
| 1769 | + put_page(page); |
|---|
| 1770 | + return isolated; |
|---|
| 1628 | 1771 | } |
|---|
| 1629 | 1772 | |
|---|
| 1630 | 1773 | /* |
|---|
| 1631 | | - * Safely get reference count of an arbitrary page. |
|---|
| 1632 | | - * Returns 0 for a free page, -EIO for a zero refcount page |
|---|
| 1633 | | - * that is not free, and 1 for any other page type. |
|---|
| 1634 | | - * For 1 the page is returned with increased page count, otherwise not. |
|---|
| 1774 | + * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. |
|---|
| 1775 | + * If the page is a non-dirty unmapped page-cache page, it simply invalidates. |
|---|
| 1776 | + * If the page is mapped, it migrates the contents over. |
|---|
| 1635 | 1777 | */ |
|---|
| 1636 | | -static int __get_any_page(struct page *p, unsigned long pfn, int flags) |
|---|
| 1778 | +static int __soft_offline_page(struct page *page) |
|---|
| 1637 | 1779 | { |
|---|
| 1638 | | - int ret; |
|---|
| 1639 | | - |
|---|
| 1640 | | - if (flags & MF_COUNT_INCREASED) |
|---|
| 1641 | | - return 1; |
|---|
| 1642 | | - |
|---|
| 1643 | | - /* |
|---|
| 1644 | | - * When the target page is a free hugepage, just remove it |
|---|
| 1645 | | - * from free hugepage list. |
|---|
| 1646 | | - */ |
|---|
| 1647 | | - if (!get_hwpoison_page(p)) { |
|---|
| 1648 | | - if (PageHuge(p)) { |
|---|
| 1649 | | - pr_info("%s: %#lx free huge page\n", __func__, pfn); |
|---|
| 1650 | | - ret = 0; |
|---|
| 1651 | | - } else if (is_free_buddy_page(p)) { |
|---|
| 1652 | | - pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
|---|
| 1653 | | - ret = 0; |
|---|
| 1654 | | - } else { |
|---|
| 1655 | | - pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
|---|
| 1656 | | - __func__, pfn, p->flags); |
|---|
| 1657 | | - ret = -EIO; |
|---|
| 1658 | | - } |
|---|
| 1659 | | - } else { |
|---|
| 1660 | | - /* Not a free page */ |
|---|
| 1661 | | - ret = 1; |
|---|
| 1662 | | - } |
|---|
| 1663 | | - return ret; |
|---|
| 1664 | | -} |
|---|
| 1665 | | - |
|---|
| 1666 | | -static int get_any_page(struct page *page, unsigned long pfn, int flags) |
|---|
| 1667 | | -{ |
|---|
| 1668 | | - int ret = __get_any_page(page, pfn, flags); |
|---|
| 1669 | | - |
|---|
| 1670 | | - if (ret == 1 && !PageHuge(page) && |
|---|
| 1671 | | - !PageLRU(page) && !__PageMovable(page)) { |
|---|
| 1672 | | - /* |
|---|
| 1673 | | - * Try to free it. |
|---|
| 1674 | | - */ |
|---|
| 1675 | | - put_hwpoison_page(page); |
|---|
| 1676 | | - shake_page(page, 1); |
|---|
| 1677 | | - |
|---|
| 1678 | | - /* |
|---|
| 1679 | | - * Did it turn free? |
|---|
| 1680 | | - */ |
|---|
| 1681 | | - ret = __get_any_page(page, pfn, 0); |
|---|
| 1682 | | - if (ret == 1 && !PageLRU(page)) { |
|---|
| 1683 | | - /* Drop page reference which is from __get_any_page() */ |
|---|
| 1684 | | - put_hwpoison_page(page); |
|---|
| 1685 | | - pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", |
|---|
| 1686 | | - pfn, page->flags, &page->flags); |
|---|
| 1687 | | - return -EIO; |
|---|
| 1688 | | - } |
|---|
| 1689 | | - } |
|---|
| 1690 | | - return ret; |
|---|
| 1691 | | -} |
|---|
| 1692 | | - |
|---|
| 1693 | | -static int soft_offline_huge_page(struct page *page, int flags) |
|---|
| 1694 | | -{ |
|---|
| 1695 | | - int ret; |
|---|
| 1780 | + int ret = 0; |
|---|
| 1696 | 1781 | unsigned long pfn = page_to_pfn(page); |
|---|
| 1697 | 1782 | struct page *hpage = compound_head(page); |
|---|
| 1783 | + char const *msg_page[] = {"page", "hugepage"}; |
|---|
| 1784 | + bool huge = PageHuge(page); |
|---|
| 1698 | 1785 | LIST_HEAD(pagelist); |
|---|
| 1699 | | - |
|---|
| 1700 | | - /* |
|---|
| 1701 | | - * This double-check of PageHWPoison is to avoid the race with |
|---|
| 1702 | | - * memory_failure(). See also comment in __soft_offline_page(). |
|---|
| 1703 | | - */ |
|---|
| 1704 | | - lock_page(hpage); |
|---|
| 1705 | | - if (PageHWPoison(hpage)) { |
|---|
| 1706 | | - unlock_page(hpage); |
|---|
| 1707 | | - put_hwpoison_page(hpage); |
|---|
| 1708 | | - pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
|---|
| 1709 | | - return -EBUSY; |
|---|
| 1710 | | - } |
|---|
| 1711 | | - unlock_page(hpage); |
|---|
| 1712 | | - |
|---|
| 1713 | | - ret = isolate_huge_page(hpage, &pagelist); |
|---|
| 1714 | | - /* |
|---|
| 1715 | | - * get_any_page() and isolate_huge_page() takes a refcount each, |
|---|
| 1716 | | - * so need to drop one here. |
|---|
| 1717 | | - */ |
|---|
| 1718 | | - put_hwpoison_page(hpage); |
|---|
| 1719 | | - if (!ret) { |
|---|
| 1720 | | - pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); |
|---|
| 1721 | | - return -EBUSY; |
|---|
| 1722 | | - } |
|---|
| 1723 | | - |
|---|
| 1724 | | - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
|---|
| 1725 | | - MIGRATE_SYNC, MR_MEMORY_FAILURE); |
|---|
| 1726 | | - if (ret) { |
|---|
| 1727 | | - pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", |
|---|
| 1728 | | - pfn, ret, page->flags, &page->flags); |
|---|
| 1729 | | - if (!list_empty(&pagelist)) |
|---|
| 1730 | | - putback_movable_pages(&pagelist); |
|---|
| 1731 | | - if (ret > 0) |
|---|
| 1732 | | - ret = -EIO; |
|---|
| 1733 | | - } else { |
|---|
| 1734 | | - /* |
|---|
| 1735 | | - * We set PG_hwpoison only when the migration source hugepage |
|---|
| 1736 | | - * was successfully dissolved, because otherwise hwpoisoned |
|---|
| 1737 | | - * hugepage remains on free hugepage list, then userspace will |
|---|
| 1738 | | - * find it as SIGBUS by allocation failure. That's not expected |
|---|
| 1739 | | - * in soft-offlining. |
|---|
| 1740 | | - */ |
|---|
| 1741 | | - ret = dissolve_free_huge_page(page); |
|---|
| 1742 | | - if (!ret) { |
|---|
| 1743 | | - if (set_hwpoison_free_buddy_page(page)) |
|---|
| 1744 | | - num_poisoned_pages_inc(); |
|---|
| 1745 | | - else |
|---|
| 1746 | | - ret = -EBUSY; |
|---|
| 1747 | | - } |
|---|
| 1748 | | - } |
|---|
| 1749 | | - return ret; |
|---|
| 1750 | | -} |
|---|
| 1751 | | - |
|---|
| 1752 | | -static int __soft_offline_page(struct page *page, int flags) |
|---|
| 1753 | | -{ |
|---|
| 1754 | | - int ret; |
|---|
| 1755 | | - unsigned long pfn = page_to_pfn(page); |
|---|
| 1786 | + struct migration_target_control mtc = { |
|---|
| 1787 | + .nid = NUMA_NO_NODE, |
|---|
| 1788 | + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, |
|---|
| 1789 | + }; |
|---|
| 1756 | 1790 | |
|---|
| 1757 | 1791 | /* |
|---|
| 1758 | 1792 | * Check PageHWPoison again inside page lock because PageHWPoison |
|---|
| .. | .. |
|---|
| 1761 | 1795 | * so there's no race between soft_offline_page() and memory_failure(). |
|---|
| 1762 | 1796 | */ |
|---|
| 1763 | 1797 | lock_page(page); |
|---|
| 1764 | | - wait_on_page_writeback(page); |
|---|
| 1798 | + if (!PageHuge(page)) |
|---|
| 1799 | + wait_on_page_writeback(page); |
|---|
| 1765 | 1800 | if (PageHWPoison(page)) { |
|---|
| 1766 | 1801 | unlock_page(page); |
|---|
| 1767 | | - put_hwpoison_page(page); |
|---|
| 1802 | + put_page(page); |
|---|
| 1768 | 1803 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
|---|
| 1769 | | - return -EBUSY; |
|---|
| 1804 | + return 0; |
|---|
| 1770 | 1805 | } |
|---|
| 1771 | | - /* |
|---|
| 1772 | | - * Try to invalidate first. This should work for |
|---|
| 1773 | | - * non dirty unmapped page cache pages. |
|---|
| 1774 | | - */ |
|---|
| 1775 | | - ret = invalidate_inode_page(page); |
|---|
| 1806 | + |
|---|
| 1807 | + if (!PageHuge(page)) |
|---|
| 1808 | + /* |
|---|
| 1809 | + * Try to invalidate first. This should work for |
|---|
| 1810 | + * non dirty unmapped page cache pages. |
|---|
| 1811 | + */ |
|---|
| 1812 | + ret = invalidate_inode_page(page); |
|---|
| 1776 | 1813 | unlock_page(page); |
|---|
| 1814 | + |
|---|
| 1777 | 1815 | /* |
|---|
| 1778 | 1816 | * RED-PEN would be better to keep it isolated here, but we |
|---|
| 1779 | 1817 | * would need to fix isolation locking first. |
|---|
| 1780 | 1818 | */ |
|---|
| 1781 | | - if (ret == 1) { |
|---|
| 1782 | | - put_hwpoison_page(page); |
|---|
| 1819 | + if (ret) { |
|---|
| 1783 | 1820 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
|---|
| 1784 | | - SetPageHWPoison(page); |
|---|
| 1785 | | - num_poisoned_pages_inc(); |
|---|
| 1821 | + page_handle_poison(page, false, true); |
|---|
| 1786 | 1822 | return 0; |
|---|
| 1787 | 1823 | } |
|---|
| 1788 | 1824 | |
|---|
| 1789 | | - /* |
|---|
| 1790 | | - * Simple invalidation didn't work. |
|---|
| 1791 | | - * Try to migrate to a new page instead. migrate.c |
|---|
| 1792 | | - * handles a large number of cases for us. |
|---|
| 1793 | | - */ |
|---|
| 1794 | | - if (PageLRU(page)) |
|---|
| 1795 | | - ret = isolate_lru_page(page); |
|---|
| 1796 | | - else |
|---|
| 1797 | | - ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); |
|---|
| 1798 | | - /* |
|---|
| 1799 | | - * Drop page reference which is came from get_any_page() |
|---|
| 1800 | | - * successful isolate_lru_page() already took another one. |
|---|
| 1801 | | - */ |
|---|
| 1802 | | - put_hwpoison_page(page); |
|---|
| 1803 | | - if (!ret) { |
|---|
| 1804 | | - LIST_HEAD(pagelist); |
|---|
| 1805 | | - /* |
|---|
| 1806 | | - * After isolated lru page, the PageLRU will be cleared, |
|---|
| 1807 | | - * so use !__PageMovable instead for LRU page's mapping |
|---|
| 1808 | | - * cannot have PAGE_MAPPING_MOVABLE. |
|---|
| 1809 | | - */ |
|---|
| 1810 | | - if (!__PageMovable(page)) |
|---|
| 1811 | | - inc_node_page_state(page, NR_ISOLATED_ANON + |
|---|
| 1812 | | - page_is_file_cache(page)); |
|---|
| 1813 | | - list_add(&page->lru, &pagelist); |
|---|
| 1814 | | - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
|---|
| 1815 | | - MIGRATE_SYNC, MR_MEMORY_FAILURE); |
|---|
| 1816 | | - if (ret) { |
|---|
| 1825 | + if (isolate_page(hpage, &pagelist)) { |
|---|
| 1826 | + ret = migrate_pages(&pagelist, alloc_migration_target, NULL, |
|---|
| 1827 | + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); |
|---|
| 1828 | + if (!ret) { |
|---|
| 1829 | + bool release = !huge; |
|---|
| 1830 | + |
|---|
| 1831 | + if (!page_handle_poison(page, huge, release)) |
|---|
| 1832 | + ret = -EBUSY; |
|---|
| 1833 | + } else { |
|---|
| 1817 | 1834 | if (!list_empty(&pagelist)) |
|---|
| 1818 | 1835 | putback_movable_pages(&pagelist); |
|---|
| 1819 | 1836 | |
|---|
| 1820 | | - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", |
|---|
| 1821 | | - pfn, ret, page->flags, &page->flags); |
|---|
| 1837 | + pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n", |
|---|
| 1838 | + pfn, msg_page[huge], ret, page->flags, &page->flags); |
|---|
| 1822 | 1839 | if (ret > 0) |
|---|
| 1823 | | - ret = -EIO; |
|---|
| 1840 | + ret = -EBUSY; |
|---|
| 1824 | 1841 | } |
|---|
| 1825 | 1842 | } else { |
|---|
| 1826 | | - pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", |
|---|
| 1827 | | - pfn, ret, page_count(page), page->flags, &page->flags); |
|---|
| 1843 | + pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n", |
|---|
| 1844 | + pfn, msg_page[huge], page_count(page), page->flags, &page->flags); |
|---|
| 1845 | + ret = -EBUSY; |
|---|
| 1828 | 1846 | } |
|---|
| 1829 | 1847 | return ret; |
|---|
| 1830 | 1848 | } |
|---|
| 1831 | 1849 | |
|---|
| 1832 | | -static int soft_offline_in_use_page(struct page *page, int flags) |
|---|
| 1850 | +static int soft_offline_in_use_page(struct page *page) |
|---|
| 1833 | 1851 | { |
|---|
| 1834 | | - int ret; |
|---|
| 1835 | | - int mt; |
|---|
| 1836 | 1852 | struct page *hpage = compound_head(page); |
|---|
| 1837 | 1853 | |
|---|
| 1838 | | - if (!PageHuge(page) && PageTransHuge(hpage)) { |
|---|
| 1839 | | - lock_page(page); |
|---|
| 1840 | | - if (!PageAnon(page) || unlikely(split_huge_page(page))) { |
|---|
| 1841 | | - unlock_page(page); |
|---|
| 1842 | | - if (!PageAnon(page)) |
|---|
| 1843 | | - pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); |
|---|
| 1844 | | - else |
|---|
| 1845 | | - pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); |
|---|
| 1846 | | - put_hwpoison_page(page); |
|---|
| 1854 | + if (!PageHuge(page) && PageTransHuge(hpage)) |
|---|
| 1855 | + if (try_to_split_thp_page(page, "soft offline") < 0) |
|---|
| 1847 | 1856 | return -EBUSY; |
|---|
| 1848 | | - } |
|---|
| 1849 | | - unlock_page(page); |
|---|
| 1850 | | - } |
|---|
| 1851 | | - |
|---|
| 1852 | | - /* |
|---|
| 1853 | | - * Setting MIGRATE_ISOLATE here ensures that the page will be linked |
|---|
| 1854 | | - * to free list immediately (not via pcplist) when released after |
|---|
| 1855 | | - * successful page migration. Otherwise we can't guarantee that the |
|---|
| 1856 | | - * page is really free after put_page() returns, so |
|---|
| 1857 | | - * set_hwpoison_free_buddy_page() highly likely fails. |
|---|
| 1858 | | - */ |
|---|
| 1859 | | - mt = get_pageblock_migratetype(page); |
|---|
| 1860 | | - set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
|---|
| 1861 | | - if (PageHuge(page)) |
|---|
| 1862 | | - ret = soft_offline_huge_page(page, flags); |
|---|
| 1863 | | - else |
|---|
| 1864 | | - ret = __soft_offline_page(page, flags); |
|---|
| 1865 | | - set_pageblock_migratetype(page, mt); |
|---|
| 1866 | | - return ret; |
|---|
| 1857 | + return __soft_offline_page(page); |
|---|
| 1867 | 1858 | } |
|---|
| 1868 | 1859 | |
|---|
| 1869 | | -static int soft_offline_free_page(struct page *page) |
|---|
| 1860 | +static void put_ref_page(struct page *page) |
|---|
| 1870 | 1861 | { |
|---|
| 1871 | | - int rc = dissolve_free_huge_page(page); |
|---|
| 1872 | | - |
|---|
| 1873 | | - if (!rc) { |
|---|
| 1874 | | - if (set_hwpoison_free_buddy_page(page)) |
|---|
| 1875 | | - num_poisoned_pages_inc(); |
|---|
| 1876 | | - else |
|---|
| 1877 | | - rc = -EBUSY; |
|---|
| 1878 | | - } |
|---|
| 1879 | | - return rc; |
|---|
| 1862 | + if (page) |
|---|
| 1863 | + put_page(page); |
|---|
| 1880 | 1864 | } |
|---|
| 1881 | 1865 | |
|---|
| 1882 | 1866 | /** |
|---|
| 1883 | 1867 | * soft_offline_page - Soft offline a page. |
|---|
| 1884 | | - * @page: page to offline |
|---|
| 1868 | + * @pfn: pfn to soft-offline |
|---|
| 1885 | 1869 | * @flags: flags. Same as memory_failure(). |
|---|
| 1886 | 1870 | * |
|---|
| 1887 | 1871 | * Returns 0 on success, otherwise negated errno. |
|---|
| .. | .. |
|---|
| 1901 | 1885 | * This is not a 100% solution for all memory, but tries to be |
|---|
| 1902 | 1886 | * ``good enough'' for the majority of memory. |
|---|
| 1903 | 1887 | */ |
|---|
| 1904 | | -int soft_offline_page(struct page *page, int flags) |
|---|
| 1888 | +int soft_offline_page(unsigned long pfn, int flags) |
|---|
| 1905 | 1889 | { |
|---|
| 1906 | 1890 | int ret; |
|---|
| 1907 | | - unsigned long pfn = page_to_pfn(page); |
|---|
| 1891 | + bool try_again = true; |
|---|
| 1892 | + struct page *page, *ref_page = NULL; |
|---|
| 1908 | 1893 | |
|---|
| 1909 | | - if (is_zone_device_page(page)) { |
|---|
| 1910 | | - pr_debug_ratelimited("soft_offline: %#lx page is device page\n", |
|---|
| 1911 | | - pfn); |
|---|
| 1912 | | - if (flags & MF_COUNT_INCREASED) |
|---|
| 1913 | | - put_page(page); |
|---|
| 1894 | + WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); |
|---|
| 1895 | + |
|---|
| 1896 | + if (!pfn_valid(pfn)) |
|---|
| 1897 | + return -ENXIO; |
|---|
| 1898 | + if (flags & MF_COUNT_INCREASED) |
|---|
| 1899 | + ref_page = pfn_to_page(pfn); |
|---|
| 1900 | + |
|---|
| 1901 | + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ |
|---|
| 1902 | + page = pfn_to_online_page(pfn); |
|---|
| 1903 | + if (!page) { |
|---|
| 1904 | + put_ref_page(ref_page); |
|---|
| 1914 | 1905 | return -EIO; |
|---|
| 1915 | 1906 | } |
|---|
| 1916 | 1907 | |
|---|
| 1917 | 1908 | if (PageHWPoison(page)) { |
|---|
| 1918 | | - pr_info("soft offline: %#lx page already poisoned\n", pfn); |
|---|
| 1919 | | - if (flags & MF_COUNT_INCREASED) |
|---|
| 1920 | | - put_hwpoison_page(page); |
|---|
| 1921 | | - return -EBUSY; |
|---|
| 1909 | + pr_info("%s: %#lx page already poisoned\n", __func__, pfn); |
|---|
| 1910 | + put_ref_page(ref_page); |
|---|
| 1911 | + return 0; |
|---|
| 1922 | 1912 | } |
|---|
| 1923 | 1913 | |
|---|
| 1914 | +retry: |
|---|
| 1924 | 1915 | get_online_mems(); |
|---|
| 1925 | | - ret = get_any_page(page, pfn, flags); |
|---|
| 1916 | + ret = get_any_page(page, flags); |
|---|
| 1926 | 1917 | put_online_mems(); |
|---|
| 1927 | 1918 | |
|---|
| 1928 | | - if (ret > 0) |
|---|
| 1929 | | - ret = soft_offline_in_use_page(page, flags); |
|---|
| 1930 | | - else if (ret == 0) |
|---|
| 1931 | | - ret = soft_offline_free_page(page); |
|---|
| 1919 | + if (ret > 0) { |
|---|
| 1920 | + ret = soft_offline_in_use_page(page); |
|---|
| 1921 | + } else if (ret == 0) { |
|---|
| 1922 | + if (!page_handle_poison(page, true, false)) { |
|---|
| 1923 | + if (try_again) { |
|---|
| 1924 | + try_again = false; |
|---|
| 1925 | + flags &= ~MF_COUNT_INCREASED; |
|---|
| 1926 | + goto retry; |
|---|
| 1927 | + } |
|---|
| 1928 | + ret = -EBUSY; |
|---|
| 1929 | + } |
|---|
| 1930 | + } else if (ret == -EIO) { |
|---|
| 1931 | + pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", |
|---|
| 1932 | + __func__, pfn, page->flags, &page->flags); |
|---|
| 1933 | + } |
|---|
| 1932 | 1934 | |
|---|
| 1933 | 1935 | return ret; |
|---|
| 1934 | 1936 | } |
|---|