2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memory-failure.c
@@ -1,10 +1,7 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Copyright (C) 2008, 2009 Intel Corporation
34 * Authors: Andi Kleen, Fengguang Wu
4
- *
5
- * This software may be redistributed and/or modified under the terms of
6
- * the GNU General Public License ("GPL") version 2 only as published by the
7
- * Free Software Foundation.
85 *
96 * High level machine check handler. Handles pages reported by the
107 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
@@ -67,6 +64,33 @@
6764 int sysctl_memory_failure_recovery __read_mostly = 1;
6865
6966 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
67
+
68
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
69
+{
70
+ if (hugepage_or_freepage) {
71
+ /*
72
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
73
+ * returns 0 for non-hugetlb pages as well.
74
+ */
75
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
76
+ /*
77
+ * We could fail to take off the target page from buddy
78
+ * for example due to racy page allocation, but that's
79
+ * acceptable because soft-offlined page is not broken
80
+ * and if someone really wants to use it, they should
81
+ * take it.
82
+ */
83
+ return false;
84
+ }
85
+
86
+ SetPageHWPoison(page);
87
+ if (release)
88
+ put_page(page);
89
+ page_ref_inc(page);
90
+ num_poisoned_pages_inc();
91
+
92
+ return true;
93
+}
7094
7195 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
7296
@@ -213,14 +237,15 @@
213237 {
214238 struct task_struct *t = tk->tsk;
215239 short addr_lsb = tk->size_shift;
216
- int ret;
240
+ int ret = 0;
217241
218
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
219
- pfn, t->comm, t->pid);
242
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
243
+ pfn, t->comm, t->pid);
220244
221
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
222
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
223
- addr_lsb, current);
245
+ if (flags & MF_ACTION_REQUIRED) {
246
+ WARN_ON_ONCE(t != current);
247
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
248
+ (void __user *)tk->addr, addr_lsb);
224249 } else {
225250 /*
226251 * Don't use force here, it's convenient if the signal
@@ -306,30 +331,24 @@
306331 /*
307332 * Schedule a process for later kill.
308333 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
309
- * TBD would GFP_NOIO be enough?
310334 */
311335 static void add_to_kill(struct task_struct *tsk, struct page *p,
312336 struct vm_area_struct *vma,
313
- struct list_head *to_kill,
314
- struct to_kill **tkc)
337
+ struct list_head *to_kill)
315338 {
316339 struct to_kill *tk;
317340
318
- if (*tkc) {
319
- tk = *tkc;
320
- *tkc = NULL;
321
- } else {
322
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
323
- if (!tk) {
324
- pr_err("Memory failure: Out of memory while machine check handling\n");
325
- return;
326
- }
341
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
342
+ if (!tk) {
343
+ pr_err("Memory failure: Out of memory while machine check handling\n");
344
+ return;
327345 }
346
+
328347 tk->addr = page_address_in_vma(p, vma);
329348 if (is_zone_device_page(p))
330349 tk->size_shift = dev_pagemap_mapping_shift(p, vma);
331350 else
332
- tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
351
+ tk->size_shift = page_shift(compound_head(p));
333352
334353 /*
335354 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -348,6 +367,7 @@
348367 kfree(tk);
349368 return;
350369 }
370
+
351371 get_task_struct(tsk);
352372 tk->tsk = tsk;
353373 list_add_tail(&tk->nd, to_kill);
@@ -407,9 +427,15 @@
407427 {
408428 struct task_struct *t;
409429
410
- for_each_thread(tsk, t)
411
- if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
412
- return t;
430
+ for_each_thread(tsk, t) {
431
+ if (t->flags & PF_MCE_PROCESS) {
432
+ if (t->flags & PF_MCE_EARLY)
433
+ return t;
434
+ } else {
435
+ if (sysctl_memory_failure_early_kill)
436
+ return t;
437
+ }
438
+ }
413439 return NULL;
414440 }
415441
@@ -418,35 +444,40 @@
418444 * to be signaled when some page under the process is hwpoisoned.
419445 * Return task_struct of the dedicated thread (main thread unless explicitly
420446 * specified) if the process is "early kill," and otherwise returns NULL.
447
+ *
448
+ * Note that the above is true for Action Optional case, but not for Action
449
+ * Required case where SIGBUS should be sent only to the current thread.
421450 */
422451 static struct task_struct *task_early_kill(struct task_struct *tsk,
423452 int force_early)
424453 {
425
- struct task_struct *t;
426454 if (!tsk->mm)
427455 return NULL;
428
- if (force_early)
429
- return tsk;
430
- t = find_early_kill_thread(tsk);
431
- if (t)
432
- return t;
433
- if (sysctl_memory_failure_early_kill)
434
- return tsk;
435
- return NULL;
456
+ if (force_early) {
457
+ /*
458
+ * Comparing ->mm here because current task might represent
459
+ * a subthread, while tsk always points to the main thread.
460
+ */
461
+ if (tsk->mm == current->mm)
462
+ return current;
463
+ else
464
+ return NULL;
465
+ }
466
+ return find_early_kill_thread(tsk);
436467 }
437468
438469 /*
439470 * Collect processes when the error hit an anonymous page.
440471 */
441472 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
442
- struct to_kill **tkc, int force_early)
473
+ int force_early)
443474 {
444475 struct vm_area_struct *vma;
445476 struct task_struct *tsk;
446477 struct anon_vma *av;
447478 pgoff_t pgoff;
448479
449
- av = page_lock_anon_vma_read(page);
480
+ av = page_lock_anon_vma_read(page, NULL);
450481 if (av == NULL) /* Not actually mapped anymore */
451482 return;
452483
@@ -464,7 +495,7 @@
464495 if (!page_mapped_in_vma(page, vma))
465496 continue;
466497 if (vma->vm_mm == t->mm)
467
- add_to_kill(t, page, vma, to_kill, tkc);
498
+ add_to_kill(t, page, vma, to_kill);
468499 }
469500 }
470501 read_unlock(&tasklist_lock);
@@ -475,16 +506,17 @@
475506 * Collect processes when the error hit a file mapped page.
476507 */
477508 static void collect_procs_file(struct page *page, struct list_head *to_kill,
478
- struct to_kill **tkc, int force_early)
509
+ int force_early)
479510 {
480511 struct vm_area_struct *vma;
481512 struct task_struct *tsk;
482513 struct address_space *mapping = page->mapping;
514
+ pgoff_t pgoff;
483515
484516 i_mmap_lock_read(mapping);
485517 read_lock(&tasklist_lock);
518
+ pgoff = page_to_pgoff(page);
486519 for_each_process(tsk) {
487
- pgoff_t pgoff = page_to_pgoff(page);
488520 struct task_struct *t = task_early_kill(tsk, force_early);
489521
490522 if (!t)
@@ -499,7 +531,7 @@
499531 * to be informed of all such data corruptions.
500532 */
501533 if (vma->vm_mm == t->mm)
502
- add_to_kill(t, page, vma, to_kill, tkc);
534
+ add_to_kill(t, page, vma, to_kill);
503535 }
504536 }
505537 read_unlock(&tasklist_lock);
@@ -508,26 +540,17 @@
508540
509541 /*
510542 * Collect the processes who have the corrupted page mapped to kill.
511
- * This is done in two steps for locking reasons.
512
- * First preallocate one tokill structure outside the spin locks,
513
- * so that we can kill at least one process reasonably reliable.
514543 */
515544 static void collect_procs(struct page *page, struct list_head *tokill,
516545 int force_early)
517546 {
518
- struct to_kill *tk;
519
-
520547 if (!page->mapping)
521548 return;
522549
523
- tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
524
- if (!tk)
525
- return;
526550 if (PageAnon(page))
527
- collect_procs_anon(page, tokill, &tk, force_early);
551
+ collect_procs_anon(page, tokill, force_early);
528552 else
529
- collect_procs_file(page, tokill, &tk, force_early);
530
- kfree(tk);
553
+ collect_procs_file(page, tokill, force_early);
531554 }
532555
533556 static const char *action_name[] = {
@@ -559,6 +582,7 @@
559582 [MF_MSG_BUDDY] = "free buddy page",
560583 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
561584 [MF_MSG_DAX] = "dax page",
585
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
562586 [MF_MSG_UNKNOWN] = "unknown page",
563587 };
564588
@@ -829,7 +853,6 @@
829853 #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
830854 #define unevict (1UL << PG_unevictable)
831855 #define mlock (1UL << PG_mlocked)
832
-#define writeback (1UL << PG_writeback)
833856 #define lru (1UL << PG_lru)
834857 #define head (1UL << PG_head)
835858 #define slab (1UL << PG_slab)
@@ -878,7 +901,6 @@
878901 #undef sc
879902 #undef unevict
880903 #undef mlock
881
-#undef writeback
882904 #undef lru
883905 #undef head
884906 #undef slab
@@ -930,7 +952,7 @@
930952 * Return: return 0 if failed to grab the refcount, otherwise true (some
931953 * non-zero value.)
932954 */
933
-int get_hwpoison_page(struct page *page)
955
+static int get_hwpoison_page(struct page *page)
934956 {
935957 struct page *head = compound_head(page);
936958
@@ -959,7 +981,6 @@
959981
960982 return 0;
961983 }
962
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
963984
964985 /*
965986 * Do all that is necessary to remove user space mappings. Unmap
....@@ -968,10 +989,10 @@
968989 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
969990 int flags, struct page **hpagep)
970991 {
971
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
992
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK;
972993 struct address_space *mapping;
973994 LIST_HEAD(tokill);
974
- bool unmap_success;
995
+ bool unmap_success = true;
975996 int kill = 1, forcekill;
976997 struct page *hpage = *hpagep;
977998 bool mlocked = PageMlocked(hpage);
@@ -1011,7 +1032,7 @@
10111032 */
10121033 mapping = page_mapping(hpage);
10131034 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
1014
- mapping_cap_writeback_dirty(mapping)) {
1035
+ mapping_can_writeback(mapping)) {
10151036 if (page_mkclean(hpage)) {
10161037 SetPageDirty(hpage);
10171038 } else {
@@ -1033,7 +1054,30 @@
10331054 if (kill)
10341055 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
10351056
1036
- unmap_success = try_to_unmap(hpage, ttu);
1057
+ if (!PageHuge(hpage)) {
1058
+ unmap_success = try_to_unmap(hpage, ttu);
1059
+ } else {
1060
+ if (!PageAnon(hpage)) {
1061
+ /*
1062
+ * For hugetlb pages in shared mappings, try_to_unmap
1063
+ * could potentially call huge_pmd_unshare. Because of
1064
+ * this, take semaphore in write mode here and set
1065
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
1066
+ * at this higher level.
1067
+ */
1068
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1069
+ if (mapping) {
1070
+ unmap_success = try_to_unmap(hpage,
1071
+ ttu|TTU_RMAP_LOCKED);
1072
+ i_mmap_unlock_write(mapping);
1073
+ } else {
1074
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
1075
+ unmap_success = false;
1076
+ }
1077
+ } else {
1078
+ unmap_success = try_to_unmap(hpage, ttu);
1079
+ }
1080
+ }
10371081 if (!unmap_success)
10381082 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
10391083 pfn, page_mapcount(hpage));
@@ -1084,6 +1128,25 @@
10841128 return page_action(ps, p, pfn);
10851129 }
10861130
1131
+static int try_to_split_thp_page(struct page *page, const char *msg)
1132
+{
1133
+ lock_page(page);
1134
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1135
+ unsigned long pfn = page_to_pfn(page);
1136
+
1137
+ unlock_page(page);
1138
+ if (!PageAnon(page))
1139
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
1140
+ else
1141
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
1142
+ put_page(page);
1143
+ return -EBUSY;
1144
+ }
1145
+ unlock_page(page);
1146
+
1147
+ return 0;
1148
+}
1149
+
10871150 static int memory_failure_hugetlb(unsigned long pfn, int flags)
10881151 {
10891152 struct page *p = pfn_to_page(pfn);
@@ -1125,7 +1188,7 @@
11251188 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
11261189 num_poisoned_pages_dec();
11271190 unlock_page(head);
1128
- put_hwpoison_page(head);
1191
+ put_page(head);
11291192 return 0;
11301193 }
11311194
@@ -1166,6 +1229,19 @@
11661229 LIST_HEAD(tokill);
11671230 int rc = -EBUSY;
11681231 loff_t start;
1232
+ dax_entry_t cookie;
1233
+
1234
+ if (flags & MF_COUNT_INCREASED)
1235
+ /*
1236
+ * Drop the extra refcount in case we come from madvise().
1237
+ */
1238
+ put_page(page);
1239
+
1240
+ /* device metadata space is not recoverable */
1241
+ if (!pgmap_pfn_valid(pgmap, pfn)) {
1242
+ rc = -ENXIO;
1243
+ goto out;
1244
+ }
11691245
11701246 /*
11711247 * Prevent the inode from being freed while we are interrogating
@@ -1174,7 +1250,8 @@
11741250 * also prevents changes to the mapping of this pfn until
11751251 * poison signaling is complete.
11761252 */
1177
- if (!dax_lock_mapping_entry(page))
1253
+ cookie = dax_lock_page(page);
1254
+ if (!cookie)
11781255 goto out;
11791256
11801257 if (hwpoison_filter(page)) {
@@ -1182,16 +1259,12 @@
11821259 goto unlock;
11831260 }
11841261
1185
- switch (pgmap->type) {
1186
- case MEMORY_DEVICE_PRIVATE:
1187
- case MEMORY_DEVICE_PUBLIC:
1262
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
11881263 /*
11891264 * TODO: Handle HMM pages which may need coordination
11901265 * with device-side memory.
11911266 */
11921267 goto unlock;
1193
- default:
1194
- break;
11951268 }
11961269
11971270 /*
@@ -1225,7 +1298,7 @@
12251298 kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
12261299 rc = 0;
12271300 unlock:
1228
- dax_unlock_mapping_entry(page);
1301
+ dax_unlock_page(page, cookie);
12291302 out:
12301303 /* drop pgmap ref acquired in caller */
12311304 put_dev_pagemap(pgmap);
@@ -1308,23 +1381,11 @@
13081381 }
13091382
13101383 if (PageTransHuge(hpage)) {
1311
- lock_page(p);
1312
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
1313
- unlock_page(p);
1314
- if (!PageAnon(p))
1315
- pr_err("Memory failure: %#lx: non anonymous thp\n",
1316
- pfn);
1317
- else
1318
- pr_err("Memory failure: %#lx: thp split failed\n",
1319
- pfn);
1320
- if (TestClearPageHWPoison(p))
1321
- num_poisoned_pages_dec();
1322
- put_hwpoison_page(p);
1384
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
1385
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
13231386 return -EBUSY;
13241387 }
1325
- unlock_page(p);
13261388 VM_BUG_ON_PAGE(!page_count(p), p);
1327
- hpage = compound_head(p);
13281389 }
13291390
13301391 /*
@@ -1364,10 +1425,7 @@
13641425 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
13651426 * correctly, we save a copy of the page flags at this time.
13661427 */
1367
- if (PageHuge(p))
1368
- page_flags = hpage->flags;
1369
- else
1370
- page_flags = p->flags;
1428
+ page_flags = p->flags;
13711429
13721430 /*
13731431 * unpoison always clear PG_hwpoison inside page lock
@@ -1376,14 +1434,14 @@
13761434 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
13771435 num_poisoned_pages_dec();
13781436 unlock_page(p);
1379
- put_hwpoison_page(p);
1437
+ put_page(p);
13801438 return 0;
13811439 }
13821440 if (hwpoison_filter(p)) {
13831441 if (TestClearPageHWPoison(p))
13841442 num_poisoned_pages_dec();
13851443 unlock_page(p);
1386
- put_hwpoison_page(p);
1444
+ put_page(p);
13871445 return 0;
13881446 }
13891447
@@ -1404,11 +1462,8 @@
14041462 /*
14051463 * Now take care of user space mappings.
14061464 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1407
- *
1408
- * When the raw error page is thp tail page, hpage points to the raw
1409
- * page after thp split.
14101465 */
1411
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1466
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
14121467 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
14131468 res = -EBUSY;
14141469 goto out;
@@ -1492,7 +1547,7 @@
14921547 unsigned long proc_flags;
14931548 int gotten;
14941549
1495
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1550
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
14961551 for (;;) {
14971552 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
14981553 gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1500,10 +1555,23 @@
15001555 if (!gotten)
15011556 break;
15021557 if (entry.flags & MF_SOFT_OFFLINE)
1503
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1558
+ soft_offline_page(entry.pfn, entry.flags);
15041559 else
15051560 memory_failure(entry.pfn, entry.flags);
15061561 }
1562
+}
1563
+
1564
+/*
1565
+ * Process memory_failure work queued on the specified CPU.
1566
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
1567
+ */
1568
+void memory_failure_queue_kick(int cpu)
1569
+{
1570
+ struct memory_failure_cpu *mf_cpu;
1571
+
1572
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1573
+ cancel_work_sync(&mf_cpu->work);
1574
+ memory_failure_work_func(&mf_cpu->work);
15071575 }
15081576
15091577 static int __init memory_failure_init(void)
@@ -1612,147 +1680,113 @@
16121680 }
16131681 unlock_page(page);
16141682
1615
- put_hwpoison_page(page);
1683
+ put_page(page);
16161684 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1617
- put_hwpoison_page(page);
1685
+ put_page(page);
16181686
16191687 return 0;
16201688 }
16211689 EXPORT_SYMBOL(unpoison_memory);
16221690
1623
-static struct page *new_page(struct page *p, unsigned long private)
1691
+/*
1692
+ * Safely get reference count of an arbitrary page.
1693
+ * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we
1694
+ * cannot handle and -EBUSY if we raced with an allocation.
1695
+ * The refcount is only incremented if the page was already in use and is of
1696
+ * a known type we can handle.
1697
+ */
1698
+static int get_any_page(struct page *p, int flags)
16241699 {
1625
- int nid = page_to_nid(p);
1700
+ int ret = 0, pass = 0;
1701
+ bool count_increased = false;
16261702
1627
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
1703
+ if (flags & MF_COUNT_INCREASED)
1704
+ count_increased = true;
1705
+
1706
+try_again:
1707
+ if (!count_increased && !get_hwpoison_page(p)) {
1708
+ if (page_count(p)) {
1709
+ /* We raced with an allocation, retry. */
1710
+ if (pass++ < 3)
1711
+ goto try_again;
1712
+ ret = -EBUSY;
1713
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1714
+ /* We raced with put_page, retry. */
1715
+ if (pass++ < 3)
1716
+ goto try_again;
1717
+ ret = -EIO;
1718
+ }
1719
+ } else {
1720
+ if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
1721
+ ret = 1;
1722
+ } else {
1723
+ /*
1724
+ * A page we cannot handle. Check whether we can turn
1725
+ * it into something we can handle.
1726
+ */
1727
+ if (pass++ < 3) {
1728
+ put_page(p);
1729
+ shake_page(p, 1);
1730
+ count_increased = false;
1731
+ goto try_again;
1732
+ }
1733
+ put_page(p);
1734
+ ret = -EIO;
1735
+ }
1736
+ }
1737
+
1738
+ return ret;
1739
+}
1740
+
1741
+static bool isolate_page(struct page *page, struct list_head *pagelist)
1742
+{
1743
+ bool isolated = false;
1744
+ bool lru = PageLRU(page);
1745
+
1746
+ if (PageHuge(page)) {
1747
+ isolated = !isolate_hugetlb(page, pagelist);
1748
+ } else {
1749
+ if (lru)
1750
+ isolated = !isolate_lru_page(page);
1751
+ else
1752
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1753
+
1754
+ if (isolated)
1755
+ list_add(&page->lru, pagelist);
1756
+ }
1757
+
1758
+ if (isolated && lru)
1759
+ inc_node_page_state(page, NR_ISOLATED_ANON +
1760
+ page_is_file_lru(page));
1761
+
1762
+ /*
1763
+ * If we succeed in isolating the page, we grabbed another refcount on
1764
+ * the page, so we can safely drop the one we got from get_any_page().
1765
+ * If we failed to isolate the page, it means that we cannot go further
1766
+ * and we will return an error, so drop the reference we got from
1767
+ * get_any_page() as well.
1768
+ */
1769
+ put_page(page);
1770
+ return isolated;
16281771 }
16291772
16301773 /*
1631
- * Safely get reference count of an arbitrary page.
1632
- * Returns 0 for a free page, -EIO for a zero refcount page
1633
- * that is not free, and 1 for any other page type.
1634
- * For 1 the page is returned with increased page count, otherwise not.
1774
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
1775
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
1776
+ * If the page is mapped, it migrates the contents over.
16351777 */
1636
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1778
+static int __soft_offline_page(struct page *page)
16371779 {
1638
- int ret;
1639
-
1640
- if (flags & MF_COUNT_INCREASED)
1641
- return 1;
1642
-
1643
- /*
1644
- * When the target page is a free hugepage, just remove it
1645
- * from free hugepage list.
1646
- */
1647
- if (!get_hwpoison_page(p)) {
1648
- if (PageHuge(p)) {
1649
- pr_info("%s: %#lx free huge page\n", __func__, pfn);
1650
- ret = 0;
1651
- } else if (is_free_buddy_page(p)) {
1652
- pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1653
- ret = 0;
1654
- } else {
1655
- pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1656
- __func__, pfn, p->flags);
1657
- ret = -EIO;
1658
- }
1659
- } else {
1660
- /* Not a free page */
1661
- ret = 1;
1662
- }
1663
- return ret;
1664
-}
1665
-
1666
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
1667
-{
1668
- int ret = __get_any_page(page, pfn, flags);
1669
-
1670
- if (ret == 1 && !PageHuge(page) &&
1671
- !PageLRU(page) && !__PageMovable(page)) {
1672
- /*
1673
- * Try to free it.
1674
- */
1675
- put_hwpoison_page(page);
1676
- shake_page(page, 1);
1677
-
1678
- /*
1679
- * Did it turn free?
1680
- */
1681
- ret = __get_any_page(page, pfn, 0);
1682
- if (ret == 1 && !PageLRU(page)) {
1683
- /* Drop page reference which is from __get_any_page() */
1684
- put_hwpoison_page(page);
1685
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
1686
- pfn, page->flags, &page->flags);
1687
- return -EIO;
1688
- }
1689
- }
1690
- return ret;
1691
-}
1692
-
1693
-static int soft_offline_huge_page(struct page *page, int flags)
1694
-{
1695
- int ret;
1780
+ int ret = 0;
16961781 unsigned long pfn = page_to_pfn(page);
16971782 struct page *hpage = compound_head(page);
1783
+ char const *msg_page[] = {"page", "hugepage"};
1784
+ bool huge = PageHuge(page);
16981785 LIST_HEAD(pagelist);
1699
-
1700
- /*
1701
- * This double-check of PageHWPoison is to avoid the race with
1702
- * memory_failure(). See also comment in __soft_offline_page().
1703
- */
1704
- lock_page(hpage);
1705
- if (PageHWPoison(hpage)) {
1706
- unlock_page(hpage);
1707
- put_hwpoison_page(hpage);
1708
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1709
- return -EBUSY;
1710
- }
1711
- unlock_page(hpage);
1712
-
1713
- ret = isolate_huge_page(hpage, &pagelist);
1714
- /*
1715
- * get_any_page() and isolate_huge_page() takes a refcount each,
1716
- * so need to drop one here.
1717
- */
1718
- put_hwpoison_page(hpage);
1719
- if (!ret) {
1720
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1721
- return -EBUSY;
1722
- }
1723
-
1724
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1725
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
1726
- if (ret) {
1727
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1728
- pfn, ret, page->flags, &page->flags);
1729
- if (!list_empty(&pagelist))
1730
- putback_movable_pages(&pagelist);
1731
- if (ret > 0)
1732
- ret = -EIO;
1733
- } else {
1734
- /*
1735
- * We set PG_hwpoison only when the migration source hugepage
1736
- * was successfully dissolved, because otherwise hwpoisoned
1737
- * hugepage remains on free hugepage list, then userspace will
1738
- * find it as SIGBUS by allocation failure. That's not expected
1739
- * in soft-offlining.
1740
- */
1741
- ret = dissolve_free_huge_page(page);
1742
- if (!ret) {
1743
- if (set_hwpoison_free_buddy_page(page))
1744
- num_poisoned_pages_inc();
1745
- else
1746
- ret = -EBUSY;
1747
- }
1748
- }
1749
- return ret;
1750
-}
1751
-
1752
-static int __soft_offline_page(struct page *page, int flags)
1753
-{
1754
- int ret;
1755
- unsigned long pfn = page_to_pfn(page);
1786
+ struct migration_target_control mtc = {
1787
+ .nid = NUMA_NO_NODE,
1788
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
1789
+ };
17561790
17571791 /*
17581792 * Check PageHWPoison again inside page lock because PageHWPoison
@@ -1761,127 +1795,77 @@
17611795 * so there's no race between soft_offline_page() and memory_failure().
17621796 */
17631797 lock_page(page);
1764
- wait_on_page_writeback(page);
1798
+ if (!PageHuge(page))
1799
+ wait_on_page_writeback(page);
17651800 if (PageHWPoison(page)) {
17661801 unlock_page(page);
1767
- put_hwpoison_page(page);
1802
+ put_page(page);
17681803 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1769
- return -EBUSY;
1804
+ return 0;
17701805 }
1771
- /*
1772
- * Try to invalidate first. This should work for
1773
- * non dirty unmapped page cache pages.
1774
- */
1775
- ret = invalidate_inode_page(page);
1806
+
1807
+ if (!PageHuge(page))
1808
+ /*
1809
+ * Try to invalidate first. This should work for
1810
+ * non dirty unmapped page cache pages.
1811
+ */
1812
+ ret = invalidate_inode_page(page);
17761813 unlock_page(page);
1814
+
17771815 /*
17781816 * RED-PEN would be better to keep it isolated here, but we
17791817 * would need to fix isolation locking first.
17801818 */
1781
- if (ret == 1) {
1782
- put_hwpoison_page(page);
1819
+ if (ret) {
17831820 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1784
- SetPageHWPoison(page);
1785
- num_poisoned_pages_inc();
1821
+ page_handle_poison(page, false, true);
17861822 return 0;
17871823 }
17881824
1789
- /*
1790
- * Simple invalidation didn't work.
1791
- * Try to migrate to a new page instead. migrate.c
1792
- * handles a large number of cases for us.
1793
- */
1794
- if (PageLRU(page))
1795
- ret = isolate_lru_page(page);
1796
- else
1797
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1798
- /*
1799
- * Drop page reference which is came from get_any_page()
1800
- * successful isolate_lru_page() already took another one.
1801
- */
1802
- put_hwpoison_page(page);
1803
- if (!ret) {
1804
- LIST_HEAD(pagelist);
1805
- /*
1806
- * After isolated lru page, the PageLRU will be cleared,
1807
- * so use !__PageMovable instead for LRU page's mapping
1808
- * cannot have PAGE_MAPPING_MOVABLE.
1809
- */
1810
- if (!__PageMovable(page))
1811
- inc_node_page_state(page, NR_ISOLATED_ANON +
1812
- page_is_file_cache(page));
1813
- list_add(&page->lru, &pagelist);
1814
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1815
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
1816
- if (ret) {
1825
+ if (isolate_page(hpage, &pagelist)) {
1826
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
1827
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
1828
+ if (!ret) {
1829
+ bool release = !huge;
1830
+
1831
+ if (!page_handle_poison(page, huge, release))
1832
+ ret = -EBUSY;
1833
+ } else {
18171834 if (!list_empty(&pagelist))
18181835 putback_movable_pages(&pagelist);
18191836
1820
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
1821
- pfn, ret, page->flags, &page->flags);
1837
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
1838
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
18221839 if (ret > 0)
1823
- ret = -EIO;
1840
+ ret = -EBUSY;
18241841 }
18251842 } else {
1826
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
1827
- pfn, ret, page_count(page), page->flags, &page->flags);
1843
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
1844
+ pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
1845
+ ret = -EBUSY;
18281846 }
18291847 return ret;
18301848 }
18311849
1832
-static int soft_offline_in_use_page(struct page *page, int flags)
1850
+static int soft_offline_in_use_page(struct page *page)
18331851 {
1834
- int ret;
1835
- int mt;
18361852 struct page *hpage = compound_head(page);
18371853
1838
- if (!PageHuge(page) && PageTransHuge(hpage)) {
1839
- lock_page(page);
1840
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1841
- unlock_page(page);
1842
- if (!PageAnon(page))
1843
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1844
- else
1845
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1846
- put_hwpoison_page(page);
1854
+ if (!PageHuge(page) && PageTransHuge(hpage))
1855
+ if (try_to_split_thp_page(page, "soft offline") < 0)
18471856 return -EBUSY;
1848
- }
1849
- unlock_page(page);
1850
- }
1851
-
1852
- /*
1853
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
1854
- * to free list immediately (not via pcplist) when released after
1855
- * successful page migration. Otherwise we can't guarantee that the
1856
- * page is really free after put_page() returns, so
1857
- * set_hwpoison_free_buddy_page() highly likely fails.
1858
- */
1859
- mt = get_pageblock_migratetype(page);
1860
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
1861
- if (PageHuge(page))
1862
- ret = soft_offline_huge_page(page, flags);
1863
- else
1864
- ret = __soft_offline_page(page, flags);
1865
- set_pageblock_migratetype(page, mt);
1866
- return ret;
1857
+ return __soft_offline_page(page);
18671858 }
18681859
1869
-static int soft_offline_free_page(struct page *page)
1860
+static void put_ref_page(struct page *page)
18701861 {
1871
- int rc = dissolve_free_huge_page(page);
1872
-
1873
- if (!rc) {
1874
- if (set_hwpoison_free_buddy_page(page))
1875
- num_poisoned_pages_inc();
1876
- else
1877
- rc = -EBUSY;
1878
- }
1879
- return rc;
1862
+ if (page)
1863
+ put_page(page);
18801864 }
18811865
18821866 /**
18831867 * soft_offline_page - Soft offline a page.
1884
- * @page: page to offline
1868
+ * @pfn: pfn to soft-offline
18851869 * @flags: flags. Same as memory_failure().
18861870 *
18871871 * Returns 0 on success, otherwise negated errno.
@@ -1901,34 +1885,52 @@
19011885 * This is not a 100% solution for all memory, but tries to be
19021886 * ``good enough'' for the majority of memory.
19031887 */
1904
-int soft_offline_page(struct page *page, int flags)
1888
+int soft_offline_page(unsigned long pfn, int flags)
19051889 {
19061890 int ret;
1907
- unsigned long pfn = page_to_pfn(page);
1891
+ bool try_again = true;
1892
+ struct page *page, *ref_page = NULL;
19081893
1909
- if (is_zone_device_page(page)) {
1910
- pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
1911
- pfn);
1912
- if (flags & MF_COUNT_INCREASED)
1913
- put_page(page);
1894
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
1895
+
1896
+ if (!pfn_valid(pfn))
1897
+ return -ENXIO;
1898
+ if (flags & MF_COUNT_INCREASED)
1899
+ ref_page = pfn_to_page(pfn);
1900
+
1901
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
1902
+ page = pfn_to_online_page(pfn);
1903
+ if (!page) {
1904
+ put_ref_page(ref_page);
19141905 return -EIO;
19151906 }
19161907
19171908 if (PageHWPoison(page)) {
1918
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
1919
- if (flags & MF_COUNT_INCREASED)
1920
- put_hwpoison_page(page);
1921
- return -EBUSY;
1909
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
1910
+ put_ref_page(ref_page);
1911
+ return 0;
19221912 }
19231913
1914
+retry:
19241915 get_online_mems();
1925
- ret = get_any_page(page, pfn, flags);
1916
+ ret = get_any_page(page, flags);
19261917 put_online_mems();
19271918
1928
- if (ret > 0)
1929
- ret = soft_offline_in_use_page(page, flags);
1930
- else if (ret == 0)
1931
- ret = soft_offline_free_page(page);
1919
+ if (ret > 0) {
1920
+ ret = soft_offline_in_use_page(page);
1921
+ } else if (ret == 0) {
1922
+ if (!page_handle_poison(page, true, false)) {
1923
+ if (try_again) {
1924
+ try_again = false;
1925
+ flags &= ~MF_COUNT_INCREASED;
1926
+ goto retry;
1927
+ }
1928
+ ret = -EBUSY;
1929
+ }
1930
+ } else if (ret == -EIO) {
1931
+ pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
1932
+ __func__, pfn, page->flags, &page->flags);
1933
+ }
19321934
19331935 return ret;
19341936 }