2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/migrate.c
@@ -38,6 +38,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/pagewalk.h>
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
@@ -47,39 +48,17 @@
 #include <linux/page_owner.h>
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
+#include <linux/oom.h>
 
 #include <asm/tlbflush.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/migrate.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
+#include <trace/hooks/vmscan.h>
 
 #include "internal.h"
-
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-int migrate_prep(void)
-{
- /*
- * Clear the LRU lists so pages can be isolated.
- * Note that pages may be moved off the LRU after we have
- * drained them. Those pages will fail to migrate like other
- * pages that may be busy.
- */
- lru_add_drain_all();
-
- return 0;
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-int migrate_prep_local(void)
-{
- lru_add_drain();
-
- return 0;
-}
 
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
@@ -100,7 +79,7 @@
 /*
 * Check PageMovable before holding a PG_lock because page's owner
 * assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
 */
 if (unlikely(!__PageMovable(page)))
 goto out_putpage;
@@ -129,7 +108,7 @@
 
 /* Driver shouldn't use PG_isolated bit of page->flags */
 WARN_ON_ONCE(PageIsolated(page));
- __SetPageIsolated(page);
+ SetPageIsolated(page);
 unlock_page(page);
 
 return 0;
@@ -153,7 +132,7 @@
 
 mapping = page_mapping(page);
 mapping->a_ops->putback_page(page);
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
 }
 
 /*
@@ -186,16 +165,17 @@
 if (PageMovable(page))
 putback_movable_page(page);
 else
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
 unlock_page(page);
 put_page(page);
 } else {
 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
 putback_lru_page(page);
 }
 }
 }
+EXPORT_SYMBOL_GPL(putback_movable_pages);
 
 /*
 * Restore a potential migration pte to a working pte entry
@@ -240,15 +220,17 @@
 */
 entry = pte_to_swp_entry(*pvmw.pte);
 if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
+ pte = maybe_mkwrite(pte, vma->vm_flags);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
 
- if (unlikely(is_zone_device_page(new))) {
- if (is_device_private_page(new)) {
- entry = make_device_private_entry(new, pte_write(pte));
- pte = swp_entry_to_pte(entry);
- } else if (is_device_public_page(new)) {
- pte = pte_mkdevmap(pte);
- }
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -322,19 +304,18 @@
 goto out;
 
 page = migration_entry_to_page(entry);
+ page = compound_head(page);
 
 /*
- * Once radix-tree replacement of page migration started, page_count
- * *must* be zero. And, we don't want to call wait_on_page_locked()
- * against a page without get_page().
- * So, we use get_page_unless_zero(), here. Even failed, page fault
- * will occur again.
+ * Once page cache replacement of page migration started, page_count
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
 */
 if (!get_page_unless_zero(page))
 goto out;
 pte_unmap_unlock(ptep, ptl);
- wait_on_page_locked(page);
- put_page(page);
+ trace_android_vh_waiting_for_page_migration(page);
+ put_and_wait_on_page_locked(page);
 return;
 out:
 pte_unmap_unlock(ptep, ptl);
@@ -368,63 +349,27 @@
 if (!get_page_unless_zero(page))
 goto unlock;
 spin_unlock(ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
 return;
 unlock:
 spin_unlock(ptl);
 }
 #endif
 
-#ifdef CONFIG_BLOCK
-/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
 {
- struct buffer_head *bh = head;
+ int expected_count = 1;
 
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
+ /*
+ * Device private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ if (mapping)
+ expected_count += thp_nr_pages(page) + page_has_private(page);
 
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
- do {
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
- }
- return false;
- }
-
- bh = bh->b_this_page;
- } while (bh != head);
- return true;
+ return expected_count;
 }
-#else
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
-{
- return true;
-}
-#endif /* CONFIG_BLOCK */
 
 
 /*
@@ -435,21 +380,13 @@
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
 int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
 {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
 struct zone *oldzone, *newzone;
 int dirty;
- int expected_count = 1 + extra_count;
- void **pslot;
-
- /*
- * Device public or private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
 
 if (!mapping) {
 /* Anonymous page without mapping */
....@@ -468,35 +405,14 @@
468405 oldzone = page_zone(page);
469406 newzone = page_zone(newpage);
470407
471
- xa_lock_irq(&mapping->i_pages);
472
-
473
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
474
- page_index(page));
475
-
476
- expected_count += hpage_nr_pages(page) + page_has_private(page);
477
- if (page_count(page) != expected_count ||
478
- radix_tree_deref_slot_protected(pslot,
479
- &mapping->i_pages.xa_lock) != page) {
480
- xa_unlock_irq(&mapping->i_pages);
408
+ xas_lock_irq(&xas);
409
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
410
+ xas_unlock_irq(&xas);
481411 return -EAGAIN;
482412 }
483413
484414 if (!page_ref_freeze(page, expected_count)) {
485
- xa_unlock_irq(&mapping->i_pages);
486
- return -EAGAIN;
487
- }
488
-
489
- /*
490
- * In the async migration case of moving a page with buffers, lock the
491
- * buffers using trylock before the mapping is moved. If the mapping
492
- * was moved, we later failed to lock the buffers and could not move
493
- * the mapping back due to an elevated page count, we would have to
494
- * block waiting on other references to be dropped.
495
- */
496
- if (mode == MIGRATE_ASYNC && head &&
497
- !buffer_migrate_lock_buffers(head, mode)) {
498
- page_ref_unfreeze(page, expected_count);
499
- xa_unlock_irq(&mapping->i_pages);
415
+ xas_unlock_irq(&xas);
500416 return -EAGAIN;
501417 }
502418
....@@ -506,7 +422,7 @@
506422 */
507423 newpage->index = page->index;
508424 newpage->mapping = page->mapping;
509
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
425
+ page_ref_add(newpage, nr); /* add cache reference */
510426 if (PageSwapBacked(page)) {
511427 __SetPageSwapBacked(newpage);
512428 if (PageSwapCache(page)) {
....@@ -524,16 +440,13 @@
524440 SetPageDirty(newpage);
525441 }
526442
527
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
443
+ xas_store(&xas, newpage);
528444 if (PageTransHuge(page)) {
529445 int i;
530
- int index = page_index(page);
531446
532
- for (i = 1; i < HPAGE_PMD_NR; i++) {
533
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
534
- index + i);
535
- radix_tree_replace_slot(&mapping->i_pages, pslot,
536
- newpage + i);
447
+ for (i = 1; i < nr; i++) {
448
+ xas_next(&xas);
449
+ xas_store(&xas, newpage);
537450 }
538451 }
539452
....@@ -542,9 +455,9 @@
542455 * to one less reference.
543456 * We know this isn't the last reference.
544457 */
545
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
458
+ page_ref_unfreeze(page, expected_count - nr);
546459
547
- xa_unlock(&mapping->i_pages);
460
+ xas_unlock(&xas);
548461 /* Leave irq disabled to prevent preemption while updating stats */
549462
550463 /*
....@@ -558,17 +471,24 @@
558471 * are mapped to swap space.
559472 */
560473 if (newzone != oldzone) {
561
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
562
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
474
+ struct lruvec *old_lruvec, *new_lruvec;
475
+ struct mem_cgroup *memcg;
476
+
477
+ memcg = page_memcg(page);
478
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
479
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
480
+
481
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
482
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
563483 if (PageSwapBacked(page) && !PageSwapCache(page)) {
564
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
565
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
484
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
485
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
566486 }
567
- if (dirty && mapping_cap_account_dirty(mapping)) {
568
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
569
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
570
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
571
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
487
+ if (dirty && mapping_can_writeback(mapping)) {
488
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
489
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
490
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
491
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
572492 }
573493 }
574494 local_irq_enable();
....@@ -584,22 +504,18 @@
584504 int migrate_huge_page_move_mapping(struct address_space *mapping,
585505 struct page *newpage, struct page *page)
586506 {
507
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
587508 int expected_count;
588
- void **pslot;
589509
590
- xa_lock_irq(&mapping->i_pages);
591
-
592
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
593
-
510
+ xas_lock_irq(&xas);
594511 expected_count = 2 + page_has_private(page);
595
- if (page_count(page) != expected_count ||
596
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
597
- xa_unlock_irq(&mapping->i_pages);
512
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
513
+ xas_unlock_irq(&xas);
598514 return -EAGAIN;
599515 }
600516
601517 if (!page_ref_freeze(page, expected_count)) {
602
- xa_unlock_irq(&mapping->i_pages);
518
+ xas_unlock_irq(&xas);
603519 return -EAGAIN;
604520 }
605521
....@@ -608,11 +524,11 @@
608524
609525 get_page(newpage);
610526
611
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
527
+ xas_store(&xas, newpage);
612528
613529 page_ref_unfreeze(page, expected_count - 1);
614530
615
- xa_unlock_irq(&mapping->i_pages);
531
+ xas_unlock_irq(&xas);
616532
617533 return MIGRATEPAGE_SUCCESS;
618534 }
....@@ -656,7 +572,7 @@
656572 } else {
657573 /* thp page */
658574 BUG_ON(!PageTransHuge(src));
659
- nr_pages = hpage_nr_pages(src);
575
+ nr_pages = thp_nr_pages(src);
660576 }
661577
662578 for (i = 0; i < nr_pages; i++) {
....@@ -671,6 +587,8 @@
671587 void migrate_page_states(struct page *newpage, struct page *page)
672588 {
673589 int cpupid;
590
+
591
+ trace_android_vh_migrate_page_states(page, newpage);
674592
675593 if (PageError(page))
676594 SetPageError(newpage);
....@@ -689,6 +607,7 @@
689607 SetPageChecked(newpage);
690608 if (PageMappedToDisk(page))
691609 SetPageMappedToDisk(newpage);
610
+ trace_android_vh_look_around_migrate_page(page, newpage);
692611
693612 /* Move dirty on pages not done by migrate_page_move_mapping() */
694613 if (PageDirty(page))
....@@ -723,9 +642,18 @@
723642 if (PageWriteback(newpage))
724643 end_page_writeback(newpage);
725644
645
+ /*
646
+ * PG_readahead shares the same bit with PG_reclaim. The above
647
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
648
+ * bit after that.
649
+ */
650
+ if (PageReadahead(page))
651
+ SetPageReadahead(newpage);
652
+
726653 copy_page_owner(page, newpage);
727654
728
- mem_cgroup_migrate(page, newpage);
655
+ if (!PageHuge(page))
656
+ mem_cgroup_migrate(page, newpage);
729657 }
730658 EXPORT_SYMBOL(migrate_page_states);
731659
....@@ -758,7 +686,7 @@
758686
759687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
760688
761
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
689
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
762690
763691 if (rc != MIGRATEPAGE_SUCCESS)
764692 return rc;
....@@ -772,40 +700,96 @@
772700 EXPORT_SYMBOL(migrate_page);
773701
774702 #ifdef CONFIG_BLOCK
775
-/*
776
- * Migration function for pages with buffers. This function can only be used
777
- * if the underlying filesystem guarantees that no other references to "page"
778
- * exist.
779
- */
780
-int buffer_migrate_page(struct address_space *mapping,
781
- struct page *newpage, struct page *page, enum migrate_mode mode)
703
+/* Returns true if all buffers are successfully locked */
704
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
705
+ enum migrate_mode mode)
706
+{
707
+ struct buffer_head *bh = head;
708
+
709
+ /* Simple case, sync compaction */
710
+ if (mode != MIGRATE_ASYNC) {
711
+ do {
712
+ lock_buffer(bh);
713
+ bh = bh->b_this_page;
714
+
715
+ } while (bh != head);
716
+
717
+ return true;
718
+ }
719
+
720
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
721
+ do {
722
+ if (!trylock_buffer(bh)) {
723
+ /*
724
+ * We failed to lock the buffer and cannot stall in
725
+ * async migration. Release the taken locks
726
+ */
727
+ struct buffer_head *failed_bh = bh;
728
+ bh = head;
729
+ while (bh != failed_bh) {
730
+ unlock_buffer(bh);
731
+ bh = bh->b_this_page;
732
+ }
733
+ return false;
734
+ }
735
+
736
+ bh = bh->b_this_page;
737
+ } while (bh != head);
738
+ return true;
739
+}
740
+
741
+static int __buffer_migrate_page(struct address_space *mapping,
742
+ struct page *newpage, struct page *page, enum migrate_mode mode,
743
+ bool check_refs)
782744 {
783745 struct buffer_head *bh, *head;
784746 int rc;
747
+ int expected_count;
785748
786749 if (!page_has_buffers(page))
787750 return migrate_page(mapping, newpage, page, mode);
788751
752
+ /* Check whether page does not have extra refs before we do more work */
753
+ expected_count = expected_page_refs(mapping, page);
754
+ if (page_count(page) != expected_count)
755
+ return -EAGAIN;
756
+
789757 head = page_buffers(page);
758
+ if (!buffer_migrate_lock_buffers(head, mode))
759
+ return -EAGAIN;
790760
791
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
761
+ if (check_refs) {
762
+ bool busy;
763
+ bool invalidated = false;
792764
765
+recheck_buffers:
766
+ busy = false;
767
+ spin_lock(&mapping->private_lock);
768
+ bh = head;
769
+ do {
770
+ if (atomic_read(&bh->b_count)) {
771
+ busy = true;
772
+ break;
773
+ }
774
+ bh = bh->b_this_page;
775
+ } while (bh != head);
776
+ if (busy) {
777
+ if (invalidated) {
778
+ rc = -EAGAIN;
779
+ goto unlock_buffers;
780
+ }
781
+ spin_unlock(&mapping->private_lock);
782
+ invalidate_bh_lrus();
783
+ invalidated = true;
784
+ goto recheck_buffers;
785
+ }
786
+ }
787
+
788
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
793789 if (rc != MIGRATEPAGE_SUCCESS)
794
- return rc;
790
+ goto unlock_buffers;
795791
796
- /*
797
- * In the async case, migrate_page_move_mapping locked the buffers
798
- * with an IRQ-safe spinlock held. In the sync case, the buffers
799
- * need to be locked now
800
- */
801
- if (mode != MIGRATE_ASYNC)
802
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
803
-
804
- ClearPagePrivate(page);
805
- set_page_private(newpage, page_private(page));
806
- set_page_private(page, 0);
807
- put_page(page);
808
- get_page(newpage);
792
+ attach_page_private(newpage, detach_page_private(page));
809793
810794 bh = head;
811795 do {
....@@ -814,24 +798,48 @@
814798
815799 } while (bh != head);
816800
817
- SetPagePrivate(newpage);
818
-
819801 if (mode != MIGRATE_SYNC_NO_COPY)
820802 migrate_page_copy(newpage, page);
821803 else
822804 migrate_page_states(newpage, page);
823805
806
+ rc = MIGRATEPAGE_SUCCESS;
807
+unlock_buffers:
808
+ if (check_refs)
809
+ spin_unlock(&mapping->private_lock);
824810 bh = head;
825811 do {
826812 unlock_buffer(bh);
827
- put_bh(bh);
828813 bh = bh->b_this_page;
829814
830815 } while (bh != head);
831816
832
- return MIGRATEPAGE_SUCCESS;
817
+ return rc;
818
+}
819
+
820
+/*
821
+ * Migration function for pages with buffers. This function can only be used
822
+ * if the underlying filesystem guarantees that no other references to "page"
823
+ * exist. For example attached buffer heads are accessed only under page lock.
824
+ */
825
+int buffer_migrate_page(struct address_space *mapping,
826
+ struct page *newpage, struct page *page, enum migrate_mode mode)
827
+{
828
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
833829 }
834830 EXPORT_SYMBOL(buffer_migrate_page);
831
+
832
+/*
833
+ * Same as above except that this variant is more careful and checks that there
834
+ * are also no buffer head references. This function is the right one for
835
+ * mappings where buffer heads are directly looked up and referenced (such as
836
+ * block device mappings).
837
+ */
838
+int buffer_migrate_page_norefs(struct address_space *mapping,
839
+ struct page *newpage, struct page *page, enum migrate_mode mode)
840
+{
841
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
842
+}
835843 #endif
836844
837845 /*
....@@ -899,7 +907,7 @@
899907 */
900908 if (page_has_private(page) &&
901909 !try_to_release_page(page, GFP_KERNEL))
902
- return -EAGAIN;
910
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
903911
904912 return migrate_page(mapping, newpage, page, mode);
905913 }
....@@ -951,7 +959,7 @@
951959 VM_BUG_ON_PAGE(!PageIsolated(page), page);
952960 if (!PageMovable(page)) {
953961 rc = MIGRATEPAGE_SUCCESS;
954
- __ClearPageIsolated(page);
962
+ ClearPageIsolated(page);
955963 goto out;
956964 }
957965
....@@ -973,23 +981,23 @@
973981 * We clear PG_movable under page_lock so any compactor
974982 * cannot try to migrate this page.
975983 */
976
- __ClearPageIsolated(page);
984
+ ClearPageIsolated(page);
977985 }
978986
979987 /*
980
- * Anonymous and movable page->mapping will be cleard by
988
+ * Anonymous and movable page->mapping will be cleared by
981989 * free_pages_prepare so don't reset it here for keeping
982990 * the type to work PageAnon, for example.
983991 */
984992 if (!PageMappingFlags(page))
985993 page->mapping = NULL;
986994
987
- if (unlikely(is_zone_device_page(newpage))) {
988
- if (is_device_public_page(newpage))
989
- flush_dcache_page(newpage);
990
- } else
991
- flush_dcache_page(newpage);
995
+ if (likely(!is_zone_device_page(newpage))) {
996
+ int i, nr = compound_nr(newpage);
992997
998
+ for (i = 0; i < nr; i++)
999
+ flush_dcache_page(newpage + i);
1000
+ }
9931001 }
9941002 out:
9951003 return rc;
....@@ -1013,7 +1021,7 @@
10131021 * to the LRU. Later, when the IO completes the pages are
10141022 * marked uptodate and unlocked. However, the queueing
10151023 * could be merging multiple pages for one bio (e.g.
1016
- * mpage_readpages). If an allocation happens for the
1024
+ * mpage_readahead). If an allocation happens for the
10171025 * second or third page, the process can end up locking
10181026 * the same page twice and deadlocking. Rather than
10191027 * trying to be clever about what pages can be locked,
....@@ -1101,8 +1109,7 @@
11011109 /* Establish migration ptes */
11021110 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
11031111 page);
1104
- try_to_unmap(page,
1105
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1112
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
11061113 page_was_mapped = 1;
11071114 }
11081115
....@@ -1141,34 +1148,19 @@
11411148 }
11421149
11431150 /*
1144
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
1145
- * around it.
1146
- */
1147
-#if defined(CONFIG_ARM) && \
1148
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
1149
-#define ICE_noinline noinline
1150
-#else
1151
-#define ICE_noinline
1152
-#endif
1153
-
1154
-/*
11551151 * Obtain the lock on page, remove all ptes and migrate the page
11561152 * to the newly allocated page in newpage.
11571153 */
1158
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1154
+static int unmap_and_move(new_page_t get_new_page,
11591155 free_page_t put_new_page,
11601156 unsigned long private, struct page *page,
11611157 int force, enum migrate_mode mode,
11621158 enum migrate_reason reason)
11631159 {
11641160 int rc = MIGRATEPAGE_SUCCESS;
1165
- struct page *newpage;
1161
+ struct page *newpage = NULL;
11661162
11671163 if (!thp_migration_supported() && PageTransHuge(page))
1168
- return -ENOMEM;
1169
-
1170
- newpage = get_new_page(page, private);
1171
- if (!newpage)
11721164 return -ENOMEM;
11731165
11741166 if (page_count(page) == 1) {
....@@ -1178,15 +1170,15 @@
11781170 if (unlikely(__PageMovable(page))) {
11791171 lock_page(page);
11801172 if (!PageMovable(page))
1181
- __ClearPageIsolated(page);
1173
+ ClearPageIsolated(page);
11821174 unlock_page(page);
11831175 }
1184
- if (put_new_page)
1185
- put_new_page(newpage, private);
1186
- else
1187
- put_page(newpage);
11881176 goto out;
11891177 }
1178
+
1179
+ newpage = get_new_page(page, private);
1180
+ if (!newpage)
1181
+ return -ENOMEM;
11901182
11911183 rc = __unmap_and_move(page, newpage, force, mode);
11921184 if (rc == MIGRATEPAGE_SUCCESS)
....@@ -1197,8 +1189,7 @@
11971189 /*
11981190 * A page that has been migrated has all references
11991191 * removed and will be freed. A page that has not been
1200
- * migrated will have kepts its references and be
1201
- * restored.
1192
+ * migrated will have kept its references and be restored.
12021193 */
12031194 list_del(&page->lru);
12041195
....@@ -1209,7 +1200,7 @@
12091200 */
12101201 if (likely(!__PageMovable(page)))
12111202 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1212
- page_is_file_cache(page), -hpage_nr_pages(page));
1203
+ page_is_file_lru(page), -thp_nr_pages(page));
12131204 }
12141205
12151206 /*
....@@ -1218,16 +1209,11 @@
12181209 * we want to retry.
12191210 */
12201211 if (rc == MIGRATEPAGE_SUCCESS) {
1221
- put_page(page);
1222
- if (reason == MR_MEMORY_FAILURE) {
1212
+ if (reason != MR_MEMORY_FAILURE)
12231213 /*
1224
- * Set PG_HWPoison on just freed page
1225
- * intentionally. Although it's rather weird,
1226
- * it's how HWPoison flag works at the moment.
1214
+ * We release the page in page_handle_poison.
12271215 */
1228
- if (set_hwpoison_free_buddy_page(page))
1229
- num_poisoned_pages_inc();
1230
- }
1216
+ put_page(page);
12311217 } else {
12321218 if (rc != -EAGAIN) {
12331219 if (likely(!__PageMovable(page))) {
....@@ -1239,7 +1225,7 @@
12391225 if (PageMovable(page))
12401226 putback_movable_page(page);
12411227 else
1242
- __ClearPageIsolated(page);
1228
+ ClearPageIsolated(page);
12431229 unlock_page(page);
12441230 put_page(page);
12451231 }
....@@ -1280,9 +1266,10 @@
12801266 int page_was_mapped = 0;
12811267 struct page *new_hpage;
12821268 struct anon_vma *anon_vma = NULL;
1269
+ struct address_space *mapping = NULL;
12831270
12841271 /*
1285
- * Movability of hugepages depends on architectures and hugepage size.
1272
+ * Migratability of hugepages depends on architectures and their size.
12861273 * This check is necessary because some callers of hugepage migration
12871274 * like soft offline and memory hotremove don't walk through page
12881275 * tables or check whether the hugepage is pmd-based or not before
....@@ -1327,9 +1314,29 @@
13271314 goto put_anon;
13281315
13291316 if (page_mapped(hpage)) {
1330
- try_to_unmap(hpage,
1331
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1317
+ bool mapping_locked = false;
1318
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
1319
+
1320
+ if (!PageAnon(hpage)) {
1321
+ /*
1322
+ * In shared mappings, try_to_unmap could potentially
1323
+ * call huge_pmd_unshare. Because of this, take
1324
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
1325
+ * to let lower levels know we have taken the lock.
1326
+ */
1327
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1328
+ if (unlikely(!mapping))
1329
+ goto unlock_put_anon;
1330
+
1331
+ mapping_locked = true;
1332
+ ttu |= TTU_RMAP_LOCKED;
1333
+ }
1334
+
1335
+ try_to_unmap(hpage, ttu);
13321336 page_was_mapped = 1;
1337
+
1338
+ if (mapping_locked)
1339
+ i_mmap_unlock_write(mapping);
13331340 }
13341341
13351342 if (!page_mapped(hpage))
....@@ -1339,6 +1346,7 @@
13391346 remove_migration_ptes(hpage,
13401347 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
13411348
1349
+unlock_put_anon:
13421350 unlock_page(new_hpage);
13431351
13441352 put_anon:
....@@ -1395,22 +1403,37 @@
13951403 enum migrate_mode mode, int reason)
13961404 {
13971405 int retry = 1;
1406
+ int thp_retry = 1;
13981407 int nr_failed = 0;
13991408 int nr_succeeded = 0;
1409
+ int nr_thp_succeeded = 0;
1410
+ int nr_thp_failed = 0;
1411
+ int nr_thp_split = 0;
14001412 int pass = 0;
1413
+ bool is_thp = false;
14011414 struct page *page;
14021415 struct page *page2;
14031416 int swapwrite = current->flags & PF_SWAPWRITE;
1404
- int rc;
1417
+ int rc, nr_subpages;
1418
+
1419
+ trace_mm_migrate_pages_start(mode, reason);
14051420
14061421 if (!swapwrite)
14071422 current->flags |= PF_SWAPWRITE;
14081423
1409
- for(pass = 0; pass < 10 && retry; pass++) {
1424
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
14101425 retry = 0;
1426
+ thp_retry = 0;
14111427
14121428 list_for_each_entry_safe(page, page2, from, lru) {
14131429 retry:
1430
+ /*
1431
+ * THP statistics is based on the source huge page.
1432
+ * Capture required information that might get lost
1433
+ * during migration.
1434
+ */
1435
+ is_thp = PageTransHuge(page) && !PageHuge(page);
1436
+ nr_subpages = thp_nr_pages(page);
14141437 cond_resched();
14151438
14161439 if (PageHuge(page))
....@@ -1435,21 +1458,35 @@
14351458 * we encounter them after the rest of the list
14361459 * is processed.
14371460 */
1438
- if (PageTransHuge(page) && !PageHuge(page)) {
1461
+ if (is_thp) {
14391462 lock_page(page);
14401463 rc = split_huge_page_to_list(page, from);
14411464 unlock_page(page);
14421465 if (!rc) {
14431466 list_safe_reset_next(page, page2, lru);
1467
+ nr_thp_split++;
14441468 goto retry;
14451469 }
1470
+
1471
+ nr_thp_failed++;
1472
+ nr_failed += nr_subpages;
1473
+ goto out;
14461474 }
14471475 nr_failed++;
14481476 goto out;
14491477 case -EAGAIN:
1478
+ if (is_thp) {
1479
+ thp_retry++;
1480
+ break;
1481
+ }
14501482 retry++;
14511483 break;
14521484 case MIGRATEPAGE_SUCCESS:
1485
+ if (is_thp) {
1486
+ nr_thp_succeeded++;
1487
+ nr_succeeded += nr_subpages;
1488
+ break;
1489
+ }
14531490 nr_succeeded++;
14541491 break;
14551492 default:
....@@ -1459,24 +1496,76 @@
14591496 * removed from migration page list and not
14601497 * retried in the next outer loop.
14611498 */
1499
+ if (is_thp) {
1500
+ nr_thp_failed++;
1501
+ nr_failed += nr_subpages;
1502
+ break;
1503
+ }
14621504 nr_failed++;
14631505 break;
14641506 }
14651507 }
14661508 }
1467
- nr_failed += retry;
1509
+ nr_failed += retry + thp_retry;
1510
+ nr_thp_failed += thp_retry;
14681511 rc = nr_failed;
14691512 out:
1470
- if (nr_succeeded)
1471
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1472
- if (nr_failed)
1473
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
1474
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1513
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1514
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
1515
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1516
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1517
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1518
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1519
+ nr_thp_failed, nr_thp_split, mode, reason);
14751520
14761521 if (!swapwrite)
14771522 current->flags &= ~PF_SWAPWRITE;
14781523
14791524 return rc;
1525
+}
1526
+EXPORT_SYMBOL_GPL(migrate_pages);
1527
+
1528
+struct page *alloc_migration_target(struct page *page, unsigned long private)
1529
+{
1530
+ struct migration_target_control *mtc;
1531
+ gfp_t gfp_mask;
1532
+ unsigned int order = 0;
1533
+ struct page *new_page = NULL;
1534
+ int nid;
1535
+ int zidx;
1536
+
1537
+ mtc = (struct migration_target_control *)private;
1538
+ gfp_mask = mtc->gfp_mask;
1539
+ nid = mtc->nid;
1540
+ if (nid == NUMA_NO_NODE)
1541
+ nid = page_to_nid(page);
1542
+
1543
+ if (PageHuge(page)) {
1544
+ struct hstate *h = page_hstate(compound_head(page));
1545
+
1546
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1547
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1548
+ }
1549
+
1550
+ if (PageTransHuge(page)) {
1551
+ /*
1552
+ * clear __GFP_RECLAIM to make the migration callback
1553
+ * consistent with regular THP allocations.
1554
+ */
1555
+ gfp_mask &= ~__GFP_RECLAIM;
1556
+ gfp_mask |= GFP_TRANSHUGE;
1557
+ order = HPAGE_PMD_ORDER;
1558
+ }
1559
+ zidx = zone_idx(page_zone(page));
1560
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1561
+ gfp_mask |= __GFP_HIGHMEM;
1562
+
1563
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
1564
+
1565
+ if (new_page && PageTransHuge(new_page))
1566
+ prep_transhuge_page(new_page);
1567
+
1568
+ return new_page;
14801569 }
14811570
14821571 #ifdef CONFIG_NUMA
....@@ -1496,12 +1585,13 @@
14961585 struct list_head *pagelist, int node)
14971586 {
14981587 int err;
1588
+ struct migration_target_control mtc = {
1589
+ .nid = node,
1590
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1591
+ };
14991592
1500
- if (list_empty(pagelist))
1501
- return 0;
1502
-
1503
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
1504
- MIGRATE_SYNC, MR_SYSCALL);
1593
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
1594
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
15051595 if (err)
15061596 putback_movable_pages(pagelist);
15071597 return err;
....@@ -1524,7 +1614,7 @@
15241614 unsigned int follflags;
15251615 int err;
15261616
1527
- down_read(&mm->mmap_sem);
1617
+ mmap_read_lock(mm);
15281618 err = -EFAULT;
15291619 vma = find_vma(mm, addr);
15301620 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
....@@ -1566,8 +1656,8 @@
15661656 err = 1;
15671657 list_add_tail(&head->lru, pagelist);
15681658 mod_node_page_state(page_pgdat(head),
1569
- NR_ISOLATED_ANON + page_is_file_cache(head),
1570
- hpage_nr_pages(head));
1659
+ NR_ISOLATED_ANON + page_is_file_lru(head),
1660
+ thp_nr_pages(head));
15711661 }
15721662 out_putpage:
15731663 /*
....@@ -1575,10 +1665,36 @@
15751665 * isolate_lru_page() or drop the page ref if it was
15761666 * not isolated.
15771667 */
1578
- put_page(page);
1668
+ put_user_page(page);
15791669 out:
1580
- up_read(&mm->mmap_sem);
1670
+ mmap_read_unlock(mm);
15811671 return err;
1672
+}
1673
+
1674
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
1675
+ struct list_head *pagelist, int __user *status,
1676
+ int start, int i, unsigned long nr_pages)
1677
+{
1678
+ int err;
1679
+
1680
+ if (list_empty(pagelist))
1681
+ return 0;
1682
+
1683
+ err = do_move_pages_to_node(mm, pagelist, node);
1684
+ if (err) {
1685
+ /*
1686
+ * Positive err means the number of failed
1687
+ * pages to migrate. Since we are going to
1688
+ * abort and return the number of non-migrated
1689
+ * pages, so need to incude the rest of the
1690
+ * nr_pages that have not been attempted as
1691
+ * well.
1692
+ */
1693
+ if (err > 0)
1694
+ err += nr_pages - i - 1;
1695
+ return err;
1696
+ }
1697
+ return store_status(status, start, node, i - start);
15821698 }
15831699
15841700 /*
....@@ -1596,7 +1712,7 @@
15961712 int start, i;
15971713 int err = 0, err1;
15981714
1599
- migrate_prep();
1715
+ lru_cache_disable();
16001716
16011717 for (i = start = 0; i < nr_pages; i++) {
16021718 const void __user *p;
....@@ -1624,21 +1740,8 @@
16241740 current_node = node;
16251741 start = i;
16261742 } else if (node != current_node) {
1627
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1628
- if (err) {
1629
- /*
1630
- * Positive err means the number of failed
1631
- * pages to migrate. Since we are going to
1632
- * abort and return the number of non-migrated
1633
- * pages, so need to incude the rest of the
1634
- * nr_pages that have not been attempted as
1635
- * well.
1636
- */
1637
- if (err > 0)
1638
- err += nr_pages - i - 1;
1639
- goto out;
1640
- }
1641
- err = store_status(status, start, current_node, i - start);
1743
+ err = move_pages_and_store_status(mm, current_node,
1744
+ &pagelist, status, start, i, nr_pages);
16421745 if (err)
16431746 goto out;
16441747 start = i;
....@@ -1652,52 +1755,33 @@
16521755 err = add_page_for_migration(mm, addr, current_node,
16531756 &pagelist, flags & MPOL_MF_MOVE_ALL);
16541757
1655
- if (!err) {
1656
- /* The page is already on the target node */
1657
- err = store_status(status, i, current_node, 1);
1658
- if (err)
1659
- goto out_flush;
1660
- continue;
1661
- } else if (err > 0) {
1758
+ if (err > 0) {
16621759 /* The page is successfully queued for migration */
16631760 continue;
16641761 }
16651762
1666
- err = store_status(status, i, err, 1);
1763
+ /*
1764
+ * If the page is already on the target node (!err), store the
1765
+ * node, otherwise, store the err.
1766
+ */
1767
+ err = store_status(status, i, err ? : current_node, 1);
16671768 if (err)
16681769 goto out_flush;
16691770
1670
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1671
- if (err) {
1672
- if (err > 0)
1673
- err += nr_pages - i - 1;
1771
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
1772
+ status, start, i, nr_pages);
1773
+ if (err)
16741774 goto out;
1675
- }
1676
- if (i > start) {
1677
- err = store_status(status, start, current_node, i - start);
1678
- if (err)
1679
- goto out;
1680
- }
16811775 current_node = NUMA_NO_NODE;
16821776 }
16831777 out_flush:
1684
- if (list_empty(&pagelist))
1685
- return err;
1686
-
16871778 /* Make sure we do not overwrite the existing error */
1688
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
1689
- /*
1690
- * Don't have to report non-attempted pages here since:
1691
- * - If the above loop is done gracefully all pages have been
1692
- * attempted.
1693
- * - If the above loop is aborted it means a fatal error
1694
- * happened, should return ret.
1695
- */
1696
- if (!err1)
1697
- err1 = store_status(status, start, current_node, i - start);
1779
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1780
+ status, start, i, nr_pages);
16981781 if (err >= 0)
16991782 err = err1;
17001783 out:
1784
+ lru_cache_enable();
17011785 return err;
17021786 }
17031787
....@@ -1709,7 +1793,7 @@
17091793 {
17101794 unsigned long i;
17111795
1712
- down_read(&mm->mmap_sem);
1796
+ mmap_read_lock(mm);
17131797
17141798 for (i = 0; i < nr_pages; i++) {
17151799 unsigned long addr = (unsigned long)(*pages);
....@@ -1736,7 +1820,7 @@
17361820 status++;
17371821 }
17381822
1739
- up_read(&mm->mmap_sem);
1823
+ mmap_read_unlock(mm);
17401824 }
17411825
17421826 /*
....@@ -1773,6 +1857,53 @@
17731857 return nr_pages ? -EFAULT : 0;
17741858 }
17751859
1860
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1861
+{
1862
+ struct task_struct *task;
1863
+ struct mm_struct *mm;
1864
+
1865
+ /*
1866
+ * There is no need to check if current process has the right to modify
1867
+ * the specified process when they are same.
1868
+ */
1869
+ if (!pid) {
1870
+ mmget(current->mm);
1871
+ *mem_nodes = cpuset_mems_allowed(current);
1872
+ return current->mm;
1873
+ }
1874
+
1875
+ /* Find the mm_struct */
1876
+ rcu_read_lock();
1877
+ task = find_task_by_vpid(pid);
1878
+ if (!task) {
1879
+ rcu_read_unlock();
1880
+ return ERR_PTR(-ESRCH);
1881
+ }
1882
+ get_task_struct(task);
1883
+
1884
+ /*
1885
+ * Check if this process has the right to modify the specified
1886
+ * process. Use the regular "ptrace_may_access()" checks.
1887
+ */
1888
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1889
+ rcu_read_unlock();
1890
+ mm = ERR_PTR(-EPERM);
1891
+ goto out;
1892
+ }
1893
+ rcu_read_unlock();
1894
+
1895
+ mm = ERR_PTR(security_task_movememory(task));
1896
+ if (IS_ERR(mm))
1897
+ goto out;
1898
+ *mem_nodes = cpuset_mems_allowed(task);
1899
+ mm = get_task_mm(task);
1900
+out:
1901
+ put_task_struct(task);
1902
+ if (!mm)
1903
+ mm = ERR_PTR(-EINVAL);
1904
+ return mm;
1905
+}
1906
+
17761907 /*
17771908 * Move a list of pages in the address space of the currently executing
17781909 * process.
....@@ -1782,7 +1913,6 @@
17821913 const int __user *nodes,
17831914 int __user *status, int flags)
17841915 {
1785
- struct task_struct *task;
17861916 struct mm_struct *mm;
17871917 int err;
17881918 nodemask_t task_nodes;
....@@ -1794,36 +1924,9 @@
17941924 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
17951925 return -EPERM;
17961926
1797
- /* Find the mm_struct */
1798
- rcu_read_lock();
1799
- task = pid ? find_task_by_vpid(pid) : current;
1800
- if (!task) {
1801
- rcu_read_unlock();
1802
- return -ESRCH;
1803
- }
1804
- get_task_struct(task);
1805
-
1806
- /*
1807
- * Check if this process has the right to modify the specified
1808
- * process. Use the regular "ptrace_may_access()" checks.
1809
- */
1810
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1811
- rcu_read_unlock();
1812
- err = -EPERM;
1813
- goto out;
1814
- }
1815
- rcu_read_unlock();
1816
-
1817
- err = security_task_movememory(task);
1818
- if (err)
1819
- goto out;
1820
-
1821
- task_nodes = cpuset_mems_allowed(task);
1822
- mm = get_task_mm(task);
1823
- put_task_struct(task);
1824
-
1825
- if (!mm)
1826
- return -EINVAL;
1927
+ mm = find_mm_struct(pid, &task_nodes);
1928
+ if (IS_ERR(mm))
1929
+ return PTR_ERR(mm);
18271930
18281931 if (nodes)
18291932 err = do_pages_move(mm, task_nodes, nr_pages, pages,
....@@ -1832,10 +1935,6 @@
18321935 err = do_pages_stat(mm, nr_pages, pages, status);
18331936
18341937 mmput(mm);
1835
- return err;
1836
-
1837
-out:
1838
- put_task_struct(task);
18391938 return err;
18401939 }
18411940
....@@ -1889,7 +1988,7 @@
18891988 if (!zone_watermark_ok(zone, 0,
18901989 high_wmark_pages(zone) +
18911990 nr_migrate_pages,
1892
- 0, 0))
1991
+ ZONE_MOVABLE, 0))
18931992 continue;
18941993 return true;
18951994 }
....@@ -1918,7 +2017,7 @@
19182017 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
19192018
19202019 /* Avoid migrating to a node that is nearly full */
1921
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
2020
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
19222021 return 0;
19232022
19242023 if (isolate_lru_page(page))
....@@ -1936,9 +2035,9 @@
19362035 return 0;
19372036 }
19382037
1939
- page_lru = page_is_file_cache(page);
2038
+ page_lru = page_is_file_lru(page);
19402039 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1941
- hpage_nr_pages(page));
2040
+ thp_nr_pages(page));
19422041
19432042 /*
19442043 * Isolating the page has taken another reference, so the
....@@ -1960,7 +2059,7 @@
19602059 * node. Caller is expected to have an elevated reference count on
19612060 * the page that will be dropped by this function before returning.
19622061 */
1963
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2062
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
19642063 int node)
19652064 {
19662065 pg_data_t *pgdat = NODE_DATA(node);
....@@ -1972,15 +2071,15 @@
19722071 * Don't migrate file pages that are mapped in multiple processes
19732072 * with execute permissions as they are probably shared libraries.
19742073 */
1975
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1976
- (vma->vm_flags & VM_EXEC))
2074
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2075
+ (vmf->vma_flags & VM_EXEC))
19772076 goto out;
19782077
19792078 /*
19802079 * Also do not migrate dirty pages as not all filesystems can move
19812080 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
19822081 */
1983
- if (page_is_file_cache(page) && PageDirty(page))
2082
+ if (page_is_file_lru(page) && PageDirty(page))
19842083 goto out;
19852084
19862085 isolated = numamigrate_isolate_page(pgdat, page);
....@@ -1995,7 +2094,7 @@
19952094 if (!list_empty(&migratepages)) {
19962095 list_del(&page->lru);
19972096 dec_node_page_state(page, NR_ISOLATED_ANON +
1998
- page_is_file_cache(page));
2097
+ page_is_file_lru(page));
19992098 putback_lru_page(page);
20002099 }
20012100 isolated = 0;
....@@ -2025,9 +2124,8 @@
20252124 pg_data_t *pgdat = NODE_DATA(node);
20262125 int isolated = 0;
20272126 struct page *new_page = NULL;
2028
- int page_lru = page_is_file_cache(page);
2029
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
2030
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2127
+ int page_lru = page_is_file_lru(page);
2128
+ unsigned long start = address & HPAGE_PMD_MASK;
20312129
20322130 new_page = alloc_pages_node(node,
20332131 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
....@@ -2050,15 +2148,15 @@
20502148 /* anon mapping, we can simply copy page->mapping to the new page: */
20512149 new_page->mapping = page->mapping;
20522150 new_page->index = page->index;
2151
+ /* flush the cache before copying using the kernel virtual address */
2152
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
20532153 migrate_page_copy(new_page, page);
20542154 WARN_ON(PageLRU(new_page));
20552155
20562156 /* Recheck the target PMD */
2057
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
20582157 ptl = pmd_lock(mm, pmd);
20592158 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
20602159 spin_unlock(ptl);
2061
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
20622160
20632161 /* Reverse changes made by migrate_page_copy() */
20642162 if (TestClearPageActive(new_page))
....@@ -2089,8 +2187,7 @@
20892187 * new page and page_add_new_anon_rmap guarantee the copy is
20902188 * visible before the pagetable update.
20912189 */
2092
- flush_cache_range(vma, mmun_start, mmun_end);
2093
- page_add_anon_rmap(new_page, vma, mmun_start, true);
2190
+ page_add_anon_rmap(new_page, vma, start, true);
20942191 /*
20952192 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
20962193 * has already been flushed globally. So no TLB can be currently
....@@ -2098,11 +2195,11 @@
20982195 * pmd before doing set_pmd_at(), nor to flush the TLB after
20992196 * set_pmd_at(). Clearing the pmd here would introduce a race
21002197 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
2101
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
2198
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
21022199 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
21032200 * pmd.
21042201 */
2105
- set_pmd_at(mm, mmun_start, pmd, entry);
2202
+ set_pmd_at(mm, start, pmd, entry);
21062203 update_mmu_cache_pmd(vma, address, &entry);
21072204
21082205 page_ref_unfreeze(page, 2);
....@@ -2111,11 +2208,6 @@
21112208 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
21122209
21132210 spin_unlock(ptl);
2114
- /*
2115
- * No need to double call mmu_notifier->invalidate_range() callback as
2116
- * the above pmdp_huge_clear_flush_notify() did already call it.
2117
- */
2118
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
21192211
21202212 /* Take an "isolate" reference and put new page on the LRU. */
21212213 get_page(new_page);
....@@ -2139,7 +2231,7 @@
21392231 ptl = pmd_lock(mm, pmd);
21402232 if (pmd_same(*pmd, entry)) {
21412233 entry = pmd_modify(entry, vma->vm_page_prot);
2142
- set_pmd_at(mm, mmun_start, pmd, entry);
2234
+ set_pmd_at(mm, start, pmd, entry);
21432235 update_mmu_cache_pmd(vma, address, &entry);
21442236 }
21452237 spin_unlock(ptl);
....@@ -2153,25 +2245,26 @@
21532245
21542246 #endif /* CONFIG_NUMA */
21552247
2156
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
2157
-struct migrate_vma {
2158
- struct vm_area_struct *vma;
2159
- unsigned long *dst;
2160
- unsigned long *src;
2161
- unsigned long cpages;
2162
- unsigned long npages;
2163
- unsigned long start;
2164
- unsigned long end;
2165
-};
2166
-
2248
+#ifdef CONFIG_DEVICE_PRIVATE
21672249 static int migrate_vma_collect_hole(unsigned long start,
21682250 unsigned long end,
2251
+ __always_unused int depth,
21692252 struct mm_walk *walk)
21702253 {
21712254 struct migrate_vma *migrate = walk->private;
21722255 unsigned long addr;
21732256
2174
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2257
+ /* Only allow populating anonymous memory. */
2258
+ if (!vma_is_anonymous(walk->vma)) {
2259
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
2260
+ migrate->src[migrate->npages] = 0;
2261
+ migrate->dst[migrate->npages] = 0;
2262
+ migrate->npages++;
2263
+ }
2264
+ return 0;
2265
+ }
2266
+
2267
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21752268 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
21762269 migrate->dst[migrate->npages] = 0;
21772270 migrate->npages++;
....@@ -2188,7 +2281,7 @@
21882281 struct migrate_vma *migrate = walk->private;
21892282 unsigned long addr;
21902283
2191
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2284
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21922285 migrate->dst[migrate->npages] = 0;
21932286 migrate->src[migrate->npages++] = 0;
21942287 }
....@@ -2210,7 +2303,7 @@
22102303
22112304 again:
22122305 if (pmd_none(*pmdp))
2213
- return migrate_vma_collect_hole(start, end, walk);
2306
+ return migrate_vma_collect_hole(start, end, -1, walk);
22142307
22152308 if (pmd_trans_huge(*pmdp)) {
22162309 struct page *page;
....@@ -2243,7 +2336,7 @@
22432336 return migrate_vma_collect_skip(start, end,
22442337 walk);
22452338 if (pmd_none(*pmdp))
2246
- return migrate_vma_collect_hole(start, end,
2339
+ return migrate_vma_collect_hole(start, end, -1,
22472340 walk);
22482341 }
22492342 }
....@@ -2255,24 +2348,22 @@
22552348 arch_enter_lazy_mmu_mode();
22562349
22572350 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2258
- unsigned long mpfn, pfn;
2351
+ unsigned long mpfn = 0, pfn;
22592352 struct page *page;
22602353 swp_entry_t entry;
22612354 pte_t pte;
22622355
22632356 pte = *ptep;
2264
- pfn = pte_pfn(pte);
22652357
22662358 if (pte_none(pte)) {
2267
- mpfn = MIGRATE_PFN_MIGRATE;
2268
- migrate->cpages++;
2269
- pfn = 0;
2359
+ if (vma_is_anonymous(vma)) {
2360
+ mpfn = MIGRATE_PFN_MIGRATE;
2361
+ migrate->cpages++;
2362
+ }
22702363 goto next;
22712364 }
22722365
22732366 if (!pte_present(pte)) {
2274
- mpfn = pfn = 0;
2275
-
22762367 /*
22772368 * Only care about unaddressable device page special
22782369 * page table entry. Other special swap entries are not
....@@ -2283,28 +2374,34 @@
22832374 goto next;
22842375
22852376 page = device_private_entry_to_page(entry);
2286
- mpfn = migrate_pfn(page_to_pfn(page))|
2287
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2377
+ if (!(migrate->flags &
2378
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2379
+ page->pgmap->owner != migrate->pgmap_owner)
2380
+ goto next;
2381
+
2382
+ mpfn = migrate_pfn(page_to_pfn(page)) |
2383
+ MIGRATE_PFN_MIGRATE;
22882384 if (is_write_device_private_entry(entry))
22892385 mpfn |= MIGRATE_PFN_WRITE;
22902386 } else {
2387
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
2388
+ goto next;
2389
+ pfn = pte_pfn(pte);
22912390 if (is_zero_pfn(pfn)) {
22922391 mpfn = MIGRATE_PFN_MIGRATE;
22932392 migrate->cpages++;
2294
- pfn = 0;
22952393 goto next;
22962394 }
2297
- page = _vm_normal_page(migrate->vma, addr, pte, true);
2395
+ page = vm_normal_page(migrate->vma, addr, pte);
22982396 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
22992397 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
23002398 }
23012399
23022400 /* FIXME support THP */
23032401 if (!page || !page->mapping || PageTransCompound(page)) {
2304
- mpfn = pfn = 0;
2402
+ mpfn = 0;
23052403 goto next;
23062404 }
2307
- pfn = page_to_pfn(page);
23082405
23092406 /*
23102407 * By getting a reference on the page we pin it and that blocks
....@@ -2333,8 +2430,17 @@
23332430 entry = make_migration_entry(page, mpfn &
23342431 MIGRATE_PFN_WRITE);
23352432 swp_pte = swp_entry_to_pte(entry);
2336
- if (pte_soft_dirty(pte))
2337
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
2433
+ if (pte_present(pte)) {
2434
+ if (pte_soft_dirty(pte))
2435
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2436
+ if (pte_uffd_wp(pte))
2437
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2438
+ } else {
2439
+ if (pte_swp_soft_dirty(pte))
2440
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2441
+ if (pte_swp_uffd_wp(pte))
2442
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2443
+ }
23382444 set_pte_at(mm, addr, ptep, swp_pte);
23392445
23402446 /*
....@@ -2353,15 +2459,21 @@
23532459 migrate->dst[migrate->npages] = 0;
23542460 migrate->src[migrate->npages++] = mpfn;
23552461 }
2356
- arch_leave_lazy_mmu_mode();
2357
- pte_unmap_unlock(ptep - 1, ptl);
23582462
23592463 /* Only flush the TLB if we actually modified any entries */
23602464 if (unmapped)
23612465 flush_tlb_range(walk->vma, start, end);
23622466
2467
+ arch_leave_lazy_mmu_mode();
2468
+ pte_unmap_unlock(ptep - 1, ptl);
2469
+
23632470 return 0;
23642471 }
2472
+
2473
+static const struct mm_walk_ops migrate_vma_walk_ops = {
2474
+ .pmd_entry = migrate_vma_collect_pmd,
2475
+ .pte_hole = migrate_vma_collect_hole,
2476
+};
23652477
23662478 /*
23672479 * migrate_vma_collect() - collect pages over a range of virtual addresses
....@@ -2373,22 +2485,22 @@
23732485 */
23742486 static void migrate_vma_collect(struct migrate_vma *migrate)
23752487 {
2376
- struct mm_walk mm_walk = {
2377
- .pmd_entry = migrate_vma_collect_pmd,
2378
- .pte_hole = migrate_vma_collect_hole,
2379
- .vma = migrate->vma,
2380
- .mm = migrate->vma->vm_mm,
2381
- .private = migrate,
2382
- };
2488
+ struct mmu_notifier_range range;
23832489
2384
- mmu_notifier_invalidate_range_start(mm_walk.mm,
2385
- migrate->start,
2386
- migrate->end);
2387
- walk_page_range(migrate->start, migrate->end, &mm_walk);
2388
- mmu_notifier_invalidate_range_end(mm_walk.mm,
2389
- migrate->start,
2390
- migrate->end);
2490
+ /*
2491
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
2492
+ * that the registered device driver can skip invalidating device
2493
+ * private page mappings that won't be migrated.
2494
+ */
2495
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
2496
+ migrate->vma->vm_mm, migrate->start, migrate->end,
2497
+ migrate->pgmap_owner);
2498
+ mmu_notifier_invalidate_range_start(&range);
23912499
2500
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2501
+ &migrate_vma_walk_ops, migrate);
2502
+
2503
+ mmu_notifier_invalidate_range_end(&range);
23922504 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23932505 }
23942506
....@@ -2432,16 +2544,7 @@
24322544 * FIXME proper solution is to rework migration_entry_wait() so
24332545 * it does not need to take a reference on page.
24342546 */
2435
- if (is_device_private_page(page))
2436
- return true;
2437
-
2438
- /*
2439
- * Only allow device public page to be migrated and account for
2440
- * the extra reference count imply by ZONE_DEVICE pages.
2441
- */
2442
- if (!is_device_public_page(page))
2443
- return false;
2444
- extra++;
2547
+ return is_device_private_page(page);
24452548 }
24462549
24472550 /* For file back page */
....@@ -2575,7 +2678,7 @@
25752678 */
25762679 static void migrate_vma_unmap(struct migrate_vma *migrate)
25772680 {
2578
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2681
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
25792682 const unsigned long npages = migrate->npages;
25802683 const unsigned long start = migrate->start;
25812684 unsigned long addr, i, restore = 0;
....@@ -2620,6 +2723,118 @@
26202723 }
26212724 }
26222725
2726
+/**
2727
+ * migrate_vma_setup() - prepare to migrate a range of memory
2728
+ * @args: contains the vma, start, and pfns arrays for the migration
2729
+ *
2730
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2731
+ * without an error.
2732
+ *
2733
+ * Prepare to migrate a range of memory virtual address range by collecting all
2734
+ * the pages backing each virtual address in the range, saving them inside the
2735
+ * src array. Then lock those pages and unmap them. Once the pages are locked
2736
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
2737
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2738
+ * corresponding src array entry. Then restores any pages that are pinned, by
2739
+ * remapping and unlocking those pages.
2740
+ *
2741
+ * The caller should then allocate destination memory and copy source memory to
2742
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2743
+ * flag set). Once these are allocated and copied, the caller must update each
2744
+ * corresponding entry in the dst array with the pfn value of the destination
2745
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2746
+ * (destination pages must have their struct pages locked, via lock_page()).
2747
+ *
2748
+ * Note that the caller does not have to migrate all the pages that are marked
2749
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2750
+ * device memory to system memory. If the caller cannot migrate a device page
2751
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2752
+ * consequences for the userspace process, so it must be avoided if at all
2753
+ * possible.
2754
+ *
2755
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
2756
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
2757
+ * allowing the caller to allocate device memory for those unback virtual
2758
+ * address. For this the caller simply has to allocate device memory and
2759
+ * properly set the destination entry like for regular migration. Note that
2760
+ * this can still fail and thus the device driver must check if the
2761
+ * migration was successful for those entries after calling migrate_vma_pages()
2762
+ * just like for regular migration.
2763
+ *
2764
+ * After that, the caller must call migrate_vma_pages() to go over each entry
2765
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2766
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2767
+ * then migrate_vma_pages() migrates the struct page information from the source
2768
+ * struct page to the destination struct page. If it fails to migrate the
2769
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2770
+ * src array.
2771
+ *
2772
+ * At this point all successfully migrated pages have an entry in the src
2773
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2774
+ * array entry with MIGRATE_PFN_VALID flag set.
2775
+ *
2776
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
2777
+ * successfully migrated, and which were not. Successfully migrated pages will
2778
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2779
+ *
2780
+ * It is safe to update device page table after migrate_vma_pages() because
2781
+ * both destination and source page are still locked, and the mmap_lock is held
2782
+ * in read mode (hence no one can unmap the range being migrated).
2783
+ *
2784
+ * Once the caller is done cleaning things up and updating its page table (if it
2785
+ * chose to do so; this is not an obligation), it finally calls
2786
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
2787
+ * for successfully migrated pages or otherwise restore the CPU page table to
2788
+ * point to the original source pages.
2789
+ */
2790
+int migrate_vma_setup(struct migrate_vma *args)
2791
+{
2792
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2793
+
2794
+ args->start &= PAGE_MASK;
2795
+ args->end &= PAGE_MASK;
2796
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2797
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2798
+ return -EINVAL;
2799
+ if (nr_pages <= 0)
2800
+ return -EINVAL;
2801
+ if (args->start < args->vma->vm_start ||
2802
+ args->start >= args->vma->vm_end)
2803
+ return -EINVAL;
2804
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2805
+ return -EINVAL;
2806
+ if (!args->src || !args->dst)
2807
+ return -EINVAL;
2808
+
2809
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
2810
+ args->cpages = 0;
2811
+ args->npages = 0;
2812
+
2813
+ migrate_vma_collect(args);
2814
+
2815
+ if (args->cpages)
2816
+ migrate_vma_prepare(args);
2817
+ if (args->cpages)
2818
+ migrate_vma_unmap(args);
2819
+
2820
+ /*
2821
+ * At this point pages are locked and unmapped, and thus they have
2822
+ * stable content and can safely be copied to destination memory that
2823
+ * is allocated by the drivers.
2824
+ */
2825
+ return 0;
2826
+
2827
+}
2828
+EXPORT_SYMBOL(migrate_vma_setup);
2829
+
2830
+/*
2831
+ * This code closely matches the code in:
2832
+ * __handle_mm_fault()
2833
+ * handle_pte_fault()
2834
+ * do_anonymous_page()
2835
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2836
+ * private page.
2837
+ */
26232838 static void migrate_vma_insert_page(struct migrate_vma *migrate,
26242839 unsigned long addr,
26252840 struct page *page,
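To make the workflow described in the migrate_vma_setup() kernel-doc above concrete, here is a minimal caller-side sketch. It is illustrative only: alloc_dst_page() and dev_commit_entry() stand in for driver-specific hooks, migrate_pfn() is assumed to be the pfn-encoding helper from <linux/migrate.h>, and depending on the kernel version further struct migrate_vma fields (for example pgmap_owner) may have to be filled in.

/* Sketch of a driver using the exported three-step migration API. */
#include <linux/migrate.h>
#include <linux/mm.h>

/* Hypothetical driver hooks -- not part of the kernel API. */
static struct page *alloc_dst_page(void);
static void dev_commit_entry(unsigned long i, unsigned long dst_entry);

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 unsigned long *src, unsigned long *dst)
{
	struct migrate_vma args = {
		.vma	= vma,
		.start	= start,
		.end	= end,
		.src	= src,
		.dst	= dst,
	};
	unsigned long i;
	int ret;

	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;

	/* Allocate and fill a destination page for every migratable entry. */
	for (i = 0; i < args.npages; i++) {
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = alloc_dst_page();		/* driver-specific */
		if (!dpage)
			continue;			/* entry is simply restored */
		lock_page(dpage);
		/* ... copy the source data into dpage here ... */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}

	migrate_vma_pages(&args);

	/* Only entries still flagged MIGRATE_PFN_MIGRATE actually moved. */
	for (i = 0; i < args.npages; i++)
		if (args.src[i] & MIGRATE_PFN_MIGRATE)
			dev_commit_entry(i, args.dst[i]);	/* driver-specific */

	migrate_vma_finalize(&args);
	return 0;
}

Entries that are skipped (no destination page supplied) keep no MIGRATE_PFN_VALID bit in dst, so migrate_vma_pages() clears their MIGRATE_PFN_MIGRATE flag and migrate_vma_finalize() restores the original mapping, as the kernel-doc above describes.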
....@@ -2628,7 +2843,6 @@
26282843 {
26292844 struct vm_area_struct *vma = migrate->vma;
26302845 struct mm_struct *mm = vma->vm_mm;
2631
- struct mem_cgroup *memcg;
26322846 bool flush = false;
26332847 spinlock_t *ptl;
26342848 pte_t entry;
....@@ -2661,12 +2875,12 @@
26612875 * pte_offset_map() on pmds where a huge pmd might be created
26622876 * from a different thread.
26632877 *
2664
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2878
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
26652879 * parallel threads are excluded by other means.
26662880 *
2667
- * Here we only have down_read(mmap_sem).
2881
+ * Here we only have mmap_read_lock(mm).
26682882 */
2669
- if (pte_alloc(mm, pmdp, addr))
2883
+ if (pte_alloc(mm, pmdp))
26702884 goto abort;
26712885
26722886 /* See the comment in pte_alloc_one_map() */
....@@ -2675,7 +2889,7 @@
26752889
26762890 if (unlikely(anon_vma_prepare(vma)))
26772891 goto abort;
2678
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2892
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
26792893 goto abort;
26802894
26812895 /*
....@@ -2691,11 +2905,13 @@
26912905
26922906 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
26932907 entry = swp_entry_to_pte(swp_entry);
2694
- } else if (is_device_public_page(page)) {
2695
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2696
- if (vma->vm_flags & VM_WRITE)
2697
- entry = pte_mkwrite(pte_mkdirty(entry));
2698
- entry = pte_mkdevmap(entry);
2908
+ } else {
2909
+ /*
2910
+ * For now we only support migrating to un-addressable
2911
+ * device memory.
2912
+ */
2913
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2914
+ goto abort;
26992915 }
27002916 } else {
27012917 entry = mk_pte(page, vma->vm_page_prot);
....@@ -2705,36 +2921,29 @@
27052921
27062922 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
27072923
2924
+ if (check_stable_address_space(mm))
2925
+ goto unlock_abort;
2926
+
27082927 if (pte_present(*ptep)) {
27092928 unsigned long pfn = pte_pfn(*ptep);
27102929
2711
- if (!is_zero_pfn(pfn)) {
2712
- pte_unmap_unlock(ptep, ptl);
2713
- mem_cgroup_cancel_charge(page, memcg, false);
2714
- goto abort;
2715
- }
2930
+ if (!is_zero_pfn(pfn))
2931
+ goto unlock_abort;
27162932 flush = true;
2717
- } else if (!pte_none(*ptep)) {
2718
- pte_unmap_unlock(ptep, ptl);
2719
- mem_cgroup_cancel_charge(page, memcg, false);
2720
- goto abort;
2721
- }
2933
+ } else if (!pte_none(*ptep))
2934
+ goto unlock_abort;
27222935
27232936 /*
2724
- * Check for usefaultfd but do not deliver the fault. Instead,
2937
+ * Check for userfaultfd but do not deliver the fault. Instead,
27252938 * just back off.
27262939 */
2727
- if (userfaultfd_missing(vma)) {
2728
- pte_unmap_unlock(ptep, ptl);
2729
- mem_cgroup_cancel_charge(page, memcg, false);
2730
- goto abort;
2731
- }
2940
+ if (userfaultfd_missing(vma))
2941
+ goto unlock_abort;
27322942
27332943 inc_mm_counter(mm, MM_ANONPAGES);
27342944 page_add_new_anon_rmap(page, vma, addr, false);
2735
- mem_cgroup_commit_charge(page, memcg, false, false);
27362945 if (!is_zone_device_page(page))
2737
- lru_cache_add_active_or_unevictable(page, vma);
2946
+ lru_cache_add_inactive_or_unevictable(page, vma);
27382947 get_page(page);
27392948
27402949 if (flush) {
....@@ -2752,11 +2961,13 @@
27522961 *src = MIGRATE_PFN_MIGRATE;
27532962 return;
27542963
2964
+unlock_abort:
2965
+ pte_unmap_unlock(ptep, ptl);
27552966 abort:
27562967 *src &= ~MIGRATE_PFN_MIGRATE;
27572968 }
27582969
2759
-/*
2970
+/**
27602971 * migrate_vma_pages() - migrate meta-data from src page to dst page
27612972 * @migrate: migrate struct containing all migration information
27622973 *
....@@ -2764,13 +2975,12 @@
27642975 * struct page. This effectively finishes the migration from source page to the
27652976 * destination page.
27662977 */
2767
-static void migrate_vma_pages(struct migrate_vma *migrate)
2978
+void migrate_vma_pages(struct migrate_vma *migrate)
27682979 {
27692980 const unsigned long npages = migrate->npages;
27702981 const unsigned long start = migrate->start;
2771
- struct vm_area_struct *vma = migrate->vma;
2772
- struct mm_struct *mm = vma->vm_mm;
2773
- unsigned long addr, i, mmu_start;
2982
+ struct mmu_notifier_range range;
2983
+ unsigned long addr, i;
27742984 bool notified = false;
27752985
27762986 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
....@@ -2785,15 +2995,17 @@
27852995 }
27862996
27872997 if (!page) {
2788
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2998
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
27892999 continue;
2790
- }
27913000 if (!notified) {
2792
- mmu_start = addr;
27933001 notified = true;
2794
- mmu_notifier_invalidate_range_start(mm,
2795
- mmu_start,
2796
- migrate->end);
3002
+
3003
+ mmu_notifier_range_init(&range,
3004
+ MMU_NOTIFY_CLEAR, 0,
3005
+ NULL,
3006
+ migrate->vma->vm_mm,
3007
+ addr, migrate->end);
3008
+ mmu_notifier_invalidate_range_start(&range);
27973009 }
27983010 migrate_vma_insert_page(migrate, addr, newpage,
27993011 &migrate->src[i],
....@@ -2813,7 +3025,7 @@
28133025 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
28143026 continue;
28153027 }
2816
- } else if (!is_device_public_page(newpage)) {
3028
+ } else {
28173029 /*
28183030 * Other types of ZONE_DEVICE page are not
28193031 * supported.
....@@ -2834,11 +3046,11 @@
28343046 * did already call it.
28353047 */
28363048 if (notified)
2837
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2838
- migrate->end);
3049
+ mmu_notifier_invalidate_range_only_end(&range);
28393050 }
3051
+EXPORT_SYMBOL(migrate_vma_pages);
28403052
2841
-/*
3053
+/**
28423054 * migrate_vma_finalize() - restore CPU page table entry
28433055 * @migrate: migrate struct containing all migration information
28443056 *
....@@ -2849,7 +3061,7 @@
28493061 * This also unlocks the pages and puts them back on the lru, or drops the extra
28503062 * refcount, for device pages.
28513063 */
2852
-static void migrate_vma_finalize(struct migrate_vma *migrate)
3064
+void migrate_vma_finalize(struct migrate_vma *migrate)
28533065 {
28543066 const unsigned long npages = migrate->npages;
28553067 unsigned long i;
....@@ -2876,7 +3088,6 @@
28763088
28773089 remove_migration_ptes(page, newpage, false);
28783090 unlock_page(page);
2879
- migrate->cpages--;
28803091
28813092 if (is_zone_device_page(page))
28823093 put_page(page);
....@@ -2892,124 +3103,5 @@
28923103 }
28933104 }
28943105 }
2895
-
2896
-/*
2897
- * migrate_vma() - migrate a range of memory inside vma
2898
- *
2899
- * @ops: migration callback for allocating destination memory and copying
2900
- * @vma: virtual memory area containing the range to be migrated
2901
- * @start: start address of the range to migrate (inclusive)
2902
- * @end: end address of the range to migrate (exclusive)
2903
- * @src: array of hmm_pfn_t containing source pfns
2904
- * @dst: array of hmm_pfn_t containing destination pfns
2905
- * @private: pointer passed back to each of the callback
2906
- * Returns: 0 on success, error code otherwise
2907
- *
2908
- * This function tries to migrate a range of memory virtual address range, using
2909
- * callbacks to allocate and copy memory from source to destination. First it
2910
- * collects all the pages backing each virtual address in the range, saving this
2911
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
2912
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
2913
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2914
- * in the corresponding src array entry. It then restores any pages that are
2915
- * pinned, by remapping and unlocking those pages.
2916
- *
2917
- * At this point it calls the alloc_and_copy() callback. For documentation on
2918
- * what is expected from that callback, see struct migrate_vma_ops comments in
2919
- * include/linux/migrate.h
2920
- *
2921
- * After the alloc_and_copy() callback, this function goes over each entry in
2922
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2923
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2924
- * then the function tries to migrate struct page information from the source
2925
- * struct page to the destination struct page. If it fails to migrate the struct
2926
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2927
- * array.
2928
- *
2929
- * At this point all successfully migrated pages have an entry in the src
2930
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2931
- * array entry with MIGRATE_PFN_VALID flag set.
2932
- *
2933
- * It then calls the finalize_and_map() callback. See comments for "struct
2934
- * migrate_vma_ops", in include/linux/migrate.h for details about
2935
- * finalize_and_map() behavior.
2936
- *
2937
- * After the finalize_and_map() callback, for successfully migrated pages, this
2938
- * function updates the CPU page table to point to new pages, otherwise it
2939
- * restores the CPU page table to point to the original source pages.
2940
- *
2941
- * Function returns 0 after the above steps, even if no pages were migrated
2942
- * (The function only returns an error if any of the arguments are invalid.)
2943
- *
2944
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2945
- * unsigned long entries.
2946
- */
2947
-int migrate_vma(const struct migrate_vma_ops *ops,
2948
- struct vm_area_struct *vma,
2949
- unsigned long start,
2950
- unsigned long end,
2951
- unsigned long *src,
2952
- unsigned long *dst,
2953
- void *private)
2954
-{
2955
- struct migrate_vma migrate;
2956
-
2957
- /* Sanity check the arguments */
2958
- start &= PAGE_MASK;
2959
- end &= PAGE_MASK;
2960
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
2961
- vma_is_dax(vma))
2962
- return -EINVAL;
2963
- if (start < vma->vm_start || start >= vma->vm_end)
2964
- return -EINVAL;
2965
- if (end <= vma->vm_start || end > vma->vm_end)
2966
- return -EINVAL;
2967
- if (!ops || !src || !dst || start >= end)
2968
- return -EINVAL;
2969
-
2970
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2971
- migrate.src = src;
2972
- migrate.dst = dst;
2973
- migrate.start = start;
2974
- migrate.npages = 0;
2975
- migrate.cpages = 0;
2976
- migrate.end = end;
2977
- migrate.vma = vma;
2978
-
2979
- /* Collect, and try to unmap source pages */
2980
- migrate_vma_collect(&migrate);
2981
- if (!migrate.cpages)
2982
- return 0;
2983
-
2984
- /* Lock and isolate page */
2985
- migrate_vma_prepare(&migrate);
2986
- if (!migrate.cpages)
2987
- return 0;
2988
-
2989
- /* Unmap pages */
2990
- migrate_vma_unmap(&migrate);
2991
- if (!migrate.cpages)
2992
- return 0;
2993
-
2994
- /*
2995
- * At this point pages are locked and unmapped, and thus they have
2996
- * stable content and can safely be copied to destination memory that
2997
- * is allocated by the callback.
2998
- *
2999
- * Note that migration can fail in migrate_vma_struct_page() for each
3000
- * individual page.
3001
- */
3002
- ops->alloc_and_copy(vma, src, dst, start, end, private);
3003
-
3004
- /* This does the real migration of struct page */
3005
- migrate_vma_pages(&migrate);
3006
-
3007
- ops->finalize_and_map(vma, src, dst, start, end, private);
3008
-
3009
- /* Unlock and remap pages */
3010
- migrate_vma_finalize(&migrate);
3011
-
3012
- return 0;
3013
-}
3014
-EXPORT_SYMBOL(migrate_vma);
3015
-#endif /* defined(MIGRATE_VMA_HELPER) */
3106
+EXPORT_SYMBOL(migrate_vma_finalize);
3107
+#endif /* CONFIG_DEVICE_PRIVATE */
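The setup kernel-doc earlier in this patch notes that a driver which cannot migrate a device-private page back to system memory must return VM_FAULT_SIGBUS. A heavily simplified, hypothetical single-page sketch of that direction (the kind of handler a dev_pagemap_ops->migrate_to_ram() callback would run) might look as follows; the copy step is driver-specific, and struct migrate_vma fields beyond vma/start/end/src/dst vary by kernel version.

/* Sketch: migrate one device-private page back to system RAM on CPU fault. */
static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long src = 0, dst = 0;
	struct migrate_vma args = {
		.vma	= vmf->vma,
		.start	= vmf->address & PAGE_MASK,
		.end	= (vmf->address & PAGE_MASK) + PAGE_SIZE,
		.src	= &src,
		.dst	= &dst,
	};
	struct page *dpage;
	vm_fault_t ret = 0;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	/* Nothing collected: return and let the fault be retried. */
	if (!(src & MIGRATE_PFN_MIGRATE))
		goto out;

	dpage = alloc_page(GFP_HIGHUSER);
	if (!dpage) {
		/* Cannot copy the data back to system memory. */
		ret = VM_FAULT_SIGBUS;
		goto out;
	}
	lock_page(dpage);
	/* ... driver-specific copy from device memory into dpage ... */
	dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;

	migrate_vma_pages(&args);
out:
	migrate_vma_finalize(&args);
	return ret;
}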