Commit 071106ecf68c401173c58808b1cf5f68cc50d390 (2024-01-05)
File: kernel/mm/migrate.c
....@@ -38,6 +38,7 @@
3838 #include <linux/hugetlb.h>
3939 #include <linux/hugetlb_cgroup.h>
4040 #include <linux/gfp.h>
41
+#include <linux/pagewalk.h>
4142 #include <linux/pfn_t.h>
4243 #include <linux/memremap.h>
4344 #include <linux/userfaultfd_k.h>
....@@ -47,39 +48,17 @@
4748 #include <linux/page_owner.h>
4849 #include <linux/sched/mm.h>
4950 #include <linux/ptrace.h>
51
+#include <linux/oom.h>
5052
5153 #include <asm/tlbflush.h>
5254
5355 #define CREATE_TRACE_POINTS
5456 #include <trace/events/migrate.h>
57
+#undef CREATE_TRACE_POINTS
58
+#include <trace/hooks/mm.h>
59
+#include <trace/hooks/vmscan.h>
5560
5661 #include "internal.h"
57
-
58
-/*
59
- * migrate_prep() needs to be called before we start compiling a list of pages
60
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
61
- * undesirable, use migrate_prep_local()
62
- */
63
-int migrate_prep(void)
64
-{
65
- /*
66
- * Clear the LRU lists so pages can be isolated.
67
- * Note that pages may be moved off the LRU after we have
68
- * drained them. Those pages will fail to migrate like other
69
- * pages that may be busy.
70
- */
71
- lru_add_drain_all();
72
-
73
- return 0;
74
-}
75
-
76
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
77
-int migrate_prep_local(void)
78
-{
79
- lru_add_drain();
80
-
81
- return 0;
82
-}
8362
8463 int isolate_movable_page(struct page *page, isolate_mode_t mode)
8564 {
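The two helpers removed above, migrate_prep() and migrate_prep_local(), were thin wrappers around lru_add_drain_all() and lru_add_drain(); the do_pages_move() hunk later in this diff switches its caller over to lru_cache_disable()/lru_cache_enable() instead. A minimal sketch of the replacement pattern, assuming a caller that isolates LRU pages for migration:

    lru_cache_disable();    /* drain per-CPU LRU pagevecs and keep batching off */

    /* ... isolate_lru_page() on each candidate, then migrate_pages() ... */

    lru_cache_enable();     /* restore normal LRU batching once migration is done */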
....@@ -100,7 +79,7 @@
10079 /*
10180 * Check PageMovable before holding a PG_lock because page's owner
10281 * assumes anybody doesn't touch PG_lock of newly allocated page
103
- * so unconditionally grapping the lock ruins page's owner side.
82
+ * so unconditionally grabbing the lock ruins page's owner side.
10483 */
10584 if (unlikely(!__PageMovable(page)))
10685 goto out_putpage;
....@@ -129,7 +108,7 @@
129108
130109 /* Driver shouldn't use PG_isolated bit of page->flags */
131110 WARN_ON_ONCE(PageIsolated(page));
132
- __SetPageIsolated(page);
111
+ SetPageIsolated(page);
133112 unlock_page(page);
134113
135114 return 0;
....@@ -153,7 +132,7 @@
153132
154133 mapping = page_mapping(page);
155134 mapping->a_ops->putback_page(page);
156
- __ClearPageIsolated(page);
135
+ ClearPageIsolated(page);
157136 }
158137
159138 /*
....@@ -162,7 +141,7 @@
162141 *
163142 * This function shall be used whenever the isolated pageset has been
164143 * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
165
- * and isolate_huge_page().
144
+ * and isolate_hugetlb().
166145 */
167146 void putback_movable_pages(struct list_head *l)
168147 {
....@@ -186,16 +165,17 @@
186165 if (PageMovable(page))
187166 putback_movable_page(page);
188167 else
189
- __ClearPageIsolated(page);
168
+ ClearPageIsolated(page);
190169 unlock_page(page);
191170 put_page(page);
192171 } else {
193172 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
194
- page_is_file_cache(page), -hpage_nr_pages(page));
173
+ page_is_file_lru(page), -thp_nr_pages(page));
195174 putback_lru_page(page);
196175 }
197176 }
198177 }
178
+EXPORT_SYMBOL_GPL(putback_movable_pages);
199179
200180 /*
201181 * Restore a potential migration pte to a working pte entry
....@@ -240,15 +220,17 @@
240220 */
241221 entry = pte_to_swp_entry(*pvmw.pte);
242222 if (is_write_migration_entry(entry))
243
- pte = maybe_mkwrite(pte, vma);
223
+ pte = maybe_mkwrite(pte, vma->vm_flags);
224
+ else if (pte_swp_uffd_wp(*pvmw.pte))
225
+ pte = pte_mkuffd_wp(pte);
244226
245
- if (unlikely(is_zone_device_page(new))) {
246
- if (is_device_private_page(new)) {
247
- entry = make_device_private_entry(new, pte_write(pte));
248
- pte = swp_entry_to_pte(entry);
249
- } else if (is_device_public_page(new)) {
250
- pte = pte_mkdevmap(pte);
251
- }
227
+ if (unlikely(is_device_private_page(new))) {
228
+ entry = make_device_private_entry(new, pte_write(pte));
229
+ pte = swp_entry_to_pte(entry);
230
+ if (pte_swp_soft_dirty(*pvmw.pte))
231
+ pte = pte_swp_mksoft_dirty(pte);
232
+ if (pte_swp_uffd_wp(*pvmw.pte))
233
+ pte = pte_swp_mkuffd_wp(pte);
252234 }
253235
254236 #ifdef CONFIG_HUGETLB_PAGE
....@@ -322,19 +304,18 @@
322304 goto out;
323305
324306 page = migration_entry_to_page(entry);
307
+ page = compound_head(page);
325308
326309 /*
327
- * Once radix-tree replacement of page migration started, page_count
328
- * *must* be zero. And, we don't want to call wait_on_page_locked()
329
- * against a page without get_page().
330
- * So, we use get_page_unless_zero(), here. Even failed, page fault
331
- * will occur again.
310
+ * Once page cache replacement of page migration started, page_count
311
+ * is zero; but we must not call put_and_wait_on_page_locked() without
312
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
332313 */
333314 if (!get_page_unless_zero(page))
334315 goto out;
335316 pte_unmap_unlock(ptep, ptl);
336
- wait_on_page_locked(page);
337
- put_page(page);
317
+ trace_android_vh_waiting_for_page_migration(page);
318
+ put_and_wait_on_page_locked(page);
338319 return;
339320 out:
340321 pte_unmap_unlock(ptep, ptl);
....@@ -368,63 +349,27 @@
368349 if (!get_page_unless_zero(page))
369350 goto unlock;
370351 spin_unlock(ptl);
371
- wait_on_page_locked(page);
372
- put_page(page);
352
+ put_and_wait_on_page_locked(page);
373353 return;
374354 unlock:
375355 spin_unlock(ptl);
376356 }
377357 #endif
378358
379
-#ifdef CONFIG_BLOCK
380
-/* Returns true if all buffers are successfully locked */
381
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
382
- enum migrate_mode mode)
359
+static int expected_page_refs(struct address_space *mapping, struct page *page)
383360 {
384
- struct buffer_head *bh = head;
361
+ int expected_count = 1;
385362
386
- /* Simple case, sync compaction */
387
- if (mode != MIGRATE_ASYNC) {
388
- do {
389
- get_bh(bh);
390
- lock_buffer(bh);
391
- bh = bh->b_this_page;
363
+ /*
364
+ * Device private pages have an extra refcount as they are
365
+ * ZONE_DEVICE pages.
366
+ */
367
+ expected_count += is_device_private_page(page);
368
+ if (mapping)
369
+ expected_count += thp_nr_pages(page) + page_has_private(page);
392370
393
- } while (bh != head);
394
-
395
- return true;
396
- }
397
-
398
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
399
- do {
400
- get_bh(bh);
401
- if (!trylock_buffer(bh)) {
402
- /*
403
- * We failed to lock the buffer and cannot stall in
404
- * async migration. Release the taken locks
405
- */
406
- struct buffer_head *failed_bh = bh;
407
- put_bh(failed_bh);
408
- bh = head;
409
- while (bh != failed_bh) {
410
- unlock_buffer(bh);
411
- put_bh(bh);
412
- bh = bh->b_this_page;
413
- }
414
- return false;
415
- }
416
-
417
- bh = bh->b_this_page;
418
- } while (bh != head);
419
- return true;
371
+ return expected_count;
420372 }
421
-#else
422
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
423
- enum migrate_mode mode)
424
-{
425
- return true;
426
-}
427
-#endif /* CONFIG_BLOCK */
428373
429374 /*
430375 * Replace the page in the mapping.
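The new expected_page_refs() helper centralizes the reference-count bookkeeping that migrate_page_move_mapping() and, further down, __buffer_migrate_page() rely on. A worked example, assuming an order-9 THP (HPAGE_PMD_NR == 512 with 4K pages) sitting in a file mapping with private data attached:

    int expected = 1        /* reference held by the migration path itself */
                 + 0        /* not a ZONE_DEVICE private page              */
                 + 512      /* thp_nr_pages(page): one ref per cache entry */
                 + 1;       /* page_has_private(page)                      */

    /* migrate_page_move_mapping() requires page_count(page) to equal
     * expected + extra_count before it will freeze the refcount and
     * swap the page in the mapping; anything else returns -EAGAIN.   */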
....@@ -435,21 +380,13 @@
435380 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
436381 */
437382 int migrate_page_move_mapping(struct address_space *mapping,
438
- struct page *newpage, struct page *page,
439
- struct buffer_head *head, enum migrate_mode mode,
440
- int extra_count)
383
+ struct page *newpage, struct page *page, int extra_count)
441384 {
385
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
442386 struct zone *oldzone, *newzone;
443387 int dirty;
444
- int expected_count = 1 + extra_count;
445
- void **pslot;
446
-
447
- /*
448
- * Device public or private pages have an extra refcount as they are
449
- * ZONE_DEVICE pages.
450
- */
451
- expected_count += is_device_private_page(page);
452
- expected_count += is_device_public_page(page);
388
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
389
+ int nr = thp_nr_pages(page);
453390
454391 if (!mapping) {
455392 /* Anonymous page without mapping */
....@@ -468,35 +405,14 @@
468405 oldzone = page_zone(page);
469406 newzone = page_zone(newpage);
470407
471
- xa_lock_irq(&mapping->i_pages);
472
-
473
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
474
- page_index(page));
475
-
476
- expected_count += hpage_nr_pages(page) + page_has_private(page);
477
- if (page_count(page) != expected_count ||
478
- radix_tree_deref_slot_protected(pslot,
479
- &mapping->i_pages.xa_lock) != page) {
480
- xa_unlock_irq(&mapping->i_pages);
408
+ xas_lock_irq(&xas);
409
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
410
+ xas_unlock_irq(&xas);
481411 return -EAGAIN;
482412 }
483413
484414 if (!page_ref_freeze(page, expected_count)) {
485
- xa_unlock_irq(&mapping->i_pages);
486
- return -EAGAIN;
487
- }
488
-
489
- /*
490
- * In the async migration case of moving a page with buffers, lock the
491
- * buffers using trylock before the mapping is moved. If the mapping
492
- * was moved, we later failed to lock the buffers and could not move
493
- * the mapping back due to an elevated page count, we would have to
494
- * block waiting on other references to be dropped.
495
- */
496
- if (mode == MIGRATE_ASYNC && head &&
497
- !buffer_migrate_lock_buffers(head, mode)) {
498
- page_ref_unfreeze(page, expected_count);
499
- xa_unlock_irq(&mapping->i_pages);
415
+ xas_unlock_irq(&xas);
500416 return -EAGAIN;
501417 }
502418
....@@ -506,7 +422,7 @@
506422 */
507423 newpage->index = page->index;
508424 newpage->mapping = page->mapping;
509
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
425
+ page_ref_add(newpage, nr); /* add cache reference */
510426 if (PageSwapBacked(page)) {
511427 __SetPageSwapBacked(newpage);
512428 if (PageSwapCache(page)) {
....@@ -524,16 +440,13 @@
524440 SetPageDirty(newpage);
525441 }
526442
527
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
443
+ xas_store(&xas, newpage);
528444 if (PageTransHuge(page)) {
529445 int i;
530
- int index = page_index(page);
531446
532
- for (i = 1; i < HPAGE_PMD_NR; i++) {
533
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
534
- index + i);
535
- radix_tree_replace_slot(&mapping->i_pages, pslot,
536
- newpage + i);
447
+ for (i = 1; i < nr; i++) {
448
+ xas_next(&xas);
449
+ xas_store(&xas, newpage);
537450 }
538451 }
539452
....@@ -542,9 +455,9 @@
542455 * to one less reference.
543456 * We know this isn't the last reference.
544457 */
545
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
458
+ page_ref_unfreeze(page, expected_count - nr);
546459
547
- xa_unlock(&mapping->i_pages);
460
+ xas_unlock(&xas);
548461 /* Leave irq disabled to prevent preemption while updating stats */
549462
550463 /*
....@@ -558,17 +471,24 @@
558471 * are mapped to swap space.
559472 */
560473 if (newzone != oldzone) {
561
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
562
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
474
+ struct lruvec *old_lruvec, *new_lruvec;
475
+ struct mem_cgroup *memcg;
476
+
477
+ memcg = page_memcg(page);
478
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
479
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
480
+
481
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
482
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
563483 if (PageSwapBacked(page) && !PageSwapCache(page)) {
564
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
565
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
484
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
485
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
566486 }
567
- if (dirty && mapping_cap_account_dirty(mapping)) {
568
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
569
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
570
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
571
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
487
+ if (dirty && mapping_can_writeback(mapping)) {
488
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
489
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
490
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
491
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
572492 }
573493 }
574494 local_irq_enable();
....@@ -584,22 +504,18 @@
584504 int migrate_huge_page_move_mapping(struct address_space *mapping,
585505 struct page *newpage, struct page *page)
586506 {
507
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
587508 int expected_count;
588
- void **pslot;
589509
590
- xa_lock_irq(&mapping->i_pages);
591
-
592
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
593
-
510
+ xas_lock_irq(&xas);
594511 expected_count = 2 + page_has_private(page);
595
- if (page_count(page) != expected_count ||
596
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
597
- xa_unlock_irq(&mapping->i_pages);
512
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
513
+ xas_unlock_irq(&xas);
598514 return -EAGAIN;
599515 }
600516
601517 if (!page_ref_freeze(page, expected_count)) {
602
- xa_unlock_irq(&mapping->i_pages);
518
+ xas_unlock_irq(&xas);
603519 return -EAGAIN;
604520 }
605521
....@@ -608,11 +524,11 @@
608524
609525 get_page(newpage);
610526
611
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
527
+ xas_store(&xas, newpage);
612528
613529 page_ref_unfreeze(page, expected_count - 1);
614530
615
- xa_unlock_irq(&mapping->i_pages);
531
+ xas_unlock_irq(&xas);
616532
617533 return MIGRATEPAGE_SUCCESS;
618534 }
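Both mapping-replacement paths above now drive the page cache through the XArray API instead of raw radix-tree slot lookups, and the buffer-head trylocking has been pulled out of migrate_page_move_mapping() entirely (it reappears below inside __buffer_migrate_page()). A condensed sketch of the xas_* sequence, with the statistics update and the hugetlb variant trimmed:

    XA_STATE(xas, &mapping->i_pages, page_index(page));

    xas_lock_irq(&xas);
    if (page_count(page) != expected_count || xas_load(&xas) != page) {
            xas_unlock_irq(&xas);
            return -EAGAIN;                 /* raced with truncation or another migration */
    }
    if (!page_ref_freeze(page, expected_count)) {
            xas_unlock_irq(&xas);
            return -EAGAIN;
    }
    xas_store(&xas, newpage);               /* head index */
    for (i = 1; i < nr; i++) {              /* nr = thp_nr_pages(page) */
            xas_next(&xas);
            xas_store(&xas, newpage);       /* tail indices also point at the new head */
    }
    page_ref_unfreeze(page, expected_count - nr);
    xas_unlock(&xas);                       /* irqs stay disabled for the stats update */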
....@@ -656,7 +572,7 @@
656572 } else {
657573 /* thp page */
658574 BUG_ON(!PageTransHuge(src));
659
- nr_pages = hpage_nr_pages(src);
575
+ nr_pages = thp_nr_pages(src);
660576 }
661577
662578 for (i = 0; i < nr_pages; i++) {
....@@ -671,6 +587,8 @@
671587 void migrate_page_states(struct page *newpage, struct page *page)
672588 {
673589 int cpupid;
590
+
591
+ trace_android_vh_migrate_page_states(page, newpage);
674592
675593 if (PageError(page))
676594 SetPageError(newpage);
....@@ -689,6 +607,7 @@
689607 SetPageChecked(newpage);
690608 if (PageMappedToDisk(page))
691609 SetPageMappedToDisk(newpage);
610
+ trace_android_vh_look_around_migrate_page(page, newpage);
692611
693612 /* Move dirty on pages not done by migrate_page_move_mapping() */
694613 if (PageDirty(page))
....@@ -723,9 +642,18 @@
723642 if (PageWriteback(newpage))
724643 end_page_writeback(newpage);
725644
645
+ /*
646
+ * PG_readahead shares the same bit with PG_reclaim. The above
647
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
648
+ * bit after that.
649
+ */
650
+ if (PageReadahead(page))
651
+ SetPageReadahead(newpage);
652
+
726653 copy_page_owner(page, newpage);
727654
728
- mem_cgroup_migrate(page, newpage);
655
+ if (!PageHuge(page))
656
+ mem_cgroup_migrate(page, newpage);
729657 }
730658 EXPORT_SYMBOL(migrate_page_states);
731659
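Two details in the hunks above are easy to miss: PG_readahead and PG_reclaim share a single page-flag bit, so the readahead hint can only be copied after end_page_writeback() has run on the new page, and mem_cgroup_migrate() is now skipped for hugetlb pages (hugetlb memory is charged to the hugetlb cgroup controller rather than the memcg state being moved here). The ordering constraint, reduced to the two statements that interact:

    if (PageWriteback(newpage))
            end_page_writeback(newpage);    /* may clear PG_reclaim, i.e. PG_readahead */

    if (PageReadahead(page))                /* so copy the readahead hint only afterwards */
            SetPageReadahead(newpage);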
....@@ -758,7 +686,7 @@
758686
759687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
760688
761
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
689
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
762690
763691 if (rc != MIGRATEPAGE_SUCCESS)
764692 return rc;
....@@ -772,40 +700,96 @@
772700 EXPORT_SYMBOL(migrate_page);
773701
774702 #ifdef CONFIG_BLOCK
775
-/*
776
- * Migration function for pages with buffers. This function can only be used
777
- * if the underlying filesystem guarantees that no other references to "page"
778
- * exist.
779
- */
780
-int buffer_migrate_page(struct address_space *mapping,
781
- struct page *newpage, struct page *page, enum migrate_mode mode)
703
+/* Returns true if all buffers are successfully locked */
704
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
705
+ enum migrate_mode mode)
706
+{
707
+ struct buffer_head *bh = head;
708
+
709
+ /* Simple case, sync compaction */
710
+ if (mode != MIGRATE_ASYNC) {
711
+ do {
712
+ lock_buffer(bh);
713
+ bh = bh->b_this_page;
714
+
715
+ } while (bh != head);
716
+
717
+ return true;
718
+ }
719
+
720
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
721
+ do {
722
+ if (!trylock_buffer(bh)) {
723
+ /*
724
+ * We failed to lock the buffer and cannot stall in
725
+ * async migration. Release the taken locks
726
+ */
727
+ struct buffer_head *failed_bh = bh;
728
+ bh = head;
729
+ while (bh != failed_bh) {
730
+ unlock_buffer(bh);
731
+ bh = bh->b_this_page;
732
+ }
733
+ return false;
734
+ }
735
+
736
+ bh = bh->b_this_page;
737
+ } while (bh != head);
738
+ return true;
739
+}
740
+
741
+static int __buffer_migrate_page(struct address_space *mapping,
742
+ struct page *newpage, struct page *page, enum migrate_mode mode,
743
+ bool check_refs)
782744 {
783745 struct buffer_head *bh, *head;
784746 int rc;
747
+ int expected_count;
785748
786749 if (!page_has_buffers(page))
787750 return migrate_page(mapping, newpage, page, mode);
788751
752
+ /* Check whether page does not have extra refs before we do more work */
753
+ expected_count = expected_page_refs(mapping, page);
754
+ if (page_count(page) != expected_count)
755
+ return -EAGAIN;
756
+
789757 head = page_buffers(page);
758
+ if (!buffer_migrate_lock_buffers(head, mode))
759
+ return -EAGAIN;
790760
791
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
761
+ if (check_refs) {
762
+ bool busy;
763
+ bool invalidated = false;
792764
765
+recheck_buffers:
766
+ busy = false;
767
+ spin_lock(&mapping->private_lock);
768
+ bh = head;
769
+ do {
770
+ if (atomic_read(&bh->b_count)) {
771
+ busy = true;
772
+ break;
773
+ }
774
+ bh = bh->b_this_page;
775
+ } while (bh != head);
776
+ if (busy) {
777
+ if (invalidated) {
778
+ rc = -EAGAIN;
779
+ goto unlock_buffers;
780
+ }
781
+ spin_unlock(&mapping->private_lock);
782
+ invalidate_bh_lrus();
783
+ invalidated = true;
784
+ goto recheck_buffers;
785
+ }
786
+ }
787
+
788
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
793789 if (rc != MIGRATEPAGE_SUCCESS)
794
- return rc;
790
+ goto unlock_buffers;
795791
796
- /*
797
- * In the async case, migrate_page_move_mapping locked the buffers
798
- * with an IRQ-safe spinlock held. In the sync case, the buffers
799
- * need to be locked now
800
- */
801
- if (mode != MIGRATE_ASYNC)
802
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
803
-
804
- ClearPagePrivate(page);
805
- set_page_private(newpage, page_private(page));
806
- set_page_private(page, 0);
807
- put_page(page);
808
- get_page(newpage);
792
+ attach_page_private(newpage, detach_page_private(page));
809793
810794 bh = head;
811795 do {
....@@ -814,24 +798,48 @@
814798
815799 } while (bh != head);
816800
817
- SetPagePrivate(newpage);
818
-
819801 if (mode != MIGRATE_SYNC_NO_COPY)
820802 migrate_page_copy(newpage, page);
821803 else
822804 migrate_page_states(newpage, page);
823805
806
+ rc = MIGRATEPAGE_SUCCESS;
807
+unlock_buffers:
808
+ if (check_refs)
809
+ spin_unlock(&mapping->private_lock);
824810 bh = head;
825811 do {
826812 unlock_buffer(bh);
827
- put_bh(bh);
828813 bh = bh->b_this_page;
829814
830815 } while (bh != head);
831816
832
- return MIGRATEPAGE_SUCCESS;
817
+ return rc;
818
+}
819
+
820
+/*
821
+ * Migration function for pages with buffers. This function can only be used
822
+ * if the underlying filesystem guarantees that no other references to "page"
823
+ * exist. For example attached buffer heads are accessed only under page lock.
824
+ */
825
+int buffer_migrate_page(struct address_space *mapping,
826
+ struct page *newpage, struct page *page, enum migrate_mode mode)
827
+{
828
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
833829 }
834830 EXPORT_SYMBOL(buffer_migrate_page);
831
+
832
+/*
833
+ * Same as above except that this variant is more careful and checks that there
834
+ * are also no buffer head references. This function is the right one for
835
+ * mappings where buffer heads are directly looked up and referenced (such as
836
+ * block device mappings).
837
+ */
838
+int buffer_migrate_page_norefs(struct address_space *mapping,
839
+ struct page *newpage, struct page *page, enum migrate_mode mode)
840
+{
841
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
842
+}
835843 #endif
836844
837845 /*
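In __buffer_migrate_page() above, the open-coded handover of page_private() (ClearPagePrivate plus set_page_private plus a put_page/get_page pair) collapses into one attach_page_private(newpage, detach_page_private(page)) call, and the stricter buffer_migrate_page_norefs() variant refuses to migrate while any buffer head still has an elevated b_count, retrying once after invalidate_bh_lrus(). A sketch of what the two private-data helpers do, paraphrased from include/linux/pagemap.h:

    void *data = detach_page_private(page);
    /* detach: ClearPagePrivate(page), zero page_private(page),
     * put_page(page), and hand back the old private pointer     */

    attach_page_private(newpage, data);
    /* attach: get_page(newpage), store the pointer in
     * page_private(newpage), SetPagePrivate(newpage)            */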
....@@ -899,7 +907,7 @@
899907 */
900908 if (page_has_private(page) &&
901909 !try_to_release_page(page, GFP_KERNEL))
902
- return -EAGAIN;
910
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
903911
904912 return migrate_page(mapping, newpage, page, mode);
905913 }
....@@ -951,7 +959,7 @@
951959 VM_BUG_ON_PAGE(!PageIsolated(page), page);
952960 if (!PageMovable(page)) {
953961 rc = MIGRATEPAGE_SUCCESS;
954
- __ClearPageIsolated(page);
962
+ ClearPageIsolated(page);
955963 goto out;
956964 }
957965
....@@ -973,23 +981,23 @@
973981 * We clear PG_movable under page_lock so any compactor
974982 * cannot try to migrate this page.
975983 */
976
- __ClearPageIsolated(page);
984
+ ClearPageIsolated(page);
977985 }
978986
979987 /*
980
- * Anonymous and movable page->mapping will be cleard by
988
+ * Anonymous and movable page->mapping will be cleared by
981989 * free_pages_prepare so don't reset it here for keeping
982990 * the type to work PageAnon, for example.
983991 */
984992 if (!PageMappingFlags(page))
985993 page->mapping = NULL;
986994
987
- if (unlikely(is_zone_device_page(newpage))) {
988
- if (is_device_public_page(newpage))
989
- flush_dcache_page(newpage);
990
- } else
991
- flush_dcache_page(newpage);
995
+ if (likely(!is_zone_device_page(newpage))) {
996
+ int i, nr = compound_nr(newpage);
992997
998
+ for (i = 0; i < nr; i++)
999
+ flush_dcache_page(newpage + i);
1000
+ }
9931001 }
9941002 out:
9951003 return rc;
....@@ -1013,7 +1021,7 @@
10131021 * to the LRU. Later, when the IO completes the pages are
10141022 * marked uptodate and unlocked. However, the queueing
10151023 * could be merging multiple pages for one bio (e.g.
1016
- * mpage_readpages). If an allocation happens for the
1024
+ * mpage_readahead). If an allocation happens for the
10171025 * second or third page, the process can end up locking
10181026 * the same page twice and deadlocking. Rather than
10191027 * trying to be clever about what pages can be locked,
....@@ -1101,8 +1109,7 @@
11011109 /* Establish migration ptes */
11021110 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
11031111 page);
1104
- try_to_unmap(page,
1105
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1112
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
11061113 page_was_mapped = 1;
11071114 }
11081115
....@@ -1141,34 +1148,19 @@
11411148 }
11421149
11431150 /*
1144
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
1145
- * around it.
1146
- */
1147
-#if defined(CONFIG_ARM) && \
1148
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
1149
-#define ICE_noinline noinline
1150
-#else
1151
-#define ICE_noinline
1152
-#endif
1153
-
1154
-/*
11551151 * Obtain the lock on page, remove all ptes and migrate the page
11561152 * to the newly allocated page in newpage.
11571153 */
1158
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1154
+static int unmap_and_move(new_page_t get_new_page,
11591155 free_page_t put_new_page,
11601156 unsigned long private, struct page *page,
11611157 int force, enum migrate_mode mode,
11621158 enum migrate_reason reason)
11631159 {
11641160 int rc = MIGRATEPAGE_SUCCESS;
1165
- struct page *newpage;
1161
+ struct page *newpage = NULL;
11661162
11671163 if (!thp_migration_supported() && PageTransHuge(page))
1168
- return -ENOMEM;
1169
-
1170
- newpage = get_new_page(page, private);
1171
- if (!newpage)
11721164 return -ENOMEM;
11731165
11741166 if (page_count(page) == 1) {
....@@ -1178,15 +1170,15 @@
11781170 if (unlikely(__PageMovable(page))) {
11791171 lock_page(page);
11801172 if (!PageMovable(page))
1181
- __ClearPageIsolated(page);
1173
+ ClearPageIsolated(page);
11821174 unlock_page(page);
11831175 }
1184
- if (put_new_page)
1185
- put_new_page(newpage, private);
1186
- else
1187
- put_page(newpage);
11881176 goto out;
11891177 }
1178
+
1179
+ newpage = get_new_page(page, private);
1180
+ if (!newpage)
1181
+ return -ENOMEM;
11901182
11911183 rc = __unmap_and_move(page, newpage, force, mode);
11921184 if (rc == MIGRATEPAGE_SUCCESS)
....@@ -1197,8 +1189,7 @@
11971189 /*
11981190 * A page that has been migrated has all references
11991191 * removed and will be freed. A page that has not been
1200
- * migrated will have kepts its references and be
1201
- * restored.
1192
+ * migrated will have kept its references and be restored.
12021193 */
12031194 list_del(&page->lru);
12041195
....@@ -1209,7 +1200,7 @@
12091200 */
12101201 if (likely(!__PageMovable(page)))
12111202 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1212
- page_is_file_cache(page), -hpage_nr_pages(page));
1203
+ page_is_file_lru(page), -thp_nr_pages(page));
12131204 }
12141205
12151206 /*
....@@ -1218,16 +1209,11 @@
12181209 * we want to retry.
12191210 */
12201211 if (rc == MIGRATEPAGE_SUCCESS) {
1221
- put_page(page);
1222
- if (reason == MR_MEMORY_FAILURE) {
1212
+ if (reason != MR_MEMORY_FAILURE)
12231213 /*
1224
- * Set PG_HWPoison on just freed page
1225
- * intentionally. Although it's rather weird,
1226
- * it's how HWPoison flag works at the moment.
1214
+ * We release the page in page_handle_poison.
12271215 */
1228
- if (set_hwpoison_free_buddy_page(page))
1229
- num_poisoned_pages_inc();
1230
- }
1216
+ put_page(page);
12311217 } else {
12321218 if (rc != -EAGAIN) {
12331219 if (likely(!__PageMovable(page))) {
....@@ -1239,7 +1225,7 @@
12391225 if (PageMovable(page))
12401226 putback_movable_page(page);
12411227 else
1242
- __ClearPageIsolated(page);
1228
+ ClearPageIsolated(page);
12431229 unlock_page(page);
12441230 put_page(page);
12451231 }
....@@ -1280,9 +1266,10 @@
12801266 int page_was_mapped = 0;
12811267 struct page *new_hpage;
12821268 struct anon_vma *anon_vma = NULL;
1269
+ struct address_space *mapping = NULL;
12831270
12841271 /*
1285
- * Movability of hugepages depends on architectures and hugepage size.
1272
+ * Migratability of hugepages depends on architectures and their size.
12861273 * This check is necessary because some callers of hugepage migration
12871274 * like soft offline and memory hotremove don't walk through page
12881275 * tables or check whether the hugepage is pmd-based or not before
....@@ -1327,9 +1314,29 @@
13271314 goto put_anon;
13281315
13291316 if (page_mapped(hpage)) {
1330
- try_to_unmap(hpage,
1331
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1317
+ bool mapping_locked = false;
1318
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
1319
+
1320
+ if (!PageAnon(hpage)) {
1321
+ /*
1322
+ * In shared mappings, try_to_unmap could potentially
1323
+ * call huge_pmd_unshare. Because of this, take
1324
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
1325
+ * to let lower levels know we have taken the lock.
1326
+ */
1327
+ mapping = hugetlb_page_mapping_lock_write(hpage);
1328
+ if (unlikely(!mapping))
1329
+ goto unlock_put_anon;
1330
+
1331
+ mapping_locked = true;
1332
+ ttu |= TTU_RMAP_LOCKED;
1333
+ }
1334
+
1335
+ try_to_unmap(hpage, ttu);
13321336 page_was_mapped = 1;
1337
+
1338
+ if (mapping_locked)
1339
+ i_mmap_unlock_write(mapping);
13331340 }
13341341
13351342 if (!page_mapped(hpage))
....@@ -1339,6 +1346,7 @@
13391346 remove_migration_ptes(hpage,
13401347 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
13411348
1349
+unlock_put_anon:
13421350 unlock_page(new_hpage);
13431351
13441352 put_anon:
....@@ -1395,22 +1403,37 @@
13951403 enum migrate_mode mode, int reason)
13961404 {
13971405 int retry = 1;
1406
+ int thp_retry = 1;
13981407 int nr_failed = 0;
13991408 int nr_succeeded = 0;
1409
+ int nr_thp_succeeded = 0;
1410
+ int nr_thp_failed = 0;
1411
+ int nr_thp_split = 0;
14001412 int pass = 0;
1413
+ bool is_thp = false;
14011414 struct page *page;
14021415 struct page *page2;
14031416 int swapwrite = current->flags & PF_SWAPWRITE;
1404
- int rc;
1417
+ int rc, nr_subpages;
1418
+
1419
+ trace_mm_migrate_pages_start(mode, reason);
14051420
14061421 if (!swapwrite)
14071422 current->flags |= PF_SWAPWRITE;
14081423
1409
- for(pass = 0; pass < 10 && retry; pass++) {
1424
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
14101425 retry = 0;
1426
+ thp_retry = 0;
14111427
14121428 list_for_each_entry_safe(page, page2, from, lru) {
14131429 retry:
1430
+ /*
1431
+ * THP statistics is based on the source huge page.
1432
+ * Capture required information that might get lost
1433
+ * during migration.
1434
+ */
1435
+ is_thp = PageTransHuge(page) && !PageHuge(page);
1436
+ nr_subpages = thp_nr_pages(page);
14141437 cond_resched();
14151438
14161439 if (PageHuge(page))
....@@ -1435,21 +1458,35 @@
14351458 * we encounter them after the rest of the list
14361459 * is processed.
14371460 */
1438
- if (PageTransHuge(page) && !PageHuge(page)) {
1461
+ if (is_thp) {
14391462 lock_page(page);
14401463 rc = split_huge_page_to_list(page, from);
14411464 unlock_page(page);
14421465 if (!rc) {
14431466 list_safe_reset_next(page, page2, lru);
1467
+ nr_thp_split++;
14441468 goto retry;
14451469 }
1470
+
1471
+ nr_thp_failed++;
1472
+ nr_failed += nr_subpages;
1473
+ goto out;
14461474 }
14471475 nr_failed++;
14481476 goto out;
14491477 case -EAGAIN:
1478
+ if (is_thp) {
1479
+ thp_retry++;
1480
+ break;
1481
+ }
14501482 retry++;
14511483 break;
14521484 case MIGRATEPAGE_SUCCESS:
1485
+ if (is_thp) {
1486
+ nr_thp_succeeded++;
1487
+ nr_succeeded += nr_subpages;
1488
+ break;
1489
+ }
14531490 nr_succeeded++;
14541491 break;
14551492 default:
....@@ -1459,24 +1496,76 @@
14591496 * removed from migration page list and not
14601497 * retried in the next outer loop.
14611498 */
1499
+ if (is_thp) {
1500
+ nr_thp_failed++;
1501
+ nr_failed += nr_subpages;
1502
+ break;
1503
+ }
14621504 nr_failed++;
14631505 break;
14641506 }
14651507 }
14661508 }
1467
- nr_failed += retry;
1509
+ nr_failed += retry + thp_retry;
1510
+ nr_thp_failed += thp_retry;
14681511 rc = nr_failed;
14691512 out:
1470
- if (nr_succeeded)
1471
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1472
- if (nr_failed)
1473
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
1474
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1513
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1514
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
1515
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1516
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1517
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1518
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1519
+ nr_thp_failed, nr_thp_split, mode, reason);
14751520
14761521 if (!swapwrite)
14771522 current->flags &= ~PF_SWAPWRITE;
14781523
14791524 return rc;
1525
+}
1526
+EXPORT_SYMBOL_GPL(migrate_pages);
1527
+
1528
+struct page *alloc_migration_target(struct page *page, unsigned long private)
1529
+{
1530
+ struct migration_target_control *mtc;
1531
+ gfp_t gfp_mask;
1532
+ unsigned int order = 0;
1533
+ struct page *new_page = NULL;
1534
+ int nid;
1535
+ int zidx;
1536
+
1537
+ mtc = (struct migration_target_control *)private;
1538
+ gfp_mask = mtc->gfp_mask;
1539
+ nid = mtc->nid;
1540
+ if (nid == NUMA_NO_NODE)
1541
+ nid = page_to_nid(page);
1542
+
1543
+ if (PageHuge(page)) {
1544
+ struct hstate *h = page_hstate(compound_head(page));
1545
+
1546
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1547
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
1548
+ }
1549
+
1550
+ if (PageTransHuge(page)) {
1551
+ /*
1552
+ * clear __GFP_RECLAIM to make the migration callback
1553
+ * consistent with regular THP allocations.
1554
+ */
1555
+ gfp_mask &= ~__GFP_RECLAIM;
1556
+ gfp_mask |= GFP_TRANSHUGE;
1557
+ order = HPAGE_PMD_ORDER;
1558
+ }
1559
+ zidx = zone_idx(page_zone(page));
1560
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
1561
+ gfp_mask |= __GFP_HIGHMEM;
1562
+
1563
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
1564
+
1565
+ if (new_page && PageTransHuge(new_page))
1566
+ prep_transhuge_page(new_page);
1567
+
1568
+ return new_page;
14801569 }
14811570
14821571 #ifdef CONFIG_NUMA
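alloc_migration_target() becomes the common allocation callback for migrate_pages(): the opaque private argument now carries a pointer to a struct migration_target_control naming the target node, nodemask and GFP mask, and the helper internally picks the hugetlb, THP or order-0 allocator as appropriate. A sketch of a caller, modeled on the do_move_pages_to_node() change just below (the GFP flags shown are the ones that hunk uses):

    struct migration_target_control mtc = {
            .nid      = node,       /* NUMA_NO_NODE: allocate on the source page's node */
            .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
    };

    err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
    if (err)
            putback_movable_pages(&pagelist);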
....@@ -1496,12 +1585,13 @@
14961585 struct list_head *pagelist, int node)
14971586 {
14981587 int err;
1588
+ struct migration_target_control mtc = {
1589
+ .nid = node,
1590
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1591
+ };
14991592
1500
- if (list_empty(pagelist))
1501
- return 0;
1502
-
1503
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
1504
- MIGRATE_SYNC, MR_SYSCALL);
1593
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
1594
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
15051595 if (err)
15061596 putback_movable_pages(pagelist);
15071597 return err;
....@@ -1524,7 +1614,7 @@
15241614 unsigned int follflags;
15251615 int err;
15261616
1527
- down_read(&mm->mmap_sem);
1617
+ mmap_read_lock(mm);
15281618 err = -EFAULT;
15291619 vma = find_vma(mm, addr);
15301620 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
....@@ -1552,8 +1642,9 @@
15521642
15531643 if (PageHuge(page)) {
15541644 if (PageHead(page)) {
1555
- isolate_huge_page(page, pagelist);
1556
- err = 1;
1645
+ err = isolate_hugetlb(page, pagelist);
1646
+ if (!err)
1647
+ err = 1;
15571648 }
15581649 } else {
15591650 struct page *head;
....@@ -1566,8 +1657,8 @@
15661657 err = 1;
15671658 list_add_tail(&head->lru, pagelist);
15681659 mod_node_page_state(page_pgdat(head),
1569
- NR_ISOLATED_ANON + page_is_file_cache(head),
1570
- hpage_nr_pages(head));
1660
+ NR_ISOLATED_ANON + page_is_file_lru(head),
1661
+ thp_nr_pages(head));
15711662 }
15721663 out_putpage:
15731664 /*
....@@ -1575,10 +1666,36 @@
15751666 * isolate_lru_page() or drop the page ref if it was
15761667 * not isolated.
15771668 */
1578
- put_page(page);
1669
+ put_user_page(page);
15791670 out:
1580
- up_read(&mm->mmap_sem);
1671
+ mmap_read_unlock(mm);
15811672 return err;
1673
+}
1674
+
1675
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
1676
+ struct list_head *pagelist, int __user *status,
1677
+ int start, int i, unsigned long nr_pages)
1678
+{
1679
+ int err;
1680
+
1681
+ if (list_empty(pagelist))
1682
+ return 0;
1683
+
1684
+ err = do_move_pages_to_node(mm, pagelist, node);
1685
+ if (err) {
1686
+ /*
1687
+ * Positive err means the number of failed
1688
+ * pages to migrate. Since we are going to
1689
+ * abort and return the number of non-migrated
1690
+ * pages, so need to incude the rest of the
1691
+ * nr_pages that have not been attempted as
1692
+ * well.
1693
+ */
1694
+ if (err > 0)
1695
+ err += nr_pages - i - 1;
1696
+ return err;
1697
+ }
1698
+ return store_status(status, start, node, i - start);
15821699 }
15831700
15841701 /*
....@@ -1596,7 +1713,7 @@
15961713 int start, i;
15971714 int err = 0, err1;
15981715
1599
- migrate_prep();
1716
+ lru_cache_disable();
16001717
16011718 for (i = start = 0; i < nr_pages; i++) {
16021719 const void __user *p;
....@@ -1624,21 +1741,8 @@
16241741 current_node = node;
16251742 start = i;
16261743 } else if (node != current_node) {
1627
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1628
- if (err) {
1629
- /*
1630
- * Positive err means the number of failed
1631
- * pages to migrate. Since we are going to
1632
- * abort and return the number of non-migrated
1633
- * pages, so need to incude the rest of the
1634
- * nr_pages that have not been attempted as
1635
- * well.
1636
- */
1637
- if (err > 0)
1638
- err += nr_pages - i - 1;
1639
- goto out;
1640
- }
1641
- err = store_status(status, start, current_node, i - start);
1744
+ err = move_pages_and_store_status(mm, current_node,
1745
+ &pagelist, status, start, i, nr_pages);
16421746 if (err)
16431747 goto out;
16441748 start = i;
....@@ -1652,52 +1756,33 @@
16521756 err = add_page_for_migration(mm, addr, current_node,
16531757 &pagelist, flags & MPOL_MF_MOVE_ALL);
16541758
1655
- if (!err) {
1656
- /* The page is already on the target node */
1657
- err = store_status(status, i, current_node, 1);
1658
- if (err)
1659
- goto out_flush;
1660
- continue;
1661
- } else if (err > 0) {
1759
+ if (err > 0) {
16621760 /* The page is successfully queued for migration */
16631761 continue;
16641762 }
16651763
1666
- err = store_status(status, i, err, 1);
1764
+ /*
1765
+ * If the page is already on the target node (!err), store the
1766
+ * node, otherwise, store the err.
1767
+ */
1768
+ err = store_status(status, i, err ? : current_node, 1);
16671769 if (err)
16681770 goto out_flush;
16691771
1670
- err = do_move_pages_to_node(mm, &pagelist, current_node);
1671
- if (err) {
1672
- if (err > 0)
1673
- err += nr_pages - i - 1;
1772
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
1773
+ status, start, i, nr_pages);
1774
+ if (err)
16741775 goto out;
1675
- }
1676
- if (i > start) {
1677
- err = store_status(status, start, current_node, i - start);
1678
- if (err)
1679
- goto out;
1680
- }
16811776 current_node = NUMA_NO_NODE;
16821777 }
16831778 out_flush:
1684
- if (list_empty(&pagelist))
1685
- return err;
1686
-
16871779 /* Make sure we do not overwrite the existing error */
1688
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
1689
- /*
1690
- * Don't have to report non-attempted pages here since:
1691
- * - If the above loop is done gracefully all pages have been
1692
- * attempted.
1693
- * - If the above loop is aborted it means a fatal error
1694
- * happened, should return ret.
1695
- */
1696
- if (!err1)
1697
- err1 = store_status(status, start, current_node, i - start);
1780
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1781
+ status, start, i, nr_pages);
16981782 if (err >= 0)
16991783 err = err1;
17001784 out:
1785
+ lru_cache_enable();
17011786 return err;
17021787 }
17031788
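The status-reporting rework above (move_pages_and_store_status(), storing either the target node or the per-page errno through one path) is easiest to see from the userspace side of move_pages(2). A small self-contained example, assuming a NUMA machine where node 1 exists and libnuma's numaif.h is installed (build with -lnuma):

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGESIZE);
            void *buf;
            void *pages[1];
            int nodes[1]  = { 1 };          /* requested destination node */
            int status[1] = { -1 };

            if (posix_memalign(&buf, psz, psz))
                    return 1;
            memset(buf, 0, psz);            /* fault the page in first */

            pages[0] = buf;
            if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE) < 0) {
                    perror("move_pages");
                    return 1;
            }

            /* status[0] is the node the page ended up on, or a negative
             * errno (-ENOENT, -EBUSY, ...) for that particular page.    */
            printf("page is on node %d\n", status[0]);
            free(buf);
            return 0;
    }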
....@@ -1709,7 +1794,7 @@
17091794 {
17101795 unsigned long i;
17111796
1712
- down_read(&mm->mmap_sem);
1797
+ mmap_read_lock(mm);
17131798
17141799 for (i = 0; i < nr_pages; i++) {
17151800 unsigned long addr = (unsigned long)(*pages);
....@@ -1736,7 +1821,7 @@
17361821 status++;
17371822 }
17381823
1739
- up_read(&mm->mmap_sem);
1824
+ mmap_read_unlock(mm);
17401825 }
17411826
17421827 /*
....@@ -1773,6 +1858,53 @@
17731858 return nr_pages ? -EFAULT : 0;
17741859 }
17751860
1861
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1862
+{
1863
+ struct task_struct *task;
1864
+ struct mm_struct *mm;
1865
+
1866
+ /*
1867
+ * There is no need to check if current process has the right to modify
1868
+ * the specified process when they are same.
1869
+ */
1870
+ if (!pid) {
1871
+ mmget(current->mm);
1872
+ *mem_nodes = cpuset_mems_allowed(current);
1873
+ return current->mm;
1874
+ }
1875
+
1876
+ /* Find the mm_struct */
1877
+ rcu_read_lock();
1878
+ task = find_task_by_vpid(pid);
1879
+ if (!task) {
1880
+ rcu_read_unlock();
1881
+ return ERR_PTR(-ESRCH);
1882
+ }
1883
+ get_task_struct(task);
1884
+
1885
+ /*
1886
+ * Check if this process has the right to modify the specified
1887
+ * process. Use the regular "ptrace_may_access()" checks.
1888
+ */
1889
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1890
+ rcu_read_unlock();
1891
+ mm = ERR_PTR(-EPERM);
1892
+ goto out;
1893
+ }
1894
+ rcu_read_unlock();
1895
+
1896
+ mm = ERR_PTR(security_task_movememory(task));
1897
+ if (IS_ERR(mm))
1898
+ goto out;
1899
+ *mem_nodes = cpuset_mems_allowed(task);
1900
+ mm = get_task_mm(task);
1901
+out:
1902
+ put_task_struct(task);
1903
+ if (!mm)
1904
+ mm = ERR_PTR(-EINVAL);
1905
+ return mm;
1906
+}
1907
+
17761908 /*
17771909 * Move a list of pages in the address space of the currently executing
17781910 * process.
....@@ -1782,7 +1914,6 @@
17821914 const int __user *nodes,
17831915 int __user *status, int flags)
17841916 {
1785
- struct task_struct *task;
17861917 struct mm_struct *mm;
17871918 int err;
17881919 nodemask_t task_nodes;
....@@ -1794,36 +1925,9 @@
17941925 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
17951926 return -EPERM;
17961927
1797
- /* Find the mm_struct */
1798
- rcu_read_lock();
1799
- task = pid ? find_task_by_vpid(pid) : current;
1800
- if (!task) {
1801
- rcu_read_unlock();
1802
- return -ESRCH;
1803
- }
1804
- get_task_struct(task);
1805
-
1806
- /*
1807
- * Check if this process has the right to modify the specified
1808
- * process. Use the regular "ptrace_may_access()" checks.
1809
- */
1810
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1811
- rcu_read_unlock();
1812
- err = -EPERM;
1813
- goto out;
1814
- }
1815
- rcu_read_unlock();
1816
-
1817
- err = security_task_movememory(task);
1818
- if (err)
1819
- goto out;
1820
-
1821
- task_nodes = cpuset_mems_allowed(task);
1822
- mm = get_task_mm(task);
1823
- put_task_struct(task);
1824
-
1825
- if (!mm)
1826
- return -EINVAL;
1928
+ mm = find_mm_struct(pid, &task_nodes);
1929
+ if (IS_ERR(mm))
1930
+ return PTR_ERR(mm);
18271931
18281932 if (nodes)
18291933 err = do_pages_move(mm, task_nodes, nr_pages, pages,
....@@ -1832,10 +1936,6 @@
18321936 err = do_pages_stat(mm, nr_pages, pages, status);
18331937
18341938 mmput(mm);
1835
- return err;
1836
-
1837
-out:
1838
- put_task_struct(task);
18391939 return err;
18401940 }
18411941
....@@ -1889,7 +1989,7 @@
18891989 if (!zone_watermark_ok(zone, 0,
18901990 high_wmark_pages(zone) +
18911991 nr_migrate_pages,
1892
- 0, 0))
1992
+ ZONE_MOVABLE, 0))
18931993 continue;
18941994 return true;
18951995 }
....@@ -1918,7 +2018,7 @@
19182018 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
19192019
19202020 /* Avoid migrating to a node that is nearly full */
1921
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
2021
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
19222022 return 0;
19232023
19242024 if (isolate_lru_page(page))
....@@ -1936,9 +2036,9 @@
19362036 return 0;
19372037 }
19382038
1939
- page_lru = page_is_file_cache(page);
2039
+ page_lru = page_is_file_lru(page);
19402040 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1941
- hpage_nr_pages(page));
2041
+ thp_nr_pages(page));
19422042
19432043 /*
19442044 * Isolating the page has taken another reference, so the
....@@ -1960,7 +2060,7 @@
19602060 * node. Caller is expected to have an elevated reference count on
19612061 * the page that will be dropped by this function before returning.
19622062 */
1963
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2063
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
19642064 int node)
19652065 {
19662066 pg_data_t *pgdat = NODE_DATA(node);
....@@ -1972,15 +2072,15 @@
19722072 * Don't migrate file pages that are mapped in multiple processes
19732073 * with execute permissions as they are probably shared libraries.
19742074 */
1975
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1976
- (vma->vm_flags & VM_EXEC))
2075
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2076
+ (vmf->vma_flags & VM_EXEC))
19772077 goto out;
19782078
19792079 /*
19802080 * Also do not migrate dirty pages as not all filesystems can move
19812081 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
19822082 */
1983
- if (page_is_file_cache(page) && PageDirty(page))
2083
+ if (page_is_file_lru(page) && PageDirty(page))
19842084 goto out;
19852085
19862086 isolated = numamigrate_isolate_page(pgdat, page);
....@@ -1995,7 +2095,7 @@
19952095 if (!list_empty(&migratepages)) {
19962096 list_del(&page->lru);
19972097 dec_node_page_state(page, NR_ISOLATED_ANON +
1998
- page_is_file_cache(page));
2098
+ page_is_file_lru(page));
19992099 putback_lru_page(page);
20002100 }
20012101 isolated = 0;
....@@ -2025,9 +2125,8 @@
20252125 pg_data_t *pgdat = NODE_DATA(node);
20262126 int isolated = 0;
20272127 struct page *new_page = NULL;
2028
- int page_lru = page_is_file_cache(page);
2029
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
2030
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2128
+ int page_lru = page_is_file_lru(page);
2129
+ unsigned long start = address & HPAGE_PMD_MASK;
20312130
20322131 new_page = alloc_pages_node(node,
20332132 (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
....@@ -2050,15 +2149,15 @@
20502149 /* anon mapping, we can simply copy page->mapping to the new page: */
20512150 new_page->mapping = page->mapping;
20522151 new_page->index = page->index;
2152
+ /* flush the cache before copying using the kernel virtual address */
2153
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
20532154 migrate_page_copy(new_page, page);
20542155 WARN_ON(PageLRU(new_page));
20552156
20562157 /* Recheck the target PMD */
2057
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
20582158 ptl = pmd_lock(mm, pmd);
20592159 if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
20602160 spin_unlock(ptl);
2061
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
20622161
20632162 /* Reverse changes made by migrate_page_copy() */
20642163 if (TestClearPageActive(new_page))
....@@ -2089,8 +2188,7 @@
20892188 * new page and page_add_new_anon_rmap guarantee the copy is
20902189 * visible before the pagetable update.
20912190 */
2092
- flush_cache_range(vma, mmun_start, mmun_end);
2093
- page_add_anon_rmap(new_page, vma, mmun_start, true);
2191
+ page_add_anon_rmap(new_page, vma, start, true);
20942192 /*
20952193 * At this point the pmd is numa/protnone (i.e. non present) and the TLB
20962194 * has already been flushed globally. So no TLB can be currently
....@@ -2098,11 +2196,11 @@
20982196 * pmd before doing set_pmd_at(), nor to flush the TLB after
20992197 * set_pmd_at(). Clearing the pmd here would introduce a race
21002198 * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
2101
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
2199
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
21022200 * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
21032201 * pmd.
21042202 */
2105
- set_pmd_at(mm, mmun_start, pmd, entry);
2203
+ set_pmd_at(mm, start, pmd, entry);
21062204 update_mmu_cache_pmd(vma, address, &entry);
21072205
21082206 page_ref_unfreeze(page, 2);
....@@ -2111,11 +2209,6 @@
21112209 set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
21122210
21132211 spin_unlock(ptl);
2114
- /*
2115
- * No need to double call mmu_notifier->invalidate_range() callback as
2116
- * the above pmdp_huge_clear_flush_notify() did already call it.
2117
- */
2118
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
21192212
21202213 /* Take an "isolate" reference and put new page on the LRU. */
21212214 get_page(new_page);
....@@ -2139,7 +2232,7 @@
21392232 ptl = pmd_lock(mm, pmd);
21402233 if (pmd_same(*pmd, entry)) {
21412234 entry = pmd_modify(entry, vma->vm_page_prot);
2142
- set_pmd_at(mm, mmun_start, pmd, entry);
2235
+ set_pmd_at(mm, start, pmd, entry);
21432236 update_mmu_cache_pmd(vma, address, &entry);
21442237 }
21452238 spin_unlock(ptl);
....@@ -2153,25 +2246,26 @@
21532246
21542247 #endif /* CONFIG_NUMA */
21552248
2156
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
2157
-struct migrate_vma {
2158
- struct vm_area_struct *vma;
2159
- unsigned long *dst;
2160
- unsigned long *src;
2161
- unsigned long cpages;
2162
- unsigned long npages;
2163
- unsigned long start;
2164
- unsigned long end;
2165
-};
2166
-
2249
+#ifdef CONFIG_DEVICE_PRIVATE
21672250 static int migrate_vma_collect_hole(unsigned long start,
21682251 unsigned long end,
2252
+ __always_unused int depth,
21692253 struct mm_walk *walk)
21702254 {
21712255 struct migrate_vma *migrate = walk->private;
21722256 unsigned long addr;
21732257
2174
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2258
+ /* Only allow populating anonymous memory. */
2259
+ if (!vma_is_anonymous(walk->vma)) {
2260
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
2261
+ migrate->src[migrate->npages] = 0;
2262
+ migrate->dst[migrate->npages] = 0;
2263
+ migrate->npages++;
2264
+ }
2265
+ return 0;
2266
+ }
2267
+
2268
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21752269 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
21762270 migrate->dst[migrate->npages] = 0;
21772271 migrate->npages++;
....@@ -2188,7 +2282,7 @@
21882282 struct migrate_vma *migrate = walk->private;
21892283 unsigned long addr;
21902284
2191
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2285
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
21922286 migrate->dst[migrate->npages] = 0;
21932287 migrate->src[migrate->npages++] = 0;
21942288 }
....@@ -2210,7 +2304,7 @@
22102304
22112305 again:
22122306 if (pmd_none(*pmdp))
2213
- return migrate_vma_collect_hole(start, end, walk);
2307
+ return migrate_vma_collect_hole(start, end, -1, walk);
22142308
22152309 if (pmd_trans_huge(*pmdp)) {
22162310 struct page *page;
....@@ -2243,7 +2337,7 @@
22432337 return migrate_vma_collect_skip(start, end,
22442338 walk);
22452339 if (pmd_none(*pmdp))
2246
- return migrate_vma_collect_hole(start, end,
2340
+ return migrate_vma_collect_hole(start, end, -1,
22472341 walk);
22482342 }
22492343 }
....@@ -2255,24 +2349,22 @@
22552349 arch_enter_lazy_mmu_mode();
22562350
22572351 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2258
- unsigned long mpfn, pfn;
2352
+ unsigned long mpfn = 0, pfn;
22592353 struct page *page;
22602354 swp_entry_t entry;
22612355 pte_t pte;
22622356
22632357 pte = *ptep;
2264
- pfn = pte_pfn(pte);
22652358
22662359 if (pte_none(pte)) {
2267
- mpfn = MIGRATE_PFN_MIGRATE;
2268
- migrate->cpages++;
2269
- pfn = 0;
2360
+ if (vma_is_anonymous(vma)) {
2361
+ mpfn = MIGRATE_PFN_MIGRATE;
2362
+ migrate->cpages++;
2363
+ }
22702364 goto next;
22712365 }
22722366
22732367 if (!pte_present(pte)) {
2274
- mpfn = pfn = 0;
2275
-
22762368 /*
22772369 * Only care about unaddressable device page special
22782370 * page table entry. Other special swap entries are not
....@@ -2283,28 +2375,34 @@
22832375 goto next;
22842376
22852377 page = device_private_entry_to_page(entry);
2286
- mpfn = migrate_pfn(page_to_pfn(page))|
2287
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2378
+ if (!(migrate->flags &
2379
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2380
+ page->pgmap->owner != migrate->pgmap_owner)
2381
+ goto next;
2382
+
2383
+ mpfn = migrate_pfn(page_to_pfn(page)) |
2384
+ MIGRATE_PFN_MIGRATE;
22882385 if (is_write_device_private_entry(entry))
22892386 mpfn |= MIGRATE_PFN_WRITE;
22902387 } else {
2388
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
2389
+ goto next;
2390
+ pfn = pte_pfn(pte);
22912391 if (is_zero_pfn(pfn)) {
22922392 mpfn = MIGRATE_PFN_MIGRATE;
22932393 migrate->cpages++;
2294
- pfn = 0;
22952394 goto next;
22962395 }
2297
- page = _vm_normal_page(migrate->vma, addr, pte, true);
2396
+ page = vm_normal_page(migrate->vma, addr, pte);
22982397 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
22992398 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
23002399 }
23012400
23022401 /* FIXME support THP */
23032402 if (!page || !page->mapping || PageTransCompound(page)) {
2304
- mpfn = pfn = 0;
2403
+ mpfn = 0;
23052404 goto next;
23062405 }
2307
- pfn = page_to_pfn(page);
23082406
23092407 /*
23102408 * By getting a reference on the page we pin it and that blocks
....@@ -2333,8 +2431,17 @@
23332431 entry = make_migration_entry(page, mpfn &
23342432 MIGRATE_PFN_WRITE);
23352433 swp_pte = swp_entry_to_pte(entry);
2336
- if (pte_soft_dirty(pte))
2337
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
2434
+ if (pte_present(pte)) {
2435
+ if (pte_soft_dirty(pte))
2436
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2437
+ if (pte_uffd_wp(pte))
2438
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2439
+ } else {
2440
+ if (pte_swp_soft_dirty(pte))
2441
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
2442
+ if (pte_swp_uffd_wp(pte))
2443
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
2444
+ }
23382445 set_pte_at(mm, addr, ptep, swp_pte);
23392446
23402447 /*
....@@ -2353,15 +2460,21 @@
23532460 migrate->dst[migrate->npages] = 0;
23542461 migrate->src[migrate->npages++] = mpfn;
23552462 }
2356
- arch_leave_lazy_mmu_mode();
2357
- pte_unmap_unlock(ptep - 1, ptl);
23582463
23592464 /* Only flush the TLB if we actually modified any entries */
23602465 if (unmapped)
23612466 flush_tlb_range(walk->vma, start, end);
23622467
2468
+ arch_leave_lazy_mmu_mode();
2469
+ pte_unmap_unlock(ptep - 1, ptl);
2470
+
23632471 return 0;
23642472 }
2473
+
2474
+static const struct mm_walk_ops migrate_vma_walk_ops = {
2475
+ .pmd_entry = migrate_vma_collect_pmd,
2476
+ .pte_hole = migrate_vma_collect_hole,
2477
+};
23652478
23662479 /*
23672480 * migrate_vma_collect() - collect pages over a range of virtual addresses
....@@ -2373,22 +2486,22 @@
23732486 */
23742487 static void migrate_vma_collect(struct migrate_vma *migrate)
23752488 {
2376
- struct mm_walk mm_walk = {
2377
- .pmd_entry = migrate_vma_collect_pmd,
2378
- .pte_hole = migrate_vma_collect_hole,
2379
- .vma = migrate->vma,
2380
- .mm = migrate->vma->vm_mm,
2381
- .private = migrate,
2382
- };
2489
+ struct mmu_notifier_range range;
23832490
2384
- mmu_notifier_invalidate_range_start(mm_walk.mm,
2385
- migrate->start,
2386
- migrate->end);
2387
- walk_page_range(migrate->start, migrate->end, &mm_walk);
2388
- mmu_notifier_invalidate_range_end(mm_walk.mm,
2389
- migrate->start,
2390
- migrate->end);
2491
+ /*
2492
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
2493
+ * that the registered device driver can skip invalidating device
2494
+ * private page mappings that won't be migrated.
2495
+ */
2496
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
2497
+ migrate->vma->vm_mm, migrate->start, migrate->end,
2498
+ migrate->pgmap_owner);
2499
+ mmu_notifier_invalidate_range_start(&range);
23912500
2501
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2502
+ &migrate_vma_walk_ops, migrate);
2503
+
2504
+ mmu_notifier_invalidate_range_end(&range);
23922505 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23932506 }
23942507
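This hunk is what the new #include <linux/pagewalk.h> at the top of the file is for: the page-table walk moves from a per-call struct mm_walk (which embedded .mm, .vma and .private) to a const struct mm_walk_ops table plus explicit arguments, the .pte_hole callback grows a depth parameter, and the mmu_notifier start/end calls now use an mmu_notifier_range carrying the pgmap owner so device drivers can skip invalidations for pages that will not migrate. The new calling convention, reduced to a sketch:

    static const struct mm_walk_ops migrate_vma_walk_ops = {
            .pmd_entry = migrate_vma_collect_pmd,
            .pte_hole  = migrate_vma_collect_hole,
    };

    /* mm, address range, ops table, then a private cookie for the callbacks */
    walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
                    &migrate_vma_walk_ops, migrate);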
....@@ -2432,16 +2545,7 @@
24322545 * FIXME proper solution is to rework migration_entry_wait() so
24332546 * it does not need to take a reference on page.
24342547 */
2435
- if (is_device_private_page(page))
2436
- return true;
2437
-
2438
- /*
2439
- * Only allow device public page to be migrated and account for
2440
- * the extra reference count imply by ZONE_DEVICE pages.
2441
- */
2442
- if (!is_device_public_page(page))
2443
- return false;
2444
- extra++;
2548
+ return is_device_private_page(page);
24452549 }
24462550
24472551 /* For file back page */
....@@ -2575,7 +2679,7 @@
25752679 */
25762680 static void migrate_vma_unmap(struct migrate_vma *migrate)
25772681 {
2578
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2682
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
25792683 const unsigned long npages = migrate->npages;
25802684 const unsigned long start = migrate->start;
25812685 unsigned long addr, i, restore = 0;
....@@ -2620,6 +2724,118 @@
26202724 }
26212725 }
26222726
2727
+/**
2728
+ * migrate_vma_setup() - prepare to migrate a range of memory
2729
+ * @args: contains the vma, start, and pfns arrays for the migration
2730
+ *
2731
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2732
+ * without an error.
2733
+ *
2734
+ * Prepare to migrate a range of memory virtual address range by collecting all
2735
+ * the pages backing each virtual address in the range, saving them inside the
2736
+ * src array. Then lock those pages and unmap them. Once the pages are locked
2737
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
2738
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2739
+ * corresponding src array entry. Then restores any pages that are pinned, by
2740
+ * remapping and unlocking those pages.
2741
+ *
2742
+ * The caller should then allocate destination memory and copy source memory to
2743
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2744
+ * flag set). Once these are allocated and copied, the caller must update each
2745
+ * corresponding entry in the dst array with the pfn value of the destination
2746
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2747
+ * (destination pages must have their struct pages locked, via lock_page()).
2748
+ *
2749
+ * Note that the caller does not have to migrate all the pages that are marked
2750
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2751
+ * device memory to system memory. If the caller cannot migrate a device page
2752
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2753
+ * consequences for the userspace process, so it must be avoided if at all
2754
+ * possible.
2755
+ *
2756
+ * For empty entries inside the CPU page table (pte_none() or pmd_none() is
2757
+ * true) we do set the MIGRATE_PFN_MIGRATE flag inside the corresponding source
2758
+ * array, thus allowing the caller to allocate device memory for those unbacked
2759
+ * virtual addresses. For this the caller simply has to allocate device memory
2760
+ * and properly set the destination entry like for regular migration. Note that
2761
+ * this can still fail, and thus the device driver must check whether the
2762
+ * migration was successful for those entries after calling migrate_vma_pages(),
2763
+ * just like for regular migration.
2764
+ *
2765
+ * After that, the caller must call migrate_vma_pages() to go over each entry
2766
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2767
+ * flags set. If the corresponding entry in the dst array has the
2768
+ * MIGRATE_PFN_VALID flag set, then migrate_vma_pages() migrates the struct
2769
+ * page information from the source struct page to the destination struct page.
2770
+ * If it fails to migrate the struct page information, then it clears the
2771
+ * MIGRATE_PFN_MIGRATE flag in the src array.
2772
+ *
2773
+ * At this point all successfully migrated pages have an entry in the src
2774
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2775
+ * array entry with MIGRATE_PFN_VALID flag set.
2776
+ *
2777
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
2778
+ * successfully migrated, and which were not. Successfully migrated pages will
2779
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2780
+ *
2781
+ * It is safe to update the device page table after migrate_vma_pages() because
2782
+ * both the destination and source pages are still locked, and the mmap_lock is
2783
+ * held in read mode (hence no one can unmap the range being migrated).
2784
+ *
2785
+ * Once the caller is done cleaning up and updating its page table (if it
2786
+ * chose to do so; this is not an obligation), it finally calls
2787
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
2788
+ * for successfully migrated pages or otherwise restore the CPU page table to
2789
+ * point to the original source pages.
2790
+ */
2791
+int migrate_vma_setup(struct migrate_vma *args)
2792
+{
2793
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2794
+
2795
+ args->start &= PAGE_MASK;
2796
+ args->end &= PAGE_MASK;
2797
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2798
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2799
+ return -EINVAL;
2800
+ if (nr_pages <= 0)
2801
+ return -EINVAL;
2802
+ if (args->start < args->vma->vm_start ||
2803
+ args->start >= args->vma->vm_end)
2804
+ return -EINVAL;
2805
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2806
+ return -EINVAL;
2807
+ if (!args->src || !args->dst)
2808
+ return -EINVAL;
2809
+
2810
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
2811
+ args->cpages = 0;
2812
+ args->npages = 0;
2813
+
2814
+ migrate_vma_collect(args);
2815
+
2816
+ if (args->cpages)
2817
+ migrate_vma_prepare(args);
2818
+ if (args->cpages)
2819
+ migrate_vma_unmap(args);
2820
+
2821
+ /*
2822
+ * At this point pages are locked and unmapped, and thus they have
2823
+ * stable content and can safely be copied to destination memory that
2824
+ * is allocated by the drivers.
2825
+ */
2826
+ return 0;
2827
+
2828
+}
2829
+EXPORT_SYMBOL(migrate_vma_setup);
2830
+
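The documentation above describes the three-step contract (setup, pages, finalize) from the driver's side. The fragment below is a rough sketch of that sequence under several assumptions: the caller already holds mmap_read_lock(), dst entries are encoded with the usual migrate_pfn()/migrate_pfn_to_page() helpers from <linux/migrate.h>, and all dummy_* names are hypothetical driver code. Depending on the kernel version, struct migrate_vma may carry additional fields (for example a source-selection flags field) that a real driver must also initialize.

/*
 * Sketch only: a driver migrating [start, end) of a VMA into its device
 * memory with migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize().
 * The dummy_* helpers stand in for driver-specific allocation and DMA copy.
 */
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>

struct dummy_device;	/* hypothetical driver state */
static struct page *dummy_alloc_device_page(struct dummy_device *ddev);
static void dummy_copy_to_device(struct dummy_device *ddev,
				 struct page *dpage, struct page *spage);
static void dummy_clear_device_page(struct dummy_device *ddev,
				    struct page *dpage);

static int dummy_migrate_range_to_device(struct dummy_device *ddev,
					 struct vm_area_struct *vma,
					 unsigned long start,
					 unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {};
	unsigned long *src, *dst;
	unsigned long i;
	int ret = -ENOMEM;

	/* One src and one dst entry per page in the range. */
	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out_free;

	args.vma = vma;
	args.start = start;
	args.end = end;
	args.src = src;
	args.dst = dst;
	args.pgmap_owner = ddev;	/* lets the driver's MMU notifier skip this invalidation */

	/* Collect, lock and unmap the source pages (mmap_read_lock() held by caller). */
	ret = migrate_vma_setup(&args);
	if (ret)
		goto out_free;

	for (i = 0; i < args.npages; i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = dummy_alloc_device_page(ddev);
		if (!dpage)
			continue;	/* this entry simply is not migrated */
		lock_page(dpage);	/* dst pages must be locked, per the doc above */

		/* spage is NULL for pte_none()/zero-page entries; clear the destination instead. */
		if (spage)
			dummy_copy_to_device(ddev, dpage, spage);
		else
			dummy_clear_device_page(ddev, dpage);

		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}

	migrate_vma_pages(&args);
	/* A real driver would update its device page table here, checking src[i]. */
	migrate_vma_finalize(&args);
	ret = 0;

out_free:
	kfree(dst);
	kfree(src);
	return ret;
}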
2831
+/*
2832
+ * This code closely matches the code in:
2833
+ * __handle_mm_fault()
2834
+ * handle_pte_fault()
2835
+ * do_anonymous_page()
2836
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2837
+ * private page.
2838
+ */
26232839 static void migrate_vma_insert_page(struct migrate_vma *migrate,
26242840 unsigned long addr,
26252841 struct page *page,
....@@ -2628,7 +2844,6 @@
26282844 {
26292845 struct vm_area_struct *vma = migrate->vma;
26302846 struct mm_struct *mm = vma->vm_mm;
2631
- struct mem_cgroup *memcg;
26322847 bool flush = false;
26332848 spinlock_t *ptl;
26342849 pte_t entry;
....@@ -2661,12 +2876,12 @@
26612876 * pte_offset_map() on pmds where a huge pmd might be created
26622877 * from a different thread.
26632878 *
2664
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2879
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
26652880 * parallel threads are excluded by other means.
26662881 *
2667
- * Here we only have down_read(mmap_sem).
2882
+ * Here we only have mmap_read_lock(mm).
26682883 */
2669
- if (pte_alloc(mm, pmdp, addr))
2884
+ if (pte_alloc(mm, pmdp))
26702885 goto abort;
26712886
26722887 /* See the comment in pte_alloc_one_map() */
....@@ -2675,7 +2890,7 @@
26752890
26762891 if (unlikely(anon_vma_prepare(vma)))
26772892 goto abort;
2678
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2893
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
26792894 goto abort;
26802895
26812896 /*
....@@ -2691,11 +2906,13 @@
26912906
26922907 swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
26932908 entry = swp_entry_to_pte(swp_entry);
2694
- } else if (is_device_public_page(page)) {
2695
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2696
- if (vma->vm_flags & VM_WRITE)
2697
- entry = pte_mkwrite(pte_mkdirty(entry));
2698
- entry = pte_mkdevmap(entry);
2909
+ } else {
2910
+ /*
2911
+ * For now we only support migrating to un-addressable
2912
+ * device memory.
2913
+ */
2914
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2915
+ goto abort;
26992916 }
27002917 } else {
27012918 entry = mk_pte(page, vma->vm_page_prot);
....@@ -2705,36 +2922,29 @@
27052922
27062923 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
27072924
2925
+ if (check_stable_address_space(mm))
2926
+ goto unlock_abort;
2927
+
27082928 if (pte_present(*ptep)) {
27092929 unsigned long pfn = pte_pfn(*ptep);
27102930
2711
- if (!is_zero_pfn(pfn)) {
2712
- pte_unmap_unlock(ptep, ptl);
2713
- mem_cgroup_cancel_charge(page, memcg, false);
2714
- goto abort;
2715
- }
2931
+ if (!is_zero_pfn(pfn))
2932
+ goto unlock_abort;
27162933 flush = true;
2717
- } else if (!pte_none(*ptep)) {
2718
- pte_unmap_unlock(ptep, ptl);
2719
- mem_cgroup_cancel_charge(page, memcg, false);
2720
- goto abort;
2721
- }
2934
+ } else if (!pte_none(*ptep))
2935
+ goto unlock_abort;
27222936
27232937 /*
2724
- * Check for usefaultfd but do not deliver the fault. Instead,
2938
+ * Check for userfaultfd but do not deliver the fault. Instead,
27252939 * just back off.
27262940 */
2727
- if (userfaultfd_missing(vma)) {
2728
- pte_unmap_unlock(ptep, ptl);
2729
- mem_cgroup_cancel_charge(page, memcg, false);
2730
- goto abort;
2731
- }
2941
+ if (userfaultfd_missing(vma))
2942
+ goto unlock_abort;
27322943
27332944 inc_mm_counter(mm, MM_ANONPAGES);
27342945 page_add_new_anon_rmap(page, vma, addr, false);
2735
- mem_cgroup_commit_charge(page, memcg, false, false);
27362946 if (!is_zone_device_page(page))
2737
- lru_cache_add_active_or_unevictable(page, vma);
2947
+ lru_cache_add_inactive_or_unevictable(page, vma);
27382948 get_page(page);
27392949
27402950 if (flush) {
....@@ -2752,11 +2962,13 @@
27522962 *src = MIGRATE_PFN_MIGRATE;
27532963 return;
27542964
2965
+unlock_abort:
2966
+ pte_unmap_unlock(ptep, ptl);
27552967 abort:
27562968 *src &= ~MIGRATE_PFN_MIGRATE;
27572969 }
27582970
2759
-/*
2971
+/**
27602972 * migrate_vma_pages() - migrate meta-data from src page to dst page
27612973 * @migrate: migrate struct containing all migration information
27622974 *
....@@ -2764,13 +2976,12 @@
27642976 * struct page. This effectively finishes the migration from source page to the
27652977 * destination page.
27662978 */
2767
-static void migrate_vma_pages(struct migrate_vma *migrate)
2979
+void migrate_vma_pages(struct migrate_vma *migrate)
27682980 {
27692981 const unsigned long npages = migrate->npages;
27702982 const unsigned long start = migrate->start;
2771
- struct vm_area_struct *vma = migrate->vma;
2772
- struct mm_struct *mm = vma->vm_mm;
2773
- unsigned long addr, i, mmu_start;
2983
+ struct mmu_notifier_range range;
2984
+ unsigned long addr, i;
27742985 bool notified = false;
27752986
27762987 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
....@@ -2785,15 +2996,17 @@
27852996 }
27862997
27872998 if (!page) {
2788
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
2999
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
27893000 continue;
2790
- }
27913001 if (!notified) {
2792
- mmu_start = addr;
27933002 notified = true;
2794
- mmu_notifier_invalidate_range_start(mm,
2795
- mmu_start,
2796
- migrate->end);
3003
+
3004
+ mmu_notifier_range_init(&range,
3005
+ MMU_NOTIFY_CLEAR, 0,
3006
+ NULL,
3007
+ migrate->vma->vm_mm,
3008
+ addr, migrate->end);
3009
+ mmu_notifier_invalidate_range_start(&range);
27973010 }
27983011 migrate_vma_insert_page(migrate, addr, newpage,
27993012 &migrate->src[i],
....@@ -2813,7 +3026,7 @@
28133026 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
28143027 continue;
28153028 }
2816
- } else if (!is_device_public_page(newpage)) {
3029
+ } else {
28173030 /*
28183031 * Other types of ZONE_DEVICE page are not
28193032 * supported.
....@@ -2834,11 +3047,11 @@
28343047 * did already call it.
28353048 */
28363049 if (notified)
2837
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
2838
- migrate->end);
3050
+ mmu_notifier_invalidate_range_only_end(&range);
28393051 }
3052
+EXPORT_SYMBOL(migrate_vma_pages);
28403053
2841
-/*
3054
+/**
28423055 * migrate_vma_finalize() - restore CPU page table entry
28433056 * @migrate: migrate struct containing all migration information
28443057 *
....@@ -2849,7 +3062,7 @@
28493062 * This also unlocks the pages and puts them back on the lru, or drops the extra
28503063 * refcount, for device pages.
28513064 */
2852
-static void migrate_vma_finalize(struct migrate_vma *migrate)
3065
+void migrate_vma_finalize(struct migrate_vma *migrate)
28533066 {
28543067 const unsigned long npages = migrate->npages;
28553068 unsigned long i;
....@@ -2876,7 +3089,6 @@
28763089
28773090 remove_migration_ptes(page, newpage, false);
28783091 unlock_page(page);
2879
- migrate->cpages--;
28803092
28813093 if (is_zone_device_page(page))
28823094 put_page(page);
....@@ -2892,124 +3104,5 @@
28923104 }
28933105 }
28943106 }
2895
-
2896
-/*
2897
- * migrate_vma() - migrate a range of memory inside vma
2898
- *
2899
- * @ops: migration callback for allocating destination memory and copying
2900
- * @vma: virtual memory area containing the range to be migrated
2901
- * @start: start address of the range to migrate (inclusive)
2902
- * @end: end address of the range to migrate (exclusive)
2903
- * @src: array of hmm_pfn_t containing source pfns
2904
- * @dst: array of hmm_pfn_t containing destination pfns
2905
- * @private: pointer passed back to each of the callback
2906
- * Returns: 0 on success, error code otherwise
2907
- *
2908
- * This function tries to migrate a range of memory virtual address range, using
2909
- * callbacks to allocate and copy memory from source to destination. First it
2910
- * collects all the pages backing each virtual address in the range, saving this
2911
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
2912
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
2913
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2914
- * in the corresponding src array entry. It then restores any pages that are
2915
- * pinned, by remapping and unlocking those pages.
2916
- *
2917
- * At this point it calls the alloc_and_copy() callback. For documentation on
2918
- * what is expected from that callback, see struct migrate_vma_ops comments in
2919
- * include/linux/migrate.h
2920
- *
2921
- * After the alloc_and_copy() callback, this function goes over each entry in
2922
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2923
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2924
- * then the function tries to migrate struct page information from the source
2925
- * struct page to the destination struct page. If it fails to migrate the struct
2926
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2927
- * array.
2928
- *
2929
- * At this point all successfully migrated pages have an entry in the src
2930
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2931
- * array entry with MIGRATE_PFN_VALID flag set.
2932
- *
2933
- * It then calls the finalize_and_map() callback. See comments for "struct
2934
- * migrate_vma_ops", in include/linux/migrate.h for details about
2935
- * finalize_and_map() behavior.
2936
- *
2937
- * After the finalize_and_map() callback, for successfully migrated pages, this
2938
- * function updates the CPU page table to point to new pages, otherwise it
2939
- * restores the CPU page table to point to the original source pages.
2940
- *
2941
- * Function returns 0 after the above steps, even if no pages were migrated
2942
- * (The function only returns an error if any of the arguments are invalid.)
2943
- *
2944
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2945
- * unsigned long entries.
2946
- */
2947
-int migrate_vma(const struct migrate_vma_ops *ops,
2948
- struct vm_area_struct *vma,
2949
- unsigned long start,
2950
- unsigned long end,
2951
- unsigned long *src,
2952
- unsigned long *dst,
2953
- void *private)
2954
-{
2955
- struct migrate_vma migrate;
2956
-
2957
- /* Sanity check the arguments */
2958
- start &= PAGE_MASK;
2959
- end &= PAGE_MASK;
2960
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
2961
- vma_is_dax(vma))
2962
- return -EINVAL;
2963
- if (start < vma->vm_start || start >= vma->vm_end)
2964
- return -EINVAL;
2965
- if (end <= vma->vm_start || end > vma->vm_end)
2966
- return -EINVAL;
2967
- if (!ops || !src || !dst || start >= end)
2968
- return -EINVAL;
2969
-
2970
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2971
- migrate.src = src;
2972
- migrate.dst = dst;
2973
- migrate.start = start;
2974
- migrate.npages = 0;
2975
- migrate.cpages = 0;
2976
- migrate.end = end;
2977
- migrate.vma = vma;
2978
-
2979
- /* Collect, and try to unmap source pages */
2980
- migrate_vma_collect(&migrate);
2981
- if (!migrate.cpages)
2982
- return 0;
2983
-
2984
- /* Lock and isolate page */
2985
- migrate_vma_prepare(&migrate);
2986
- if (!migrate.cpages)
2987
- return 0;
2988
-
2989
- /* Unmap pages */
2990
- migrate_vma_unmap(&migrate);
2991
- if (!migrate.cpages)
2992
- return 0;
2993
-
2994
- /*
2995
- * At this point pages are locked and unmapped, and thus they have
2996
- * stable content and can safely be copied to destination memory that
2997
- * is allocated by the callback.
2998
- *
2999
- * Note that migration can fail in migrate_vma_struct_page() for each
3000
- * individual page.
3001
- */
3002
- ops->alloc_and_copy(vma, src, dst, start, end, private);
3003
-
3004
- /* This does the real migration of struct page */
3005
- migrate_vma_pages(&migrate);
3006
-
3007
- ops->finalize_and_map(vma, src, dst, start, end, private);
3008
-
3009
- /* Unlock and remap pages */
3010
- migrate_vma_finalize(&migrate);
3011
-
3012
- return 0;
3013
-}
3014
-EXPORT_SYMBOL(migrate_vma);
3015
-#endif /* defined(MIGRATE_VMA_HELPER) */
3107
+EXPORT_SYMBOL(migrate_vma_finalize);
3108
+#endif /* CONFIG_DEVICE_PRIVATE */