2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
@@ -68,7 +68,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
@@ -126,6 +126,32 @@
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @node is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (node == NUMA_NO_NODE || node_online(node))
+		return node;
+
+	min_node = node;
+	for_each_online_node(n) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
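/*
 * A minimal usage sketch (not part of the diff above): the new helper is
 * useful when a caller holds a possibly-offline node id, e.g. one taken
 * from a device's firmware-reported proximity domain. The caller below is
 * hypothetical, and it assumes the declaration is reachable via
 * <linux/numa.h>; NUMA_NO_NODE and already-online nodes pass through
 * unchanged, anything else maps to the nearest online node by
 * node_distance().
 */
#include <linux/numa.h>
#include <linux/slab.h>

static void *alloc_near_node(int maybe_offline_nid, size_t size)
{
	int nid = numa_map_to_online_node(maybe_offline_nid);

	return kmalloc_node(size, GFP_KERNEL, nid);
}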
@@ -198,7 +224,7 @@
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,13 +368,13 @@
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -372,17 +398,20 @@
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
  */
 
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
 
-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	mmap_write_lock(mm);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		vm_write_begin(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
-	up_write(&mm->mmap_sem);
+		vm_write_end(vma);
+	}
+	mmap_write_unlock(mm);
 }
 
 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
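/*
 * Throughout this patch, raw rwsem calls on mm->mmap_sem are replaced by
 * the mmap_lock wrapper API. A sketch of the correspondence (only the
 * wrappers are real; the helper below is hypothetical):
 *
 *	down_read(&mm->mmap_sem)   ->  mmap_read_lock(mm)
 *	up_read(&mm->mmap_sem)     ->  mmap_read_unlock(mm)
 *	down_write(&mm->mmap_sem)  ->  mmap_write_lock(mm)
 *	up_write(&mm->mmap_sem)    ->  mmap_write_unlock(mm)
 */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static void for_each_vma_locked(struct mm_struct *mm,
				void (*fn)(struct vm_area_struct *vma))
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);		/* was: down_read(&mm->mmap_sem) */
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		fn(vma);
	mmap_read_unlock(mm);		/* was: up_read(&mm->mmap_sem) */
}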
@@ -410,7 +439,9 @@
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
 };
 
 /*
416447 /*
....@@ -440,6 +471,7 @@
440471 */
441472 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
442473 unsigned long end, struct mm_walk *walk)
474
+ __releases(ptl)
443475 {
444476 int ret = 0;
445477 struct page *page;
@@ -555,9 +587,10 @@
 			       unsigned long addr, unsigned long end,
 			       struct mm_walk *walk)
 {
+	int ret = 0;
 #ifdef CONFIG_HUGETLB_PAGE
 	struct queue_pages *qp = walk->private;
-	unsigned long flags = qp->flags;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -569,16 +602,44 @@
 	page = pte_page(entry);
 	if (!queue_pages_required(page, qp))
 		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced page and no
+		 * need to further check other vma.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() have
+		 * stopped walking current vma.
+		 * Detecting misplaced page but allow migrating pages which
+		 * have been queued.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
-	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, qp->pagelist);
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+		if (!isolate_huge_page(page, qp->pagelist) &&
+			(flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate page but allow migrating pages
+			 * which have been queued.
+			 */
+			ret = 1;
+	}
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-	return 0;
+	return ret;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +657,7 @@
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
@@ -618,6 +679,22 @@
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	/* range check first */
+	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ -628,22 +705,10 @@
 
 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (!is_vm_hugetlb_page(vma) &&
-			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 			!(vma->vm_flags & VM_MIXEDMAP))
 			change_prot_numa(vma, start, endvma);
 		return 1;
@@ -654,6 +719,12 @@
 		return 0;
 	return 1;
 }
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry		= queue_pages_hugetlb,
+	.pmd_entry		= queue_pages_pte_range,
+	.test_walk		= queue_pages_test_walk,
+};
 
 /*
  * Walk through page tables and collect pages to be migrated.
@@ -675,26 +746,28 @@
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
-	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
+		.start = start,
+		.end = end,
+		.first = NULL,
 	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
 }
 
 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
 						struct mempolicy *pol)
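/*
 * A minimal sketch (not from this patch) of the pagewalk interface that
 * queue_pages_range() is converted to above: callbacks now live in a
 * const struct mm_walk_ops, walk_page_range() takes the mm and the private
 * pointer directly, and the caller must hold the mmap_lock. The counting
 * callback and helper below are hypothetical.
 */
#include <linux/pagewalk.h>
#include <linux/mm.h>

static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	(*nr)++;		/* called for each populated PMD in the range */
	return 0;		/* 0 continues the walk, a negative value aborts it */
}

static const struct mm_walk_ops count_walk_ops = {
	.pmd_entry	= count_pmd_entry,
};

static unsigned long count_pmds_in_range(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long nr = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &count_walk_ops, &nr);
	mmap_read_unlock(mm);

	return nr;
}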
@@ -712,6 +785,7 @@
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	vm_write_begin(vma);
 	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
 		if (err)
@@ -719,11 +793,17 @@
 	}
 
 	old = vma->vm_policy;
-	vma->vm_policy = new; /* protected by mmap_sem */
+	/*
+	 * The speculative page fault handler accesses this field without
+	 * holding the mmap_sem.
+	 */
+	WRITE_ONCE(vma->vm_policy, new);
+	vm_write_end(vma);
 	mpol_put(old);
 
 	return 0;
 err_out:
+	vm_write_end(vma);
 	mpol_put(new);
 	return err;
 }
@@ -732,7 +812,6 @@
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
@@ -741,15 +820,13 @@
 	unsigned long vmend;
 
 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);
 
 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
+	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 		vmstart = max(start, vma->vm_start);
 		vmend = min(end, vma->vm_end);
 
@@ -764,10 +841,6 @@
 				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
-			next = vma->vm_next;
-			if (mpol_equal(vma_policy(vma), new_pol))
-				continue;
-			/* vma_merge() joined vma && vma->next, case 8 */
 			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
@@ -807,13 +880,12 @@
 		goto out;
 	}
 
-	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
 		mpol_put(new);
 		goto out;
 	}
+	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && new->mode == MPOL_INTERLEAVE)
@@ -839,7 +911,6 @@
 
 	switch (p->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -853,16 +924,19 @@
 	}
 }
 
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
+	struct page *p = NULL;
 	int err;
 
-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
-	if (err >= 0) {
+	int locked = 1;
+	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+	if (err > 0) {
 		err = page_to_nid(p);
 		put_page(p);
 	}
+	if (locked)
+		mmap_read_unlock(mm);
 	return err;
 }
 
@@ -873,7 +947,7 @@
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 
 	if (flags &
 	    ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -895,10 +969,10 @@
 		 * vma/shared policy at addr is NULL. We
 		 * want to return MPOL_DEFAULT in this case.
 		 */
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
-			up_read(&mm->mmap_sem);
+			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -913,7 +987,16 @@
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(addr);
+			/*
+			 * Take a refcount on the mpol, lookup_node()
+			 * will drop the mmap_lock, so after calling
+			 * lookup_node() only "pol" remains valid, "vma"
+			 * is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
 			*policy = err;
@@ -948,7 +1031,9 @@
  out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }
 
@@ -967,8 +1052,8 @@
 	if (!isolate_lru_page(head)) {
 		list_add_tail(&head->lru, pagelist);
 		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
 	} else if (flags & MPOL_MF_STRICT) {
 		/*
 		 * Non-movable page may reach here. And, there may be
@@ -984,27 +1069,6 @@
 	return 0;
 }
 
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page))
-		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					node);
-	else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-						    __GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -1015,6 +1079,10 @@
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ -1029,8 +1097,8 @@
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-					MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
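/*
 * Sketch of the allocation-callback contract the two hunks above move to
 * (the wrapper function itself is hypothetical): instead of a file-local
 * callback, the generic alloc_migration_target() is used, and the target
 * node plus gfp flags travel through the "private" argument as a pointer
 * to struct migration_target_control cast to unsigned long. Note that the
 * control structure is mm-internal (mm/internal.h in trees that carry it),
 * so a helper like this would live inside mm/.
 */
#include <linux/migrate.h>
#include <linux/gfp.h>
#include "internal.h"		/* struct migration_target_control */

static int migrate_list_to_node(struct list_head *pagelist, int nid)
{
	struct migration_target_control mtc = {
		.nid = nid,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	/* Pages that could not be migrated stay on the list for the caller. */
	return migrate_pages(pagelist, alloc_migration_target, NULL,
			     (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
}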
@@ -1048,14 +1116,12 @@
 			   const nodemask_t *to, int flags)
 {
 	int busy = 0;
-	int err;
+	int err = 0;
 	nodemask_t tmp;
 
-	err = migrate_prep();
-	if (err)
-		return err;
+	lru_cache_disable();
 
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1136,7 +1202,9 @@
 		if (err < 0)
 			break;
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1153,7 +1221,7 @@
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;
 
 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ -1252,19 +1320,15 @@
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
 			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
13011365 putback_movable_pages(&pagelist);
13021366 }
13031367
1304
- up_write(&mm->mmap_sem);
1368
+ mmap_write_unlock(mm);
13051369 mpol_out:
13061370 mpol_put(new);
1371
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1372
+ lru_cache_enable();
13071373 return err;
13081374 }
13091375
....@@ -1505,10 +1571,6 @@
15051571 if (nodes_empty(*new))
15061572 goto out_put;
15071573
1508
- nodes_and(*new, *new, node_states[N_MEMORY]);
1509
- if (nodes_empty(*new))
1510
- goto out_put;
1511
-
15121574 err = security_task_movememory(task);
15131575 if (err)
15141576 goto out_put;
....@@ -1552,13 +1614,13 @@
15521614 unsigned long flags)
15531615 {
15541616 int err;
1555
- int uninitialized_var(pval);
1617
+ int pval;
15561618 nodemask_t nodes;
1557
-
1558
- addr = untagged_addr(addr);
15591619
15601620 if (nmask != NULL && maxnode < nr_node_ids)
15611621 return -EINVAL;
1622
+
1623
+ addr = untagged_addr(addr);
15621624
15631625 err = do_get_mempolicy(&pval, &nodes, addr, flags);
15641626
....@@ -1691,26 +1753,59 @@
16911753
16921754 #endif /* CONFIG_COMPAT */
16931755
1756
+bool vma_migratable(struct vm_area_struct *vma)
1757
+{
1758
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1759
+ return false;
1760
+
1761
+ /*
1762
+ * DAX device mappings require predictable access latency, so avoid
1763
+ * incurring periodic faults.
1764
+ */
1765
+ if (vma_is_dax(vma))
1766
+ return false;
1767
+
1768
+ if (is_vm_hugetlb_page(vma) &&
1769
+ !hugepage_migration_supported(hstate_vma(vma)))
1770
+ return false;
1771
+
1772
+ /*
1773
+ * Migration allocates pages in the highest zone. If we cannot
1774
+ * do so then migration (at least from node to node) is not
1775
+ * possible.
1776
+ */
1777
+ if (vma->vm_file &&
1778
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1779
+ < policy_zone)
1780
+ return false;
1781
+ return true;
1782
+}
1783
+
16941784 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
16951785 unsigned long addr)
16961786 {
1697
- struct mempolicy *pol = NULL;
1787
+ struct mempolicy *pol;
16981788
1699
- if (vma) {
1700
- if (vma->vm_ops && vma->vm_ops->get_policy) {
1701
- pol = vma->vm_ops->get_policy(vma, addr);
1702
- } else if (vma->vm_policy) {
1703
- pol = vma->vm_policy;
1789
+ if (!vma)
1790
+ return NULL;
17041791
1705
- /*
1706
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
1707
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1708
- * count on these policies which will be dropped by
1709
- * mpol_cond_put() later
1710
- */
1711
- if (mpol_needs_cond_ref(pol))
1712
- mpol_get(pol);
1713
- }
1792
+ if (vma->vm_ops && vma->vm_ops->get_policy)
1793
+ return vma->vm_ops->get_policy(vma, addr);
1794
+
1795
+ /*
1796
+ * This could be called without holding the mmap_sem in the
1797
+ * speculative page fault handler's path.
1798
+ */
1799
+ pol = READ_ONCE(vma->vm_policy);
1800
+ if (pol) {
1801
+ /*
1802
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
1803
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1804
+ * count on these policies which will be dropped by
1805
+ * mpol_cond_put() later
1806
+ */
1807
+ if (mpol_needs_cond_ref(pol))
1808
+ mpol_get(pol);
17141809 }
17151810
17161811 return pol;
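/*
 * The WRITE_ONCE() in vma_replace_policy() and the READ_ONCE() here form a
 * publish/consume pair for the speculative-page-fault path: the writer
 * still updates vma->vm_policy under the mmap_lock (inside the
 * vm_write_begin()/vm_write_end() window), but a lockless reader may load
 * the pointer concurrently. A generic sketch of that pattern with
 * hypothetical names, not code from this patch; ordering of the pointee's
 * initialisation and its lifetime are the writer's problem (the mempolicy
 * case relies on the vma sequence count and refcounting for that).
 */
#include <linux/compiler.h>

struct cfg { int value; };

static struct cfg *current_cfg;			/* shared pointer, never freed here */

static void publish_cfg(struct cfg *new)	/* writer holds its usual lock */
{
	WRITE_ONCE(current_cfg, new);		/* single, untorn pointer store */
}

static int read_cfg_value(void)			/* reader may hold no lock */
{
	struct cfg *c = READ_ONCE(current_cfg);	/* single, untorn pointer load */

	return c ? c->value : -1;
}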
@@ -1785,7 +1880,7 @@
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
  */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1797,8 +1892,7 @@
 }
 
 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-								int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
 		nd = policy->v.preferred_node;
@@ -1986,7 +2080,6 @@
 		break;
 
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->v.nodes;
 		break;
@@ -2081,7 +2174,7 @@
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
  * all allocations for pages that will be mapped into user space. Returns
  * NULL when no page can be allocated.
@@ -2119,43 +2212,29 @@
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED &&
-						!(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = pol->v.preferred_node;
 
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
 			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
+			 * First, try to allocate THP only on local node, but
+			 * don't reclaim unnecessarily, just compact.
 			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
+			page = __alloc_pages_node(hpage_node,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * synchronous compact or the vma has been madvised
+			 * to prefer hugepage backing, retry allowing remote
+			 * memory with both reclaim and compact as well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_nodemask(gfp, order,
+						hpage_node, nmask);
+
 			goto out;
 		}
 	}
@@ -2167,6 +2246,7 @@
 out:
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
  * alloc_pages_current - Allocate pages.
@@ -2266,7 +2346,6 @@
 
 	switch (a->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
@@ -2399,7 +2478,7 @@
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
-	int polnid = -1;
+	int polnid = NUMA_NO_NODE;
 	int ret = -1;
 
 	pol = get_vma_policy(vma, addr);
@@ -2573,6 +2652,7 @@
 		mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 		if (!mpol_new)
 			goto err_out;
+		atomic_set(&mpol_new->refcnt, 1);
 		goto restart;
 	}
 
@@ -2805,12 +2885,11 @@
 int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
-	unsigned short mode;
 	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int err = 1;
+	int err = 1, mode;
 
 	if (flags)
 		*flags++ = '\0';	/* terminate mode string */
@@ -2825,12 +2904,8 @@
 	} else
 		nodes_clear(nodes);
 
-	for (mode = 0; mode < MPOL_MAX; mode++) {
-		if (!strcmp(str, policy_modes[mode])) {
-			break;
-		}
-	}
-	if (mode >= MPOL_MAX)
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
 		goto out;
 
 	switch (mode) {
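/*
 * The last hunk replaces an open-coded scan with match_string() from
 * <linux/string.h>, which returns the index of the matching array entry or
 * a negative errno (-EINVAL when nothing matches); that is also why "mode"
 * changes from unsigned short to a plain int earlier in the function. A
 * small free-standing sketch with a hypothetical table:
 */
#include <linux/string.h>
#include <linux/kernel.h>

static int parse_policy_name(const char *name)
{
	static const char * const names[] = {
		"default", "prefer", "bind", "interleave", "local",
	};

	/* Index into names[] on success, negative errno on failure. */
	return match_string(names, ARRAY_SIZE(names), name);
}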