2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
@@ -68,7 +68,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
@@ -126,6 +126,32 @@
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (node == NUMA_NO_NODE || node_online(node))
+		return node;
+
+	min_node = node;
+	for_each_online_node(n) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
@@ -198,7 +224,7 @@
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,13 +368,13 @@
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
 	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
 		nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -372,17 +398,20 @@
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
  */
 
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
 
-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	mmap_write_lock(mm);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		vm_write_begin(vma);
 		mpol_rebind_policy(vma->vm_policy, new);
-	up_write(&mm->mmap_sem);
+		vm_write_end(vma);
+	}
+	mmap_write_unlock(mm);
 }
 
 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -410,7 +439,9 @@
 	struct list_head *pagelist;
 	unsigned long flags;
 	nodemask_t *nmask;
-	struct vm_area_struct *prev;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
 };
 
 /*
@@ -440,6 +471,7 @@
  */
 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
+	__releases(ptl)
 {
 	int ret = 0;
 	struct page *page;
@@ -555,9 +587,10 @@
			unsigned long addr, unsigned long end,
			struct mm_walk *walk)
 {
+	int ret = 0;
 #ifdef CONFIG_HUGETLB_PAGE
 	struct queue_pages *qp = walk->private;
-	unsigned long flags = qp->flags;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -569,16 +602,45 @@
 	page = pte_page(entry);
 	if (!queue_pages_required(page, qp))
 		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced page and no
+		 * need to further check other vma.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() have
+		 * stopped walking current vma.
+		 * Detecting misplaced page but allow migrating pages which
+		 * have been queued.
+		 */
+		ret = 1;
+		goto unlock;
+	}
+
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
-	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, qp->pagelist);
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+	     !hugetlb_pmd_shared(pte))) {
+		if (isolate_hugetlb(page, qp->pagelist) &&
+		    (flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate page but allow migrating pages
+			 * which have been queued.
+			 */
+			ret = 1;
+	}
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-	return 0;
+	return ret;
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +658,7 @@
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
@@ -618,6 +680,22 @@
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	/* range check first */
+	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+			(qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		((vma->vm_end < qp->end) &&
+		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
 	/*
 	 * Need check MPOL_MF_STRICT to return -EIO if possible
 	 * regardless of vma_migratable
@@ -628,22 +706,10 @@
 
 	if (endvma > end)
 		endvma = end;
-	if (vma->vm_start > start)
-		start = vma->vm_start;
-
-	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-		if (!vma->vm_next && vma->vm_end < end)
-			return -EFAULT;
-		if (qp->prev && qp->prev->vm_end < vma->vm_start)
-			return -EFAULT;
-	}
-
-	qp->prev = vma;
 
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (!is_vm_hugetlb_page(vma) &&
-			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
@@ -654,6 +720,12 @@
 		return 0;
 	return 1;
 }
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry = queue_pages_hugetlb,
+	.pmd_entry = queue_pages_pte_range,
+	.test_walk = queue_pages_test_walk,
+};
 
 /*
  * Walk through page tables and collect pages to be migrated.
@@ -675,26 +747,28 @@
 		nodemask_t *nodes, unsigned long flags,
 		struct list_head *pagelist)
 {
+	int err;
 	struct queue_pages qp = {
 		.pagelist = pagelist,
 		.flags = flags,
 		.nmask = nodes,
-		.prev = NULL,
-	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
+		.start = start,
+		.end = end,
+		.first = NULL,
 	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
+
+	return err;
 }
 
 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
@@ -712,6 +786,7 @@
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	vm_write_begin(vma);
 	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
 		if (err)
@@ -719,11 +794,17 @@
 	}
 
 	old = vma->vm_policy;
-	vma->vm_policy = new; /* protected by mmap_sem */
+	/*
+	 * The speculative page fault handler accesses this field without
+	 * hodling the mmap_sem.
+	 */
+	WRITE_ONCE(vma->vm_policy, new);
+	vm_write_end(vma);
 	mpol_put(old);
 
 	return 0;
 err_out:
+	vm_write_end(vma);
 	mpol_put(new);
 	return err;
 }
@@ -732,7 +813,6 @@
 static int mbind_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
 	struct vm_area_struct *prev;
 	struct vm_area_struct *vma;
 	int err = 0;
@@ -741,15 +821,13 @@
 	unsigned long vmend;
 
 	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
+	VM_BUG_ON(!vma);
 
 	prev = vma->vm_prev;
 	if (start > vma->vm_start)
 		prev = vma;
 
-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
+	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
 		vmstart = max(start, vma->vm_start);
 		vmend = min(end, vma->vm_end);
 
@@ -764,10 +842,6 @@
				vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
-			next = vma->vm_next;
-			if (mpol_equal(vma_policy(vma), new_pol))
-				continue;
-			/* vma_merge() joined vma && vma->next, case 8 */
 			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
@@ -807,13 +881,12 @@
 		goto out;
 	}
 
-	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
 		mpol_put(new);
 		goto out;
 	}
+	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
 	if (new && new->mode == MPOL_INTERLEAVE)
@@ -839,7 +912,6 @@
 
 	switch (p->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -853,16 +925,19 @@
 	}
 }
 
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
+	struct page *p = NULL;
 	int err;
 
-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
-	if (err >= 0) {
+	int locked = 1;
+	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+	if (err > 0) {
 		err = page_to_nid(p);
 		put_page(p);
 	}
+	if (locked)
+		mmap_read_unlock(mm);
 	return err;
 }
 
@@ -873,7 +948,7 @@
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 
 	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -895,10 +970,10 @@
 		 * vma/shared policy at addr is NULL. We
 		 * want to return MPOL_DEFAULT in this case.
 		 */
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
-			up_read(&mm->mmap_sem);
+			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -913,7 +988,16 @@
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
-			err = lookup_node(addr);
+			/*
+			 * Take a refcount on the mpol, lookup_node()
+			 * wil drop the mmap_lock, so after calling
+			 * lookup_node() only "pol" remains valid, "vma"
+			 * is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
@@ -948,7 +1032,9 @@
 out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }
 
@@ -967,8 +1053,8 @@
 	if (!isolate_lru_page(head)) {
 		list_add_tail(&head->lru, pagelist);
 		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
 	} else if (flags & MPOL_MF_STRICT) {
 		/*
 		 * Non-movable page may reach here. And, there may be
@@ -984,27 +1070,6 @@
 	return 0;
 }
 
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page))
-		return alloc_huge_page_node(page_hstate(compound_head(page)),
-				node);
-	else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-				__GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -1015,6 +1080,10 @@
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ -1029,8 +1098,8 @@
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-				MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
@@ -1048,14 +1117,12 @@
			const nodemask_t *to, int flags)
 {
 	int busy = 0;
-	int err;
+	int err = 0;
 	nodemask_t tmp;
 
-	err = migrate_prep();
-	if (err)
-		return err;
+	lru_cache_disable();
 
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1136,7 +1203,9 @@
 		if (err < 0)
 			break;
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1153,7 +1222,7 @@
 static struct page *new_page(struct page *page, unsigned long start)
 {
 	struct vm_area_struct *vma;
-	unsigned long uninitialized_var(address);
+	unsigned long address;
 
 	vma = find_vma(current->mm, start);
 	while (vma) {
@@ -1252,19 +1321,15 @@
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
@@ -1301,9 +1366,11 @@
			putback_movable_pages(&pagelist);
 	}
 
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_enable();
 	return err;
 }
 
@@ -1505,10 +1572,6 @@
 	if (nodes_empty(*new))
 		goto out_put;
 
-	nodes_and(*new, *new, node_states[N_MEMORY]);
-	if (nodes_empty(*new))
-		goto out_put;
-
 	err = security_task_movememory(task);
 	if (err)
 		goto out_put;
@@ -1552,13 +1615,13 @@
			unsigned long flags)
 {
 	int err;
-	int uninitialized_var(pval);
+	int pval;
 	nodemask_t nodes;
-
-	addr = untagged_addr(addr);
 
 	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
+
+	addr = untagged_addr(addr);
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
 
@@ -1691,26 +1754,59 @@
 
 #endif /* CONFIG_COMPAT */
 
+bool vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		return false;
+
+	/*
+	 * DAX device mappings require predictable access latency, so avoid
+	 * incurring periodic faults.
+	 */
+	if (vma_is_dax(vma))
+		return false;
+
+	if (is_vm_hugetlb_page(vma) &&
+		!hugepage_migration_supported(hstate_vma(vma)))
+		return false;
+
+	/*
+	 * Migration allocates pages in the highest zone. If we cannot
+	 * do so then migration (at least from node to node) is not
+	 * possible.
+	 */
+	if (vma->vm_file &&
+		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+			< policy_zone)
+		return false;
+	return true;
+}
+
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				unsigned long addr)
 {
-	struct mempolicy *pol = NULL;
+	struct mempolicy *pol;
 
-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy) {
-			pol = vma->vm_ops->get_policy(vma, addr);
-		} else if (vma->vm_policy) {
-			pol = vma->vm_policy;
+	if (!vma)
+		return NULL;
 
-			/*
-			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-			 * count on these policies which will be dropped by
-			 * mpol_cond_put() later
-			 */
-			if (mpol_needs_cond_ref(pol))
-				mpol_get(pol);
-		}
+	if (vma->vm_ops && vma->vm_ops->get_policy)
+		return vma->vm_ops->get_policy(vma, addr);
+
+	/*
+	 * This could be called without holding the mmap_sem in the
+	 * speculative page fault handler's path.
+	 */
+	pol = READ_ONCE(vma->vm_policy);
+	if (pol) {
+		/*
+		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+		 * count on these policies which will be dropped by
+		 * mpol_cond_put() later
+		 */
+		if (mpol_needs_cond_ref(pol))
+			mpol_get(pol);
 	}
 
 	return pol;
@@ -1785,7 +1881,7 @@
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
  */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1797,8 +1893,7 @@
 }
 
 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-		int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
 	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
 		nd = policy->v.preferred_node;
@@ -1986,7 +2081,6 @@
 		break;
 
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*mask = mempolicy->v.nodes;
 		break;
@@ -2081,7 +2175,7 @@
  *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into user space. Returns
 * NULL when no page can be allocated.
@@ -2119,43 +2213,29 @@
 		 * If the policy is interleave, or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode == MPOL_PREFERRED &&
-				!(pol->flags & MPOL_F_LOCAL))
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
 			hpage_node = pol->v.preferred_node;
 
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(hpage_node, *nmask)) {
 			mpol_cond_put(pol);
			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
+			 * First, try to allocate THP only on local node, but
+			 * don't reclaim unnecessarily, just compact.
			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-			page = __alloc_pages_node(hpage_node, gfp, order);
+			page = __alloc_pages_node(hpage_node,
+				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * synchronous compact or the vma has been madvised
+			 * to prefer hugepage backing, retry allowing remote
+			 * memory with both reclaim and compact as well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_nodemask(gfp, order,
						hpage_node, nmask);
+
			goto out;
		}
	}
@@ -2167,6 +2247,7 @@
 out:
 	return page;
 }
+EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
  * alloc_pages_current - Allocate pages.
@@ -2266,7 +2347,6 @@
 
 	switch (a->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
@@ -2399,7 +2479,7 @@
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
 	int thisnid = cpu_to_node(thiscpu);
-	int polnid = -1;
+	int polnid = NUMA_NO_NODE;
 	int ret = -1;
 
 	pol = get_vma_policy(vma, addr);
@@ -2573,6 +2653,7 @@
 		mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 		if (!mpol_new)
			goto err_out;
+		atomic_set(&mpol_new->refcnt, 1);
		goto restart;
	}
 
@@ -2805,12 +2886,11 @@
 int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
-	unsigned short mode;
 	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int err = 1;
+	int err = 1, mode;
 
 	if (flags)
 		*flags++ = '\0'; /* terminate mode string */
@@ -2825,12 +2905,8 @@
 	} else
 		nodes_clear(nodes);
 
-	for (mode = 0; mode < MPOL_MAX; mode++) {
-		if (!strcmp(str, policy_modes[mode])) {
-			break;
-		}
-	}
-	if (mode >= MPOL_MAX)
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
 		goto out;
 
 	switch (mode) {
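
For reference, the search loop introduced by numa_map_to_online_node() in the first hunk can be exercised outside the kernel. The sketch below is illustrative only and is not part of the patch: it uses a hypothetical 4-node distance matrix and online mask as stand-ins for the kernel's node_distance() and for_each_online_node(), so the same nearest-node fallback can be compiled and run in user space.

/*
 * Illustrative sketch only -- not kernel code.  Re-creates the nearest
 * online node search of numa_map_to_online_node() with mocked topology.
 */
#include <limits.h>
#include <stdio.h>

#define NR_NODES	4
#define NO_NODE		(-1)	/* stand-in for NUMA_NO_NODE */

/* Hypothetical SLIT-style distances; the diagonal is local access. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Pretend node 2 is offline. */
static const int online[NR_NODES] = { 1, 1, 0, 1 };

static int map_to_online_node(int node)
{
	int min_dist = INT_MAX, min_node = node, n;

	if (node == NO_NODE || online[node])
		return node;		/* nothing to remap */

	for (n = 0; n < NR_NODES; n++) {
		if (!online[n])
			continue;	/* for_each_online_node() skips these */
		if (dist[node][n] < min_dist) {
			min_dist = dist[node][n];
			min_node = n;
		}
	}
	return min_node;
}

int main(void)
{
	/* Node 2 is offline and closest to node 3 (distance 20). */
	printf("node 2 -> node %d\n", map_to_online_node(2));
	return 0;
}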