2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/page_alloc.c
34 *
@@ -16,11 +17,11 @@
1617
1718 #include <linux/stddef.h>
1819 #include <linux/mm.h>
20
+#include <linux/highmem.h>
1921 #include <linux/swap.h>
2022 #include <linux/interrupt.h>
2123 #include <linux/pagemap.h>
2224 #include <linux/jiffies.h>
23
-#include <linux/bootmem.h>
2425 #include <linux/memblock.h>
2526 #include <linux/compiler.h>
2627 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
4344 #include <linux/mempolicy.h>
4445 #include <linux/memremap.h>
4546 #include <linux/stop_machine.h>
47
+#include <linux/random.h>
4648 #include <linux/sort.h>
4749 #include <linux/pfn.h>
4850 #include <linux/backing-dev.h>
4951 #include <linux/fault-inject.h>
5052 #include <linux/page-isolation.h>
51
-#include <linux/page_ext.h>
5253 #include <linux/debugobjects.h>
5354 #include <linux/kmemleak.h>
5455 #include <linux/compaction.h>
@@ -61,18 +62,63 @@
6162 #include <linux/sched/rt.h>
6263 #include <linux/sched/mm.h>
6364 #include <linux/page_owner.h>
65
+#include <linux/page_pinner.h>
6466 #include <linux/kthread.h>
6567 #include <linux/memcontrol.h>
6668 #include <linux/ftrace.h>
6769 #include <linux/lockdep.h>
6870 #include <linux/nmi.h>
69
-#include <linux/khugepaged.h>
7071 #include <linux/psi.h>
72
+#include <linux/padata.h>
73
+#include <linux/khugepaged.h>
74
+#include <trace/hooks/mm.h>
75
+#include <trace/hooks/vmscan.h>
7176
7277 #include <asm/sections.h>
7378 #include <asm/tlbflush.h>
7479 #include <asm/div64.h>
7580 #include "internal.h"
81
+#include "shuffle.h"
82
+#include "page_reporting.h"
83
+
84
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
85
+typedef int __bitwise fpi_t;
86
+
87
+/* No special request */
88
+#define FPI_NONE ((__force fpi_t)0)
89
+
90
+/*
91
+ * Skip free page reporting notification for the (possibly merged) page.
92
+ * This does not hinder free page reporting from grabbing the page,
93
+ * reporting it and marking it "reported" - it only skips notifying
94
+ * the free page reporting infrastructure about a newly freed page. For
95
+ * example, used when temporarily pulling a page from a freelist and
96
+ * putting it back unmodified.
97
+ */
98
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
99
+
100
+/*
101
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
102
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
103
+ * shuffle the whole zone).
104
+ *
105
+ * Note: No code should rely on this flag for correctness - it's purely
106
+ * to allow for optimizations when handing back either fresh pages
107
+ * (memory onlining) or untouched pages (page isolation, free page
108
+ * reporting).
109
+ */
110
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
111
+
112
+/*
113
+ * Don't poison memory with KASAN (only for the tag-based modes).
114
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
115
+ * Poisoning all that memory lengthens boot time, especially on systems with
116
+ * large amount of RAM. This flag is used to skip that poisoning.
117
+ * This is only done for the tag-based KASAN modes, as those are able to
118
+ * detect memory corruptions with the memory tags assigned by default.
119
+ * All memory allocated normally after boot gets poisoned as usual.
120
+ */
121
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
76122
77123 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
78124 static DEFINE_MUTEX(pcp_batch_high_lock);
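Illustrative sketch (not part of the patch): the fpi_t flags defined above are OR-ed together by the internal, non-pcp free paths; a hypothetical caller using the new __free_pages_ok() signature introduced later in this change could combine them like so:

/*
 * Illustration only: return an untouched page to the tail of the freelist
 * without re-notifying the free page reporting infrastructure.
 */
static void example_putback_free_page(struct page *page, unsigned int order)
{
	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY);
}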
@@ -94,12 +140,15 @@
94140 */
95141 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
96142 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
97
-int _node_numa_mem_[MAX_NUMNODES];
98143 #endif
99144
100145 /* work_structs for global per-cpu drains */
101
-DEFINE_MUTEX(pcpu_drain_mutex);
102
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
146
+struct pcpu_drain {
147
+ struct zone *zone;
148
+ struct work_struct work;
149
+};
150
+static DEFINE_MUTEX(pcpu_drain_mutex);
151
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
103152
104153 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
105154 volatile unsigned long latent_entropy __latent_entropy;
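Illustrative sketch (not part of the patch): the per-CPU pcpu_drain wrapper introduced above lets each drain work item remember its target zone. Modelled loosely on drain_all_pages() elsewhere in this file, with drain_local_pages_wq and mm_percpu_wq assumed from the surrounding code:

/* Illustration only: queue the zone drain work on one CPU. */
static void example_queue_drain(struct zone *zone, int cpu)
{
	struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);

	drain->zone = zone;
	INIT_WORK(&drain->work, drain_local_pages_wq);
	queue_work_on(cpu, mm_percpu_wq, &drain->work);
}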
@@ -123,62 +172,33 @@
123172 };
124173 EXPORT_SYMBOL(node_states);
125174
126
-/* Protect totalram_pages and zone->managed_pages */
127
-static DEFINE_SPINLOCK(managed_page_count_lock);
128
-
129
-unsigned long totalram_pages __read_mostly;
175
+atomic_long_t _totalram_pages __read_mostly;
176
+EXPORT_SYMBOL(_totalram_pages);
130177 unsigned long totalreserve_pages __read_mostly;
131178 unsigned long totalcma_pages __read_mostly;
132179
133180 int percpu_pagelist_fraction;
134181 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
135
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
136
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
137
-#else
138182 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
139
-#endif
140183 EXPORT_SYMBOL(init_on_alloc);
141184
142
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
143
-DEFINE_STATIC_KEY_TRUE(init_on_free);
144
-#else
145185 DEFINE_STATIC_KEY_FALSE(init_on_free);
146
-#endif
147186 EXPORT_SYMBOL(init_on_free);
148187
188
+static bool _init_on_alloc_enabled_early __read_mostly
189
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
149190 static int __init early_init_on_alloc(char *buf)
150191 {
151
- int ret;
152
- bool bool_result;
153192
154
- if (!buf)
155
- return -EINVAL;
156
- ret = kstrtobool(buf, &bool_result);
157
- if (bool_result && page_poisoning_enabled())
158
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
159
- if (bool_result)
160
- static_branch_enable(&init_on_alloc);
161
- else
162
- static_branch_disable(&init_on_alloc);
163
- return ret;
193
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
164194 }
165195 early_param("init_on_alloc", early_init_on_alloc);
166196
197
+static bool _init_on_free_enabled_early __read_mostly
198
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
167199 static int __init early_init_on_free(char *buf)
168200 {
169
- int ret;
170
- bool bool_result;
171
-
172
- if (!buf)
173
- return -EINVAL;
174
- ret = kstrtobool(buf, &bool_result);
175
- if (bool_result && page_poisoning_enabled())
176
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
177
- if (bool_result)
178
- static_branch_enable(&init_on_free);
179
- else
180
- static_branch_disable(&init_on_free);
181
- return ret;
201
+ return kstrtobool(buf, &_init_on_free_enabled_early);
182202 }
183203 early_param("init_on_free", early_init_on_free);
184204
@@ -242,7 +262,8 @@
242262 unsigned int pageblock_order __read_mostly;
243263 #endif
244264
245
-static void __free_pages_ok(struct page *page, unsigned int order);
265
+static void __free_pages_ok(struct page *page, unsigned int order,
266
+ fpi_t fpi_flags);
246267
247268 /*
248269 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -269,8 +290,6 @@
269290 [ZONE_MOVABLE] = 0,
270291 };
271292
272
-EXPORT_SYMBOL(totalram_pages);
273
-
274293 static char * const zone_names[MAX_NR_ZONES] = {
275294 #ifdef CONFIG_ZONE_DMA
276295 "DMA",
@@ -288,7 +307,7 @@
288307 #endif
289308 };
290309
291
-char * const migratetype_names[MIGRATE_TYPES] = {
310
+const char * const migratetype_names[MIGRATE_TYPES] = {
292311 "Unmovable",
293312 "Movable",
294313 "Reclaimable",
@@ -301,14 +320,14 @@
301320 #endif
302321 };
303322
304
-compound_page_dtor * const compound_page_dtors[] = {
305
- NULL,
306
- free_compound_page,
323
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
324
+ [NULL_COMPOUND_DTOR] = NULL,
325
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
307326 #ifdef CONFIG_HUGETLB_PAGE
308
- free_huge_page,
327
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
309328 #endif
310329 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
311
- free_transhuge_page,
330
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
312331 #endif
313332 };
314333
@@ -319,6 +338,20 @@
319338 */
320339 int min_free_kbytes = 1024;
321340 int user_min_free_kbytes = -1;
341
+#ifdef CONFIG_DISCONTIGMEM
342
+/*
343
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
344
+ * are not on separate NUMA nodes. Functionally this works but with
345
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
346
+ * quite small. By default, do not boost watermarks on discontigmem as in
347
+ * many cases very high-order allocations like THP are likely to be
348
+ * unsupported and the premature reclaim offsets the advantage of long-term
349
+ * fragmentation avoidance.
350
+ */
351
+int watermark_boost_factor __read_mostly;
352
+#else
353
+int watermark_boost_factor __read_mostly = 15000;
354
+#endif
322355 int watermark_scale_factor = 10;
323356
324357 /*
@@ -328,28 +361,26 @@
328361 */
329362 int extra_free_kbytes = 0;
330363
331
-static unsigned long nr_kernel_pages __meminitdata;
332
-static unsigned long nr_all_pages __meminitdata;
333
-static unsigned long dma_reserve __meminitdata;
364
+static unsigned long nr_kernel_pages __initdata;
365
+static unsigned long nr_all_pages __initdata;
366
+static unsigned long dma_reserve __initdata;
334367
335
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
336
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
337
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
368
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
369
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
338370 static unsigned long required_kernelcore __initdata;
339371 static unsigned long required_kernelcore_percent __initdata;
340372 static unsigned long required_movablecore __initdata;
341373 static unsigned long required_movablecore_percent __initdata;
342
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
374
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
343375 static bool mirrored_kernelcore __meminitdata;
344376
345377 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
346378 int movable_zone;
347379 EXPORT_SYMBOL(movable_zone);
348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
349380
350381 #if MAX_NUMNODES > 1
351
-int nr_node_ids __read_mostly = MAX_NUMNODES;
352
-int nr_online_nodes __read_mostly = 1;
382
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
383
+unsigned int nr_online_nodes __read_mostly = 1;
353384 EXPORT_SYMBOL(nr_node_ids);
354385 EXPORT_SYMBOL(nr_online_nodes);
355386 #endif
@@ -365,7 +396,7 @@
365396 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
366397
367398 /*
368
- * Calling kasan_free_pages() only after deferred memory initialization
399
+ * Calling kasan_poison_pages() only after deferred memory initialization
369400 * has completed. Poisoning pages during deferred memory init will greatly
370401 * lengthen the process and cause problem in large memory systems as the
371402 * deferred pages initialization is done with interrupt disabled.
@@ -377,10 +408,12 @@
377408 * on-demand allocation and then freed again before the deferred pages
378409 * initialization is done, but this is not likely to happen.
379410 */
380
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
411
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
381412 {
382
- if (!static_branch_unlikely(&deferred_pages))
383
- kasan_free_pages(page, order);
413
+ return static_branch_unlikely(&deferred_pages) ||
414
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
415
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
416
+ PageSkipKASanPoison(page);
384417 }
385418
386419 /* Returns true if the struct page for the pfn is uninitialised */
@@ -395,38 +428,57 @@
395428 }
396429
397430 /*
398
- * Returns false when the remaining initialisation should be deferred until
431
+ * Returns true when the remaining initialisation should be deferred until
399432 * later in the boot cycle when it can be parallelised.
400433 */
401
-static inline bool update_defer_init(pg_data_t *pgdat,
402
- unsigned long pfn, unsigned long zone_end,
403
- unsigned long *nr_initialised)
434
+static bool __meminit
435
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
404436 {
405
- /* Always populate low zones for address-constrained allocations */
406
- if (zone_end < pgdat_end_pfn(pgdat))
407
- return true;
408
- (*nr_initialised)++;
409
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
410
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
411
- pgdat->first_deferred_pfn = pfn;
412
- return false;
437
+ static unsigned long prev_end_pfn, nr_initialised;
438
+
439
+ /*
440
+ * prev_end_pfn static that contains the end of previous zone
441
+ * No need to protect because called very early in boot before smp_init.
442
+ */
443
+ if (prev_end_pfn != end_pfn) {
444
+ prev_end_pfn = end_pfn;
445
+ nr_initialised = 0;
413446 }
414447
415
- return true;
448
+ /* Always populate low zones for address-constrained allocations */
449
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
450
+ return false;
451
+
452
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
453
+ return true;
454
+ /*
455
+ * We start only with one section of pages, more pages are added as
456
+ * needed until the rest of deferred pages are initialized.
457
+ */
458
+ nr_initialised++;
459
+ if ((nr_initialised > PAGES_PER_SECTION) &&
460
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
461
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
462
+ return true;
463
+ }
464
+ return false;
416465 }
417466 #else
418
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
467
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
468
+{
469
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
470
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
471
+ PageSkipKASanPoison(page);
472
+}
419473
420474 static inline bool early_page_uninitialised(unsigned long pfn)
421475 {
422476 return false;
423477 }
424478
425
-static inline bool update_defer_init(pg_data_t *pgdat,
426
- unsigned long pfn, unsigned long zone_end,
427
- unsigned long *nr_initialised)
479
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
428480 {
429
- return true;
481
+ return false;
430482 }
431483 #endif
432484
@@ -435,7 +487,7 @@
435487 unsigned long pfn)
436488 {
437489 #ifdef CONFIG_SPARSEMEM
438
- return __pfn_to_section(pfn)->pageblock_flags;
490
+ return section_to_usemap(__pfn_to_section(pfn));
439491 #else
440492 return page_zone(page)->pageblock_flags;
441493 #endif /* CONFIG_SPARSEMEM */
@@ -445,25 +497,23 @@
445497 {
446498 #ifdef CONFIG_SPARSEMEM
447499 pfn &= (PAGES_PER_SECTION-1);
448
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
449500 #else
450501 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
451
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
452502 #endif /* CONFIG_SPARSEMEM */
503
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
453504 }
454505
455506 /**
456507 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
457508 * @page: The page within the block of interest
458509 * @pfn: The target page frame number
459
- * @end_bitidx: The last bit of interest to retrieve
460510 * @mask: mask of bits that the caller is interested in
461511 *
462512 * Return: pageblock_bits flags
463513 */
464
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
514
+static __always_inline
515
+unsigned long __get_pfnblock_flags_mask(struct page *page,
465516 unsigned long pfn,
466
- unsigned long end_bitidx,
467517 unsigned long mask)
468518 {
469519 unsigned long *bitmap;
@@ -476,20 +526,36 @@
476526 bitidx &= (BITS_PER_LONG-1);
477527
478528 word = bitmap[word_bitidx];
479
- bitidx += end_bitidx;
480
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
529
+ return (word >> bitidx) & mask;
481530 }
482531
483532 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
484
- unsigned long end_bitidx,
485533 unsigned long mask)
486534 {
487
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
535
+ return __get_pfnblock_flags_mask(page, pfn, mask);
488536 }
537
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
538
+
539
+int isolate_anon_lru_page(struct page *page)
540
+{
541
+ int ret;
542
+
543
+ if (!PageLRU(page) || !PageAnon(page))
544
+ return -EINVAL;
545
+
546
+ if (!get_page_unless_zero(page))
547
+ return -EINVAL;
548
+
549
+ ret = isolate_lru_page(page);
550
+ put_page(page);
551
+
552
+ return ret;
553
+}
554
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
489555
490556 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
491557 {
492
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
558
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
493559 }
494560
495561 /**
@@ -497,12 +563,10 @@
497563 * @page: The page within the block of interest
498564 * @flags: The flags to set
499565 * @pfn: The target page frame number
500
- * @end_bitidx: The last bit of interest
501566 * @mask: mask of bits that the caller is interested in
502567 */
503568 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
504569 unsigned long pfn,
505
- unsigned long end_bitidx,
506570 unsigned long mask)
507571 {
508572 unsigned long *bitmap;
@@ -510,6 +574,7 @@
510574 unsigned long old_word, word;
511575
512576 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
577
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
513578
514579 bitmap = get_pageblock_bitmap(page, pfn);
515580 bitidx = pfn_to_bitidx(page, pfn);
@@ -518,9 +583,8 @@
518583
519584 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
520585
521
- bitidx += end_bitidx;
522
- mask <<= (BITS_PER_LONG - bitidx - 1);
523
- flags <<= (BITS_PER_LONG - bitidx - 1);
586
+ mask <<= bitidx;
587
+ flags <<= bitidx;
524588
525589 word = READ_ONCE(bitmap[word_bitidx]);
526590 for (;;) {
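Illustrative sketch (not part of the patch): the rewritten helpers now index pageblock bits from the low end of the bitmap word instead of mirroring them from the top with end_bitidx. Assuming NR_PAGEBLOCK_BITS == 4 (as the BUILD_BUG_ON above enforces) and MIGRATETYPE_MASK == 0x7:

/*
 * Illustration only: the pageblock at slot 5 of a bitmap word sits at
 * bitidx = 5 * NR_PAGEBLOCK_BITS = 20, so its migratetype is read as
 * (word >> 20) & 0x7, and writers shift mask/flags left by the same
 * bitidx - no more "BITS_PER_LONG - bitidx - 1" arithmetic.
 */
static unsigned long example_read_migratetype(unsigned long word)
{
	unsigned long bitidx = 5 * NR_PAGEBLOCK_BITS;

	return (word >> bitidx) & MIGRATETYPE_MASK;
}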
@@ -537,8 +601,8 @@
537601 migratetype < MIGRATE_PCPTYPES))
538602 migratetype = MIGRATE_UNMOVABLE;
539603
540
- set_pageblock_flags_group(page, (unsigned long)migratetype,
541
- PB_migrate, PB_migrate_end);
604
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
605
+ page_to_pfn(page), MIGRATETYPE_MASK);
542606 }
543607
544608 #ifdef CONFIG_DEBUG_VM
@@ -593,8 +657,7 @@
593657 }
594658 #endif
595659
596
-static void bad_page(struct page *page, const char *reason,
597
- unsigned long bad_flags)
660
+static void bad_page(struct page *page, const char *reason)
598661 {
599662 static unsigned long resume;
600663 static unsigned long nr_shown;
@@ -623,10 +686,6 @@
623686 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
624687 current->comm, page_to_pfn(page));
625688 __dump_page(page, reason);
626
- bad_flags &= page->flags;
627
- if (bad_flags)
628
- pr_alert("bad because of flags: %#lx(%pGp)\n",
629
- bad_flags, &bad_flags);
630689 dump_page_owner(page);
631690
632691 print_modules();
@@ -654,7 +713,8 @@
654713
655714 void free_compound_page(struct page *page)
656715 {
657
- __free_pages_ok(page, compound_order(page));
716
+ mem_cgroup_uncharge(page);
717
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
658718 }
659719
660720 void prep_compound_page(struct page *page, unsigned int order)
@@ -662,8 +722,6 @@
662722 int i;
663723 int nr_pages = 1 << order;
664724
665
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
666
- set_compound_order(page, order);
667725 __SetPageHead(page);
668726 for (i = 1; i < nr_pages; i++) {
669727 struct page *p = page + i;
@@ -671,51 +729,30 @@
671729 p->mapping = TAIL_MAPPING;
672730 set_compound_head(p, page);
673731 }
732
+
733
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
734
+ set_compound_order(page, order);
674735 atomic_set(compound_mapcount_ptr(page), -1);
736
+ if (hpage_pincount_available(page))
737
+ atomic_set(compound_pincount_ptr(page), 0);
675738 }
676739
677740 #ifdef CONFIG_DEBUG_PAGEALLOC
678741 unsigned int _debug_guardpage_minorder;
679
-bool _debug_pagealloc_enabled __read_mostly
742
+
743
+bool _debug_pagealloc_enabled_early __read_mostly
680744 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
745
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
746
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
681747 EXPORT_SYMBOL(_debug_pagealloc_enabled);
682
-bool _debug_guardpage_enabled __read_mostly;
748
+
749
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
683750
684751 static int __init early_debug_pagealloc(char *buf)
685752 {
686
- if (!buf)
687
- return -EINVAL;
688
- return kstrtobool(buf, &_debug_pagealloc_enabled);
753
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
689754 }
690755 early_param("debug_pagealloc", early_debug_pagealloc);
691
-
692
-static bool need_debug_guardpage(void)
693
-{
694
- /* If we don't use debug_pagealloc, we don't need guard page */
695
- if (!debug_pagealloc_enabled())
696
- return false;
697
-
698
- if (!debug_guardpage_minorder())
699
- return false;
700
-
701
- return true;
702
-}
703
-
704
-static void init_debug_guardpage(void)
705
-{
706
- if (!debug_pagealloc_enabled())
707
- return;
708
-
709
- if (!debug_guardpage_minorder())
710
- return;
711
-
712
- _debug_guardpage_enabled = true;
713
-}
714
-
715
-struct page_ext_operations debug_guardpage_ops = {
716
- .need = need_debug_guardpage,
717
- .init = init_debug_guardpage,
718
-};
719756
720757 static int __init debug_guardpage_minorder_setup(char *buf)
721758 {
@@ -734,20 +771,13 @@
734771 static inline bool set_page_guard(struct zone *zone, struct page *page,
735772 unsigned int order, int migratetype)
736773 {
737
- struct page_ext *page_ext;
738
-
739774 if (!debug_guardpage_enabled())
740775 return false;
741776
742777 if (order >= debug_guardpage_minorder())
743778 return false;
744779
745
- page_ext = lookup_page_ext(page);
746
- if (unlikely(!page_ext))
747
- return false;
748
-
749
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
750
-
780
+ __SetPageGuard(page);
751781 INIT_LIST_HEAD(&page->lru);
752782 set_page_private(page, order);
753783 /* Guard pages are not available for any usage */
@@ -759,39 +789,77 @@
759789 static inline void clear_page_guard(struct zone *zone, struct page *page,
760790 unsigned int order, int migratetype)
761791 {
762
- struct page_ext *page_ext;
763
-
764792 if (!debug_guardpage_enabled())
765793 return;
766794
767
- page_ext = lookup_page_ext(page);
768
- if (unlikely(!page_ext))
769
- return;
770
-
771
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
795
+ __ClearPageGuard(page);
772796
773797 set_page_private(page, 0);
774798 if (!is_migrate_isolate(migratetype))
775799 __mod_zone_freepage_state(zone, (1 << order), migratetype);
776800 }
777801 #else
778
-struct page_ext_operations debug_guardpage_ops;
779802 static inline bool set_page_guard(struct zone *zone, struct page *page,
780803 unsigned int order, int migratetype) { return false; }
781804 static inline void clear_page_guard(struct zone *zone, struct page *page,
782805 unsigned int order, int migratetype) {}
783806 #endif
784807
785
-static inline void set_page_order(struct page *page, unsigned int order)
808
+/*
809
+ * Enable static keys related to various memory debugging and hardening options.
810
+ * Some override others, and depend on early params that are evaluated in the
811
+ * order of appearance. So we need to first gather the full picture of what was
812
+ * enabled, and then make decisions.
813
+ */
814
+void init_mem_debugging_and_hardening(void)
815
+{
816
+ bool page_poisoning_requested = false;
817
+
818
+#ifdef CONFIG_PAGE_POISONING
819
+ /*
820
+ * Page poisoning is debug page alloc for some arches. If
821
+ * either of those options are enabled, enable poisoning.
822
+ */
823
+ if (page_poisoning_enabled() ||
824
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
825
+ debug_pagealloc_enabled())) {
826
+ static_branch_enable(&_page_poisoning_enabled);
827
+ page_poisoning_requested = true;
828
+ }
829
+#endif
830
+
831
+ if (_init_on_alloc_enabled_early) {
832
+ if (page_poisoning_requested)
833
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
834
+ "will take precedence over init_on_alloc\n");
835
+ else
836
+ static_branch_enable(&init_on_alloc);
837
+ }
838
+ if (_init_on_free_enabled_early) {
839
+ if (page_poisoning_requested)
840
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
841
+ "will take precedence over init_on_free\n");
842
+ else
843
+ static_branch_enable(&init_on_free);
844
+ }
845
+
846
+#ifdef CONFIG_DEBUG_PAGEALLOC
847
+ if (!debug_pagealloc_enabled())
848
+ return;
849
+
850
+ static_branch_enable(&_debug_pagealloc_enabled);
851
+
852
+ if (!debug_guardpage_minorder())
853
+ return;
854
+
855
+ static_branch_enable(&_debug_guardpage_enabled);
856
+#endif
857
+}
858
+
859
+static inline void set_buddy_order(struct page *page, unsigned int order)
786860 {
787861 set_page_private(page, order);
788862 __SetPageBuddy(page);
789
-}
790
-
791
-static inline void rmv_page_order(struct page *page)
792
-{
793
- __ClearPageBuddy(page);
794
- set_page_private(page, 0);
795863 }
796864
797865 /*
@@ -807,32 +875,151 @@
807875 *
808876 * For recording page's order, we use page_private(page).
809877 */
810
-static inline int page_is_buddy(struct page *page, struct page *buddy,
878
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
811879 unsigned int order)
812880 {
813
- if (page_is_guard(buddy) && page_order(buddy) == order) {
814
- if (page_zone_id(page) != page_zone_id(buddy))
815
- return 0;
881
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
882
+ return false;
816883
817
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
884
+ if (buddy_order(buddy) != order)
885
+ return false;
818886
819
- return 1;
820
- }
887
+ /*
888
+ * zone check is done late to avoid uselessly calculating
889
+ * zone/node ids for pages that could never merge.
890
+ */
891
+ if (page_zone_id(page) != page_zone_id(buddy))
892
+ return false;
821893
822
- if (PageBuddy(buddy) && page_order(buddy) == order) {
823
- /*
824
- * zone check is done late to avoid uselessly
825
- * calculating zone/node ids for pages that could
826
- * never merge.
827
- */
828
- if (page_zone_id(page) != page_zone_id(buddy))
829
- return 0;
894
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
830895
831
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
896
+ return true;
897
+}
832898
833
- return 1;
834
- }
835
- return 0;
899
+#ifdef CONFIG_COMPACTION
900
+static inline struct capture_control *task_capc(struct zone *zone)
901
+{
902
+ struct capture_control *capc = current->capture_control;
903
+
904
+ return unlikely(capc) &&
905
+ !(current->flags & PF_KTHREAD) &&
906
+ !capc->page &&
907
+ capc->cc->zone == zone ? capc : NULL;
908
+}
909
+
910
+static inline bool
911
+compaction_capture(struct capture_control *capc, struct page *page,
912
+ int order, int migratetype)
913
+{
914
+ if (!capc || order != capc->cc->order)
915
+ return false;
916
+
917
+ /* Do not accidentally pollute CMA or isolated regions*/
918
+ if (is_migrate_cma(migratetype) ||
919
+ is_migrate_isolate(migratetype))
920
+ return false;
921
+
922
+ /*
923
+ * Do not let lower order allocations polluate a movable pageblock.
924
+ * This might let an unmovable request use a reclaimable pageblock
925
+ * and vice-versa but no more than normal fallback logic which can
926
+ * have trouble finding a high-order free page.
927
+ */
928
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
929
+ return false;
930
+
931
+ capc->page = page;
932
+ return true;
933
+}
934
+
935
+#else
936
+static inline struct capture_control *task_capc(struct zone *zone)
937
+{
938
+ return NULL;
939
+}
940
+
941
+static inline bool
942
+compaction_capture(struct capture_control *capc, struct page *page,
943
+ int order, int migratetype)
944
+{
945
+ return false;
946
+}
947
+#endif /* CONFIG_COMPACTION */
948
+
949
+/* Used for pages not on another list */
950
+static inline void add_to_free_list(struct page *page, struct zone *zone,
951
+ unsigned int order, int migratetype)
952
+{
953
+ struct free_area *area = &zone->free_area[order];
954
+
955
+ list_add(&page->lru, &area->free_list[migratetype]);
956
+ area->nr_free++;
957
+}
958
+
959
+/* Used for pages not on another list */
960
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
961
+ unsigned int order, int migratetype)
962
+{
963
+ struct free_area *area = &zone->free_area[order];
964
+
965
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
966
+ area->nr_free++;
967
+}
968
+
969
+/*
970
+ * Used for pages which are on another list. Move the pages to the tail
971
+ * of the list - so the moved pages won't immediately be considered for
972
+ * allocation again (e.g., optimization for memory onlining).
973
+ */
974
+static inline void move_to_free_list(struct page *page, struct zone *zone,
975
+ unsigned int order, int migratetype)
976
+{
977
+ struct free_area *area = &zone->free_area[order];
978
+
979
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
980
+}
981
+
982
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order)
984
+{
985
+ /* clear reported state and update reported page count */
986
+ if (page_reported(page))
987
+ __ClearPageReported(page);
988
+
989
+ list_del(&page->lru);
990
+ __ClearPageBuddy(page);
991
+ set_page_private(page, 0);
992
+ zone->free_area[order].nr_free--;
993
+}
994
+
995
+/*
996
+ * If this is not the largest possible page, check if the buddy
997
+ * of the next-highest order is free. If it is, it's possible
998
+ * that pages are being freed that will coalesce soon. In case,
999
+ * that is happening, add the free page to the tail of the list
1000
+ * so it's less likely to be used soon and more likely to be merged
1001
+ * as a higher order page
1002
+ */
1003
+static inline bool
1004
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1005
+ struct page *page, unsigned int order)
1006
+{
1007
+ struct page *higher_page, *higher_buddy;
1008
+ unsigned long combined_pfn;
1009
+
1010
+ if (order >= MAX_ORDER - 2)
1011
+ return false;
1012
+
1013
+ if (!pfn_valid_within(buddy_pfn))
1014
+ return false;
1015
+
1016
+ combined_pfn = buddy_pfn & pfn;
1017
+ higher_page = page + (combined_pfn - pfn);
1018
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1019
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1020
+
1021
+ return pfn_valid_within(buddy_pfn) &&
1022
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8361023 }
8371024
8381025 /*
@@ -862,12 +1049,14 @@
8621049 static inline void __free_one_page(struct page *page,
8631050 unsigned long pfn,
8641051 struct zone *zone, unsigned int order,
865
- int migratetype)
1052
+ int migratetype, fpi_t fpi_flags)
8661053 {
1054
+ struct capture_control *capc = task_capc(zone);
1055
+ unsigned long buddy_pfn;
8671056 unsigned long combined_pfn;
868
- unsigned long uninitialized_var(buddy_pfn);
869
- struct page *buddy;
8701057 unsigned int max_order;
1058
+ struct page *buddy;
1059
+ bool to_tail;
8711060
8721061 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8731062
@@ -883,6 +1072,11 @@
8831072
8841073 continue_merging:
8851074 while (order < max_order) {
1075
+ if (compaction_capture(capc, page, order, migratetype)) {
1076
+ __mod_zone_freepage_state(zone, -(1 << order),
1077
+ migratetype);
1078
+ return;
1079
+ }
8861080 buddy_pfn = __find_buddy_pfn(pfn, order);
8871081 buddy = page + (buddy_pfn - pfn);
8881082
@@ -894,13 +1088,10 @@
8941088 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
8951089 * merge with it and move up one order.
8961090 */
897
- if (page_is_guard(buddy)) {
1091
+ if (page_is_guard(buddy))
8981092 clear_page_guard(zone, buddy, order, migratetype);
899
- } else {
900
- list_del(&buddy->lru);
901
- zone->free_area[order].nr_free--;
902
- rmv_page_order(buddy);
903
- }
1093
+ else
1094
+ del_page_from_free_list(buddy, zone, order);
9041095 combined_pfn = buddy_pfn & pfn;
9051096 page = page + (combined_pfn - pfn);
9061097 pfn = combined_pfn;
@@ -932,33 +1123,23 @@
9321123 }
9331124
9341125 done_merging:
935
- set_page_order(page, order);
1126
+ set_buddy_order(page, order);
9361127
937
- /*
938
- * If this is not the largest possible page, check if the buddy
939
- * of the next-highest order is free. If it is, it's possible
940
- * that pages are being freed that will coalesce soon. In case,
941
- * that is happening, add the free page to the tail of the list
942
- * so it's less likely to be used soon and more likely to be merged
943
- * as a higher order page
944
- */
945
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
946
- struct page *higher_page, *higher_buddy;
947
- combined_pfn = buddy_pfn & pfn;
948
- higher_page = page + (combined_pfn - pfn);
949
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
950
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
951
- if (pfn_valid_within(buddy_pfn) &&
952
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
953
- list_add_tail(&page->lru,
954
- &zone->free_area[order].free_list[migratetype]);
955
- goto out;
956
- }
957
- }
1128
+ if (fpi_flags & FPI_TO_TAIL)
1129
+ to_tail = true;
1130
+ else if (is_shuffle_order(order))
1131
+ to_tail = shuffle_pick_tail();
1132
+ else
1133
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9581134
959
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
960
-out:
961
- zone->free_area[order].nr_free++;
1135
+ if (to_tail)
1136
+ add_to_free_list_tail(page, zone, order, migratetype);
1137
+ else
1138
+ add_to_free_list(page, zone, order, migratetype);
1139
+
1140
+ /* Notify page reporting subsystem of freed page */
1141
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1142
+ page_reporting_notify_free(order);
9621143 }
9631144
9641145 /*
@@ -983,13 +1164,9 @@
9831164 return true;
9841165 }
9851166
986
-static void free_pages_check_bad(struct page *page)
1167
+static const char *page_bad_reason(struct page *page, unsigned long flags)
9871168 {
988
- const char *bad_reason;
989
- unsigned long bad_flags;
990
-
991
- bad_reason = NULL;
992
- bad_flags = 0;
1169
+ const char *bad_reason = NULL;
9931170
9941171 if (unlikely(atomic_read(&page->_mapcount) != -1))
9951172 bad_reason = "nonzero mapcount";
@@ -997,24 +1174,32 @@
9971174 bad_reason = "non-NULL mapping";
9981175 if (unlikely(page_ref_count(page) != 0))
9991176 bad_reason = "nonzero _refcount";
1000
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1001
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1002
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1177
+ if (unlikely(page->flags & flags)) {
1178
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1179
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1180
+ else
1181
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10031182 }
10041183 #ifdef CONFIG_MEMCG
10051184 if (unlikely(page->mem_cgroup))
10061185 bad_reason = "page still charged to cgroup";
10071186 #endif
1008
- bad_page(page, bad_reason, bad_flags);
1187
+ return bad_reason;
10091188 }
10101189
1011
-static inline int free_pages_check(struct page *page)
1190
+static void check_free_page_bad(struct page *page)
1191
+{
1192
+ bad_page(page,
1193
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1194
+}
1195
+
1196
+static inline int check_free_page(struct page *page)
10121197 {
10131198 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10141199 return 0;
10151200
10161201 /* Something has gone sideways, find it */
1017
- free_pages_check_bad(page);
1202
+ check_free_page_bad(page);
10181203 return 1;
10191204 }
10201205
@@ -1036,7 +1221,7 @@
10361221 case 1:
10371222 /* the first tail page: ->mapping may be compound_mapcount() */
10381223 if (unlikely(compound_mapcount(page))) {
1039
- bad_page(page, "nonzero compound_mapcount", 0);
1224
+ bad_page(page, "nonzero compound_mapcount");
10401225 goto out;
10411226 }
10421227 break;
@@ -1048,17 +1233,17 @@
10481233 break;
10491234 default:
10501235 if (page->mapping != TAIL_MAPPING) {
1051
- bad_page(page, "corrupted mapping in tail page", 0);
1236
+ bad_page(page, "corrupted mapping in tail page");
10521237 goto out;
10531238 }
10541239 break;
10551240 }
10561241 if (unlikely(!PageTail(page))) {
1057
- bad_page(page, "PageTail not set", 0);
1242
+ bad_page(page, "PageTail not set");
10581243 goto out;
10591244 }
10601245 if (unlikely(compound_head(page) != head_page)) {
1061
- bad_page(page, "compound_head not consistent", 0);
1246
+ bad_page(page, "compound_head not consistent");
10621247 goto out;
10631248 }
10641249 ret = 0;
@@ -1068,25 +1253,48 @@
10681253 return ret;
10691254 }
10701255
1071
-static void kernel_init_free_pages(struct page *page, int numpages)
1256
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10721257 {
10731258 int i;
10741259
1260
+ if (zero_tags) {
1261
+ for (i = 0; i < numpages; i++)
1262
+ tag_clear_highpage(page + i);
1263
+ return;
1264
+ }
1265
+
10751266 /* s390's use of memset() could override KASAN redzones. */
10761267 kasan_disable_current();
1077
- for (i = 0; i < numpages; i++)
1268
+ for (i = 0; i < numpages; i++) {
1269
+ u8 tag = page_kasan_tag(page + i);
1270
+ page_kasan_tag_reset(page + i);
10781271 clear_highpage(page + i);
1272
+ page_kasan_tag_set(page + i, tag);
1273
+ }
10791274 kasan_enable_current();
10801275 }
10811276
10821277 static __always_inline bool free_pages_prepare(struct page *page,
1083
- unsigned int order, bool check_free)
1278
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10841279 {
10851280 int bad = 0;
1281
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10861282
10871283 VM_BUG_ON_PAGE(PageTail(page), page);
10881284
10891285 trace_mm_page_free(page, order);
1286
+
1287
+ if (unlikely(PageHWPoison(page)) && !order) {
1288
+ /*
1289
+ * Do not let hwpoison pages hit pcplists/buddy
1290
+ * Untie memcg state and reset page's owner
1291
+ */
1292
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1293
+ __memcg_kmem_uncharge_page(page, order);
1294
+ reset_page_owner(page, order);
1295
+ free_page_pinner(page, order);
1296
+ return false;
1297
+ }
10901298
10911299 /*
10921300 * Check tail pages before head page information is cleared to
@@ -1103,7 +1311,7 @@
11031311 for (i = 1; i < (1 << order); i++) {
11041312 if (compound)
11051313 bad += free_tail_pages_check(page, page + i);
1106
- if (unlikely(free_pages_check(page + i))) {
1314
+ if (unlikely(check_free_page(page + i))) {
11071315 bad++;
11081316 continue;
11091317 }
@@ -1113,15 +1321,16 @@
11131321 if (PageMappingFlags(page))
11141322 page->mapping = NULL;
11151323 if (memcg_kmem_enabled() && PageKmemcg(page))
1116
- memcg_kmem_uncharge(page, order);
1324
+ __memcg_kmem_uncharge_page(page, order);
11171325 if (check_free)
1118
- bad += free_pages_check(page);
1326
+ bad += check_free_page(page);
11191327 if (bad)
11201328 return false;
11211329
11221330 page_cpupid_reset_last(page);
11231331 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11241332 reset_page_owner(page, order);
1333
+ free_page_pinner(page, order);
11251334
11261335 if (!PageHighMem(page)) {
11271336 debug_check_no_locks_freed(page_address(page),
@@ -1129,36 +1338,77 @@
11291338 debug_check_no_obj_freed(page_address(page),
11301339 PAGE_SIZE << order);
11311340 }
1132
- arch_free_page(page, order);
1133
- if (want_init_on_free())
1134
- kernel_init_free_pages(page, 1 << order);
11351341
1136
- kernel_poison_pages(page, 1 << order, 0);
1137
- kernel_map_pages(page, 1 << order, 0);
1138
- kasan_free_nondeferred_pages(page, order);
1342
+ kernel_poison_pages(page, 1 << order);
1343
+
1344
+ /*
1345
+ * As memory initialization might be integrated into KASAN,
1346
+ * kasan_free_pages and kernel_init_free_pages must be
1347
+ * kept together to avoid discrepancies in behavior.
1348
+ *
1349
+ * With hardware tag-based KASAN, memory tags must be set before the
1350
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1351
+ */
1352
+ if (kasan_has_integrated_init()) {
1353
+ if (!skip_kasan_poison)
1354
+ kasan_free_pages(page, order);
1355
+ } else {
1356
+ bool init = want_init_on_free();
1357
+
1358
+ if (init)
1359
+ kernel_init_free_pages(page, 1 << order, false);
1360
+ if (!skip_kasan_poison)
1361
+ kasan_poison_pages(page, order, init);
1362
+ }
1363
+
1364
+ /*
1365
+ * arch_free_page() can make the page's contents inaccessible. s390
1366
+ * does this. So nothing which can access the page's contents should
1367
+ * happen after this.
1368
+ */
1369
+ arch_free_page(page, order);
1370
+
1371
+ debug_pagealloc_unmap_pages(page, 1 << order);
11391372
11401373 return true;
11411374 }
11421375
11431376 #ifdef CONFIG_DEBUG_VM
1144
-static inline bool free_pcp_prepare(struct page *page)
1145
-{
1146
- return free_pages_prepare(page, 0, true);
1147
-}
1148
-
1149
-static inline bool bulkfree_pcp_prepare(struct page *page)
1150
-{
1151
- return false;
1152
-}
1153
-#else
1377
+/*
1378
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1379
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1380
+ * moved from pcp lists to free lists.
1381
+ */
11541382 static bool free_pcp_prepare(struct page *page)
11551383 {
1156
- return free_pages_prepare(page, 0, false);
1384
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11571385 }
11581386
11591387 static bool bulkfree_pcp_prepare(struct page *page)
11601388 {
1161
- return free_pages_check(page);
1389
+ if (debug_pagealloc_enabled_static())
1390
+ return check_free_page(page);
1391
+ else
1392
+ return false;
1393
+}
1394
+#else
1395
+/*
1396
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1397
+ * moving from pcp lists to free list in order to reduce overhead. With
1398
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1399
+ * to the pcp lists.
1400
+ */
1401
+static bool free_pcp_prepare(struct page *page)
1402
+{
1403
+ if (debug_pagealloc_enabled_static())
1404
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1405
+ else
1406
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1407
+}
1408
+
1409
+static bool bulkfree_pcp_prepare(struct page *page)
1410
+{
1411
+ return check_free_page(page);
11621412 }
11631413 #endif /* CONFIG_DEBUG_VM */
11641414
@@ -1258,7 +1508,7 @@
12581508 if (unlikely(isolated_pageblocks))
12591509 mt = get_pageblock_migratetype(page);
12601510
1261
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1511
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
12621512 trace_mm_page_pcpu_drain(page, 0, mt);
12631513 }
12641514 spin_unlock(&zone->lock);
@@ -1267,14 +1517,14 @@
12671517 static void free_one_page(struct zone *zone,
12681518 struct page *page, unsigned long pfn,
12691519 unsigned int order,
1270
- int migratetype)
1520
+ int migratetype, fpi_t fpi_flags)
12711521 {
12721522 spin_lock(&zone->lock);
12731523 if (unlikely(has_isolate_pageblock(zone) ||
12741524 is_migrate_isolate(migratetype))) {
12751525 migratetype = get_pfnblock_migratetype(page, pfn);
12761526 }
1277
- __free_one_page(page, pfn, zone, order, migratetype);
1527
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
12781528 spin_unlock(&zone->lock);
12791529 }
12801530
....@@ -1348,33 +1598,50 @@
13481598 /* Avoid false-positive PageTail() */
13491599 INIT_LIST_HEAD(&page->lru);
13501600
1351
- SetPageReserved(page);
1601
+ /*
1602
+ * no need for atomic set_bit because the struct
1603
+ * page is not visible yet so nobody should
1604
+ * access it yet.
1605
+ */
1606
+ __SetPageReserved(page);
13521607 }
13531608 }
13541609 }
13551610
1356
-static void __free_pages_ok(struct page *page, unsigned int order)
1611
+static void __free_pages_ok(struct page *page, unsigned int order,
1612
+ fpi_t fpi_flags)
13571613 {
13581614 unsigned long flags;
13591615 int migratetype;
13601616 unsigned long pfn = page_to_pfn(page);
1617
+ bool skip_free_unref_page = false;
13611618
1362
- if (!free_pages_prepare(page, order, true))
1619
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13631620 return;
13641621
13651622 migratetype = get_pfnblock_migratetype(page, pfn);
1623
+ trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
1624
+ if (skip_free_unref_page)
1625
+ return;
1626
+
13661627 local_irq_save(flags);
13671628 __count_vm_events(PGFREE, 1 << order);
1368
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1629
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1630
+ fpi_flags);
13691631 local_irq_restore(flags);
13701632 }
13711633
1372
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1634
+void __free_pages_core(struct page *page, unsigned int order)
13731635 {
13741636 unsigned int nr_pages = 1 << order;
13751637 struct page *p = page;
13761638 unsigned int loop;
13771639
1640
+ /*
1641
+ * When initializing the memmap, __init_single_page() sets the refcount
1642
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1643
+ * refcount of all involved pages to 0.
1644
+ */
13781645 prefetchw(p);
13791646 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
13801647 prefetchw(p + 1);
@@ -1384,15 +1651,43 @@
13841651 __ClearPageReserved(p);
13851652 set_page_count(p, 0);
13861653
1387
- page_zone(page)->managed_pages += nr_pages;
1388
- set_page_refcounted(page);
1389
- __free_pages(page, order);
1654
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1655
+
1656
+ /*
1657
+ * Bypass PCP and place fresh pages right to the tail, primarily
1658
+ * relevant for memory onlining.
1659
+ */
1660
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
13901661 }
13911662
1392
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1393
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1663
+#ifdef CONFIG_NEED_MULTIPLE_NODES
13941664
13951665 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1666
+
1667
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1668
+
1669
+/*
1670
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1671
+ */
1672
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1673
+ struct mminit_pfnnid_cache *state)
1674
+{
1675
+ unsigned long start_pfn, end_pfn;
1676
+ int nid;
1677
+
1678
+ if (state->last_start <= pfn && pfn < state->last_end)
1679
+ return state->last_nid;
1680
+
1681
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1682
+ if (nid != NUMA_NO_NODE) {
1683
+ state->last_start = start_pfn;
1684
+ state->last_end = end_pfn;
1685
+ state->last_nid = nid;
1686
+ }
1687
+
1688
+ return nid;
1689
+}
1690
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
13961691
13971692 int __meminit early_pfn_to_nid(unsigned long pfn)
13981693 {
@@ -1407,48 +1702,14 @@
14071702
14081703 return nid;
14091704 }
1410
-#endif
1705
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14111706
1412
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1413
-static inline bool __meminit __maybe_unused
1414
-meminit_pfn_in_nid(unsigned long pfn, int node,
1415
- struct mminit_pfnnid_cache *state)
1416
-{
1417
- int nid;
1418
-
1419
- nid = __early_pfn_to_nid(pfn, state);
1420
- if (nid >= 0 && nid != node)
1421
- return false;
1422
- return true;
1423
-}
1424
-
1425
-/* Only safe to use early in boot when initialisation is single-threaded */
1426
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1427
-{
1428
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1429
-}
1430
-
1431
-#else
1432
-
1433
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1434
-{
1435
- return true;
1436
-}
1437
-static inline bool __meminit __maybe_unused
1438
-meminit_pfn_in_nid(unsigned long pfn, int node,
1439
- struct mminit_pfnnid_cache *state)
1440
-{
1441
- return true;
1442
-}
1443
-#endif
1444
-
1445
-
1446
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1707
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14471708 unsigned int order)
14481709 {
14491710 if (early_page_uninitialised(pfn))
14501711 return;
1451
- return __free_pages_boot_core(page, order);
1712
+ __free_pages_core(page, order);
14521713 }
14531714
14541715 /*
@@ -1539,14 +1800,14 @@
15391800 if (nr_pages == pageblock_nr_pages &&
15401801 (pfn & (pageblock_nr_pages - 1)) == 0) {
15411802 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1542
- __free_pages_boot_core(page, pageblock_order);
1803
+ __free_pages_core(page, pageblock_order);
15431804 return;
15441805 }
15451806
15461807 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15471808 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15481809 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1549
- __free_pages_boot_core(page, 0);
1810
+ __free_pages_core(page, 0);
15501811 }
15511812 }
15521813
@@ -1569,20 +1830,12 @@
15691830 *
15701831 * Then, we check if a current large page is valid by only checking the validity
15711832 * of the head pfn.
1572
- *
1573
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1574
- * within a node: a pfn is between start and end of a node, but does not belong
1575
- * to this memory node.
15761833 */
1577
-static inline bool __init
1578
-deferred_pfn_valid(int nid, unsigned long pfn,
1579
- struct mminit_pfnnid_cache *nid_init_state)
1834
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
15801835 {
15811836 if (!pfn_valid_within(pfn))
15821837 return false;
15831838 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1584
- return false;
1585
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
15861839 return false;
15871840 return true;
15881841 }
@@ -1591,21 +1844,19 @@
15911844 * Free pages to buddy allocator. Try to free aligned pages in
15921845 * pageblock_nr_pages sizes.
15931846 */
1594
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1847
+static void __init deferred_free_pages(unsigned long pfn,
15951848 unsigned long end_pfn)
15961849 {
1597
- struct mminit_pfnnid_cache nid_init_state = { };
15981850 unsigned long nr_pgmask = pageblock_nr_pages - 1;
15991851 unsigned long nr_free = 0;
16001852
16011853 for (; pfn < end_pfn; pfn++) {
1602
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1854
+ if (!deferred_pfn_valid(pfn)) {
16031855 deferred_free_range(pfn - nr_free, nr_free);
16041856 nr_free = 0;
16051857 } else if (!(pfn & nr_pgmask)) {
16061858 deferred_free_range(pfn - nr_free, nr_free);
16071859 nr_free = 1;
1608
- touch_nmi_watchdog();
16091860 } else {
16101861 nr_free++;
16111862 }
@@ -1619,22 +1870,22 @@
16191870 * by performing it only once every pageblock_nr_pages.
16201871 * Return number of pages initialized.
16211872 */
1622
-static unsigned long __init deferred_init_pages(int nid, int zid,
1873
+static unsigned long __init deferred_init_pages(struct zone *zone,
16231874 unsigned long pfn,
16241875 unsigned long end_pfn)
16251876 {
1626
- struct mminit_pfnnid_cache nid_init_state = { };
16271877 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1878
+ int nid = zone_to_nid(zone);
16281879 unsigned long nr_pages = 0;
1880
+ int zid = zone_idx(zone);
16291881 struct page *page = NULL;
16301882
16311883 for (; pfn < end_pfn; pfn++) {
1632
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1884
+ if (!deferred_pfn_valid(pfn)) {
16331885 page = NULL;
16341886 continue;
16351887 } else if (!page || !(pfn & nr_pgmask)) {
16361888 page = pfn_to_page(pfn);
1637
- touch_nmi_watchdog();
16381889 } else {
16391890 page++;
16401891 }
@@ -1644,18 +1895,127 @@
16441895 return (nr_pages);
16451896 }
16461897
1898
+/*
1899
+ * This function is meant to pre-load the iterator for the zone init.
1900
+ * Specifically it walks through the ranges until we are caught up to the
1901
+ * first_init_pfn value and exits there. If we never encounter the value we
1902
+ * return false indicating there are no valid ranges left.
1903
+ */
1904
+static bool __init
1905
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1906
+ unsigned long *spfn, unsigned long *epfn,
1907
+ unsigned long first_init_pfn)
1908
+{
1909
+ u64 j;
1910
+
1911
+ /*
1912
+ * Start out by walking through the ranges in this zone that have
1913
+ * already been initialized. We don't need to do anything with them
1914
+ * so we just need to flush them out of the system.
1915
+ */
1916
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1917
+ if (*epfn <= first_init_pfn)
1918
+ continue;
1919
+ if (*spfn < first_init_pfn)
1920
+ *spfn = first_init_pfn;
1921
+ *i = j;
1922
+ return true;
1923
+ }
1924
+
1925
+ return false;
1926
+}
1927
+
1928
+/*
1929
+ * Initialize and free pages. We do it in two loops: first we initialize
1930
+ * struct page, then free to buddy allocator, because while we are
1931
+ * freeing pages we can access pages that are ahead (computing buddy
1932
+ * page in __free_one_page()).
1933
+ *
1934
+ * In order to try and keep some memory in the cache we have the loop
1935
+ * broken along max page order boundaries. This way we will not cause
1936
+ * any issues with the buddy page computation.
1937
+ */
1938
+static unsigned long __init
1939
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1940
+ unsigned long *end_pfn)
1941
+{
1942
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1943
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1944
+ unsigned long nr_pages = 0;
1945
+ u64 j = *i;
1946
+
1947
+ /* First we loop through and initialize the page values */
1948
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1949
+ unsigned long t;
1950
+
1951
+ if (mo_pfn <= *start_pfn)
1952
+ break;
1953
+
1954
+ t = min(mo_pfn, *end_pfn);
1955
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1956
+
1957
+ if (mo_pfn < *end_pfn) {
1958
+ *start_pfn = mo_pfn;
1959
+ break;
1960
+ }
1961
+ }
1962
+
1963
+ /* Reset values and now loop through freeing pages as needed */
1964
+ swap(j, *i);
1965
+
1966
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1967
+ unsigned long t;
1968
+
1969
+ if (mo_pfn <= spfn)
1970
+ break;
1971
+
1972
+ t = min(mo_pfn, epfn);
1973
+ deferred_free_pages(spfn, t);
1974
+
1975
+ if (mo_pfn <= epfn)
1976
+ break;
1977
+ }
1978
+
1979
+ return nr_pages;
1980
+}
1981
+
1982
+static void __init
1983
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
1984
+ void *arg)
1985
+{
1986
+ unsigned long spfn, epfn;
1987
+ struct zone *zone = arg;
1988
+ u64 i;
1989
+
1990
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
1991
+
1992
+ /*
1993
+ * Initialize and free pages in MAX_ORDER sized increments so that we
1994
+ * can avoid introducing any issues with the buddy allocator.
1995
+ */
1996
+ while (spfn < end_pfn) {
1997
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
1998
+ cond_resched();
1999
+ }
2000
+}
2001
+
2002
+/* An arch may override for more concurrency. */
2003
+__weak int __init
2004
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2005
+{
2006
+ return 1;
2007
+}
2008
+
16472009 /* Initialise remaining memory on a node */
16482010 static int __init deferred_init_memmap(void *data)
16492011 {
16502012 pg_data_t *pgdat = data;
1651
- int nid = pgdat->node_id;
1652
- unsigned long start = jiffies;
1653
- unsigned long nr_pages = 0;
1654
- unsigned long spfn, epfn, first_init_pfn, flags;
1655
- phys_addr_t spa, epa;
1656
- int zid;
1657
- struct zone *zone;
16582013 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2014
+ unsigned long spfn = 0, epfn = 0;
2015
+ unsigned long first_init_pfn, flags;
2016
+ unsigned long start = jiffies;
2017
+ struct zone *zone;
2018
+ int zid, max_threads;
16592019 u64 i;
16602020
16612021 /* Bind memory initialisation thread to a local node if possible */
....@@ -1688,30 +2048,36 @@
16882048 if (first_init_pfn < zone_end_pfn(zone))
16892049 break;
16902050 }
1691
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
16922051
1693
- /*
1694
- * Initialize and free pages. We do it in two loops: first we initialize
1695
- * struct page, than free to buddy allocator, because while we are
1696
- * freeing pages we can access pages that are ahead (computing buddy
1697
- * page in __free_one_page()).
1698
- */
1699
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1700
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1701
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1702
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1703
- }
1704
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1705
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1706
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1707
- deferred_free_pages(nid, zid, spfn, epfn);
1708
- }
2052
+ /* If the zone is empty somebody else may have cleared out the zone */
2053
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2054
+ first_init_pfn))
2055
+ goto zone_empty;
17092056
2057
+ max_threads = deferred_page_init_max_threads(cpumask);
2058
+
2059
+ while (spfn < epfn) {
2060
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2061
+ struct padata_mt_job job = {
2062
+ .thread_fn = deferred_init_memmap_chunk,
2063
+ .fn_arg = zone,
2064
+ .start = spfn,
2065
+ .size = epfn_align - spfn,
2066
+ .align = PAGES_PER_SECTION,
2067
+ .min_chunk = PAGES_PER_SECTION,
2068
+ .max_threads = max_threads,
2069
+ };
2070
+
2071
+ padata_do_multithreaded(&job);
2072
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2073
+ epfn_align);
2074
+ }
2075
+zone_empty:
17102076 /* Sanity check that the next zone really is unpopulated */
17112077 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17122078
1713
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1714
- jiffies_to_msecs(jiffies - start));
2079
+ pr_info("node %d deferred pages initialised in %ums\n",
2080
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17152081
17162082 pgdat_init_report_one_done();
17172083 return 0;
....@@ -1735,14 +2101,11 @@
17352101 static noinline bool __init
17362102 deferred_grow_zone(struct zone *zone, unsigned int order)
17372103 {
1738
- int zid = zone_idx(zone);
1739
- int nid = zone_to_nid(zone);
1740
- pg_data_t *pgdat = NODE_DATA(nid);
17412104 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1742
- unsigned long nr_pages = 0;
1743
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2105
+ pg_data_t *pgdat = zone->zone_pgdat;
17442106 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1745
- phys_addr_t spa, epa;
2107
+ unsigned long spfn, epfn, flags;
2108
+ unsigned long nr_pages = 0;
17462109 u64 i;
17472110
17482111 /* Only the last zone may have deferred pages */
....@@ -1760,38 +2123,37 @@
17602123 return true;
17612124 }
17622125
1763
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1764
-
1765
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2126
+ /* If the zone is empty somebody else may have cleared out the zone */
2127
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2128
+ first_deferred_pfn)) {
2129
+ pgdat->first_deferred_pfn = ULONG_MAX;
17662130 pgdat_resize_unlock(pgdat, &flags);
1767
- return false;
2131
+ /* Retry only once. */
2132
+ return first_deferred_pfn != ULONG_MAX;
17682133 }
17692134
1770
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1771
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1772
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2135
+ /*
2136
+ * Initialize and free pages in MAX_ORDER sized increments so
2137
+ * that we can avoid introducing any issues with the buddy
2138
+ * allocator.
2139
+ */
2140
+ while (spfn < epfn) {
2141
+ /* update our first deferred PFN for this section */
2142
+ first_deferred_pfn = spfn;
17732143
1774
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1775
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1776
- first_deferred_pfn = min(t, epfn);
1777
- nr_pages += deferred_init_pages(nid, zid, spfn,
1778
- first_deferred_pfn);
1779
- spfn = first_deferred_pfn;
1780
- }
2144
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2145
+ touch_nmi_watchdog();
17812146
2147
+ /* We should only stop along section boundaries */
2148
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2149
+ continue;
2150
+
2151
+ /* If our quota has been met we can stop here */
17822152 if (nr_pages >= nr_pages_needed)
17832153 break;
17842154 }
17852155
1786
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1787
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1788
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1789
- deferred_free_pages(nid, zid, spfn, epfn);
1790
-
1791
- if (first_deferred_pfn == epfn)
1792
- break;
1793
- }
1794
- pgdat->first_deferred_pfn = first_deferred_pfn;
2156
+ pgdat->first_deferred_pfn = spfn;
17952157 pgdat_resize_unlock(pgdat, &flags);
17962158
17972159 return nr_pages > 0;
....@@ -1814,9 +2176,9 @@
18142176 void __init page_alloc_init_late(void)
18152177 {
18162178 struct zone *zone;
2179
+ int nid;
18172180
18182181 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1819
- int nid;
18202182
18212183 /* There will be num_node_state(N_MEMORY) threads */
18222184 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1844,10 +2206,12 @@
18442206 /* Reinit limits that are based on free pages after the kernel is up */
18452207 files_maxfiles_init();
18462208 #endif
1847
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2209
+
18482210 /* Discard memblock private memory */
18492211 memblock_discard();
1850
-#endif
2212
+
2213
+ for_each_node_state(nid, N_MEMORY)
2214
+ shuffle_free_memory(NODE_DATA(nid));
18512215
18522216 for_each_populated_zone(zone)
18532217 set_zone_contiguous(zone);
....@@ -1881,6 +2245,7 @@
18812245 }
18822246
18832247 adjust_managed_page_count(page, pageblock_nr_pages);
2248
+ page_zone(page)->cma_pages += pageblock_nr_pages;
18842249 }
18852250 #endif
18862251
....@@ -1899,13 +2264,11 @@
18992264 * -- nyc
19002265 */
19012266 static inline void expand(struct zone *zone, struct page *page,
1902
- int low, int high, struct free_area *area,
1903
- int migratetype)
2267
+ int low, int high, int migratetype)
19042268 {
19052269 unsigned long size = 1 << high;
19062270
19072271 while (high > low) {
1908
- area--;
19092272 high--;
19102273 size >>= 1;
19112274 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1919,39 +2282,21 @@
19192282 if (set_page_guard(zone, &page[size], high, migratetype))
19202283 continue;
19212284
1922
- list_add(&page[size].lru, &area->free_list[migratetype]);
1923
- area->nr_free++;
1924
- set_page_order(&page[size], high);
2285
+ add_to_free_list(&page[size], zone, high, migratetype);
2286
+ set_buddy_order(&page[size], high);
19252287 }
19262288 }
19272289
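
To make the split arithmetic in expand() concrete: serving an order-0 request out of an order-3 block returns the upper halves to the free lists as one order-2, one order-1 and one order-0 buddy. A standalone sketch of the same loop (userspace illustration only, not kernel code):

#include <stdio.h>

/* Mirrors expand()'s while loop: report which buddies go back on free lists. */
static void sketch_expand(unsigned int low, unsigned int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put %lu pages back at order %u\n", size, high);
	}
}

int main(void)
{
	sketch_expand(0, 3);	/* prints 4 pages @ order 2, 2 @ order 1, 1 @ order 0 */
	return 0;
}
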
19282290 static void check_new_page_bad(struct page *page)
19292291 {
1930
- const char *bad_reason = NULL;
1931
- unsigned long bad_flags = 0;
1932
-
1933
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1934
- bad_reason = "nonzero mapcount";
1935
- if (unlikely(page->mapping != NULL))
1936
- bad_reason = "non-NULL mapping";
1937
- if (unlikely(page_ref_count(page) != 0))
1938
- bad_reason = "nonzero _count";
19392292 if (unlikely(page->flags & __PG_HWPOISON)) {
1940
- bad_reason = "HWPoisoned (hardware-corrupted)";
1941
- bad_flags = __PG_HWPOISON;
19422293 /* Don't complain about hwpoisoned pages */
19432294 page_mapcount_reset(page); /* remove PageBuddy */
19442295 return;
19452296 }
1946
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1947
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1948
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1949
- }
1950
-#ifdef CONFIG_MEMCG
1951
- if (unlikely(page->mem_cgroup))
1952
- bad_reason = "page still charged to cgroup";
1953
-#endif
1954
- bad_page(page, bad_reason, bad_flags);
2297
+
2298
+ bad_page(page,
2299
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19552300 }
19562301
19572302 /*
....@@ -1967,30 +2312,40 @@
19672312 return 1;
19682313 }
19692314
1970
-static inline bool free_pages_prezeroed(void)
1971
-{
1972
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1973
- page_poisoning_enabled()) || want_init_on_free();
1974
-}
1975
-
19762315 #ifdef CONFIG_DEBUG_VM
1977
-static bool check_pcp_refill(struct page *page)
2316
+/*
2317
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2318
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2319
+ * also checked when pcp lists are refilled from the free lists.
2320
+ */
2321
+static inline bool check_pcp_refill(struct page *page)
19782322 {
1979
- return false;
2323
+ if (debug_pagealloc_enabled_static())
2324
+ return check_new_page(page);
2325
+ else
2326
+ return false;
19802327 }
19812328
1982
-static bool check_new_pcp(struct page *page)
2329
+static inline bool check_new_pcp(struct page *page)
19832330 {
19842331 return check_new_page(page);
19852332 }
19862333 #else
1987
-static bool check_pcp_refill(struct page *page)
2334
+/*
2335
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2336
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2337
+ * enabled, they are also checked when being allocated from the pcp lists.
2338
+ */
2339
+static inline bool check_pcp_refill(struct page *page)
19882340 {
19892341 return check_new_page(page);
19902342 }
1991
-static bool check_new_pcp(struct page *page)
2343
+static inline bool check_new_pcp(struct page *page)
19922344 {
1993
- return false;
2345
+ if (debug_pagealloc_enabled_static())
2346
+ return check_new_page(page);
2347
+ else
2348
+ return false;
19942349 }
19952350 #endif /* CONFIG_DEBUG_VM */
19962351
....@@ -2014,9 +2369,31 @@
20142369 set_page_refcounted(page);
20152370
20162371 arch_alloc_page(page, order);
2017
- kernel_map_pages(page, 1 << order, 1);
2018
- kasan_alloc_pages(page, order);
2019
- kernel_poison_pages(page, 1 << order, 1);
2372
+ debug_pagealloc_map_pages(page, 1 << order);
2373
+
2374
+ /*
2375
+ * Page unpoisoning must happen before memory initialization.
2376
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2377
+ * allocations and the page unpoisoning code will complain.
2378
+ */
2379
+ kernel_unpoison_pages(page, 1 << order);
2380
+
2381
+ /*
2382
+ * As memory initialization might be integrated into KASAN,
2383
+ * kasan_alloc_pages and kernel_init_free_pages must be
2384
+ * kept together to avoid discrepancies in behavior.
2385
+ */
2386
+ if (kasan_has_integrated_init()) {
2387
+ kasan_alloc_pages(page, order, gfp_flags);
2388
+ } else {
2389
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2390
+
2391
+ kasan_unpoison_pages(page, order, init);
2392
+ if (init)
2393
+ kernel_init_free_pages(page, 1 << order,
2394
+ gfp_flags & __GFP_ZEROTAGS);
2395
+ }
2396
+
20202397 set_page_owner(page, order, gfp_flags);
20212398 }
20222399
....@@ -2024,9 +2401,6 @@
20242401 unsigned int alloc_flags)
20252402 {
20262403 post_alloc_hook(page, order, gfp_flags);
2027
-
2028
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2029
- kernel_init_free_pages(page, 1 << order);
20302404
20312405 if (order && (gfp_flags & __GFP_COMP))
20322406 prep_compound_page(page, order);
....@@ -2041,6 +2415,7 @@
20412415 set_page_pfmemalloc(page);
20422416 else
20432417 clear_page_pfmemalloc(page);
2418
+ trace_android_vh_test_clear_look_around_ref(page);
20442419 }
20452420
20462421 /*
....@@ -2058,14 +2433,11 @@
20582433 /* Find a page of the appropriate size in the preferred list */
20592434 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20602435 area = &(zone->free_area[current_order]);
2061
- page = list_first_entry_or_null(&area->free_list[migratetype],
2062
- struct page, lru);
2436
+ page = get_page_from_free_area(area, migratetype);
20632437 if (!page)
20642438 continue;
2065
- list_del(&page->lru);
2066
- rmv_page_order(page);
2067
- area->nr_free--;
2068
- expand(zone, page, order, current_order, area, migratetype);
2439
+ del_page_from_free_list(page, zone, current_order);
2440
+ expand(zone, page, order, current_order, migratetype);
20692441 set_pcppage_migratetype(page, migratetype);
20702442 return page;
20712443 }
....@@ -2078,10 +2450,10 @@
20782450 * This array describes the order lists are fallen back to when
20792451 * the free lists for the desirable migrate type are depleted
20802452 */
2081
-static int fallbacks[MIGRATE_TYPES][4] = {
2453
+static int fallbacks[MIGRATE_TYPES][3] = {
20822454 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2083
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20842455 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2456
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20852457 #ifdef CONFIG_CMA
20862458 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
20872459 #endif
....@@ -2102,7 +2474,7 @@
21022474 #endif
21032475
21042476 /*
2105
- * Move the free pages in a range to the free lists of the requested type.
2477
+ * Move the free pages in a range to the freelist tail of the requested type.
21062478 * Note that start_page and end_pages are not aligned on a pageblock
21072479 * boundary. If alignment is required, use move_freepages_block()
21082480 */
....@@ -2114,30 +2486,11 @@
21142486 unsigned int order;
21152487 int pages_moved = 0;
21162488
2117
-#ifndef CONFIG_HOLES_IN_ZONE
2118
- /*
2119
- * page_zone is not safe to call in this context when
2120
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2121
- * anyway as we check zone boundaries in move_freepages_block().
2122
- * Remove at a later date when no bug reports exist related to
2123
- * grouping pages by mobility
2124
- */
2125
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2126
- pfn_valid(page_to_pfn(end_page)) &&
2127
- page_zone(start_page) != page_zone(end_page));
2128
-#endif
2129
-
2130
- if (num_movable)
2131
- *num_movable = 0;
2132
-
21332489 for (page = start_page; page <= end_page;) {
21342490 if (!pfn_valid_within(page_to_pfn(page))) {
21352491 page++;
21362492 continue;
21372493 }
2138
-
2139
- /* Make sure we are not inadvertently changing nodes */
2140
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21412494
21422495 if (!PageBuddy(page)) {
21432496 /*
....@@ -2153,9 +2506,12 @@
21532506 continue;
21542507 }
21552508
2156
- order = page_order(page);
2157
- list_move(&page->lru,
2158
- &zone->free_area[order].free_list[migratetype]);
2509
+ /* Make sure we are not inadvertently changing nodes */
2510
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2511
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2512
+
2513
+ order = buddy_order(page);
2514
+ move_to_free_list(page, zone, order, migratetype);
21592515 page += 1 << order;
21602516 pages_moved += 1 << order;
21612517 }
....@@ -2168,6 +2524,9 @@
21682524 {
21692525 unsigned long start_pfn, end_pfn;
21702526 struct page *start_page, *end_page;
2527
+
2528
+ if (num_movable)
2529
+ *num_movable = 0;
21712530
21722531 start_pfn = page_to_pfn(page);
21732532 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2229,6 +2588,43 @@
22292588 return false;
22302589 }
22312590
2591
+static inline bool boost_watermark(struct zone *zone)
2592
+{
2593
+ unsigned long max_boost;
2594
+
2595
+ if (!watermark_boost_factor)
2596
+ return false;
2597
+ /*
2598
+ * Don't bother in zones that are unlikely to produce results.
2599
+ * On small machines, including kdump capture kernels running
2600
+ * in a small area, boosting the watermark can cause an out of
2601
+ * memory situation immediately.
2602
+ */
2603
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2604
+ return false;
2605
+
2606
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2607
+ watermark_boost_factor, 10000);
2608
+
2609
+ /*
2610
+ * high watermark may be uninitialised if fragmentation occurs
2611
+ * very early in boot so do not boost. We do not fall
2612
+ * through and boost by pageblock_nr_pages as failing
2613
+ * allocations that early means that reclaim is not going
2614
+ * to help and it may even be impossible to reclaim the
2615
+ * boosted watermark resulting in a hang.
2616
+ */
2617
+ if (!max_boost)
2618
+ return false;
2619
+
2620
+ max_boost = max(pageblock_nr_pages, max_boost);
2621
+
2622
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2623
+ max_boost);
2624
+
2625
+ return true;
2626
+}
2627
+
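
A worked example of the arithmetic in boost_watermark(), using hypothetical numbers (a zone with WMARK_HIGH of 10000 pages, the default watermark_boost_factor of 15000, and 512-page pageblocks): max_boost works out to 10000 * 15000 / 10000 = 15000 pages, and each fragmentation event raises watermark_boost by one pageblock until that cap is hit. The same sum as a standalone sketch:

#include <stdio.h>

/* Sketch of boost_watermark()'s arithmetic with made-up zone values. */
int main(void)
{
	unsigned long wmark_high = 10000;    /* pages; hypothetical zone */
	unsigned long boost_factor = 15000;  /* default watermark_boost_factor */
	unsigned long pageblock_pages = 512; /* typical 4K-page x86-64 pageblock */
	unsigned long max_boost = wmark_high * boost_factor / 10000;
	unsigned long boost = 0;

	if (max_boost < pageblock_pages)
		max_boost = pageblock_pages;

	/* one fallback event: bump by a pageblock, capped at max_boost */
	boost += pageblock_pages;
	if (boost > max_boost)
		boost = max_boost;

	printf("max_boost=%lu pages, boost after one event=%lu pages\n",
	       max_boost, boost);
	return 0;
}
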
22322628 /*
22332629 * This function implements actual steal behaviour. If order is large enough,
22342630 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2238,10 +2634,9 @@
22382634 * itself, so pages freed in the future will be put on the correct free list.
22392635 */
22402636 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2241
- int start_type, bool whole_block)
2637
+ unsigned int alloc_flags, int start_type, bool whole_block)
22422638 {
2243
- unsigned int current_order = page_order(page);
2244
- struct free_area *area;
2639
+ unsigned int current_order = buddy_order(page);
22452640 int free_pages, movable_pages, alike_pages;
22462641 int old_block_type;
22472642
....@@ -2259,6 +2654,14 @@
22592654 change_pageblock_range(page, current_order, start_type);
22602655 goto single_page;
22612656 }
2657
+
2658
+ /*
2659
+ * Boost watermarks to increase reclaim pressure to reduce the
2660
+ * likelihood of future fallbacks. Wake kswapd now as the node
2661
+ * may be balanced overall and kswapd will not wake naturally.
2662
+ */
2663
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2664
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22622665
22632666 /* We are not allowed to try stealing from the whole block */
22642667 if (!whole_block)
....@@ -2303,8 +2706,7 @@
23032706 return;
23042707
23052708 single_page:
2306
- area = &zone->free_area[current_order];
2307
- list_move(&page->lru, &area->free_list[start_type]);
2709
+ move_to_free_list(page, zone, current_order, start_type);
23082710 }
23092711
23102712 /*
....@@ -2328,7 +2730,7 @@
23282730 if (fallback_mt == MIGRATE_TYPES)
23292731 break;
23302732
2331
- if (list_empty(&area->free_list[fallback_mt]))
2733
+ if (free_area_empty(area, fallback_mt))
23322734 continue;
23332735
23342736 if (can_steal_fallback(order, migratetype))
....@@ -2358,7 +2760,7 @@
23582760 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23592761 * Check is race-prone but harmless.
23602762 */
2361
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2763
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23622764 if (zone->nr_reserved_highatomic >= max_managed)
23632765 return;
23642766
....@@ -2400,8 +2802,9 @@
24002802 struct page *page;
24012803 int order;
24022804 bool ret;
2805
+ bool skip_unreserve_highatomic = false;
24032806
2404
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2807
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24052808 ac->nodemask) {
24062809 /*
24072810 * Preserve at least one pageblock unless memory pressure
....@@ -2411,13 +2814,16 @@
24112814 pageblock_nr_pages)
24122815 continue;
24132816
2817
+ trace_android_vh_unreserve_highatomic_bypass(force, zone,
2818
+ &skip_unreserve_highatomic);
2819
+ if (skip_unreserve_highatomic)
2820
+ continue;
2821
+
24142822 spin_lock_irqsave(&zone->lock, flags);
24152823 for (order = 0; order < MAX_ORDER; order++) {
24162824 struct free_area *area = &(zone->free_area[order]);
24172825
2418
- page = list_first_entry_or_null(
2419
- &area->free_list[MIGRATE_HIGHATOMIC],
2420
- struct page, lru);
2826
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24212827 if (!page)
24222828 continue;
24232829
....@@ -2475,20 +2881,30 @@
24752881 * condition simpler.
24762882 */
24772883 static __always_inline bool
2478
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2884
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2885
+ unsigned int alloc_flags)
24792886 {
24802887 struct free_area *area;
24812888 int current_order;
2889
+ int min_order = order;
24822890 struct page *page;
24832891 int fallback_mt;
24842892 bool can_steal;
2893
+
2894
+ /*
2895
+ * Do not steal pages from freelists belonging to other pageblocks
2896
+ * i.e. orders < pageblock_order. If there are no local zones free,
2897
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2898
+ */
2899
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2900
+ min_order = pageblock_order;
24852901
24862902 /*
24872903 * Find the largest available free page in the other list. This roughly
24882904 * approximates finding the pageblock with the most free pages, which
24892905 * would be too costly to do exactly.
24902906 */
2491
- for (current_order = MAX_ORDER - 1; current_order >= order;
2907
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
24922908 --current_order) {
24932909 area = &(zone->free_area[current_order]);
24942910 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2530,10 +2946,10 @@
25302946 VM_BUG_ON(current_order == MAX_ORDER);
25312947
25322948 do_steal:
2533
- page = list_first_entry(&area->free_list[fallback_mt],
2534
- struct page, lru);
2949
+ page = get_page_from_free_area(area, fallback_mt);
25352950
2536
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2951
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2952
+ can_steal);
25372953
25382954 trace_mm_page_alloc_extfrag(page, order, current_order,
25392955 start_migratetype, fallback_mt);
....@@ -2547,14 +2963,16 @@
25472963 * Call me with the zone->lock already held.
25482964 */
25492965 static __always_inline struct page *
2550
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2966
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2967
+ unsigned int alloc_flags)
25512968 {
25522969 struct page *page;
25532970
25542971 retry:
25552972 page = __rmqueue_smallest(zone, order, migratetype);
25562973
2557
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2974
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2975
+ alloc_flags))
25582976 goto retry;
25592977
25602978 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2562,18 +2980,18 @@
25622980 }
25632981
25642982 #ifdef CONFIG_CMA
2565
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2983
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2984
+ int migratetype,
2985
+ unsigned int alloc_flags)
25662986 {
2567
- struct page *page = 0;
2568
-
2569
- if (IS_ENABLED(CONFIG_CMA))
2570
- if (!zone->cma_alloc)
2571
- page = __rmqueue_cma_fallback(zone, order);
2987
+ struct page *page = __rmqueue_cma_fallback(zone, order);
25722988 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
25732989 return page;
25742990 }
25752991 #else
2576
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2992
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2993
+ int migratetype,
2994
+ unsigned int alloc_flags)
25772995 {
25782996 return NULL;
25792997 }
....@@ -2586,7 +3004,7 @@
25863004 */
25873005 static int rmqueue_bulk(struct zone *zone, unsigned int order,
25883006 unsigned long count, struct list_head *list,
2589
- int migratetype)
3007
+ int migratetype, unsigned int alloc_flags)
25903008 {
25913009 int i, alloced = 0;
25923010
....@@ -2594,15 +3012,11 @@
25943012 for (i = 0; i < count; ++i) {
25953013 struct page *page;
25963014
2597
- /*
2598
- * If migrate type CMA is being requested only try to
2599
- * satisfy the request with CMA pages to try and increase
2600
- * CMA utlization.
2601
- */
26023015 if (is_migrate_cma(migratetype))
2603
- page = __rmqueue_cma(zone, order);
3016
+ page = __rmqueue_cma(zone, order, migratetype,
3017
+ alloc_flags);
26043018 else
2605
- page = __rmqueue(zone, order, migratetype);
3019
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26063020
26073021 if (unlikely(page == NULL))
26083022 break;
....@@ -2645,14 +3059,18 @@
26453059 */
26463060 static struct list_head *get_populated_pcp_list(struct zone *zone,
26473061 unsigned int order, struct per_cpu_pages *pcp,
2648
- int migratetype)
3062
+ int migratetype, unsigned int alloc_flags)
26493063 {
26503064 struct list_head *list = &pcp->lists[migratetype];
26513065
26523066 if (list_empty(list)) {
3067
+ trace_android_vh_rmqueue_bulk_bypass(order, pcp, migratetype, list);
3068
+ if (!list_empty(list))
3069
+ return list;
3070
+
26533071 pcp->count += rmqueue_bulk(zone, order,
26543072 pcp->batch, list,
2655
- migratetype);
3073
+ migratetype, alloc_flags);
26563074
26573075 if (list_empty(list))
26583076 list = NULL;
....@@ -2739,6 +3157,10 @@
27393157
27403158 static void drain_local_pages_wq(struct work_struct *work)
27413159 {
3160
+ struct pcpu_drain *drain;
3161
+
3162
+ drain = container_of(work, struct pcpu_drain, work);
3163
+
27423164 /*
27433165 * drain_all_pages doesn't use proper cpu hotplug protection so
27443166 * we can race with cpu offline when the WQ can move this from
....@@ -2747,7 +3169,7 @@
27473169 * a different one.
27483170 */
27493171 preempt_disable();
2750
- drain_local_pages(NULL);
3172
+ drain_local_pages(drain->zone);
27513173 preempt_enable();
27523174 }
27533175
....@@ -2818,12 +3240,14 @@
28183240 }
28193241
28203242 for_each_cpu(cpu, &cpus_with_pcps) {
2821
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2822
- INIT_WORK(work, drain_local_pages_wq);
2823
- queue_work_on(cpu, mm_percpu_wq, work);
3243
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3244
+
3245
+ drain->zone = zone;
3246
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3247
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28243248 }
28253249 for_each_cpu(cpu, &cpus_with_pcps)
2826
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
3250
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28273251
28283252 mutex_unlock(&pcpu_drain_mutex);
28293253 }
....@@ -2900,6 +3324,7 @@
29003324 struct zone *zone = page_zone(page);
29013325 struct per_cpu_pages *pcp;
29023326 int migratetype;
3327
+ bool pcp_skip_cma_pages = false;
29033328
29043329 migratetype = get_pcppage_migratetype(page);
29053330 __count_vm_event(PGFREE);
....@@ -2912,8 +3337,12 @@
29123337 * excessively into the page allocator
29133338 */
29143339 if (migratetype >= MIGRATE_PCPTYPES) {
2915
- if (unlikely(is_migrate_isolate(migratetype))) {
2916
- free_one_page(zone, page, pfn, 0, migratetype);
3340
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3341
+ &pcp_skip_cma_pages);
3342
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3343
+ pcp_skip_cma_pages) {
3344
+ free_one_page(zone, page, pfn, 0, migratetype,
3345
+ FPI_NONE);
29173346 return;
29183347 }
29193348 migratetype = MIGRATE_MOVABLE;
....@@ -2935,8 +3364,15 @@
29353364 {
29363365 unsigned long flags;
29373366 unsigned long pfn = page_to_pfn(page);
3367
+ int migratetype;
3368
+ bool skip_free_unref_page = false;
29383369
29393370 if (!free_unref_page_prepare(page, pfn))
3371
+ return;
3372
+
3373
+ migratetype = get_pfnblock_migratetype(page, pfn);
3374
+ trace_android_vh_free_unref_page_bypass(page, 0, migratetype, &skip_free_unref_page);
3375
+ if (skip_free_unref_page)
29403376 return;
29413377
29423378 local_irq_save(flags);
....@@ -2999,7 +3435,8 @@
29993435
30003436 for (i = 1; i < (1 << order); i++)
30013437 set_page_refcounted(page + i);
3002
- split_page_owner(page, order);
3438
+ split_page_owner(page, 1 << order);
3439
+ split_page_memcg(page, 1 << order);
30033440 }
30043441 EXPORT_SYMBOL_GPL(split_page);
30053442
....@@ -3021,7 +3458,7 @@
30213458 * watermark, because we already know our high-order page
30223459 * exists.
30233460 */
3024
- watermark = min_wmark_pages(zone) + (1UL << order);
3461
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
30253462 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
30263463 return 0;
30273464
....@@ -3029,9 +3466,8 @@
30293466 }
30303467
30313468 /* Remove page from free list */
3032
- list_del(&page->lru);
3033
- zone->free_area[order].nr_free--;
3034
- rmv_page_order(page);
3469
+
3470
+ del_page_from_free_list(page, zone, order);
30353471
30363472 /*
30373473 * Set the pageblock if the isolated page is at least half of a
....@@ -3050,6 +3486,27 @@
30503486
30513487
30523488 return 1UL << order;
3489
+}
3490
+
3491
+/**
3492
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3493
+ * @page: Page that was isolated
3494
+ * @order: Order of the isolated page
3495
+ * @mt: The page's pageblock's migratetype
3496
+ *
3497
+ * This function is meant to return a page pulled from the free lists via
3498
+ * __isolate_free_page back to the free lists they were pulled from.
3499
+ */
3500
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3501
+{
3502
+ struct zone *zone = page_zone(page);
3503
+
3504
+ /* zone lock should be held when this function is called */
3505
+ lockdep_assert_held(&zone->lock);
3506
+
3507
+ /* Return isolated page to tail of freelist. */
3508
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3509
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
30533510 }
30543511
30553512 /*
....@@ -3081,6 +3538,7 @@
30813538
30823539 /* Remove page from the per-cpu list, caller must protect the list */
30833540 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3541
+ unsigned int alloc_flags,
30843542 struct per_cpu_pages *pcp,
30853543 gfp_t gfp_flags)
30863544 {
....@@ -3090,9 +3548,9 @@
30903548 do {
30913549 /* First try to get CMA pages */
30923550 if (migratetype == MIGRATE_MOVABLE &&
3093
- gfp_flags & __GFP_CMA) {
3551
+ alloc_flags & ALLOC_CMA) {
30943552 list = get_populated_pcp_list(zone, 0, pcp,
3095
- get_cma_migrate_type());
3553
+ get_cma_migrate_type(), alloc_flags);
30963554 }
30973555
30983556 if (list == NULL) {
....@@ -3101,7 +3559,7 @@
31013559 * free CMA pages.
31023560 */
31033561 list = get_populated_pcp_list(zone, 0, pcp,
3104
- migratetype);
3562
+ migratetype, alloc_flags);
31053563 if (unlikely(list == NULL) ||
31063564 unlikely(list_empty(list)))
31073565 return NULL;
....@@ -3117,8 +3575,8 @@
31173575
31183576 /* Lock and remove page from the per-cpu list */
31193577 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3120
- struct zone *zone, unsigned int order,
3121
- gfp_t gfp_flags, int migratetype)
3578
+ struct zone *zone, gfp_t gfp_flags,
3579
+ int migratetype, unsigned int alloc_flags)
31223580 {
31233581 struct per_cpu_pages *pcp;
31243582 struct page *page;
....@@ -3126,10 +3584,10 @@
31263584
31273585 local_irq_save(flags);
31283586 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3129
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3587
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
31303588 gfp_flags);
31313589 if (page) {
3132
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3590
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
31333591 zone_statistics(preferred_zone, zone);
31343592 }
31353593 local_irq_restore(flags);
....@@ -3149,8 +3607,8 @@
31493607 struct page *page;
31503608
31513609 if (likely(order == 0)) {
3152
- page = rmqueue_pcplist(preferred_zone, zone, order,
3153
- gfp_flags, migratetype);
3610
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3611
+ migratetype, alloc_flags);
31543612 goto out;
31553613 }
31563614
....@@ -3163,21 +3621,27 @@
31633621
31643622 do {
31653623 page = NULL;
3166
-
3167
- if (alloc_flags & ALLOC_HARDER) {
3624
+ /*
3625
+ * order-0 request can reach here when the pcplist is skipped
3626
+ * due to non-CMA allocation context. HIGHATOMIC area is
3627
+ * reserved for high-order atomic allocation, so order-0
3628
+ * request should skip it.
3629
+ */
3630
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
31683631 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
31693632 if (page)
31703633 trace_mm_page_alloc_zone_locked(page, order, migratetype);
31713634 }
3172
-
3173
- if (!page && migratetype == MIGRATE_MOVABLE &&
3174
- gfp_flags & __GFP_CMA)
3175
- page = __rmqueue_cma(zone, order);
3176
-
3177
- if (!page)
3178
- page = __rmqueue(zone, order, migratetype);
3635
+ if (!page) {
3636
+ if (migratetype == MIGRATE_MOVABLE &&
3637
+ alloc_flags & ALLOC_CMA)
3638
+ page = __rmqueue_cma(zone, order, migratetype,
3639
+ alloc_flags);
3640
+ if (!page)
3641
+ page = __rmqueue(zone, order, migratetype,
3642
+ alloc_flags);
3643
+ }
31793644 } while (page && check_new_pages(page, order));
3180
-
31813645 spin_unlock(&zone->lock);
31823646 if (!page)
31833647 goto failed;
....@@ -3186,9 +3650,17 @@
31863650
31873651 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
31883652 zone_statistics(preferred_zone, zone);
3653
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3654
+ gfp_flags, alloc_flags, migratetype);
31893655 local_irq_restore(flags);
31903656
31913657 out:
3658
+ /* Separate test+clear to avoid unnecessary atomics */
3659
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3660
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3661
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3662
+ }
3663
+
31923664 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
31933665 return page;
31943666
....@@ -3218,7 +3690,7 @@
32183690 }
32193691 __setup("fail_page_alloc=", setup_fail_page_alloc);
32203692
3221
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3693
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32223694 {
32233695 if (order < fail_page_alloc.min_order)
32243696 return false;
....@@ -3242,24 +3714,14 @@
32423714
32433715 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
32443716 &fail_page_alloc.attr);
3245
- if (IS_ERR(dir))
3246
- return PTR_ERR(dir);
32473717
3248
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3249
- &fail_page_alloc.ignore_gfp_reclaim))
3250
- goto fail;
3251
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3252
- &fail_page_alloc.ignore_gfp_highmem))
3253
- goto fail;
3254
- if (!debugfs_create_u32("min-order", mode, dir,
3255
- &fail_page_alloc.min_order))
3256
- goto fail;
3718
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3719
+ &fail_page_alloc.ignore_gfp_reclaim);
3720
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3721
+ &fail_page_alloc.ignore_gfp_highmem);
3722
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
32573723
32583724 return 0;
3259
-fail:
3260
- debugfs_remove_recursive(dir);
3261
-
3262
- return -ENOMEM;
32633725 }
32643726
32653727 late_initcall(fail_page_alloc_debugfs);
....@@ -3268,12 +3730,41 @@
32683730
32693731 #else /* CONFIG_FAIL_PAGE_ALLOC */
32703732
3271
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3733
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32723734 {
32733735 return false;
32743736 }
32753737
32763738 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3739
+
3740
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+{
3742
+ return __should_fail_alloc_page(gfp_mask, order);
3743
+}
3744
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3745
+
3746
+static inline long __zone_watermark_unusable_free(struct zone *z,
3747
+ unsigned int order, unsigned int alloc_flags)
3748
+{
3749
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3750
+ long unusable_free = (1 << order) - 1;
3751
+
3752
+ /*
3753
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3754
+ * the high-atomic reserves. This will over-estimate the size of the
3755
+ * atomic reserve but it avoids a search.
3756
+ */
3757
+ if (likely(!alloc_harder))
3758
+ unusable_free += z->nr_reserved_highatomic;
3759
+
3760
+#ifdef CONFIG_CMA
3761
+ /* If allocation can't use CMA areas don't use free CMA pages */
3762
+ if (!(alloc_flags & ALLOC_CMA))
3763
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3764
+#endif
3765
+
3766
+ return unusable_free;
3767
+}
32773768
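
As a worked example of the helper above: for an order-3 request that has neither ALLOC_HARDER/ALLOC_OOM nor ALLOC_CMA, the unusable total is (1 << 3) - 1 = 7 pages of the block being carved out, plus the whole high-atomic reserve, plus every free CMA page; the watermark checks below subtract this before comparing against the mark. The same sum as a standalone sketch (all counters are hypothetical):

#include <stdio.h>

/* Sketch of __zone_watermark_unusable_free() with made-up zone counters. */
int main(void)
{
	unsigned int order = 3;
	long nr_reserved_highatomic = 1024; /* hypothetical reserve */
	long nr_free_cma = 8192;            /* hypothetical free CMA pages */
	int alloc_harder = 0;               /* no ALLOC_HARDER/ALLOC_OOM */
	int alloc_cma = 0;                  /* no ALLOC_CMA */
	long unusable = (1L << order) - 1;

	if (!alloc_harder)
		unusable += nr_reserved_highatomic;
	if (!alloc_cma)
		unusable += nr_free_cma;

	printf("unusable_free = %ld pages\n", unusable); /* 7 + 1024 + 8192 = 9223 */
	return 0;
}
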
32783769 /*
32793770 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3282,7 +3773,7 @@
32823773 * to check in the allocation paths if no pages are free.
32833774 */
32843775 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3285
- int classzone_idx, unsigned int alloc_flags,
3776
+ int highest_zoneidx, unsigned int alloc_flags,
32863777 long free_pages)
32873778 {
32883779 long min = mark;
....@@ -3290,19 +3781,12 @@
32903781 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
32913782
32923783 /* free_pages may go negative - that's OK */
3293
- free_pages -= (1 << order) - 1;
3784
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
32943785
32953786 if (alloc_flags & ALLOC_HIGH)
32963787 min -= min / 2;
32973788
3298
- /*
3299
- * If the caller does not have rights to ALLOC_HARDER then subtract
3300
- * the high-atomic reserves. This will over-estimate the size of the
3301
- * atomic reserve but it avoids a search.
3302
- */
3303
- if (likely(!alloc_harder)) {
3304
- free_pages -= z->nr_reserved_highatomic;
3305
- } else {
3789
+ if (unlikely(alloc_harder)) {
33063790 /*
33073791 * OOM victims can try even harder than normal ALLOC_HARDER
33083792 * users on the grounds that it's definitely going to be in
....@@ -3315,19 +3799,12 @@
33153799 min -= min / 4;
33163800 }
33173801
3318
-
3319
-#ifdef CONFIG_CMA
3320
- /* If allocation can't use CMA areas don't use free CMA pages */
3321
- if (!(alloc_flags & ALLOC_CMA))
3322
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3323
-#endif
3324
-
33253802 /*
33263803 * Check watermarks for an order-0 allocation request. If these
33273804 * are not met, then a high-order request also cannot go ahead
33283805 * even if a suitable page happened to be free.
33293806 */
3330
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3807
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
33313808 return false;
33323809
33333810 /* If this is an order-0 request then the watermark is fine */
....@@ -3351,65 +3828,83 @@
33513828 if (mt == MIGRATE_CMA)
33523829 continue;
33533830 #endif
3354
- if (!list_empty(&area->free_list[mt]))
3831
+ if (!free_area_empty(area, mt))
33553832 return true;
33563833 }
33573834
33583835 #ifdef CONFIG_CMA
33593836 if ((alloc_flags & ALLOC_CMA) &&
3360
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3837
+ !free_area_empty(area, MIGRATE_CMA)) {
33613838 return true;
33623839 }
33633840 #endif
3364
- if (alloc_harder &&
3365
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3841
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
33663842 return true;
33673843 }
33683844 return false;
33693845 }
33703846
33713847 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3372
- int classzone_idx, unsigned int alloc_flags)
3848
+ int highest_zoneidx, unsigned int alloc_flags)
33733849 {
3374
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3850
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
33753851 zone_page_state(z, NR_FREE_PAGES));
33763852 }
3853
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
33773854
33783855 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3379
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3856
+ unsigned long mark, int highest_zoneidx,
3857
+ unsigned int alloc_flags, gfp_t gfp_mask)
33803858 {
3381
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3382
- long cma_pages = 0;
3859
+ long free_pages;
33833860
3384
-#ifdef CONFIG_CMA
3385
- /* If allocation can't use CMA areas don't use free CMA pages */
3386
- if (!(alloc_flags & ALLOC_CMA))
3387
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3388
-#endif
3861
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
33893862
33903863 /*
33913864 * Fast check for order-0 only. If this fails then the reserves
3392
- * need to be calculated. There is a corner case where the check
3393
- * passes but only the high-order atomic reserve are free. If
3394
- * the caller is !atomic then it'll uselessly search the free
3395
- * list. That corner case is then slower but it is harmless.
3865
+ * need to be calculated.
33963866 */
3397
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3398
- return true;
3867
+ if (!order) {
3868
+ long usable_free;
3869
+ long reserved;
33993870
3400
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3401
- free_pages);
3871
+ usable_free = free_pages;
3872
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3873
+
3874
+ /* reserved may over estimate high-atomic reserves. */
3875
+ usable_free -= min(usable_free, reserved);
3876
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3877
+ return true;
3878
+ }
3879
+
3880
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3881
+ free_pages))
3882
+ return true;
3883
+ /*
3884
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3885
+ * when checking the min watermark. The min watermark is the
3886
+ * point where boosting is ignored so that kswapd is woken up
3887
+ * when below the low watermark.
3888
+ */
3889
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3890
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3891
+ mark = z->_watermark[WMARK_MIN];
3892
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3893
+ alloc_flags, free_pages);
3894
+ }
3895
+
3896
+ return false;
34023897 }
34033898
34043899 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3405
- unsigned long mark, int classzone_idx)
3900
+ unsigned long mark, int highest_zoneidx)
34063901 {
34073902 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34083903
34093904 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34103905 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34113906
3412
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3907
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34133908 free_pages);
34143909 }
34153910 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3418,7 +3913,7 @@
34183913 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
34193914 {
34203915 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3421
- RECLAIM_DISTANCE;
3916
+ node_reclaim_distance;
34223917 }
34233918 #else /* CONFIG_NUMA */
34243919 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3428,6 +3923,61 @@
34283923 #endif /* CONFIG_NUMA */
34293924
34303925 /*
3926
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3927
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3928
+ * premature use of a lower zone may cause lowmem pressure problems that
3929
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3930
+ * probably too small. It only makes sense to spread allocations to avoid
3931
+ * fragmentation between the Normal and DMA32 zones.
3932
+ */
3933
+static inline unsigned int
3934
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3935
+{
3936
+ unsigned int alloc_flags;
3937
+
3938
+ /*
3939
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3940
+ * to save a branch.
3941
+ */
3942
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3943
+
3944
+#ifdef CONFIG_ZONE_DMA32
3945
+ if (!zone)
3946
+ return alloc_flags;
3947
+
3948
+ if (zone_idx(zone) != ZONE_NORMAL)
3949
+ return alloc_flags;
3950
+
3951
+ /*
3952
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3953
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3954
+ * on UMA that if Normal is populated then so is DMA32.
3955
+ */
3956
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3957
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
3958
+ return alloc_flags;
3959
+
3960
+ alloc_flags |= ALLOC_NOFRAGMENT;
3961
+#endif /* CONFIG_ZONE_DMA32 */
3962
+ return alloc_flags;
3963
+}
3964
+
3965
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
3966
+ unsigned int alloc_flags)
3967
+{
3968
+#ifdef CONFIG_CMA
3969
+ unsigned int pflags = current->flags;
3970
+
3971
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
3972
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
3973
+ gfp_mask & __GFP_CMA)
3974
+ alloc_flags |= ALLOC_CMA;
3975
+
3976
+#endif
3977
+ return alloc_flags;
3978
+}
3979
+
3980
+/*
34313981 * get_page_from_freelist goes through the zonelist trying to allocate
34323982 * a page.
34333983 */
....@@ -3435,16 +3985,20 @@
34353985 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
34363986 const struct alloc_context *ac)
34373987 {
3438
- struct zoneref *z = ac->preferred_zoneref;
3988
+ struct zoneref *z;
34393989 struct zone *zone;
34403990 struct pglist_data *last_pgdat_dirty_limit = NULL;
3991
+ bool no_fallback;
34413992
3993
+retry:
34423994 /*
34433995 * Scan zonelist, looking for a zone with enough free.
34443996 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
34453997 */
3446
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3447
- ac->nodemask) {
3998
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3999
+ z = ac->preferred_zoneref;
4000
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4001
+ ac->nodemask) {
34484002 struct page *page;
34494003 unsigned long mark;
34504004
....@@ -3481,9 +4035,26 @@
34814035 }
34824036 }
34834037
3484
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4038
+ if (no_fallback && nr_online_nodes > 1 &&
4039
+ zone != ac->preferred_zoneref->zone) {
4040
+ int local_nid;
4041
+
4042
+ /*
4043
+ * If moving to a remote node, retry but allow
4044
+ * fragmenting fallbacks. Locality is more important
4045
+ * than fragmentation avoidance.
4046
+ */
4047
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4048
+ if (zone_to_nid(zone) != local_nid) {
4049
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4050
+ goto retry;
4051
+ }
4052
+ }
4053
+
4054
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
34854055 if (!zone_watermark_fast(zone, order, mark,
3486
- ac_classzone_idx(ac), alloc_flags)) {
4056
+ ac->highest_zoneidx, alloc_flags,
4057
+ gfp_mask)) {
34874058 int ret;
34884059
34894060 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3516,7 +4087,7 @@
35164087 default:
35174088 /* did we reclaim enough */
35184089 if (zone_watermark_ok(zone, order, mark,
3519
- ac_classzone_idx(ac), alloc_flags))
4090
+ ac->highest_zoneidx, alloc_flags))
35204091 goto try_this_zone;
35214092
35224093 continue;
....@@ -3548,30 +4119,21 @@
35484119 }
35494120 }
35504121
4122
+ /*
4123
+ * It's possible on a UMA machine to get through all zones that are
4124
+ * fragmented. If avoiding fragmentation, reset and try again.
4125
+ */
4126
+ if (no_fallback) {
4127
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4128
+ goto retry;
4129
+ }
4130
+
35514131 return NULL;
3552
-}
3553
-
3554
-/*
3555
- * Large machines with many possible nodes should not always dump per-node
3556
- * meminfo in irq context.
3557
- */
3558
-static inline bool should_suppress_show_mem(void)
3559
-{
3560
- bool ret = false;
3561
-
3562
-#if NODES_SHIFT > 8
3563
- ret = in_interrupt();
3564
-#endif
3565
- return ret;
35664132 }
35674133
35684134 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
35694135 {
35704136 unsigned int filter = SHOW_MEM_FILTER_NODES;
3571
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3572
-
3573
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3574
- return;
35754137
35764138 /*
35774139 * This documents exceptions given to allocations in certain
....@@ -3592,22 +4154,23 @@
35924154 {
35934155 struct va_format vaf;
35944156 va_list args;
3595
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3596
- DEFAULT_RATELIMIT_BURST);
4157
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
35974158
3598
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4159
+ if ((gfp_mask & __GFP_NOWARN) ||
4160
+ !__ratelimit(&nopage_rs) ||
4161
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
35994162 return;
36004163
36014164 va_start(args, fmt);
36024165 vaf.fmt = fmt;
36034166 vaf.va = &args;
3604
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4167
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36054168 current->comm, &vaf, gfp_mask, &gfp_mask,
36064169 nodemask_pr_args(nodemask));
36074170 va_end(args);
36084171
36094172 cpuset_print_current_mems_allowed();
3610
-
4173
+ pr_cont("\n");
36114174 dump_stack();
36124175 warn_alloc_show_mem(gfp_mask, nodemask);
36134176 }
....@@ -3681,11 +4244,13 @@
36814244 * success so it is time to admit defeat. We will skip the OOM killer
36824245 * because it is very likely that the caller has a more reasonable
36834246 * fallback than shooting a random task.
4247
+ *
4248
+ * The OOM killer may not free memory on a specific node.
36844249 */
3685
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4250
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
36864251 goto out;
36874252 /* The OOM killer does not needlessly kill tasks for lowmem */
3688
- if (ac->high_zoneidx < ZONE_NORMAL)
4253
+ if (ac->highest_zoneidx < ZONE_NORMAL)
36894254 goto out;
36904255 if (pm_suspended_storage())
36914256 goto out;
....@@ -3698,10 +4263,6 @@
36984263 * out_of_memory). Once filesystems are ready to handle allocation
36994264 * failures more gracefully we should just bail out here.
37004265 */
3701
-
3702
- /* The OOM killer may not free memory on a specific node */
3703
- if (gfp_mask & __GFP_THISNODE)
3704
- goto out;
37054266
37064267 /* Exhausted what can be done so it's blame time */
37074268 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3733,7 +4294,7 @@
37334294 unsigned int alloc_flags, const struct alloc_context *ac,
37344295 enum compact_priority prio, enum compact_result *compact_result)
37354296 {
3736
- struct page *page;
4297
+ struct page *page = NULL;
37374298 unsigned long pflags;
37384299 unsigned int noreclaim_flag;
37394300
....@@ -3744,13 +4305,10 @@
37444305 noreclaim_flag = memalloc_noreclaim_save();
37454306
37464307 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3747
- prio);
4308
+ prio, &page);
37484309
37494310 memalloc_noreclaim_restore(noreclaim_flag);
37504311 psi_memstall_leave(&pflags);
3751
-
3752
- if (*compact_result <= COMPACT_INACTIVE)
3753
- return NULL;
37544312
37554313 /*
37564314 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3758,7 +4316,13 @@
37584316 */
37594317 count_vm_event(COMPACTSTALL);
37604318
3761
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4319
+ /* Prep a captured page if available */
4320
+ if (page)
4321
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4322
+
4323
+ /* Try get a page from the freelist if available */
4324
+ if (!page)
4325
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
37624326
37634327 if (page) {
37644328 struct zone *zone = page_zone(page);
....@@ -3807,14 +4371,22 @@
38074371 goto check_priority;
38084372
38094373 /*
3810
- * make sure the compaction wasn't deferred or didn't bail out early
3811
- * due to locks contention before we declare that we should give up.
3812
- * But do not retry if the given zonelist is not suitable for
3813
- * compaction.
4374
+ * compaction was skipped because there are not enough order-0 pages
4375
+ * to work with, so we retry only if it looks like reclaim can help.
38144376 */
3815
- if (compaction_withdrawn(compact_result)) {
4377
+ if (compaction_needs_reclaim(compact_result)) {
38164378 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
38174379 goto out;
4380
+ }
4381
+
4382
+ /*
4383
+ * make sure the compaction wasn't deferred or didn't bail out early
4384
+ * due to locks contention before we declare that we should give up.
4385
+ * But the next retry should use a higher priority if allowed, so
4386
+ * we don't just keep bailing out endlessly.
4387
+ */
4388
+ if (compaction_withdrawn(compact_result)) {
4389
+ goto check_priority;
38184390 }
38194391
38204392 /*
....@@ -3877,10 +4449,10 @@
38774449 * Let's give them a good hope and keep retrying while the order-0
38784450 * watermarks are OK.
38794451 */
3880
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3881
- ac->nodemask) {
4452
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4453
+ ac->highest_zoneidx, ac->nodemask) {
38824454 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3883
- ac_classzone_idx(ac), alloc_flags))
4455
+ ac->highest_zoneidx, alloc_flags))
38844456 return true;
38854457 }
38864458 return false;
....@@ -3938,33 +4510,50 @@
39384510 EXPORT_SYMBOL_GPL(fs_reclaim_release);
39394511 #endif
39404512
4513
+/*
4514
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4515
+ * have been rebuilt so allocations can be retried. Reader side does not lock and
4516
+ * retries the allocation if zonelist changes. Writer side is protected by the
4517
+ * embedded spin_lock.
4518
+ */
4519
+static DEFINE_SEQLOCK(zonelist_update_seq);
4520
+
4521
+static unsigned int zonelist_iter_begin(void)
4522
+{
4523
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4524
+ return read_seqbegin(&zonelist_update_seq);
4525
+
4526
+ return 0;
4527
+}
4528
+
4529
+static unsigned int check_retry_zonelist(unsigned int seq)
4530
+{
4531
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4532
+ return read_seqretry(&zonelist_update_seq, seq);
4533
+
4534
+ return seq;
4535
+}
4536
+
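
The two helpers above are meant to bracket one pass over the zonelists: sample the sequence count first, and if the walk came up empty, retry only when the count shows a concurrent rebuild. A simplified sketch of that reader-side pattern, loosely modelled on how the allocation slowpath is expected to use it (the wrapper function itself is hypothetical):

/* Sketch only: caller-side use of zonelist_iter_begin()/check_retry_zonelist(). */
static struct page *alloc_with_zonelist_retry(gfp_t gfp_mask, unsigned int order,
					      const struct alloc_context *ac)
{
	struct page *page;
	unsigned int cookie;

restart:
	cookie = zonelist_iter_begin();
	page = get_page_from_freelist(gfp_mask, order, ALLOC_WMARK_LOW, ac);
	if (!page && check_retry_zonelist(cookie))
		goto restart;	/* the zonelists were rebuilt underneath us */
	return page;
}
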
39414537 /* Perform direct synchronous page reclaim */
3942
-static int
4538
+static unsigned long
39434539 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
39444540 const struct alloc_context *ac)
39454541 {
3946
- struct reclaim_state reclaim_state;
3947
- int progress;
39484542 unsigned int noreclaim_flag;
3949
- unsigned long pflags;
4543
+ unsigned long progress;
39504544
39514545 cond_resched();
39524546
39534547 /* We now go into synchronous reclaim */
39544548 cpuset_memory_pressure_bump();
3955
- psi_memstall_enter(&pflags);
39564549 fs_reclaim_acquire(gfp_mask);
39574550 noreclaim_flag = memalloc_noreclaim_save();
3958
- reclaim_state.reclaimed_slab = 0;
3959
- current->reclaim_state = &reclaim_state;
39604551
39614552 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
39624553 ac->nodemask);
39634554
3964
- current->reclaim_state = NULL;
39654555 memalloc_noreclaim_restore(noreclaim_flag);
39664556 fs_reclaim_release(gfp_mask);
3967
- psi_memstall_leave(&pflags);
39684557
39694558 cond_resched();
39704559
....@@ -3978,11 +4567,14 @@
39784567 unsigned long *did_some_progress)
39794568 {
39804569 struct page *page = NULL;
4570
+ unsigned long pflags;
39814571 bool drained = false;
4572
+ bool skip_pcp_drain = false;
39824573
4574
+ psi_memstall_enter(&pflags);
39834575 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
39844576 if (unlikely(!(*did_some_progress)))
3985
- return NULL;
4577
+ goto out;
39864578
39874579 retry:
39884580 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -3990,14 +4582,19 @@
39904582 /*
39914583 * If an allocation failed after direct reclaim, it could be because
39924584 * pages are pinned on the per-cpu lists or in high alloc reserves.
3993
- * Shrink them them and try again
4585
+ * Shrink them and try again
39944586 */
39954587 if (!page && !drained) {
39964588 unreserve_highatomic_pageblock(ac, false);
3997
- drain_all_pages(NULL);
4589
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4590
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4591
+ if (!skip_pcp_drain)
4592
+ drain_all_pages(NULL);
39984593 drained = true;
39994594 goto retry;
40004595 }
4596
+out:
4597
+ psi_memstall_leave(&pflags);
40014598
40024599 return page;
40034600 }
....@@ -4008,12 +4605,12 @@
40084605 struct zoneref *z;
40094606 struct zone *zone;
40104607 pg_data_t *last_pgdat = NULL;
4011
- enum zone_type high_zoneidx = ac->high_zoneidx;
4608
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40124609
4013
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4610
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40144611 ac->nodemask) {
40154612 if (last_pgdat != zone->zone_pgdat)
4016
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4613
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
40174614 last_pgdat = zone->zone_pgdat;
40184615 }
40194616 }
....@@ -4023,8 +4620,13 @@
40234620 {
40244621 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
40254622
4026
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4623
+ /*
4624
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4625
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4626
+ * to save two branches.
4627
+ */
40274628 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4629
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
40284630
40294631 /*
40304632 * The caller may dip into page reserves a bit more if the caller
....@@ -4032,7 +4634,8 @@
40324634 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
40334635 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
40344636 */
4035
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4637
+ alloc_flags |= (__force int)
4638
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
40364639
40374640 if (gfp_mask & __GFP_ATOMIC) {
40384641 /*
....@@ -4049,10 +4652,8 @@
40494652 } else if (unlikely(rt_task(current)) && !in_interrupt())
40504653 alloc_flags |= ALLOC_HARDER;
40514654
4052
-#ifdef CONFIG_CMA
4053
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4054
- alloc_flags |= ALLOC_CMA;
4055
-#endif
4655
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4656
+
40564657 return alloc_flags;
40574658 }
40584659
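The two BUILD_BUG_ON()s above are what make the single mask-and-OR legal: the gfp bits and the internal ALLOC_* bits are asserted to have identical values at compile time, so no per-flag conditional is needed. The same idiom in isolation, with made-up flag names and values (nothing below exists in the kernel):

#include <linux/build_bug.h>

#define REQ_URGENT	0x08u	/* hypothetical request-side flag */
#define REQ_BACKGROUND	0x10u

#define CTL_URGENT	0x08u	/* hypothetical internal flag, deliberately equal */
#define CTL_BACKGROUND	0x10u

static unsigned int req_to_ctl_flags(unsigned int req)
{
	/* Compile-time proof that the two encodings really do match. */
	BUILD_BUG_ON(REQ_URGENT != CTL_URGENT);
	BUILD_BUG_ON(REQ_BACKGROUND != CTL_BACKGROUND);

	/* One mask-and-OR instead of two conditional tests. */
	return req & (REQ_URGENT | REQ_BACKGROUND);
}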
....@@ -4115,6 +4716,7 @@
41154716 {
41164717 struct zone *zone;
41174718 struct zoneref *z;
4719
+ bool ret = false;
41184720
41194721 /*
41204722 * Costly allocations might have made a progress but this doesn't mean
....@@ -4141,8 +4743,8 @@
41414743 * request even if all reclaimable pages are considered then we are
41424744 * screwed and have to go OOM.
41434745 */
4144
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4145
- ac->nodemask) {
4746
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4747
+ ac->highest_zoneidx, ac->nodemask) {
41464748 unsigned long available;
41474749 unsigned long reclaimable;
41484750 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4156,7 +4758,7 @@
41564758 * reclaimable pages?
41574759 */
41584760 wmark = __zone_watermark_ok(zone, order, min_wmark,
4159
- ac_classzone_idx(ac), alloc_flags, available);
4761
+ ac->highest_zoneidx, alloc_flags, available);
41604762 trace_reclaim_retry_zone(z, order, reclaimable,
41614763 available, min_wmark, *no_progress_loops, wmark);
41624764 if (wmark) {
....@@ -4178,25 +4780,24 @@
41784780 }
41794781 }
41804782
4181
- /*
4182
- * Memory allocation/reclaim might be called from a WQ
4183
- * context and the current implementation of the WQ
4184
- * concurrency control doesn't recognize that
4185
- * a particular WQ is congested if the worker thread is
4186
- * looping without ever sleeping. Therefore we have to
4187
- * do a short sleep here rather than calling
4188
- * cond_resched().
4189
- */
4190
- if (current->flags & PF_WQ_WORKER)
4191
- schedule_timeout_uninterruptible(1);
4192
- else
4193
- cond_resched();
4194
-
4195
- return true;
4783
+ ret = true;
4784
+ goto out;
41964785 }
41974786 }
41984787
4199
- return false;
4788
+out:
4789
+ /*
4790
+ * Memory allocation/reclaim might be called from a WQ context and the
4791
+ * current implementation of the WQ concurrency control doesn't
4792
+ * recognize that a particular WQ is congested if the worker thread is
4793
+ * looping without ever sleeping. Therefore we have to do a short sleep
4794
+ * here rather than calling cond_resched().
4795
+ */
4796
+ if (current->flags & PF_WQ_WORKER)
4797
+ schedule_timeout_uninterruptible(1);
4798
+ else
4799
+ cond_resched();
4800
+ return ret;
42004801 }
42014802
42024803 static inline bool
....@@ -4246,8 +4847,12 @@
42464847 int compaction_retries;
42474848 int no_progress_loops;
42484849 unsigned int cpuset_mems_cookie;
4850
+ unsigned int zonelist_iter_cookie;
42494851 int reserve_flags;
4852
+ unsigned long vh_record;
4853
+ bool should_alloc_retry = false;
42504854
4855
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
42514856 /*
42524857 * We also sanity check to catch abuse of atomic reserves being used by
42534858 * callers that are not in atomic context.
....@@ -4256,11 +4861,12 @@
42564861 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
42574862 gfp_mask &= ~__GFP_ATOMIC;
42584863
4259
-retry_cpuset:
4864
+restart:
42604865 compaction_retries = 0;
42614866 no_progress_loops = 0;
42624867 compact_priority = DEF_COMPACT_PRIORITY;
42634868 cpuset_mems_cookie = read_mems_allowed_begin();
4869
+ zonelist_iter_cookie = zonelist_iter_begin();
42644870
42654871 /*
42664872 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4276,11 +4882,11 @@
42764882 * could end up iterating over non-eligible zones endlessly.
42774883 */
42784884 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4279
- ac->high_zoneidx, ac->nodemask);
4885
+ ac->highest_zoneidx, ac->nodemask);
42804886 if (!ac->preferred_zoneref->zone)
42814887 goto nopage;
42824888
4283
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4889
+ if (alloc_flags & ALLOC_KSWAPD)
42844890 wake_all_kswapds(order, gfp_mask, ac);
42854891
42864892 /*
....@@ -4313,18 +4919,28 @@
43134919
43144920 /*
43154921 * Checks for costly allocations with __GFP_NORETRY, which
4316
- * includes THP page fault allocations
4922
+ * includes some THP page fault allocations
43174923 */
43184924 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
43194925 /*
4320
- * If compaction is deferred for high-order allocations,
4321
- * it is because sync compaction recently failed. If
4322
- * this is the case and the caller requested a THP
4323
- * allocation, we do not want to heavily disrupt the
4324
- * system, so we fail the allocation instead of entering
4325
- * direct reclaim.
4926
+ * If allocating entire pageblock(s) and compaction
4927
+ * failed because all zones are below low watermarks
4928
+ * or is prohibited because it recently failed at this
4929
+ * order, fail immediately unless the allocator has
4930
+ * requested compaction and reclaim retry.
4931
+ *
4932
+ * Reclaim is
4933
+ * - potentially very expensive because zones are far
4934
+ * below their low watermarks or this is part of very
4935
+ * bursty high order allocations,
4936
+ * - not guaranteed to help because isolate_freepages()
4937
+ * may not iterate over freed pages as part of its
4938
+ * linear scan, and
4939
+ * - unlikely to make entire pageblocks free on its
4940
+ * own.
43264941 */
4327
- if (compact_result == COMPACT_DEFERRED)
4942
+ if (compact_result == COMPACT_SKIPPED ||
4943
+ compact_result == COMPACT_DEFERRED)
43284944 goto nopage;
43294945
43304946 /*
....@@ -4338,12 +4954,12 @@
43384954
43394955 retry:
43404956 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4341
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4957
+ if (alloc_flags & ALLOC_KSWAPD)
43424958 wake_all_kswapds(order, gfp_mask, ac);
43434959
43444960 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
43454961 if (reserve_flags)
4346
- alloc_flags = reserve_flags;
4962
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
43474963
43484964 /*
43494965 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4353,7 +4969,7 @@
43534969 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
43544970 ac->nodemask = NULL;
43554971 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4356
- ac->high_zoneidx, ac->nodemask);
4972
+ ac->highest_zoneidx, ac->nodemask);
43574973 }
43584974
43594975 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4368,6 +4984,18 @@
43684984 /* Avoid recursion of direct reclaim */
43694985 if (current->flags & PF_MEMALLOC)
43704986 goto nopage;
4987
+
4988
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
4989
+ alloc_flags, ac->migratetype, &page);
4990
+
4991
+ if (page)
4992
+ goto got_pg;
4993
+
4994
+ trace_android_vh_should_alloc_pages_retry(gfp_mask, order,
4995
+ &alloc_flags, ac->migratetype, ac->preferred_zoneref->zone,
4996
+ &page, &should_alloc_retry);
4997
+ if (should_alloc_retry)
4998
+ goto retry;
43714999
43725000 /* Try direct reclaim and then allocating */
43735001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4409,9 +5037,13 @@
44095037 goto retry;
44105038
44115039
4412
- /* Deal with possible cpuset update races before we start OOM killing */
4413
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4414
- goto retry_cpuset;
5040
+ /*
5041
+ * Deal with possible cpuset update races or zonelist updates to avoid
5042
+ * an unnecessary OOM kill.
5043
+ */
5044
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5045
+ check_retry_zonelist(zonelist_iter_cookie))
5046
+ goto restart;
44155047
44165048 /* Reclaim has failed us, start killing things */
44175049 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4420,7 +5052,7 @@
44205052
44215053 /* Avoid allocations with no watermarks from looping endlessly */
44225054 if (tsk_is_oom_victim(current) &&
4423
- (alloc_flags == ALLOC_OOM ||
5055
+ (alloc_flags & ALLOC_OOM ||
44245056 (gfp_mask & __GFP_NOMEMALLOC)))
44255057 goto nopage;
44265058
....@@ -4431,9 +5063,13 @@
44315063 }
44325064
44335065 nopage:
4434
- /* Deal with possible cpuset update races before we fail */
4435
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4436
- goto retry_cpuset;
5066
+ /*
5067
+ * Deal with possible cpuset update races or zonelist updates to avoid
5068
+ * an unnecessary OOM kill.
5069
+ */
5070
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5071
+ check_retry_zonelist(zonelist_iter_cookie))
5072
+ goto restart;
44375073
44385074 /*
44395075 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4476,9 +5112,15 @@
44765112 goto retry;
44775113 }
44785114 fail:
5115
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5116
+ alloc_flags, ac->migratetype, &page);
5117
+ if (page)
5118
+ goto got_pg;
5119
+
44795120 warn_alloc(gfp_mask, ac->nodemask,
44805121 "page allocation failure: order:%u", order);
44815122 got_pg:
5123
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
44825124 return page;
44835125 }
44845126
....@@ -4487,14 +5129,18 @@
44875129 struct alloc_context *ac, gfp_t *alloc_mask,
44885130 unsigned int *alloc_flags)
44895131 {
4490
- ac->high_zoneidx = gfp_zone(gfp_mask);
5132
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
44915133 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
44925134 ac->nodemask = nodemask;
4493
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5135
+ ac->migratetype = gfp_migratetype(gfp_mask);
44945136
44955137 if (cpusets_enabled()) {
44965138 *alloc_mask |= __GFP_HARDWALL;
4497
- if (!ac->nodemask)
5139
+ /*
5140
+ * When we are in interrupt context, the cpuset of the current
5141
+ * task is irrelevant, so any node is ok.
5142
+ */
5143
+ if (!in_interrupt() && !ac->nodemask)
44985144 ac->nodemask = &cpuset_current_mems_allowed;
44995145 else
45005146 *alloc_flags |= ALLOC_CPUSET;
....@@ -4508,15 +5154,8 @@
45085154 if (should_fail_alloc_page(gfp_mask, order))
45095155 return false;
45105156
4511
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4512
- *alloc_flags |= ALLOC_CMA;
5157
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45135158
4514
- return true;
4515
-}
4516
-
4517
-/* Determine whether to spread dirty pages and what the first usable zone */
4518
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4519
-{
45205159 /* Dirty zone balancing only done in the fast path */
45215160 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
45225161
....@@ -4526,7 +5165,9 @@
45265165 * may get reset for allocations that ignore memory policies.
45275166 */
45285167 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4529
- ac->high_zoneidx, ac->nodemask);
5168
+ ac->highest_zoneidx, ac->nodemask);
5169
+
5170
+ return true;
45305171 }
45315172
45325173 /*
....@@ -4555,7 +5196,11 @@
45555196 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
45565197 return NULL;
45575198
4558
- finalise_ac(gfp_mask, &ac);
5199
+ /*
5200
+ * Forbid the first pass from falling back to types that fragment
5201
+ * memory until all local zones are considered.
5202
+ */
5203
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
45595204
45605205 /* First allocation attempt */
45615206 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4575,14 +5220,13 @@
45755220 * Restore the original nodemask if it was potentially replaced with
45765221 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
45775222 */
4578
- if (unlikely(ac.nodemask != nodemask))
4579
- ac.nodemask = nodemask;
5223
+ ac.nodemask = nodemask;
45805224
45815225 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
45825226
45835227 out:
45845228 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4585
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5229
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
45865230 __free_pages(page, order);
45875231 page = NULL;
45885232 }
....@@ -4620,13 +5264,20 @@
46205264 if (order == 0) /* Via pcp? */
46215265 free_unref_page(page);
46225266 else
4623
- __free_pages_ok(page, order);
5267
+ __free_pages_ok(page, order, FPI_NONE);
46245268 }
46255269
46265270 void __free_pages(struct page *page, unsigned int order)
46275271 {
5272
+ /* get PageHead before we drop reference */
5273
+ int head = PageHead(page);
5274
+
5275
+ trace_android_vh_free_pages(page, order);
46285276 if (put_page_testzero(page))
46295277 free_the_page(page, order);
5278
+ else if (!head)
5279
+ while (order-- > 0)
5280
+ free_the_page(page + (1 << order), order);
46305281 }
46315282 EXPORT_SYMBOL(__free_pages);
46325283
....@@ -4731,6 +5382,18 @@
47315382 /* reset page count bias and offset to start of new frag */
47325383 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
47335384 offset = size - fragsz;
5385
+ if (unlikely(offset < 0)) {
5386
+ /*
5387
+ * The caller is trying to allocate a fragment
5388
+ * with fragsz > PAGE_SIZE but the cache isn't big
5389
+ * enough to satisfy the request; this may
5390
+ * happen in low memory conditions.
5391
+ * We don't release the cache page because
5392
+ * that could make memory pressure worse,
5393
+ * so we simply return NULL here.
5394
+ */
5395
+ return NULL;
5396
+ }
47345397 }
47355398
47365399 nc->pagecnt_bias--;
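With this hunk, a page_frag_alloc() caller that asks for fragsz > PAGE_SIZE has to be ready for a NULL return even though the cache page itself was refilled. A minimal caller-side sketch, assuming a hypothetical driver-owned cache (rx_frag_cache and rx_alloc_buffer() are made up):

#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/string.h>

static struct page_frag_cache rx_frag_cache;

static void *rx_alloc_buffer(unsigned int fragsz)
{
	void *buf = page_frag_alloc(&rx_frag_cache, fragsz, GFP_ATOMIC);

	/* May be NULL under memory pressure when fragsz > PAGE_SIZE. */
	if (!buf)
		return NULL;

	memset(buf, 0, fragsz);	/* use the fragment; freed later via page_frag_free(buf) */
	return buf;
}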
....@@ -4771,7 +5434,7 @@
47715434 /**
47725435 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
47735436 * @size: the number of bytes to allocate
4774
- * @gfp_mask: GFP flags for the allocation
5437
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
47755438 *
47765439 * This function is similar to alloc_pages(), except that it allocates the
47775440 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4780,11 +5443,16 @@
47805443 * This function is also limited by MAX_ORDER.
47815444 *
47825445 * Memory allocated by this function must be released by free_pages_exact().
5446
+ *
5447
+ * Return: pointer to the allocated area or %NULL in case of error.
47835448 */
47845449 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
47855450 {
47865451 unsigned int order = get_order(size);
47875452 unsigned long addr;
5453
+
5454
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5455
+ gfp_mask &= ~__GFP_COMP;
47885456
47895457 addr = __get_free_pages(gfp_mask, order);
47905458 return make_alloc_exact(addr, order, size);
....@@ -4796,15 +5464,22 @@
47965464 * pages on a node.
47975465 * @nid: the preferred node ID where memory should be allocated
47985466 * @size: the number of bytes to allocate
4799
- * @gfp_mask: GFP flags for the allocation
5467
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48005468 *
48015469 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48025470 * back.
5471
+ *
5472
+ * Return: pointer to the allocated area or %NULL in case of error.
48035473 */
48045474 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48055475 {
48065476 unsigned int order = get_order(size);
4807
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5477
+ struct page *p;
5478
+
5479
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5480
+ gfp_mask &= ~__GFP_COMP;
5481
+
5482
+ p = alloc_pages_node(nid, gfp_mask, order);
48085483 if (!p)
48095484 return NULL;
48105485 return make_alloc_exact((unsigned long)page_address(p), order, size);
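The updated kernel-doc for these two helpers pins down the contract: no __GFP_COMP in the mask, a %NULL return on failure, and release strictly through free_pages_exact(). A small usage sketch under those rules (the scratch-buffer helpers and the 100 KiB size are made up):

#include <linux/gfp.h>

#define SCRATCH_BYTES	(100 * 1024)

static void *alloc_scratch(void)
{
	/* Plain GFP_KERNEL - adding __GFP_COMP would now trigger the WARN above. */
	void *buf = alloc_pages_exact(SCRATCH_BYTES, GFP_KERNEL | __GFP_ZERO);

	return buf;	/* NULL on failure, per the Return: line */
}

static void free_scratch(void *buf)
{
	if (buf)
		free_pages_exact(buf, SCRATCH_BYTES);	/* not free_pages() */
}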
....@@ -4833,11 +5508,13 @@
48335508 * nr_free_zone_pages - count number of pages beyond high watermark
48345509 * @offset: The zone index of the highest zone
48355510 *
4836
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5511
+ * nr_free_zone_pages() counts the number of pages which are beyond the
48375512 * high watermark within all zones at or below a given zone index. For each
48385513 * zone, the number of pages is calculated as:
48395514 *
48405515 * nr_free_zone_pages = managed_pages - high_pages
5516
+ *
5517
+ * Return: number of pages beyond high watermark.
48415518 */
48425519 static unsigned long nr_free_zone_pages(int offset)
48435520 {
....@@ -4850,7 +5527,7 @@
48505527 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
48515528
48525529 for_each_zone_zonelist(zone, z, zonelist, offset) {
4853
- unsigned long size = zone->managed_pages;
5530
+ unsigned long size = zone_managed_pages(zone);
48545531 unsigned long high = high_wmark_pages(zone);
48555532 if (size > high)
48565533 sum += size - high;
....@@ -4864,23 +5541,15 @@
48645541 *
48655542 * nr_free_buffer_pages() counts the number of pages which are beyond the high
48665543 * watermark within ZONE_DMA and ZONE_NORMAL.
5544
+ *
5545
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5546
+ * ZONE_NORMAL.
48675547 */
48685548 unsigned long nr_free_buffer_pages(void)
48695549 {
48705550 return nr_free_zone_pages(gfp_zone(GFP_USER));
48715551 }
48725552 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4873
-
4874
-/**
4875
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4876
- *
4877
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4878
- * high watermark within all zones.
4879
- */
4880
-unsigned long nr_free_pagecache_pages(void)
4881
-{
4882
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4883
-}
48845553
48855554 static inline void show_node(struct zone *zone)
48865555 {
....@@ -4902,7 +5571,7 @@
49025571 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49035572
49045573 for_each_zone(zone)
4905
- wmark_low += zone->watermark[WMARK_LOW];
5574
+ wmark_low += low_wmark_pages(zone);
49065575
49075576 /*
49085577 * Estimate the amount of memory available for userspace allocations,
....@@ -4924,8 +5593,8 @@
49245593 * items that are in use, and cannot be freed. Cap this estimate at the
49255594 * low watermark.
49265595 */
4927
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
4928
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5596
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5597
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
49295598 available += reclaimable - min(reclaimable / 2, wmark_low);
49305599
49315600 if (available < 0)
....@@ -4936,11 +5605,11 @@
49365605
49375606 void si_meminfo(struct sysinfo *val)
49385607 {
4939
- val->totalram = totalram_pages;
5608
+ val->totalram = totalram_pages();
49405609 val->sharedram = global_node_page_state(NR_SHMEM);
49415610 val->freeram = global_zone_page_state(NR_FREE_PAGES);
49425611 val->bufferram = nr_blockdev_pages();
4943
- val->totalhigh = totalhigh_pages;
5612
+ val->totalhigh = totalhigh_pages();
49445613 val->freehigh = nr_free_highpages();
49455614 val->mem_unit = PAGE_SIZE;
49465615 }
....@@ -4957,7 +5626,7 @@
49575626 pg_data_t *pgdat = NODE_DATA(nid);
49585627
49595628 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4960
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5629
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
49615630 val->totalram = managed_pages;
49625631 val->sharedram = node_page_state(pgdat, NR_SHMEM);
49635632 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -4966,7 +5635,7 @@
49665635 struct zone *zone = &pgdat->node_zones[zone_type];
49675636
49685637 if (is_highmem(zone)) {
4969
- managed_highpages += zone->managed_pages;
5638
+ managed_highpages += zone_managed_pages(zone);
49705639 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
49715640 }
49725641 }
....@@ -5055,7 +5724,7 @@
50555724
50565725 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
50575726 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5058
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5727
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
50595728 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
50605729 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
50615730 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5068,9 +5737,8 @@
50685737 global_node_page_state(NR_UNEVICTABLE),
50695738 global_node_page_state(NR_FILE_DIRTY),
50705739 global_node_page_state(NR_WRITEBACK),
5071
- global_node_page_state(NR_UNSTABLE_NFS),
5072
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5073
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5740
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5741
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
50745742 global_node_page_state(NR_FILE_MAPPED),
50755743 global_node_page_state(NR_SHMEM),
50765744 global_zone_page_state(NR_PAGETABLE),
....@@ -5079,6 +5747,7 @@
50795747 free_pcp,
50805748 global_zone_page_state(NR_FREE_CMA_PAGES));
50815749
5750
+ trace_android_vh_show_mapcount_pages(NULL);
50825751 for_each_online_pgdat(pgdat) {
50835752 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
50845753 continue;
....@@ -5101,7 +5770,10 @@
51015770 " anon_thp: %lukB"
51025771 #endif
51035772 " writeback_tmp:%lukB"
5104
- " unstable:%lukB"
5773
+ " kernel_stack:%lukB"
5774
+#ifdef CONFIG_SHADOW_CALL_STACK
5775
+ " shadow_call_stack:%lukB"
5776
+#endif
51055777 " all_unreclaimable? %s"
51065778 "\n",
51075779 pgdat->node_id,
....@@ -5123,7 +5795,10 @@
51235795 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
51245796 #endif
51255797 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5126
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5798
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5799
+#ifdef CONFIG_SHADOW_CALL_STACK
5800
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5801
+#endif
51275802 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
51285803 "yes" : "no");
51295804 }
....@@ -5145,6 +5820,7 @@
51455820 " min:%lukB"
51465821 " low:%lukB"
51475822 " high:%lukB"
5823
+ " reserved_highatomic:%luKB"
51485824 " active_anon:%lukB"
51495825 " inactive_anon:%lukB"
51505826 " active_file:%lukB"
....@@ -5154,10 +5830,6 @@
51545830 " present:%lukB"
51555831 " managed:%lukB"
51565832 " mlocked:%lukB"
5157
- " kernel_stack:%lukB"
5158
-#ifdef CONFIG_SHADOW_CALL_STACK
5159
- " shadow_call_stack:%lukB"
5160
-#endif
51615833 " pagetables:%lukB"
51625834 " bounce:%lukB"
51635835 " free_pcp:%lukB"
....@@ -5169,6 +5841,7 @@
51695841 K(min_wmark_pages(zone)),
51705842 K(low_wmark_pages(zone)),
51715843 K(high_wmark_pages(zone)),
5844
+ K(zone->nr_reserved_highatomic),
51725845 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
51735846 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
51745847 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5176,12 +5849,8 @@
51765849 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
51775850 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
51785851 K(zone->present_pages),
5179
- K(zone->managed_pages),
5852
+ K(zone_managed_pages(zone)),
51805853 K(zone_page_state(zone, NR_MLOCK)),
5181
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5182
-#ifdef CONFIG_SHADOW_CALL_STACK
5183
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5184
-#endif
51855854 K(zone_page_state(zone, NR_PAGETABLE)),
51865855 K(zone_page_state(zone, NR_BOUNCE)),
51875856 K(free_pcp),
....@@ -5213,7 +5882,7 @@
52135882
52145883 types[order] = 0;
52155884 for (type = 0; type < MIGRATE_TYPES; type++) {
5216
- if (!list_empty(&area->free_list[type]))
5885
+ if (!free_area_empty(area, type))
52175886 types[order] |= 1 << type;
52185887 }
52195888 }
....@@ -5254,7 +5923,7 @@
52545923 do {
52555924 zone_type--;
52565925 zone = pgdat->node_zones + zone_type;
5257
- if (managed_zone(zone)) {
5926
+ if (populated_zone(zone)) {
52585927 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
52595928 check_highest_zone(zone_type);
52605929 }
....@@ -5280,36 +5949,17 @@
52805949 return 0;
52815950 }
52825951
5283
-static __init int setup_numa_zonelist_order(char *s)
5284
-{
5285
- if (!s)
5286
- return 0;
5287
-
5288
- return __parse_numa_zonelist_order(s);
5289
-}
5290
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5291
-
52925952 char numa_zonelist_order[] = "Node";
52935953
52945954 /*
52955955 * sysctl handler for numa_zonelist_order
52965956 */
52975957 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5298
- void __user *buffer, size_t *length,
5299
- loff_t *ppos)
5958
+ void *buffer, size_t *length, loff_t *ppos)
53005959 {
5301
- char *str;
5302
- int ret;
5303
-
5304
- if (!write)
5305
- return proc_dostring(table, write, buffer, length, ppos);
5306
- str = memdup_user_nul(buffer, 16);
5307
- if (IS_ERR(str))
5308
- return PTR_ERR(str);
5309
-
5310
- ret = __parse_numa_zonelist_order(str);
5311
- kfree(str);
5312
- return ret;
5960
+ if (write)
5961
+ return __parse_numa_zonelist_order(buffer);
5962
+ return proc_dostring(table, write, buffer, length, ppos);
53135963 }
53145964
53155965
....@@ -5328,14 +5978,14 @@
53285978 * from each node to each node in the system), and should also prefer nodes
53295979 * with no CPUs, since presumably they'll have very little allocation pressure
53305980 * on them otherwise.
5331
- * It returns -1 if no node is found.
5981
+ *
5982
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
53325983 */
53335984 static int find_next_best_node(int node, nodemask_t *used_node_mask)
53345985 {
53355986 int n, val;
53365987 int min_val = INT_MAX;
53375988 int best_node = NUMA_NO_NODE;
5338
- const struct cpumask *tmp = cpumask_of_node(0);
53395989
53405990 /* Use the local node if we haven't already */
53415991 if (!node_isset(node, *used_node_mask)) {
....@@ -5356,8 +6006,7 @@
53566006 val += (n < node);
53576007
53586008 /* Give preference to headless and unused nodes */
5359
- tmp = cpumask_of_node(n);
5360
- if (!cpumask_empty(tmp))
6009
+ if (!cpumask_empty(cpumask_of_node(n)))
53616010 val += PENALTY_FOR_NODE_WITH_CPUS;
53626011
53636012 /* Slight preference for less loaded node */
....@@ -5428,14 +6077,13 @@
54286077 {
54296078 static int node_order[MAX_NUMNODES];
54306079 int node, load, nr_nodes = 0;
5431
- nodemask_t used_mask;
6080
+ nodemask_t used_mask = NODE_MASK_NONE;
54326081 int local_node, prev_node;
54336082
54346083 /* NUMA-aware ordering of nodes */
54356084 local_node = pgdat->node_id;
54366085 load = nr_online_nodes;
54376086 prev_node = local_node;
5438
- nodes_clear(used_mask);
54396087
54406088 memset(node_order, 0, sizeof(node_order));
54416089 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5542,9 +6190,22 @@
55426190 int nid;
55436191 int __maybe_unused cpu;
55446192 pg_data_t *self = data;
5545
- static DEFINE_SPINLOCK(lock);
6193
+ unsigned long flags;
55466194
5547
- spin_lock(&lock);
6195
+ /*
6196
+ * Explicitly disable this CPU's interrupts before taking seqlock
6197
+ * to prevent any IRQ handler from calling into the page allocator
6198
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
6199
+ */
6200
+ local_irq_save(flags);
6201
+ /*
6202
+ * Explicitly disable this CPU's synchronous printk() before taking
6203
+ * seqlock to prevent any printk() from trying to hold port->lock, for
6204
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
6205
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
6206
+ */
6207
+ printk_deferred_enter();
6208
+ write_seqlock(&zonelist_update_seq);
55486209
55496210 #ifdef CONFIG_NUMA
55506211 memset(node_load, 0, sizeof(node_load));
....@@ -5577,7 +6238,9 @@
55776238 #endif
55786239 }
55796240
5580
- spin_unlock(&lock);
6241
+ write_sequnlock(&zonelist_update_seq);
6242
+ printk_deferred_exit();
6243
+ local_irq_restore(flags);
55816244 }
55826245
55836246 static noinline void __init
....@@ -5615,13 +6278,16 @@
56156278 */
56166279 void __ref build_all_zonelists(pg_data_t *pgdat)
56176280 {
6281
+ unsigned long vm_total_pages;
6282
+
56186283 if (system_state == SYSTEM_BOOTING) {
56196284 build_all_zonelists_init();
56206285 } else {
56216286 __build_all_zonelists(pgdat);
56226287 /* cpuset refresh routine should be here */
56236288 }
5624
- vm_total_pages = nr_free_pagecache_pages();
6289
+ /* Get the number of free pages beyond high watermark in all zones. */
6290
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
56256291 /*
56266292 * Disable grouping by mobility if the number of pages in the
56276293 * system is too low to allow the mechanism to work. It would be
....@@ -5634,7 +6300,7 @@
56346300 else
56356301 page_group_by_mobility_disabled = 0;
56366302
5637
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6303
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
56386304 nr_online_nodes,
56396305 page_group_by_mobility_disabled ? "off" : "on",
56406306 vm_total_pages);
....@@ -5643,81 +6309,148 @@
56436309 #endif
56446310 }
56456311
6312
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6313
+static bool __meminit
6314
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6315
+{
6316
+ static struct memblock_region *r;
6317
+
6318
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6319
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6320
+ for_each_mem_region(r) {
6321
+ if (*pfn < memblock_region_memory_end_pfn(r))
6322
+ break;
6323
+ }
6324
+ }
6325
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6326
+ memblock_is_mirror(r)) {
6327
+ *pfn = memblock_region_memory_end_pfn(r);
6328
+ return true;
6329
+ }
6330
+ }
6331
+ return false;
6332
+}
6333
+
56466334 /*
56476335 * Initially all pages are reserved - free ones are freed
5648
- * up by free_all_bootmem() once the early boot process is
6336
+ * up by memblock_free_all() once the early boot process is
56496337 * done. Non-atomic initialization, single-pass.
6338
+ *
6339
+ * All aligned pageblocks are initialized to the specified migratetype
6340
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6341
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
56506342 */
56516343 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5652
- unsigned long start_pfn, enum meminit_context context,
5653
- struct vmem_altmap *altmap)
6344
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6345
+ enum meminit_context context,
6346
+ struct vmem_altmap *altmap, int migratetype)
56546347 {
5655
- unsigned long end_pfn = start_pfn + size;
5656
- pg_data_t *pgdat = NODE_DATA(nid);
5657
- unsigned long pfn;
5658
- unsigned long nr_initialised = 0;
6348
+ unsigned long pfn, end_pfn = start_pfn + size;
56596349 struct page *page;
5660
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5661
- struct memblock_region *r = NULL, *tmp;
5662
-#endif
56636350
56646351 if (highest_memmap_pfn < end_pfn - 1)
56656352 highest_memmap_pfn = end_pfn - 1;
6353
+
6354
+#ifdef CONFIG_ZONE_DEVICE
6355
+ /*
6356
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6357
+ * memory. We limit the total number of pages to initialize to just
6358
+ * those that might contain the memory mapping. We will defer the
6359
+ * ZONE_DEVICE page initialization until after we have released
6360
+ * the hotplug lock.
6361
+ */
6362
+ if (zone == ZONE_DEVICE) {
6363
+ if (!altmap)
6364
+ return;
6365
+
6366
+ if (start_pfn == altmap->base_pfn)
6367
+ start_pfn += altmap->reserve;
6368
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6369
+ }
6370
+#endif
56666371
56676372 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
56686373 /* Zero all page struct in advance */
56696374 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
56706375 #endif
56716376
5672
- /*
5673
- * Honor reservation requested by the driver for this ZONE_DEVICE
5674
- * memory
5675
- */
5676
- if (altmap && start_pfn == altmap->base_pfn)
5677
- start_pfn += altmap->reserve;
5678
-
5679
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6377
+ for (pfn = start_pfn; pfn < end_pfn; ) {
56806378 /*
56816379 * There can be holes in boot-time mem_map[]s handed to this
56826380 * function. They do not exist on hotplugged memory.
56836381 */
5684
- if (context != MEMINIT_EARLY)
5685
- goto not_early;
5686
-
5687
- if (!early_pfn_valid(pfn))
5688
- continue;
5689
- if (!early_pfn_in_nid(pfn, nid))
5690
- continue;
5691
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5692
- break;
5693
-
5694
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5695
- /*
5696
- * Check given memblock attribute by firmware which can affect
5697
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5698
- * mirrored, it's an overlapped memmap init. skip it.
5699
- */
5700
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5701
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5702
- for_each_memblock(memory, tmp)
5703
- if (pfn < memblock_region_memory_end_pfn(tmp))
5704
- break;
5705
- r = tmp;
5706
- }
5707
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5708
- memblock_is_mirror(r)) {
5709
- /* already initialized as NORMAL */
5710
- pfn = memblock_region_memory_end_pfn(r);
6382
+ if (context == MEMINIT_EARLY) {
6383
+ if (overlap_memmap_init(zone, &pfn))
57116384 continue;
5712
- }
6385
+ if (defer_init(nid, pfn, zone_end_pfn))
6386
+ break;
57136387 }
5714
-#endif
57156388
5716
-not_early:
57176389 page = pfn_to_page(pfn);
57186390 __init_single_page(page, pfn, zone, nid, false);
57196391 if (context == MEMINIT_HOTPLUG)
5720
- SetPageReserved(page);
6392
+ __SetPageReserved(page);
6393
+
6394
+ /*
6395
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6396
+ * such that unmovable allocations won't be scattered all
6397
+ * over the place during system boot.
6398
+ */
6399
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6400
+ set_pageblock_migratetype(page, migratetype);
6401
+ cond_resched();
6402
+ }
6403
+ pfn++;
6404
+ }
6405
+}
6406
+
6407
+#ifdef CONFIG_ZONE_DEVICE
6408
+void __ref memmap_init_zone_device(struct zone *zone,
6409
+ unsigned long start_pfn,
6410
+ unsigned long nr_pages,
6411
+ struct dev_pagemap *pgmap)
6412
+{
6413
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6414
+ struct pglist_data *pgdat = zone->zone_pgdat;
6415
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6416
+ unsigned long zone_idx = zone_idx(zone);
6417
+ unsigned long start = jiffies;
6418
+ int nid = pgdat->node_id;
6419
+
6420
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6421
+ return;
6422
+
6423
+ /*
6424
+ * The call to memmap_init should have already taken care
6425
+ * of the pages reserved for the memmap, so we can just jump to
6426
+ * the end of that region and start processing the device pages.
6427
+ */
6428
+ if (altmap) {
6429
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6430
+ nr_pages = end_pfn - start_pfn;
6431
+ }
6432
+
6433
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6434
+ struct page *page = pfn_to_page(pfn);
6435
+
6436
+ __init_single_page(page, pfn, zone_idx, nid, true);
6437
+
6438
+ /*
6439
+ * Mark page reserved as it will need to wait for onlining
6440
+ * phase for it to be fully associated with a zone.
6441
+ *
6442
+ * We can use the non-atomic __set_bit operation for setting
6443
+ * the flag as we are still initializing the pages.
6444
+ */
6445
+ __SetPageReserved(page);
6446
+
6447
+ /*
6448
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6449
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6450
+ * ever freed or placed on a driver-private list.
6451
+ */
6452
+ page->pgmap = pgmap;
6453
+ page->zone_device_data = NULL;
57216454
57226455 /*
57236456 * Mark the block movable so that blocks are reserved for
....@@ -5726,21 +6459,20 @@
57266459 * the address space during boot when many long-lived
57276460 * kernel allocations are made.
57286461 *
5729
- * bitmap is created for zone's valid pfn range. but memmap
5730
- * can be created for invalid pages (for alignment)
5731
- * check here not to call set_pageblock_migratetype() against
5732
- * pfn out of zone.
5733
- *
57346462 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5735
- * because this is done early in sparse_add_one_section
6463
+ * because this is done early in section_activate()
57366464 */
5737
- if (!(pfn & (pageblock_nr_pages - 1))) {
6465
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
57386466 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
57396467 cond_resched();
57406468 }
57416469 }
6470
+
6471
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6472
+ nr_pages, jiffies_to_msecs(jiffies - start));
57426473 }
57436474
6475
+#endif
57446476 static void __meminit zone_init_free_lists(struct zone *zone)
57456477 {
57466478 unsigned int order, t;
....@@ -5750,11 +6482,118 @@
57506482 }
57516483 }
57526484
5753
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5754
-#define memmap_init(size, nid, zone, start_pfn) \
5755
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5756
- MEMINIT_EARLY, NULL)
6485
+/*
6486
+ * Only struct pages that correspond to ranges defined by memblock.memory
6487
+ * are zeroed and initialized by going through __init_single_page() during
6488
+ * memmap_init_zone_range().
6489
+ *
6490
+ * But, there could be struct pages that correspond to holes in
6491
+ * memblock.memory. This can happen because of the following reasons:
6492
+ * - physical memory bank size is not necessarily the exact multiple of the
6493
+ * arbitrary section size
6494
+ * - early reserved memory may not be listed in memblock.memory
6495
+ * - memory layouts defined with memmap= kernel parameter may not align
6496
+ * nicely with memmap sections
6497
+ *
6498
+ * Explicitly initialize those struct pages so that:
6499
+ * - PG_Reserved is set
6500
+ * - zone and node links point to zone and node that span the page if the
6501
+ * hole is in the middle of a zone
6502
+ * - zone and node links point to adjacent zone/node if the hole falls on
6503
+ * the zone boundary; the pages in such holes will be prepended to the
6504
+ * zone/node above the hole except for the trailing pages in the last
6505
+ * section that will be appended to the zone/node below.
6506
+ */
6507
+static void __init init_unavailable_range(unsigned long spfn,
6508
+ unsigned long epfn,
6509
+ int zone, int node)
6510
+{
6511
+ unsigned long pfn;
6512
+ u64 pgcnt = 0;
6513
+
6514
+ for (pfn = spfn; pfn < epfn; pfn++) {
6515
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6516
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6517
+ + pageblock_nr_pages - 1;
6518
+ continue;
6519
+ }
6520
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6521
+ __SetPageReserved(pfn_to_page(pfn));
6522
+ pgcnt++;
6523
+ }
6524
+
6525
+ if (pgcnt)
6526
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6527
+ node, zone_names[zone], pgcnt);
6528
+}
6529
+
6530
+static void __init memmap_init_zone_range(struct zone *zone,
6531
+ unsigned long start_pfn,
6532
+ unsigned long end_pfn,
6533
+ unsigned long *hole_pfn)
6534
+{
6535
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6536
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6537
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6538
+
6539
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6540
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6541
+
6542
+ if (start_pfn >= end_pfn)
6543
+ return;
6544
+
6545
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6546
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6547
+
6548
+ if (*hole_pfn < start_pfn)
6549
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6550
+
6551
+ *hole_pfn = end_pfn;
6552
+}
6553
+
6554
+void __init __weak memmap_init(void)
6555
+{
6556
+ unsigned long start_pfn, end_pfn;
6557
+ unsigned long hole_pfn = 0;
6558
+ int i, j, zone_id, nid;
6559
+
6560
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6561
+ struct pglist_data *node = NODE_DATA(nid);
6562
+
6563
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6564
+ struct zone *zone = node->node_zones + j;
6565
+
6566
+ if (!populated_zone(zone))
6567
+ continue;
6568
+
6569
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6570
+ &hole_pfn);
6571
+ zone_id = j;
6572
+ }
6573
+ }
6574
+
6575
+#ifdef CONFIG_SPARSEMEM
6576
+ /*
6577
+ * Initialize the memory map for hole in the range [memory_end,
6578
+ * section_end].
6579
+ * Append the pages in this hole to the highest zone in the last
6580
+ * node.
6581
+ * The call to init_unavailable_range() is outside the ifdef to
6582
+ * silence the compiler warining about zone_id set but not used;
6583
+ * for FLATMEM it is a nop anyway
6584
+ */
6585
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6586
+ if (hole_pfn < end_pfn)
57576587 #endif
6588
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6589
+}
6590
+
6591
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6592
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6593
+ unsigned long zone,
6594
+ unsigned long range_start_pfn)
6595
+{
6596
+}
57586597
57596598 static int zone_batchsize(struct zone *zone)
57606599 {
....@@ -5765,7 +6604,7 @@
57656604 * The per-cpu-pages pools are set to around 1000th of the
57666605 * size of the zone.
57676606 */
5768
- batch = zone->managed_pages / 1024;
6607
+ batch = zone_managed_pages(zone) / 1024;
57696608 /* But no more than a meg. */
57706609 if (batch * PAGE_SIZE > 1024 * 1024)
57716610 batch = (1024 * 1024) / PAGE_SIZE;
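As a worked example of the two lines above, assuming 4 KiB pages: a zone with 4 GiB managed is 1,048,576 pages, so batch starts at 1,048,576 / 1024 = 1024 pages; 1024 pages is 4 MiB, which exceeds the 1 MiB cap, so batch is clamped to (1024 * 1024) / 4096 = 256 pages. The remainder of zone_batchsize(), not shown in this hunk, adjusts the value further.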
....@@ -5812,7 +6651,7 @@
58126651 * locking.
58136652 *
58146653 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5815
- * those fields changing asynchronously (acording the the above rule).
6654
+ * those fields changing asynchronously (according to the above rule).
58166655 *
58176656 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
58186657 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5821,6 +6660,7 @@
58216660 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
58226661 unsigned long batch)
58236662 {
6663
+ trace_android_vh_pageset_update(&high, &batch);
58246664 /* start with a fail safe value for batch */
58256665 pcp->batch = 1;
58266666 smp_wmb();
....@@ -5846,7 +6686,6 @@
58466686 memset(p, 0, sizeof(*p));
58476687
58486688 pcp = &p->pcp;
5849
- pcp->count = 0;
58506689 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
58516690 INIT_LIST_HEAD(&pcp->lists[migratetype]);
58526691 }
....@@ -5876,7 +6715,7 @@
58766715 {
58776716 if (percpu_pagelist_fraction)
58786717 pageset_set_high(pcp,
5879
- (zone->managed_pages /
6718
+ (zone_managed_pages(zone) /
58806719 percpu_pagelist_fraction));
58816720 else
58826721 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5906,9 +6745,24 @@
59066745 {
59076746 struct pglist_data *pgdat;
59086747 struct zone *zone;
6748
+ int __maybe_unused cpu;
59096749
59106750 for_each_populated_zone(zone)
59116751 setup_zone_pageset(zone);
6752
+
6753
+#ifdef CONFIG_NUMA
6754
+ /*
6755
+ * Unpopulated zones continue using the boot pagesets.
6756
+ * The numa stats for these pagesets need to be reset.
6757
+ * Otherwise, they will end up skewing the stats of
6758
+ * the nodes these zones are associated with.
6759
+ */
6760
+ for_each_possible_cpu(cpu) {
6761
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6762
+ memset(pcp->vm_numa_stat_diff, 0,
6763
+ sizeof(pcp->vm_numa_stat_diff));
6764
+ }
6765
+#endif
59126766
59136767 for_each_online_pgdat(pgdat)
59146768 pgdat->per_cpu_nodestats =
....@@ -5952,73 +6806,6 @@
59526806 zone->initialized = 1;
59536807 }
59546808
5955
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5956
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5957
-
5958
-/*
5959
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5960
- */
5961
-int __meminit __early_pfn_to_nid(unsigned long pfn,
5962
- struct mminit_pfnnid_cache *state)
5963
-{
5964
- unsigned long start_pfn, end_pfn;
5965
- int nid;
5966
-
5967
- if (state->last_start <= pfn && pfn < state->last_end)
5968
- return state->last_nid;
5969
-
5970
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5971
- if (nid != -1) {
5972
- state->last_start = start_pfn;
5973
- state->last_end = end_pfn;
5974
- state->last_nid = nid;
5975
- }
5976
-
5977
- return nid;
5978
-}
5979
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5980
-
5981
-/**
5982
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5983
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5984
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5985
- *
5986
- * If an architecture guarantees that all ranges registered contain no holes
5987
- * and may be freed, this this function may be used instead of calling
5988
- * memblock_free_early_nid() manually.
5989
- */
5990
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5991
-{
5992
- unsigned long start_pfn, end_pfn;
5993
- int i, this_nid;
5994
-
5995
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5996
- start_pfn = min(start_pfn, max_low_pfn);
5997
- end_pfn = min(end_pfn, max_low_pfn);
5998
-
5999
- if (start_pfn < end_pfn)
6000
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6001
- (end_pfn - start_pfn) << PAGE_SHIFT,
6002
- this_nid);
6003
- }
6004
-}
6005
-
6006
-/**
6007
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6008
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6009
- *
6010
- * If an architecture guarantees that all ranges registered contain no holes and may
6011
- * be freed, this function may be used instead of calling memory_present() manually.
6012
- */
6013
-void __init sparse_memory_present_with_active_regions(int nid)
6014
-{
6015
- unsigned long start_pfn, end_pfn;
6016
- int i, this_nid;
6017
-
6018
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6019
- memory_present(this_nid, start_pfn, end_pfn);
6020
-}
6021
-
60226809 /**
60236810 * get_pfn_range_for_nid - Return the start and end page frames for a node
60246811 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6030,7 +6817,7 @@
60306817 * with no available memory, a warning is printed and the start and end
60316818 * PFNs will be 0.
60326819 */
6033
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6820
+void __init get_pfn_range_for_nid(unsigned int nid,
60346821 unsigned long *start_pfn, unsigned long *end_pfn)
60356822 {
60366823 unsigned long this_start_pfn, this_end_pfn;
....@@ -6079,7 +6866,7 @@
60796866 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
60806867 * zones within a node are in order of monotonically increasing memory addresses
60816868 */
6082
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6869
+static void __init adjust_zone_range_for_zone_movable(int nid,
60836870 unsigned long zone_type,
60846871 unsigned long node_start_pfn,
60856872 unsigned long node_end_pfn,
....@@ -6110,13 +6897,12 @@
61106897 * Return the number of pages a zone spans in a node, including holes
61116898 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61126899 */
6113
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6900
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61146901 unsigned long zone_type,
61156902 unsigned long node_start_pfn,
61166903 unsigned long node_end_pfn,
61176904 unsigned long *zone_start_pfn,
6118
- unsigned long *zone_end_pfn,
6119
- unsigned long *ignored)
6905
+ unsigned long *zone_end_pfn)
61206906 {
61216907 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61226908 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6147,7 +6933,7 @@
61476933 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
61486934 * then all holes in the requested range will be accounted for.
61496935 */
6150
-unsigned long __meminit __absent_pages_in_range(int nid,
6936
+unsigned long __init __absent_pages_in_range(int nid,
61516937 unsigned long range_start_pfn,
61526938 unsigned long range_end_pfn)
61536939 {
....@@ -6168,7 +6954,7 @@
61686954 * @start_pfn: The start PFN to start searching for holes
61696955 * @end_pfn: The end PFN to stop searching for holes
61706956 *
6171
- * It returns the number of pages frames in memory holes within a range.
6957
+ * Return: the number of page frames in memory holes within a range.
61726958 */
61736959 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
61746960 unsigned long end_pfn)
....@@ -6177,11 +6963,10 @@
61776963 }
61786964
61796965 /* Return the number of page frames in holes in a zone on a node */
6180
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6966
+static unsigned long __init zone_absent_pages_in_node(int nid,
61816967 unsigned long zone_type,
61826968 unsigned long node_start_pfn,
6183
- unsigned long node_end_pfn,
6184
- unsigned long *ignored)
6969
+ unsigned long node_end_pfn)
61856970 {
61866971 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61876972 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6209,7 +6994,7 @@
62096994 unsigned long start_pfn, end_pfn;
62106995 struct memblock_region *r;
62116996
6212
- for_each_memblock(memory, r) {
6997
+ for_each_mem_region(r) {
62136998 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62146999 zone_start_pfn, zone_end_pfn);
62157000 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6228,45 +7013,9 @@
62287013 return nr_absent;
62297014 }
62307015
6231
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6232
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6233
- unsigned long zone_type,
6234
- unsigned long node_start_pfn,
6235
- unsigned long node_end_pfn,
6236
- unsigned long *zone_start_pfn,
6237
- unsigned long *zone_end_pfn,
6238
- unsigned long *zones_size)
6239
-{
6240
- unsigned int zone;
6241
-
6242
- *zone_start_pfn = node_start_pfn;
6243
- for (zone = 0; zone < zone_type; zone++)
6244
- *zone_start_pfn += zones_size[zone];
6245
-
6246
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6247
-
6248
- return zones_size[zone_type];
6249
-}
6250
-
6251
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6252
- unsigned long zone_type,
7016
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
62537017 unsigned long node_start_pfn,
6254
- unsigned long node_end_pfn,
6255
- unsigned long *zholes_size)
6256
-{
6257
- if (!zholes_size)
6258
- return 0;
6259
-
6260
- return zholes_size[zone_type];
6261
-}
6262
-
6263
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6264
-
6265
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6266
- unsigned long node_start_pfn,
6267
- unsigned long node_end_pfn,
6268
- unsigned long *zones_size,
6269
- unsigned long *zholes_size)
7018
+ unsigned long node_end_pfn)
62707019 {
62717020 unsigned long realtotalpages = 0, totalpages = 0;
62727021 enum zone_type i;
....@@ -6274,17 +7023,21 @@
62747023 for (i = 0; i < MAX_NR_ZONES; i++) {
62757024 struct zone *zone = pgdat->node_zones + i;
62767025 unsigned long zone_start_pfn, zone_end_pfn;
7026
+ unsigned long spanned, absent;
62777027 unsigned long size, real_size;
62787028
6279
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6280
- node_start_pfn,
6281
- node_end_pfn,
6282
- &zone_start_pfn,
6283
- &zone_end_pfn,
6284
- zones_size);
6285
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6286
- node_start_pfn, node_end_pfn,
6287
- zholes_size);
7029
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7030
+ node_start_pfn,
7031
+ node_end_pfn,
7032
+ &zone_start_pfn,
7033
+ &zone_end_pfn);
7034
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7035
+ node_start_pfn,
7036
+ node_end_pfn);
7037
+
7038
+ size = spanned;
7039
+ real_size = size - absent;
7040
+
62887041 if (size)
62897042 zone->zone_start_pfn = zone_start_pfn;
62907043 else
....@@ -6330,10 +7083,14 @@
63307083 {
63317084 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
63327085 zone->pageblock_flags = NULL;
6333
- if (usemapsize)
7086
+ if (usemapsize) {
63347087 zone->pageblock_flags =
6335
- memblock_virt_alloc_node_nopanic(usemapsize,
6336
- pgdat->node_id);
7088
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7089
+ pgdat->node_id);
7090
+ if (!zone->pageblock_flags)
7091
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7092
+ usemapsize, zone->name, pgdat->node_id);
7093
+ }
63377094 }
63387095 #else
63397096 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6400,9 +7157,11 @@
64007157 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64017158 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64027159 {
6403
- spin_lock_init(&pgdat->split_queue_lock);
6404
- INIT_LIST_HEAD(&pgdat->split_queue);
6405
- pgdat->split_queue_len = 0;
7160
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7161
+
7162
+ spin_lock_init(&ds_queue->split_queue_lock);
7163
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7164
+ ds_queue->split_queue_len = 0;
64067165 }
64077166 #else
64087167 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6429,13 +7188,13 @@
64297188
64307189 pgdat_page_ext_init(pgdat);
64317190 spin_lock_init(&pgdat->lru_lock);
6432
- lruvec_init(node_lruvec(pgdat));
7191
+ lruvec_init(&pgdat->__lruvec);
64337192 }
64347193
64357194 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
64367195 unsigned long remaining_pages)
64377196 {
6438
- zone->managed_pages = remaining_pages;
7197
+ atomic_long_set(&zone->managed_pages, remaining_pages);
64397198 zone_set_nid(zone, nid);
64407199 zone->name = zone_names[idx];
64417200 zone->zone_pgdat = NODE_DATA(nid);
....@@ -6533,7 +7292,7 @@
65337292 set_pageblock_order();
65347293 setup_usemap(pgdat, zone, zone_start_pfn, size);
65357294 init_currently_empty_zone(zone, zone_start_pfn, size);
6536
- memmap_init(size, nid, j, zone_start_pfn);
7295
+ arch_memmap_init(size, nid, j, zone_start_pfn);
65377296 }
65387297 }
65397298
....@@ -6562,7 +7321,11 @@
65627321 end = pgdat_end_pfn(pgdat);
65637322 end = ALIGN(end, MAX_ORDER_NR_PAGES);
65647323 size = (end - start) * sizeof(struct page);
6565
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7324
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7325
+ pgdat->node_id);
7326
+ if (!map)
7327
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7328
+ size, pgdat->node_id);
65667329 pgdat->node_mem_map = map + offset;
65677330 }
65687331 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6574,10 +7337,8 @@
65747337 */
65757338 if (pgdat == NODE_DATA(0)) {
65767339 mem_map = NODE_DATA(0)->node_mem_map;
6577
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
65787340 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
65797341 mem_map -= offset;
6580
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
65817342 }
65827343 #endif
65837344 }
....@@ -6588,42 +7349,31 @@
65887349 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
65897350 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
65907351 {
6591
- /*
6592
- * We start only with one section of pages, more pages are added as
6593
- * needed until the rest of deferred pages are initialized.
6594
- */
6595
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6596
- pgdat->node_spanned_pages);
65977352 pgdat->first_deferred_pfn = ULONG_MAX;
65987353 }
65997354 #else
66007355 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66017356 #endif
66027357
6603
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6604
- unsigned long node_start_pfn,
6605
- unsigned long *zholes_size)
7358
+static void __init free_area_init_node(int nid)
66067359 {
66077360 pg_data_t *pgdat = NODE_DATA(nid);
66087361 unsigned long start_pfn = 0;
66097362 unsigned long end_pfn = 0;
66107363
66117364 /* pg_data_t should be reset to zero when it's allocated */
6612
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7365
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7366
+
7367
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66137368
66147369 pgdat->node_id = nid;
6615
- pgdat->node_start_pfn = node_start_pfn;
7370
+ pgdat->node_start_pfn = start_pfn;
66167371 pgdat->per_cpu_nodestats = NULL;
6617
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6618
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7372
+
66197373 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
66207374 (u64)start_pfn << PAGE_SHIFT,
66217375 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6622
-#else
6623
- start_pfn = node_start_pfn;
6624
-#endif
6625
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6626
- zones_size, zholes_size);
7376
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
66277377
66287378 alloc_node_mem_map(pgdat);
66297379 pgdat_set_deferred_range(pgdat);
....@@ -6631,80 +7381,10 @@
66317381 free_area_init_core(pgdat);
66327382 }
66337383
6634
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6635
-
6636
-/*
6637
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6638
- * pages zeroed
6639
- */
6640
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7384
+void __init free_area_init_memoryless_node(int nid)
66417385 {
6642
- unsigned long pfn;
6643
- u64 pgcnt = 0;
6644
-
6645
- for (pfn = spfn; pfn < epfn; pfn++) {
6646
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6647
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6648
- + pageblock_nr_pages - 1;
6649
- continue;
6650
- }
6651
- mm_zero_struct_page(pfn_to_page(pfn));
6652
- pgcnt++;
6653
- }
6654
-
6655
- return pgcnt;
7386
+ free_area_init_node(nid);
66567387 }
6657
-
6658
-/*
6659
- * Only struct pages that are backed by physical memory are zeroed and
6660
- * initialized by going through __init_single_page(). But, there are some
6661
- * struct pages which are reserved in memblock allocator and their fields
6662
- * may be accessed (for example page_to_pfn() on some configuration accesses
6663
- * flags). We must explicitly zero those struct pages.
6664
- *
6665
- * This function also addresses a similar issue where struct pages are left
6666
- * uninitialized because the physical address range is not covered by
6667
- * memblock.memory or memblock.reserved. That could happen when memblock
6668
- * layout is manually configured via memmap=, or when the highest physical
6669
- * address (max_pfn) does not end on a section boundary.
6670
- */
6671
-void __init zero_resv_unavail(void)
6672
-{
6673
- phys_addr_t start, end;
6674
- u64 i, pgcnt;
6675
- phys_addr_t next = 0;
6676
-
6677
- /*
6678
- * Loop through unavailable ranges not covered by memblock.memory.
6679
- */
6680
- pgcnt = 0;
6681
- for_each_mem_range(i, &memblock.memory, NULL,
6682
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6683
- if (next < start)
6684
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6685
- next = end;
6686
- }
6687
-
6688
- /*
6689
- * Early sections always have a fully populated memmap for the whole
6690
- * section - see pfn_valid(). If the last section has holes at the
6691
- * end and that section is marked "online", the memmap will be
6692
- * considered initialized. Make sure that memmap has a well defined
6693
- * state.
6694
- */
6695
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6696
- round_up(max_pfn, PAGES_PER_SECTION));
6697
-
6698
- /*
6699
- * Struct pages that do not have backing memory. This could be because
6700
- * firmware is using some of this memory, or for some other reasons.
6701
- */
6702
- if (pgcnt)
6703
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6704
-}
6705
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6706
-
6707
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67087388
67097389 #if MAX_NUMNODES > 1
67107390 /*
....@@ -6735,14 +7415,14 @@
67357415 * model has fine enough granularity to avoid incorrect mapping for the
67367416 * populated node map.
67377417 *
6738
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7418
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
67397419 * requirement (single node).
67407420 */
67417421 unsigned long __init node_map_pfn_alignment(void)
67427422 {
67437423 unsigned long accl_mask = 0, last_end = 0;
67447424 unsigned long start, end, mask;
6745
- int last_nid = -1;
7425
+ int last_nid = NUMA_NO_NODE;
67467426 int i, nid;
67477427
67487428 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6769,33 +7449,15 @@
67697449 return ~accl_mask + 1;
67707450 }
67717451
6772
-/* Find the lowest pfn for a node */
6773
-static unsigned long __init find_min_pfn_for_node(int nid)
6774
-{
6775
- unsigned long min_pfn = ULONG_MAX;
6776
- unsigned long start_pfn;
6777
- int i;
6778
-
6779
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6780
- min_pfn = min(min_pfn, start_pfn);
6781
-
6782
- if (min_pfn == ULONG_MAX) {
6783
- pr_warn("Could not find start_pfn for node %d\n", nid);
6784
- return 0;
6785
- }
6786
-
6787
- return min_pfn;
6788
-}
6789
-
67907452 /**
67917453 * find_min_pfn_with_active_regions - Find the minimum PFN registered
67927454 *
6793
- * It returns the minimum PFN based on information provided via
7455
+ * Return: the minimum PFN based on information provided via
67947456 * memblock_set_node().
67957457 */
67967458 unsigned long __init find_min_pfn_with_active_regions(void)
67977459 {
6798
- return find_min_pfn_for_node(MAX_NUMNODES);
7460
+ return PHYS_PFN(memblock_start_of_DRAM());
67997461 }
68007462
68017463 /*
....@@ -6844,11 +7506,11 @@
68447506 * options.
68457507 */
68467508 if (movable_node_is_enabled()) {
6847
- for_each_memblock(memory, r) {
7509
+ for_each_mem_region(r) {
68487510 if (!memblock_is_hotpluggable(r))
68497511 continue;
68507512
6851
- nid = r->nid;
7513
+ nid = memblock_get_region_node(r);
68527514
68537515 usable_startpfn = PFN_DOWN(r->base);
68547516 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6865,11 +7527,11 @@
68657527 if (mirrored_kernelcore) {
68667528 bool mem_below_4gb_not_mirrored = false;
68677529
6868
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
68697531 if (memblock_is_mirror(r))
68707532 continue;
68717533
6872
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
68737535
68747536 usable_startpfn = memblock_region_memory_base_pfn(r);
68757537
....@@ -6884,7 +7546,7 @@
68847546 }
68857547
68867548 if (mem_below_4gb_not_mirrored)
6887
- pr_warn("This configuration results in unmirrored kernel memory.");
7549
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
68887550
68897551 goto out2;
68907552 }
....@@ -7023,9 +7685,16 @@
70237685
70247686 out2:
70257687 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7026
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7688
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7689
+ unsigned long start_pfn, end_pfn;
7690
+
70277691 zone_movable_pfn[nid] =
70287692 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7693
+
7694
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
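+ /* Rounding can push the start of ZONE_MOVABLE past the node's last pfn; such a node gets no movable zone. */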
7695
+ if (zone_movable_pfn[nid] >= end_pfn)
7696
+ zone_movable_pfn[nid] = 0;
7697
+ }
70297698
70307699 out:
70317700 /* restore the node_state */
....@@ -7037,23 +7706,29 @@
70377706 {
70387707 enum zone_type zone_type;
70397708
7040
- if (N_MEMORY == N_NORMAL_MEMORY)
7041
- return;
7042
-
70437709 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
70447710 struct zone *zone = &pgdat->node_zones[zone_type];
70457711 if (populated_zone(zone)) {
7046
- node_set_state(nid, N_HIGH_MEMORY);
7047
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7048
- zone_type <= ZONE_NORMAL)
7712
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7713
+ node_set_state(nid, N_HIGH_MEMORY);
7714
+ if (zone_type <= ZONE_NORMAL)
70497715 node_set_state(nid, N_NORMAL_MEMORY);
70507716 break;
70517717 }
70527718 }
70537719 }
70547720
7721
+/*
7722
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7723
+ * such cases we allow max_zone_pfn to be sorted in descending order.
7724
+ */
7725
+bool __weak arch_has_descending_max_zone_pfns(void)
7726
+{
7727
+ return false;
7728
+}
7729
+
70557730 /**
7056
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7731
+ * free_area_init - Initialise all pg_data_t and zone data
70577732 * @max_zone_pfn: an array of max PFNs for each zone
70587733 *
70597734 * This will call free_area_init_node() for each active node in the system.
....@@ -7065,10 +7740,11 @@
70657740 * starts where the previous one ended. For example, ZONE_DMA32 starts
70667741 * at arch_max_dma_pfn.
70677742 */
7068
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7743
+void __init free_area_init(unsigned long *max_zone_pfn)
70697744 {
70707745 unsigned long start_pfn, end_pfn;
7071
- int i, nid;
7746
+ int i, nid, zone;
7747
+ bool descending;
70727748
70737749 /* Record where the zone boundaries are */
70747750 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7077,14 +7753,20 @@
70777753 sizeof(arch_zone_highest_possible_pfn));
70787754
70797755 start_pfn = find_min_pfn_with_active_regions();
7756
+ descending = arch_has_descending_max_zone_pfns();
70807757
70817758 for (i = 0; i < MAX_NR_ZONES; i++) {
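+ /*
+ * With descending max_zone_pfns the zone index is walked from highest
+ * to lowest, so the pfn boundaries still stack upward from start_pfn.
+ */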
7082
- if (i == ZONE_MOVABLE)
7759
+ if (descending)
7760
+ zone = MAX_NR_ZONES - i - 1;
7761
+ else
7762
+ zone = i;
7763
+
7764
+ if (zone == ZONE_MOVABLE)
70837765 continue;
70847766
7085
- end_pfn = max(max_zone_pfn[i], start_pfn);
7086
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7087
- arch_zone_highest_possible_pfn[i] = end_pfn;
7767
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7768
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7769
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
70887770
70897771 start_pfn = end_pfn;
70907772 }
....@@ -7118,27 +7800,33 @@
71187800 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
71197801 }
71207802
7121
- /* Print out the early node map */
7803
+ /*
7804
+ * Print out the early node map, and initialize the
7805
+ * subsection-map relative to active online memory ranges to
7806
+ * enable future "sub-section" extensions of the memory map.
7807
+ */
71227808 pr_info("Early memory node ranges\n");
7123
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7809
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
71247810 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
71257811 (u64)start_pfn << PAGE_SHIFT,
71267812 ((u64)end_pfn << PAGE_SHIFT) - 1);
7813
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7814
+ }
71277815
71287816 /* Initialise every node */
71297817 mminit_verify_pageflags_layout();
71307818 setup_nr_node_ids();
7131
- zero_resv_unavail();
71327819 for_each_online_node(nid) {
71337820 pg_data_t *pgdat = NODE_DATA(nid);
7134
- free_area_init_node(nid, NULL,
7135
- find_min_pfn_for_node(nid), NULL);
7821
+ free_area_init_node(nid);
71367822
71377823 /* Any memory on that node */
71387824 if (pgdat->node_present_pages)
71397825 node_set_state(nid, N_MEMORY);
71407826 check_for_memory(pgdat, nid);
71417827 }
7828
+
7829
+ memmap_init();
71427830 }
71437831
71447832 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7197,22 +7885,18 @@
71977885 early_param("kernelcore", cmdline_parse_kernelcore);
71987886 early_param("movablecore", cmdline_parse_movablecore);
71997887
7200
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7201
-
72027888 void adjust_managed_page_count(struct page *page, long count)
72037889 {
7204
- spin_lock(&managed_page_count_lock);
7205
- page_zone(page)->managed_pages += count;
7206
- totalram_pages += count;
7890
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7891
+ totalram_pages_add(count);
72077892 #ifdef CONFIG_HIGHMEM
72087893 if (PageHighMem(page))
7209
- totalhigh_pages += count;
7894
+ totalhigh_pages_add(count);
72107895 #endif
7211
- spin_unlock(&managed_page_count_lock);
72127896 }
72137897 EXPORT_SYMBOL(adjust_managed_page_count);
72147898
7215
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7899
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
72167900 {
72177901 void *pos;
72187902 unsigned long pages = 0;
....@@ -7231,6 +7915,11 @@
72317915 * alias for the memset().
72327916 */
72337917 direct_map_addr = page_address(page);
7918
+ /*
7919
+ * Perform a kasan-unchecked memset() since this memory
7920
+ * has not been initialized.
7921
+ */
7922
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
72347923 if ((unsigned int)poison <= 0xFF)
72357924 memset(direct_map_addr, poison, PAGE_SIZE);
72367925
....@@ -7243,15 +7932,14 @@
72437932
72447933 return pages;
72457934 }
7246
-EXPORT_SYMBOL(free_reserved_area);
72477935
72487936 #ifdef CONFIG_HIGHMEM
72497937 void free_highmem_page(struct page *page)
72507938 {
72517939 __free_reserved_page(page);
7252
- totalram_pages++;
7253
- page_zone(page)->managed_pages++;
7254
- totalhigh_pages++;
7940
+ totalram_pages_inc();
7941
+ atomic_long_inc(&page_zone(page)->managed_pages);
7942
+ totalhigh_pages_inc();
72557943 }
72567944 #endif
72577945
....@@ -7278,7 +7966,7 @@
72787966 */
72797967 #define adj_init_size(start, end, size, pos, adj) \
72807968 do { \
7281
- if (start <= pos && pos < end && size > adj) \
7969
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
72827970 size -= adj; \
72837971 } while (0)
72847972
....@@ -7300,10 +7988,10 @@
73007988 physpages << (PAGE_SHIFT - 10),
73017989 codesize >> 10, datasize >> 10, rosize >> 10,
73027990 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7303
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
7991
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73047992 totalcma_pages << (PAGE_SHIFT - 10),
73057993 #ifdef CONFIG_HIGHMEM
7306
- totalhigh_pages << (PAGE_SHIFT - 10),
7994
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73077995 #endif
73087996 str ? ", " : "", str ? str : "");
73097997 }
....@@ -7322,13 +8010,6 @@
73228010 void __init set_dma_reserve(unsigned long new_dma_reserve)
73238011 {
73248012 dma_reserve = new_dma_reserve;
7325
-}
7326
-
7327
-void __init free_area_init(unsigned long *zones_size)
7328
-{
7329
- zero_resv_unavail();
7330
- free_area_init_node(0, zones_size,
7331
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
73328013 }
73338014
73348015 static int page_alloc_cpu_dead(unsigned int cpu)
....@@ -7356,9 +8037,27 @@
73568037 return 0;
73578038 }
73588039
8040
+#ifdef CONFIG_NUMA
8041
+int hashdist = HASHDIST_DEFAULT;
8042
+
8043
+static int __init set_hashdist(char *str)
8044
+{
8045
+ if (!str)
8046
+ return 0;
8047
+ hashdist = simple_strtoul(str, &str, 0);
8048
+ return 1;
8049
+}
8050
+__setup("hashdist=", set_hashdist);
8051
+#endif
8052
+
73598053 void __init page_alloc_init(void)
73608054 {
73618055 int ret;
8056
+
8057
+#ifdef CONFIG_NUMA
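+ /* With a single memory node there is nothing to spread the hash tables over. */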
8058
+ if (num_node_state(N_MEMORY) == 1)
8059
+ hashdist = 0;
8060
+#endif
73628061
73638062 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
73648063 "mm/page_alloc:dead", NULL,
....@@ -7383,6 +8082,7 @@
73838082 for (i = 0; i < MAX_NR_ZONES; i++) {
73848083 struct zone *zone = pgdat->node_zones + i;
73858084 long max = 0;
8085
+ unsigned long managed_pages = zone_managed_pages(zone);
73868086
73878087 /* Find valid and maximum lowmem_reserve in the zone */
73888088 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7393,8 +8093,8 @@
73938093 /* we treat the high watermark as reserved pages. */
73948094 max += high_wmark_pages(zone);
73958095
7396
- if (max > zone->managed_pages)
7397
- max = zone->managed_pages;
8096
+ if (max > managed_pages)
8097
+ max = managed_pages;
73988098
73998099 pgdat->totalreserve_pages += max;
74008100
....@@ -7413,30 +8113,24 @@
74138113 static void setup_per_zone_lowmem_reserve(void)
74148114 {
74158115 struct pglist_data *pgdat;
7416
- enum zone_type j, idx;
8116
+ enum zone_type i, j;
74178117
74188118 for_each_online_pgdat(pgdat) {
7419
- for (j = 0; j < MAX_NR_ZONES; j++) {
7420
- struct zone *zone = pgdat->node_zones + j;
7421
- unsigned long managed_pages = zone->managed_pages;
8119
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8120
+ struct zone *zone = &pgdat->node_zones[i];
8121
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8122
+ bool clear = !ratio || !zone_managed_pages(zone);
8123
+ unsigned long managed_pages = 0;
74228124
7423
- zone->lowmem_reserve[j] = 0;
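+ /*
+ * lowmem_reserve[j] withholds part of zone i from allocations that
+ * could also have been satisfied from the higher zones up to j: the
+ * managed pages of zones (i, j] divided by sysctl_lowmem_reserve_ratio[i].
+ */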
8125
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8126
+ struct zone *upper_zone = &pgdat->node_zones[j];
74248127
7425
- idx = j;
7426
- while (idx) {
7427
- struct zone *lower_zone;
8128
+ managed_pages += zone_managed_pages(upper_zone);
74288129
7429
- idx--;
7430
- lower_zone = pgdat->node_zones + idx;
7431
-
7432
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7433
- sysctl_lowmem_reserve_ratio[idx] = 0;
7434
- lower_zone->lowmem_reserve[j] = 0;
7435
- } else {
7436
- lower_zone->lowmem_reserve[j] =
7437
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7438
- }
7439
- managed_pages += lower_zone->managed_pages;
8130
+ if (clear)
8131
+ zone->lowmem_reserve[j] = 0;
8132
+ else
8133
+ zone->lowmem_reserve[j] = managed_pages / ratio;
74408134 }
74418135 }
74428136 }
....@@ -7456,18 +8150,17 @@
74568150 /* Calculate total number of !ZONE_HIGHMEM pages */
74578151 for_each_zone(zone) {
74588152 if (!is_highmem(zone))
7459
- lowmem_pages += zone->managed_pages;
8153
+ lowmem_pages += zone_managed_pages(zone);
74608154 }
74618155
74628156 for_each_zone(zone) {
7463
- u64 min, low;
8157
+ u64 tmp, low;
74648158
74658159 spin_lock_irqsave(&zone->lock, flags);
7466
- min = (u64)pages_min * zone->managed_pages;
7467
- do_div(min, lowmem_pages);
7468
- low = (u64)pages_low * zone->managed_pages;
7469
- do_div(low, vm_total_pages);
7470
-
8160
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8161
+ do_div(tmp, lowmem_pages);
8162
+ low = (u64)pages_low * zone_managed_pages(zone);
8163
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
74718164 if (is_highmem(zone)) {
74728165 /*
74738166 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7475,20 +8168,20 @@
74758168 * value here.
74768169 *
74778170 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7478
- * deltas control asynch page reclaim, and so should
8171
+ * deltas control async page reclaim, and so should
74798172 * not be capped for highmem.
74808173 */
74818174 unsigned long min_pages;
74828175
7483
- min_pages = zone->managed_pages / 1024;
8176
+ min_pages = zone_managed_pages(zone) / 1024;
74848177 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7485
- zone->watermark[WMARK_MIN] = min_pages;
8178
+ zone->_watermark[WMARK_MIN] = min_pages;
74868179 } else {
74878180 /*
74888181 * If it's a lowmem zone, reserve a number of pages
74898182 * proportionate to the zone's size.
74908183 */
7491
- zone->watermark[WMARK_MIN] = min;
8184
+ zone->_watermark[WMARK_MIN] = tmp;
74928185 }
74938186
74948187 /*
....@@ -7496,14 +8189,13 @@
74968189 * scale factor in proportion to available memory, but
74978190 * ensure a minimum size on small systems.
74988191 */
7499
- min = max_t(u64, min >> 2,
7500
- mult_frac(zone->managed_pages,
8192
+ tmp = max_t(u64, tmp >> 2,
8193
+ mult_frac(zone_managed_pages(zone),
75018194 watermark_scale_factor, 10000));
75028195
7503
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7504
- low + min;
7505
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7506
- low + min * 2;
8196
+ zone->watermark_boost = 0;
8197
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8198
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75078199
75088200 spin_unlock_irqrestore(&zone->lock, flags);
75098201 }
....@@ -7532,7 +8224,7 @@
75328224 * Initialise min_free_kbytes.
75338225 *
75348226 * For small machines we want it small (128k min). For large machines
7535
- * we want it large (64MB max). But it is not linear, because network
8227
+ * we want it large (256MB max). But it is not linear, because network
75368228 * bandwidth does not increase linearly with machine size. We use
75378229 *
75388230 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
....@@ -7564,8 +8256,8 @@
75648256 min_free_kbytes = new_min_free_kbytes;
75658257 if (min_free_kbytes < 128)
75668258 min_free_kbytes = 128;
7567
- if (min_free_kbytes > 65536)
7568
- min_free_kbytes = 65536;
8259
+ if (min_free_kbytes > 262144)
8260
+ min_free_kbytes = 262144;
75698261 } else {
75708262 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
75718263 new_min_free_kbytes, user_min_free_kbytes);
....@@ -7591,7 +8283,7 @@
75918283 * or extra_free_kbytes changes.
75928284 */
75938285 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7594
- void __user *buffer, size_t *length, loff_t *ppos)
8286
+ void *buffer, size_t *length, loff_t *ppos)
75958287 {
75968288 int rc;
75978289
....@@ -7607,7 +8299,7 @@
76078299 }
76088300
76098301 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7610
- void __user *buffer, size_t *length, loff_t *ppos)
8302
+ void *buffer, size_t *length, loff_t *ppos)
76118303 {
76128304 int rc;
76138305
....@@ -7631,13 +8323,13 @@
76318323 pgdat->min_unmapped_pages = 0;
76328324
76338325 for_each_zone(zone)
7634
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7635
- sysctl_min_unmapped_ratio) / 100;
8326
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8327
+ sysctl_min_unmapped_ratio) / 100;
76368328 }
76378329
76388330
76398331 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7640
- void __user *buffer, size_t *length, loff_t *ppos)
8332
+ void *buffer, size_t *length, loff_t *ppos)
76418333 {
76428334 int rc;
76438335
....@@ -7659,12 +8351,12 @@
76598351 pgdat->min_slab_pages = 0;
76608352
76618353 for_each_zone(zone)
7662
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7663
- sysctl_min_slab_ratio) / 100;
8354
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8355
+ sysctl_min_slab_ratio) / 100;
76648356 }
76658357
76668358 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7667
- void __user *buffer, size_t *length, loff_t *ppos)
8359
+ void *buffer, size_t *length, loff_t *ppos)
76688360 {
76698361 int rc;
76708362
....@@ -7688,11 +8380,28 @@
76888380 * if in function of the boot time zone sizes.
76898381 */
76908382 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7691
- void __user *buffer, size_t *length, loff_t *ppos)
8383
+ void *buffer, size_t *length, loff_t *ppos)
76928384 {
8385
+ int i;
8386
+
76938387 proc_dointvec_minmax(table, write, buffer, length, ppos);
8388
+
8389
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8390
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8391
+ sysctl_lowmem_reserve_ratio[i] = 0;
8392
+ }
8393
+
76948394 setup_per_zone_lowmem_reserve();
76958395 return 0;
8396
+}
8397
+
8398
+static void __zone_pcp_update(struct zone *zone)
8399
+{
8400
+ unsigned int cpu;
8401
+
8402
+ for_each_possible_cpu(cpu)
8403
+ pageset_set_high_and_batch(zone,
8404
+ per_cpu_ptr(zone->pageset, cpu));
76968405 }
76978406
76988407 /*
....@@ -7701,7 +8410,7 @@
77018410 * pagelist can have before it gets flushed back to buddy allocator.
77028411 */
77038412 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7704
- void __user *buffer, size_t *length, loff_t *ppos)
8413
+ void *buffer, size_t *length, loff_t *ppos)
77058414 {
77068415 struct zone *zone;
77078416 int old_percpu_pagelist_fraction;
....@@ -7726,30 +8435,12 @@
77268435 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
77278436 goto out;
77288437
7729
- for_each_populated_zone(zone) {
7730
- unsigned int cpu;
7731
-
7732
- for_each_possible_cpu(cpu)
7733
- pageset_set_high_and_batch(zone,
7734
- per_cpu_ptr(zone->pageset, cpu));
7735
- }
8438
+ for_each_populated_zone(zone)
8439
+ __zone_pcp_update(zone);
77368440 out:
77378441 mutex_unlock(&pcp_batch_high_lock);
77388442 return ret;
77398443 }
7740
-
7741
-#ifdef CONFIG_NUMA
7742
-int hashdist = HASHDIST_DEFAULT;
7743
-
7744
-static int __init set_hashdist(char *str)
7745
-{
7746
- if (!str)
7747
- return 0;
7748
- hashdist = simple_strtoul(str, &str, 0);
7749
- return 1;
7750
-}
7751
-__setup("hashdist=", set_hashdist);
7752
-#endif
77538444
77548445 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
77558446 /*
....@@ -7797,6 +8488,7 @@
77978488 unsigned long log2qty, size;
77988489 void *table = NULL;
77998490 gfp_t gfp_flags;
8491
+ bool virt;
78008492
78018493 /* allow the kernel cmdline to have a say */
78028494 if (!numentries) {
....@@ -7853,32 +8545,34 @@
78538545
78548546 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
78558547 do {
8548
+ virt = false;
78568549 size = bucketsize << log2qty;
78578550 if (flags & HASH_EARLY) {
78588551 if (flags & HASH_ZERO)
7859
- table = memblock_virt_alloc_nopanic(size, 0);
8552
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
78608553 else
7861
- table = memblock_virt_alloc_raw(size, 0);
7862
- } else if (hashdist) {
7863
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8554
+ table = memblock_alloc_raw(size,
8555
+ SMP_CACHE_BYTES);
8556
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
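+ /* Tables too large for the page allocator, or hashdist=1, fall back to vmalloc. */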
8557
+ table = __vmalloc(size, gfp_flags);
8558
+ virt = true;
78648559 } else {
78658560 /*
78668561 * If bucketsize is not a power-of-two, we may free
78678562 * some pages at the end of hash table which
78688563 * alloc_pages_exact() automatically does
78698564 */
7870
- if (get_order(size) < MAX_ORDER) {
7871
- table = alloc_pages_exact(size, gfp_flags);
7872
- kmemleak_alloc(table, size, 1, gfp_flags);
7873
- }
8565
+ table = alloc_pages_exact(size, gfp_flags);
8566
+ kmemleak_alloc(table, size, 1, gfp_flags);
78748567 }
78758568 } while (!table && size > PAGE_SIZE && --log2qty);
78768569
78778570 if (!table)
78788571 panic("Failed to allocate %s hash table\n", tablename);
78798572
7880
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7881
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8573
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8574
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8575
+ virt ? "vmalloc" : "linear");
78828576
78838577 if (_hash_shift)
78848578 *_hash_shift = log2qty;
....@@ -7890,47 +8584,50 @@
78908584
78918585 /*
78928586 * This function checks whether pageblock includes unmovable pages or not.
7893
- * If @count is not zero, it is okay to include less @count unmovable pages
78948587 *
78958588 * PageLRU check without isolation or lru_lock could race so that
78968589 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
78978590 * check without lock_page also may miss some movable non-lru pages at
78988591 * race condition. So you can't expect this function should be exact.
8592
+ *
8593
+ * Returns a page without holding a reference. If the caller wants to
8594
+ * dereference that page (e.g., dumping), it has to make sure that it
8595
+ * cannot get removed (e.g., via memory unplug) concurrently.
8596
+ *
78998597 */
7900
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7901
- int migratetype,
7902
- bool skip_hwpoisoned_pages)
8598
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8599
+ int migratetype, int flags)
79038600 {
7904
- unsigned long pfn, iter, found;
8601
+ unsigned long iter = 0;
8602
+ unsigned long pfn = page_to_pfn(page);
8603
+ unsigned long offset = pfn % pageblock_nr_pages;
79058604
7906
- /*
7907
- * TODO we could make this much more efficient by not checking every
7908
- * page in the range if we know all of them are in MOVABLE_ZONE and
7909
- * that the movable zone guarantees that pages are migratable but
7910
- * the later is not the case right now unfortunatelly. E.g. movablecore
7911
- * can still lead to having bootmem allocations in zone_movable.
7912
- */
8605
+ if (is_migrate_cma_page(page)) {
8606
+ /*
8607
+ * CMA allocations (alloc_contig_range) really need to mark
8608
+ * isolate CMA pageblocks even when they are not movable in fact
8609
+ * so consider them movable here.
8610
+ */
8611
+ if (is_migrate_cma(migratetype))
8612
+ return NULL;
79138613
7914
- /*
7915
- * CMA allocations (alloc_contig_range) really need to mark isolate
7916
- * CMA pageblocks even when they are not movable in fact so consider
7917
- * them movable here.
7918
- */
7919
- if (is_migrate_cma(migratetype) &&
7920
- is_migrate_cma(get_pageblock_migratetype(page)))
7921
- return false;
8614
+ return page;
8615
+ }
79228616
7923
- pfn = page_to_pfn(page);
7924
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7925
- unsigned long check = pfn + iter;
7926
-
7927
- if (!pfn_valid_within(check))
8617
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8618
+ if (!pfn_valid_within(pfn + iter))
79288619 continue;
79298620
7930
- page = pfn_to_page(check);
8621
+ page = pfn_to_page(pfn + iter);
79318622
8623
+ /*
8624
+ * Both bootmem allocations and memory holes are marked
8625
+ * PG_reserved and are unmovable. We can even have unmovable
8626
+ * allocations inside ZONE_MOVABLE, for example when
8627
+ * specifying "movablecore".
8628
+ */
79328629 if (PageReserved(page))
7933
- goto unmovable;
8630
+ return page;
79348631
79358632 /*
79368633 * If the zone is movable and we have ruled out all reserved
....@@ -7942,17 +8639,22 @@
79428639
79438640 /*
79448641 * Hugepages are not in LRU lists, but they're movable.
7945
- * We need not scan over tail pages bacause we don't
8642
+ * THPs are on the LRU, but need to be counted as #small pages.
8643
+ * We need not scan over tail pages because we don't
79468644 * handle each tail page individually in migration.
79478645 */
7948
- if (PageHuge(page)) {
8646
+ if (PageHuge(page) || PageTransCompound(page)) {
79498647 struct page *head = compound_head(page);
79508648 unsigned int skip_pages;
79518649
7952
- if (!hugepage_migration_supported(page_hstate(head)))
7953
- goto unmovable;
8650
+ if (PageHuge(page)) {
8651
+ if (!hugepage_migration_supported(page_hstate(head)))
8652
+ return page;
8653
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8654
+ return page;
8655
+ }
79548656
7955
- skip_pages = (1 << compound_order(head)) - (page - head);
8657
+ skip_pages = compound_nr(head) - (page - head);
79568658 iter += skip_pages - 1;
79578659 continue;
79588660 }
....@@ -7965,7 +8667,7 @@
79658667 */
79668668 if (!page_ref_count(page)) {
79678669 if (PageBuddy(page))
7968
- iter += (1 << page_order(page)) - 1;
8670
+ iter += (1 << buddy_order(page)) - 1;
79698671 continue;
79708672 }
79718673
....@@ -7973,61 +8675,100 @@
79738675 * The HWPoisoned page may be not in buddy system, and
79748676 * page_count() is not 0.
79758677 */
7976
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8678
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
79778679 continue;
79788680
7979
- if (__PageMovable(page))
8681
+ /*
8682
+ * We treat all PageOffline() pages as movable when offlining
8683
+ * to give drivers a chance to decrement their reference count
8684
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8685
+ * can be offlined as there are no direct references anymore.
8686
+ * For actually unmovable PageOffline() where the driver does
8687
+ * not support this, we will fail later when trying to actually
8688
+ * move these pages that still have a reference count > 0.
8689
+ * (false negatives in this function only)
8690
+ */
8691
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
79808692 continue;
79818693
7982
- if (!PageLRU(page))
7983
- found++;
8694
+ if (__PageMovable(page) || PageLRU(page))
8695
+ continue;
8696
+
79848697 /*
79858698 * If there are RECLAIMABLE pages, we need to check
79868699 * it. But now, memory offline itself doesn't call
79878700 * shrink_node_slabs() and it still to be fixed.
79888701 */
7989
- /*
7990
- * If the page is not RAM, page_count()should be 0.
7991
- * we don't need more check. This is an _used_ not-movable page.
7992
- *
7993
- * The problematic thing here is PG_reserved pages. PG_reserved
7994
- * is set to both of a memory hole page and a _used_ kernel
7995
- * page at boot.
7996
- */
7997
- if (found > count)
7998
- goto unmovable;
8702
+ return page;
79998703 }
8000
- return false;
8001
-unmovable:
8002
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8003
- return true;
8704
+ return NULL;
80048705 }
80058706
8006
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8007
-
8707
+#ifdef CONFIG_CONTIG_ALLOC
80088708 static unsigned long pfn_max_align_down(unsigned long pfn)
80098709 {
80108710 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80118711 pageblock_nr_pages) - 1);
80128712 }
80138713
8014
-static unsigned long pfn_max_align_up(unsigned long pfn)
8714
+unsigned long pfn_max_align_up(unsigned long pfn)
80158715 {
80168716 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
80178717 pageblock_nr_pages));
80188718 }
80198719
8720
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8721
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8722
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8723
+static void alloc_contig_dump_pages(struct list_head *page_list)
8724
+{
8725
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8726
+
8727
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8728
+ struct page *page;
8729
+ unsigned long nr_skip = 0;
8730
+ unsigned long nr_pages = 0;
8731
+
8732
+ dump_stack();
8733
+ list_for_each_entry(page, page_list, lru) {
8734
+ nr_pages++;
8735
+ /* The page will be freed by putback_movable_pages soon */
8736
+ if (page_count(page) == 1) {
8737
+ nr_skip++;
8738
+ continue;
8739
+ }
8740
+ dump_page(page, "migration failure");
8741
+ }
8742
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8743
+ }
8744
+}
8745
+#else
8746
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8747
+{
8748
+}
8749
+#endif
8750
+
80208751 /* [start, end) must belong to a single zone. */
80218752 static int __alloc_contig_migrate_range(struct compact_control *cc,
8022
- unsigned long start, unsigned long end)
8753
+ unsigned long start, unsigned long end,
8754
+ struct acr_info *info)
80238755 {
80248756 /* This function is based on compact_zone() from compaction.c. */
8025
- unsigned long nr_reclaimed;
8757
+ unsigned int nr_reclaimed;
80268758 unsigned long pfn = start;
80278759 unsigned int tries = 0;
8760
+ unsigned int max_tries = 5;
80288761 int ret = 0;
8762
+ struct page *page;
8763
+ struct migration_target_control mtc = {
8764
+ .nid = zone_to_nid(cc->zone),
8765
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8766
+ };
80298767
8030
- migrate_prep();
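+ /* Opportunistic (async) contiguous allocations get a single migration pass. */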
8768
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8769
+ max_tries = 1;
8770
+
8771
+ lru_cache_disable();
80318772
80328773 while (pfn < end || !list_empty(&cc->migratepages)) {
80338774 if (fatal_signal_pending(current)) {
....@@ -8043,20 +8784,39 @@
80438784 break;
80448785 }
80458786 tries = 0;
8046
- } else if (++tries == 5) {
8787
+ } else if (++tries == max_tries) {
80478788 ret = ret < 0 ? ret : -EBUSY;
80488789 break;
80498790 }
80508791
80518792 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
80528793 &cc->migratepages);
8794
+ info->nr_reclaimed += nr_reclaimed;
80538795 cc->nr_migratepages -= nr_reclaimed;
80548796
8055
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8056
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8797
+ list_for_each_entry(page, &cc->migratepages, lru)
8798
+ info->nr_mapped += page_mapcount(page);
8799
+
8800
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8801
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8802
+ if (!ret)
8803
+ info->nr_migrated += cc->nr_migratepages;
80578804 }
8805
+
8806
+ lru_cache_enable();
80588807 if (ret < 0) {
8808
+ if (ret == -EBUSY) {
8809
+ alloc_contig_dump_pages(&cc->migratepages);
8810
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8811
+ }
8812
+
8813
+ if (!list_empty(&cc->migratepages)) {
8814
+ page = list_first_entry(&cc->migratepages, struct page, lru);
8815
+ info->failed_pfn = page_to_pfn(page);
8816
+ }
8817
+
80598818 putback_movable_pages(&cc->migratepages);
8819
+ info->err |= ACR_ERR_MIGRATE;
80608820 return ret;
80618821 }
80628822 return 0;
....@@ -8079,25 +8839,28 @@
80798839 * pageblocks in the range. Once isolated, the pageblocks should not
80808840 * be modified by others.
80818841 *
8082
- * Returns zero on success or negative error code. On success all
8842
+ * Return: zero on success or negative error code. On success all
80838843 * pages which PFN is in [start, end) are allocated for the caller and
80848844 * need to be freed with free_contig_range().
80858845 */
80868846 int alloc_contig_range(unsigned long start, unsigned long end,
8087
- unsigned migratetype, gfp_t gfp_mask)
8847
+ unsigned migratetype, gfp_t gfp_mask,
8848
+ struct acr_info *info)
80888849 {
80898850 unsigned long outer_start, outer_end;
80908851 unsigned int order;
80918852 int ret = 0;
8853
+ bool skip_drain_all_pages = false;
80928854
80938855 struct compact_control cc = {
80948856 .nr_migratepages = 0,
80958857 .order = -1,
80968858 .zone = page_zone(pfn_to_page(start)),
8097
- .mode = MIGRATE_SYNC,
8859
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
80988860 .ignore_skip_hint = true,
80998861 .no_set_skip_hint = true,
81008862 .gfp_mask = current_gfp_context(gfp_mask),
8863
+ .alloc_contig = true,
81018864 };
81028865 INIT_LIST_HEAD(&cc.migratepages);
81038866
....@@ -8126,14 +8889,18 @@
81268889 */
81278890
81288891 ret = start_isolate_page_range(pfn_max_align_down(start),
8129
- pfn_max_align_up(end), migratetype,
8130
- false);
8131
- if (ret)
8892
+ pfn_max_align_up(end), migratetype, 0,
8893
+ &info->failed_pfn);
8894
+ if (ret) {
8895
+ info->err |= ACR_ERR_ISOLATE;
81328896 return ret;
8897
+ }
81338898
8134
-#ifdef CONFIG_CMA
8135
- cc.zone->cma_alloc = 1;
8136
-#endif
8899
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8900
+ &skip_drain_all_pages);
8901
+ if (!skip_drain_all_pages)
8902
+ drain_all_pages(cc.zone);
8903
+
81378904 /*
81388905 * In case of -EBUSY, we'd like to know which page causes problem.
81398906 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8144,8 +8911,8 @@
81448911 * allocated. So, if we fall through be sure to clear ret so that
81458912 * -EBUSY is not accidentally used or returned to caller.
81468913 */
8147
- ret = __alloc_contig_migrate_range(&cc, start, end);
8148
- if (ret && ret != -EBUSY)
8914
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8915
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
81498916 goto done;
81508917 ret =0;
81518918
....@@ -8166,9 +8933,6 @@
81668933 * isolated thus they won't get removed from buddy.
81678934 */
81688935
8169
- lru_add_drain_all();
8170
- drain_all_pages(cc.zone);
8171
-
81728936 order = 0;
81738937 outer_start = start;
81748938 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8180,7 +8944,7 @@
81808944 }
81818945
81828946 if (outer_start != start) {
8183
- order = page_order(pfn_to_page(outer_start));
8947
+ order = buddy_order(pfn_to_page(outer_start));
81848948
81858949 /*
81868950 * outer_start page could be small order buddy page and
....@@ -8193,10 +8957,11 @@
81938957 }
81948958
81958959 /* Make sure the range is really isolated. */
8196
- if (test_pages_isolated(outer_start, end, false)) {
8960
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
81978961 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
81988962 __func__, outer_start, end);
81998963 ret = -EBUSY;
8964
+ info->err |= ACR_ERR_TEST;
82008965 goto done;
82018966 }
82028967
....@@ -8216,13 +8981,114 @@
82168981 done:
82178982 undo_isolate_page_range(pfn_max_align_down(start),
82188983 pfn_max_align_up(end), migratetype);
8219
-#ifdef CONFIG_CMA
8220
- cc.zone->cma_alloc = 0;
8221
-#endif
82228984 return ret;
82238985 }
8986
+EXPORT_SYMBOL(alloc_contig_range);
82248987
8225
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
8988
+static int __alloc_contig_pages(unsigned long start_pfn,
8989
+ unsigned long nr_pages, gfp_t gfp_mask)
8990
+{
8991
+ struct acr_info dummy;
8992
+ unsigned long end_pfn = start_pfn + nr_pages;
8993
+
8994
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
8995
+ gfp_mask, &dummy);
8996
+}
8997
+
8998
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
8999
+ unsigned long nr_pages)
9000
+{
9001
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9002
+ struct page *page;
9003
+
9004
+ for (i = start_pfn; i < end_pfn; i++) {
9005
+ page = pfn_to_online_page(i);
9006
+ if (!page)
9007
+ return false;
9008
+
9009
+ if (page_zone(page) != z)
9010
+ return false;
9011
+
9012
+ if (PageReserved(page))
9013
+ return false;
9014
+
9015
+ if (page_count(page) > 0)
9016
+ return false;
9017
+
9018
+ if (PageHuge(page))
9019
+ return false;
9020
+ }
9021
+ return true;
9022
+}
9023
+
9024
+static bool zone_spans_last_pfn(const struct zone *zone,
9025
+ unsigned long start_pfn, unsigned long nr_pages)
9026
+{
9027
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9028
+
9029
+ return zone_spans_pfn(zone, last_pfn);
9030
+}
9031
+
9032
+/**
9033
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9034
+ * @nr_pages: Number of contiguous pages to allocate
9035
+ * @gfp_mask: GFP mask to limit search and used during compaction
9036
+ * @nid: Target node
9037
+ * @nodemask: Mask for other possible nodes
9038
+ *
9039
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9040
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9041
+ * tried for allocation with alloc_contig_range(). This routine is intended
9042
+ * for allocation requests which can not be fulfilled with the buddy allocator.
9043
+ *
9044
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9045
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9046
+ * (e.g. 1GB request would be aligned to 1GB).
9047
+ *
9048
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9049
+ * __free_page() on each allocated page.
9050
+ *
9051
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9052
+ */
9053
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9054
+ int nid, nodemask_t *nodemask)
9055
+{
9056
+ unsigned long ret, pfn, flags;
9057
+ struct zonelist *zonelist;
9058
+ struct zone *zone;
9059
+ struct zoneref *z;
9060
+
9061
+ zonelist = node_zonelist(nid, gfp_mask);
9062
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9063
+ gfp_zone(gfp_mask), nodemask) {
9064
+ spin_lock_irqsave(&zone->lock, flags);
9065
+
9066
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9067
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9068
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9069
+ /*
9070
+ * We release the zone lock here because
9071
+ * alloc_contig_range() will also lock the zone
9072
+ * at some point. If there's an allocation
9073
+ * spinning on this lock, it may win the race
9074
+ * and cause alloc_contig_range() to fail...
9075
+ */
9076
+ spin_unlock_irqrestore(&zone->lock, flags);
9077
+ ret = __alloc_contig_pages(pfn, nr_pages,
9078
+ gfp_mask);
9079
+ if (!ret)
9080
+ return pfn_to_page(pfn);
9081
+ spin_lock_irqsave(&zone->lock, flags);
9082
+ }
9083
+ pfn += nr_pages;
9084
+ }
9085
+ spin_unlock_irqrestore(&zone->lock, flags);
9086
+ }
9087
+ return NULL;
9088
+}
9089
+#endif /* CONFIG_CONTIG_ALLOC */
9090
+
9091
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
82269092 {
82279093 unsigned int count = 0;
82289094
....@@ -8234,7 +9100,7 @@
82349100 }
82359101 WARN(count != 0, "%d pages are still in use!\n", count);
82369102 }
8237
-#endif
9103
+EXPORT_SYMBOL(free_contig_range);
82389104
82399105 /*
82409106 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8242,11 +9108,8 @@
82429108 */
82439109 void __meminit zone_pcp_update(struct zone *zone)
82449110 {
8245
- unsigned cpu;
82469111 mutex_lock(&pcp_batch_high_lock);
8247
- for_each_possible_cpu(cpu)
8248
- pageset_set_high_and_batch(zone,
8249
- per_cpu_ptr(zone->pageset, cpu));
9112
+ __zone_pcp_update(zone);
82509113 mutex_unlock(&pcp_batch_high_lock);
82519114 }
82529115
....@@ -8271,32 +9134,21 @@
82719134
82729135 #ifdef CONFIG_MEMORY_HOTREMOVE
82739136 /*
8274
- * All pages in the range must be in a single zone and isolated
8275
- * before calling this.
9137
+ * All pages in the range must be in a single zone, must not contain holes,
9138
+ * must span full sections, and must be isolated before calling this function.
82769139 */
8277
-void
8278
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9140
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
82799141 {
9142
+ unsigned long pfn = start_pfn;
82809143 struct page *page;
82819144 struct zone *zone;
8282
- unsigned int order, i;
8283
- unsigned long pfn;
9145
+ unsigned int order;
82849146 unsigned long flags;
8285
- /* find the first valid pfn */
8286
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8287
- if (pfn_valid(pfn))
8288
- break;
8289
- if (pfn == end_pfn)
8290
- return;
9147
+
82919148 offline_mem_sections(pfn, end_pfn);
82929149 zone = page_zone(pfn_to_page(pfn));
82939150 spin_lock_irqsave(&zone->lock, flags);
8294
- pfn = start_pfn;
82959151 while (pfn < end_pfn) {
8296
- if (!pfn_valid(pfn)) {
8297
- pfn++;
8298
- continue;
8299
- }
83009152 page = pfn_to_page(pfn);
83019153 /*
83029154 * The HWPoisoned page may be not in buddy system, and
....@@ -8304,22 +9156,23 @@
83049156 */
83059157 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83069158 pfn++;
8307
- SetPageReserved(page);
9159
+ continue;
9160
+ }
9161
+ /*
9162
+ * At this point all remaining PageOffline() pages have a
9163
+ * reference count of 0 and can simply be skipped.
9164
+ */
9165
+ if (PageOffline(page)) {
9166
+ BUG_ON(page_count(page));
9167
+ BUG_ON(PageBuddy(page));
9168
+ pfn++;
83089169 continue;
83099170 }
83109171
83119172 BUG_ON(page_count(page));
83129173 BUG_ON(!PageBuddy(page));
8313
- order = page_order(page);
8314
-#ifdef CONFIG_DEBUG_VM
8315
- pr_info("remove from free list %lx %d %lx\n",
8316
- pfn, 1 << order, end_pfn);
8317
-#endif
8318
- list_del(&page->lru);
8319
- rmv_page_order(page);
8320
- zone->free_area[order].nr_free--;
8321
- for (i = 0; i < (1 << order); i++)
8322
- SetPageReserved((page+i));
9174
+ order = buddy_order(page);
9175
+ del_page_from_free_list(page, zone, order);
83239176 pfn += (1 << order);
83249177 }
83259178 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8337,7 +9190,7 @@
83379190 for (order = 0; order < MAX_ORDER; order++) {
83389191 struct page *page_head = page - (pfn & ((1 << order) - 1));
83399192
8340
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9193
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
83419194 break;
83429195 }
83439196 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8347,30 +9200,87 @@
83479200
83489201 #ifdef CONFIG_MEMORY_FAILURE
83499202 /*
8350
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8351
- * test is performed under the zone lock to prevent a race against page
8352
- * allocation.
9203
+ * Break down a higher-order page into sub-pages, and keep our target out of
9204
+ * the buddy allocator.
83539205 */
8354
-bool set_hwpoison_free_buddy_page(struct page *page)
9206
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9207
+ struct page *target, int low, int high,
9208
+ int migratetype)
9209
+{
9210
+ unsigned long size = 1 << high;
9211
+ struct page *current_buddy, *next_page;
9212
+
9213
+ while (high > low) {
9214
+ high--;
9215
+ size >>= 1;
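+ /*
+ * current_buddy is the half that does not contain the target; it is
+ * handed back to the free list (or marked as a guard page) while
+ * splitting continues in the half that does contain the target.
+ */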
9216
+
9217
+ if (target >= &page[size]) {
9218
+ next_page = page + size;
9219
+ current_buddy = page;
9220
+ } else {
9221
+ next_page = page;
9222
+ current_buddy = page + size;
9223
+ }
9224
+
9225
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9226
+ continue;
9227
+
9228
+ if (current_buddy != target) {
9229
+ add_to_free_list(current_buddy, zone, high, migratetype);
9230
+ set_buddy_order(current_buddy, high);
9231
+ page = next_page;
9232
+ }
9233
+ }
9234
+}
9235
+
9236
+/*
9237
+ * Take a page that will be marked as poisoned off the buddy allocator.
9238
+ */
9239
+bool take_page_off_buddy(struct page *page)
83559240 {
83569241 struct zone *zone = page_zone(page);
83579242 unsigned long pfn = page_to_pfn(page);
83589243 unsigned long flags;
83599244 unsigned int order;
8360
- bool hwpoisoned = false;
9245
+ bool ret = false;
83619246
83629247 spin_lock_irqsave(&zone->lock, flags);
83639248 for (order = 0; order < MAX_ORDER; order++) {
83649249 struct page *page_head = page - (pfn & ((1 << order) - 1));
9250
+ int page_order = buddy_order(page_head);
83659251
8366
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8367
- if (!TestSetPageHWPoison(page))
8368
- hwpoisoned = true;
9252
+ if (PageBuddy(page_head) && page_order >= order) {
9253
+ unsigned long pfn_head = page_to_pfn(page_head);
9254
+ int migratetype = get_pfnblock_migratetype(page_head,
9255
+ pfn_head);
9256
+
9257
+ del_page_from_free_list(page_head, zone, page_order);
9258
+ break_down_buddy_pages(zone, page_head, page, 0,
9259
+ page_order, migratetype);
9260
+ if (!is_migrate_isolate(migratetype))
9261
+ __mod_zone_freepage_state(zone, -1, migratetype);
9262
+ ret = true;
83699263 break;
83709264 }
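+ /* Stop once the candidate head is in use; the target cannot be part of a larger free buddy. */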
9265
+ if (page_count(page_head) > 0)
9266
+ break;
83719267 }
83729268 spin_unlock_irqrestore(&zone->lock, flags);
8373
-
8374
- return hwpoisoned;
9269
+ return ret;
83759270 }
83769271 #endif
9272
+
9273
+#ifdef CONFIG_ZONE_DMA
9274
+bool has_managed_dma(void)
9275
+{
9276
+ struct pglist_data *pgdat;
9277
+
9278
+ for_each_online_pgdat(pgdat) {
9279
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9280
+
9281
+ if (managed_zone(zone))
9282
+ return true;
9283
+ }
9284
+ return false;
9285
+}
9286
+#endif /* CONFIG_ZONE_DMA */