2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/page_alloc.c
34 *
@@ -16,11 +17,11 @@
1617
1718 #include <linux/stddef.h>
1819 #include <linux/mm.h>
20
+#include <linux/highmem.h>
1921 #include <linux/swap.h>
2022 #include <linux/interrupt.h>
2123 #include <linux/pagemap.h>
2224 #include <linux/jiffies.h>
23
-#include <linux/bootmem.h>
2425 #include <linux/memblock.h>
2526 #include <linux/compiler.h>
2627 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
4344 #include <linux/mempolicy.h>
4445 #include <linux/memremap.h>
4546 #include <linux/stop_machine.h>
47
+#include <linux/random.h>
4648 #include <linux/sort.h>
4749 #include <linux/pfn.h>
4850 #include <linux/backing-dev.h>
4951 #include <linux/fault-inject.h>
5052 #include <linux/page-isolation.h>
51
-#include <linux/page_ext.h>
5253 #include <linux/debugobjects.h>
5354 #include <linux/kmemleak.h>
5455 #include <linux/compaction.h>
@@ -60,19 +61,65 @@
6061 #include <linux/hugetlb.h>
6162 #include <linux/sched/rt.h>
6263 #include <linux/sched/mm.h>
64
+#include <linux/local_lock.h>
6365 #include <linux/page_owner.h>
66
+#include <linux/page_pinner.h>
6467 #include <linux/kthread.h>
6568 #include <linux/memcontrol.h>
6669 #include <linux/ftrace.h>
6770 #include <linux/lockdep.h>
6871 #include <linux/nmi.h>
69
-#include <linux/khugepaged.h>
7072 #include <linux/psi.h>
73
+#include <linux/padata.h>
74
+#include <linux/khugepaged.h>
75
+#include <trace/hooks/mm.h>
76
+#include <trace/hooks/vmscan.h>
7177
7278 #include <asm/sections.h>
7379 #include <asm/tlbflush.h>
7480 #include <asm/div64.h>
7581 #include "internal.h"
82
+#include "shuffle.h"
83
+#include "page_reporting.h"
84
+
85
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
86
+typedef int __bitwise fpi_t;
87
+
88
+/* No special request */
89
+#define FPI_NONE ((__force fpi_t)0)
90
+
91
+/*
92
+ * Skip free page reporting notification for the (possibly merged) page.
93
+ * This does not hinder free page reporting from grabbing the page,
94
+ * reporting it and marking it "reported" - it only skips notifying
95
+ * the free page reporting infrastructure about a newly freed page. For
96
+ * example, used when temporarily pulling a page from a freelist and
97
+ * putting it back unmodified.
98
+ */
99
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
100
+
101
+/*
102
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
103
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
104
+ * shuffle the whole zone).
105
+ *
106
+ * Note: No code should rely on this flag for correctness - it's purely
107
+ * to allow for optimizations when handing back either fresh pages
108
+ * (memory onlining) or untouched pages (page isolation, free page
109
+ * reporting).
110
+ */
111
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
112
+
113
+/*
114
+ * Don't poison memory with KASAN (only for the tag-based modes).
115
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
116
+ * Poisoning all that memory lengthens boot time, especially on systems with
117
+ * large amount of RAM. This flag is used to skip that poisoning.
118
+ * This is only done for the tag-based KASAN modes, as those are able to
119
+ * detect memory corruptions with the memory tags assigned by default.
120
+ * All memory allocated normally after boot gets poisoned as usual.
121
+ */
122
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
76123
77124 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
78125 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -94,12 +141,15 @@
94141 */
95142 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
96143 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
97
-int _node_numa_mem_[MAX_NUMNODES];
98144 #endif
99145
100146 /* work_structs for global per-cpu drains */
101
-DEFINE_MUTEX(pcpu_drain_mutex);
102
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
147
+struct pcpu_drain {
148
+ struct zone *zone;
149
+ struct work_struct work;
150
+};
151
+static DEFINE_MUTEX(pcpu_drain_mutex);
152
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
103153
104154 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
105155 volatile unsigned long latent_entropy __latent_entropy;
@@ -123,62 +173,33 @@
123173 };
124174 EXPORT_SYMBOL(node_states);
125175
126
-/* Protect totalram_pages and zone->managed_pages */
127
-static DEFINE_SPINLOCK(managed_page_count_lock);
128
-
129
-unsigned long totalram_pages __read_mostly;
176
+atomic_long_t _totalram_pages __read_mostly;
177
+EXPORT_SYMBOL(_totalram_pages);
130178 unsigned long totalreserve_pages __read_mostly;
131179 unsigned long totalcma_pages __read_mostly;
132180
133181 int percpu_pagelist_fraction;
134182 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
135
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
136
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
137
-#else
138183 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
139
-#endif
140184 EXPORT_SYMBOL(init_on_alloc);
141185
142
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
143
-DEFINE_STATIC_KEY_TRUE(init_on_free);
144
-#else
145186 DEFINE_STATIC_KEY_FALSE(init_on_free);
146
-#endif
147187 EXPORT_SYMBOL(init_on_free);
148188
189
+static bool _init_on_alloc_enabled_early __read_mostly
190
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
149191 static int __init early_init_on_alloc(char *buf)
150192 {
151
- int ret;
152
- bool bool_result;
153193
154
- if (!buf)
155
- return -EINVAL;
156
- ret = kstrtobool(buf, &bool_result);
157
- if (bool_result && page_poisoning_enabled())
158
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
159
- if (bool_result)
160
- static_branch_enable(&init_on_alloc);
161
- else
162
- static_branch_disable(&init_on_alloc);
163
- return ret;
194
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
164195 }
165196 early_param("init_on_alloc", early_init_on_alloc);
166197
198
+static bool _init_on_free_enabled_early __read_mostly
199
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
167200 static int __init early_init_on_free(char *buf)
168201 {
169
- int ret;
170
- bool bool_result;
171
-
172
- if (!buf)
173
- return -EINVAL;
174
- ret = kstrtobool(buf, &bool_result);
175
- if (bool_result && page_poisoning_enabled())
176
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
177
- if (bool_result)
178
- static_branch_enable(&init_on_free);
179
- else
180
- static_branch_disable(&init_on_free);
181
- return ret;
202
+ return kstrtobool(buf, &_init_on_free_enabled_early);
182203 }
183204 early_param("init_on_free", early_init_on_free);
184205
@@ -242,7 +263,8 @@
242263 unsigned int pageblock_order __read_mostly;
243264 #endif
244265
245
-static void __free_pages_ok(struct page *page, unsigned int order);
266
+static void __free_pages_ok(struct page *page, unsigned int order,
267
+ fpi_t fpi_flags);
246268
247269 /*
248270 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -269,8 +291,6 @@
269291 [ZONE_MOVABLE] = 0,
270292 };
271293
272
-EXPORT_SYMBOL(totalram_pages);
273
-
274294 static char * const zone_names[MAX_NR_ZONES] = {
275295 #ifdef CONFIG_ZONE_DMA
276296 "DMA",
@@ -288,7 +308,7 @@
288308 #endif
289309 };
290310
291
-char * const migratetype_names[MIGRATE_TYPES] = {
311
+const char * const migratetype_names[MIGRATE_TYPES] = {
292312 "Unmovable",
293313 "Movable",
294314 "Reclaimable",
@@ -301,14 +321,14 @@
301321 #endif
302322 };
303323
304
-compound_page_dtor * const compound_page_dtors[] = {
305
- NULL,
306
- free_compound_page,
324
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
325
+ [NULL_COMPOUND_DTOR] = NULL,
326
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
307327 #ifdef CONFIG_HUGETLB_PAGE
308
- free_huge_page,
328
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
309329 #endif
310330 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
311
- free_transhuge_page,
331
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
312332 #endif
313333 };
314334
@@ -319,6 +339,20 @@
319339 */
320340 int min_free_kbytes = 1024;
321341 int user_min_free_kbytes = -1;
342
+#ifdef CONFIG_DISCONTIGMEM
343
+/*
344
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
345
+ * are not on separate NUMA nodes. Functionally this works but with
346
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
347
+ * quite small. By default, do not boost watermarks on discontigmem as in
348
+ * many cases very high-order allocations like THP are likely to be
349
+ * unsupported and the premature reclaim offsets the advantage of long-term
350
+ * fragmentation avoidance.
351
+ */
352
+int watermark_boost_factor __read_mostly;
353
+#else
354
+int watermark_boost_factor __read_mostly = 15000;
355
+#endif
322356 int watermark_scale_factor = 10;
323357
324358 /*
@@ -328,31 +362,36 @@
328362 */
329363 int extra_free_kbytes = 0;
330364
331
-static unsigned long nr_kernel_pages __meminitdata;
332
-static unsigned long nr_all_pages __meminitdata;
333
-static unsigned long dma_reserve __meminitdata;
365
+static unsigned long nr_kernel_pages __initdata;
366
+static unsigned long nr_all_pages __initdata;
367
+static unsigned long dma_reserve __initdata;
334368
335
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
336
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
337
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
369
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
370
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
338371 static unsigned long required_kernelcore __initdata;
339372 static unsigned long required_kernelcore_percent __initdata;
340373 static unsigned long required_movablecore __initdata;
341374 static unsigned long required_movablecore_percent __initdata;
342
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
375
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
343376 static bool mirrored_kernelcore __meminitdata;
344377
345378 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
346379 int movable_zone;
347380 EXPORT_SYMBOL(movable_zone);
348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
349381
350382 #if MAX_NUMNODES > 1
351
-int nr_node_ids __read_mostly = MAX_NUMNODES;
352
-int nr_online_nodes __read_mostly = 1;
383
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
384
+unsigned int nr_online_nodes __read_mostly = 1;
353385 EXPORT_SYMBOL(nr_node_ids);
354386 EXPORT_SYMBOL(nr_online_nodes);
355387 #endif
388
+
389
+struct pa_lock {
390
+ local_lock_t l;
391
+};
392
+static DEFINE_PER_CPU(struct pa_lock, pa_lock) = {
393
+ .l = INIT_LOCAL_LOCK(l),
394
+};
356395
357396 int page_group_by_mobility_disabled __read_mostly;
358397
@@ -365,7 +404,7 @@
365404 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
366405
367406 /*
368
- * Calling kasan_free_pages() only after deferred memory initialization
407
+ * Calling kasan_poison_pages() only after deferred memory initialization
369408 * has completed. Poisoning pages during deferred memory init will greatly
370409 * lengthen the process and cause problem in large memory systems as the
371410 * deferred pages initialization is done with interrupt disabled.
@@ -377,10 +416,12 @@
377416 * on-demand allocation and then freed again before the deferred pages
378417 * initialization is done, but this is not likely to happen.
379418 */
380
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
419
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
381420 {
382
- if (!static_branch_unlikely(&deferred_pages))
383
- kasan_free_pages(page, order);
421
+ return static_branch_unlikely(&deferred_pages) ||
422
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
423
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
424
+ PageSkipKASanPoison(page);
384425 }
385426
386427 /* Returns true if the struct page for the pfn is uninitialised */
@@ -395,38 +436,57 @@
395436 }
396437
397438 /*
398
- * Returns false when the remaining initialisation should be deferred until
439
+ * Returns true when the remaining initialisation should be deferred until
399440 * later in the boot cycle when it can be parallelised.
400441 */
401
-static inline bool update_defer_init(pg_data_t *pgdat,
402
- unsigned long pfn, unsigned long zone_end,
403
- unsigned long *nr_initialised)
442
+static bool __meminit
443
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
404444 {
405
- /* Always populate low zones for address-constrained allocations */
406
- if (zone_end < pgdat_end_pfn(pgdat))
407
- return true;
408
- (*nr_initialised)++;
409
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
410
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
411
- pgdat->first_deferred_pfn = pfn;
412
- return false;
445
+ static unsigned long prev_end_pfn, nr_initialised;
446
+
447
+ /*
448
+ * prev_end_pfn static that contains the end of previous zone
449
+ * No need to protect because called very early in boot before smp_init.
450
+ */
451
+ if (prev_end_pfn != end_pfn) {
452
+ prev_end_pfn = end_pfn;
453
+ nr_initialised = 0;
413454 }
414455
415
- return true;
456
+ /* Always populate low zones for address-constrained allocations */
457
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
458
+ return false;
459
+
460
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
461
+ return true;
462
+ /*
463
+ * We start only with one section of pages, more pages are added as
464
+ * needed until the rest of deferred pages are initialized.
465
+ */
466
+ nr_initialised++;
467
+ if ((nr_initialised > PAGES_PER_SECTION) &&
468
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
469
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
470
+ return true;
471
+ }
472
+ return false;
416473 }
417474 #else
418
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
475
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
476
+{
477
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
478
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
479
+ PageSkipKASanPoison(page);
480
+}
419481
420482 static inline bool early_page_uninitialised(unsigned long pfn)
421483 {
422484 return false;
423485 }
424486
425
-static inline bool update_defer_init(pg_data_t *pgdat,
426
- unsigned long pfn, unsigned long zone_end,
427
- unsigned long *nr_initialised)
487
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
428488 {
429
- return true;
489
+ return false;
430490 }
431491 #endif
432492
@@ -435,7 +495,7 @@
435495 unsigned long pfn)
436496 {
437497 #ifdef CONFIG_SPARSEMEM
438
- return __pfn_to_section(pfn)->pageblock_flags;
498
+ return section_to_usemap(__pfn_to_section(pfn));
439499 #else
440500 return page_zone(page)->pageblock_flags;
441501 #endif /* CONFIG_SPARSEMEM */
@@ -445,25 +505,23 @@
445505 {
446506 #ifdef CONFIG_SPARSEMEM
447507 pfn &= (PAGES_PER_SECTION-1);
448
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
449508 #else
450509 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
451
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
452510 #endif /* CONFIG_SPARSEMEM */
511
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
453512 }
454513
455514 /**
456515 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
457516 * @page: The page within the block of interest
458517 * @pfn: The target page frame number
459
- * @end_bitidx: The last bit of interest to retrieve
460518 * @mask: mask of bits that the caller is interested in
461519 *
462520 * Return: pageblock_bits flags
463521 */
464
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
522
+static __always_inline
523
+unsigned long __get_pfnblock_flags_mask(struct page *page,
465524 unsigned long pfn,
466
- unsigned long end_bitidx,
467525 unsigned long mask)
468526 {
469527 unsigned long *bitmap;
@@ -476,20 +534,36 @@
476534 bitidx &= (BITS_PER_LONG-1);
477535
478536 word = bitmap[word_bitidx];
479
- bitidx += end_bitidx;
480
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
537
+ return (word >> bitidx) & mask;
481538 }
482539
483540 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
484
- unsigned long end_bitidx,
485541 unsigned long mask)
486542 {
487
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
543
+ return __get_pfnblock_flags_mask(page, pfn, mask);
488544 }
545
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
546
+
547
+int isolate_anon_lru_page(struct page *page)
548
+{
549
+ int ret;
550
+
551
+ if (!PageLRU(page) || !PageAnon(page))
552
+ return -EINVAL;
553
+
554
+ if (!get_page_unless_zero(page))
555
+ return -EINVAL;
556
+
557
+ ret = isolate_lru_page(page);
558
+ put_page(page);
559
+
560
+ return ret;
561
+}
562
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
489563
490564 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
491565 {
492
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
566
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
493567 }
494568
495569 /**
@@ -497,12 +571,10 @@
497571 * @page: The page within the block of interest
498572 * @flags: The flags to set
499573 * @pfn: The target page frame number
500
- * @end_bitidx: The last bit of interest
501574 * @mask: mask of bits that the caller is interested in
502575 */
503576 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
504577 unsigned long pfn,
505
- unsigned long end_bitidx,
506578 unsigned long mask)
507579 {
508580 unsigned long *bitmap;
@@ -510,6 +582,7 @@
510582 unsigned long old_word, word;
511583
512584 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
585
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
513586
514587 bitmap = get_pageblock_bitmap(page, pfn);
515588 bitidx = pfn_to_bitidx(page, pfn);
@@ -518,9 +591,8 @@
518591
519592 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
520593
521
- bitidx += end_bitidx;
522
- mask <<= (BITS_PER_LONG - bitidx - 1);
523
- flags <<= (BITS_PER_LONG - bitidx - 1);
594
+ mask <<= bitidx;
595
+ flags <<= bitidx;
524596
525597 word = READ_ONCE(bitmap[word_bitidx]);
526598 for (;;) {
@@ -537,8 +609,8 @@
537609 migratetype < MIGRATE_PCPTYPES))
538610 migratetype = MIGRATE_UNMOVABLE;
539611
540
- set_pageblock_flags_group(page, (unsigned long)migratetype,
541
- PB_migrate, PB_migrate_end);
612
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
613
+ page_to_pfn(page), MIGRATETYPE_MASK);
542614 }
543615
544616 #ifdef CONFIG_DEBUG_VM
@@ -593,8 +665,7 @@
593665 }
594666 #endif
595667
596
-static void bad_page(struct page *page, const char *reason,
597
- unsigned long bad_flags)
668
+static void bad_page(struct page *page, const char *reason)
598669 {
599670 static unsigned long resume;
600671 static unsigned long nr_shown;
@@ -623,10 +694,6 @@
623694 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
624695 current->comm, page_to_pfn(page));
625696 __dump_page(page, reason);
626
- bad_flags &= page->flags;
627
- if (bad_flags)
628
- pr_alert("bad because of flags: %#lx(%pGp)\n",
629
- bad_flags, &bad_flags);
630697 dump_page_owner(page);
631698
632699 print_modules();
@@ -654,7 +721,8 @@
654721
655722 void free_compound_page(struct page *page)
656723 {
657
- __free_pages_ok(page, compound_order(page));
724
+ mem_cgroup_uncharge(page);
725
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
658726 }
659727
660728 void prep_compound_page(struct page *page, unsigned int order)
@@ -662,8 +730,6 @@
662730 int i;
663731 int nr_pages = 1 << order;
664732
665
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
666
- set_compound_order(page, order);
667733 __SetPageHead(page);
668734 for (i = 1; i < nr_pages; i++) {
669735 struct page *p = page + i;
@@ -671,51 +737,30 @@
671737 p->mapping = TAIL_MAPPING;
672738 set_compound_head(p, page);
673739 }
740
+
741
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
742
+ set_compound_order(page, order);
674743 atomic_set(compound_mapcount_ptr(page), -1);
744
+ if (hpage_pincount_available(page))
745
+ atomic_set(compound_pincount_ptr(page), 0);
675746 }
676747
677748 #ifdef CONFIG_DEBUG_PAGEALLOC
678749 unsigned int _debug_guardpage_minorder;
679
-bool _debug_pagealloc_enabled __read_mostly
750
+
751
+bool _debug_pagealloc_enabled_early __read_mostly
680752 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
753
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
754
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
681755 EXPORT_SYMBOL(_debug_pagealloc_enabled);
682
-bool _debug_guardpage_enabled __read_mostly;
756
+
757
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
683758
684759 static int __init early_debug_pagealloc(char *buf)
685760 {
686
- if (!buf)
687
- return -EINVAL;
688
- return kstrtobool(buf, &_debug_pagealloc_enabled);
761
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
689762 }
690763 early_param("debug_pagealloc", early_debug_pagealloc);
691
-
692
-static bool need_debug_guardpage(void)
693
-{
694
- /* If we don't use debug_pagealloc, we don't need guard page */
695
- if (!debug_pagealloc_enabled())
696
- return false;
697
-
698
- if (!debug_guardpage_minorder())
699
- return false;
700
-
701
- return true;
702
-}
703
-
704
-static void init_debug_guardpage(void)
705
-{
706
- if (!debug_pagealloc_enabled())
707
- return;
708
-
709
- if (!debug_guardpage_minorder())
710
- return;
711
-
712
- _debug_guardpage_enabled = true;
713
-}
714
-
715
-struct page_ext_operations debug_guardpage_ops = {
716
- .need = need_debug_guardpage,
717
- .init = init_debug_guardpage,
718
-};
719764
720765 static int __init debug_guardpage_minorder_setup(char *buf)
721766 {
....@@ -734,20 +779,13 @@
734779 static inline bool set_page_guard(struct zone *zone, struct page *page,
735780 unsigned int order, int migratetype)
736781 {
737
- struct page_ext *page_ext;
738
-
739782 if (!debug_guardpage_enabled())
740783 return false;
741784
742785 if (order >= debug_guardpage_minorder())
743786 return false;
744787
745
- page_ext = lookup_page_ext(page);
746
- if (unlikely(!page_ext))
747
- return false;
748
-
749
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
750
-
788
+ __SetPageGuard(page);
751789 INIT_LIST_HEAD(&page->lru);
752790 set_page_private(page, order);
753791 /* Guard pages are not available for any usage */
@@ -759,39 +797,77 @@
759797 static inline void clear_page_guard(struct zone *zone, struct page *page,
760798 unsigned int order, int migratetype)
761799 {
762
- struct page_ext *page_ext;
763
-
764800 if (!debug_guardpage_enabled())
765801 return;
766802
767
- page_ext = lookup_page_ext(page);
768
- if (unlikely(!page_ext))
769
- return;
770
-
771
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
803
+ __ClearPageGuard(page);
772804
773805 set_page_private(page, 0);
774806 if (!is_migrate_isolate(migratetype))
775807 __mod_zone_freepage_state(zone, (1 << order), migratetype);
776808 }
777809 #else
778
-struct page_ext_operations debug_guardpage_ops;
779810 static inline bool set_page_guard(struct zone *zone, struct page *page,
780811 unsigned int order, int migratetype) { return false; }
781812 static inline void clear_page_guard(struct zone *zone, struct page *page,
782813 unsigned int order, int migratetype) {}
783814 #endif
784815
785
-static inline void set_page_order(struct page *page, unsigned int order)
816
+/*
817
+ * Enable static keys related to various memory debugging and hardening options.
818
+ * Some override others, and depend on early params that are evaluated in the
819
+ * order of appearance. So we need to first gather the full picture of what was
820
+ * enabled, and then make decisions.
821
+ */
822
+void init_mem_debugging_and_hardening(void)
823
+{
824
+ bool page_poisoning_requested = false;
825
+
826
+#ifdef CONFIG_PAGE_POISONING
827
+ /*
828
+ * Page poisoning is debug page alloc for some arches. If
829
+ * either of those options are enabled, enable poisoning.
830
+ */
831
+ if (page_poisoning_enabled() ||
832
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
833
+ debug_pagealloc_enabled())) {
834
+ static_branch_enable(&_page_poisoning_enabled);
835
+ page_poisoning_requested = true;
836
+ }
837
+#endif
838
+
839
+ if (_init_on_alloc_enabled_early) {
840
+ if (page_poisoning_requested)
841
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
842
+ "will take precedence over init_on_alloc\n");
843
+ else
844
+ static_branch_enable(&init_on_alloc);
845
+ }
846
+ if (_init_on_free_enabled_early) {
847
+ if (page_poisoning_requested)
848
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
849
+ "will take precedence over init_on_free\n");
850
+ else
851
+ static_branch_enable(&init_on_free);
852
+ }
853
+
854
+#ifdef CONFIG_DEBUG_PAGEALLOC
855
+ if (!debug_pagealloc_enabled())
856
+ return;
857
+
858
+ static_branch_enable(&_debug_pagealloc_enabled);
859
+
860
+ if (!debug_guardpage_minorder())
861
+ return;
862
+
863
+ static_branch_enable(&_debug_guardpage_enabled);
864
+#endif
865
+}
866
+
867
+static inline void set_buddy_order(struct page *page, unsigned int order)
786868 {
787869 set_page_private(page, order);
788870 __SetPageBuddy(page);
789
-}
790
-
791
-static inline void rmv_page_order(struct page *page)
792
-{
793
- __ClearPageBuddy(page);
794
- set_page_private(page, 0);
795871 }
796872
797873 /*
@@ -807,32 +883,151 @@
807883 *
808884 * For recording page's order, we use page_private(page).
809885 */
810
-static inline int page_is_buddy(struct page *page, struct page *buddy,
886
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
811887 unsigned int order)
812888 {
813
- if (page_is_guard(buddy) && page_order(buddy) == order) {
814
- if (page_zone_id(page) != page_zone_id(buddy))
815
- return 0;
889
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
890
+ return false;
816891
817
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
892
+ if (buddy_order(buddy) != order)
893
+ return false;
818894
819
- return 1;
820
- }
895
+ /*
896
+ * zone check is done late to avoid uselessly calculating
897
+ * zone/node ids for pages that could never merge.
898
+ */
899
+ if (page_zone_id(page) != page_zone_id(buddy))
900
+ return false;
821901
822
- if (PageBuddy(buddy) && page_order(buddy) == order) {
823
- /*
824
- * zone check is done late to avoid uselessly
825
- * calculating zone/node ids for pages that could
826
- * never merge.
827
- */
828
- if (page_zone_id(page) != page_zone_id(buddy))
829
- return 0;
902
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
830903
831
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
904
+ return true;
905
+}
832906
833
- return 1;
834
- }
835
- return 0;
907
+#ifdef CONFIG_COMPACTION
908
+static inline struct capture_control *task_capc(struct zone *zone)
909
+{
910
+ struct capture_control *capc = current->capture_control;
911
+
912
+ return unlikely(capc) &&
913
+ !(current->flags & PF_KTHREAD) &&
914
+ !capc->page &&
915
+ capc->cc->zone == zone ? capc : NULL;
916
+}
917
+
918
+static inline bool
919
+compaction_capture(struct capture_control *capc, struct page *page,
920
+ int order, int migratetype)
921
+{
922
+ if (!capc || order != capc->cc->order)
923
+ return false;
924
+
925
+ /* Do not accidentally pollute CMA or isolated regions*/
926
+ if (is_migrate_cma(migratetype) ||
927
+ is_migrate_isolate(migratetype))
928
+ return false;
929
+
930
+ /*
931
+ * Do not let lower order allocations polluate a movable pageblock.
932
+ * This might let an unmovable request use a reclaimable pageblock
933
+ * and vice-versa but no more than normal fallback logic which can
934
+ * have trouble finding a high-order free page.
935
+ */
936
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
937
+ return false;
938
+
939
+ capc->page = page;
940
+ return true;
941
+}
942
+
943
+#else
944
+static inline struct capture_control *task_capc(struct zone *zone)
945
+{
946
+ return NULL;
947
+}
948
+
949
+static inline bool
950
+compaction_capture(struct capture_control *capc, struct page *page,
951
+ int order, int migratetype)
952
+{
953
+ return false;
954
+}
955
+#endif /* CONFIG_COMPACTION */
956
+
957
+/* Used for pages not on another list */
958
+static inline void add_to_free_list(struct page *page, struct zone *zone,
959
+ unsigned int order, int migratetype)
960
+{
961
+ struct free_area *area = &zone->free_area[order];
962
+
963
+ list_add(&page->lru, &area->free_list[migratetype]);
964
+ area->nr_free++;
965
+}
966
+
967
+/* Used for pages not on another list */
968
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
969
+ unsigned int order, int migratetype)
970
+{
971
+ struct free_area *area = &zone->free_area[order];
972
+
973
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
974
+ area->nr_free++;
975
+}
976
+
977
+/*
978
+ * Used for pages which are on another list. Move the pages to the tail
979
+ * of the list - so the moved pages won't immediately be considered for
980
+ * allocation again (e.g., optimization for memory onlining).
981
+ */
982
+static inline void move_to_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order, int migratetype)
984
+{
985
+ struct free_area *area = &zone->free_area[order];
986
+
987
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
988
+}
989
+
990
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
991
+ unsigned int order)
992
+{
993
+ /* clear reported state and update reported page count */
994
+ if (page_reported(page))
995
+ __ClearPageReported(page);
996
+
997
+ list_del(&page->lru);
998
+ __ClearPageBuddy(page);
999
+ set_page_private(page, 0);
1000
+ zone->free_area[order].nr_free--;
1001
+}
1002
+
1003
+/*
1004
+ * If this is not the largest possible page, check if the buddy
1005
+ * of the next-highest order is free. If it is, it's possible
1006
+ * that pages are being freed that will coalesce soon. In case,
1007
+ * that is happening, add the free page to the tail of the list
1008
+ * so it's less likely to be used soon and more likely to be merged
1009
+ * as a higher order page
1010
+ */
1011
+static inline bool
1012
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1013
+ struct page *page, unsigned int order)
1014
+{
1015
+ struct page *higher_page, *higher_buddy;
1016
+ unsigned long combined_pfn;
1017
+
1018
+ if (order >= MAX_ORDER - 2)
1019
+ return false;
1020
+
1021
+ if (!pfn_valid_within(buddy_pfn))
1022
+ return false;
1023
+
1024
+ combined_pfn = buddy_pfn & pfn;
1025
+ higher_page = page + (combined_pfn - pfn);
1026
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1027
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1028
+
1029
+ return pfn_valid_within(buddy_pfn) &&
1030
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8361031 }
8371032
8381033 /*
@@ -862,12 +1057,14 @@
8621057 static inline void __free_one_page(struct page *page,
8631058 unsigned long pfn,
8641059 struct zone *zone, unsigned int order,
865
- int migratetype)
1060
+ int migratetype, fpi_t fpi_flags)
8661061 {
1062
+ struct capture_control *capc = task_capc(zone);
1063
+ unsigned long buddy_pfn;
8671064 unsigned long combined_pfn;
868
- unsigned long uninitialized_var(buddy_pfn);
869
- struct page *buddy;
8701065 unsigned int max_order;
1066
+ struct page *buddy;
1067
+ bool to_tail;
8711068
8721069 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8731070
@@ -883,6 +1080,11 @@
8831080
8841081 continue_merging:
8851082 while (order < max_order) {
1083
+ if (compaction_capture(capc, page, order, migratetype)) {
1084
+ __mod_zone_freepage_state(zone, -(1 << order),
1085
+ migratetype);
1086
+ return;
1087
+ }
8861088 buddy_pfn = __find_buddy_pfn(pfn, order);
8871089 buddy = page + (buddy_pfn - pfn);
8881090
@@ -894,13 +1096,10 @@
8941096 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
8951097 * merge with it and move up one order.
8961098 */
897
- if (page_is_guard(buddy)) {
1099
+ if (page_is_guard(buddy))
8981100 clear_page_guard(zone, buddy, order, migratetype);
899
- } else {
900
- list_del(&buddy->lru);
901
- zone->free_area[order].nr_free--;
902
- rmv_page_order(buddy);
903
- }
1101
+ else
1102
+ del_page_from_free_list(buddy, zone, order);
9041103 combined_pfn = buddy_pfn & pfn;
9051104 page = page + (combined_pfn - pfn);
9061105 pfn = combined_pfn;
@@ -932,33 +1131,23 @@
9321131 }
9331132
9341133 done_merging:
935
- set_page_order(page, order);
1134
+ set_buddy_order(page, order);
9361135
937
- /*
938
- * If this is not the largest possible page, check if the buddy
939
- * of the next-highest order is free. If it is, it's possible
940
- * that pages are being freed that will coalesce soon. In case,
941
- * that is happening, add the free page to the tail of the list
942
- * so it's less likely to be used soon and more likely to be merged
943
- * as a higher order page
944
- */
945
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
946
- struct page *higher_page, *higher_buddy;
947
- combined_pfn = buddy_pfn & pfn;
948
- higher_page = page + (combined_pfn - pfn);
949
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
950
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
951
- if (pfn_valid_within(buddy_pfn) &&
952
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
953
- list_add_tail(&page->lru,
954
- &zone->free_area[order].free_list[migratetype]);
955
- goto out;
956
- }
957
- }
1136
+ if (fpi_flags & FPI_TO_TAIL)
1137
+ to_tail = true;
1138
+ else if (is_shuffle_order(order))
1139
+ to_tail = shuffle_pick_tail();
1140
+ else
1141
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9581142
959
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
960
-out:
961
- zone->free_area[order].nr_free++;
1143
+ if (to_tail)
1144
+ add_to_free_list_tail(page, zone, order, migratetype);
1145
+ else
1146
+ add_to_free_list(page, zone, order, migratetype);
1147
+
1148
+ /* Notify page reporting subsystem of freed page */
1149
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1150
+ page_reporting_notify_free(order);
9621151 }
9631152
9641153 /*
@@ -983,13 +1172,9 @@
9831172 return true;
9841173 }
9851174
986
-static void free_pages_check_bad(struct page *page)
1175
+static const char *page_bad_reason(struct page *page, unsigned long flags)
9871176 {
988
- const char *bad_reason;
989
- unsigned long bad_flags;
990
-
991
- bad_reason = NULL;
992
- bad_flags = 0;
1177
+ const char *bad_reason = NULL;
9931178
9941179 if (unlikely(atomic_read(&page->_mapcount) != -1))
9951180 bad_reason = "nonzero mapcount";
@@ -997,24 +1182,32 @@
9971182 bad_reason = "non-NULL mapping";
9981183 if (unlikely(page_ref_count(page) != 0))
9991184 bad_reason = "nonzero _refcount";
1000
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1001
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1002
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1185
+ if (unlikely(page->flags & flags)) {
1186
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1187
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1188
+ else
1189
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10031190 }
10041191 #ifdef CONFIG_MEMCG
10051192 if (unlikely(page->mem_cgroup))
10061193 bad_reason = "page still charged to cgroup";
10071194 #endif
1008
- bad_page(page, bad_reason, bad_flags);
1195
+ return bad_reason;
10091196 }
10101197
1011
-static inline int free_pages_check(struct page *page)
1198
+static void check_free_page_bad(struct page *page)
1199
+{
1200
+ bad_page(page,
1201
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1202
+}
1203
+
1204
+static inline int check_free_page(struct page *page)
10121205 {
10131206 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10141207 return 0;
10151208
10161209 /* Something has gone sideways, find it */
1017
- free_pages_check_bad(page);
1210
+ check_free_page_bad(page);
10181211 return 1;
10191212 }
10201213
@@ -1036,7 +1229,7 @@
10361229 case 1:
10371230 /* the first tail page: ->mapping may be compound_mapcount() */
10381231 if (unlikely(compound_mapcount(page))) {
1039
- bad_page(page, "nonzero compound_mapcount", 0);
1232
+ bad_page(page, "nonzero compound_mapcount");
10401233 goto out;
10411234 }
10421235 break;
@@ -1048,17 +1241,17 @@
10481241 break;
10491242 default:
10501243 if (page->mapping != TAIL_MAPPING) {
1051
- bad_page(page, "corrupted mapping in tail page", 0);
1244
+ bad_page(page, "corrupted mapping in tail page");
10521245 goto out;
10531246 }
10541247 break;
10551248 }
10561249 if (unlikely(!PageTail(page))) {
1057
- bad_page(page, "PageTail not set", 0);
1250
+ bad_page(page, "PageTail not set");
10581251 goto out;
10591252 }
10601253 if (unlikely(compound_head(page) != head_page)) {
1061
- bad_page(page, "compound_head not consistent", 0);
1254
+ bad_page(page, "compound_head not consistent");
10621255 goto out;
10631256 }
10641257 ret = 0;
@@ -1068,25 +1261,48 @@
10681261 return ret;
10691262 }
10701263
1071
-static void kernel_init_free_pages(struct page *page, int numpages)
1264
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10721265 {
10731266 int i;
10741267
1268
+ if (zero_tags) {
1269
+ for (i = 0; i < numpages; i++)
1270
+ tag_clear_highpage(page + i);
1271
+ return;
1272
+ }
1273
+
10751274 /* s390's use of memset() could override KASAN redzones. */
10761275 kasan_disable_current();
1077
- for (i = 0; i < numpages; i++)
1276
+ for (i = 0; i < numpages; i++) {
1277
+ u8 tag = page_kasan_tag(page + i);
1278
+ page_kasan_tag_reset(page + i);
10781279 clear_highpage(page + i);
1280
+ page_kasan_tag_set(page + i, tag);
1281
+ }
10791282 kasan_enable_current();
10801283 }
10811284
10821285 static __always_inline bool free_pages_prepare(struct page *page,
1083
- unsigned int order, bool check_free)
1286
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10841287 {
10851288 int bad = 0;
1289
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10861290
10871291 VM_BUG_ON_PAGE(PageTail(page), page);
10881292
10891293 trace_mm_page_free(page, order);
1294
+
1295
+ if (unlikely(PageHWPoison(page)) && !order) {
1296
+ /*
1297
+ * Do not let hwpoison pages hit pcplists/buddy
1298
+ * Untie memcg state and reset page's owner
1299
+ */
1300
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1301
+ __memcg_kmem_uncharge_page(page, order);
1302
+ reset_page_owner(page, order);
1303
+ free_page_pinner(page, order);
1304
+ return false;
1305
+ }
10901306
10911307 /*
10921308 * Check tail pages before head page information is cleared to
@@ -1103,7 +1319,7 @@
11031319 for (i = 1; i < (1 << order); i++) {
11041320 if (compound)
11051321 bad += free_tail_pages_check(page, page + i);
1106
- if (unlikely(free_pages_check(page + i))) {
1322
+ if (unlikely(check_free_page(page + i))) {
11071323 bad++;
11081324 continue;
11091325 }
@@ -1113,15 +1329,16 @@
11131329 if (PageMappingFlags(page))
11141330 page->mapping = NULL;
11151331 if (memcg_kmem_enabled() && PageKmemcg(page))
1116
- memcg_kmem_uncharge(page, order);
1332
+ __memcg_kmem_uncharge_page(page, order);
11171333 if (check_free)
1118
- bad += free_pages_check(page);
1334
+ bad += check_free_page(page);
11191335 if (bad)
11201336 return false;
11211337
11221338 page_cpupid_reset_last(page);
11231339 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11241340 reset_page_owner(page, order);
1341
+ free_page_pinner(page, order);
11251342
11261343 if (!PageHighMem(page)) {
11271344 debug_check_no_locks_freed(page_address(page),
@@ -1129,36 +1346,77 @@
11291346 debug_check_no_obj_freed(page_address(page),
11301347 PAGE_SIZE << order);
11311348 }
1132
- arch_free_page(page, order);
1133
- if (want_init_on_free())
1134
- kernel_init_free_pages(page, 1 << order);
11351349
1136
- kernel_poison_pages(page, 1 << order, 0);
1137
- kernel_map_pages(page, 1 << order, 0);
1138
- kasan_free_nondeferred_pages(page, order);
1350
+ kernel_poison_pages(page, 1 << order);
1351
+
1352
+ /*
1353
+ * As memory initialization might be integrated into KASAN,
1354
+ * kasan_free_pages and kernel_init_free_pages must be
1355
+ * kept together to avoid discrepancies in behavior.
1356
+ *
1357
+ * With hardware tag-based KASAN, memory tags must be set before the
1358
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1359
+ */
1360
+ if (kasan_has_integrated_init()) {
1361
+ if (!skip_kasan_poison)
1362
+ kasan_free_pages(page, order);
1363
+ } else {
1364
+ bool init = want_init_on_free();
1365
+
1366
+ if (init)
1367
+ kernel_init_free_pages(page, 1 << order, false);
1368
+ if (!skip_kasan_poison)
1369
+ kasan_poison_pages(page, order, init);
1370
+ }
1371
+
1372
+ /*
1373
+ * arch_free_page() can make the page's contents inaccessible. s390
1374
+ * does this. So nothing which can access the page's contents should
1375
+ * happen after this.
1376
+ */
1377
+ arch_free_page(page, order);
1378
+
1379
+ debug_pagealloc_unmap_pages(page, 1 << order);
11391380
11401381 return true;
11411382 }
11421383
11431384 #ifdef CONFIG_DEBUG_VM
1144
-static inline bool free_pcp_prepare(struct page *page)
1145
-{
1146
- return free_pages_prepare(page, 0, true);
1147
-}
1148
-
1149
-static inline bool bulkfree_pcp_prepare(struct page *page)
1150
-{
1151
- return false;
1152
-}
1153
-#else
1385
+/*
1386
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1387
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1388
+ * moved from pcp lists to free lists.
1389
+ */
11541390 static bool free_pcp_prepare(struct page *page)
11551391 {
1156
- return free_pages_prepare(page, 0, false);
1392
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11571393 }
11581394
11591395 static bool bulkfree_pcp_prepare(struct page *page)
11601396 {
1161
- return free_pages_check(page);
1397
+ if (debug_pagealloc_enabled_static())
1398
+ return check_free_page(page);
1399
+ else
1400
+ return false;
1401
+}
1402
+#else
1403
+/*
1404
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1405
+ * moving from pcp lists to free list in order to reduce overhead. With
1406
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1407
+ * to the pcp lists.
1408
+ */
1409
+static bool free_pcp_prepare(struct page *page)
1410
+{
1411
+ if (debug_pagealloc_enabled_static())
1412
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1413
+ else
1414
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1415
+}
1416
+
1417
+static bool bulkfree_pcp_prepare(struct page *page)
1418
+{
1419
+ return check_free_page(page);
11621420 }
11631421 #endif /* CONFIG_DEBUG_VM */
11641422
@@ -1172,7 +1430,7 @@
11721430 }
11731431
11741432 /*
1175
- * Frees a number of pages from the PCP lists
1433
+ * Frees a number of pages which have been collected from the pcp lists.
11761434 * Assumes all pages on list are in same zone, and of same order.
11771435 * count is the number of pages to free.
11781436 *
@@ -1182,15 +1440,56 @@
11821440 * And clear the zone's pages_scanned counter, to hold off the "all pages are
11831441 * pinned" detection logic.
11841442 */
1185
-static void free_pcppages_bulk(struct zone *zone, int count,
1186
- struct per_cpu_pages *pcp)
1443
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
1444
+ bool zone_retry)
1445
+{
1446
+ bool isolated_pageblocks;
1447
+ struct page *page, *tmp;
1448
+ unsigned long flags;
1449
+
1450
+ spin_lock_irqsave(&zone->lock, flags);
1451
+ isolated_pageblocks = has_isolate_pageblock(zone);
1452
+
1453
+ /*
1454
+ * Use safe version since after __free_one_page(),
1455
+ * page->lru.next will not point to original list.
1456
+ */
1457
+ list_for_each_entry_safe(page, tmp, head, lru) {
1458
+ int mt = get_pcppage_migratetype(page);
1459
+
1460
+ if (page_zone(page) != zone) {
1461
+ /*
1462
+ * free_unref_page_list() sorts pages by zone. If we end
1463
+ * up with pages from a different NUMA nodes belonging
1464
+ * to the same ZONE index then we need to redo with the
1465
+ * correct ZONE pointer. Skip the page for now, redo it
1466
+ * on the next iteration.
1467
+ */
1468
+ WARN_ON_ONCE(zone_retry == false);
1469
+ if (zone_retry)
1470
+ continue;
1471
+ }
1472
+
1473
+ /* MIGRATE_ISOLATE page should not go to pcplists */
1474
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1475
+ /* Pageblock could have been isolated meanwhile */
1476
+ if (unlikely(isolated_pageblocks))
1477
+ mt = get_pageblock_migratetype(page);
1478
+
1479
+ list_del(&page->lru);
1480
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
1481
+ trace_mm_page_pcpu_drain(page, 0, mt);
1482
+ }
1483
+ spin_unlock_irqrestore(&zone->lock, flags);
1484
+}
1485
+
1486
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
1487
+ struct list_head *dst)
11871488 {
11881489 int migratetype = 0;
11891490 int batch_free = 0;
11901491 int prefetch_nr = 0;
1191
- bool isolated_pageblocks;
1192
- struct page *page, *tmp;
1193
- LIST_HEAD(head);
1492
+ struct page *page;
11941493
11951494 /*
11961495 * Ensure proper count is passed which otherwise would stuck in the
@@ -1227,7 +1526,7 @@
12271526 if (bulkfree_pcp_prepare(page))
12281527 continue;
12291528
1230
- list_add_tail(&page->lru, &head);
1529
+ list_add_tail(&page->lru, dst);
12311530
12321531 /*
12331532 * We are going to put the page back to the global
@@ -1242,39 +1541,19 @@
12421541 prefetch_buddy(page);
12431542 } while (--count && --batch_free && !list_empty(list));
12441543 }
1245
-
1246
- spin_lock(&zone->lock);
1247
- isolated_pageblocks = has_isolate_pageblock(zone);
1248
-
1249
- /*
1250
- * Use safe version since after __free_one_page(),
1251
- * page->lru.next will not point to original list.
1252
- */
1253
- list_for_each_entry_safe(page, tmp, &head, lru) {
1254
- int mt = get_pcppage_migratetype(page);
1255
- /* MIGRATE_ISOLATE page should not go to pcplists */
1256
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1257
- /* Pageblock could have been isolated meanwhile */
1258
- if (unlikely(isolated_pageblocks))
1259
- mt = get_pageblock_migratetype(page);
1260
-
1261
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1262
- trace_mm_page_pcpu_drain(page, 0, mt);
1263
- }
1264
- spin_unlock(&zone->lock);
12651544 }
12661545
12671546 static void free_one_page(struct zone *zone,
12681547 struct page *page, unsigned long pfn,
12691548 unsigned int order,
1270
- int migratetype)
1549
+ int migratetype, fpi_t fpi_flags)
12711550 {
12721551 spin_lock(&zone->lock);
12731552 if (unlikely(has_isolate_pageblock(zone) ||
12741553 is_migrate_isolate(migratetype))) {
12751554 migratetype = get_pfnblock_migratetype(page, pfn);
12761555 }
1277
- __free_one_page(page, pfn, zone, order, migratetype);
1556
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
12781557 spin_unlock(&zone->lock);
12791558 }
12801559
@@ -1348,33 +1627,45 @@
13481627 /* Avoid false-positive PageTail() */
13491628 INIT_LIST_HEAD(&page->lru);
13501629
1351
- SetPageReserved(page);
1630
+ /*
1631
+ * no need for atomic set_bit because the struct
1632
+ * page is not visible yet so nobody should
1633
+ * access it yet.
1634
+ */
1635
+ __SetPageReserved(page);
13521636 }
13531637 }
13541638 }
13551639
1356
-static void __free_pages_ok(struct page *page, unsigned int order)
1640
+static void __free_pages_ok(struct page *page, unsigned int order,
1641
+ fpi_t fpi_flags)
13571642 {
13581643 unsigned long flags;
13591644 int migratetype;
13601645 unsigned long pfn = page_to_pfn(page);
13611646
1362
- if (!free_pages_prepare(page, order, true))
1647
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13631648 return;
13641649
13651650 migratetype = get_pfnblock_migratetype(page, pfn);
1366
- local_irq_save(flags);
1651
+ local_lock_irqsave(&pa_lock.l, flags);
13671652 __count_vm_events(PGFREE, 1 << order);
1368
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1369
- local_irq_restore(flags);
1653
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1654
+ fpi_flags);
1655
+ local_unlock_irqrestore(&pa_lock.l, flags);
13701656 }
13711657
1372
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1658
+void __free_pages_core(struct page *page, unsigned int order)
13731659 {
13741660 unsigned int nr_pages = 1 << order;
13751661 struct page *p = page;
13761662 unsigned int loop;
13771663
1664
+ /*
1665
+ * When initializing the memmap, __init_single_page() sets the refcount
1666
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1667
+ * refcount of all involved pages to 0.
1668
+ */
13781669 prefetchw(p);
13791670 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
13801671 prefetchw(p + 1);
@@ -1384,15 +1675,43 @@
13841675 __ClearPageReserved(p);
13851676 set_page_count(p, 0);
13861677
1387
- page_zone(page)->managed_pages += nr_pages;
1388
- set_page_refcounted(page);
1389
- __free_pages(page, order);
1678
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1679
+
1680
+ /*
1681
+ * Bypass PCP and place fresh pages right to the tail, primarily
1682
+ * relevant for memory onlining.
1683
+ */
1684
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
13901685 }
13911686
1392
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1393
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1687
+#ifdef CONFIG_NEED_MULTIPLE_NODES
13941688
13951689 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1690
+
1691
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1692
+
1693
+/*
1694
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1695
+ */
1696
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1697
+ struct mminit_pfnnid_cache *state)
1698
+{
1699
+ unsigned long start_pfn, end_pfn;
1700
+ int nid;
1701
+
1702
+ if (state->last_start <= pfn && pfn < state->last_end)
1703
+ return state->last_nid;
1704
+
1705
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1706
+ if (nid != NUMA_NO_NODE) {
1707
+ state->last_start = start_pfn;
1708
+ state->last_end = end_pfn;
1709
+ state->last_nid = nid;
1710
+ }
1711
+
1712
+ return nid;
1713
+}
1714
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
13961715
13971716 int __meminit early_pfn_to_nid(unsigned long pfn)
13981717 {
@@ -1407,48 +1726,14 @@
14071726
14081727 return nid;
14091728 }
1410
-#endif
1729
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14111730
1412
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1413
-static inline bool __meminit __maybe_unused
1414
-meminit_pfn_in_nid(unsigned long pfn, int node,
1415
- struct mminit_pfnnid_cache *state)
1416
-{
1417
- int nid;
1418
-
1419
- nid = __early_pfn_to_nid(pfn, state);
1420
- if (nid >= 0 && nid != node)
1421
- return false;
1422
- return true;
1423
-}
1424
-
1425
-/* Only safe to use early in boot when initialisation is single-threaded */
1426
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1427
-{
1428
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1429
-}
1430
-
1431
-#else
1432
-
1433
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1434
-{
1435
- return true;
1436
-}
1437
-static inline bool __meminit __maybe_unused
1438
-meminit_pfn_in_nid(unsigned long pfn, int node,
1439
- struct mminit_pfnnid_cache *state)
1440
-{
1441
- return true;
1442
-}
1443
-#endif
1444
-
1445
-
1446
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1731
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14471732 unsigned int order)
14481733 {
14491734 if (early_page_uninitialised(pfn))
14501735 return;
1451
- return __free_pages_boot_core(page, order);
1736
+ __free_pages_core(page, order);
14521737 }
14531738
14541739 /*
@@ -1539,14 +1824,14 @@
15391824 if (nr_pages == pageblock_nr_pages &&
15401825 (pfn & (pageblock_nr_pages - 1)) == 0) {
15411826 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1542
- __free_pages_boot_core(page, pageblock_order);
1827
+ __free_pages_core(page, pageblock_order);
15431828 return;
15441829 }
15451830
15461831 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15471832 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15481833 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1549
- __free_pages_boot_core(page, 0);
1834
+ __free_pages_core(page, 0);
15501835 }
15511836 }
15521837
@@ -1569,20 +1854,12 @@
15691854 *
15701855 * Then, we check if a current large page is valid by only checking the validity
15711856 * of the head pfn.
1572
- *
1573
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1574
- * within a node: a pfn is between start and end of a node, but does not belong
1575
- * to this memory node.
15761857 */
1577
-static inline bool __init
1578
-deferred_pfn_valid(int nid, unsigned long pfn,
1579
- struct mminit_pfnnid_cache *nid_init_state)
1858
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
15801859 {
15811860 if (!pfn_valid_within(pfn))
15821861 return false;
15831862 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1584
- return false;
1585
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
15861863 return false;
15871864 return true;
15881865 }
@@ -1591,21 +1868,19 @@
15911868 * Free pages to buddy allocator. Try to free aligned pages in
15921869 * pageblock_nr_pages sizes.
15931870 */
1594
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1871
+static void __init deferred_free_pages(unsigned long pfn,
15951872 unsigned long end_pfn)
15961873 {
1597
- struct mminit_pfnnid_cache nid_init_state = { };
15981874 unsigned long nr_pgmask = pageblock_nr_pages - 1;
15991875 unsigned long nr_free = 0;
16001876
16011877 for (; pfn < end_pfn; pfn++) {
1602
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1878
+ if (!deferred_pfn_valid(pfn)) {
16031879 deferred_free_range(pfn - nr_free, nr_free);
16041880 nr_free = 0;
16051881 } else if (!(pfn & nr_pgmask)) {
16061882 deferred_free_range(pfn - nr_free, nr_free);
16071883 nr_free = 1;
1608
- touch_nmi_watchdog();
16091884 } else {
16101885 nr_free++;
16111886 }
@@ -1619,22 +1894,22 @@
16191894 * by performing it only once every pageblock_nr_pages.
16201895 * Return number of pages initialized.
16211896 */
1622
-static unsigned long __init deferred_init_pages(int nid, int zid,
1897
+static unsigned long __init deferred_init_pages(struct zone *zone,
16231898 unsigned long pfn,
16241899 unsigned long end_pfn)
16251900 {
1626
- struct mminit_pfnnid_cache nid_init_state = { };
16271901 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1902
+ int nid = zone_to_nid(zone);
16281903 unsigned long nr_pages = 0;
1904
+ int zid = zone_idx(zone);
16291905 struct page *page = NULL;
16301906
16311907 for (; pfn < end_pfn; pfn++) {
1632
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1908
+ if (!deferred_pfn_valid(pfn)) {
16331909 page = NULL;
16341910 continue;
16351911 } else if (!page || !(pfn & nr_pgmask)) {
16361912 page = pfn_to_page(pfn);
1637
- touch_nmi_watchdog();
16381913 } else {
16391914 page++;
16401915 }
@@ -1644,18 +1919,127 @@
16441919 return (nr_pages);
16451920 }
16461921
1922
+/*
1923
+ * This function is meant to pre-load the iterator for the zone init.
1924
+ * Specifically it walks through the ranges until we are caught up to the
1925
+ * first_init_pfn value and exits there. If we never encounter the value we
1926
+ * return false indicating there are no valid ranges left.
1927
+ */
1928
+static bool __init
1929
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1930
+ unsigned long *spfn, unsigned long *epfn,
1931
+ unsigned long first_init_pfn)
1932
+{
1933
+ u64 j;
1934
+
1935
+ /*
1936
+ * Start out by walking through the ranges in this zone that have
1937
+ * already been initialized. We don't need to do anything with them
1938
+ * so we just need to flush them out of the system.
1939
+ */
1940
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1941
+ if (*epfn <= first_init_pfn)
1942
+ continue;
1943
+ if (*spfn < first_init_pfn)
1944
+ *spfn = first_init_pfn;
1945
+ *i = j;
1946
+ return true;
1947
+ }
1948
+
1949
+ return false;
1950
+}
1951
+
1952
+/*
1953
+ * Initialize and free pages. We do it in two loops: first we initialize
1954
+ * struct page, then free to buddy allocator, because while we are
1955
+ * freeing pages we can access pages that are ahead (computing buddy
1956
+ * page in __free_one_page()).
1957
+ *
1958
+ * In order to try and keep some memory in the cache we have the loop
1959
+ * broken along max page order boundaries. This way we will not cause
1960
+ * any issues with the buddy page computation.
1961
+ */
1962
+static unsigned long __init
1963
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1964
+ unsigned long *end_pfn)
1965
+{
1966
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1967
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1968
+ unsigned long nr_pages = 0;
1969
+ u64 j = *i;
1970
+
1971
+ /* First we loop through and initialize the page values */
1972
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1973
+ unsigned long t;
1974
+
1975
+ if (mo_pfn <= *start_pfn)
1976
+ break;
1977
+
1978
+ t = min(mo_pfn, *end_pfn);
1979
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1980
+
1981
+ if (mo_pfn < *end_pfn) {
1982
+ *start_pfn = mo_pfn;
1983
+ break;
1984
+ }
1985
+ }
1986
+
1987
+ /* Reset values and now loop through freeing pages as needed */
1988
+ swap(j, *i);
1989
+
1990
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1991
+ unsigned long t;
1992
+
1993
+ if (mo_pfn <= spfn)
1994
+ break;
1995
+
1996
+ t = min(mo_pfn, epfn);
1997
+ deferred_free_pages(spfn, t);
1998
+
1999
+ if (mo_pfn <= epfn)
2000
+ break;
2001
+ }
2002
+
2003
+ return nr_pages;
2004
+}
2005
+
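
deferred_init_maxorder() bounds each pass at mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES): struct pages are initialized up to that boundary first and only freed afterwards, so the buddy merging done while freeing never reads a page that has not been initialized yet, and the +1 guarantees forward progress when start_pfn is already aligned. A short runnable model of just the chunk arithmetic (MAX_ORDER_NR_PAGES here is an arbitrary power-of-two stand-in):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* arbitrary power-of-two stand-in */

/* same rounding the kernel's ALIGN() performs for a power-of-two alignment */
static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long spfn = 1000, epfn = 5000;

	while (spfn < epfn) {
		/* +1 keeps the loop moving when spfn is already aligned */
		unsigned long mo_pfn = align_up(spfn + 1, MAX_ORDER_NR_PAGES);
		unsigned long t = mo_pfn < epfn ? mo_pfn : epfn;

		printf("init+free chunk [%lu, %lu)\n", spfn, t);
		spfn = t;
	}
	return 0;
}
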
2006
+static void __init
2007
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2008
+ void *arg)
2009
+{
2010
+ unsigned long spfn, epfn;
2011
+ struct zone *zone = arg;
2012
+ u64 i;
2013
+
2014
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2015
+
2016
+ /*
2017
+ * Initialize and free pages in MAX_ORDER sized increments so that we
2018
+ * can avoid introducing any issues with the buddy allocator.
2019
+ */
2020
+ while (spfn < end_pfn) {
2021
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
2022
+ cond_resched();
2023
+ }
2024
+}
2025
+
2026
+/* An arch may override for more concurrency. */
2027
+__weak int __init
2028
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2029
+{
2030
+ return 1;
2031
+}
2032
+
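
The __weak stub above keeps memmap initialization single-threaded per node unless an architecture overrides it. Purely as an illustration of the hook, not a claim about any particular architecture's implementation, an override that lets padata use every CPU local to the node could look like this:

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* hypothetical arch override, matching the __weak prototype above */
int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	/* use every CPU on the node, but never report fewer than one */
	return max_t(int, cpumask_weight(node_cpumask), 1);
}
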
16472033 /* Initialise remaining memory on a node */
16482034 static int __init deferred_init_memmap(void *data)
16492035 {
16502036 pg_data_t *pgdat = data;
1651
- int nid = pgdat->node_id;
1652
- unsigned long start = jiffies;
1653
- unsigned long nr_pages = 0;
1654
- unsigned long spfn, epfn, first_init_pfn, flags;
1655
- phys_addr_t spa, epa;
1656
- int zid;
1657
- struct zone *zone;
16582037 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2038
+ unsigned long spfn = 0, epfn = 0;
2039
+ unsigned long first_init_pfn, flags;
2040
+ unsigned long start = jiffies;
2041
+ struct zone *zone;
2042
+ int zid, max_threads;
16592043 u64 i;
16602044
16612045 /* Bind memory initialisation thread to a local node if possible */
....@@ -1688,30 +2072,36 @@
16882072 if (first_init_pfn < zone_end_pfn(zone))
16892073 break;
16902074 }
1691
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
16922075
1693
- /*
1694
- * Initialize and free pages. We do it in two loops: first we initialize
1695
- * struct page, than free to buddy allocator, because while we are
1696
- * freeing pages we can access pages that are ahead (computing buddy
1697
- * page in __free_one_page()).
1698
- */
1699
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1700
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1701
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1702
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1703
- }
1704
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1705
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1706
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1707
- deferred_free_pages(nid, zid, spfn, epfn);
1708
- }
2076
+ /* If the zone is empty somebody else may have cleared out the zone */
2077
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2078
+ first_init_pfn))
2079
+ goto zone_empty;
17092080
2081
+ max_threads = deferred_page_init_max_threads(cpumask);
2082
+
2083
+ while (spfn < epfn) {
2084
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2085
+ struct padata_mt_job job = {
2086
+ .thread_fn = deferred_init_memmap_chunk,
2087
+ .fn_arg = zone,
2088
+ .start = spfn,
2089
+ .size = epfn_align - spfn,
2090
+ .align = PAGES_PER_SECTION,
2091
+ .min_chunk = PAGES_PER_SECTION,
2092
+ .max_threads = max_threads,
2093
+ };
2094
+
2095
+ padata_do_multithreaded(&job);
2096
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2097
+ epfn_align);
2098
+ }
2099
+zone_empty:
17102100 /* Sanity check that the next zone really is unpopulated */
17112101 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17122102
1713
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1714
- jiffies_to_msecs(jiffies - start));
2103
+ pr_info("node %d deferred pages initialised in %ums\n",
2104
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17152105
17162106 pgdat_init_report_one_done();
17172107 return 0;
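
deferred_init_memmap() now feeds padata_do_multithreaded() section-aligned jobs: epfn_align rounds the current end up to PAGES_PER_SECTION, the job's align/min_chunk fields keep every worker's slice on section boundaries, and deferred_init_memmap_chunk() re-derives its own iterator state for whatever sub-range it is given. The sketch below shows one plausible way such a job could be carved up among worker threads; it is not padata's actual splitting policy, and SECTION, the range and the worker body are stand-ins:

#include <pthread.h>
#include <stdio.h>

#define SECTION 256UL		/* stand-in for PAGES_PER_SECTION */
#define MAX_THREADS 4

struct chunk { unsigned long start, end; };

static void *worker(void *arg)
{
	struct chunk *c = arg;

	/* stands in for deferred_init_memmap_chunk(start, end, zone) */
	printf("thread inits [%lu, %lu)\n", c->start, c->end);
	return NULL;
}

int main(void)
{
	unsigned long start = 0, size = 4096;
	unsigned long per, nthreads = MAX_THREADS;
	struct chunk chunks[MAX_THREADS];
	pthread_t tids[MAX_THREADS];
	unsigned long i;

	/* carve the range into section-aligned pieces, one per thread */
	per = ((size / nthreads + SECTION - 1) / SECTION) * SECTION;
	for (i = 0; i < nthreads; i++) {
		chunks[i].start = start + i * per;
		chunks[i].end = chunks[i].start + per;
		if (chunks[i].start > start + size)
			chunks[i].start = start + size;
		if (chunks[i].end > start + size)
			chunks[i].end = start + size;
		pthread_create(&tids[i], NULL, worker, &chunks[i]);
	}
	for (i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);
	return 0;
}
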
....@@ -1735,14 +2125,11 @@
17352125 static noinline bool __init
17362126 deferred_grow_zone(struct zone *zone, unsigned int order)
17372127 {
1738
- int zid = zone_idx(zone);
1739
- int nid = zone_to_nid(zone);
1740
- pg_data_t *pgdat = NODE_DATA(nid);
17412128 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1742
- unsigned long nr_pages = 0;
1743
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2129
+ pg_data_t *pgdat = zone->zone_pgdat;
17442130 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1745
- phys_addr_t spa, epa;
2131
+ unsigned long spfn, epfn, flags;
2132
+ unsigned long nr_pages = 0;
17462133 u64 i;
17472134
17482135 /* Only the last zone may have deferred pages */
....@@ -1760,38 +2147,37 @@
17602147 return true;
17612148 }
17622149
1763
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1764
-
1765
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2150
+ /* If the zone is empty somebody else may have cleared out the zone */
2151
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2152
+ first_deferred_pfn)) {
2153
+ pgdat->first_deferred_pfn = ULONG_MAX;
17662154 pgdat_resize_unlock(pgdat, &flags);
1767
- return false;
2155
+ /* Retry only once. */
2156
+ return first_deferred_pfn != ULONG_MAX;
17682157 }
17692158
1770
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1771
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1772
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2159
+ /*
2160
+ * Initialize and free pages in MAX_ORDER sized increments so
2161
+ * that we can avoid introducing any issues with the buddy
2162
+ * allocator.
2163
+ */
2164
+ while (spfn < epfn) {
2165
+ /* update our first deferred PFN for this section */
2166
+ first_deferred_pfn = spfn;
17732167
1774
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1775
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1776
- first_deferred_pfn = min(t, epfn);
1777
- nr_pages += deferred_init_pages(nid, zid, spfn,
1778
- first_deferred_pfn);
1779
- spfn = first_deferred_pfn;
1780
- }
2168
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2169
+ touch_nmi_watchdog();
17812170
2171
+ /* We should only stop along section boundaries */
2172
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2173
+ continue;
2174
+
2175
+ /* If our quota has been met we can stop here */
17822176 if (nr_pages >= nr_pages_needed)
17832177 break;
17842178 }
17852179
1786
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1787
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1788
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1789
- deferred_free_pages(nid, zid, spfn, epfn);
1790
-
1791
- if (first_deferred_pfn == epfn)
1792
- break;
1793
- }
1794
- pgdat->first_deferred_pfn = first_deferred_pfn;
2180
+ pgdat->first_deferred_pfn = spfn;
17952181 pgdat_resize_unlock(pgdat, &flags);
17962182
17972183 return nr_pages > 0;
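
deferred_grow_zone() above only stops on section boundaries, using (first_deferred_pfn ^ spfn) < PAGES_PER_SECTION as a cheap same-section test: for a power-of-two section size the XOR is below PAGES_PER_SECTION exactly when both pfns agree on every bit at or above the section shift, i.e. when they fall in the same section. A runnable check of that equivalence (the section size used here is a stand-in value):

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION 32768UL	/* must be a power of two */

int main(void)
{
	unsigned long a, b;

	for (a = 0; a < 4 * PAGES_PER_SECTION; a += 777)
		for (b = 0; b < 4 * PAGES_PER_SECTION; b += 333) {
			int same_xor = (a ^ b) < PAGES_PER_SECTION;
			int same_div = (a / PAGES_PER_SECTION) ==
				       (b / PAGES_PER_SECTION);

			assert(same_xor == same_div);
		}

	printf("XOR test matches the same-section test\n");
	return 0;
}
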
....@@ -1814,9 +2200,9 @@
18142200 void __init page_alloc_init_late(void)
18152201 {
18162202 struct zone *zone;
2203
+ int nid;
18172204
18182205 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1819
- int nid;
18202206
18212207 /* There will be num_node_state(N_MEMORY) threads */
18222208 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1844,10 +2230,12 @@
18442230 /* Reinit limits that are based on free pages after the kernel is up */
18452231 files_maxfiles_init();
18462232 #endif
1847
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2233
+
18482234 /* Discard memblock private memory */
18492235 memblock_discard();
1850
-#endif
2236
+
2237
+ for_each_node_state(nid, N_MEMORY)
2238
+ shuffle_free_memory(NODE_DATA(nid));
18512239
18522240 for_each_populated_zone(zone)
18532241 set_zone_contiguous(zone);
....@@ -1881,6 +2269,7 @@
18812269 }
18822270
18832271 adjust_managed_page_count(page, pageblock_nr_pages);
2272
+ page_zone(page)->cma_pages += pageblock_nr_pages;
18842273 }
18852274 #endif
18862275
....@@ -1899,13 +2288,11 @@
18992288 * -- nyc
19002289 */
19012290 static inline void expand(struct zone *zone, struct page *page,
1902
- int low, int high, struct free_area *area,
1903
- int migratetype)
2291
+ int low, int high, int migratetype)
19042292 {
19052293 unsigned long size = 1 << high;
19062294
19072295 while (high > low) {
1908
- area--;
19092296 high--;
19102297 size >>= 1;
19112298 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1919,39 +2306,21 @@
19192306 if (set_page_guard(zone, &page[size], high, migratetype))
19202307 continue;
19212308
1922
- list_add(&page[size].lru, &area->free_list[migratetype]);
1923
- area->nr_free++;
1924
- set_page_order(&page[size], high);
2309
+ add_to_free_list(&page[size], zone, high, migratetype);
2310
+ set_buddy_order(&page[size], high);
19252311 }
19262312 }
19272313
19282314 static void check_new_page_bad(struct page *page)
19292315 {
1930
- const char *bad_reason = NULL;
1931
- unsigned long bad_flags = 0;
1932
-
1933
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1934
- bad_reason = "nonzero mapcount";
1935
- if (unlikely(page->mapping != NULL))
1936
- bad_reason = "non-NULL mapping";
1937
- if (unlikely(page_ref_count(page) != 0))
1938
- bad_reason = "nonzero _count";
19392316 if (unlikely(page->flags & __PG_HWPOISON)) {
1940
- bad_reason = "HWPoisoned (hardware-corrupted)";
1941
- bad_flags = __PG_HWPOISON;
19422317 /* Don't complain about hwpoisoned pages */
19432318 page_mapcount_reset(page); /* remove PageBuddy */
19442319 return;
19452320 }
1946
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1947
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1948
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1949
- }
1950
-#ifdef CONFIG_MEMCG
1951
- if (unlikely(page->mem_cgroup))
1952
- bad_reason = "page still charged to cgroup";
1953
-#endif
1954
- bad_page(page, bad_reason, bad_flags);
2321
+
2322
+ bad_page(page,
2323
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19552324 }
19562325
19572326 /*
....@@ -1967,30 +2336,40 @@
19672336 return 1;
19682337 }
19692338
1970
-static inline bool free_pages_prezeroed(void)
1971
-{
1972
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1973
- page_poisoning_enabled()) || want_init_on_free();
1974
-}
1975
-
19762339 #ifdef CONFIG_DEBUG_VM
1977
-static bool check_pcp_refill(struct page *page)
2340
+/*
2341
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2342
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2343
+ * also checked when pcp lists are refilled from the free lists.
2344
+ */
2345
+static inline bool check_pcp_refill(struct page *page)
19782346 {
1979
- return false;
2347
+ if (debug_pagealloc_enabled_static())
2348
+ return check_new_page(page);
2349
+ else
2350
+ return false;
19802351 }
19812352
1982
-static bool check_new_pcp(struct page *page)
2353
+static inline bool check_new_pcp(struct page *page)
19832354 {
19842355 return check_new_page(page);
19852356 }
19862357 #else
1987
-static bool check_pcp_refill(struct page *page)
2358
+/*
2359
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2360
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2361
+ * enabled, they are also checked when being allocated from the pcp lists.
2362
+ */
2363
+static inline bool check_pcp_refill(struct page *page)
19882364 {
19892365 return check_new_page(page);
19902366 }
1991
-static bool check_new_pcp(struct page *page)
2367
+static inline bool check_new_pcp(struct page *page)
19922368 {
1993
- return false;
2369
+ if (debug_pagealloc_enabled_static())
2370
+ return check_new_page(page);
2371
+ else
2372
+ return false;
19942373 }
19952374 #endif /* CONFIG_DEBUG_VM */
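
The check_pcp_refill()/check_new_pcp() hunks above make the order-0 sanity checks symmetric: with CONFIG_DEBUG_VM the page is always checked at allocation time and additionally at pcp refill when debug_pagealloc is enabled; without CONFIG_DEBUG_VM it is always checked at refill and additionally at allocation when debug_pagealloc is enabled. A tiny runnable truth table of that policy:

#include <stdbool.h>
#include <stdio.h>

/* which of the two order-0 checks fire, per the comments in the hunk above */
static void show(bool debug_vm, bool debug_pagealloc)
{
	bool check_at_refill = debug_vm ? debug_pagealloc : true;
	bool check_at_alloc  = debug_vm ? true : debug_pagealloc;

	printf("DEBUG_VM=%d debug_pagealloc=%d -> refill:%d alloc:%d\n",
	       debug_vm, debug_pagealloc, check_at_refill, check_at_alloc);
}

int main(void)
{
	for (int vm = 0; vm <= 1; vm++)
		for (int pa = 0; pa <= 1; pa++)
			show(vm, pa);
	return 0;
}
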
19962375
....@@ -2014,9 +2393,31 @@
20142393 set_page_refcounted(page);
20152394
20162395 arch_alloc_page(page, order);
2017
- kernel_map_pages(page, 1 << order, 1);
2018
- kasan_alloc_pages(page, order);
2019
- kernel_poison_pages(page, 1 << order, 1);
2396
+ debug_pagealloc_map_pages(page, 1 << order);
2397
+
2398
+ /*
2399
+ * Page unpoisoning must happen before memory initialization.
2400
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2401
+ * allocations and the page unpoisoning code will complain.
2402
+ */
2403
+ kernel_unpoison_pages(page, 1 << order);
2404
+
2405
+ /*
2406
+ * As memory initialization might be integrated into KASAN,
2407
+ * kasan_alloc_pages and kernel_init_free_pages must be
2408
+ * kept together to avoid discrepancies in behavior.
2409
+ */
2410
+ if (kasan_has_integrated_init()) {
2411
+ kasan_alloc_pages(page, order, gfp_flags);
2412
+ } else {
2413
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2414
+
2415
+ kasan_unpoison_pages(page, order, init);
2416
+ if (init)
2417
+ kernel_init_free_pages(page, 1 << order,
2418
+ gfp_flags & __GFP_ZEROTAGS);
2419
+ }
2420
+
20202421 set_page_owner(page, order, gfp_flags);
20212422 }
20222423
....@@ -2024,9 +2425,6 @@
20242425 unsigned int alloc_flags)
20252426 {
20262427 post_alloc_hook(page, order, gfp_flags);
2027
-
2028
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2029
- kernel_init_free_pages(page, 1 << order);
20302428
20312429 if (order && (gfp_flags & __GFP_COMP))
20322430 prep_compound_page(page, order);
....@@ -2041,6 +2439,7 @@
20412439 set_page_pfmemalloc(page);
20422440 else
20432441 clear_page_pfmemalloc(page);
2442
+ trace_android_vh_test_clear_look_around_ref(page);
20442443 }
20452444
20462445 /*
....@@ -2058,14 +2457,11 @@
20582457 /* Find a page of the appropriate size in the preferred list */
20592458 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20602459 area = &(zone->free_area[current_order]);
2061
- page = list_first_entry_or_null(&area->free_list[migratetype],
2062
- struct page, lru);
2460
+ page = get_page_from_free_area(area, migratetype);
20632461 if (!page)
20642462 continue;
2065
- list_del(&page->lru);
2066
- rmv_page_order(page);
2067
- area->nr_free--;
2068
- expand(zone, page, order, current_order, area, migratetype);
2463
+ del_page_from_free_list(page, zone, current_order);
2464
+ expand(zone, page, order, current_order, migratetype);
20692465 set_pcppage_migratetype(page, migratetype);
20702466 return page;
20712467 }
....@@ -2078,10 +2474,10 @@
20782474 * This array describes the order lists are fallen back to when
20792475 * the free lists for the desirable migrate type are depleted
20802476 */
2081
-static int fallbacks[MIGRATE_TYPES][4] = {
2477
+static int fallbacks[MIGRATE_TYPES][3] = {
20822478 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2083
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20842479 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2480
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20852481 #ifdef CONFIG_CMA
20862482 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
20872483 #endif
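
The reordered fallbacks[] table above is what find_suitable_fallback() walks: starting from the requested migratetype's row, the allocator takes the first fallback type whose free area is not empty and stops at the MIGRATE_TYPES sentinel. A simplified runnable model of that walk; the enum covers only the pcp migratetypes and is not the kernel's full definition, and the free-list state is made up:

#include <stdbool.h>
#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NTYPES };	/* simplified */

static const int fallbacks[NTYPES][3] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE, NTYPES },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, NTYPES },
	[RECLAIMABLE] = { UNMOVABLE, MOVABLE, NTYPES },
};

/* pretend only the reclaimable free list has pages at this order */
static bool free_list_empty(int mt)
{
	return mt != RECLAIMABLE;
}

static int find_fallback(int start_mt)
{
	for (int i = 0; ; i++) {
		int mt = fallbacks[start_mt][i];

		if (mt == NTYPES)
			return -1;		/* nothing to steal */
		if (!free_list_empty(mt))
			return mt;
	}
}

static const char *const names[] = { "unmovable", "movable", "reclaimable" };

int main(void)
{
	int mt = find_fallback(UNMOVABLE);

	printf("unmovable request falls back to: %s\n",
	       mt >= 0 ? names[mt] : "none");
	return 0;
}
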
....@@ -2102,7 +2498,7 @@
21022498 #endif
21032499
21042500 /*
2105
- * Move the free pages in a range to the free lists of the requested type.
2501
+ * Move the free pages in a range to the freelist tail of the requested type.
21062502 * Note that start_page and end_pages are not aligned on a pageblock
21072503 * boundary. If alignment is required, use move_freepages_block()
21082504 */
....@@ -2114,30 +2510,11 @@
21142510 unsigned int order;
21152511 int pages_moved = 0;
21162512
2117
-#ifndef CONFIG_HOLES_IN_ZONE
2118
- /*
2119
- * page_zone is not safe to call in this context when
2120
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2121
- * anyway as we check zone boundaries in move_freepages_block().
2122
- * Remove at a later date when no bug reports exist related to
2123
- * grouping pages by mobility
2124
- */
2125
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2126
- pfn_valid(page_to_pfn(end_page)) &&
2127
- page_zone(start_page) != page_zone(end_page));
2128
-#endif
2129
-
2130
- if (num_movable)
2131
- *num_movable = 0;
2132
-
21332513 for (page = start_page; page <= end_page;) {
21342514 if (!pfn_valid_within(page_to_pfn(page))) {
21352515 page++;
21362516 continue;
21372517 }
2138
-
2139
- /* Make sure we are not inadvertently changing nodes */
2140
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21412518
21422519 if (!PageBuddy(page)) {
21432520 /*
....@@ -2153,9 +2530,12 @@
21532530 continue;
21542531 }
21552532
2156
- order = page_order(page);
2157
- list_move(&page->lru,
2158
- &zone->free_area[order].free_list[migratetype]);
2533
+ /* Make sure we are not inadvertently changing nodes */
2534
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2535
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2536
+
2537
+ order = buddy_order(page);
2538
+ move_to_free_list(page, zone, order, migratetype);
21592539 page += 1 << order;
21602540 pages_moved += 1 << order;
21612541 }
....@@ -2168,6 +2548,9 @@
21682548 {
21692549 unsigned long start_pfn, end_pfn;
21702550 struct page *start_page, *end_page;
2551
+
2552
+ if (num_movable)
2553
+ *num_movable = 0;
21712554
21722555 start_pfn = page_to_pfn(page);
21732556 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2229,6 +2612,43 @@
22292612 return false;
22302613 }
22312614
2615
+static inline bool boost_watermark(struct zone *zone)
2616
+{
2617
+ unsigned long max_boost;
2618
+
2619
+ if (!watermark_boost_factor)
2620
+ return false;
2621
+ /*
2622
+ * Don't bother in zones that are unlikely to produce results.
2623
+ * On small machines, including kdump capture kernels running
2624
+ * in a small area, boosting the watermark can cause an out of
2625
+ * memory situation immediately.
2626
+ */
2627
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2628
+ return false;
2629
+
2630
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2631
+ watermark_boost_factor, 10000);
2632
+
2633
+ /*
2634
+ * high watermark may be uninitialised if fragmentation occurs
2635
+ * very early in boot so do not boost. We do not fall
2636
+ * through and boost by pageblock_nr_pages as failing
2637
+ * allocations that early means that reclaim is not going
2638
+ * to help and it may even be impossible to reclaim the
2639
+ * boosted watermark resulting in a hang.
2640
+ */
2641
+ if (!max_boost)
2642
+ return false;
2643
+
2644
+ max_boost = max(pageblock_nr_pages, max_boost);
2645
+
2646
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2647
+ max_boost);
2648
+
2649
+ return true;
2650
+}
2651
+
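
boost_watermark() above caps the boost at watermark_boost_factor/10000 of the high watermark, computed with the overflow-safe mult_frac(), and then raises watermark_boost by one pageblock each time the fallback path calls it, clamped to that cap. A runnable model of the arithmetic; the high watermark, the assumed default factor of 15000 and the 512-page pageblock are illustrative stand-ins:

#include <stdio.h>

/* same computation as the kernel's mult_frac(): avoids x * numer overflow */
static unsigned long mult_frac(unsigned long x, unsigned long numer,
			       unsigned long denom)
{
	unsigned long quot = x / denom;
	unsigned long rem = x % denom;

	return quot * numer + (rem * numer) / denom;
}

int main(void)
{
	unsigned long high_wmark = 12800;	/* pages; illustrative */
	unsigned long boost_factor = 15000;	/* assumed default, 150% */
	unsigned long pageblock_nr_pages = 512;	/* 2MB blocks with 4K pages */
	unsigned long boost = 0, max_boost;

	max_boost = mult_frac(high_wmark, boost_factor, 10000);	/* 19200 */
	if (max_boost < pageblock_nr_pages)
		max_boost = pageblock_nr_pages;

	/* each fallback event adds one pageblock, clamped to max_boost */
	for (int event = 0; event < 50; event++) {
		boost += pageblock_nr_pages;
		if (boost > max_boost)
			boost = max_boost;
	}
	printf("watermark_boost settles at %lu pages (cap %lu)\n",
	       boost, max_boost);
	return 0;
}
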
22322652 /*
22332653 * This function implements actual steal behaviour. If order is large enough,
22342654 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2238,10 +2658,9 @@
22382658 * itself, so pages freed in the future will be put on the correct free list.
22392659 */
22402660 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2241
- int start_type, bool whole_block)
2661
+ unsigned int alloc_flags, int start_type, bool whole_block)
22422662 {
2243
- unsigned int current_order = page_order(page);
2244
- struct free_area *area;
2663
+ unsigned int current_order = buddy_order(page);
22452664 int free_pages, movable_pages, alike_pages;
22462665 int old_block_type;
22472666
....@@ -2259,6 +2678,14 @@
22592678 change_pageblock_range(page, current_order, start_type);
22602679 goto single_page;
22612680 }
2681
+
2682
+ /*
2683
+ * Boost watermarks to increase reclaim pressure to reduce the
2684
+ * likelihood of future fallbacks. Wake kswapd now as the node
2685
+ * may be balanced overall and kswapd will not wake naturally.
2686
+ */
2687
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2688
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22622689
22632690 /* We are not allowed to try stealing from the whole block */
22642691 if (!whole_block)
....@@ -2303,8 +2730,7 @@
23032730 return;
23042731
23052732 single_page:
2306
- area = &zone->free_area[current_order];
2307
- list_move(&page->lru, &area->free_list[start_type]);
2733
+ move_to_free_list(page, zone, current_order, start_type);
23082734 }
23092735
23102736 /*
....@@ -2328,7 +2754,7 @@
23282754 if (fallback_mt == MIGRATE_TYPES)
23292755 break;
23302756
2331
- if (list_empty(&area->free_list[fallback_mt]))
2757
+ if (free_area_empty(area, fallback_mt))
23322758 continue;
23332759
23342760 if (can_steal_fallback(order, migratetype))
....@@ -2358,7 +2784,7 @@
23582784 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23592785 * Check is race-prone but harmless.
23602786 */
2361
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2787
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23622788 if (zone->nr_reserved_highatomic >= max_managed)
23632789 return;
23642790
....@@ -2401,7 +2827,7 @@
24012827 int order;
24022828 bool ret;
24032829
2404
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2830
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24052831 ac->nodemask) {
24062832 /*
24072833 * Preserve at least one pageblock unless memory pressure
....@@ -2415,9 +2841,7 @@
24152841 for (order = 0; order < MAX_ORDER; order++) {
24162842 struct free_area *area = &(zone->free_area[order]);
24172843
2418
- page = list_first_entry_or_null(
2419
- &area->free_list[MIGRATE_HIGHATOMIC],
2420
- struct page, lru);
2844
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24212845 if (!page)
24222846 continue;
24232847
....@@ -2475,20 +2899,30 @@
24752899 * condition simpler.
24762900 */
24772901 static __always_inline bool
2478
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2902
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2903
+ unsigned int alloc_flags)
24792904 {
24802905 struct free_area *area;
24812906 int current_order;
2907
+ int min_order = order;
24822908 struct page *page;
24832909 int fallback_mt;
24842910 bool can_steal;
2911
+
2912
+ /*
2913
+ * Do not steal pages from freelists belonging to other pageblocks
2914
+ * i.e. orders < pageblock_order. If there are no local zones free,
2915
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2916
+ */
2917
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2918
+ min_order = pageblock_order;
24852919
24862920 /*
24872921 * Find the largest available free page in the other list. This roughly
24882922 * approximates finding the pageblock with the most free pages, which
24892923 * would be too costly to do exactly.
24902924 */
2491
- for (current_order = MAX_ORDER - 1; current_order >= order;
2925
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
24922926 --current_order) {
24932927 area = &(zone->free_area[current_order]);
24942928 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2530,10 +2964,10 @@
25302964 VM_BUG_ON(current_order == MAX_ORDER);
25312965
25322966 do_steal:
2533
- page = list_first_entry(&area->free_list[fallback_mt],
2534
- struct page, lru);
2967
+ page = get_page_from_free_area(area, fallback_mt);
25352968
2536
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2969
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2970
+ can_steal);
25372971
25382972 trace_mm_page_alloc_extfrag(page, order, current_order,
25392973 start_migratetype, fallback_mt);
....@@ -2547,14 +2981,16 @@
25472981 * Call me with the zone->lock already held.
25482982 */
25492983 static __always_inline struct page *
2550
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2984
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2985
+ unsigned int alloc_flags)
25512986 {
25522987 struct page *page;
25532988
25542989 retry:
25552990 page = __rmqueue_smallest(zone, order, migratetype);
25562991
2557
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2992
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2993
+ alloc_flags))
25582994 goto retry;
25592995
25602996 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2562,18 +2998,18 @@
25622998 }
25632999
25643000 #ifdef CONFIG_CMA
2565
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3001
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3002
+ int migratetype,
3003
+ unsigned int alloc_flags)
25663004 {
2567
- struct page *page = 0;
2568
-
2569
- if (IS_ENABLED(CONFIG_CMA))
2570
- if (!zone->cma_alloc)
2571
- page = __rmqueue_cma_fallback(zone, order);
3005
+ struct page *page = __rmqueue_cma_fallback(zone, order);
25723006 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
25733007 return page;
25743008 }
25753009 #else
2576
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3010
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3011
+ int migratetype,
3012
+ unsigned int alloc_flags)
25773013 {
25783014 return NULL;
25793015 }
....@@ -2586,7 +3022,7 @@
25863022 */
25873023 static int rmqueue_bulk(struct zone *zone, unsigned int order,
25883024 unsigned long count, struct list_head *list,
2589
- int migratetype)
3025
+ int migratetype, unsigned int alloc_flags)
25903026 {
25913027 int i, alloced = 0;
25923028
....@@ -2594,15 +3030,11 @@
25943030 for (i = 0; i < count; ++i) {
25953031 struct page *page;
25963032
2597
- /*
2598
- * If migrate type CMA is being requested only try to
2599
- * satisfy the request with CMA pages to try and increase
2600
- * CMA utlization.
2601
- */
26023033 if (is_migrate_cma(migratetype))
2603
- page = __rmqueue_cma(zone, order);
3034
+ page = __rmqueue_cma(zone, order, migratetype,
3035
+ alloc_flags);
26043036 else
2605
- page = __rmqueue(zone, order, migratetype);
3037
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26063038
26073039 if (unlikely(page == NULL))
26083040 break;
....@@ -2645,14 +3077,14 @@
26453077 */
26463078 static struct list_head *get_populated_pcp_list(struct zone *zone,
26473079 unsigned int order, struct per_cpu_pages *pcp,
2648
- int migratetype)
3080
+ int migratetype, unsigned int alloc_flags)
26493081 {
26503082 struct list_head *list = &pcp->lists[migratetype];
26513083
26523084 if (list_empty(list)) {
26533085 pcp->count += rmqueue_bulk(zone, order,
26543086 pcp->batch, list,
2655
- migratetype);
3087
+ migratetype, alloc_flags);
26563088
26573089 if (list_empty(list))
26583090 list = NULL;
....@@ -2673,13 +3105,18 @@
26733105 {
26743106 unsigned long flags;
26753107 int to_drain, batch;
3108
+ LIST_HEAD(dst);
26763109
2677
- local_irq_save(flags);
3110
+ local_lock_irqsave(&pa_lock.l, flags);
26783111 batch = READ_ONCE(pcp->batch);
26793112 to_drain = min(pcp->count, batch);
26803113 if (to_drain > 0)
2681
- free_pcppages_bulk(zone, to_drain, pcp);
2682
- local_irq_restore(flags);
3114
+ isolate_pcp_pages(to_drain, pcp, &dst);
3115
+
3116
+ local_unlock_irqrestore(&pa_lock.l, flags);
3117
+
3118
+ if (to_drain > 0)
3119
+ free_pcppages_bulk(zone, &dst, false);
26833120 }
26843121 #endif
26853122
....@@ -2695,14 +3132,21 @@
26953132 unsigned long flags;
26963133 struct per_cpu_pageset *pset;
26973134 struct per_cpu_pages *pcp;
3135
+ LIST_HEAD(dst);
3136
+ int count;
26983137
2699
- local_irq_save(flags);
3138
+ local_lock_irqsave(&pa_lock.l, flags);
27003139 pset = per_cpu_ptr(zone->pageset, cpu);
27013140
27023141 pcp = &pset->pcp;
2703
- if (pcp->count)
2704
- free_pcppages_bulk(zone, pcp->count, pcp);
2705
- local_irq_restore(flags);
3142
+ count = pcp->count;
3143
+ if (count)
3144
+ isolate_pcp_pages(count, pcp, &dst);
3145
+
3146
+ local_unlock_irqrestore(&pa_lock.l, flags);
3147
+
3148
+ if (count)
3149
+ free_pcppages_bulk(zone, &dst, false);
27063150 }
27073151
27083152 /*
....@@ -2739,6 +3183,10 @@
27393183
27403184 static void drain_local_pages_wq(struct work_struct *work)
27413185 {
3186
+ struct pcpu_drain *drain;
3187
+
3188
+ drain = container_of(work, struct pcpu_drain, work);
3189
+
27423190 /*
27433191 * drain_all_pages doesn't use proper cpu hotplug protection so
27443192 * we can race with cpu offline when the WQ can move this from
....@@ -2746,9 +3194,9 @@
27463194 * cpu which is allright but we also have to make sure to not move to
27473195 * a different one.
27483196 */
2749
- preempt_disable();
2750
- drain_local_pages(NULL);
2751
- preempt_enable();
3197
+ migrate_disable();
3198
+ drain_local_pages(drain->zone);
3199
+ migrate_enable();
27523200 }
27533201
27543202 /*
....@@ -2818,12 +3266,14 @@
28183266 }
28193267
28203268 for_each_cpu(cpu, &cpus_with_pcps) {
2821
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2822
- INIT_WORK(work, drain_local_pages_wq);
2823
- queue_work_on(cpu, mm_percpu_wq, work);
3269
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3270
+
3271
+ drain->zone = zone;
3272
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3273
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28243274 }
28253275 for_each_cpu(cpu, &cpus_with_pcps)
2826
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
3276
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28273277
28283278 mutex_unlock(&pcpu_drain_mutex);
28293279 }
....@@ -2895,11 +3345,13 @@
28953345 return true;
28963346 }
28973347
2898
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
3348
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
3349
+ struct list_head *dst)
28993350 {
29003351 struct zone *zone = page_zone(page);
29013352 struct per_cpu_pages *pcp;
29023353 int migratetype;
3354
+ bool pcp_skip_cma_pages = false;
29033355
29043356 migratetype = get_pcppage_migratetype(page);
29053357 __count_vm_event(PGFREE);
....@@ -2912,8 +3364,12 @@
29123364 * excessively into the page allocator
29133365 */
29143366 if (migratetype >= MIGRATE_PCPTYPES) {
2915
- if (unlikely(is_migrate_isolate(migratetype))) {
2916
- free_one_page(zone, page, pfn, 0, migratetype);
3367
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3368
+ &pcp_skip_cma_pages);
3369
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3370
+ pcp_skip_cma_pages) {
3371
+ free_one_page(zone, page, pfn, 0, migratetype,
3372
+ FPI_NONE);
29173373 return;
29183374 }
29193375 migratetype = MIGRATE_MOVABLE;
....@@ -2924,7 +3380,8 @@
29243380 pcp->count++;
29253381 if (pcp->count >= pcp->high) {
29263382 unsigned long batch = READ_ONCE(pcp->batch);
2927
- free_pcppages_bulk(zone, batch, pcp);
3383
+
3384
+ isolate_pcp_pages(batch, pcp, dst);
29283385 }
29293386 }
29303387
....@@ -2935,13 +3392,17 @@
29353392 {
29363393 unsigned long flags;
29373394 unsigned long pfn = page_to_pfn(page);
3395
+ struct zone *zone = page_zone(page);
3396
+ LIST_HEAD(dst);
29383397
29393398 if (!free_unref_page_prepare(page, pfn))
29403399 return;
29413400
2942
- local_irq_save(flags);
2943
- free_unref_page_commit(page, pfn);
2944
- local_irq_restore(flags);
3401
+ local_lock_irqsave(&pa_lock.l, flags);
3402
+ free_unref_page_commit(page, pfn, &dst);
3403
+ local_unlock_irqrestore(&pa_lock.l, flags);
3404
+ if (!list_empty(&dst))
3405
+ free_pcppages_bulk(zone, &dst, false);
29453406 }
29463407
29473408 /*
....@@ -2952,6 +3413,11 @@
29523413 struct page *page, *next;
29533414 unsigned long flags, pfn;
29543415 int batch_count = 0;
3416
+ struct list_head dsts[__MAX_NR_ZONES];
3417
+ int i;
3418
+
3419
+ for (i = 0; i < __MAX_NR_ZONES; i++)
3420
+ INIT_LIST_HEAD(&dsts[i]);
29553421
29563422 /* Prepare pages for freeing */
29573423 list_for_each_entry_safe(page, next, list, lru) {
....@@ -2961,25 +3427,42 @@
29613427 set_page_private(page, pfn);
29623428 }
29633429
2964
- local_irq_save(flags);
3430
+ local_lock_irqsave(&pa_lock.l, flags);
29653431 list_for_each_entry_safe(page, next, list, lru) {
29663432 unsigned long pfn = page_private(page);
3433
+ enum zone_type type;
29673434
29683435 set_page_private(page, 0);
29693436 trace_mm_page_free_batched(page);
2970
- free_unref_page_commit(page, pfn);
3437
+ type = page_zonenum(page);
3438
+ free_unref_page_commit(page, pfn, &dsts[type]);
29713439
29723440 /*
29733441 * Guard against excessive IRQ disabled times when we get
29743442 * a large list of pages to free.
29753443 */
29763444 if (++batch_count == SWAP_CLUSTER_MAX) {
2977
- local_irq_restore(flags);
3445
+ local_unlock_irqrestore(&pa_lock.l, flags);
29783446 batch_count = 0;
2979
- local_irq_save(flags);
3447
+ local_lock_irqsave(&pa_lock.l, flags);
29803448 }
29813449 }
2982
- local_irq_restore(flags);
3450
+ local_unlock_irqrestore(&pa_lock.l, flags);
3451
+
3452
+ for (i = 0; i < __MAX_NR_ZONES; ) {
3453
+ struct page *page;
3454
+ struct zone *zone;
3455
+
3456
+ if (list_empty(&dsts[i])) {
3457
+ i++;
3458
+ continue;
3459
+ }
3460
+
3461
+ page = list_first_entry(&dsts[i], struct page, lru);
3462
+ zone = page_zone(page);
3463
+
3464
+ free_pcppages_bulk(zone, &dsts[i], true);
3465
+ }
29833466 }
29843467
29853468 /*
....@@ -2999,7 +3482,8 @@
29993482
30003483 for (i = 1; i < (1 << order); i++)
30013484 set_page_refcounted(page + i);
3002
- split_page_owner(page, order);
3485
+ split_page_owner(page, 1 << order);
3486
+ split_page_memcg(page, 1 << order);
30033487 }
30043488 EXPORT_SYMBOL_GPL(split_page);
30053489
....@@ -3021,7 +3505,7 @@
30213505 * watermark, because we already know our high-order page
30223506 * exists.
30233507 */
3024
- watermark = min_wmark_pages(zone) + (1UL << order);
3508
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
30253509 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
30263510 return 0;
30273511
....@@ -3029,9 +3513,8 @@
30293513 }
30303514
30313515 /* Remove page from free list */
3032
- list_del(&page->lru);
3033
- zone->free_area[order].nr_free--;
3034
- rmv_page_order(page);
3516
+
3517
+ del_page_from_free_list(page, zone, order);
30353518
30363519 /*
30373520 * Set the pageblock if the isolated page is at least half of a
....@@ -3050,6 +3533,27 @@
30503533
30513534
30523535 return 1UL << order;
3536
+}
3537
+
3538
+/**
3539
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3540
+ * @page: Page that was isolated
3541
+ * @order: Order of the isolated page
3542
+ * @mt: The page's pageblock's migratetype
3543
+ *
3544
+ * This function is meant to return a page pulled from the free lists via
3545
+ * __isolate_free_page back to the free lists they were pulled from.
3546
+ */
3547
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3548
+{
3549
+ struct zone *zone = page_zone(page);
3550
+
3551
+ /* zone lock should be held when this function is called */
3552
+ lockdep_assert_held(&zone->lock);
3553
+
3554
+ /* Return isolated page to tail of freelist. */
3555
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3556
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
30533557 }
30543558
30553559 /*
....@@ -3081,6 +3585,7 @@
30813585
30823586 /* Remove page from the per-cpu list, caller must protect the list */
30833587 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3588
+ unsigned int alloc_flags,
30843589 struct per_cpu_pages *pcp,
30853590 gfp_t gfp_flags)
30863591 {
....@@ -3090,9 +3595,9 @@
30903595 do {
30913596 /* First try to get CMA pages */
30923597 if (migratetype == MIGRATE_MOVABLE &&
3093
- gfp_flags & __GFP_CMA) {
3598
+ alloc_flags & ALLOC_CMA) {
30943599 list = get_populated_pcp_list(zone, 0, pcp,
3095
- get_cma_migrate_type());
3600
+ get_cma_migrate_type(), alloc_flags);
30963601 }
30973602
30983603 if (list == NULL) {
....@@ -3101,7 +3606,7 @@
31013606 * free CMA pages.
31023607 */
31033608 list = get_populated_pcp_list(zone, 0, pcp,
3104
- migratetype);
3609
+ migratetype, alloc_flags);
31053610 if (unlikely(list == NULL) ||
31063611 unlikely(list_empty(list)))
31073612 return NULL;
....@@ -3117,22 +3622,22 @@
31173622
31183623 /* Lock and remove page from the per-cpu list */
31193624 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3120
- struct zone *zone, unsigned int order,
3121
- gfp_t gfp_flags, int migratetype)
3625
+ struct zone *zone, gfp_t gfp_flags,
3626
+ int migratetype, unsigned int alloc_flags)
31223627 {
31233628 struct per_cpu_pages *pcp;
31243629 struct page *page;
31253630 unsigned long flags;
31263631
3127
- local_irq_save(flags);
3632
+ local_lock_irqsave(&pa_lock.l, flags);
31283633 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3129
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3634
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
31303635 gfp_flags);
31313636 if (page) {
3132
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3637
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
31333638 zone_statistics(preferred_zone, zone);
31343639 }
3135
- local_irq_restore(flags);
3640
+ local_unlock_irqrestore(&pa_lock.l, flags);
31363641 return page;
31373642 }
31383643
....@@ -3149,8 +3654,8 @@
31493654 struct page *page;
31503655
31513656 if (likely(order == 0)) {
3152
- page = rmqueue_pcplist(preferred_zone, zone, order,
3153
- gfp_flags, migratetype);
3657
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3658
+ migratetype, alloc_flags);
31543659 goto out;
31553660 }
31563661
....@@ -3159,25 +3664,32 @@
31593664 * allocate greater than order-1 page units with __GFP_NOFAIL.
31603665 */
31613666 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3162
- spin_lock_irqsave(&zone->lock, flags);
3667
+ local_lock_irqsave(&pa_lock.l, flags);
3668
+ spin_lock(&zone->lock);
31633669
31643670 do {
31653671 page = NULL;
3166
-
3167
- if (alloc_flags & ALLOC_HARDER) {
3672
+ /*
3673
+ * order-0 request can reach here when the pcplist is skipped
3674
+ * due to non-CMA allocation context. HIGHATOMIC area is
3675
+ * reserved for high-order atomic allocation, so order-0
3676
+ * request should skip it.
3677
+ */
3678
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
31683679 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
31693680 if (page)
31703681 trace_mm_page_alloc_zone_locked(page, order, migratetype);
31713682 }
3172
-
3173
- if (!page && migratetype == MIGRATE_MOVABLE &&
3174
- gfp_flags & __GFP_CMA)
3175
- page = __rmqueue_cma(zone, order);
3176
-
3177
- if (!page)
3178
- page = __rmqueue(zone, order, migratetype);
3683
+ if (!page) {
3684
+ if (migratetype == MIGRATE_MOVABLE &&
3685
+ alloc_flags & ALLOC_CMA)
3686
+ page = __rmqueue_cma(zone, order, migratetype,
3687
+ alloc_flags);
3688
+ if (!page)
3689
+ page = __rmqueue(zone, order, migratetype,
3690
+ alloc_flags);
3691
+ }
31793692 } while (page && check_new_pages(page, order));
3180
-
31813693 spin_unlock(&zone->lock);
31823694 if (!page)
31833695 goto failed;
....@@ -3186,14 +3698,22 @@
31863698
31873699 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
31883700 zone_statistics(preferred_zone, zone);
3189
- local_irq_restore(flags);
3701
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3702
+ gfp_flags, alloc_flags, migratetype);
3703
+ local_unlock_irqrestore(&pa_lock.l, flags);
31903704
31913705 out:
3706
+ /* Separate test+clear to avoid unnecessary atomics */
3707
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3708
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3709
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3710
+ }
3711
+
31923712 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
31933713 return page;
31943714
31953715 failed:
3196
- local_irq_restore(flags);
3716
+ local_unlock_irqrestore(&pa_lock.l, flags);
31973717 return NULL;
31983718 }
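
The ZONE_BOOSTED_WATERMARK handling in rmqueue() above tests the bit with a plain read and only then clears it, as the comment says, so the common case where the bit is clear never performs an atomic read-modify-write. The small window this leaves (two CPUs both observing the bit set) is harmless, costing at most an extra kswapd wakeup. A runnable model of the same test-then-clear idea using C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ZONE_BOOSTED_WATERMARK 0

static atomic_ulong zone_flags;

/* plain read first; only pay for the atomic clear when the bit is set */
static bool test_and_clear_boosted(void)
{
	if (!(atomic_load_explicit(&zone_flags, memory_order_relaxed) &
	      (1UL << ZONE_BOOSTED_WATERMARK)))
		return false;
	atomic_fetch_and(&zone_flags, ~(1UL << ZONE_BOOSTED_WATERMARK));
	return true;
}

int main(void)
{
	atomic_fetch_or(&zone_flags, 1UL << ZONE_BOOSTED_WATERMARK);
	printf("first check: %d, second check: %d\n",
	       test_and_clear_boosted(), test_and_clear_boosted());
	return 0;
}
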
31993719
....@@ -3218,7 +3738,7 @@
32183738 }
32193739 __setup("fail_page_alloc=", setup_fail_page_alloc);
32203740
3221
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32223742 {
32233743 if (order < fail_page_alloc.min_order)
32243744 return false;
....@@ -3242,24 +3762,14 @@
32423762
32433763 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
32443764 &fail_page_alloc.attr);
3245
- if (IS_ERR(dir))
3246
- return PTR_ERR(dir);
32473765
3248
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3249
- &fail_page_alloc.ignore_gfp_reclaim))
3250
- goto fail;
3251
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3252
- &fail_page_alloc.ignore_gfp_highmem))
3253
- goto fail;
3254
- if (!debugfs_create_u32("min-order", mode, dir,
3255
- &fail_page_alloc.min_order))
3256
- goto fail;
3766
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3767
+ &fail_page_alloc.ignore_gfp_reclaim);
3768
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3769
+ &fail_page_alloc.ignore_gfp_highmem);
3770
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
32573771
32583772 return 0;
3259
-fail:
3260
- debugfs_remove_recursive(dir);
3261
-
3262
- return -ENOMEM;
32633773 }
32643774
32653775 late_initcall(fail_page_alloc_debugfs);
....@@ -3268,12 +3778,41 @@
32683778
32693779 #else /* CONFIG_FAIL_PAGE_ALLOC */
32703780
3271
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3781
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32723782 {
32733783 return false;
32743784 }
32753785
32763786 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3787
+
3788
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3789
+{
3790
+ return __should_fail_alloc_page(gfp_mask, order);
3791
+}
3792
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3793
+
3794
+static inline long __zone_watermark_unusable_free(struct zone *z,
3795
+ unsigned int order, unsigned int alloc_flags)
3796
+{
3797
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3798
+ long unusable_free = (1 << order) - 1;
3799
+
3800
+ /*
3801
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3802
+ * the high-atomic reserves. This will over-estimate the size of the
3803
+ * atomic reserve but it avoids a search.
3804
+ */
3805
+ if (likely(!alloc_harder))
3806
+ unusable_free += z->nr_reserved_highatomic;
3807
+
3808
+#ifdef CONFIG_CMA
3809
+ /* If allocation can't use CMA areas don't use free CMA pages */
3810
+ if (!(alloc_flags & ALLOC_CMA))
3811
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3812
+#endif
3813
+
3814
+ return unusable_free;
3815
+}
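
__zone_watermark_unusable_free() above folds everything that is free but unusable for the request into one number: a fixed (1 << order) - 1 deduction for the request being checked, the high-atomic reserve when the caller may not dip into it, and free CMA pages when the allocation cannot use CMA. A runnable worked example of the resulting order-0 comparison, ignoring the ALLOC_HIGH/OOM reductions to min; every figure is illustrative:

#include <stdio.h>

int main(void)
{
	long free_pages = 5000;		/* NR_FREE_PAGES, illustrative */
	long highatomic_reserve = 512;	/* zone->nr_reserved_highatomic */
	long free_cma = 1024;		/* NR_FREE_CMA_PAGES */
	long lowmem_reserve = 256;	/* z->lowmem_reserve[highest_zoneidx] */
	long mark = 2048;		/* the watermark being checked */
	unsigned int order = 3;
	int alloc_harder = 0, alloc_cma = 0;

	long unusable = (1L << order) - 1;

	if (!alloc_harder)
		unusable += highatomic_reserve;
	if (!alloc_cma)
		unusable += free_cma;

	long usable = free_pages - unusable;

	printf("usable=%ld, needs > %ld -> %s\n",
	       usable, mark + lowmem_reserve,
	       usable > mark + lowmem_reserve ? "order-0 check passes"
					      : "order-0 check fails");
	return 0;
}
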
32773816
32783817 /*
32793818 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3282,7 +3821,7 @@
32823821 * to check in the allocation paths if no pages are free.
32833822 */
32843823 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3285
- int classzone_idx, unsigned int alloc_flags,
3824
+ int highest_zoneidx, unsigned int alloc_flags,
32863825 long free_pages)
32873826 {
32883827 long min = mark;
....@@ -3290,19 +3829,12 @@
32903829 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
32913830
32923831 /* free_pages may go negative - that's OK */
3293
- free_pages -= (1 << order) - 1;
3832
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
32943833
32953834 if (alloc_flags & ALLOC_HIGH)
32963835 min -= min / 2;
32973836
3298
- /*
3299
- * If the caller does not have rights to ALLOC_HARDER then subtract
3300
- * the high-atomic reserves. This will over-estimate the size of the
3301
- * atomic reserve but it avoids a search.
3302
- */
3303
- if (likely(!alloc_harder)) {
3304
- free_pages -= z->nr_reserved_highatomic;
3305
- } else {
3837
+ if (unlikely(alloc_harder)) {
33063838 /*
33073839 * OOM victims can try even harder than normal ALLOC_HARDER
33083840 * users on the grounds that it's definitely going to be in
....@@ -3315,19 +3847,12 @@
33153847 min -= min / 4;
33163848 }
33173849
3318
-
3319
-#ifdef CONFIG_CMA
3320
- /* If allocation can't use CMA areas don't use free CMA pages */
3321
- if (!(alloc_flags & ALLOC_CMA))
3322
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3323
-#endif
3324
-
33253850 /*
33263851 * Check watermarks for an order-0 allocation request. If these
33273852 * are not met, then a high-order request also cannot go ahead
33283853 * even if a suitable page happened to be free.
33293854 */
3330
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3855
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
33313856 return false;
33323857
33333858 /* If this is an order-0 request then the watermark is fine */
....@@ -3351,65 +3876,83 @@
33513876 if (mt == MIGRATE_CMA)
33523877 continue;
33533878 #endif
3354
- if (!list_empty(&area->free_list[mt]))
3879
+ if (!free_area_empty(area, mt))
33553880 return true;
33563881 }
33573882
33583883 #ifdef CONFIG_CMA
33593884 if ((alloc_flags & ALLOC_CMA) &&
3360
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3885
+ !free_area_empty(area, MIGRATE_CMA)) {
33613886 return true;
33623887 }
33633888 #endif
3364
- if (alloc_harder &&
3365
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3889
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
33663890 return true;
33673891 }
33683892 return false;
33693893 }
33703894
33713895 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3372
- int classzone_idx, unsigned int alloc_flags)
3896
+ int highest_zoneidx, unsigned int alloc_flags)
33733897 {
3374
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3898
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
33753899 zone_page_state(z, NR_FREE_PAGES));
33763900 }
3901
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
33773902
33783903 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3379
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3904
+ unsigned long mark, int highest_zoneidx,
3905
+ unsigned int alloc_flags, gfp_t gfp_mask)
33803906 {
3381
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3382
- long cma_pages = 0;
3907
+ long free_pages;
33833908
3384
-#ifdef CONFIG_CMA
3385
- /* If allocation can't use CMA areas don't use free CMA pages */
3386
- if (!(alloc_flags & ALLOC_CMA))
3387
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3388
-#endif
3909
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
33893910
33903911 /*
33913912 * Fast check for order-0 only. If this fails then the reserves
3392
- * need to be calculated. There is a corner case where the check
3393
- * passes but only the high-order atomic reserve are free. If
3394
- * the caller is !atomic then it'll uselessly search the free
3395
- * list. That corner case is then slower but it is harmless.
3913
+ * need to be calculated.
33963914 */
3397
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3398
- return true;
3915
+ if (!order) {
3916
+ long usable_free;
3917
+ long reserved;
33993918
3400
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3401
- free_pages);
3919
+ usable_free = free_pages;
3920
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3921
+
3922
+ /* reserved may over estimate high-atomic reserves. */
3923
+ usable_free -= min(usable_free, reserved);
3924
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3925
+ return true;
3926
+ }
3927
+
3928
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3929
+ free_pages))
3930
+ return true;
3931
+ /*
3932
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3933
+ * when checking the min watermark. The min watermark is the
3934
+ * point where boosting is ignored so that kswapd is woken up
3935
+ * when below the low watermark.
3936
+ */
3937
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3938
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3939
+ mark = z->_watermark[WMARK_MIN];
3940
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3941
+ alloc_flags, free_pages);
3942
+ }
3943
+
3944
+ return false;
34023945 }
34033946
34043947 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3405
- unsigned long mark, int classzone_idx)
3948
+ unsigned long mark, int highest_zoneidx)
34063949 {
34073950 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34083951
34093952 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34103953 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34113954
3412
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3955
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34133956 free_pages);
34143957 }
34153958 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3418,7 +3961,7 @@
34183961 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
34193962 {
34203963 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3421
- RECLAIM_DISTANCE;
3964
+ node_reclaim_distance;
34223965 }
34233966 #else /* CONFIG_NUMA */
34243967 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3428,6 +3971,61 @@
34283971 #endif /* CONFIG_NUMA */
34293972
34303973 /*
3974
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3975
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3976
+ * premature use of a lower zone may cause lowmem pressure problems that
3977
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3978
+ * probably too small. It only makes sense to spread allocations to avoid
3979
+ * fragmentation between the Normal and DMA32 zones.
3980
+ */
3981
+static inline unsigned int
3982
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3983
+{
3984
+ unsigned int alloc_flags;
3985
+
3986
+ /*
3987
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3988
+ * to save a branch.
3989
+ */
3990
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3991
+
3992
+#ifdef CONFIG_ZONE_DMA32
3993
+ if (!zone)
3994
+ return alloc_flags;
3995
+
3996
+ if (zone_idx(zone) != ZONE_NORMAL)
3997
+ return alloc_flags;
3998
+
3999
+ /*
4000
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
4001
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
4002
+ * on UMA that if Normal is populated then so is DMA32.
4003
+ */
4004
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
4005
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
4006
+ return alloc_flags;
4007
+
4008
+ alloc_flags |= ALLOC_NOFRAGMENT;
4009
+#endif /* CONFIG_ZONE_DMA32 */
4010
+ return alloc_flags;
4011
+}
4012
+
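
alloc_flags_nofragment() above leans on zone layout: node_zones[] is indexed by zone_type and ZONE_DMA32 sits directly before ZONE_NORMAL, which is exactly what the BUILD_BUG_ON asserts and what makes --zone from the Normal zone land on DMA32. A tiny runnable model of that pointer arithmetic; the enum and the populated flags are simplified stand-ins:

#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, NR_ZONES };

struct zone_model {
	enum zone_type idx;
	bool populated;
};

int main(void)
{
	/* stands in for pgdat->node_zones[]: indexed by zone_type */
	struct zone_model node_zones[NR_ZONES] = {
		[ZONE_DMA]     = { ZONE_DMA,     true },
		[ZONE_DMA32]   = { ZONE_DMA32,   true },
		[ZONE_NORMAL]  = { ZONE_NORMAL,  true },
		[ZONE_MOVABLE] = { ZONE_MOVABLE, false },
	};
	struct zone_model *zone = &node_zones[ZONE_NORMAL];

	/* the property the BUILD_BUG_ON() in the hunk above depends on */
	_Static_assert(ZONE_NORMAL - ZONE_DMA32 == 1,
		       "DMA32 must directly precede NORMAL");

	--zone;		/* from Normal, one element back is DMA32 */
	printf("zone below Normal: idx=%d populated=%d\n",
	       zone->idx, zone->populated);
	return 0;
}
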
4013
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
4014
+ unsigned int alloc_flags)
4015
+{
4016
+#ifdef CONFIG_CMA
4017
+ unsigned int pflags = current->flags;
4018
+
4019
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
4020
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
4021
+ gfp_mask & __GFP_CMA)
4022
+ alloc_flags |= ALLOC_CMA;
4023
+
4024
+#endif
4025
+ return alloc_flags;
4026
+}
4027
+
4028
+/*
34314029 * get_page_from_freelist goes through the zonelist trying to allocate
34324030 * a page.
34334031 */
....@@ -3435,16 +4033,20 @@
34354033 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
34364034 const struct alloc_context *ac)
34374035 {
3438
- struct zoneref *z = ac->preferred_zoneref;
4036
+ struct zoneref *z;
34394037 struct zone *zone;
34404038 struct pglist_data *last_pgdat_dirty_limit = NULL;
4039
+ bool no_fallback;
34414040
4041
+retry:
34424042 /*
34434043 * Scan zonelist, looking for a zone with enough free.
34444044 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
34454045 */
3446
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3447
- ac->nodemask) {
4046
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
4047
+ z = ac->preferred_zoneref;
4048
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4049
+ ac->nodemask) {
34484050 struct page *page;
34494051 unsigned long mark;
34504052
....@@ -3481,9 +4083,26 @@
34814083 }
34824084 }
34834085
3484
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4086
+ if (no_fallback && nr_online_nodes > 1 &&
4087
+ zone != ac->preferred_zoneref->zone) {
4088
+ int local_nid;
4089
+
4090
+ /*
4091
+ * If moving to a remote node, retry but allow
4092
+ * fragmenting fallbacks. Locality is more important
4093
+ * than fragmentation avoidance.
4094
+ */
4095
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4096
+ if (zone_to_nid(zone) != local_nid) {
4097
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4098
+ goto retry;
4099
+ }
4100
+ }
4101
+
4102
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
34854103 if (!zone_watermark_fast(zone, order, mark,
3486
- ac_classzone_idx(ac), alloc_flags)) {
4104
+ ac->highest_zoneidx, alloc_flags,
4105
+ gfp_mask)) {
34874106 int ret;
34884107
34894108 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3516,7 +4135,7 @@
35164135 default:
35174136 /* did we reclaim enough */
35184137 if (zone_watermark_ok(zone, order, mark,
3519
- ac_classzone_idx(ac), alloc_flags))
4138
+ ac->highest_zoneidx, alloc_flags))
35204139 goto try_this_zone;
35214140
35224141 continue;
....@@ -3548,30 +4167,21 @@
35484167 }
35494168 }
35504169
4170
+ /*
4171
+ * It's possible on a UMA machine to get through all zones that are
4172
+ * fragmented. If avoiding fragmentation, reset and try again.
4173
+ */
4174
+ if (no_fallback) {
4175
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4176
+ goto retry;
4177
+ }
4178
+
35514179 return NULL;
3552
-}
3553
-
3554
-/*
3555
- * Large machines with many possible nodes should not always dump per-node
3556
- * meminfo in irq context.
3557
- */
3558
-static inline bool should_suppress_show_mem(void)
3559
-{
3560
- bool ret = false;
3561
-
3562
-#if NODES_SHIFT > 8
3563
- ret = in_interrupt();
3564
-#endif
3565
- return ret;
35664180 }
35674181
35684182 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
35694183 {
35704184 unsigned int filter = SHOW_MEM_FILTER_NODES;
3571
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3572
-
3573
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3574
- return;
35754185
35764186 /*
35774187 * This documents exceptions given to allocations in certain
....@@ -3592,22 +4202,23 @@
35924202 {
35934203 struct va_format vaf;
35944204 va_list args;
3595
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3596
- DEFAULT_RATELIMIT_BURST);
4205
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
35974206
3598
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4207
+ if ((gfp_mask & __GFP_NOWARN) ||
4208
+ !__ratelimit(&nopage_rs) ||
4209
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
35994210 return;
36004211
36014212 va_start(args, fmt);
36024213 vaf.fmt = fmt;
36034214 vaf.va = &args;
3604
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4215
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36054216 current->comm, &vaf, gfp_mask, &gfp_mask,
36064217 nodemask_pr_args(nodemask));
36074218 va_end(args);
36084219
36094220 cpuset_print_current_mems_allowed();
3610
-
4221
+ pr_cont("\n");
36114222 dump_stack();
36124223 warn_alloc_show_mem(gfp_mask, nodemask);
36134224 }
....@@ -3681,11 +4292,13 @@
36814292 * success so it is time to admit defeat. We will skip the OOM killer
36824293 * because it is very likely that the caller has a more reasonable
36834294 * fallback than shooting a random task.
4295
+ *
4296
+ * The OOM killer may not free memory on a specific node.
36844297 */
3685
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4298
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
36864299 goto out;
36874300 /* The OOM killer does not needlessly kill tasks for lowmem */
3688
- if (ac->high_zoneidx < ZONE_NORMAL)
4301
+ if (ac->highest_zoneidx < ZONE_NORMAL)
36894302 goto out;
36904303 if (pm_suspended_storage())
36914304 goto out;
....@@ -3698,10 +4311,6 @@
36984311 * out_of_memory). Once filesystems are ready to handle allocation
36994312 * failures more gracefully we should just bail out here.
37004313 */
3701
-
3702
- /* The OOM killer may not free memory on a specific node */
3703
- if (gfp_mask & __GFP_THISNODE)
3704
- goto out;
37054314
37064315 /* Exhausted what can be done so it's blame time */
37074316 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3733,7 +4342,7 @@
37334342 unsigned int alloc_flags, const struct alloc_context *ac,
37344343 enum compact_priority prio, enum compact_result *compact_result)
37354344 {
3736
- struct page *page;
4345
+ struct page *page = NULL;
37374346 unsigned long pflags;
37384347 unsigned int noreclaim_flag;
37394348
....@@ -3744,13 +4353,10 @@
37444353 noreclaim_flag = memalloc_noreclaim_save();
37454354
37464355 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3747
- prio);
4356
+ prio, &page);
37484357
37494358 memalloc_noreclaim_restore(noreclaim_flag);
37504359 psi_memstall_leave(&pflags);
3751
-
3752
- if (*compact_result <= COMPACT_INACTIVE)
3753
- return NULL;
37544360
37554361 /*
37564362 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3758,7 +4364,13 @@
37584364 */
37594365 count_vm_event(COMPACTSTALL);
37604366
3761
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4367
+ /* Prep a captured page if available */
4368
+ if (page)
4369
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4370
+
4371
+ /* Try get a page from the freelist if available */
4372
+ if (!page)
4373
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
37624374
37634375 if (page) {
37644376 struct zone *zone = page_zone(page);
....@@ -3807,14 +4419,22 @@
38074419 goto check_priority;
38084420
38094421 /*
3810
- * make sure the compaction wasn't deferred or didn't bail out early
3811
- * due to locks contention before we declare that we should give up.
3812
- * But do not retry if the given zonelist is not suitable for
3813
- * compaction.
4422
+ * compaction was skipped because there are not enough order-0 pages
4423
+ * to work with, so we retry only if it looks like reclaim can help.
38144424 */
3815
- if (compaction_withdrawn(compact_result)) {
4425
+ if (compaction_needs_reclaim(compact_result)) {
38164426 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
38174427 goto out;
4428
+ }
4429
+
4430
+ /*
4431
+ * make sure the compaction wasn't deferred or didn't bail out early
4432
+ * due to locks contention before we declare that we should give up.
4433
+ * But the next retry should use a higher priority if allowed, so
4434
+ * we don't just keep bailing out endlessly.
4435
+ */
4436
+ if (compaction_withdrawn(compact_result)) {
4437
+ goto check_priority;
38184438 }
38194439
38204440 /*
....@@ -3877,10 +4497,10 @@
38774497 * Let's give them a good hope and keep retrying while the order-0
38784498 * watermarks are OK.
38794499 */
3880
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3881
- ac->nodemask) {
4500
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4501
+ ac->highest_zoneidx, ac->nodemask) {
38824502 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3883
- ac_classzone_idx(ac), alloc_flags))
4503
+ ac->highest_zoneidx, alloc_flags))
38844504 return true;
38854505 }
38864506 return false;
....@@ -3938,33 +4558,50 @@
39384558 EXPORT_SYMBOL_GPL(fs_reclaim_release);
39394559 #endif
39404560
4561
+/*
4562
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4563
+ * have been rebuilt so allocation retries. Reader side does not lock and
4564
+ * retries the allocation if zonelist changes. Writer side is protected by the
4565
+ * embedded spin_lock.
4566
+ */
4567
+static DEFINE_SEQLOCK(zonelist_update_seq);
4568
+
4569
+static unsigned int zonelist_iter_begin(void)
4570
+{
4571
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4572
+ return read_seqbegin(&zonelist_update_seq);
4573
+
4574
+ return 0;
4575
+}
4576
+
4577
+static unsigned int check_retry_zonelist(unsigned int seq)
4578
+{
4579
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4580
+ return read_seqretry(&zonelist_update_seq, seq);
4581
+
4582
+ return seq;
4583
+}
4584
+
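/*
 * Userspace sketch of the seqlock pattern behind zonelist_update_seq above:
 * the writer bumps a sequence counter around the update, readers snapshot it
 * before walking the zonelists and retry if it moved (or was odd, meaning a
 * write was in flight). The kernel's seqlock.h adds memory barriers and
 * lockdep support; this model only shows the retry shape, and all names here
 * are illustrative.
 */
#include <stdio.h>

static unsigned int zonelist_seq;       /* even = stable, odd = update in progress */
static int zonelists[4] = { 0, 1, 2, 3 };

static unsigned int sketch_read_begin(void)
{
        return zonelist_seq;
}

static int sketch_read_retry(unsigned int snap)
{
        /* Retry if a writer was active or finished an update meanwhile. */
        return (snap & 1) || zonelist_seq != snap;
}

static void rebuild_zonelists(void)
{
        zonelist_seq++;                 /* odd: update in progress */
        for (int i = 0; i < 4; i++)
                zonelists[i] += 10;
        zonelist_seq++;                 /* even again: update published */
}

int main(void)
{
        int sum;
        unsigned int snap;

        do {
                snap = sketch_read_begin();
                sum = 0;
                for (int i = 0; i < 4; i++)
                        sum += zonelists[i];
                if (snap == 0)          /* simulate one concurrent rebuild */
                        rebuild_zonelists();
        } while (sketch_read_retry(snap));

        printf("consistent zonelist sum: %d\n", sum);
        return 0;
}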
39414585 /* Perform direct synchronous page reclaim */
3942
-static int
4586
+static unsigned long
39434587 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
39444588 const struct alloc_context *ac)
39454589 {
3946
- struct reclaim_state reclaim_state;
3947
- int progress;
39484590 unsigned int noreclaim_flag;
3949
- unsigned long pflags;
4591
+ unsigned long progress;
39504592
39514593 cond_resched();
39524594
39534595 /* We now go into synchronous reclaim */
39544596 cpuset_memory_pressure_bump();
3955
- psi_memstall_enter(&pflags);
39564597 fs_reclaim_acquire(gfp_mask);
39574598 noreclaim_flag = memalloc_noreclaim_save();
3958
- reclaim_state.reclaimed_slab = 0;
3959
- current->reclaim_state = &reclaim_state;
39604599
39614600 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
39624601 ac->nodemask);
39634602
3964
- current->reclaim_state = NULL;
39654603 memalloc_noreclaim_restore(noreclaim_flag);
39664604 fs_reclaim_release(gfp_mask);
3967
- psi_memstall_leave(&pflags);
39684605
39694606 cond_resched();
39704607
....@@ -3978,11 +4615,14 @@
39784615 unsigned long *did_some_progress)
39794616 {
39804617 struct page *page = NULL;
4618
+ unsigned long pflags;
39814619 bool drained = false;
4620
+ bool skip_pcp_drain = false;
39824621
4622
+ psi_memstall_enter(&pflags);
39834623 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
39844624 if (unlikely(!(*did_some_progress)))
3985
- return NULL;
4625
+ goto out;
39864626
39874627 retry:
39884628 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -3990,14 +4630,19 @@
39904630 /*
39914631 * If an allocation failed after direct reclaim, it could be because
39924632 * pages are pinned on the per-cpu lists or in high alloc reserves.
3993
- * Shrink them them and try again
4633
+ * Shrink them and try again
39944634 */
39954635 if (!page && !drained) {
39964636 unreserve_highatomic_pageblock(ac, false);
3997
- drain_all_pages(NULL);
4637
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4638
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4639
+ if (!skip_pcp_drain)
4640
+ drain_all_pages(NULL);
39984641 drained = true;
39994642 goto retry;
40004643 }
4644
+out:
4645
+ psi_memstall_leave(&pflags);
40014646
40024647 return page;
40034648 }
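/*
 * Userspace sketch of the "reclaim, retry, drain once, retry again" flow in
 * __alloc_pages_direct_reclaim() above. The fake free pools and the
 * drain_percpu_stash() helper are illustrative only; the point is that the
 * drain of per-cpu lists happens at most once per reclaim attempt.
 */
#include <stdbool.h>
#include <stdio.h>

static int global_free_pages;   /* pages on the shared free lists */
static int percpu_stash;        /* pages parked on per-cpu lists */

static bool try_alloc(int nr)
{
        if (global_free_pages >= nr) {
                global_free_pages -= nr;
                return true;
        }
        return false;
}

static void drain_percpu_stash(void)
{
        /* Give pages pinned on per-cpu lists back to the shared pool. */
        global_free_pages += percpu_stash;
        percpu_stash = 0;
}

static bool alloc_after_reclaim(int nr, int reclaimed)
{
        bool drained = false;

        global_free_pages += reclaimed;         /* result of direct reclaim */
retry:
        if (try_alloc(nr))
                return true;
        if (!drained) {
                drain_percpu_stash();           /* pages may be hiding per-cpu */
                drained = true;
                goto retry;
        }
        return false;                           /* genuinely out of memory */
}

int main(void)
{
        global_free_pages = 1;
        percpu_stash = 4;

        /* Needs 4 pages; reclaim frees 1, the one-shot drain supplies the rest. */
        printf("allocation %s\n",
               alloc_after_reclaim(4, 1) ? "succeeded after drain" : "failed");
        return 0;
}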
....@@ -4008,12 +4653,12 @@
40084653 struct zoneref *z;
40094654 struct zone *zone;
40104655 pg_data_t *last_pgdat = NULL;
4011
- enum zone_type high_zoneidx = ac->high_zoneidx;
4656
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40124657
4013
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4658
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40144659 ac->nodemask) {
40154660 if (last_pgdat != zone->zone_pgdat)
4016
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4661
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
40174662 last_pgdat = zone->zone_pgdat;
40184663 }
40194664 }
....@@ -4023,8 +4668,13 @@
40234668 {
40244669 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
40254670
4026
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4671
+ /*
4672
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4673
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4674
+ * to save two branches.
4675
+ */
40274676 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4677
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
40284678
40294679 /*
40304680 * The caller may dip into page reserves a bit more if the caller
....@@ -4032,7 +4682,8 @@
40324682 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
40334683 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
40344684 */
4035
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4685
+ alloc_flags |= (__force int)
4686
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
40364687
40374688 if (gfp_mask & __GFP_ATOMIC) {
40384689 /*
....@@ -4049,10 +4700,8 @@
40494700 } else if (unlikely(rt_task(current)) && !in_interrupt())
40504701 alloc_flags |= ALLOC_HARDER;
40514702
4052
-#ifdef CONFIG_CMA
4053
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4054
- alloc_flags |= ALLOC_CMA;
4055
-#endif
4703
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4704
+
40564705 return alloc_flags;
40574706 }
40584707
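/*
 * Sketch of the branch-saving trick described in the gfp_to_alloc_flags()
 * hunk above: when a gfp bit is defined with the same value as the matching
 * ALLOC_* bit, the translation is a single mask-and-OR, and a compile-time
 * assert guards the assumption. The SK_* bit values below are made up purely
 * for illustration.
 */
#include <assert.h>
#include <stdio.h>

#define SK_GFP_HIGH             0x20u
#define SK_GFP_KSWAPD_RECLAIM   0x400u

#define SK_ALLOC_WMARK_MIN      0x0u
#define SK_ALLOC_HIGH           0x20u   /* deliberately equal to SK_GFP_HIGH */
#define SK_ALLOC_KSWAPD         0x400u  /* deliberately equal to SK_GFP_KSWAPD_RECLAIM */

/* Poor man's BUILD_BUG_ON(): refuses to compile if the bit values diverge. */
static_assert(SK_GFP_HIGH == SK_ALLOC_HIGH, "gfp/alloc HIGH bits must match");
static_assert(SK_GFP_KSWAPD_RECLAIM == SK_ALLOC_KSWAPD,
              "gfp/alloc KSWAPD bits must match");

static unsigned int sk_gfp_to_alloc_flags(unsigned int gfp_mask)
{
        unsigned int alloc_flags = SK_ALLOC_WMARK_MIN;

        /* No if/else needed: the bits carry over verbatim. */
        alloc_flags |= gfp_mask & (SK_GFP_HIGH | SK_GFP_KSWAPD_RECLAIM);
        return alloc_flags;
}

int main(void)
{
        unsigned int flags = sk_gfp_to_alloc_flags(SK_GFP_HIGH);

        printf("ALLOC_HIGH set: %s, ALLOC_KSWAPD set: %s\n",
               (flags & SK_ALLOC_HIGH) ? "yes" : "no",
               (flags & SK_ALLOC_KSWAPD) ? "yes" : "no");
        return 0;
}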
....@@ -4115,6 +4764,7 @@
41154764 {
41164765 struct zone *zone;
41174766 struct zoneref *z;
4767
+ bool ret = false;
41184768
41194769 /*
41204770 * Costly allocations might have made a progress but this doesn't mean
....@@ -4141,8 +4791,8 @@
41414791 * request even if all reclaimable pages are considered then we are
41424792 * screwed and have to go OOM.
41434793 */
4144
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4145
- ac->nodemask) {
4794
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4795
+ ac->highest_zoneidx, ac->nodemask) {
41464796 unsigned long available;
41474797 unsigned long reclaimable;
41484798 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4156,7 +4806,7 @@
41564806 * reclaimable pages?
41574807 */
41584808 wmark = __zone_watermark_ok(zone, order, min_wmark,
4159
- ac_classzone_idx(ac), alloc_flags, available);
4809
+ ac->highest_zoneidx, alloc_flags, available);
41604810 trace_reclaim_retry_zone(z, order, reclaimable,
41614811 available, min_wmark, *no_progress_loops, wmark);
41624812 if (wmark) {
....@@ -4178,25 +4828,24 @@
41784828 }
41794829 }
41804830
4181
- /*
4182
- * Memory allocation/reclaim might be called from a WQ
4183
- * context and the current implementation of the WQ
4184
- * concurrency control doesn't recognize that
4185
- * a particular WQ is congested if the worker thread is
4186
- * looping without ever sleeping. Therefore we have to
4187
- * do a short sleep here rather than calling
4188
- * cond_resched().
4189
- */
4190
- if (current->flags & PF_WQ_WORKER)
4191
- schedule_timeout_uninterruptible(1);
4192
- else
4193
- cond_resched();
4194
-
4195
- return true;
4831
+ ret = true;
4832
+ goto out;
41964833 }
41974834 }
41984835
4199
- return false;
4836
+out:
4837
+ /*
4838
+ * Memory allocation/reclaim might be called from a WQ context and the
4839
+ * current implementation of the WQ concurrency control doesn't
4840
+ * recognize that a particular WQ is congested if the worker thread is
4841
+ * looping without ever sleeping. Therefore we have to do a short sleep
4842
+ * here rather than calling cond_resched().
4843
+ */
4844
+ if (current->flags & PF_WQ_WORKER)
4845
+ schedule_timeout_uninterruptible(1);
4846
+ else
4847
+ cond_resched();
4848
+ return ret;
42004849 }
42014850
42024851 static inline bool
....@@ -4246,8 +4895,11 @@
42464895 int compaction_retries;
42474896 int no_progress_loops;
42484897 unsigned int cpuset_mems_cookie;
4898
+ unsigned int zonelist_iter_cookie;
42494899 int reserve_flags;
4900
+ unsigned long vh_record;
42504901
4902
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
42514903 /*
42524904 * We also sanity check to catch abuse of atomic reserves being used by
42534905 * callers that are not in atomic context.
....@@ -4256,11 +4908,12 @@
42564908 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
42574909 gfp_mask &= ~__GFP_ATOMIC;
42584910
4259
-retry_cpuset:
4911
+restart:
42604912 compaction_retries = 0;
42614913 no_progress_loops = 0;
42624914 compact_priority = DEF_COMPACT_PRIORITY;
42634915 cpuset_mems_cookie = read_mems_allowed_begin();
4916
+ zonelist_iter_cookie = zonelist_iter_begin();
42644917
42654918 /*
42664919 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4276,11 +4929,11 @@
42764929 * could end up iterating over non-eligible zones endlessly.
42774930 */
42784931 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4279
- ac->high_zoneidx, ac->nodemask);
4932
+ ac->highest_zoneidx, ac->nodemask);
42804933 if (!ac->preferred_zoneref->zone)
42814934 goto nopage;
42824935
4283
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4936
+ if (alloc_flags & ALLOC_KSWAPD)
42844937 wake_all_kswapds(order, gfp_mask, ac);
42854938
42864939 /*
....@@ -4313,18 +4966,28 @@
43134966
43144967 /*
43154968 * Checks for costly allocations with __GFP_NORETRY, which
4316
- * includes THP page fault allocations
4969
+ * includes some THP page fault allocations
43174970 */
43184971 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
43194972 /*
4320
- * If compaction is deferred for high-order allocations,
4321
- * it is because sync compaction recently failed. If
4322
- * this is the case and the caller requested a THP
4323
- * allocation, we do not want to heavily disrupt the
4324
- * system, so we fail the allocation instead of entering
4325
- * direct reclaim.
4973
+ * If allocating entire pageblock(s) and compaction
4974
+ * failed because all zones are below low watermarks
4975
+ * or is prohibited because it recently failed at this
4976
+ * order, fail immediately unless the allocator has
4977
+ * requested compaction and reclaim retry.
4978
+ *
4979
+ * Reclaim is
4980
+ * - potentially very expensive because zones are far
4981
+ * below their low watermarks or this is part of very
4982
+ * bursty high order allocations,
4983
+ * - not guaranteed to help because isolate_freepages()
4984
+ * may not iterate over freed pages as part of its
4985
+ * linear scan, and
4986
+ * - unlikely to make entire pageblocks free on its
4987
+ * own.
43264988 */
4327
- if (compact_result == COMPACT_DEFERRED)
4989
+ if (compact_result == COMPACT_SKIPPED ||
4990
+ compact_result == COMPACT_DEFERRED)
43284991 goto nopage;
43294992
43304993 /*
....@@ -4338,12 +5001,12 @@
43385001
43395002 retry:
43405003 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4341
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
5004
+ if (alloc_flags & ALLOC_KSWAPD)
43425005 wake_all_kswapds(order, gfp_mask, ac);
43435006
43445007 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
43455008 if (reserve_flags)
4346
- alloc_flags = reserve_flags;
5009
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
43475010
43485011 /*
43495012 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4353,7 +5016,7 @@
43535016 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
43545017 ac->nodemask = NULL;
43555018 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4356
- ac->high_zoneidx, ac->nodemask);
5019
+ ac->highest_zoneidx, ac->nodemask);
43575020 }
43585021
43595022 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4368,6 +5031,12 @@
43685031 /* Avoid recursion of direct reclaim */
43695032 if (current->flags & PF_MEMALLOC)
43705033 goto nopage;
5034
+
5035
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
5036
+ alloc_flags, ac->migratetype, &page);
5037
+
5038
+ if (page)
5039
+ goto got_pg;
43715040
43725041 /* Try direct reclaim and then allocating */
43735042 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4409,9 +5078,13 @@
44095078 goto retry;
44105079
44115080
4412
- /* Deal with possible cpuset update races before we start OOM killing */
4413
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4414
- goto retry_cpuset;
5081
+ /*
5082
+ * Deal with possible cpuset update races or zonelist updates to avoid
5083
+ * an unnecessary OOM kill.
5084
+ */
5085
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5086
+ check_retry_zonelist(zonelist_iter_cookie))
5087
+ goto restart;
44155088
44165089 /* Reclaim has failed us, start killing things */
44175090 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4420,7 +5093,7 @@
44205093
44215094 /* Avoid allocations with no watermarks from looping endlessly */
44225095 if (tsk_is_oom_victim(current) &&
4423
- (alloc_flags == ALLOC_OOM ||
5096
+ (alloc_flags & ALLOC_OOM ||
44245097 (gfp_mask & __GFP_NOMEMALLOC)))
44255098 goto nopage;
44265099
....@@ -4431,9 +5104,13 @@
44315104 }
44325105
44335106 nopage:
4434
- /* Deal with possible cpuset update races before we fail */
4435
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4436
- goto retry_cpuset;
5107
+ /*
5108
+ * Deal with possible cpuset update races or zonelist updates to avoid
5109
+ * an unnecessary OOM kill.
5110
+ */
5111
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5112
+ check_retry_zonelist(zonelist_iter_cookie))
5113
+ goto restart;
44375114
44385115 /*
44395116 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4476,9 +5153,15 @@
44765153 goto retry;
44775154 }
44785155 fail:
5156
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5157
+ alloc_flags, ac->migratetype, &page);
5158
+ if (page)
5159
+ goto got_pg;
5160
+
44795161 warn_alloc(gfp_mask, ac->nodemask,
44805162 "page allocation failure: order:%u", order);
44815163 got_pg:
5164
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
44825165 return page;
44835166 }
44845167
....@@ -4487,14 +5170,18 @@
44875170 struct alloc_context *ac, gfp_t *alloc_mask,
44885171 unsigned int *alloc_flags)
44895172 {
4490
- ac->high_zoneidx = gfp_zone(gfp_mask);
5173
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
44915174 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
44925175 ac->nodemask = nodemask;
4493
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5176
+ ac->migratetype = gfp_migratetype(gfp_mask);
44945177
44955178 if (cpusets_enabled()) {
44965179 *alloc_mask |= __GFP_HARDWALL;
4497
- if (!ac->nodemask)
5180
+ /*
5181
+ * When we are in the interrupt context, it is irrelevant
5182
+ * to the current task context. It means that any node ok.
5183
+ */
5184
+ if (!in_interrupt() && !ac->nodemask)
44985185 ac->nodemask = &cpuset_current_mems_allowed;
44995186 else
45005187 *alloc_flags |= ALLOC_CPUSET;
....@@ -4508,15 +5195,8 @@
45085195 if (should_fail_alloc_page(gfp_mask, order))
45095196 return false;
45105197
4511
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4512
- *alloc_flags |= ALLOC_CMA;
5198
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45135199
4514
- return true;
4515
-}
4516
-
4517
-/* Determine whether to spread dirty pages and what the first usable zone */
4518
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4519
-{
45205200 /* Dirty zone balancing only done in the fast path */
45215201 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
45225202
....@@ -4526,7 +5206,9 @@
45265206 * may get reset for allocations that ignore memory policies.
45275207 */
45285208 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4529
- ac->high_zoneidx, ac->nodemask);
5209
+ ac->highest_zoneidx, ac->nodemask);
5210
+
5211
+ return true;
45305212 }
45315213
45325214 /*
....@@ -4555,7 +5237,11 @@
45555237 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
45565238 return NULL;
45575239
4558
- finalise_ac(gfp_mask, &ac);
5240
+ /*
5241
+ * Forbid the first pass from falling back to types that fragment
5242
+ * memory until all local zones are considered.
5243
+ */
5244
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
45595245
45605246 /* First allocation attempt */
45615247 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4575,14 +5261,13 @@
45755261 * Restore the original nodemask if it was potentially replaced with
45765262 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
45775263 */
4578
- if (unlikely(ac.nodemask != nodemask))
4579
- ac.nodemask = nodemask;
5264
+ ac.nodemask = nodemask;
45805265
45815266 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
45825267
45835268 out:
45845269 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4585
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5270
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
45865271 __free_pages(page, order);
45875272 page = NULL;
45885273 }
....@@ -4620,13 +5305,17 @@
46205305 if (order == 0) /* Via pcp? */
46215306 free_unref_page(page);
46225307 else
4623
- __free_pages_ok(page, order);
5308
+ __free_pages_ok(page, order, FPI_NONE);
46245309 }
46255310
46265311 void __free_pages(struct page *page, unsigned int order)
46275312 {
5313
+ trace_android_vh_free_pages(page, order);
46285314 if (put_page_testzero(page))
46295315 free_the_page(page, order);
5316
+ else if (!PageHead(page))
5317
+ while (order-- > 0)
5318
+ free_the_page(page + (1 << order), order);
46305319 }
46315320 EXPORT_SYMBOL(__free_pages);
46325321
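/*
 * Worked sketch of the tail-freeing loop added to __free_pages() above for
 * non-compound, higher-order pages whose head still holds a reference:
 *     while (order-- > 0)
 *             free_the_page(page + (1 << order), order);
 * For an order-3 block this frees the sub-blocks at offsets 4 (order 2),
 * 2 (order 1) and 1 (order 0), i.e. every page except the still-referenced
 * head page 0. The program below just enumerates those offsets.
 */
#include <stdio.h>

static void sketch_free_block(unsigned int offset, unsigned int order)
{
        printf("free sub-block: pages [%u, %u) as order-%u\n",
               offset, offset + (1u << order), order);
}

int main(void)
{
        unsigned int order = 3;         /* an 8-page block whose head is still pinned */

        while (order-- > 0)
                sketch_free_block(1u << order, order);
        /*
         * The output covers pages 1..7; page 0 stays allocated until the last
         * reference to it is dropped.
         */
        return 0;
}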
....@@ -4731,6 +5420,18 @@
47315420 /* reset page count bias and offset to start of new frag */
47325421 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
47335422 offset = size - fragsz;
5423
+ if (unlikely(offset < 0)) {
5424
+ /*
5425
+ * The caller is trying to allocate a fragment
5426
+ * with fragsz > PAGE_SIZE but the cache isn't big
5427
+ * enough to satisfy the request; this may
5428
+ * happen in low memory conditions.
5429
+ * We don't release the cache page because
5430
+ * it could make memory pressure worse
5431
+ * so we simply return NULL here.
5432
+ */
5433
+ return NULL;
5434
+ }
47345435 }
47355436
47365437 nc->pagecnt_bias--;
....@@ -4771,7 +5472,7 @@
47715472 /**
47725473 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
47735474 * @size: the number of bytes to allocate
4774
- * @gfp_mask: GFP flags for the allocation
5475
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
47755476 *
47765477 * This function is similar to alloc_pages(), except that it allocates the
47775478 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4780,11 +5481,16 @@
47805481 * This function is also limited by MAX_ORDER.
47815482 *
47825483 * Memory allocated by this function must be released by free_pages_exact().
5484
+ *
5485
+ * Return: pointer to the allocated area or %NULL in case of error.
47835486 */
47845487 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
47855488 {
47865489 unsigned int order = get_order(size);
47875490 unsigned long addr;
5491
+
5492
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5493
+ gfp_mask &= ~__GFP_COMP;
47885494
47895495 addr = __get_free_pages(gfp_mask, order);
47905496 return make_alloc_exact(addr, order, size);
....@@ -4796,15 +5502,22 @@
47965502 * pages on a node.
47975503 * @nid: the preferred node ID where memory should be allocated
47985504 * @size: the number of bytes to allocate
4799
- * @gfp_mask: GFP flags for the allocation
5505
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48005506 *
48015507 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48025508 * back.
5509
+ *
5510
+ * Return: pointer to the allocated area or %NULL in case of error.
48035511 */
48045512 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48055513 {
48065514 unsigned int order = get_order(size);
4807
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5515
+ struct page *p;
5516
+
5517
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5518
+ gfp_mask &= ~__GFP_COMP;
5519
+
5520
+ p = alloc_pages_node(nid, gfp_mask, order);
48085521 if (!p)
48095522 return NULL;
48105523 return make_alloc_exact((unsigned long)page_address(p), order, size);
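/*
 * Sketch of the size -> order -> trim arithmetic behind alloc_pages_exact()
 * above: the request is rounded up to a power-of-two block (get_order()),
 * and the pages past the requested size are handed back individually. The
 * helpers here are simplified stand-ins, and a 4 KiB page size is assumed.
 */
#include <stdio.h>

#define SK_PAGE_SHIFT 12u
#define SK_PAGE_SIZE  (1ul << SK_PAGE_SHIFT)

/* Smallest order such that (1 << order) pages cover 'size' bytes. */
static unsigned int sk_get_order(unsigned long size)
{
        unsigned int order = 0;
        unsigned long pages = (size + SK_PAGE_SIZE - 1) >> SK_PAGE_SHIFT;

        while ((1ul << order) < pages)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 5 * SK_PAGE_SIZE;  /* 20 KiB requested */
        unsigned int order = sk_get_order(size);
        unsigned long block_pages = 1ul << order;
        unsigned long used_pages = (size + SK_PAGE_SIZE - 1) / SK_PAGE_SIZE;

        printf("order %u block = %lu pages, %lu used, %lu freed back\n",
               order, block_pages, used_pages, block_pages - used_pages);
        /*
         * This trimming is also why the hunk above rejects __GFP_COMP:
         * a compound page cannot have its tail pages freed individually.
         */
        return 0;
}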
....@@ -4833,11 +5546,13 @@
48335546 * nr_free_zone_pages - count number of pages beyond high watermark
48345547 * @offset: The zone index of the highest zone
48355548 *
4836
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5549
+ * nr_free_zone_pages() counts the number of pages which are beyond the
48375550 * high watermark within all zones at or below a given zone index. For each
48385551 * zone, the number of pages is calculated as:
48395552 *
48405553 * nr_free_zone_pages = managed_pages - high_pages
5554
+ *
5555
+ * Return: number of pages beyond high watermark.
48415556 */
48425557 static unsigned long nr_free_zone_pages(int offset)
48435558 {
....@@ -4850,7 +5565,7 @@
48505565 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
48515566
48525567 for_each_zone_zonelist(zone, z, zonelist, offset) {
4853
- unsigned long size = zone->managed_pages;
5568
+ unsigned long size = zone_managed_pages(zone);
48545569 unsigned long high = high_wmark_pages(zone);
48555570 if (size > high)
48565571 sum += size - high;
....@@ -4864,23 +5579,15 @@
48645579 *
48655580 * nr_free_buffer_pages() counts the number of pages which are beyond the high
48665581 * watermark within ZONE_DMA and ZONE_NORMAL.
5582
+ *
5583
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5584
+ * ZONE_NORMAL.
48675585 */
48685586 unsigned long nr_free_buffer_pages(void)
48695587 {
48705588 return nr_free_zone_pages(gfp_zone(GFP_USER));
48715589 }
48725590 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4873
-
4874
-/**
4875
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4876
- *
4877
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4878
- * high watermark within all zones.
4879
- */
4880
-unsigned long nr_free_pagecache_pages(void)
4881
-{
4882
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4883
-}
48845591
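/*
 * Sketch of the accounting in nr_free_zone_pages() above: for every zone at
 * or below the given index, count managed_pages - high_wmark (clamped at
 * zero) and sum the results. The zone sizes below are invented for
 * illustration only.
 */
#include <stdio.h>

struct sk_zone {
        const char *name;
        unsigned long managed_pages;
        unsigned long high_wmark;
};

static unsigned long sk_nr_free_zone_pages(const struct sk_zone *zones,
                                           int nr_zones, int offset)
{
        unsigned long sum = 0;

        for (int i = 0; i <= offset && i < nr_zones; i++) {
                /* Only the part above the high watermark counts as free room. */
                if (zones[i].managed_pages > zones[i].high_wmark)
                        sum += zones[i].managed_pages - zones[i].high_wmark;
        }
        return sum;
}

int main(void)
{
        struct sk_zone zones[] = {
                { "DMA",     4000,    500 },
                { "Normal",  200000,  4000 },
                { "Movable", 50000,   1000 },
        };

        /* Stop at index 1 ("Normal"), the way nr_free_buffer_pages() stops at
         * gfp_zone(GFP_USER): 3500 + 196000 = 199500 pages. */
        printf("pages beyond high watermark: %lu\n",
               sk_nr_free_zone_pages(zones, 3, 1));
        return 0;
}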
48855592 static inline void show_node(struct zone *zone)
48865593 {
....@@ -4902,7 +5609,7 @@
49025609 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49035610
49045611 for_each_zone(zone)
4905
- wmark_low += zone->watermark[WMARK_LOW];
5612
+ wmark_low += low_wmark_pages(zone);
49065613
49075614 /*
49085615 * Estimate the amount of memory available for userspace allocations,
....@@ -4924,8 +5631,8 @@
49245631 * items that are in use, and cannot be freed. Cap this estimate at the
49255632 * low watermark.
49265633 */
4927
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
4928
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5634
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5635
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
49295636 available += reclaimable - min(reclaimable / 2, wmark_low);
49305637
49315638 if (available < 0)
....@@ -4936,11 +5643,11 @@
49365643
49375644 void si_meminfo(struct sysinfo *val)
49385645 {
4939
- val->totalram = totalram_pages;
5646
+ val->totalram = totalram_pages();
49405647 val->sharedram = global_node_page_state(NR_SHMEM);
49415648 val->freeram = global_zone_page_state(NR_FREE_PAGES);
49425649 val->bufferram = nr_blockdev_pages();
4943
- val->totalhigh = totalhigh_pages;
5650
+ val->totalhigh = totalhigh_pages();
49445651 val->freehigh = nr_free_highpages();
49455652 val->mem_unit = PAGE_SIZE;
49465653 }
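/*
 * Sketch of the heuristic visible in the si_mem_available() hunks above:
 * start from the free pages, assume roughly half of the page cache and half
 * of the reclaimable kernel memory (each capped by the summed low watermarks)
 * can be reclaimed, and clamp the result at zero. The numbers below are
 * invented, and the real function also subtracts the total reserved pages.
 */
#include <stdio.h>

static long sk_min(long a, long b)
{
        return a < b ? a : b;
}

static long sk_mem_available(long free, long pagecache, long reclaimable,
                             long wmark_low)
{
        long available = free;

        /* Page cache is mostly reclaimable, but keep about wmark_low of it. */
        available += pagecache - sk_min(pagecache / 2, wmark_low);

        /* Same reasoning for reclaimable slab and misc kernel memory. */
        available += reclaimable - sk_min(reclaimable / 2, wmark_low);

        if (available < 0)
                available = 0;
        return available;
}

int main(void)
{
        /* All values in pages, chosen only to exercise the arithmetic. */
        long avail = sk_mem_available(10000, 50000, 8000, 3000);

        /* 10000 + (50000 - 3000) + (8000 - 3000) = 62000 */
        printf("estimated available pages: %ld\n", avail);
        return 0;
}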
....@@ -4957,7 +5664,7 @@
49575664 pg_data_t *pgdat = NODE_DATA(nid);
49585665
49595666 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4960
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5667
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
49615668 val->totalram = managed_pages;
49625669 val->sharedram = node_page_state(pgdat, NR_SHMEM);
49635670 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -4966,7 +5673,7 @@
49665673 struct zone *zone = &pgdat->node_zones[zone_type];
49675674
49685675 if (is_highmem(zone)) {
4969
- managed_highpages += zone->managed_pages;
5676
+ managed_highpages += zone_managed_pages(zone);
49705677 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
49715678 }
49725679 }
....@@ -5055,7 +5762,7 @@
50555762
50565763 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
50575764 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5058
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5765
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
50595766 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
50605767 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
50615768 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5068,9 +5775,8 @@
50685775 global_node_page_state(NR_UNEVICTABLE),
50695776 global_node_page_state(NR_FILE_DIRTY),
50705777 global_node_page_state(NR_WRITEBACK),
5071
- global_node_page_state(NR_UNSTABLE_NFS),
5072
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5073
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5778
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5779
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
50745780 global_node_page_state(NR_FILE_MAPPED),
50755781 global_node_page_state(NR_SHMEM),
50765782 global_zone_page_state(NR_PAGETABLE),
....@@ -5079,6 +5785,7 @@
50795785 free_pcp,
50805786 global_zone_page_state(NR_FREE_CMA_PAGES));
50815787
5788
+ trace_android_vh_show_mapcount_pages(NULL);
50825789 for_each_online_pgdat(pgdat) {
50835790 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
50845791 continue;
....@@ -5101,7 +5808,10 @@
51015808 " anon_thp: %lukB"
51025809 #endif
51035810 " writeback_tmp:%lukB"
5104
- " unstable:%lukB"
5811
+ " kernel_stack:%lukB"
5812
+#ifdef CONFIG_SHADOW_CALL_STACK
5813
+ " shadow_call_stack:%lukB"
5814
+#endif
51055815 " all_unreclaimable? %s"
51065816 "\n",
51075817 pgdat->node_id,
....@@ -5123,7 +5833,10 @@
51235833 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
51245834 #endif
51255835 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5126
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5836
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5837
+#ifdef CONFIG_SHADOW_CALL_STACK
5838
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5839
+#endif
51275840 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
51285841 "yes" : "no");
51295842 }
....@@ -5145,6 +5858,7 @@
51455858 " min:%lukB"
51465859 " low:%lukB"
51475860 " high:%lukB"
5861
+ " reserved_highatomic:%luKB"
51485862 " active_anon:%lukB"
51495863 " inactive_anon:%lukB"
51505864 " active_file:%lukB"
....@@ -5154,10 +5868,6 @@
51545868 " present:%lukB"
51555869 " managed:%lukB"
51565870 " mlocked:%lukB"
5157
- " kernel_stack:%lukB"
5158
-#ifdef CONFIG_SHADOW_CALL_STACK
5159
- " shadow_call_stack:%lukB"
5160
-#endif
51615871 " pagetables:%lukB"
51625872 " bounce:%lukB"
51635873 " free_pcp:%lukB"
....@@ -5169,6 +5879,7 @@
51695879 K(min_wmark_pages(zone)),
51705880 K(low_wmark_pages(zone)),
51715881 K(high_wmark_pages(zone)),
5882
+ K(zone->nr_reserved_highatomic),
51725883 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
51735884 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
51745885 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5176,12 +5887,8 @@
51765887 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
51775888 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
51785889 K(zone->present_pages),
5179
- K(zone->managed_pages),
5890
+ K(zone_managed_pages(zone)),
51805891 K(zone_page_state(zone, NR_MLOCK)),
5181
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5182
-#ifdef CONFIG_SHADOW_CALL_STACK
5183
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5184
-#endif
51855892 K(zone_page_state(zone, NR_PAGETABLE)),
51865893 K(zone_page_state(zone, NR_BOUNCE)),
51875894 K(free_pcp),
....@@ -5213,7 +5920,7 @@
52135920
52145921 types[order] = 0;
52155922 for (type = 0; type < MIGRATE_TYPES; type++) {
5216
- if (!list_empty(&area->free_list[type]))
5923
+ if (!free_area_empty(area, type))
52175924 types[order] |= 1 << type;
52185925 }
52195926 }
....@@ -5254,7 +5961,7 @@
52545961 do {
52555962 zone_type--;
52565963 zone = pgdat->node_zones + zone_type;
5257
- if (managed_zone(zone)) {
5964
+ if (populated_zone(zone)) {
52585965 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
52595966 check_highest_zone(zone_type);
52605967 }
....@@ -5280,36 +5987,17 @@
52805987 return 0;
52815988 }
52825989
5283
-static __init int setup_numa_zonelist_order(char *s)
5284
-{
5285
- if (!s)
5286
- return 0;
5287
-
5288
- return __parse_numa_zonelist_order(s);
5289
-}
5290
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5291
-
52925990 char numa_zonelist_order[] = "Node";
52935991
52945992 /*
52955993 * sysctl handler for numa_zonelist_order
52965994 */
52975995 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5298
- void __user *buffer, size_t *length,
5299
- loff_t *ppos)
5996
+ void *buffer, size_t *length, loff_t *ppos)
53005997 {
5301
- char *str;
5302
- int ret;
5303
-
5304
- if (!write)
5305
- return proc_dostring(table, write, buffer, length, ppos);
5306
- str = memdup_user_nul(buffer, 16);
5307
- if (IS_ERR(str))
5308
- return PTR_ERR(str);
5309
-
5310
- ret = __parse_numa_zonelist_order(str);
5311
- kfree(str);
5312
- return ret;
5998
+ if (write)
5999
+ return __parse_numa_zonelist_order(buffer);
6000
+ return proc_dostring(table, write, buffer, length, ppos);
53136001 }
53146002
53156003
....@@ -5328,14 +6016,14 @@
53286016 * from each node to each node in the system), and should also prefer nodes
53296017 * with no CPUs, since presumably they'll have very little allocation pressure
53306018 * on them otherwise.
5331
- * It returns -1 if no node is found.
6019
+ *
6020
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
53326021 */
53336022 static int find_next_best_node(int node, nodemask_t *used_node_mask)
53346023 {
53356024 int n, val;
53366025 int min_val = INT_MAX;
53376026 int best_node = NUMA_NO_NODE;
5338
- const struct cpumask *tmp = cpumask_of_node(0);
53396027
53406028 /* Use the local node if we haven't already */
53416029 if (!node_isset(node, *used_node_mask)) {
....@@ -5356,8 +6044,7 @@
53566044 val += (n < node);
53576045
53586046 /* Give preference to headless and unused nodes */
5359
- tmp = cpumask_of_node(n);
5360
- if (!cpumask_empty(tmp))
6047
+ if (!cpumask_empty(cpumask_of_node(n)))
53616048 val += PENALTY_FOR_NODE_WITH_CPUS;
53626049
53636050 /* Slight preference for less loaded node */
....@@ -5428,14 +6115,13 @@
54286115 {
54296116 static int node_order[MAX_NUMNODES];
54306117 int node, load, nr_nodes = 0;
5431
- nodemask_t used_mask;
6118
+ nodemask_t used_mask = NODE_MASK_NONE;
54326119 int local_node, prev_node;
54336120
54346121 /* NUMA-aware ordering of nodes */
54356122 local_node = pgdat->node_id;
54366123 load = nr_online_nodes;
54376124 prev_node = local_node;
5438
- nodes_clear(used_mask);
54396125
54406126 memset(node_order, 0, sizeof(node_order));
54416127 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5542,9 +6228,8 @@
55426228 int nid;
55436229 int __maybe_unused cpu;
55446230 pg_data_t *self = data;
5545
- static DEFINE_SPINLOCK(lock);
55466231
5547
- spin_lock(&lock);
6232
+ write_seqlock(&zonelist_update_seq);
55486233
55496234 #ifdef CONFIG_NUMA
55506235 memset(node_load, 0, sizeof(node_load));
....@@ -5577,7 +6262,7 @@
55776262 #endif
55786263 }
55796264
5580
- spin_unlock(&lock);
6265
+ write_sequnlock(&zonelist_update_seq);
55816266 }
55826267
55836268 static noinline void __init
....@@ -5615,13 +6300,16 @@
56156300 */
56166301 void __ref build_all_zonelists(pg_data_t *pgdat)
56176302 {
6303
+ unsigned long vm_total_pages;
6304
+
56186305 if (system_state == SYSTEM_BOOTING) {
56196306 build_all_zonelists_init();
56206307 } else {
56216308 __build_all_zonelists(pgdat);
56226309 /* cpuset refresh routine should be here */
56236310 }
5624
- vm_total_pages = nr_free_pagecache_pages();
6311
+ /* Get the number of free pages beyond high watermark in all zones. */
6312
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
56256313 /*
56266314 * Disable grouping by mobility if the number of pages in the
56276315 * system is too low to allow the mechanism to work. It would be
....@@ -5634,7 +6322,7 @@
56346322 else
56356323 page_group_by_mobility_disabled = 0;
56366324
5637
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6325
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
56386326 nr_online_nodes,
56396327 page_group_by_mobility_disabled ? "off" : "on",
56406328 vm_total_pages);
....@@ -5643,81 +6331,148 @@
56436331 #endif
56446332 }
56456333
6334
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6335
+static bool __meminit
6336
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6337
+{
6338
+ static struct memblock_region *r;
6339
+
6340
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6341
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6342
+ for_each_mem_region(r) {
6343
+ if (*pfn < memblock_region_memory_end_pfn(r))
6344
+ break;
6345
+ }
6346
+ }
6347
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6348
+ memblock_is_mirror(r)) {
6349
+ *pfn = memblock_region_memory_end_pfn(r);
6350
+ return true;
6351
+ }
6352
+ }
6353
+ return false;
6354
+}
6355
+
56466356 /*
56476357 * Initially all pages are reserved - free ones are freed
5648
- * up by free_all_bootmem() once the early boot process is
6358
+ * up by memblock_free_all() once the early boot process is
56496359 * done. Non-atomic initialization, single-pass.
6360
+ *
6361
+ * All aligned pageblocks are initialized to the specified migratetype
6362
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6363
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
56506364 */
56516365 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5652
- unsigned long start_pfn, enum meminit_context context,
5653
- struct vmem_altmap *altmap)
6366
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6367
+ enum meminit_context context,
6368
+ struct vmem_altmap *altmap, int migratetype)
56546369 {
5655
- unsigned long end_pfn = start_pfn + size;
5656
- pg_data_t *pgdat = NODE_DATA(nid);
5657
- unsigned long pfn;
5658
- unsigned long nr_initialised = 0;
6370
+ unsigned long pfn, end_pfn = start_pfn + size;
56596371 struct page *page;
5660
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5661
- struct memblock_region *r = NULL, *tmp;
5662
-#endif
56636372
56646373 if (highest_memmap_pfn < end_pfn - 1)
56656374 highest_memmap_pfn = end_pfn - 1;
6375
+
6376
+#ifdef CONFIG_ZONE_DEVICE
6377
+ /*
6378
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6379
+ * memory. We limit the total number of pages to initialize to just
6380
+ * those that might contain the memory mapping. We will defer the
6381
+ * ZONE_DEVICE page initialization until after we have released
6382
+ * the hotplug lock.
6383
+ */
6384
+ if (zone == ZONE_DEVICE) {
6385
+ if (!altmap)
6386
+ return;
6387
+
6388
+ if (start_pfn == altmap->base_pfn)
6389
+ start_pfn += altmap->reserve;
6390
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6391
+ }
6392
+#endif
56666393
56676394 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
56686395 /* Zero all page struct in advance */
56696396 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
56706397 #endif
56716398
5672
- /*
5673
- * Honor reservation requested by the driver for this ZONE_DEVICE
5674
- * memory
5675
- */
5676
- if (altmap && start_pfn == altmap->base_pfn)
5677
- start_pfn += altmap->reserve;
5678
-
5679
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6399
+ for (pfn = start_pfn; pfn < end_pfn; ) {
56806400 /*
56816401 * There can be holes in boot-time mem_map[]s handed to this
56826402 * function. They do not exist on hotplugged memory.
56836403 */
5684
- if (context != MEMINIT_EARLY)
5685
- goto not_early;
5686
-
5687
- if (!early_pfn_valid(pfn))
5688
- continue;
5689
- if (!early_pfn_in_nid(pfn, nid))
5690
- continue;
5691
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5692
- break;
5693
-
5694
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5695
- /*
5696
- * Check given memblock attribute by firmware which can affect
5697
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5698
- * mirrored, it's an overlapped memmap init. skip it.
5699
- */
5700
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5701
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5702
- for_each_memblock(memory, tmp)
5703
- if (pfn < memblock_region_memory_end_pfn(tmp))
5704
- break;
5705
- r = tmp;
5706
- }
5707
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5708
- memblock_is_mirror(r)) {
5709
- /* already initialized as NORMAL */
5710
- pfn = memblock_region_memory_end_pfn(r);
6404
+ if (context == MEMINIT_EARLY) {
6405
+ if (overlap_memmap_init(zone, &pfn))
57116406 continue;
5712
- }
6407
+ if (defer_init(nid, pfn, zone_end_pfn))
6408
+ break;
57136409 }
5714
-#endif
57156410
5716
-not_early:
57176411 page = pfn_to_page(pfn);
57186412 __init_single_page(page, pfn, zone, nid, false);
57196413 if (context == MEMINIT_HOTPLUG)
5720
- SetPageReserved(page);
6414
+ __SetPageReserved(page);
6415
+
6416
+ /*
6417
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6418
+ * such that unmovable allocations won't be scattered all
6419
+ * over the place during system boot.
6420
+ */
6421
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6422
+ set_pageblock_migratetype(page, migratetype);
6423
+ cond_resched();
6424
+ }
6425
+ pfn++;
6426
+ }
6427
+}
6428
+
6429
+#ifdef CONFIG_ZONE_DEVICE
6430
+void __ref memmap_init_zone_device(struct zone *zone,
6431
+ unsigned long start_pfn,
6432
+ unsigned long nr_pages,
6433
+ struct dev_pagemap *pgmap)
6434
+{
6435
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6436
+ struct pglist_data *pgdat = zone->zone_pgdat;
6437
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6438
+ unsigned long zone_idx = zone_idx(zone);
6439
+ unsigned long start = jiffies;
6440
+ int nid = pgdat->node_id;
6441
+
6442
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6443
+ return;
6444
+
6445
+ /*
6446
+ * The call to memmap_init should have already taken care
6447
+ * of the pages reserved for the memmap, so we can just jump to
6448
+ * the end of that region and start processing the device pages.
6449
+ */
6450
+ if (altmap) {
6451
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6452
+ nr_pages = end_pfn - start_pfn;
6453
+ }
6454
+
6455
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6456
+ struct page *page = pfn_to_page(pfn);
6457
+
6458
+ __init_single_page(page, pfn, zone_idx, nid, true);
6459
+
6460
+ /*
6461
+ * Mark page reserved as it will need to wait for onlining
6462
+ * phase for it to be fully associated with a zone.
6463
+ *
6464
+ * We can use the non-atomic __set_bit operation for setting
6465
+ * the flag as we are still initializing the pages.
6466
+ */
6467
+ __SetPageReserved(page);
6468
+
6469
+ /*
6470
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6471
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6472
+ * ever freed or placed on a driver-private list.
6473
+ */
6474
+ page->pgmap = pgmap;
6475
+ page->zone_device_data = NULL;
57216476
57226477 /*
57236478 * Mark the block movable so that blocks are reserved for
....@@ -5726,21 +6481,20 @@
57266481 * the address space during boot when many long-lived
57276482 * kernel allocations are made.
57286483 *
5729
- * bitmap is created for zone's valid pfn range. but memmap
5730
- * can be created for invalid pages (for alignment)
5731
- * check here not to call set_pageblock_migratetype() against
5732
- * pfn out of zone.
5733
- *
57346484 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5735
- * because this is done early in sparse_add_one_section
6485
+ * because this is done early in section_activate()
57366486 */
5737
- if (!(pfn & (pageblock_nr_pages - 1))) {
6487
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
57386488 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
57396489 cond_resched();
57406490 }
57416491 }
6492
+
6493
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6494
+ nr_pages, jiffies_to_msecs(jiffies - start));
57426495 }
57436496
6497
+#endif
57446498 static void __meminit zone_init_free_lists(struct zone *zone)
57456499 {
57466500 unsigned int order, t;
....@@ -5750,11 +6504,118 @@
57506504 }
57516505 }
57526506
5753
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5754
-#define memmap_init(size, nid, zone, start_pfn) \
5755
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5756
- MEMINIT_EARLY, NULL)
6507
+/*
6508
+ * Only struct pages that correspond to ranges defined by memblock.memory
6509
+ * are zeroed and initialized by going through __init_single_page() during
6510
+ * memmap_init_zone_range().
6511
+ *
6512
+ * But, there could be struct pages that correspond to holes in
6513
+ * memblock.memory. This can happen because of the following reasons:
6514
+ * - physical memory bank size is not necessarily the exact multiple of the
6515
+ * arbitrary section size
6516
+ * - early reserved memory may not be listed in memblock.memory
6517
+ * - memory layouts defined with memmap= kernel parameter may not align
6518
+ * nicely with memmap sections
6519
+ *
6520
+ * Explicitly initialize those struct pages so that:
6521
+ * - PG_Reserved is set
6522
+ * - zone and node links point to zone and node that span the page if the
6523
+ * hole is in the middle of a zone
6524
+ * - zone and node links point to adjacent zone/node if the hole falls on
6525
+ * the zone boundary; the pages in such holes will be prepended to the
6526
+ * zone/node above the hole except for the trailing pages in the last
6527
+ * section that will be appended to the zone/node below.
6528
+ */
6529
+static void __init init_unavailable_range(unsigned long spfn,
6530
+ unsigned long epfn,
6531
+ int zone, int node)
6532
+{
6533
+ unsigned long pfn;
6534
+ u64 pgcnt = 0;
6535
+
6536
+ for (pfn = spfn; pfn < epfn; pfn++) {
6537
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6538
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6539
+ + pageblock_nr_pages - 1;
6540
+ continue;
6541
+ }
6542
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6543
+ __SetPageReserved(pfn_to_page(pfn));
6544
+ pgcnt++;
6545
+ }
6546
+
6547
+ if (pgcnt)
6548
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6549
+ node, zone_names[zone], pgcnt);
6550
+}
6551
+
6552
+static void __init memmap_init_zone_range(struct zone *zone,
6553
+ unsigned long start_pfn,
6554
+ unsigned long end_pfn,
6555
+ unsigned long *hole_pfn)
6556
+{
6557
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6558
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6559
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6560
+
6561
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6562
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6563
+
6564
+ if (start_pfn >= end_pfn)
6565
+ return;
6566
+
6567
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6568
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6569
+
6570
+ if (*hole_pfn < start_pfn)
6571
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6572
+
6573
+ *hole_pfn = end_pfn;
6574
+}
6575
+
6576
+void __init __weak memmap_init(void)
6577
+{
6578
+ unsigned long start_pfn, end_pfn;
6579
+ unsigned long hole_pfn = 0;
6580
+ int i, j, zone_id, nid;
6581
+
6582
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6583
+ struct pglist_data *node = NODE_DATA(nid);
6584
+
6585
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6586
+ struct zone *zone = node->node_zones + j;
6587
+
6588
+ if (!populated_zone(zone))
6589
+ continue;
6590
+
6591
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6592
+ &hole_pfn);
6593
+ zone_id = j;
6594
+ }
6595
+ }
6596
+
6597
+#ifdef CONFIG_SPARSEMEM
6598
+ /*
6599
+ * Initialize the memory map for hole in the range [memory_end,
6600
+ * section_end].
6601
+ * Append the pages in this hole to the highest zone in the last
6602
+ * node.
6603
+ * The call to init_unavailable_range() is outside the ifdef to
6604
+ * silence the compiler warning about zone_id set but not used;
6605
+ * for FLATMEM it is a nop anyway
6606
+ */
6607
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6608
+ if (hole_pfn < end_pfn)
57576609 #endif
6610
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6611
+}
6612
+
6613
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6614
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6615
+ unsigned long zone,
6616
+ unsigned long range_start_pfn)
6617
+{
6618
+}
57586619
57596620 static int zone_batchsize(struct zone *zone)
57606621 {
....@@ -5765,7 +6626,7 @@
57656626 * The per-cpu-pages pools are set to around 1000th of the
57666627 * size of the zone.
57676628 */
5768
- batch = zone->managed_pages / 1024;
6629
+ batch = zone_managed_pages(zone) / 1024;
57696630 /* But no more than a meg. */
57706631 if (batch * PAGE_SIZE > 1024 * 1024)
57716632 batch = (1024 * 1024) / PAGE_SIZE;
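/*
 * Worked sketch of the first steps of zone_batchsize() above: the per-cpu
 * batch starts at about 1/1024 of the zone's managed pages and is capped at
 * one megabyte worth of pages. (The real function goes on to round the value
 * down to a power-of-two friendly size; that part is not modelled here.)
 * A 4 KiB page size is assumed.
 */
#include <stdio.h>

#define SK_PAGE_SIZE 4096ul

static unsigned long sk_zone_batchsize(unsigned long managed_pages)
{
        unsigned long batch = managed_pages / 1024;

        if (batch * SK_PAGE_SIZE > 1024 * 1024)         /* no more than a meg */
                batch = (1024 * 1024) / SK_PAGE_SIZE;
        return batch;
}

int main(void)
{
        /* A 4 GiB zone has 1048576 managed 4 KiB pages. */
        printf("batch for 4 GiB zone: %lu pages\n", sk_zone_batchsize(1048576));
        /* 1048576/1024 = 1024 pages = 4 MiB, so the 1 MiB cap applies: 256. */
        printf("batch for 128 MiB zone: %lu pages\n", sk_zone_batchsize(32768));
        /* 32768/1024 = 32 pages = 128 KiB, under the cap, so 32 stays. */
        return 0;
}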
....@@ -5812,7 +6673,7 @@
58126673 * locking.
58136674 *
58146675 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5815
- * those fields changing asynchronously (acording the the above rule).
6676
+ * those fields changing asynchronously (according to the above rule).
58166677 *
58176678 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
58186679 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5846,7 +6707,6 @@
58466707 memset(p, 0, sizeof(*p));
58476708
58486709 pcp = &p->pcp;
5849
- pcp->count = 0;
58506710 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
58516711 INIT_LIST_HEAD(&pcp->lists[migratetype]);
58526712 }
....@@ -5876,7 +6736,7 @@
58766736 {
58776737 if (percpu_pagelist_fraction)
58786738 pageset_set_high(pcp,
5879
- (zone->managed_pages /
6739
+ (zone_managed_pages(zone) /
58806740 percpu_pagelist_fraction));
58816741 else
58826742 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5906,9 +6766,24 @@
59066766 {
59076767 struct pglist_data *pgdat;
59086768 struct zone *zone;
6769
+ int __maybe_unused cpu;
59096770
59106771 for_each_populated_zone(zone)
59116772 setup_zone_pageset(zone);
6773
+
6774
+#ifdef CONFIG_NUMA
6775
+ /*
6776
+ * Unpopulated zones continue using the boot pagesets.
6777
+ * The numa stats for these pagesets need to be reset.
6778
+ * Otherwise, they will end up skewing the stats of
6779
+ * the nodes these zones are associated with.
6780
+ */
6781
+ for_each_possible_cpu(cpu) {
6782
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6783
+ memset(pcp->vm_numa_stat_diff, 0,
6784
+ sizeof(pcp->vm_numa_stat_diff));
6785
+ }
6786
+#endif
59126787
59136788 for_each_online_pgdat(pgdat)
59146789 pgdat->per_cpu_nodestats =
....@@ -5952,73 +6827,6 @@
59526827 zone->initialized = 1;
59536828 }
59546829
5955
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5956
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5957
-
5958
-/*
5959
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5960
- */
5961
-int __meminit __early_pfn_to_nid(unsigned long pfn,
5962
- struct mminit_pfnnid_cache *state)
5963
-{
5964
- unsigned long start_pfn, end_pfn;
5965
- int nid;
5966
-
5967
- if (state->last_start <= pfn && pfn < state->last_end)
5968
- return state->last_nid;
5969
-
5970
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5971
- if (nid != -1) {
5972
- state->last_start = start_pfn;
5973
- state->last_end = end_pfn;
5974
- state->last_nid = nid;
5975
- }
5976
-
5977
- return nid;
5978
-}
5979
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5980
-
5981
-/**
5982
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5983
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5984
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5985
- *
5986
- * If an architecture guarantees that all ranges registered contain no holes
5987
- * and may be freed, this this function may be used instead of calling
5988
- * memblock_free_early_nid() manually.
5989
- */
5990
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5991
-{
5992
- unsigned long start_pfn, end_pfn;
5993
- int i, this_nid;
5994
-
5995
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5996
- start_pfn = min(start_pfn, max_low_pfn);
5997
- end_pfn = min(end_pfn, max_low_pfn);
5998
-
5999
- if (start_pfn < end_pfn)
6000
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6001
- (end_pfn - start_pfn) << PAGE_SHIFT,
6002
- this_nid);
6003
- }
6004
-}
6005
-
6006
-/**
6007
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6008
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6009
- *
6010
- * If an architecture guarantees that all ranges registered contain no holes and may
6011
- * be freed, this function may be used instead of calling memory_present() manually.
6012
- */
6013
-void __init sparse_memory_present_with_active_regions(int nid)
6014
-{
6015
- unsigned long start_pfn, end_pfn;
6016
- int i, this_nid;
6017
-
6018
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6019
- memory_present(this_nid, start_pfn, end_pfn);
6020
-}
6021
-
60226830 /**
60236831 * get_pfn_range_for_nid - Return the start and end page frames for a node
60246832 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6030,7 +6838,7 @@
60306838 * with no available memory, a warning is printed and the start and end
60316839 * PFNs will be 0.
60326840 */
6033
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6841
+void __init get_pfn_range_for_nid(unsigned int nid,
60346842 unsigned long *start_pfn, unsigned long *end_pfn)
60356843 {
60366844 unsigned long this_start_pfn, this_end_pfn;
....@@ -6079,7 +6887,7 @@
60796887 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
60806888 * zones within a node are in order of monotonic increases memory addresses
60816889 */
6082
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6890
+static void __init adjust_zone_range_for_zone_movable(int nid,
60836891 unsigned long zone_type,
60846892 unsigned long node_start_pfn,
60856893 unsigned long node_end_pfn,
....@@ -6110,13 +6918,12 @@
61106918 * Return the number of pages a zone spans in a node, including holes
61116919 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61126920 */
6113
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6921
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61146922 unsigned long zone_type,
61156923 unsigned long node_start_pfn,
61166924 unsigned long node_end_pfn,
61176925 unsigned long *zone_start_pfn,
6118
- unsigned long *zone_end_pfn,
6119
- unsigned long *ignored)
6926
+ unsigned long *zone_end_pfn)
61206927 {
61216928 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61226929 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6147,7 +6954,7 @@
61476954 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
61486955 * then all holes in the requested range will be accounted for.
61496956 */
6150
-unsigned long __meminit __absent_pages_in_range(int nid,
6957
+unsigned long __init __absent_pages_in_range(int nid,
61516958 unsigned long range_start_pfn,
61526959 unsigned long range_end_pfn)
61536960 {
....@@ -6168,7 +6975,7 @@
61686975 * @start_pfn: The start PFN to start searching for holes
61696976 * @end_pfn: The end PFN to stop searching for holes
61706977 *
6171
- * It returns the number of pages frames in memory holes within a range.
6978
+ * Return: the number of pages frames in memory holes within a range.
61726979 */
61736980 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
61746981 unsigned long end_pfn)
....@@ -6177,11 +6984,10 @@
61776984 }
61786985
61796986 /* Return the number of page frames in holes in a zone on a node */
6180
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6987
+static unsigned long __init zone_absent_pages_in_node(int nid,
61816988 unsigned long zone_type,
61826989 unsigned long node_start_pfn,
6183
- unsigned long node_end_pfn,
6184
- unsigned long *ignored)
6990
+ unsigned long node_end_pfn)
61856991 {
61866992 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61876993 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6209,7 +7015,7 @@
62097015 unsigned long start_pfn, end_pfn;
62107016 struct memblock_region *r;
62117017
6212
- for_each_memblock(memory, r) {
7018
+ for_each_mem_region(r) {
62137019 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62147020 zone_start_pfn, zone_end_pfn);
62157021 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6228,45 +7034,9 @@
62287034 return nr_absent;
62297035 }
62307036
6231
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6232
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6233
- unsigned long zone_type,
6234
- unsigned long node_start_pfn,
6235
- unsigned long node_end_pfn,
6236
- unsigned long *zone_start_pfn,
6237
- unsigned long *zone_end_pfn,
6238
- unsigned long *zones_size)
6239
-{
6240
- unsigned int zone;
6241
-
6242
- *zone_start_pfn = node_start_pfn;
6243
- for (zone = 0; zone < zone_type; zone++)
6244
- *zone_start_pfn += zones_size[zone];
6245
-
6246
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6247
-
6248
- return zones_size[zone_type];
6249
-}
6250
-
6251
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6252
- unsigned long zone_type,
7037
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
62537038 unsigned long node_start_pfn,
6254
- unsigned long node_end_pfn,
6255
- unsigned long *zholes_size)
6256
-{
6257
- if (!zholes_size)
6258
- return 0;
6259
-
6260
- return zholes_size[zone_type];
6261
-}
6262
-
6263
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6264
-
6265
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6266
- unsigned long node_start_pfn,
6267
- unsigned long node_end_pfn,
6268
- unsigned long *zones_size,
6269
- unsigned long *zholes_size)
7039
+ unsigned long node_end_pfn)
62707040 {
62717041 unsigned long realtotalpages = 0, totalpages = 0;
62727042 enum zone_type i;
....@@ -6274,17 +7044,21 @@
62747044 for (i = 0; i < MAX_NR_ZONES; i++) {
62757045 struct zone *zone = pgdat->node_zones + i;
62767046 unsigned long zone_start_pfn, zone_end_pfn;
7047
+ unsigned long spanned, absent;
62777048 unsigned long size, real_size;
62787049
6279
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6280
- node_start_pfn,
6281
- node_end_pfn,
6282
- &zone_start_pfn,
6283
- &zone_end_pfn,
6284
- zones_size);
6285
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6286
- node_start_pfn, node_end_pfn,
6287
- zholes_size);
7050
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7051
+ node_start_pfn,
7052
+ node_end_pfn,
7053
+ &zone_start_pfn,
7054
+ &zone_end_pfn);
7055
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7056
+ node_start_pfn,
7057
+ node_end_pfn);
7058
+
7059
+ size = spanned;
7060
+ real_size = size - absent;
7061
+
62887062 if (size)
62897063 zone->zone_start_pfn = zone_start_pfn;
62907064 else
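The rework above reduces to simple per-zone arithmetic: spanned is the zone's PFN window clamped to the node, absent is the holes inside that window, and present is their difference. A small standalone model of that accounting (the helper below is invented for illustration, not kernel API):

#include <stdio.h>

/* Illustrative stand-in for zone_spanned_pages_in_node(): clamp the zone
 * limits to the node limits and return the width of the intersection. */
static unsigned long spanned_pages(unsigned long node_start, unsigned long node_end,
				   unsigned long zone_low, unsigned long zone_high)
{
	unsigned long start = zone_low > node_start ? zone_low : node_start;
	unsigned long end = zone_high < node_end ? zone_high : node_end;

	return end > start ? end - start : 0;
}

int main(void)
{
	/* Example node spanning PFNs [0x1000, 0x9000) with zone limits [0x0, 0x8000). */
	unsigned long spanned = spanned_pages(0x1000, 0x9000, 0x0, 0x8000);
	unsigned long absent = 0x500;	/* holes reported by memblock; made up here */
	unsigned long present = spanned - absent;

	printf("spanned=%lu absent=%lu present=%lu\n", spanned, absent, present);
	return 0;
}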
....@@ -6330,10 +7104,14 @@
63307104 {
63317105 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
63327106 zone->pageblock_flags = NULL;
6333
- if (usemapsize)
7107
+ if (usemapsize) {
63347108 zone->pageblock_flags =
6335
- memblock_virt_alloc_node_nopanic(usemapsize,
6336
- pgdat->node_id);
7109
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7110
+ pgdat->node_id);
7111
+ if (!zone->pageblock_flags)
7112
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7113
+ usemapsize, zone->name, pgdat->node_id);
7114
+ }
63377115 }
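The hunk above replaces the self-panicking memblock_virt_alloc_node_nopanic() with memblock_alloc_node() plus an explicit panic() on failure. A distilled, kernel-style sketch of that idiom (the wrapper name is invented for illustration and is not part of this patch):

/* Sketch only: allocate boot memory near a node or die loudly. */
static void *early_alloc_or_panic(unsigned long size, int nid, const char *what)
{
	void *p = memblock_alloc_node(size, SMP_CACHE_BYTES, nid);

	if (!p)
		panic("Failed to allocate %lu bytes for %s on node %d\n",
		      size, what, nid);
	return p;
}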
63387116 #else
63397117 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6400,9 +7178,11 @@
64007178 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64017179 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64027180 {
6403
- spin_lock_init(&pgdat->split_queue_lock);
6404
- INIT_LIST_HEAD(&pgdat->split_queue);
6405
- pgdat->split_queue_len = 0;
7181
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7182
+
7183
+ spin_lock_init(&ds_queue->split_queue_lock);
7184
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7185
+ ds_queue->split_queue_len = 0;
64067186 }
64077187 #else
64087188 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6429,13 +7209,13 @@
64297209
64307210 pgdat_page_ext_init(pgdat);
64317211 spin_lock_init(&pgdat->lru_lock);
6432
- lruvec_init(node_lruvec(pgdat));
7212
+ lruvec_init(&pgdat->__lruvec);
64337213 }
64347214
64357215 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
64367216 unsigned long remaining_pages)
64377217 {
6438
- zone->managed_pages = remaining_pages;
7218
+ atomic_long_set(&zone->managed_pages, remaining_pages);
64397219 zone_set_nid(zone, nid);
64407220 zone->name = zone_names[idx];
64417221 zone->zone_pgdat = NODE_DATA(nid);
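With managed_pages now an atomic_long_t, later hunks in this patch read it through zone_managed_pages() instead of touching the field directly. A standalone model of that accessor pattern (kernel types are mocked here; the real helpers live in the mm headers):

#include <stdio.h>

struct zone_model {
	long managed_pages;		/* stands in for atomic_long_t */
};

static unsigned long zone_managed_pages(struct zone_model *zone)
{
	/* atomic_long_read(&zone->managed_pages) in the kernel */
	return (unsigned long)zone->managed_pages;
}

int main(void)
{
	struct zone_model z = { .managed_pages = 0 };

	z.managed_pages = 262144;	/* atomic_long_set() at zone init */
	z.managed_pages += 512;		/* atomic_long_add() in adjust_managed_page_count() */
	printf("managed pages: %lu\n", zone_managed_pages(&z));
	return 0;
}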
....@@ -6533,7 +7313,7 @@
65337313 set_pageblock_order();
65347314 setup_usemap(pgdat, zone, zone_start_pfn, size);
65357315 init_currently_empty_zone(zone, zone_start_pfn, size);
6536
- memmap_init(size, nid, j, zone_start_pfn);
7316
+ arch_memmap_init(size, nid, j, zone_start_pfn);
65377317 }
65387318 }
65397319
....@@ -6562,7 +7342,11 @@
65627342 end = pgdat_end_pfn(pgdat);
65637343 end = ALIGN(end, MAX_ORDER_NR_PAGES);
65647344 size = (end - start) * sizeof(struct page);
6565
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7345
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7346
+ pgdat->node_id);
7347
+ if (!map)
7348
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7349
+ size, pgdat->node_id);
65667350 pgdat->node_mem_map = map + offset;
65677351 }
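For scale, a worked example of the size computed above, assuming 4 KiB pages and a 64-byte struct page (both are configuration-dependent assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed PAGE_SIZE */
	unsigned long struct_page_size = 64;		/* assumed sizeof(struct page) */
	unsigned long span_bytes = 1UL << 30;		/* 1 GiB node span */
	unsigned long nr_pages = span_bytes / page_size;
	unsigned long memmap_bytes = nr_pages * struct_page_size;

	/* 262144 pages -> about 16 MiB of memmap for 1 GiB of RAM */
	printf("%lu pages -> %lu bytes of memmap\n", nr_pages, memmap_bytes);
	return 0;
}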
65687352 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6574,10 +7358,8 @@
65747358 */
65757359 if (pgdat == NODE_DATA(0)) {
65767360 mem_map = NODE_DATA(0)->node_mem_map;
6577
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
65787361 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
65797362 mem_map -= offset;
6580
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
65817363 }
65827364 #endif
65837365 }
....@@ -6588,42 +7370,31 @@
65887370 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
65897371 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
65907372 {
6591
- /*
6592
- * We start only with one section of pages, more pages are added as
6593
- * needed until the rest of deferred pages are initialized.
6594
- */
6595
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6596
- pgdat->node_spanned_pages);
65977373 pgdat->first_deferred_pfn = ULONG_MAX;
65987374 }
65997375 #else
66007376 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66017377 #endif
66027378
6603
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6604
- unsigned long node_start_pfn,
6605
- unsigned long *zholes_size)
7379
+static void __init free_area_init_node(int nid)
66067380 {
66077381 pg_data_t *pgdat = NODE_DATA(nid);
66087382 unsigned long start_pfn = 0;
66097383 unsigned long end_pfn = 0;
66107384
66117385 /* pg_data_t should be reset to zero when it's allocated */
6612
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7386
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7387
+
7388
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66137389
66147390 pgdat->node_id = nid;
6615
- pgdat->node_start_pfn = node_start_pfn;
7391
+ pgdat->node_start_pfn = start_pfn;
66167392 pgdat->per_cpu_nodestats = NULL;
6617
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6618
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7393
+
66197394 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
66207395 (u64)start_pfn << PAGE_SHIFT,
66217396 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6622
-#else
6623
- start_pfn = node_start_pfn;
6624
-#endif
6625
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6626
- zones_size, zholes_size);
7397
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
66277398
66287399 alloc_node_mem_map(pgdat);
66297400 pgdat_set_deferred_range(pgdat);
....@@ -6631,80 +7402,10 @@
66317402 free_area_init_core(pgdat);
66327403 }
66337404
6634
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6635
-
6636
-/*
6637
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6638
- * pages zeroed
6639
- */
6640
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7405
+void __init free_area_init_memoryless_node(int nid)
66417406 {
6642
- unsigned long pfn;
6643
- u64 pgcnt = 0;
6644
-
6645
- for (pfn = spfn; pfn < epfn; pfn++) {
6646
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6647
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6648
- + pageblock_nr_pages - 1;
6649
- continue;
6650
- }
6651
- mm_zero_struct_page(pfn_to_page(pfn));
6652
- pgcnt++;
6653
- }
6654
-
6655
- return pgcnt;
7407
+ free_area_init_node(nid);
66567408 }
6657
-
6658
-/*
6659
- * Only struct pages that are backed by physical memory are zeroed and
6660
- * initialized by going through __init_single_page(). But, there are some
6661
- * struct pages which are reserved in memblock allocator and their fields
6662
- * may be accessed (for example page_to_pfn() on some configuration accesses
6663
- * flags). We must explicitly zero those struct pages.
6664
- *
6665
- * This function also addresses a similar issue where struct pages are left
6666
- * uninitialized because the physical address range is not covered by
6667
- * memblock.memory or memblock.reserved. That could happen when memblock
6668
- * layout is manually configured via memmap=, or when the highest physical
6669
- * address (max_pfn) does not end on a section boundary.
6670
- */
6671
-void __init zero_resv_unavail(void)
6672
-{
6673
- phys_addr_t start, end;
6674
- u64 i, pgcnt;
6675
- phys_addr_t next = 0;
6676
-
6677
- /*
6678
- * Loop through unavailable ranges not covered by memblock.memory.
6679
- */
6680
- pgcnt = 0;
6681
- for_each_mem_range(i, &memblock.memory, NULL,
6682
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6683
- if (next < start)
6684
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6685
- next = end;
6686
- }
6687
-
6688
- /*
6689
- * Early sections always have a fully populated memmap for the whole
6690
- * section - see pfn_valid(). If the last section has holes at the
6691
- * end and that section is marked "online", the memmap will be
6692
- * considered initialized. Make sure that memmap has a well defined
6693
- * state.
6694
- */
6695
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6696
- round_up(max_pfn, PAGES_PER_SECTION));
6697
-
6698
- /*
6699
- * Struct pages that do not have backing memory. This could be because
6700
- * firmware is using some of this memory, or for some other reasons.
6701
- */
6702
- if (pgcnt)
6703
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6704
-}
6705
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6706
-
6707
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67087409
67097410 #if MAX_NUMNODES > 1
67107411 /*
....@@ -6735,14 +7436,14 @@
67357436 * model has fine enough granularity to avoid incorrect mapping for the
67367437 * populated node map.
67377438 *
6738
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7439
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
67397440 * requirement (single node).
67407441 */
67417442 unsigned long __init node_map_pfn_alignment(void)
67427443 {
67437444 unsigned long accl_mask = 0, last_end = 0;
67447445 unsigned long start, end, mask;
6745
- int last_nid = -1;
7446
+ int last_nid = NUMA_NO_NODE;
67467447 int i, nid;
67477448
67487449 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6769,33 +7470,15 @@
67697470 return ~accl_mask + 1;
67707471 }
67717472
6772
-/* Find the lowest pfn for a node */
6773
-static unsigned long __init find_min_pfn_for_node(int nid)
6774
-{
6775
- unsigned long min_pfn = ULONG_MAX;
6776
- unsigned long start_pfn;
6777
- int i;
6778
-
6779
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6780
- min_pfn = min(min_pfn, start_pfn);
6781
-
6782
- if (min_pfn == ULONG_MAX) {
6783
- pr_warn("Could not find start_pfn for node %d\n", nid);
6784
- return 0;
6785
- }
6786
-
6787
- return min_pfn;
6788
-}
6789
-
67907473 /**
67917474 * find_min_pfn_with_active_regions - Find the minimum PFN registered
67927475 *
6793
- * It returns the minimum PFN based on information provided via
7476
+ * Return: the minimum PFN based on information provided via
67947477 * memblock_set_node().
67957478 */
67967479 unsigned long __init find_min_pfn_with_active_regions(void)
67977480 {
6798
- return find_min_pfn_for_node(MAX_NUMNODES);
7481
+ return PHYS_PFN(memblock_start_of_DRAM());
67997482 }
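The new body is just a unit conversion: take the first byte of memory registered in memblock and shift it down to a page frame number. A one-line model of that conversion (the base address and PAGE_SHIFT are example assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long long dram_base = 0x80000000ULL;	/* example memblock_start_of_DRAM() */
	unsigned int page_shift = 12;			/* assumed 4 KiB pages */

	/* PHYS_PFN(addr) is simply addr >> PAGE_SHIFT */
	printf("minimum pfn = %llu\n", dram_base >> page_shift);
	return 0;
}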
68007483
68017484 /*
....@@ -6844,11 +7527,11 @@
68447527 * options.
68457528 */
68467529 if (movable_node_is_enabled()) {
6847
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
68487531 if (!memblock_is_hotpluggable(r))
68497532 continue;
68507533
6851
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
68527535
68537536 usable_startpfn = PFN_DOWN(r->base);
68547537 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6865,11 +7548,11 @@
68657548 if (mirrored_kernelcore) {
68667549 bool mem_below_4gb_not_mirrored = false;
68677550
6868
- for_each_memblock(memory, r) {
7551
+ for_each_mem_region(r) {
68697552 if (memblock_is_mirror(r))
68707553 continue;
68717554
6872
- nid = r->nid;
7555
+ nid = memblock_get_region_node(r);
68737556
68747557 usable_startpfn = memblock_region_memory_base_pfn(r);
68757558
....@@ -6884,7 +7567,7 @@
68847567 }
68857568
68867569 if (mem_below_4gb_not_mirrored)
6887
- pr_warn("This configuration results in unmirrored kernel memory.");
7570
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
68887571
68897572 goto out2;
68907573 }
....@@ -7023,9 +7706,16 @@
70237706
70247707 out2:
70257708 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7026
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7709
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7710
+ unsigned long start_pfn, end_pfn;
7711
+
70277712 zone_movable_pfn[nid] =
70287713 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7714
+
7715
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7716
+ if (zone_movable_pfn[nid] >= end_pfn)
7717
+ zone_movable_pfn[nid] = 0;
7718
+ }
70297719
70307720 out:
70317721 /* restore the node_state */
....@@ -7037,23 +7727,29 @@
70377727 {
70387728 enum zone_type zone_type;
70397729
7040
- if (N_MEMORY == N_NORMAL_MEMORY)
7041
- return;
7042
-
70437730 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
70447731 struct zone *zone = &pgdat->node_zones[zone_type];
70457732 if (populated_zone(zone)) {
7046
- node_set_state(nid, N_HIGH_MEMORY);
7047
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7048
- zone_type <= ZONE_NORMAL)
7733
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7734
+ node_set_state(nid, N_HIGH_MEMORY);
7735
+ if (zone_type <= ZONE_NORMAL)
70497736 node_set_state(nid, N_NORMAL_MEMORY);
70507737 break;
70517738 }
70527739 }
70537740 }
70547741
7742
+/*
7743
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7744
+ * such cases we allow max_zone_pfn to be sorted in descending order.
7745
+ */
7746
+bool __weak arch_has_descending_max_zone_pfns(void)
7747
+{
7748
+ return false;
7749
+}
7750
+
70557751 /**
7056
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7752
+ * free_area_init - Initialise all pg_data_t and zone data
70577753 * @max_zone_pfn: an array of max PFNs for each zone
70587754 *
70597755 * This will call free_area_init_node() for each active node in the system.
....@@ -7065,10 +7761,11 @@
70657761 * starts where the previous one ended. For example, ZONE_DMA32 starts
70667762 * at arch_max_dma_pfn.
70677763 */
7068
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7764
+void __init free_area_init(unsigned long *max_zone_pfn)
70697765 {
70707766 unsigned long start_pfn, end_pfn;
7071
- int i, nid;
7767
+ int i, nid, zone;
7768
+ bool descending;
70727769
70737770 /* Record where the zone boundaries are */
70747771 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7077,14 +7774,20 @@
70777774 sizeof(arch_zone_highest_possible_pfn));
70787775
70797776 start_pfn = find_min_pfn_with_active_regions();
7777
+ descending = arch_has_descending_max_zone_pfns();
70807778
70817779 for (i = 0; i < MAX_NR_ZONES; i++) {
7082
- if (i == ZONE_MOVABLE)
7780
+ if (descending)
7781
+ zone = MAX_NR_ZONES - i - 1;
7782
+ else
7783
+ zone = i;
7784
+
7785
+ if (zone == ZONE_MOVABLE)
70837786 continue;
70847787
7085
- end_pfn = max(max_zone_pfn[i], start_pfn);
7086
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7087
- arch_zone_highest_possible_pfn[i] = end_pfn;
7788
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7789
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7790
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
70887791
70897792 start_pfn = end_pfn;
70907793 }
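The loop above turns the cumulative max_zone_pfn[] array into per-zone [lowest, highest) PFN ranges, walking the zones in reverse when the architecture reports a descending layout. A standalone model of that bookkeeping (ZONE_MOVABLE skipping is omitted and the sizes are made up):

#include <stdio.h>
#include <stdbool.h>

#define NR_ZONES 3

int main(void)
{
	unsigned long max_zone_pfn[NR_ZONES] = { 0x100000, 0x400000, 0x400000 };
	unsigned long lowest[NR_ZONES] = { 0 }, highest[NR_ZONES] = { 0 };
	unsigned long start_pfn = 0x1000;	/* first usable PFN */
	bool descending = false;		/* arch_has_descending_max_zone_pfns() */
	int i;

	for (i = 0; i < NR_ZONES; i++) {
		int zone = descending ? NR_ZONES - i - 1 : i;
		unsigned long end_pfn = max_zone_pfn[zone] > start_pfn ?
					max_zone_pfn[zone] : start_pfn;

		lowest[zone] = start_pfn;
		highest[zone] = end_pfn;
		start_pfn = end_pfn;	/* the next zone starts where this one ended */
	}

	for (i = 0; i < NR_ZONES; i++)
		printf("zone %d: [%#lx, %#lx)\n", i, lowest[i], highest[i]);
	return 0;
}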
....@@ -7118,27 +7821,33 @@
71187821 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
71197822 }
71207823
7121
- /* Print out the early node map */
7824
+ /*
7825
+ * Print out the early node map, and initialize the
7826
+ * subsection-map relative to active online memory ranges to
7827
+ * enable future "sub-section" extensions of the memory map.
7828
+ */
71227829 pr_info("Early memory node ranges\n");
7123
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7830
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
71247831 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
71257832 (u64)start_pfn << PAGE_SHIFT,
71267833 ((u64)end_pfn << PAGE_SHIFT) - 1);
7834
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7835
+ }
71277836
71287837 /* Initialise every node */
71297838 mminit_verify_pageflags_layout();
71307839 setup_nr_node_ids();
7131
- zero_resv_unavail();
71327840 for_each_online_node(nid) {
71337841 pg_data_t *pgdat = NODE_DATA(nid);
7134
- free_area_init_node(nid, NULL,
7135
- find_min_pfn_for_node(nid), NULL);
7842
+ free_area_init_node(nid);
71367843
71377844 /* Any memory on that node */
71387845 if (pgdat->node_present_pages)
71397846 node_set_state(nid, N_MEMORY);
71407847 check_for_memory(pgdat, nid);
71417848 }
7849
+
7850
+ memmap_init();
71427851 }
71437852
71447853 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7197,22 +7906,18 @@
71977906 early_param("kernelcore", cmdline_parse_kernelcore);
71987907 early_param("movablecore", cmdline_parse_movablecore);
71997908
7200
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7201
-
72027909 void adjust_managed_page_count(struct page *page, long count)
72037910 {
7204
- spin_lock(&managed_page_count_lock);
7205
- page_zone(page)->managed_pages += count;
7206
- totalram_pages += count;
7911
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7912
+ totalram_pages_add(count);
72077913 #ifdef CONFIG_HIGHMEM
72087914 if (PageHighMem(page))
7209
- totalhigh_pages += count;
7915
+ totalhigh_pages_add(count);
72107916 #endif
7211
- spin_unlock(&managed_page_count_lock);
72127917 }
72137918 EXPORT_SYMBOL(adjust_managed_page_count);
72147919
7215
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7920
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
72167921 {
72177922 void *pos;
72187923 unsigned long pages = 0;
....@@ -7231,6 +7936,11 @@
72317936 * alias for the memset().
72327937 */
72337938 direct_map_addr = page_address(page);
7939
+ /*
7940
+ * Perform a kasan-unchecked memset() since this memory
7941
+ * has not been initialized.
7942
+ */
7943
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
72347944 if ((unsigned int)poison <= 0xFF)
72357945 memset(direct_map_addr, poison, PAGE_SIZE);
72367946
....@@ -7243,15 +7953,14 @@
72437953
72447954 return pages;
72457955 }
7246
-EXPORT_SYMBOL(free_reserved_area);
72477956
72487957 #ifdef CONFIG_HIGHMEM
72497958 void free_highmem_page(struct page *page)
72507959 {
72517960 __free_reserved_page(page);
7252
- totalram_pages++;
7253
- page_zone(page)->managed_pages++;
7254
- totalhigh_pages++;
7961
+ totalram_pages_inc();
7962
+ atomic_long_inc(&page_zone(page)->managed_pages);
7963
+ totalhigh_pages_inc();
72557964 }
72567965 #endif
72577966
....@@ -7278,7 +7987,7 @@
72787987 */
72797988 #define adj_init_size(start, end, size, pos, adj) \
72807989 do { \
7281
- if (start <= pos && pos < end && size > adj) \
7990
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
72827991 size -= adj; \
72837992 } while (0)
72847993
....@@ -7300,10 +8009,10 @@
73008009 physpages << (PAGE_SHIFT - 10),
73018010 codesize >> 10, datasize >> 10, rosize >> 10,
73028011 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7303
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
8012
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73048013 totalcma_pages << (PAGE_SHIFT - 10),
73058014 #ifdef CONFIG_HIGHMEM
7306
- totalhigh_pages << (PAGE_SHIFT - 10),
8015
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73078016 #endif
73088017 str ? ", " : "", str ? str : "");
73098018 }
....@@ -7322,13 +8031,6 @@
73228031 void __init set_dma_reserve(unsigned long new_dma_reserve)
73238032 {
73248033 dma_reserve = new_dma_reserve;
7325
-}
7326
-
7327
-void __init free_area_init(unsigned long *zones_size)
7328
-{
7329
- zero_resv_unavail();
7330
- free_area_init_node(0, zones_size,
7331
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
73328034 }
73338035
73348036 static int page_alloc_cpu_dead(unsigned int cpu)
....@@ -7356,9 +8058,27 @@
73568058 return 0;
73578059 }
73588060
8061
+#ifdef CONFIG_NUMA
8062
+int hashdist = HASHDIST_DEFAULT;
8063
+
8064
+static int __init set_hashdist(char *str)
8065
+{
8066
+ if (!str)
8067
+ return 0;
8068
+ hashdist = simple_strtoul(str, &str, 0);
8069
+ return 1;
8070
+}
8071
+__setup("hashdist=", set_hashdist);
8072
+#endif
8073
+
73598074 void __init page_alloc_init(void)
73608075 {
73618076 int ret;
8077
+
8078
+#ifdef CONFIG_NUMA
8079
+ if (num_node_state(N_MEMORY) == 1)
8080
+ hashdist = 0;
8081
+#endif
73628082
73638083 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
73648084 "mm/page_alloc:dead", NULL,
....@@ -7383,6 +8103,7 @@
73838103 for (i = 0; i < MAX_NR_ZONES; i++) {
73848104 struct zone *zone = pgdat->node_zones + i;
73858105 long max = 0;
8106
+ unsigned long managed_pages = zone_managed_pages(zone);
73868107
73878108 /* Find valid and maximum lowmem_reserve in the zone */
73888109 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7393,8 +8114,8 @@
73938114 /* we treat the high watermark as reserved pages. */
73948115 max += high_wmark_pages(zone);
73958116
7396
- if (max > zone->managed_pages)
7397
- max = zone->managed_pages;
8117
+ if (max > managed_pages)
8118
+ max = managed_pages;
73988119
73998120 pgdat->totalreserve_pages += max;
74008121
....@@ -7413,30 +8134,24 @@
74138134 static void setup_per_zone_lowmem_reserve(void)
74148135 {
74158136 struct pglist_data *pgdat;
7416
- enum zone_type j, idx;
8137
+ enum zone_type i, j;
74178138
74188139 for_each_online_pgdat(pgdat) {
7419
- for (j = 0; j < MAX_NR_ZONES; j++) {
7420
- struct zone *zone = pgdat->node_zones + j;
7421
- unsigned long managed_pages = zone->managed_pages;
8140
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8141
+ struct zone *zone = &pgdat->node_zones[i];
8142
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8143
+ bool clear = !ratio || !zone_managed_pages(zone);
8144
+ unsigned long managed_pages = 0;
74228145
7423
- zone->lowmem_reserve[j] = 0;
8146
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8147
+ struct zone *upper_zone = &pgdat->node_zones[j];
74248148
7425
- idx = j;
7426
- while (idx) {
7427
- struct zone *lower_zone;
8149
+ managed_pages += zone_managed_pages(upper_zone);
74288150
7429
- idx--;
7430
- lower_zone = pgdat->node_zones + idx;
7431
-
7432
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7433
- sysctl_lowmem_reserve_ratio[idx] = 0;
7434
- lower_zone->lowmem_reserve[j] = 0;
7435
- } else {
7436
- lower_zone->lowmem_reserve[j] =
7437
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7438
- }
7439
- managed_pages += lower_zone->managed_pages;
8151
+ if (clear)
8152
+ zone->lowmem_reserve[j] = 0;
8153
+ else
8154
+ zone->lowmem_reserve[j] = managed_pages / ratio;
74408155 }
74418156 }
74428157 }
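The rewritten loop computes, for each zone i, how many pages it keeps in reserve against allocations that could also have been placed in a higher zone: the running sum of managed pages above it divided by sysctl_lowmem_reserve_ratio[i], or zero when the ratio or the zone itself is empty. A standalone model with made-up zone sizes:

#include <stdio.h>

#define NR_ZONES 3	/* e.g. DMA32, NORMAL, MOVABLE -- illustrative only */

int main(void)
{
	unsigned long managed[NR_ZONES] = { 262144, 3932160, 0 };	/* pages */
	unsigned long ratio[NR_ZONES] = { 256, 32, 0 };			/* sysctl ratios */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int i, j;

	for (i = 0; i < NR_ZONES - 1; i++) {
		int clear = !ratio[i] || !managed[i];
		unsigned long upper = 0;

		for (j = i + 1; j < NR_ZONES; j++) {
			upper += managed[j];	/* pages sitting "above" zone i */
			reserve[i][j] = clear ? 0 : upper / ratio[i];
		}
	}

	for (i = 0; i < NR_ZONES - 1; i++)
		for (j = i + 1; j < NR_ZONES; j++)
			printf("zone %d reserves %lu pages against zone %d allocations\n",
			       i, reserve[i][j], j);
	return 0;
}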
....@@ -7456,18 +8171,17 @@
74568171 /* Calculate total number of !ZONE_HIGHMEM pages */
74578172 for_each_zone(zone) {
74588173 if (!is_highmem(zone))
7459
- lowmem_pages += zone->managed_pages;
8174
+ lowmem_pages += zone_managed_pages(zone);
74608175 }
74618176
74628177 for_each_zone(zone) {
7463
- u64 min, low;
8178
+ u64 tmp, low;
74648179
74658180 spin_lock_irqsave(&zone->lock, flags);
7466
- min = (u64)pages_min * zone->managed_pages;
7467
- do_div(min, lowmem_pages);
7468
- low = (u64)pages_low * zone->managed_pages;
7469
- do_div(low, vm_total_pages);
7470
-
8181
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8182
+ do_div(tmp, lowmem_pages);
8183
+ low = (u64)pages_low * zone_managed_pages(zone);
8184
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
74718185 if (is_highmem(zone)) {
74728186 /*
74738187 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7475,20 +8189,20 @@
74758189 * value here.
74768190 *
74778191 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7478
- * deltas control asynch page reclaim, and so should
8192
+ * deltas control async page reclaim, and so should
74798193 * not be capped for highmem.
74808194 */
74818195 unsigned long min_pages;
74828196
7483
- min_pages = zone->managed_pages / 1024;
8197
+ min_pages = zone_managed_pages(zone) / 1024;
74848198 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7485
- zone->watermark[WMARK_MIN] = min_pages;
8199
+ zone->_watermark[WMARK_MIN] = min_pages;
74868200 } else {
74878201 /*
74888202 * If it's a lowmem zone, reserve a number of pages
74898203 * proportionate to the zone's size.
74908204 */
7491
- zone->watermark[WMARK_MIN] = min;
8205
+ zone->_watermark[WMARK_MIN] = tmp;
74928206 }
74938207
74948208 /*
....@@ -7496,14 +8210,13 @@
74968210 * scale factor in proportion to available memory, but
74978211 * ensure a minimum size on small systems.
74988212 */
7499
- min = max_t(u64, min >> 2,
7500
- mult_frac(zone->managed_pages,
8213
+ tmp = max_t(u64, tmp >> 2,
8214
+ mult_frac(zone_managed_pages(zone),
75018215 watermark_scale_factor, 10000));
75028216
7503
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7504
- low + min;
7505
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7506
- low + min * 2;
8217
+ zone->watermark_boost = 0;
8218
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8219
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75078220
75088221 spin_unlock_irqrestore(&zone->lock, flags);
75098222 }
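Pulling the arithmetic above out of the locking and highmem special cases: the zone gets its proportional share of pages_min as WMARK_MIN, and the low and high marks sit one and two "gaps" above it, where the gap is the larger of min/4 and managed_pages * watermark_scale_factor / 10000. A standalone model (the pages_low term is proportioned the same way here for simplicity; the kernel divides it by the free pages reachable with GFP_HIGHUSER_MOVABLE):

#include <stdio.h>

int main(void)
{
	unsigned long managed = 1UL << 20;	/* ~4 GiB of 4 KiB pages, example */
	unsigned long lowmem_pages = managed;	/* single-zone example */
	unsigned long pages_min = 16384;	/* from min_free_kbytes */
	unsigned long pages_low = 0;		/* from extra_free_kbytes */
	unsigned long watermark_scale_factor = 10;
	unsigned long tmp, low, wmark_min, wmark_low, wmark_high;

	tmp = pages_min * managed / lowmem_pages;	/* this zone's share */
	low = pages_low * managed / lowmem_pages;
	wmark_min = tmp;

	tmp >>= 2;					/* floor of min/4 ... */
	if (managed * watermark_scale_factor / 10000 > tmp)
		tmp = managed * watermark_scale_factor / 10000;	/* ... or the scale factor */

	wmark_low = wmark_min + low + tmp;
	wmark_high = wmark_min + low + tmp * 2;

	printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
	return 0;
}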
....@@ -7532,7 +8245,7 @@
75328245 * Initialise min_free_kbytes.
75338246 *
75348247 * For small machines we want it small (128k min). For large machines
7535
- * we want it large (64MB max). But it is not linear, because network
8248
+ * we want it large (256MB max). But it is not linear, because network
75368249 * bandwidth does not increase linearly with machine size. We use
75378250 *
75388251 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
....@@ -7564,8 +8277,8 @@
75648277 min_free_kbytes = new_min_free_kbytes;
75658278 if (min_free_kbytes < 128)
75668279 min_free_kbytes = 128;
7567
- if (min_free_kbytes > 65536)
7568
- min_free_kbytes = 65536;
8280
+ if (min_free_kbytes > 262144)
8281
+ min_free_kbytes = 262144;
75698282 } else {
75708283 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
75718284 new_min_free_kbytes, user_min_free_kbytes);
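Worked numbers for the sizing rule above, including the cap this patch raises from 64 MiB to 256 MiB (262144 kB): min_free_kbytes = sqrt(lowmem_kbytes * 16), i.e. 4 * sqrt(lowmem_kbytes), clamped to [128, 262144].

#include <stdio.h>
#include <math.h>

int main(void)
{
	/* lowmem in KiB: 16 MiB, 1 GiB, 64 GiB, 16 TiB */
	unsigned long long lowmem_kbytes[] = { 16384ULL, 1048576ULL,
					       67108864ULL, 17179869184ULL };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long long min_free =
			(unsigned long long)sqrt((double)lowmem_kbytes[i] * 16.0);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 262144)		/* cap raised by this patch */
			min_free = 262144;
		printf("%12llu KiB lowmem -> min_free_kbytes ~= %llu\n",
		       lowmem_kbytes[i], min_free);
	}
	return 0;
}

Only the 16 TiB row actually hits the new 256 MiB cap; anything at or below roughly 4 TiB of lowmem still follows the square-root curve.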
....@@ -7591,7 +8304,7 @@
75918304 * or extra_free_kbytes changes.
75928305 */
75938306 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7594
- void __user *buffer, size_t *length, loff_t *ppos)
8307
+ void *buffer, size_t *length, loff_t *ppos)
75958308 {
75968309 int rc;
75978310
....@@ -7607,7 +8320,7 @@
76078320 }
76088321
76098322 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7610
- void __user *buffer, size_t *length, loff_t *ppos)
8323
+ void *buffer, size_t *length, loff_t *ppos)
76118324 {
76128325 int rc;
76138326
....@@ -7631,13 +8344,13 @@
76318344 pgdat->min_unmapped_pages = 0;
76328345
76338346 for_each_zone(zone)
7634
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7635
- sysctl_min_unmapped_ratio) / 100;
8347
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8348
+ sysctl_min_unmapped_ratio) / 100;
76368349 }
76378350
76388351
76398352 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7640
- void __user *buffer, size_t *length, loff_t *ppos)
8353
+ void *buffer, size_t *length, loff_t *ppos)
76418354 {
76428355 int rc;
76438356
....@@ -7659,12 +8372,12 @@
76598372 pgdat->min_slab_pages = 0;
76608373
76618374 for_each_zone(zone)
7662
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7663
- sysctl_min_slab_ratio) / 100;
8375
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8376
+ sysctl_min_slab_ratio) / 100;
76648377 }
76658378
76668379 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7667
- void __user *buffer, size_t *length, loff_t *ppos)
8380
+ void *buffer, size_t *length, loff_t *ppos)
76688381 {
76698382 int rc;
76708383
....@@ -7688,11 +8401,28 @@
76888401 * if in function of the boot time zone sizes.
76898402 */
76908403 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7691
- void __user *buffer, size_t *length, loff_t *ppos)
8404
+ void *buffer, size_t *length, loff_t *ppos)
76928405 {
8406
+ int i;
8407
+
76938408 proc_dointvec_minmax(table, write, buffer, length, ppos);
8409
+
8410
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8411
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8412
+ sysctl_lowmem_reserve_ratio[i] = 0;
8413
+ }
8414
+
76948415 setup_per_zone_lowmem_reserve();
76958416 return 0;
8417
+}
8418
+
8419
+static void __zone_pcp_update(struct zone *zone)
8420
+{
8421
+ unsigned int cpu;
8422
+
8423
+ for_each_possible_cpu(cpu)
8424
+ pageset_set_high_and_batch(zone,
8425
+ per_cpu_ptr(zone->pageset, cpu));
76968426 }
76978427
76988428 /*
....@@ -7701,7 +8431,7 @@
77018431 * pagelist can have before it gets flushed back to buddy allocator.
77028432 */
77038433 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7704
- void __user *buffer, size_t *length, loff_t *ppos)
8434
+ void *buffer, size_t *length, loff_t *ppos)
77058435 {
77068436 struct zone *zone;
77078437 int old_percpu_pagelist_fraction;
....@@ -7726,30 +8456,12 @@
77268456 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
77278457 goto out;
77288458
7729
- for_each_populated_zone(zone) {
7730
- unsigned int cpu;
7731
-
7732
- for_each_possible_cpu(cpu)
7733
- pageset_set_high_and_batch(zone,
7734
- per_cpu_ptr(zone->pageset, cpu));
7735
- }
8459
+ for_each_populated_zone(zone)
8460
+ __zone_pcp_update(zone);
77368461 out:
77378462 mutex_unlock(&pcp_batch_high_lock);
77388463 return ret;
77398464 }
7740
-
7741
-#ifdef CONFIG_NUMA
7742
-int hashdist = HASHDIST_DEFAULT;
7743
-
7744
-static int __init set_hashdist(char *str)
7745
-{
7746
- if (!str)
7747
- return 0;
7748
- hashdist = simple_strtoul(str, &str, 0);
7749
- return 1;
7750
-}
7751
-__setup("hashdist=", set_hashdist);
7752
-#endif
77538465
77548466 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
77558467 /*
....@@ -7797,6 +8509,7 @@
77978509 unsigned long log2qty, size;
77988510 void *table = NULL;
77998511 gfp_t gfp_flags;
8512
+ bool virt;
78008513
78018514 /* allow the kernel cmdline to have a say */
78028515 if (!numentries) {
....@@ -7853,32 +8566,34 @@
78538566
78548567 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
78558568 do {
8569
+ virt = false;
78568570 size = bucketsize << log2qty;
78578571 if (flags & HASH_EARLY) {
78588572 if (flags & HASH_ZERO)
7859
- table = memblock_virt_alloc_nopanic(size, 0);
8573
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
78608574 else
7861
- table = memblock_virt_alloc_raw(size, 0);
7862
- } else if (hashdist) {
7863
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8575
+ table = memblock_alloc_raw(size,
8576
+ SMP_CACHE_BYTES);
8577
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
8578
+ table = __vmalloc(size, gfp_flags);
8579
+ virt = true;
78648580 } else {
78658581 /*
78668582 * If bucketsize is not a power-of-two, we may free
78678583 * some pages at the end of hash table which
78688584 * alloc_pages_exact() automatically does
78698585 */
7870
- if (get_order(size) < MAX_ORDER) {
7871
- table = alloc_pages_exact(size, gfp_flags);
7872
- kmemleak_alloc(table, size, 1, gfp_flags);
7873
- }
8586
+ table = alloc_pages_exact(size, gfp_flags);
8587
+ kmemleak_alloc(table, size, 1, gfp_flags);
78748588 }
78758589 } while (!table && size > PAGE_SIZE && --log2qty);
78768590
78778591 if (!table)
78788592 panic("Failed to allocate %s hash table\n", tablename);
78798593
7880
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7881
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8594
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8595
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8596
+ virt ? "vmalloc" : "linear");
78828597
78838598 if (_hash_shift)
78848599 *_hash_shift = log2qty;
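The sizing loop above computes size = bucketsize << log2qty and then picks between a boot-time memblock allocation, vmalloc() (now also whenever the table would exceed a MAX_ORDER buddy block), and alloc_pages_exact(). A standalone model of that decision (PAGE_SIZE, MAX_ORDER and the bucket size are assumed values):

#include <stdio.h>

static int order_for_size(unsigned long size, unsigned long page_size)
{
	unsigned long pages = (size + page_size - 1) / page_size;
	int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	int max_order = 11;			/* assumed MAX_ORDER */
	unsigned long bucketsize = 8;		/* one pointer per bucket */
	unsigned long log2qty = 20;		/* 1M buckets requested */
	int hashdist = 0;			/* "hashdist=" boot parameter */

	unsigned long size = bucketsize << log2qty;	/* 8 MiB table */
	int order = order_for_size(size, page_size);
	int use_vmalloc = (order >= max_order) || hashdist;

	printf("%lu entries, %lu bytes, order %d -> %s\n",
	       1UL << log2qty, size, order,
	       use_vmalloc ? "vmalloc" : "linear (alloc_pages_exact)");
	return 0;
}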
....@@ -7890,47 +8605,50 @@
78908605
78918606 /*
78928607 * This function checks whether pageblock includes unmovable pages or not.
7893
- * If @count is not zero, it is okay to include less @count unmovable pages
78948608 *
78958609 * PageLRU check without isolation or lru_lock could race so that
78968610 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
78978611 * check without lock_page also may miss some movable non-lru pages at
78988612 * race condition. So you can't expect this function should be exact.
8613
+ *
8614
+ * Returns a page without holding a reference. If the caller wants to
8615
+ * dereference that page (e.g., dumping), it has to make sure that it
8616
+ * cannot get removed (e.g., via memory unplug) concurrently.
8617
+ *
78998618 */
7900
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7901
- int migratetype,
7902
- bool skip_hwpoisoned_pages)
8619
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8620
+ int migratetype, int flags)
79038621 {
7904
- unsigned long pfn, iter, found;
8622
+ unsigned long iter = 0;
8623
+ unsigned long pfn = page_to_pfn(page);
8624
+ unsigned long offset = pfn % pageblock_nr_pages;
79058625
7906
- /*
7907
- * TODO we could make this much more efficient by not checking every
7908
- * page in the range if we know all of them are in MOVABLE_ZONE and
7909
- * that the movable zone guarantees that pages are migratable but
7910
- * the later is not the case right now unfortunatelly. E.g. movablecore
7911
- * can still lead to having bootmem allocations in zone_movable.
7912
- */
8626
+ if (is_migrate_cma_page(page)) {
8627
+ /*
8628
+ * CMA allocations (alloc_contig_range) really need to mark
8629
+ * isolate CMA pageblocks even when they are not movable in fact
8630
+ * so consider them movable here.
8631
+ */
8632
+ if (is_migrate_cma(migratetype))
8633
+ return NULL;
79138634
7914
- /*
7915
- * CMA allocations (alloc_contig_range) really need to mark isolate
7916
- * CMA pageblocks even when they are not movable in fact so consider
7917
- * them movable here.
7918
- */
7919
- if (is_migrate_cma(migratetype) &&
7920
- is_migrate_cma(get_pageblock_migratetype(page)))
7921
- return false;
8635
+ return page;
8636
+ }
79228637
7923
- pfn = page_to_pfn(page);
7924
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7925
- unsigned long check = pfn + iter;
7926
-
7927
- if (!pfn_valid_within(check))
8638
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8639
+ if (!pfn_valid_within(pfn + iter))
79288640 continue;
79298641
7930
- page = pfn_to_page(check);
8642
+ page = pfn_to_page(pfn + iter);
79318643
8644
+ /*
8645
+ * Both, bootmem allocations and memory holes are marked
8646
+ * PG_reserved and are unmovable. We can even have unmovable
8647
+ * allocations inside ZONE_MOVABLE, for example when
8648
+ * specifying "movablecore".
8649
+ */
79328650 if (PageReserved(page))
7933
- goto unmovable;
8651
+ return page;
79348652
79358653 /*
79368654 * If the zone is movable and we have ruled out all reserved
....@@ -7942,17 +8660,22 @@
79428660
79438661 /*
79448662 * Hugepages are not in LRU lists, but they're movable.
7945
- * We need not scan over tail pages bacause we don't
8663
+ * THPs are on the LRU, but need to be counted as #small pages.
8664
+ * We need not scan over tail pages because we don't
79468665 * handle each tail page individually in migration.
79478666 */
7948
- if (PageHuge(page)) {
8667
+ if (PageHuge(page) || PageTransCompound(page)) {
79498668 struct page *head = compound_head(page);
79508669 unsigned int skip_pages;
79518670
7952
- if (!hugepage_migration_supported(page_hstate(head)))
7953
- goto unmovable;
8671
+ if (PageHuge(page)) {
8672
+ if (!hugepage_migration_supported(page_hstate(head)))
8673
+ return page;
8674
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8675
+ return page;
8676
+ }
79548677
7955
- skip_pages = (1 << compound_order(head)) - (page - head);
8678
+ skip_pages = compound_nr(head) - (page - head);
79568679 iter += skip_pages - 1;
79578680 continue;
79588681 }
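A quick worked example of the tail-page skip above, assuming the scan lands 100 pages into an order-9 (512-page) THP:

#include <stdio.h>

int main(void)
{
	unsigned long compound_pages = 1UL << 9;	/* assumed order-9 THP */
	unsigned long offset = 100;			/* page - head at the current pfn */
	unsigned long skip_pages = compound_pages - offset;

	/* iter += skip_pages - 1, plus the loop's own iter++, lands just past the THP */
	printf("skip %lu pages; next pfn checked is head + %lu\n",
	       skip_pages, offset + skip_pages);
	return 0;
}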
....@@ -7965,7 +8688,7 @@
79658688 */
79668689 if (!page_ref_count(page)) {
79678690 if (PageBuddy(page))
7968
- iter += (1 << page_order(page)) - 1;
8691
+ iter += (1 << buddy_order(page)) - 1;
79698692 continue;
79708693 }
79718694
....@@ -7973,61 +8696,100 @@
79738696 * The HWPoisoned page may be not in buddy system, and
79748697 * page_count() is not 0.
79758698 */
7976
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8699
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
79778700 continue;
79788701
7979
- if (__PageMovable(page))
8702
+ /*
8703
+ * We treat all PageOffline() pages as movable when offlining
8704
+ * to give drivers a chance to decrement their reference count
8705
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8706
+ * can be offlined as there are no direct references anymore.
8707
+ * For actually unmovable PageOffline() where the driver does
8708
+ * not support this, we will fail later when trying to actually
8709
+ * move these pages that still have a reference count > 0.
8710
+ * (false negatives in this function only)
8711
+ */
8712
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
79808713 continue;
79818714
7982
- if (!PageLRU(page))
7983
- found++;
8715
+ if (__PageMovable(page) || PageLRU(page))
8716
+ continue;
8717
+
79848718 /*
79858719 * If there are RECLAIMABLE pages, we need to check
79868720 * it. But now, memory offline itself doesn't call
79878721 * shrink_node_slabs() and it still to be fixed.
79888722 */
7989
- /*
7990
- * If the page is not RAM, page_count()should be 0.
7991
- * we don't need more check. This is an _used_ not-movable page.
7992
- *
7993
- * The problematic thing here is PG_reserved pages. PG_reserved
7994
- * is set to both of a memory hole page and a _used_ kernel
7995
- * page at boot.
7996
- */
7997
- if (found > count)
7998
- goto unmovable;
8723
+ return page;
79998724 }
8000
- return false;
8001
-unmovable:
8002
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8003
- return true;
8725
+ return NULL;
80048726 }
80058727
8006
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8007
-
8728
+#ifdef CONFIG_CONTIG_ALLOC
80088729 static unsigned long pfn_max_align_down(unsigned long pfn)
80098730 {
80108731 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80118732 pageblock_nr_pages) - 1);
80128733 }
80138734
8014
-static unsigned long pfn_max_align_up(unsigned long pfn)
8735
+unsigned long pfn_max_align_up(unsigned long pfn)
80158736 {
80168737 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
80178738 pageblock_nr_pages));
80188739 }
80198740
8741
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8742
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8743
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8744
+static void alloc_contig_dump_pages(struct list_head *page_list)
8745
+{
8746
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8747
+
8748
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8749
+ struct page *page;
8750
+ unsigned long nr_skip = 0;
8751
+ unsigned long nr_pages = 0;
8752
+
8753
+ dump_stack();
8754
+ list_for_each_entry(page, page_list, lru) {
8755
+ nr_pages++;
8756
+ /* The page will be freed by putback_movable_pages soon */
8757
+ if (page_count(page) == 1) {
8758
+ nr_skip++;
8759
+ continue;
8760
+ }
8761
+ dump_page(page, "migration failure");
8762
+ }
8763
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8764
+ }
8765
+}
8766
+#else
8767
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8768
+{
8769
+}
8770
+#endif
8771
+
80208772 /* [start, end) must belong to a single zone. */
80218773 static int __alloc_contig_migrate_range(struct compact_control *cc,
8022
- unsigned long start, unsigned long end)
8774
+ unsigned long start, unsigned long end,
8775
+ struct acr_info *info)
80238776 {
80248777 /* This function is based on compact_zone() from compaction.c. */
8025
- unsigned long nr_reclaimed;
8778
+ unsigned int nr_reclaimed;
80268779 unsigned long pfn = start;
80278780 unsigned int tries = 0;
8781
+ unsigned int max_tries = 5;
80288782 int ret = 0;
8783
+ struct page *page;
8784
+ struct migration_target_control mtc = {
8785
+ .nid = zone_to_nid(cc->zone),
8786
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8787
+ };
80298788
8030
- migrate_prep();
8789
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8790
+ max_tries = 1;
8791
+
8792
+ lru_cache_disable();
80318793
80328794 while (pfn < end || !list_empty(&cc->migratepages)) {
80338795 if (fatal_signal_pending(current)) {
....@@ -8043,20 +8805,39 @@
80438805 break;
80448806 }
80458807 tries = 0;
8046
- } else if (++tries == 5) {
8808
+ } else if (++tries == max_tries) {
80478809 ret = ret < 0 ? ret : -EBUSY;
80488810 break;
80498811 }
80508812
80518813 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
80528814 &cc->migratepages);
8815
+ info->nr_reclaimed += nr_reclaimed;
80538816 cc->nr_migratepages -= nr_reclaimed;
80548817
8055
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8056
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8818
+ list_for_each_entry(page, &cc->migratepages, lru)
8819
+ info->nr_mapped += page_mapcount(page);
8820
+
8821
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8822
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8823
+ if (!ret)
8824
+ info->nr_migrated += cc->nr_migratepages;
80578825 }
8826
+
8827
+ lru_cache_enable();
80588828 if (ret < 0) {
8829
+ if (ret == -EBUSY) {
8830
+ alloc_contig_dump_pages(&cc->migratepages);
8831
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8832
+ }
8833
+
8834
+ if (!list_empty(&cc->migratepages)) {
8835
+ page = list_first_entry(&cc->migratepages, struct page , lru);
8836
+ info->failed_pfn = page_to_pfn(page);
8837
+ }
8838
+
80598839 putback_movable_pages(&cc->migratepages);
8840
+ info->err |= ACR_ERR_MIGRATE;
80608841 return ret;
80618842 }
80628843 return 0;
....@@ -8079,25 +8860,28 @@
80798860 * pageblocks in the range. Once isolated, the pageblocks should not
80808861 * be modified by others.
80818862 *
8082
- * Returns zero on success or negative error code. On success all
8863
+ * Return: zero on success or negative error code. On success all
80838864 * pages which PFN is in [start, end) are allocated for the caller and
80848865 * need to be freed with free_contig_range().
80858866 */
80868867 int alloc_contig_range(unsigned long start, unsigned long end,
8087
- unsigned migratetype, gfp_t gfp_mask)
8868
+ unsigned migratetype, gfp_t gfp_mask,
8869
+ struct acr_info *info)
80888870 {
80898871 unsigned long outer_start, outer_end;
80908872 unsigned int order;
80918873 int ret = 0;
8874
+ bool skip_drain_all_pages = false;
80928875
80938876 struct compact_control cc = {
80948877 .nr_migratepages = 0,
80958878 .order = -1,
80968879 .zone = page_zone(pfn_to_page(start)),
8097
- .mode = MIGRATE_SYNC,
8880
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
80988881 .ignore_skip_hint = true,
80998882 .no_set_skip_hint = true,
81008883 .gfp_mask = current_gfp_context(gfp_mask),
8884
+ .alloc_contig = true,
81018885 };
81028886 INIT_LIST_HEAD(&cc.migratepages);
81038887
....@@ -8126,14 +8910,18 @@
81268910 */
81278911
81288912 ret = start_isolate_page_range(pfn_max_align_down(start),
8129
- pfn_max_align_up(end), migratetype,
8130
- false);
8131
- if (ret)
8913
+ pfn_max_align_up(end), migratetype, 0,
8914
+ &info->failed_pfn);
8915
+ if (ret) {
8916
+ info->err |= ACR_ERR_ISOLATE;
81328917 return ret;
8918
+ }
81338919
8134
-#ifdef CONFIG_CMA
8135
- cc.zone->cma_alloc = 1;
8136
-#endif
8920
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8921
+ &skip_drain_all_pages);
8922
+ if (!skip_drain_all_pages)
8923
+ drain_all_pages(cc.zone);
8924
+
81378925 /*
81388926 * In case of -EBUSY, we'd like to know which page causes problem.
81398927 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8144,8 +8932,8 @@
81448932 * allocated. So, if we fall through be sure to clear ret so that
81458933 * -EBUSY is not accidentally used or returned to caller.
81468934 */
8147
- ret = __alloc_contig_migrate_range(&cc, start, end);
8148
- if (ret && ret != -EBUSY)
8935
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8936
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
81498937 goto done;
81508938 ret =0;
81518939
....@@ -8166,9 +8954,6 @@
81668954 * isolated thus they won't get removed from buddy.
81678955 */
81688956
8169
- lru_add_drain_all();
8170
- drain_all_pages(cc.zone);
8171
-
81728957 order = 0;
81738958 outer_start = start;
81748959 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8180,7 +8965,7 @@
81808965 }
81818966
81828967 if (outer_start != start) {
8183
- order = page_order(pfn_to_page(outer_start));
8968
+ order = buddy_order(pfn_to_page(outer_start));
81848969
81858970 /*
81868971 * outer_start page could be small order buddy page and
....@@ -8193,10 +8978,11 @@
81938978 }
81948979
81958980 /* Make sure the range is really isolated. */
8196
- if (test_pages_isolated(outer_start, end, false)) {
8981
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
81978982 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
81988983 __func__, outer_start, end);
81998984 ret = -EBUSY;
8985
+ info->err |= ACR_ERR_TEST;
82008986 goto done;
82018987 }
82028988
....@@ -8216,13 +9002,114 @@
82169002 done:
82179003 undo_isolate_page_range(pfn_max_align_down(start),
82189004 pfn_max_align_up(end), migratetype);
8219
-#ifdef CONFIG_CMA
8220
- cc.zone->cma_alloc = 0;
8221
-#endif
82229005 return ret;
82239006 }
9007
+EXPORT_SYMBOL(alloc_contig_range);
82249008
8225
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
9009
+static int __alloc_contig_pages(unsigned long start_pfn,
9010
+ unsigned long nr_pages, gfp_t gfp_mask)
9011
+{
9012
+ struct acr_info dummy;
9013
+ unsigned long end_pfn = start_pfn + nr_pages;
9014
+
9015
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
9016
+ gfp_mask, &dummy);
9017
+}
9018
+
9019
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
9020
+ unsigned long nr_pages)
9021
+{
9022
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9023
+ struct page *page;
9024
+
9025
+ for (i = start_pfn; i < end_pfn; i++) {
9026
+ page = pfn_to_online_page(i);
9027
+ if (!page)
9028
+ return false;
9029
+
9030
+ if (page_zone(page) != z)
9031
+ return false;
9032
+
9033
+ if (PageReserved(page))
9034
+ return false;
9035
+
9036
+ if (page_count(page) > 0)
9037
+ return false;
9038
+
9039
+ if (PageHuge(page))
9040
+ return false;
9041
+ }
9042
+ return true;
9043
+}
9044
+
9045
+static bool zone_spans_last_pfn(const struct zone *zone,
9046
+ unsigned long start_pfn, unsigned long nr_pages)
9047
+{
9048
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9049
+
9050
+ return zone_spans_pfn(zone, last_pfn);
9051
+}
9052
+
9053
+/**
9054
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9055
+ * @nr_pages: Number of contiguous pages to allocate
9056
+ * @gfp_mask: GFP mask to limit search and used during compaction
9057
+ * @nid: Target node
9058
+ * @nodemask: Mask for other possible nodes
9059
+ *
9060
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9061
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9062
+ * tried for allocation with alloc_contig_range(). This routine is intended
9063
+ * for allocation requests which cannot be fulfilled with the buddy allocator.
9064
+ *
9065
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9066
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9067
+ * (e.g. 1GB request would be aligned to 1GB).
9068
+ *
9069
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9070
+ * __free_page() on each allocated page.
9071
+ *
9072
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9073
+ */
9074
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9075
+ int nid, nodemask_t *nodemask)
9076
+{
9077
+ unsigned long ret, pfn, flags;
9078
+ struct zonelist *zonelist;
9079
+ struct zone *zone;
9080
+ struct zoneref *z;
9081
+
9082
+ zonelist = node_zonelist(nid, gfp_mask);
9083
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9084
+ gfp_zone(gfp_mask), nodemask) {
9085
+ spin_lock_irqsave(&zone->lock, flags);
9086
+
9087
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9088
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9089
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9090
+ /*
9091
+ * We release the zone lock here because
9092
+ * alloc_contig_range() will also lock the zone
9093
+ * at some point. If there's an allocation
9094
+ * spinning on this lock, it may win the race
9095
+ * and cause alloc_contig_range() to fail...
9096
+ */
9097
+ spin_unlock_irqrestore(&zone->lock, flags);
9098
+ ret = __alloc_contig_pages(pfn, nr_pages,
9099
+ gfp_mask);
9100
+ if (!ret)
9101
+ return pfn_to_page(pfn);
9102
+ spin_lock_irqsave(&zone->lock, flags);
9103
+ }
9104
+ pfn += nr_pages;
9105
+ }
9106
+ spin_unlock_irqrestore(&zone->lock, flags);
9107
+ }
9108
+ return NULL;
9109
+}
9110
+#endif /* CONFIG_CONTIG_ALLOC */
9111
+
9112
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
82269113 {
82279114 unsigned int count = 0;
82289115
....@@ -8234,7 +9121,7 @@
82349121 }
82359122 WARN(count != 0, "%d pages are still in use!\n", count);
82369123 }
8237
-#endif
9124
+EXPORT_SYMBOL(free_contig_range);
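With free_contig_range() now exported alongside alloc_contig_range(), here is a minimal kernel-style usage sketch of the alloc_contig_pages() helper added above; the caller context, GFP mask and error handling are illustrative assumptions, not taken from this patch:

/* Sketch: grab 1024 physically contiguous pages, use them, give them back. */
static int example_grab_contig(void)
{
	unsigned long nr = 1024;
	struct page *pages;

	pages = alloc_contig_pages(nr, GFP_KERNEL, NUMA_NO_NODE, NULL);
	if (!pages)
		return -ENOMEM;

	/* ... use the physically contiguous range starting at page_to_pfn(pages) ... */

	free_contig_range(page_to_pfn(pages), nr);
	return 0;
}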
82389125
82399126 /*
82409127 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8242,11 +9129,8 @@
82429129 */
82439130 void __meminit zone_pcp_update(struct zone *zone)
82449131 {
8245
- unsigned cpu;
82469132 mutex_lock(&pcp_batch_high_lock);
8247
- for_each_possible_cpu(cpu)
8248
- pageset_set_high_and_batch(zone,
8249
- per_cpu_ptr(zone->pageset, cpu));
9133
+ __zone_pcp_update(zone);
82509134 mutex_unlock(&pcp_batch_high_lock);
82519135 }
82529136
....@@ -8257,7 +9141,7 @@
82579141 struct per_cpu_pageset *pset;
82589142
82599143 /* avoid races with drain_pages() */
8260
- local_irq_save(flags);
9144
+ local_lock_irqsave(&pa_lock.l, flags);
82619145 if (zone->pageset != &boot_pageset) {
82629146 for_each_online_cpu(cpu) {
82639147 pset = per_cpu_ptr(zone->pageset, cpu);
....@@ -8266,37 +9150,26 @@
82669150 free_percpu(zone->pageset);
82679151 zone->pageset = &boot_pageset;
82689152 }
8269
- local_irq_restore(flags);
9153
+ local_unlock_irqrestore(&pa_lock.l, flags);
82709154 }
82719155
82729156 #ifdef CONFIG_MEMORY_HOTREMOVE
82739157 /*
8274
- * All pages in the range must be in a single zone and isolated
8275
- * before calling this.
9158
+ * All pages in the range must be in a single zone, must not contain holes,
9159
+ * must span full sections, and must be isolated before calling this function.
82769160 */
8277
-void
8278
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9161
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
82799162 {
9163
+ unsigned long pfn = start_pfn;
82809164 struct page *page;
82819165 struct zone *zone;
8282
- unsigned int order, i;
8283
- unsigned long pfn;
9166
+ unsigned int order;
82849167 unsigned long flags;
8285
- /* find the first valid pfn */
8286
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8287
- if (pfn_valid(pfn))
8288
- break;
8289
- if (pfn == end_pfn)
8290
- return;
9168
+
82919169 offline_mem_sections(pfn, end_pfn);
82929170 zone = page_zone(pfn_to_page(pfn));
82939171 spin_lock_irqsave(&zone->lock, flags);
8294
- pfn = start_pfn;
82959172 while (pfn < end_pfn) {
8296
- if (!pfn_valid(pfn)) {
8297
- pfn++;
8298
- continue;
8299
- }
83009173 page = pfn_to_page(pfn);
83019174 /*
83029175 * The HWPoisoned page may be not in buddy system, and
....@@ -8304,22 +9177,23 @@
83049177 */
83059178 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83069179 pfn++;
8307
- SetPageReserved(page);
9180
+ continue;
9181
+ }
9182
+ /*
9183
+ * At this point all remaining PageOffline() pages have a
9184
+ * reference count of 0 and can simply be skipped.
9185
+ */
9186
+ if (PageOffline(page)) {
9187
+ BUG_ON(page_count(page));
9188
+ BUG_ON(PageBuddy(page));
9189
+ pfn++;
83089190 continue;
83099191 }
83109192
83119193 BUG_ON(page_count(page));
83129194 BUG_ON(!PageBuddy(page));
8313
- order = page_order(page);
8314
-#ifdef CONFIG_DEBUG_VM
8315
- pr_info("remove from free list %lx %d %lx\n",
8316
- pfn, 1 << order, end_pfn);
8317
-#endif
8318
- list_del(&page->lru);
8319
- rmv_page_order(page);
8320
- zone->free_area[order].nr_free--;
8321
- for (i = 0; i < (1 << order); i++)
8322
- SetPageReserved((page+i));
9195
+ order = buddy_order(page);
9196
+ del_page_from_free_list(page, zone, order);
83239197 pfn += (1 << order);
83249198 }
83259199 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8337,7 +9211,7 @@
83379211 for (order = 0; order < MAX_ORDER; order++) {
83389212 struct page *page_head = page - (pfn & ((1 << order) - 1));
83399213
8340
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9214
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
83419215 break;
83429216 }
83439217 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8347,30 +9221,87 @@
83479221
83489222 #ifdef CONFIG_MEMORY_FAILURE
83499223 /*
8350
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8351
- * test is performed under the zone lock to prevent a race against page
8352
- * allocation.
9224
+ * Break down a higher-order page in sub-pages, and keep our target out of
9225
+ * buddy allocator.
83539226 */
8354
-bool set_hwpoison_free_buddy_page(struct page *page)
9227
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9228
+ struct page *target, int low, int high,
9229
+ int migratetype)
9230
+{
9231
+ unsigned long size = 1 << high;
9232
+ struct page *current_buddy, *next_page;
9233
+
9234
+ while (high > low) {
9235
+ high--;
9236
+ size >>= 1;
9237
+
9238
+ if (target >= &page[size]) {
9239
+ next_page = page + size;
9240
+ current_buddy = page;
9241
+ } else {
9242
+ next_page = page;
9243
+ current_buddy = page + size;
9244
+ }
9245
+
9246
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9247
+ continue;
9248
+
9249
+ if (current_buddy != target) {
9250
+ add_to_free_list(current_buddy, zone, high, migratetype);
9251
+ set_buddy_order(current_buddy, high);
9252
+ page = next_page;
9253
+ }
9254
+ }
9255
+}
9256
+
9257
+/*
9258
+ * Take a page that will be marked as poisoned off the buddy allocator.
9259
+ */
9260
+bool take_page_off_buddy(struct page *page)
83559261 {
83569262 struct zone *zone = page_zone(page);
83579263 unsigned long pfn = page_to_pfn(page);
83589264 unsigned long flags;
83599265 unsigned int order;
8360
- bool hwpoisoned = false;
9266
+ bool ret = false;
83619267
83629268 spin_lock_irqsave(&zone->lock, flags);
83639269 for (order = 0; order < MAX_ORDER; order++) {
83649270 struct page *page_head = page - (pfn & ((1 << order) - 1));
9271
+ int page_order = buddy_order(page_head);
83659272
8366
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8367
- if (!TestSetPageHWPoison(page))
8368
- hwpoisoned = true;
9273
+ if (PageBuddy(page_head) && page_order >= order) {
9274
+ unsigned long pfn_head = page_to_pfn(page_head);
9275
+ int migratetype = get_pfnblock_migratetype(page_head,
9276
+ pfn_head);
9277
+
9278
+ del_page_from_free_list(page_head, zone, page_order);
9279
+ break_down_buddy_pages(zone, page_head, page, 0,
9280
+ page_order, migratetype);
9281
+ if (!is_migrate_isolate(migratetype))
9282
+ __mod_zone_freepage_state(zone, -1, migratetype);
9283
+ ret = true;
83699284 break;
83709285 }
9286
+ if (page_count(page_head) > 0)
9287
+ break;
83719288 }
83729289 spin_unlock_irqrestore(&zone->lock, flags);
8373
-
8374
- return hwpoisoned;
9290
+ return ret;
83759291 }
83769292 #endif
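A worked trace of the split performed above, assuming the poisoned target sits at offset 2 inside an order-2 buddy block; the standalone loop below mirrors the effect of break_down_buddy_pages() (halve the block, free the half that does not contain the target, repeat):

#include <stdio.h>

int main(void)
{
	unsigned long base = 0x1000, target = 0x1002;	/* example pfns */
	int high = 2, low = 0;				/* order-2 block down to order 0 */
	unsigned long size = 1UL << high;
	unsigned long page = base;

	while (high > low) {
		high--;
		size >>= 1;
		if (target >= page + size) {
			/* target is in the upper half: free the lower half */
			printf("free [%#lx, %#lx) at order %d\n", page, page + size, high);
			page += size;
		} else {
			/* target is in the lower half: free the upper half */
			printf("free [%#lx, %#lx) at order %d\n", page + size, page + 2 * size, high);
		}
	}
	printf("page %#lx left isolated for poisoning\n", page);
	return 0;
}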
9293
+
9294
+#ifdef CONFIG_ZONE_DMA
9295
+bool has_managed_dma(void)
9296
+{
9297
+ struct pglist_data *pgdat;
9298
+
9299
+ for_each_online_pgdat(pgdat) {
9300
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9301
+
9302
+ if (managed_zone(zone))
9303
+ return true;
9304
+ }
9305
+ return false;
9306
+}
9307
+#endif /* CONFIG_ZONE_DMA */