2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
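This patch threads a new set of free-page-internal flags (fpi_t, defined in the first hunk below) through the page freeing path: __free_pages_ok(), free_one_page() and __free_one_page() all gain an fpi_flags argument, and callers such as __free_pages_core() pass FPI_TO_TAIL | FPI_SKIP_KASAN_POISON. What follows is a minimal, standalone sketch of the flag pattern only; it is not kernel code. demo_free_one_page(), the stdio harness and the empty __bitwise/__force stubs are illustrative assumptions so the snippet builds as plain C.

/*
 * Standalone sketch of the fpi_t bitmask pattern introduced by this patch.
 * NOTE: illustrative only. __bitwise/__force are sparse annotations and are
 * stubbed out here; demo_free_one_page() is a made-up stand-in for
 * __free_one_page(), not the real kernel function.
 */
#include <stdio.h>

#define __bitwise
#define __force
#define BIT(nr) (1 << (nr))

typedef int __bitwise fpi_t;

#define FPI_NONE		((__force fpi_t)0)
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))
#define FPI_SKIP_KASAN_POISON	((__force fpi_t)BIT(2))

/* Callers OR flags together; the free path tests individual bits. */
static void demo_free_one_page(unsigned long pfn, fpi_t fpi_flags)
{
	if (fpi_flags & FPI_TO_TAIL)
		printf("pfn %lu: place at tail of freelist\n", pfn);
	else
		printf("pfn %lu: place at head of freelist\n", pfn);

	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		printf("pfn %lu: notify free page reporting\n", pfn);
}

int main(void)
{
	/* Mirrors the combination __free_pages_core() passes in this patch. */
	demo_free_one_page(42, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
	demo_free_one_page(43, FPI_NONE);
	return 0;
}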
--- a/kernel/mm/page_alloc.c
+++ b/kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/page_alloc.c
  *
@@ -16,11 +17,11 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
-#include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
 #include <linux/mempolicy.h>
 #include <linux/memremap.h>
 #include <linux/stop_machine.h>
+#include <linux/random.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/page_ext.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -60,20 +61,64 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/mm.h>
-#include <linux/locallock.h>
 #include <linux/page_owner.h>
+#include <linux/page_pinner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
-#include <linux/khugepaged.h>
 #include <linux/psi.h>
+#include <linux/padata.h>
+#include <linux/khugepaged.h>
+#include <trace/hooks/mm.h>
+#include <trace/hooks/vmscan.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
+#include "shuffle.h"
+#include "page_reporting.h"
+
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amount of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -95,12 +140,15 @@
  */
 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
-int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /* work_structs for global per-cpu drains */
-DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+struct pcpu_drain {
+	struct zone *zone;
+	struct work_struct work;
+};
+static DEFINE_MUTEX(pcpu_drain_mutex);
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
 
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -124,62 +172,33 @@
124172 };
125173 EXPORT_SYMBOL(node_states);
126174
127
-/* Protect totalram_pages and zone->managed_pages */
128
-static DEFINE_SPINLOCK(managed_page_count_lock);
129
-
130
-unsigned long totalram_pages __read_mostly;
175
+atomic_long_t _totalram_pages __read_mostly;
176
+EXPORT_SYMBOL(_totalram_pages);
131177 unsigned long totalreserve_pages __read_mostly;
132178 unsigned long totalcma_pages __read_mostly;
133179
134180 int percpu_pagelist_fraction;
135181 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
136
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
137
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
138
-#else
139182 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
140
-#endif
141183 EXPORT_SYMBOL(init_on_alloc);
142184
143
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
144
-DEFINE_STATIC_KEY_TRUE(init_on_free);
145
-#else
146185 DEFINE_STATIC_KEY_FALSE(init_on_free);
147
-#endif
148186 EXPORT_SYMBOL(init_on_free);
149187
188
+static bool _init_on_alloc_enabled_early __read_mostly
189
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
150190 static int __init early_init_on_alloc(char *buf)
151191 {
152
- int ret;
153
- bool bool_result;
154192
155
- if (!buf)
156
- return -EINVAL;
157
- ret = kstrtobool(buf, &bool_result);
158
- if (bool_result && page_poisoning_enabled())
159
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
160
- if (bool_result)
161
- static_branch_enable(&init_on_alloc);
162
- else
163
- static_branch_disable(&init_on_alloc);
164
- return ret;
193
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
165194 }
166195 early_param("init_on_alloc", early_init_on_alloc);
167196
197
+static bool _init_on_free_enabled_early __read_mostly
198
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
168199 static int __init early_init_on_free(char *buf)
169200 {
170
- int ret;
171
- bool bool_result;
172
-
173
- if (!buf)
174
- return -EINVAL;
175
- ret = kstrtobool(buf, &bool_result);
176
- if (bool_result && page_poisoning_enabled())
177
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
178
- if (bool_result)
179
- static_branch_enable(&init_on_free);
180
- else
181
- static_branch_disable(&init_on_free);
182
- return ret;
201
+ return kstrtobool(buf, &_init_on_free_enabled_early);
183202 }
184203 early_param("init_on_free", early_init_on_free);
185204
@@ -243,7 +262,8 @@
243262 unsigned int pageblock_order __read_mostly;
244263 #endif
245264
246
-static void __free_pages_ok(struct page *page, unsigned int order);
265
+static void __free_pages_ok(struct page *page, unsigned int order,
266
+ fpi_t fpi_flags);
247267
248268 /*
249269 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -270,8 +290,6 @@
270290 [ZONE_MOVABLE] = 0,
271291 };
272292
273
-EXPORT_SYMBOL(totalram_pages);
274
-
275293 static char * const zone_names[MAX_NR_ZONES] = {
276294 #ifdef CONFIG_ZONE_DMA
277295 "DMA",
@@ -289,7 +307,7 @@
289307 #endif
290308 };
291309
292
-char * const migratetype_names[MIGRATE_TYPES] = {
310
+const char * const migratetype_names[MIGRATE_TYPES] = {
293311 "Unmovable",
294312 "Movable",
295313 "Reclaimable",
@@ -302,14 +320,14 @@
302320 #endif
303321 };
304322
305
-compound_page_dtor * const compound_page_dtors[] = {
306
- NULL,
307
- free_compound_page,
323
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
324
+ [NULL_COMPOUND_DTOR] = NULL,
325
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
308326 #ifdef CONFIG_HUGETLB_PAGE
309
- free_huge_page,
327
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
310328 #endif
311329 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
312
- free_transhuge_page,
330
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
313331 #endif
314332 };
315333
@@ -320,6 +338,20 @@
320338 */
321339 int min_free_kbytes = 1024;
322340 int user_min_free_kbytes = -1;
341
+#ifdef CONFIG_DISCONTIGMEM
342
+/*
343
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
344
+ * are not on separate NUMA nodes. Functionally this works but with
345
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
346
+ * quite small. By default, do not boost watermarks on discontigmem as in
347
+ * many cases very high-order allocations like THP are likely to be
348
+ * unsupported and the premature reclaim offsets the advantage of long-term
349
+ * fragmentation avoidance.
350
+ */
351
+int watermark_boost_factor __read_mostly;
352
+#else
353
+int watermark_boost_factor __read_mostly = 15000;
354
+#endif
323355 int watermark_scale_factor = 10;
324356
325357 /*
@@ -329,42 +361,28 @@
329361 */
330362 int extra_free_kbytes = 0;
331363
332
-static unsigned long nr_kernel_pages __meminitdata;
333
-static unsigned long nr_all_pages __meminitdata;
334
-static unsigned long dma_reserve __meminitdata;
364
+static unsigned long nr_kernel_pages __initdata;
365
+static unsigned long nr_all_pages __initdata;
366
+static unsigned long dma_reserve __initdata;
335367
336
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
337
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
338
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
368
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
369
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
339370 static unsigned long required_kernelcore __initdata;
340371 static unsigned long required_kernelcore_percent __initdata;
341372 static unsigned long required_movablecore __initdata;
342373 static unsigned long required_movablecore_percent __initdata;
343
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
374
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
344375 static bool mirrored_kernelcore __meminitdata;
345376
346377 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
347378 int movable_zone;
348379 EXPORT_SYMBOL(movable_zone);
349
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
350380
351381 #if MAX_NUMNODES > 1
352
-int nr_node_ids __read_mostly = MAX_NUMNODES;
353
-int nr_online_nodes __read_mostly = 1;
382
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
383
+unsigned int nr_online_nodes __read_mostly = 1;
354384 EXPORT_SYMBOL(nr_node_ids);
355385 EXPORT_SYMBOL(nr_online_nodes);
356
-#endif
357
-
358
-static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
359
-
360
-#ifdef CONFIG_PREEMPT_RT_BASE
361
-# define cpu_lock_irqsave(cpu, flags) \
362
- local_lock_irqsave_on(pa_lock, flags, cpu)
363
-# define cpu_unlock_irqrestore(cpu, flags) \
364
- local_unlock_irqrestore_on(pa_lock, flags, cpu)
365
-#else
366
-# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
367
-# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
368386 #endif
369387
370388 int page_group_by_mobility_disabled __read_mostly;
@@ -378,7 +396,7 @@
378396 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
379397
380398 /*
381
- * Calling kasan_free_pages() only after deferred memory initialization
399
+ * Calling kasan_poison_pages() only after deferred memory initialization
382400 * has completed. Poisoning pages during deferred memory init will greatly
383401 * lengthen the process and cause problem in large memory systems as the
384402 * deferred pages initialization is done with interrupt disabled.
@@ -390,10 +408,12 @@
390408 * on-demand allocation and then freed again before the deferred pages
391409 * initialization is done, but this is not likely to happen.
392410 */
393
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
411
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
394412 {
395
- if (!static_branch_unlikely(&deferred_pages))
396
- kasan_free_pages(page, order);
413
+ return static_branch_unlikely(&deferred_pages) ||
414
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
415
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
416
+ PageSkipKASanPoison(page);
397417 }
398418
399419 /* Returns true if the struct page for the pfn is uninitialised */
@@ -408,38 +428,57 @@
408428 }
409429
410430 /*
411
- * Returns false when the remaining initialisation should be deferred until
431
+ * Returns true when the remaining initialisation should be deferred until
412432 * later in the boot cycle when it can be parallelised.
413433 */
414
-static inline bool update_defer_init(pg_data_t *pgdat,
415
- unsigned long pfn, unsigned long zone_end,
416
- unsigned long *nr_initialised)
434
+static bool __meminit
435
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
417436 {
418
- /* Always populate low zones for address-constrained allocations */
419
- if (zone_end < pgdat_end_pfn(pgdat))
420
- return true;
421
- (*nr_initialised)++;
422
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
423
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
424
- pgdat->first_deferred_pfn = pfn;
425
- return false;
437
+ static unsigned long prev_end_pfn, nr_initialised;
438
+
439
+ /*
440
+ * prev_end_pfn static that contains the end of previous zone
441
+ * No need to protect because called very early in boot before smp_init.
442
+ */
443
+ if (prev_end_pfn != end_pfn) {
444
+ prev_end_pfn = end_pfn;
445
+ nr_initialised = 0;
426446 }
427447
428
- return true;
448
+ /* Always populate low zones for address-constrained allocations */
449
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
450
+ return false;
451
+
452
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
453
+ return true;
454
+ /*
455
+ * We start only with one section of pages, more pages are added as
456
+ * needed until the rest of deferred pages are initialized.
457
+ */
458
+ nr_initialised++;
459
+ if ((nr_initialised > PAGES_PER_SECTION) &&
460
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
461
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
462
+ return true;
463
+ }
464
+ return false;
429465 }
430466 #else
431
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
467
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
468
+{
469
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
470
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
471
+ PageSkipKASanPoison(page);
472
+}
432473
433474 static inline bool early_page_uninitialised(unsigned long pfn)
434475 {
435476 return false;
436477 }
437478
438
-static inline bool update_defer_init(pg_data_t *pgdat,
439
- unsigned long pfn, unsigned long zone_end,
440
- unsigned long *nr_initialised)
479
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
441480 {
442
- return true;
481
+ return false;
443482 }
444483 #endif
445484
@@ -448,7 +487,7 @@
448487 unsigned long pfn)
449488 {
450489 #ifdef CONFIG_SPARSEMEM
451
- return __pfn_to_section(pfn)->pageblock_flags;
490
+ return section_to_usemap(__pfn_to_section(pfn));
452491 #else
453492 return page_zone(page)->pageblock_flags;
454493 #endif /* CONFIG_SPARSEMEM */
@@ -458,25 +497,23 @@
458497 {
459498 #ifdef CONFIG_SPARSEMEM
460499 pfn &= (PAGES_PER_SECTION-1);
461
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
462500 #else
463501 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
464
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
465502 #endif /* CONFIG_SPARSEMEM */
503
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
466504 }
467505
468506 /**
469507 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
470508 * @page: The page within the block of interest
471509 * @pfn: The target page frame number
472
- * @end_bitidx: The last bit of interest to retrieve
473510 * @mask: mask of bits that the caller is interested in
474511 *
475512 * Return: pageblock_bits flags
476513 */
477
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
514
+static __always_inline
515
+unsigned long __get_pfnblock_flags_mask(struct page *page,
478516 unsigned long pfn,
479
- unsigned long end_bitidx,
480517 unsigned long mask)
481518 {
482519 unsigned long *bitmap;
@@ -489,20 +526,36 @@
489526 bitidx &= (BITS_PER_LONG-1);
490527
491528 word = bitmap[word_bitidx];
492
- bitidx += end_bitidx;
493
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
529
+ return (word >> bitidx) & mask;
494530 }
495531
496532 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
497
- unsigned long end_bitidx,
498533 unsigned long mask)
499534 {
500
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
535
+ return __get_pfnblock_flags_mask(page, pfn, mask);
501536 }
537
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
538
+
539
+int isolate_anon_lru_page(struct page *page)
540
+{
541
+ int ret;
542
+
543
+ if (!PageLRU(page) || !PageAnon(page))
544
+ return -EINVAL;
545
+
546
+ if (!get_page_unless_zero(page))
547
+ return -EINVAL;
548
+
549
+ ret = isolate_lru_page(page);
550
+ put_page(page);
551
+
552
+ return ret;
553
+}
554
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
502555
503556 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
504557 {
505
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
558
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
506559 }
507560
508561 /**
@@ -510,12 +563,10 @@
510563 * @page: The page within the block of interest
511564 * @flags: The flags to set
512565 * @pfn: The target page frame number
513
- * @end_bitidx: The last bit of interest
514566 * @mask: mask of bits that the caller is interested in
515567 */
516568 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
517569 unsigned long pfn,
518
- unsigned long end_bitidx,
519570 unsigned long mask)
520571 {
521572 unsigned long *bitmap;
@@ -523,6 +574,7 @@
523574 unsigned long old_word, word;
524575
525576 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
577
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
526578
527579 bitmap = get_pageblock_bitmap(page, pfn);
528580 bitidx = pfn_to_bitidx(page, pfn);
@@ -531,9 +583,8 @@
531583
532584 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
533585
534
- bitidx += end_bitidx;
535
- mask <<= (BITS_PER_LONG - bitidx - 1);
536
- flags <<= (BITS_PER_LONG - bitidx - 1);
586
+ mask <<= bitidx;
587
+ flags <<= bitidx;
537588
538589 word = READ_ONCE(bitmap[word_bitidx]);
539590 for (;;) {
@@ -550,8 +601,8 @@
550601 migratetype < MIGRATE_PCPTYPES))
551602 migratetype = MIGRATE_UNMOVABLE;
552603
553
- set_pageblock_flags_group(page, (unsigned long)migratetype,
554
- PB_migrate, PB_migrate_end);
604
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
605
+ page_to_pfn(page), MIGRATETYPE_MASK);
555606 }
556607
557608 #ifdef CONFIG_DEBUG_VM
@@ -606,8 +657,7 @@
606657 }
607658 #endif
608659
609
-static void bad_page(struct page *page, const char *reason,
610
- unsigned long bad_flags)
660
+static void bad_page(struct page *page, const char *reason)
611661 {
612662 static unsigned long resume;
613663 static unsigned long nr_shown;
@@ -636,10 +686,6 @@
636686 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
637687 current->comm, page_to_pfn(page));
638688 __dump_page(page, reason);
639
- bad_flags &= page->flags;
640
- if (bad_flags)
641
- pr_alert("bad because of flags: %#lx(%pGp)\n",
642
- bad_flags, &bad_flags);
643689 dump_page_owner(page);
644690
645691 print_modules();
@@ -667,7 +713,8 @@
667713
668714 void free_compound_page(struct page *page)
669715 {
670
- __free_pages_ok(page, compound_order(page));
716
+ mem_cgroup_uncharge(page);
717
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
671718 }
672719
673720 void prep_compound_page(struct page *page, unsigned int order)
@@ -675,8 +722,6 @@
675722 int i;
676723 int nr_pages = 1 << order;
677724
678
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
679
- set_compound_order(page, order);
680725 __SetPageHead(page);
681726 for (i = 1; i < nr_pages; i++) {
682727 struct page *p = page + i;
@@ -684,51 +729,30 @@
684729 p->mapping = TAIL_MAPPING;
685730 set_compound_head(p, page);
686731 }
732
+
733
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
734
+ set_compound_order(page, order);
687735 atomic_set(compound_mapcount_ptr(page), -1);
736
+ if (hpage_pincount_available(page))
737
+ atomic_set(compound_pincount_ptr(page), 0);
688738 }
689739
690740 #ifdef CONFIG_DEBUG_PAGEALLOC
691741 unsigned int _debug_guardpage_minorder;
692
-bool _debug_pagealloc_enabled __read_mostly
742
+
743
+bool _debug_pagealloc_enabled_early __read_mostly
693744 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
745
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
746
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
694747 EXPORT_SYMBOL(_debug_pagealloc_enabled);
695
-bool _debug_guardpage_enabled __read_mostly;
748
+
749
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
696750
697751 static int __init early_debug_pagealloc(char *buf)
698752 {
699
- if (!buf)
700
- return -EINVAL;
701
- return kstrtobool(buf, &_debug_pagealloc_enabled);
753
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
702754 }
703755 early_param("debug_pagealloc", early_debug_pagealloc);
704
-
705
-static bool need_debug_guardpage(void)
706
-{
707
- /* If we don't use debug_pagealloc, we don't need guard page */
708
- if (!debug_pagealloc_enabled())
709
- return false;
710
-
711
- if (!debug_guardpage_minorder())
712
- return false;
713
-
714
- return true;
715
-}
716
-
717
-static void init_debug_guardpage(void)
718
-{
719
- if (!debug_pagealloc_enabled())
720
- return;
721
-
722
- if (!debug_guardpage_minorder())
723
- return;
724
-
725
- _debug_guardpage_enabled = true;
726
-}
727
-
728
-struct page_ext_operations debug_guardpage_ops = {
729
- .need = need_debug_guardpage,
730
- .init = init_debug_guardpage,
731
-};
732756
733757 static int __init debug_guardpage_minorder_setup(char *buf)
734758 {
@@ -747,20 +771,13 @@
747771 static inline bool set_page_guard(struct zone *zone, struct page *page,
748772 unsigned int order, int migratetype)
749773 {
750
- struct page_ext *page_ext;
751
-
752774 if (!debug_guardpage_enabled())
753775 return false;
754776
755777 if (order >= debug_guardpage_minorder())
756778 return false;
757779
758
- page_ext = lookup_page_ext(page);
759
- if (unlikely(!page_ext))
760
- return false;
761
-
762
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
763
-
780
+ __SetPageGuard(page);
764781 INIT_LIST_HEAD(&page->lru);
765782 set_page_private(page, order);
766783 /* Guard pages are not available for any usage */
@@ -772,39 +789,77 @@
772789 static inline void clear_page_guard(struct zone *zone, struct page *page,
773790 unsigned int order, int migratetype)
774791 {
775
- struct page_ext *page_ext;
776
-
777792 if (!debug_guardpage_enabled())
778793 return;
779794
780
- page_ext = lookup_page_ext(page);
781
- if (unlikely(!page_ext))
782
- return;
783
-
784
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
795
+ __ClearPageGuard(page);
785796
786797 set_page_private(page, 0);
787798 if (!is_migrate_isolate(migratetype))
788799 __mod_zone_freepage_state(zone, (1 << order), migratetype);
789800 }
790801 #else
791
-struct page_ext_operations debug_guardpage_ops;
792802 static inline bool set_page_guard(struct zone *zone, struct page *page,
793803 unsigned int order, int migratetype) { return false; }
794804 static inline void clear_page_guard(struct zone *zone, struct page *page,
795805 unsigned int order, int migratetype) {}
796806 #endif
797807
798
-static inline void set_page_order(struct page *page, unsigned int order)
808
+/*
809
+ * Enable static keys related to various memory debugging and hardening options.
810
+ * Some override others, and depend on early params that are evaluated in the
811
+ * order of appearance. So we need to first gather the full picture of what was
812
+ * enabled, and then make decisions.
813
+ */
814
+void init_mem_debugging_and_hardening(void)
815
+{
816
+ bool page_poisoning_requested = false;
817
+
818
+#ifdef CONFIG_PAGE_POISONING
819
+ /*
820
+ * Page poisoning is debug page alloc for some arches. If
821
+ * either of those options are enabled, enable poisoning.
822
+ */
823
+ if (page_poisoning_enabled() ||
824
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
825
+ debug_pagealloc_enabled())) {
826
+ static_branch_enable(&_page_poisoning_enabled);
827
+ page_poisoning_requested = true;
828
+ }
829
+#endif
830
+
831
+ if (_init_on_alloc_enabled_early) {
832
+ if (page_poisoning_requested)
833
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
834
+ "will take precedence over init_on_alloc\n");
835
+ else
836
+ static_branch_enable(&init_on_alloc);
837
+ }
838
+ if (_init_on_free_enabled_early) {
839
+ if (page_poisoning_requested)
840
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
841
+ "will take precedence over init_on_free\n");
842
+ else
843
+ static_branch_enable(&init_on_free);
844
+ }
845
+
846
+#ifdef CONFIG_DEBUG_PAGEALLOC
847
+ if (!debug_pagealloc_enabled())
848
+ return;
849
+
850
+ static_branch_enable(&_debug_pagealloc_enabled);
851
+
852
+ if (!debug_guardpage_minorder())
853
+ return;
854
+
855
+ static_branch_enable(&_debug_guardpage_enabled);
856
+#endif
857
+}
858
+
859
+static inline void set_buddy_order(struct page *page, unsigned int order)
799860 {
800861 set_page_private(page, order);
801862 __SetPageBuddy(page);
802
-}
803
-
804
-static inline void rmv_page_order(struct page *page)
805
-{
806
- __ClearPageBuddy(page);
807
- set_page_private(page, 0);
808863 }
809864
810865 /*
@@ -820,32 +875,151 @@
820875 *
821876 * For recording page's order, we use page_private(page).
822877 */
823
-static inline int page_is_buddy(struct page *page, struct page *buddy,
878
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
824879 unsigned int order)
825880 {
826
- if (page_is_guard(buddy) && page_order(buddy) == order) {
827
- if (page_zone_id(page) != page_zone_id(buddy))
828
- return 0;
881
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
882
+ return false;
829883
830
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
884
+ if (buddy_order(buddy) != order)
885
+ return false;
831886
832
- return 1;
833
- }
887
+ /*
888
+ * zone check is done late to avoid uselessly calculating
889
+ * zone/node ids for pages that could never merge.
890
+ */
891
+ if (page_zone_id(page) != page_zone_id(buddy))
892
+ return false;
834893
835
- if (PageBuddy(buddy) && page_order(buddy) == order) {
836
- /*
837
- * zone check is done late to avoid uselessly
838
- * calculating zone/node ids for pages that could
839
- * never merge.
840
- */
841
- if (page_zone_id(page) != page_zone_id(buddy))
842
- return 0;
894
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
843895
844
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
896
+ return true;
897
+}
845898
846
- return 1;
847
- }
848
- return 0;
899
+#ifdef CONFIG_COMPACTION
900
+static inline struct capture_control *task_capc(struct zone *zone)
901
+{
902
+ struct capture_control *capc = current->capture_control;
903
+
904
+ return unlikely(capc) &&
905
+ !(current->flags & PF_KTHREAD) &&
906
+ !capc->page &&
907
+ capc->cc->zone == zone ? capc : NULL;
908
+}
909
+
910
+static inline bool
911
+compaction_capture(struct capture_control *capc, struct page *page,
912
+ int order, int migratetype)
913
+{
914
+ if (!capc || order != capc->cc->order)
915
+ return false;
916
+
917
+ /* Do not accidentally pollute CMA or isolated regions*/
918
+ if (is_migrate_cma(migratetype) ||
919
+ is_migrate_isolate(migratetype))
920
+ return false;
921
+
922
+ /*
923
+ * Do not let lower order allocations polluate a movable pageblock.
924
+ * This might let an unmovable request use a reclaimable pageblock
925
+ * and vice-versa but no more than normal fallback logic which can
926
+ * have trouble finding a high-order free page.
927
+ */
928
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
929
+ return false;
930
+
931
+ capc->page = page;
932
+ return true;
933
+}
934
+
935
+#else
936
+static inline struct capture_control *task_capc(struct zone *zone)
937
+{
938
+ return NULL;
939
+}
940
+
941
+static inline bool
942
+compaction_capture(struct capture_control *capc, struct page *page,
943
+ int order, int migratetype)
944
+{
945
+ return false;
946
+}
947
+#endif /* CONFIG_COMPACTION */
948
+
949
+/* Used for pages not on another list */
950
+static inline void add_to_free_list(struct page *page, struct zone *zone,
951
+ unsigned int order, int migratetype)
952
+{
953
+ struct free_area *area = &zone->free_area[order];
954
+
955
+ list_add(&page->lru, &area->free_list[migratetype]);
956
+ area->nr_free++;
957
+}
958
+
959
+/* Used for pages not on another list */
960
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
961
+ unsigned int order, int migratetype)
962
+{
963
+ struct free_area *area = &zone->free_area[order];
964
+
965
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
966
+ area->nr_free++;
967
+}
968
+
969
+/*
970
+ * Used for pages which are on another list. Move the pages to the tail
971
+ * of the list - so the moved pages won't immediately be considered for
972
+ * allocation again (e.g., optimization for memory onlining).
973
+ */
974
+static inline void move_to_free_list(struct page *page, struct zone *zone,
975
+ unsigned int order, int migratetype)
976
+{
977
+ struct free_area *area = &zone->free_area[order];
978
+
979
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
980
+}
981
+
982
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order)
984
+{
985
+ /* clear reported state and update reported page count */
986
+ if (page_reported(page))
987
+ __ClearPageReported(page);
988
+
989
+ list_del(&page->lru);
990
+ __ClearPageBuddy(page);
991
+ set_page_private(page, 0);
992
+ zone->free_area[order].nr_free--;
993
+}
994
+
995
+/*
996
+ * If this is not the largest possible page, check if the buddy
997
+ * of the next-highest order is free. If it is, it's possible
998
+ * that pages are being freed that will coalesce soon. In case,
999
+ * that is happening, add the free page to the tail of the list
1000
+ * so it's less likely to be used soon and more likely to be merged
1001
+ * as a higher order page
1002
+ */
1003
+static inline bool
1004
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1005
+ struct page *page, unsigned int order)
1006
+{
1007
+ struct page *higher_page, *higher_buddy;
1008
+ unsigned long combined_pfn;
1009
+
1010
+ if (order >= MAX_ORDER - 2)
1011
+ return false;
1012
+
1013
+ if (!pfn_valid_within(buddy_pfn))
1014
+ return false;
1015
+
1016
+ combined_pfn = buddy_pfn & pfn;
1017
+ higher_page = page + (combined_pfn - pfn);
1018
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1019
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1020
+
1021
+ return pfn_valid_within(buddy_pfn) &&
1022
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8491023 }
8501024
8511025 /*
@@ -875,12 +1049,14 @@
8751049 static inline void __free_one_page(struct page *page,
8761050 unsigned long pfn,
8771051 struct zone *zone, unsigned int order,
878
- int migratetype)
1052
+ int migratetype, fpi_t fpi_flags)
8791053 {
1054
+ struct capture_control *capc = task_capc(zone);
1055
+ unsigned long buddy_pfn;
8801056 unsigned long combined_pfn;
881
- unsigned long uninitialized_var(buddy_pfn);
882
- struct page *buddy;
8831057 unsigned int max_order;
1058
+ struct page *buddy;
1059
+ bool to_tail;
8841060
8851061 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8861062
@@ -896,6 +1072,11 @@
8961072
8971073 continue_merging:
8981074 while (order < max_order) {
1075
+ if (compaction_capture(capc, page, order, migratetype)) {
1076
+ __mod_zone_freepage_state(zone, -(1 << order),
1077
+ migratetype);
1078
+ return;
1079
+ }
8991080 buddy_pfn = __find_buddy_pfn(pfn, order);
9001081 buddy = page + (buddy_pfn - pfn);
9011082
@@ -907,13 +1088,10 @@
9071088 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
9081089 * merge with it and move up one order.
9091090 */
910
- if (page_is_guard(buddy)) {
1091
+ if (page_is_guard(buddy))
9111092 clear_page_guard(zone, buddy, order, migratetype);
912
- } else {
913
- list_del(&buddy->lru);
914
- zone->free_area[order].nr_free--;
915
- rmv_page_order(buddy);
916
- }
1093
+ else
1094
+ del_page_from_free_list(buddy, zone, order);
9171095 combined_pfn = buddy_pfn & pfn;
9181096 page = page + (combined_pfn - pfn);
9191097 pfn = combined_pfn;
@@ -945,33 +1123,23 @@
9451123 }
9461124
9471125 done_merging:
948
- set_page_order(page, order);
1126
+ set_buddy_order(page, order);
9491127
950
- /*
951
- * If this is not the largest possible page, check if the buddy
952
- * of the next-highest order is free. If it is, it's possible
953
- * that pages are being freed that will coalesce soon. In case,
954
- * that is happening, add the free page to the tail of the list
955
- * so it's less likely to be used soon and more likely to be merged
956
- * as a higher order page
957
- */
958
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
959
- struct page *higher_page, *higher_buddy;
960
- combined_pfn = buddy_pfn & pfn;
961
- higher_page = page + (combined_pfn - pfn);
962
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
963
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
964
- if (pfn_valid_within(buddy_pfn) &&
965
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
966
- list_add_tail(&page->lru,
967
- &zone->free_area[order].free_list[migratetype]);
968
- goto out;
969
- }
970
- }
1128
+ if (fpi_flags & FPI_TO_TAIL)
1129
+ to_tail = true;
1130
+ else if (is_shuffle_order(order))
1131
+ to_tail = shuffle_pick_tail();
1132
+ else
1133
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9711134
972
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
973
-out:
974
- zone->free_area[order].nr_free++;
1135
+ if (to_tail)
1136
+ add_to_free_list_tail(page, zone, order, migratetype);
1137
+ else
1138
+ add_to_free_list(page, zone, order, migratetype);
1139
+
1140
+ /* Notify page reporting subsystem of freed page */
1141
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1142
+ page_reporting_notify_free(order);
9751143 }
9761144
9771145 /*
@@ -996,13 +1164,9 @@
9961164 return true;
9971165 }
9981166
999
-static void free_pages_check_bad(struct page *page)
1167
+static const char *page_bad_reason(struct page *page, unsigned long flags)
10001168 {
1001
- const char *bad_reason;
1002
- unsigned long bad_flags;
1003
-
1004
- bad_reason = NULL;
1005
- bad_flags = 0;
1169
+ const char *bad_reason = NULL;
10061170
10071171 if (unlikely(atomic_read(&page->_mapcount) != -1))
10081172 bad_reason = "nonzero mapcount";
@@ -1010,24 +1174,32 @@
10101174 bad_reason = "non-NULL mapping";
10111175 if (unlikely(page_ref_count(page) != 0))
10121176 bad_reason = "nonzero _refcount";
1013
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1014
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1015
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1177
+ if (unlikely(page->flags & flags)) {
1178
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1179
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1180
+ else
1181
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10161182 }
10171183 #ifdef CONFIG_MEMCG
10181184 if (unlikely(page->mem_cgroup))
10191185 bad_reason = "page still charged to cgroup";
10201186 #endif
1021
- bad_page(page, bad_reason, bad_flags);
1187
+ return bad_reason;
10221188 }
10231189
1024
-static inline int free_pages_check(struct page *page)
1190
+static void check_free_page_bad(struct page *page)
1191
+{
1192
+ bad_page(page,
1193
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1194
+}
1195
+
1196
+static inline int check_free_page(struct page *page)
10251197 {
10261198 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10271199 return 0;
10281200
10291201 /* Something has gone sideways, find it */
1030
- free_pages_check_bad(page);
1202
+ check_free_page_bad(page);
10311203 return 1;
10321204 }
10331205
@@ -1049,7 +1221,7 @@
10491221 case 1:
10501222 /* the first tail page: ->mapping may be compound_mapcount() */
10511223 if (unlikely(compound_mapcount(page))) {
1052
- bad_page(page, "nonzero compound_mapcount", 0);
1224
+ bad_page(page, "nonzero compound_mapcount");
10531225 goto out;
10541226 }
10551227 break;
@@ -1061,17 +1233,17 @@
10611233 break;
10621234 default:
10631235 if (page->mapping != TAIL_MAPPING) {
1064
- bad_page(page, "corrupted mapping in tail page", 0);
1236
+ bad_page(page, "corrupted mapping in tail page");
10651237 goto out;
10661238 }
10671239 break;
10681240 }
10691241 if (unlikely(!PageTail(page))) {
1070
- bad_page(page, "PageTail not set", 0);
1242
+ bad_page(page, "PageTail not set");
10711243 goto out;
10721244 }
10731245 if (unlikely(compound_head(page) != head_page)) {
1074
- bad_page(page, "compound_head not consistent", 0);
1246
+ bad_page(page, "compound_head not consistent");
10751247 goto out;
10761248 }
10771249 ret = 0;
@@ -1081,25 +1253,48 @@
10811253 return ret;
10821254 }
10831255
1084
-static void kernel_init_free_pages(struct page *page, int numpages)
1256
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10851257 {
10861258 int i;
10871259
1260
+ if (zero_tags) {
1261
+ for (i = 0; i < numpages; i++)
1262
+ tag_clear_highpage(page + i);
1263
+ return;
1264
+ }
1265
+
10881266 /* s390's use of memset() could override KASAN redzones. */
10891267 kasan_disable_current();
1090
- for (i = 0; i < numpages; i++)
1268
+ for (i = 0; i < numpages; i++) {
1269
+ u8 tag = page_kasan_tag(page + i);
1270
+ page_kasan_tag_reset(page + i);
10911271 clear_highpage(page + i);
1272
+ page_kasan_tag_set(page + i, tag);
1273
+ }
10921274 kasan_enable_current();
10931275 }
10941276
10951277 static __always_inline bool free_pages_prepare(struct page *page,
1096
- unsigned int order, bool check_free)
1278
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10971279 {
10981280 int bad = 0;
1281
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10991282
11001283 VM_BUG_ON_PAGE(PageTail(page), page);
11011284
11021285 trace_mm_page_free(page, order);
1286
+
1287
+ if (unlikely(PageHWPoison(page)) && !order) {
1288
+ /*
1289
+ * Do not let hwpoison pages hit pcplists/buddy
1290
+ * Untie memcg state and reset page's owner
1291
+ */
1292
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1293
+ __memcg_kmem_uncharge_page(page, order);
1294
+ reset_page_owner(page, order);
1295
+ free_page_pinner(page, order);
1296
+ return false;
1297
+ }
11031298
11041299 /*
11051300 * Check tail pages before head page information is cleared to
@@ -1116,7 +1311,7 @@
11161311 for (i = 1; i < (1 << order); i++) {
11171312 if (compound)
11181313 bad += free_tail_pages_check(page, page + i);
1119
- if (unlikely(free_pages_check(page + i))) {
1314
+ if (unlikely(check_free_page(page + i))) {
11201315 bad++;
11211316 continue;
11221317 }
@@ -1126,15 +1321,16 @@
11261321 if (PageMappingFlags(page))
11271322 page->mapping = NULL;
11281323 if (memcg_kmem_enabled() && PageKmemcg(page))
1129
- memcg_kmem_uncharge(page, order);
1324
+ __memcg_kmem_uncharge_page(page, order);
11301325 if (check_free)
1131
- bad += free_pages_check(page);
1326
+ bad += check_free_page(page);
11321327 if (bad)
11331328 return false;
11341329
11351330 page_cpupid_reset_last(page);
11361331 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11371332 reset_page_owner(page, order);
1333
+ free_page_pinner(page, order);
11381334
11391335 if (!PageHighMem(page)) {
11401336 debug_check_no_locks_freed(page_address(page),
@@ -1142,36 +1338,77 @@
11421338 debug_check_no_obj_freed(page_address(page),
11431339 PAGE_SIZE << order);
11441340 }
1145
- arch_free_page(page, order);
1146
- if (want_init_on_free())
1147
- kernel_init_free_pages(page, 1 << order);
11481341
1149
- kernel_poison_pages(page, 1 << order, 0);
1150
- kernel_map_pages(page, 1 << order, 0);
1151
- kasan_free_nondeferred_pages(page, order);
1342
+ kernel_poison_pages(page, 1 << order);
1343
+
1344
+ /*
1345
+ * As memory initialization might be integrated into KASAN,
1346
+ * kasan_free_pages and kernel_init_free_pages must be
1347
+ * kept together to avoid discrepancies in behavior.
1348
+ *
1349
+ * With hardware tag-based KASAN, memory tags must be set before the
1350
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1351
+ */
1352
+ if (kasan_has_integrated_init()) {
1353
+ if (!skip_kasan_poison)
1354
+ kasan_free_pages(page, order);
1355
+ } else {
1356
+ bool init = want_init_on_free();
1357
+
1358
+ if (init)
1359
+ kernel_init_free_pages(page, 1 << order, false);
1360
+ if (!skip_kasan_poison)
1361
+ kasan_poison_pages(page, order, init);
1362
+ }
1363
+
1364
+ /*
1365
+ * arch_free_page() can make the page's contents inaccessible. s390
1366
+ * does this. So nothing which can access the page's contents should
1367
+ * happen after this.
1368
+ */
1369
+ arch_free_page(page, order);
1370
+
1371
+ debug_pagealloc_unmap_pages(page, 1 << order);
11521372
11531373 return true;
11541374 }
11551375
11561376 #ifdef CONFIG_DEBUG_VM
1157
-static inline bool free_pcp_prepare(struct page *page)
1158
-{
1159
- return free_pages_prepare(page, 0, true);
1160
-}
1161
-
1162
-static inline bool bulkfree_pcp_prepare(struct page *page)
1163
-{
1164
- return false;
1165
-}
1166
-#else
1377
+/*
1378
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1379
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1380
+ * moved from pcp lists to free lists.
1381
+ */
11671382 static bool free_pcp_prepare(struct page *page)
11681383 {
1169
- return free_pages_prepare(page, 0, false);
1384
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11701385 }
11711386
11721387 static bool bulkfree_pcp_prepare(struct page *page)
11731388 {
1174
- return free_pages_check(page);
1389
+ if (debug_pagealloc_enabled_static())
1390
+ return check_free_page(page);
1391
+ else
1392
+ return false;
1393
+}
1394
+#else
1395
+/*
1396
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1397
+ * moving from pcp lists to free list in order to reduce overhead. With
1398
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1399
+ * to the pcp lists.
1400
+ */
1401
+static bool free_pcp_prepare(struct page *page)
1402
+{
1403
+ if (debug_pagealloc_enabled_static())
1404
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1405
+ else
1406
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1407
+}
1408
+
1409
+static bool bulkfree_pcp_prepare(struct page *page)
1410
+{
1411
+ return check_free_page(page);
11751412 }
11761413 #endif /* CONFIG_DEBUG_VM */
11771414
@@ -1185,7 +1422,7 @@
11851422 }
11861423
11871424 /*
1188
- * Frees a number of pages which have been collected from the pcp lists.
1425
+ * Frees a number of pages from the PCP lists
11891426 * Assumes all pages on list are in same zone, and of same order.
11901427 * count is the number of pages to free.
11911428 *
@@ -1195,57 +1432,15 @@
11951432 * And clear the zone's pages_scanned counter, to hold off the "all pages are
11961433 * pinned" detection logic.
11971434 */
1198
-static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
1199
- bool zone_retry)
1200
-{
1201
- bool isolated_pageblocks;
1202
- struct page *page, *tmp;
1203
- unsigned long flags;
1204
-
1205
- spin_lock_irqsave(&zone->lock, flags);
1206
- isolated_pageblocks = has_isolate_pageblock(zone);
1207
-
1208
- /*
1209
- * Use safe version since after __free_one_page(),
1210
- * page->lru.next will not point to original list.
1211
- */
1212
- list_for_each_entry_safe(page, tmp, head, lru) {
1213
- int mt = get_pcppage_migratetype(page);
1214
-
1215
- if (page_zone(page) != zone) {
1216
- /*
1217
- * free_unref_page_list() sorts pages by zone. If we end
1218
- * up with pages from a different NUMA nodes belonging
1219
- * to the same ZONE index then we need to redo with the
1220
- * correct ZONE pointer. Skip the page for now, redo it
1221
- * on the next iteration.
1222
- */
1223
- WARN_ON_ONCE(zone_retry == false);
1224
- if (zone_retry)
1225
- continue;
1226
- }
1227
-
1228
- /* MIGRATE_ISOLATE page should not go to pcplists */
1229
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1230
- /* Pageblock could have been isolated meanwhile */
1231
- if (unlikely(isolated_pageblocks))
1232
- mt = get_pageblock_migratetype(page);
1233
-
1234
- list_del(&page->lru);
1235
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1236
- trace_mm_page_pcpu_drain(page, 0, mt);
1237
- }
1238
- spin_unlock_irqrestore(&zone->lock, flags);
1239
-}
1240
-
1241
-static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
1242
- struct list_head *dst)
1243
-
1435
+static void free_pcppages_bulk(struct zone *zone, int count,
1436
+ struct per_cpu_pages *pcp)
12441437 {
12451438 int migratetype = 0;
12461439 int batch_free = 0;
12471440 int prefetch_nr = 0;
1248
- struct page *page;
1441
+ bool isolated_pageblocks;
1442
+ struct page *page, *tmp;
1443
+ LIST_HEAD(head);
12491444
12501445 /*
12511446 * Ensure proper count is passed which otherwise would stuck in the
@@ -1282,7 +1477,7 @@
12821477 if (bulkfree_pcp_prepare(page))
12831478 continue;
12841479
1285
- list_add_tail(&page->lru, dst);
1480
+ list_add_tail(&page->lru, &head);
12861481
12871482 /*
12881483 * We are going to put the page back to the global
@@ -1297,19 +1492,39 @@
12971492 prefetch_buddy(page);
12981493 } while (--count && --batch_free && !list_empty(list));
12991494 }
1495
+
1496
+ spin_lock(&zone->lock);
1497
+ isolated_pageblocks = has_isolate_pageblock(zone);
1498
+
1499
+ /*
1500
+ * Use safe version since after __free_one_page(),
1501
+ * page->lru.next will not point to original list.
1502
+ */
1503
+ list_for_each_entry_safe(page, tmp, &head, lru) {
1504
+ int mt = get_pcppage_migratetype(page);
1505
+ /* MIGRATE_ISOLATE page should not go to pcplists */
1506
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1507
+ /* Pageblock could have been isolated meanwhile */
1508
+ if (unlikely(isolated_pageblocks))
1509
+ mt = get_pageblock_migratetype(page);
1510
+
1511
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
1512
+ trace_mm_page_pcpu_drain(page, 0, mt);
1513
+ }
1514
+ spin_unlock(&zone->lock);
13001515 }
13011516
13021517 static void free_one_page(struct zone *zone,
13031518 struct page *page, unsigned long pfn,
13041519 unsigned int order,
1305
- int migratetype)
1520
+ int migratetype, fpi_t fpi_flags)
13061521 {
13071522 spin_lock(&zone->lock);
13081523 if (unlikely(has_isolate_pageblock(zone) ||
13091524 is_migrate_isolate(migratetype))) {
13101525 migratetype = get_pfnblock_migratetype(page, pfn);
13111526 }
1312
- __free_one_page(page, pfn, zone, order, migratetype);
1527
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
13131528 spin_unlock(&zone->lock);
13141529 }
13151530
@@ -1383,33 +1598,50 @@
13831598 /* Avoid false-positive PageTail() */
13841599 INIT_LIST_HEAD(&page->lru);
13851600
1386
- SetPageReserved(page);
1601
+ /*
1602
+ * no need for atomic set_bit because the struct
1603
+ * page is not visible yet so nobody should
1604
+ * access it yet.
1605
+ */
1606
+ __SetPageReserved(page);
13871607 }
13881608 }
13891609 }
13901610
1391
-static void __free_pages_ok(struct page *page, unsigned int order)
1611
+static void __free_pages_ok(struct page *page, unsigned int order,
1612
+ fpi_t fpi_flags)
13921613 {
13931614 unsigned long flags;
13941615 int migratetype;
13951616 unsigned long pfn = page_to_pfn(page);
1617
+ bool skip_free_unref_page = false;
13961618
1397
- if (!free_pages_prepare(page, order, true))
1619
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13981620 return;
13991621
14001622 migratetype = get_pfnblock_migratetype(page, pfn);
1401
- local_lock_irqsave(pa_lock, flags);
1623
+ trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
1624
+ if (skip_free_unref_page)
1625
+ return;
1626
+
1627
+ local_irq_save(flags);
14021628 __count_vm_events(PGFREE, 1 << order);
1403
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1404
- local_unlock_irqrestore(pa_lock, flags);
1629
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1630
+ fpi_flags);
1631
+ local_irq_restore(flags);
14051632 }
14061633
1407
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1634
+void __free_pages_core(struct page *page, unsigned int order)
14081635 {
14091636 unsigned int nr_pages = 1 << order;
14101637 struct page *p = page;
14111638 unsigned int loop;
14121639
1640
+ /*
1641
+ * When initializing the memmap, __init_single_page() sets the refcount
1642
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1643
+ * refcount of all involved pages to 0.
1644
+ */
14131645 prefetchw(p);
14141646 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
14151647 prefetchw(p + 1);
@@ -1419,15 +1651,43 @@
14191651 __ClearPageReserved(p);
14201652 set_page_count(p, 0);
14211653
1422
- page_zone(page)->managed_pages += nr_pages;
1423
- set_page_refcounted(page);
1424
- __free_pages(page, order);
1654
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1655
+
1656
+ /*
1657
+ * Bypass PCP and place fresh pages right to the tail, primarily
1658
+ * relevant for memory onlining.
1659
+ */
1660
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
14251661 }
14261662
1427
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1428
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1663
+#ifdef CONFIG_NEED_MULTIPLE_NODES
14291664
14301665 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1666
+
1667
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1668
+
1669
+/*
1670
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1671
+ */
1672
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1673
+ struct mminit_pfnnid_cache *state)
1674
+{
1675
+ unsigned long start_pfn, end_pfn;
1676
+ int nid;
1677
+
1678
+ if (state->last_start <= pfn && pfn < state->last_end)
1679
+ return state->last_nid;
1680
+
1681
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1682
+ if (nid != NUMA_NO_NODE) {
1683
+ state->last_start = start_pfn;
1684
+ state->last_end = end_pfn;
1685
+ state->last_nid = nid;
1686
+ }
1687
+
1688
+ return nid;
1689
+}
1690
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
14311691
14321692 int __meminit early_pfn_to_nid(unsigned long pfn)
14331693 {
@@ -1442,48 +1702,14 @@
14421702
14431703 return nid;
14441704 }
1445
-#endif
1705
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14461706
1447
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1448
-static inline bool __meminit __maybe_unused
1449
-meminit_pfn_in_nid(unsigned long pfn, int node,
1450
- struct mminit_pfnnid_cache *state)
1451
-{
1452
- int nid;
1453
-
1454
- nid = __early_pfn_to_nid(pfn, state);
1455
- if (nid >= 0 && nid != node)
1456
- return false;
1457
- return true;
1458
-}
1459
-
1460
-/* Only safe to use early in boot when initialisation is single-threaded */
1461
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1462
-{
1463
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1464
-}
1465
-
1466
-#else
1467
-
1468
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1469
-{
1470
- return true;
1471
-}
1472
-static inline bool __meminit __maybe_unused
1473
-meminit_pfn_in_nid(unsigned long pfn, int node,
1474
- struct mminit_pfnnid_cache *state)
1475
-{
1476
- return true;
1477
-}
1478
-#endif
1479
-
1480
-
1481
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1707
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14821708 unsigned int order)
14831709 {
14841710 if (early_page_uninitialised(pfn))
14851711 return;
1486
- return __free_pages_boot_core(page, order);
1712
+ __free_pages_core(page, order);
14871713 }
14881714
14891715 /*
@@ -1574,14 +1800,14 @@
15741800 if (nr_pages == pageblock_nr_pages &&
15751801 (pfn & (pageblock_nr_pages - 1)) == 0) {
15761802 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1577
- __free_pages_boot_core(page, pageblock_order);
1803
+ __free_pages_core(page, pageblock_order);
15781804 return;
15791805 }
15801806
15811807 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15821808 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15831809 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1584
- __free_pages_boot_core(page, 0);
1810
+ __free_pages_core(page, 0);
15851811 }
15861812 }
15871813
@@ -1604,20 +1830,12 @@
16041830 *
16051831 * Then, we check if a current large page is valid by only checking the validity
16061832 * of the head pfn.
1607
- *
1608
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1609
- * within a node: a pfn is between start and end of a node, but does not belong
1610
- * to this memory node.
16111833 */
1612
-static inline bool __init
1613
-deferred_pfn_valid(int nid, unsigned long pfn,
1614
- struct mminit_pfnnid_cache *nid_init_state)
1834
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
16151835 {
16161836 if (!pfn_valid_within(pfn))
16171837 return false;
16181838 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1619
- return false;
1620
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
16211839 return false;
16221840 return true;
16231841 }
@@ -1626,21 +1844,19 @@
16261844 * Free pages to buddy allocator. Try to free aligned pages in
16271845 * pageblock_nr_pages sizes.
16281846 */
1629
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1847
+static void __init deferred_free_pages(unsigned long pfn,
16301848 unsigned long end_pfn)
16311849 {
1632
- struct mminit_pfnnid_cache nid_init_state = { };
16331850 unsigned long nr_pgmask = pageblock_nr_pages - 1;
16341851 unsigned long nr_free = 0;
16351852
16361853 for (; pfn < end_pfn; pfn++) {
1637
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1854
+ if (!deferred_pfn_valid(pfn)) {
16381855 deferred_free_range(pfn - nr_free, nr_free);
16391856 nr_free = 0;
16401857 } else if (!(pfn & nr_pgmask)) {
16411858 deferred_free_range(pfn - nr_free, nr_free);
16421859 nr_free = 1;
1643
- touch_nmi_watchdog();
16441860 } else {
16451861 nr_free++;
16461862 }
@@ -1654,22 +1870,22 @@
16541870 * by performing it only once every pageblock_nr_pages.
16551871 * Return number of pages initialized.
16561872 */
1657
-static unsigned long __init deferred_init_pages(int nid, int zid,
1873
+static unsigned long __init deferred_init_pages(struct zone *zone,
16581874 unsigned long pfn,
16591875 unsigned long end_pfn)
16601876 {
1661
- struct mminit_pfnnid_cache nid_init_state = { };
16621877 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1878
+ int nid = zone_to_nid(zone);
16631879 unsigned long nr_pages = 0;
1880
+ int zid = zone_idx(zone);
16641881 struct page *page = NULL;
16651882
16661883 for (; pfn < end_pfn; pfn++) {
1667
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1884
+ if (!deferred_pfn_valid(pfn)) {
16681885 page = NULL;
16691886 continue;
16701887 } else if (!page || !(pfn & nr_pgmask)) {
16711888 page = pfn_to_page(pfn);
1672
- touch_nmi_watchdog();
16731889 } else {
16741890 page++;
16751891 }
@@ -1679,18 +1895,127 @@
16791895 return (nr_pages);
16801896 }
16811897
1898
+/*
1899
+ * This function is meant to pre-load the iterator for the zone init.
1900
+ * Specifically it walks through the ranges until we are caught up to the
1901
+ * first_init_pfn value and exits there. If we never encounter the value we
1902
+ * return false indicating there are no valid ranges left.
1903
+ */
1904
+static bool __init
1905
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1906
+ unsigned long *spfn, unsigned long *epfn,
1907
+ unsigned long first_init_pfn)
1908
+{
1909
+ u64 j;
1910
+
1911
+ /*
1912
+ * Start out by walking through the ranges in this zone that have
1913
+ * already been initialized. We don't need to do anything with them
1914
+ * so we just need to flush them out of the system.
1915
+ */
1916
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1917
+ if (*epfn <= first_init_pfn)
1918
+ continue;
1919
+ if (*spfn < first_init_pfn)
1920
+ *spfn = first_init_pfn;
1921
+ *i = j;
1922
+ return true;
1923
+ }
1924
+
1925
+ return false;
1926
+}
1927
+
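The helper above only fast-forwards the zone's free-range iterator to the first range that still has work at or after first_init_pfn, clamping its start. A standalone model of that skip-and-clamp logic, with ranges and values invented for the demo:

#include <stdio.h>
#include <stdbool.h>

struct range { unsigned long spfn, epfn; };

/* Skip ranges that end before the resume point, clamp the first survivor. */
static bool first_range_from(const struct range *r, int nr,
                             unsigned long first_init_pfn,
                             unsigned long *spfn, unsigned long *epfn)
{
        for (int i = 0; i < nr; i++) {
                if (r[i].epfn <= first_init_pfn)
                        continue;
                *spfn = r[i].spfn > first_init_pfn ? r[i].spfn : first_init_pfn;
                *epfn = r[i].epfn;
                return true;
        }
        return false;
}

int main(void)
{
        struct range zone_ranges[] = { {0, 1000}, {2000, 3000}, {5000, 9000} };
        unsigned long spfn, epfn;

        if (first_range_from(zone_ranges, 3, 2500, &spfn, &epfn))
                printf("resume at [%lu, %lu)\n", spfn, epfn);  /* [2500, 3000) */
        return 0;
}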
1928
+/*
1929
+ * Initialize and free pages. We do it in two loops: first we initialize
1930
+ * struct page, then free to buddy allocator, because while we are
1931
+ * freeing pages we can access pages that are ahead (computing buddy
1932
+ * page in __free_one_page()).
1933
+ *
1934
+ * In order to try and keep some memory in the cache we have the loop
1935
+ * broken along max page order boundaries. This way we will not cause
1936
+ * any issues with the buddy page computation.
1937
+ */
1938
+static unsigned long __init
1939
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1940
+ unsigned long *end_pfn)
1941
+{
1942
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1943
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1944
+ unsigned long nr_pages = 0;
1945
+ u64 j = *i;
1946
+
1947
+ /* First we loop through and initialize the page values */
1948
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1949
+ unsigned long t;
1950
+
1951
+ if (mo_pfn <= *start_pfn)
1952
+ break;
1953
+
1954
+ t = min(mo_pfn, *end_pfn);
1955
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1956
+
1957
+ if (mo_pfn < *end_pfn) {
1958
+ *start_pfn = mo_pfn;
1959
+ break;
1960
+ }
1961
+ }
1962
+
1963
+ /* Reset values and now loop through freeing pages as needed */
1964
+ swap(j, *i);
1965
+
1966
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1967
+ unsigned long t;
1968
+
1969
+ if (mo_pfn <= spfn)
1970
+ break;
1971
+
1972
+ t = min(mo_pfn, epfn);
1973
+ deferred_free_pages(spfn, t);
1974
+
1975
+ if (mo_pfn <= epfn)
1976
+ break;
1977
+ }
1978
+
1979
+ return nr_pages;
1980
+}
1981
+
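The MAX_ORDER chunking described in the comment can be pictured with a small standalone program. It assumes a single contiguous free range and a made-up MAX_ORDER_NR_PAGES; the real deferred_init_maxorder() additionally walks multiple memblock ranges per chunk, and the two printf lines stand in for the init and free passes.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long spfn = 300, epfn = 5000;   /* one free memory range */

        while (spfn < epfn) {
                /* end of the MAX_ORDER block that contains spfn */
                unsigned long mo_pfn = ALIGN_UP(spfn + 1, MAX_ORDER_NR_PAGES);
                unsigned long t = min_ul(mo_pfn, epfn);

                printf("init [%lu, %lu)\n", spfn, t);  /* pass 1: init struct pages */
                printf("free [%lu, %lu)\n", spfn, t);  /* pass 2: hand to buddy */
                spfn = t;
        }
        return 0;
}

Freeing never runs ahead of initialization within a MAX_ORDER block, so __free_one_page() can safely look at buddy pages inside the block it is freeing.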
1982
+static void __init
1983
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
1984
+ void *arg)
1985
+{
1986
+ unsigned long spfn, epfn;
1987
+ struct zone *zone = arg;
1988
+ u64 i;
1989
+
1990
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
1991
+
1992
+ /*
1993
+ * Initialize and free pages in MAX_ORDER sized increments so that we
1994
+ * can avoid introducing any issues with the buddy allocator.
1995
+ */
1996
+ while (spfn < end_pfn) {
1997
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
1998
+ cond_resched();
1999
+ }
2000
+}
2001
+
2002
+/* An arch may override for more concurrency. */
2003
+__weak int __init
2004
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2005
+{
2006
+ return 1;
2007
+}
2008
+
16822009 /* Initialise remaining memory on a node */
16832010 static int __init deferred_init_memmap(void *data)
16842011 {
16852012 pg_data_t *pgdat = data;
1686
- int nid = pgdat->node_id;
1687
- unsigned long start = jiffies;
1688
- unsigned long nr_pages = 0;
1689
- unsigned long spfn, epfn, first_init_pfn, flags;
1690
- phys_addr_t spa, epa;
1691
- int zid;
1692
- struct zone *zone;
16932013 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2014
+ unsigned long spfn = 0, epfn = 0;
2015
+ unsigned long first_init_pfn, flags;
2016
+ unsigned long start = jiffies;
2017
+ struct zone *zone;
2018
+ int zid, max_threads;
16942019 u64 i;
16952020
16962021 /* Bind memory initialisation thread to a local node if possible */
....@@ -1723,30 +2048,36 @@
17232048 if (first_init_pfn < zone_end_pfn(zone))
17242049 break;
17252050 }
1726
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
17272051
1728
- /*
1729
- * Initialize and free pages. We do it in two loops: first we initialize
1730
- * struct page, than free to buddy allocator, because while we are
1731
- * freeing pages we can access pages that are ahead (computing buddy
1732
- * page in __free_one_page()).
1733
- */
1734
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1735
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1736
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1737
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1738
- }
1739
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1740
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1741
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1742
- deferred_free_pages(nid, zid, spfn, epfn);
1743
- }
2052
+ /* If the zone is empty somebody else may have cleared out the zone */
2053
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2054
+ first_init_pfn))
2055
+ goto zone_empty;
17442056
2057
+ max_threads = deferred_page_init_max_threads(cpumask);
2058
+
2059
+ while (spfn < epfn) {
2060
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2061
+ struct padata_mt_job job = {
2062
+ .thread_fn = deferred_init_memmap_chunk,
2063
+ .fn_arg = zone,
2064
+ .start = spfn,
2065
+ .size = epfn_align - spfn,
2066
+ .align = PAGES_PER_SECTION,
2067
+ .min_chunk = PAGES_PER_SECTION,
2068
+ .max_threads = max_threads,
2069
+ };
2070
+
2071
+ padata_do_multithreaded(&job);
2072
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2073
+ epfn_align);
2074
+ }
2075
+zone_empty:
17452076 /* Sanity check that the next zone really is unpopulated */
17462077 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17472078
1748
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1749
- jiffies_to_msecs(jiffies - start));
2079
+ pr_info("node %d deferred pages initialised in %ums\n",
2080
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17502081
17512082 pgdat_init_report_one_done();
17522083 return 0;
....@@ -1770,14 +2101,11 @@
17702101 static noinline bool __init
17712102 deferred_grow_zone(struct zone *zone, unsigned int order)
17722103 {
1773
- int zid = zone_idx(zone);
1774
- int nid = zone_to_nid(zone);
1775
- pg_data_t *pgdat = NODE_DATA(nid);
17762104 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1777
- unsigned long nr_pages = 0;
1778
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2105
+ pg_data_t *pgdat = zone->zone_pgdat;
17792106 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1780
- phys_addr_t spa, epa;
2107
+ unsigned long spfn, epfn, flags;
2108
+ unsigned long nr_pages = 0;
17812109 u64 i;
17822110
17832111 /* Only the last zone may have deferred pages */
....@@ -1795,38 +2123,37 @@
17952123 return true;
17962124 }
17972125
1798
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1799
-
1800
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2126
+ /* If the zone is empty somebody else may have cleared out the zone */
2127
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2128
+ first_deferred_pfn)) {
2129
+ pgdat->first_deferred_pfn = ULONG_MAX;
18012130 pgdat_resize_unlock(pgdat, &flags);
1802
- return false;
2131
+ /* Retry only once. */
2132
+ return first_deferred_pfn != ULONG_MAX;
18032133 }
18042134
1805
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1806
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1807
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2135
+ /*
2136
+ * Initialize and free pages in MAX_ORDER sized increments so
2137
+ * that we can avoid introducing any issues with the buddy
2138
+ * allocator.
2139
+ */
2140
+ while (spfn < epfn) {
2141
+ /* update our first deferred PFN for this section */
2142
+ first_deferred_pfn = spfn;
18082143
1809
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1810
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1811
- first_deferred_pfn = min(t, epfn);
1812
- nr_pages += deferred_init_pages(nid, zid, spfn,
1813
- first_deferred_pfn);
1814
- spfn = first_deferred_pfn;
1815
- }
2144
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2145
+ touch_nmi_watchdog();
18162146
2147
+ /* We should only stop along section boundaries */
2148
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2149
+ continue;
2150
+
2151
+ /* If our quota has been met we can stop here */
18172152 if (nr_pages >= nr_pages_needed)
18182153 break;
18192154 }
18202155
1821
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1822
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1823
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1824
- deferred_free_pages(nid, zid, spfn, epfn);
1825
-
1826
- if (first_deferred_pfn == epfn)
1827
- break;
1828
- }
1829
- pgdat->first_deferred_pfn = first_deferred_pfn;
2156
+ pgdat->first_deferred_pfn = spfn;
18302157 pgdat_resize_unlock(pgdat, &flags);
18312158
18322159 return nr_pages > 0;
....@@ -1849,9 +2176,9 @@
18492176 void __init page_alloc_init_late(void)
18502177 {
18512178 struct zone *zone;
2179
+ int nid;
18522180
18532181 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1854
- int nid;
18552182
18562183 /* There will be num_node_state(N_MEMORY) threads */
18572184 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1879,10 +2206,12 @@
18792206 /* Reinit limits that are based on free pages after the kernel is up */
18802207 files_maxfiles_init();
18812208 #endif
1882
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2209
+
18832210 /* Discard memblock private memory */
18842211 memblock_discard();
1885
-#endif
2212
+
2213
+ for_each_node_state(nid, N_MEMORY)
2214
+ shuffle_free_memory(NODE_DATA(nid));
18862215
18872216 for_each_populated_zone(zone)
18882217 set_zone_contiguous(zone);
....@@ -1916,6 +2245,7 @@
19162245 }
19172246
19182247 adjust_managed_page_count(page, pageblock_nr_pages);
2248
+ page_zone(page)->cma_pages += pageblock_nr_pages;
19192249 }
19202250 #endif
19212251
....@@ -1934,13 +2264,11 @@
19342264 * -- nyc
19352265 */
19362266 static inline void expand(struct zone *zone, struct page *page,
1937
- int low, int high, struct free_area *area,
1938
- int migratetype)
2267
+ int low, int high, int migratetype)
19392268 {
19402269 unsigned long size = 1 << high;
19412270
19422271 while (high > low) {
1943
- area--;
19442272 high--;
19452273 size >>= 1;
19462274 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1954,39 +2282,21 @@
19542282 if (set_page_guard(zone, &page[size], high, migratetype))
19552283 continue;
19562284
1957
- list_add(&page[size].lru, &area->free_list[migratetype]);
1958
- area->nr_free++;
1959
- set_page_order(&page[size], high);
2285
+ add_to_free_list(&page[size], zone, high, migratetype);
2286
+ set_buddy_order(&page[size], high);
19602287 }
19612288 }
19622289
19632290 static void check_new_page_bad(struct page *page)
19642291 {
1965
- const char *bad_reason = NULL;
1966
- unsigned long bad_flags = 0;
1967
-
1968
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1969
- bad_reason = "nonzero mapcount";
1970
- if (unlikely(page->mapping != NULL))
1971
- bad_reason = "non-NULL mapping";
1972
- if (unlikely(page_ref_count(page) != 0))
1973
- bad_reason = "nonzero _count";
19742292 if (unlikely(page->flags & __PG_HWPOISON)) {
1975
- bad_reason = "HWPoisoned (hardware-corrupted)";
1976
- bad_flags = __PG_HWPOISON;
19772293 /* Don't complain about hwpoisoned pages */
19782294 page_mapcount_reset(page); /* remove PageBuddy */
19792295 return;
19802296 }
1981
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1982
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1983
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1984
- }
1985
-#ifdef CONFIG_MEMCG
1986
- if (unlikely(page->mem_cgroup))
1987
- bad_reason = "page still charged to cgroup";
1988
-#endif
1989
- bad_page(page, bad_reason, bad_flags);
2297
+
2298
+ bad_page(page,
2299
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19902300 }
19912301
19922302 /*
....@@ -2002,30 +2312,40 @@
20022312 return 1;
20032313 }
20042314
2005
-static inline bool free_pages_prezeroed(void)
2006
-{
2007
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
2008
- page_poisoning_enabled()) || want_init_on_free();
2009
-}
2010
-
20112315 #ifdef CONFIG_DEBUG_VM
2012
-static bool check_pcp_refill(struct page *page)
2316
+/*
2317
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2318
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2319
+ * also checked when pcp lists are refilled from the free lists.
2320
+ */
2321
+static inline bool check_pcp_refill(struct page *page)
20132322 {
2014
- return false;
2323
+ if (debug_pagealloc_enabled_static())
2324
+ return check_new_page(page);
2325
+ else
2326
+ return false;
20152327 }
20162328
2017
-static bool check_new_pcp(struct page *page)
2329
+static inline bool check_new_pcp(struct page *page)
20182330 {
20192331 return check_new_page(page);
20202332 }
20212333 #else
2022
-static bool check_pcp_refill(struct page *page)
2334
+/*
2335
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2336
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2337
+ * enabled, they are also checked when being allocated from the pcp lists.
2338
+ */
2339
+static inline bool check_pcp_refill(struct page *page)
20232340 {
20242341 return check_new_page(page);
20252342 }
2026
-static bool check_new_pcp(struct page *page)
2343
+static inline bool check_new_pcp(struct page *page)
20272344 {
2028
- return false;
2345
+ if (debug_pagealloc_enabled_static())
2346
+ return check_new_page(page);
2347
+ else
2348
+ return false;
20292349 }
20302350 #endif /* CONFIG_DEBUG_VM */
20312351
....@@ -2049,9 +2369,31 @@
20492369 set_page_refcounted(page);
20502370
20512371 arch_alloc_page(page, order);
2052
- kernel_map_pages(page, 1 << order, 1);
2053
- kasan_alloc_pages(page, order);
2054
- kernel_poison_pages(page, 1 << order, 1);
2372
+ debug_pagealloc_map_pages(page, 1 << order);
2373
+
2374
+ /*
2375
+ * Page unpoisoning must happen before memory initialization.
2376
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2377
+ * allocations and the page unpoisoning code will complain.
2378
+ */
2379
+ kernel_unpoison_pages(page, 1 << order);
2380
+
2381
+ /*
2382
+ * As memory initialization might be integrated into KASAN,
2383
+ * kasan_alloc_pages and kernel_init_free_pages must be
2384
+ * kept together to avoid discrepancies in behavior.
2385
+ */
2386
+ if (kasan_has_integrated_init()) {
2387
+ kasan_alloc_pages(page, order, gfp_flags);
2388
+ } else {
2389
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2390
+
2391
+ kasan_unpoison_pages(page, order, init);
2392
+ if (init)
2393
+ kernel_init_free_pages(page, 1 << order,
2394
+ gfp_flags & __GFP_ZEROTAGS);
2395
+ }
2396
+
20552397 set_page_owner(page, order, gfp_flags);
20562398 }
20572399
....@@ -2059,9 +2401,6 @@
20592401 unsigned int alloc_flags)
20602402 {
20612403 post_alloc_hook(page, order, gfp_flags);
2062
-
2063
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2064
- kernel_init_free_pages(page, 1 << order);
20652404
20662405 if (order && (gfp_flags & __GFP_COMP))
20672406 prep_compound_page(page, order);
....@@ -2076,6 +2415,7 @@
20762415 set_page_pfmemalloc(page);
20772416 else
20782417 clear_page_pfmemalloc(page);
2418
+ trace_android_vh_test_clear_look_around_ref(page);
20792419 }
20802420
20812421 /*
....@@ -2093,14 +2433,11 @@
20932433 /* Find a page of the appropriate size in the preferred list */
20942434 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20952435 area = &(zone->free_area[current_order]);
2096
- page = list_first_entry_or_null(&area->free_list[migratetype],
2097
- struct page, lru);
2436
+ page = get_page_from_free_area(area, migratetype);
20982437 if (!page)
20992438 continue;
2100
- list_del(&page->lru);
2101
- rmv_page_order(page);
2102
- area->nr_free--;
2103
- expand(zone, page, order, current_order, area, migratetype);
2439
+ del_page_from_free_list(page, zone, current_order);
2440
+ expand(zone, page, order, current_order, migratetype);
21042441 set_pcppage_migratetype(page, migratetype);
21052442 return page;
21062443 }
....@@ -2113,10 +2450,10 @@
21132450 * This array describes the order lists are fallen back to when
21142451 * the free lists for the desirable migrate type are depleted
21152452 */
2116
-static int fallbacks[MIGRATE_TYPES][4] = {
2453
+static int fallbacks[MIGRATE_TYPES][3] = {
21172454 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2118
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
21192455 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2456
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
21202457 #ifdef CONFIG_CMA
21212458 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
21222459 #endif
....@@ -2137,7 +2474,7 @@
21372474 #endif
21382475
21392476 /*
2140
- * Move the free pages in a range to the free lists of the requested type.
2477
+ * Move the free pages in a range to the freelist tail of the requested type.
21412478 * Note that start_page and end_pages are not aligned on a pageblock
21422479 * boundary. If alignment is required, use move_freepages_block()
21432480 */
....@@ -2149,30 +2486,11 @@
21492486 unsigned int order;
21502487 int pages_moved = 0;
21512488
2152
-#ifndef CONFIG_HOLES_IN_ZONE
2153
- /*
2154
- * page_zone is not safe to call in this context when
2155
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2156
- * anyway as we check zone boundaries in move_freepages_block().
2157
- * Remove at a later date when no bug reports exist related to
2158
- * grouping pages by mobility
2159
- */
2160
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2161
- pfn_valid(page_to_pfn(end_page)) &&
2162
- page_zone(start_page) != page_zone(end_page));
2163
-#endif
2164
-
2165
- if (num_movable)
2166
- *num_movable = 0;
2167
-
21682489 for (page = start_page; page <= end_page;) {
21692490 if (!pfn_valid_within(page_to_pfn(page))) {
21702491 page++;
21712492 continue;
21722493 }
2173
-
2174
- /* Make sure we are not inadvertently changing nodes */
2175
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21762494
21772495 if (!PageBuddy(page)) {
21782496 /*
....@@ -2188,9 +2506,12 @@
21882506 continue;
21892507 }
21902508
2191
- order = page_order(page);
2192
- list_move(&page->lru,
2193
- &zone->free_area[order].free_list[migratetype]);
2509
+ /* Make sure we are not inadvertently changing nodes */
2510
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2511
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2512
+
2513
+ order = buddy_order(page);
2514
+ move_to_free_list(page, zone, order, migratetype);
21942515 page += 1 << order;
21952516 pages_moved += 1 << order;
21962517 }
....@@ -2203,6 +2524,9 @@
22032524 {
22042525 unsigned long start_pfn, end_pfn;
22052526 struct page *start_page, *end_page;
2527
+
2528
+ if (num_movable)
2529
+ *num_movable = 0;
22062530
22072531 start_pfn = page_to_pfn(page);
22082532 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2264,6 +2588,43 @@
22642588 return false;
22652589 }
22662590
2591
+static inline bool boost_watermark(struct zone *zone)
2592
+{
2593
+ unsigned long max_boost;
2594
+
2595
+ if (!watermark_boost_factor)
2596
+ return false;
2597
+ /*
2598
+ * Don't bother in zones that are unlikely to produce results.
2599
+ * On small machines, including kdump capture kernels running
2600
+ * in a small area, boosting the watermark can cause an out of
2601
+ * memory situation immediately.
2602
+ */
2603
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2604
+ return false;
2605
+
2606
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2607
+ watermark_boost_factor, 10000);
2608
+
2609
+ /*
2610
+ * high watermark may be uninitialised if fragmentation occurs
2611
+ * very early in boot so do not boost. We do not fall
2612
+ * through and boost by pageblock_nr_pages as failing
2613
+ * allocations that early means that reclaim is not going
2614
+ * to help and it may even be impossible to reclaim the
2615
+ * boosted watermark resulting in a hang.
2616
+ */
2617
+ if (!max_boost)
2618
+ return false;
2619
+
2620
+ max_boost = max(pageblock_nr_pages, max_boost);
2621
+
2622
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2623
+ max_boost);
2624
+
2625
+ return true;
2626
+}
2627
+
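boost_watermark() grows zone->watermark_boost by one pageblock per fragmentation event, capped at watermark_boost_factor (in 1/10000ths) of the high watermark. A standalone sketch of that arithmetic with invented zone numbers; mult_frac() is modelled here as an overflow-safe x * numer / denom, which is what the kernel helper computes:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* assumed pageblock size in pages */

static unsigned long mult_frac(unsigned long x, unsigned long numer,
                               unsigned long denom)
{
        return (x / denom) * numer + (x % denom) * numer / denom;
}

int main(void)
{
        unsigned long wmark_high = 12800;              /* pages, invented */
        unsigned long watermark_boost_factor = 15000;  /* assumed default: 150% */
        unsigned long boost = 0, max_boost;
        int event;

        max_boost = mult_frac(wmark_high, watermark_boost_factor, 10000);
        if (max_boost < PAGEBLOCK_NR_PAGES)
                max_boost = PAGEBLOCK_NR_PAGES;

        for (event = 0; event < 40; event++) {
                boost += PAGEBLOCK_NR_PAGES;            /* one fallback event */
                if (boost > max_boost)
                        boost = max_boost;
        }
        printf("max_boost=%lu, boost after 40 fallbacks=%lu\n", max_boost, boost);
        return 0;
}

With these numbers max_boost is 19200 pages (150% of the high watermark), so the boost saturates there no matter how many fallback events occur.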
22672628 /*
22682629 * This function implements actual steal behaviour. If order is large enough,
22692630 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2273,10 +2634,9 @@
22732634 * itself, so pages freed in the future will be put on the correct free list.
22742635 */
22752636 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2276
- int start_type, bool whole_block)
2637
+ unsigned int alloc_flags, int start_type, bool whole_block)
22772638 {
2278
- unsigned int current_order = page_order(page);
2279
- struct free_area *area;
2639
+ unsigned int current_order = buddy_order(page);
22802640 int free_pages, movable_pages, alike_pages;
22812641 int old_block_type;
22822642
....@@ -2294,6 +2654,14 @@
22942654 change_pageblock_range(page, current_order, start_type);
22952655 goto single_page;
22962656 }
2657
+
2658
+ /*
2659
+ * Boost watermarks to increase reclaim pressure to reduce the
2660
+ * likelihood of future fallbacks. Wake kswapd now as the node
2661
+ * may be balanced overall and kswapd will not wake naturally.
2662
+ */
2663
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2664
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22972665
22982666 /* We are not allowed to try stealing from the whole block */
22992667 if (!whole_block)
....@@ -2338,8 +2706,7 @@
23382706 return;
23392707
23402708 single_page:
2341
- area = &zone->free_area[current_order];
2342
- list_move(&page->lru, &area->free_list[start_type]);
2709
+ move_to_free_list(page, zone, current_order, start_type);
23432710 }
23442711
23452712 /*
....@@ -2363,7 +2730,7 @@
23632730 if (fallback_mt == MIGRATE_TYPES)
23642731 break;
23652732
2366
- if (list_empty(&area->free_list[fallback_mt]))
2733
+ if (free_area_empty(area, fallback_mt))
23672734 continue;
23682735
23692736 if (can_steal_fallback(order, migratetype))
....@@ -2393,7 +2760,7 @@
23932760 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23942761 * Check is race-prone but harmless.
23952762 */
2396
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2763
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23972764 if (zone->nr_reserved_highatomic >= max_managed)
23982765 return;
23992766
....@@ -2435,8 +2802,9 @@
24352802 struct page *page;
24362803 int order;
24372804 bool ret;
2805
+ bool skip_unreserve_highatomic = false;
24382806
2439
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2807
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24402808 ac->nodemask) {
24412809 /*
24422810 * Preserve at least one pageblock unless memory pressure
....@@ -2446,13 +2814,16 @@
24462814 pageblock_nr_pages)
24472815 continue;
24482816
2817
+ trace_android_vh_unreserve_highatomic_bypass(force, zone,
2818
+ &skip_unreserve_highatomic);
2819
+ if (skip_unreserve_highatomic)
2820
+ continue;
2821
+
24492822 spin_lock_irqsave(&zone->lock, flags);
24502823 for (order = 0; order < MAX_ORDER; order++) {
24512824 struct free_area *area = &(zone->free_area[order]);
24522825
2453
- page = list_first_entry_or_null(
2454
- &area->free_list[MIGRATE_HIGHATOMIC],
2455
- struct page, lru);
2826
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24562827 if (!page)
24572828 continue;
24582829
....@@ -2510,20 +2881,30 @@
25102881 * condition simpler.
25112882 */
25122883 static __always_inline bool
2513
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2884
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2885
+ unsigned int alloc_flags)
25142886 {
25152887 struct free_area *area;
25162888 int current_order;
2889
+ int min_order = order;
25172890 struct page *page;
25182891 int fallback_mt;
25192892 bool can_steal;
2893
+
2894
+ /*
2895
+ * Do not steal pages from freelists belonging to other pageblocks
2896
+ * i.e. orders < pageblock_order. If there are no local zones free,
2897
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2898
+ */
2899
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2900
+ min_order = pageblock_order;
25202901
25212902 /*
25222903 * Find the largest available free page in the other list. This roughly
25232904 * approximates finding the pageblock with the most free pages, which
25242905 * would be too costly to do exactly.
25252906 */
2526
- for (current_order = MAX_ORDER - 1; current_order >= order;
2907
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
25272908 --current_order) {
25282909 area = &(zone->free_area[current_order]);
25292910 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2565,10 +2946,10 @@
25652946 VM_BUG_ON(current_order == MAX_ORDER);
25662947
25672948 do_steal:
2568
- page = list_first_entry(&area->free_list[fallback_mt],
2569
- struct page, lru);
2949
+ page = get_page_from_free_area(area, fallback_mt);
25702950
2571
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2951
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2952
+ can_steal);
25722953
25732954 trace_mm_page_alloc_extfrag(page, order, current_order,
25742955 start_migratetype, fallback_mt);
....@@ -2582,14 +2963,16 @@
25822963 * Call me with the zone->lock already held.
25832964 */
25842965 static __always_inline struct page *
2585
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2966
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2967
+ unsigned int alloc_flags)
25862968 {
25872969 struct page *page;
25882970
25892971 retry:
25902972 page = __rmqueue_smallest(zone, order, migratetype);
25912973
2592
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2974
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2975
+ alloc_flags))
25932976 goto retry;
25942977
25952978 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2597,18 +2980,18 @@
25972980 }
25982981
25992982 #ifdef CONFIG_CMA
2600
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2983
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2984
+ int migratetype,
2985
+ unsigned int alloc_flags)
26012986 {
2602
- struct page *page = 0;
2603
-
2604
- if (IS_ENABLED(CONFIG_CMA))
2605
- if (!zone->cma_alloc)
2606
- page = __rmqueue_cma_fallback(zone, order);
2987
+ struct page *page = __rmqueue_cma_fallback(zone, order);
26072988 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
26082989 return page;
26092990 }
26102991 #else
2611
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2992
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2993
+ int migratetype,
2994
+ unsigned int alloc_flags)
26122995 {
26132996 return NULL;
26142997 }
....@@ -2621,7 +3004,7 @@
26213004 */
26223005 static int rmqueue_bulk(struct zone *zone, unsigned int order,
26233006 unsigned long count, struct list_head *list,
2624
- int migratetype)
3007
+ int migratetype, unsigned int alloc_flags)
26253008 {
26263009 int i, alloced = 0;
26273010
....@@ -2629,15 +3012,11 @@
26293012 for (i = 0; i < count; ++i) {
26303013 struct page *page;
26313014
2632
- /*
2633
- * If migrate type CMA is being requested only try to
2634
- * satisfy the request with CMA pages to try and increase
2635
- * CMA utlization.
2636
- */
26373015 if (is_migrate_cma(migratetype))
2638
- page = __rmqueue_cma(zone, order);
3016
+ page = __rmqueue_cma(zone, order, migratetype,
3017
+ alloc_flags);
26393018 else
2640
- page = __rmqueue(zone, order, migratetype);
3019
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26413020
26423021 if (unlikely(page == NULL))
26433022 break;
....@@ -2680,14 +3059,18 @@
26803059 */
26813060 static struct list_head *get_populated_pcp_list(struct zone *zone,
26823061 unsigned int order, struct per_cpu_pages *pcp,
2683
- int migratetype)
3062
+ int migratetype, unsigned int alloc_flags)
26843063 {
26853064 struct list_head *list = &pcp->lists[migratetype];
26863065
26873066 if (list_empty(list)) {
3067
+ trace_android_vh_rmqueue_bulk_bypass(order, pcp, migratetype, list);
3068
+ if (!list_empty(list))
3069
+ return list;
3070
+
26883071 pcp->count += rmqueue_bulk(zone, order,
26893072 pcp->batch, list,
2690
- migratetype);
3073
+ migratetype, alloc_flags);
26913074
26923075 if (list_empty(list))
26933076 list = NULL;
....@@ -2708,18 +3091,13 @@
27083091 {
27093092 unsigned long flags;
27103093 int to_drain, batch;
2711
- LIST_HEAD(dst);
27123094
2713
- local_lock_irqsave(pa_lock, flags);
3095
+ local_irq_save(flags);
27143096 batch = READ_ONCE(pcp->batch);
27153097 to_drain = min(pcp->count, batch);
27163098 if (to_drain > 0)
2717
- isolate_pcp_pages(to_drain, pcp, &dst);
2718
-
2719
- local_unlock_irqrestore(pa_lock, flags);
2720
-
2721
- if (to_drain > 0)
2722
- free_pcppages_bulk(zone, &dst, false);
3099
+ free_pcppages_bulk(zone, to_drain, pcp);
3100
+ local_irq_restore(flags);
27233101 }
27243102 #endif
27253103
....@@ -2735,21 +3113,14 @@
27353113 unsigned long flags;
27363114 struct per_cpu_pageset *pset;
27373115 struct per_cpu_pages *pcp;
2738
- LIST_HEAD(dst);
2739
- int count;
27403116
2741
- cpu_lock_irqsave(cpu, flags);
3117
+ local_irq_save(flags);
27423118 pset = per_cpu_ptr(zone->pageset, cpu);
27433119
27443120 pcp = &pset->pcp;
2745
- count = pcp->count;
2746
- if (count)
2747
- isolate_pcp_pages(count, pcp, &dst);
2748
-
2749
- cpu_unlock_irqrestore(cpu, flags);
2750
-
2751
- if (count)
2752
- free_pcppages_bulk(zone, &dst, false);
3121
+ if (pcp->count)
3122
+ free_pcppages_bulk(zone, pcp->count, pcp);
3123
+ local_irq_restore(flags);
27533124 }
27543125
27553126 /*
....@@ -2784,9 +3155,12 @@
27843155 drain_pages(cpu);
27853156 }
27863157
2787
-#ifndef CONFIG_PREEMPT_RT_BASE
27883158 static void drain_local_pages_wq(struct work_struct *work)
27893159 {
3160
+ struct pcpu_drain *drain;
3161
+
3162
+ drain = container_of(work, struct pcpu_drain, work);
3163
+
27903164 /*
27913165 * drain_all_pages doesn't use proper cpu hotplug protection so
27923166 * we can race with cpu offline when the WQ can move this from
....@@ -2795,10 +3169,9 @@
27953169 * a different one.
27963170 */
27973171 preempt_disable();
2798
- drain_local_pages(NULL);
3172
+ drain_local_pages(drain->zone);
27993173 preempt_enable();
28003174 }
2801
-#endif
28023175
28033176 /*
28043177 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
....@@ -2865,22 +3238,16 @@
28653238 else
28663239 cpumask_clear_cpu(cpu, &cpus_with_pcps);
28673240 }
2868
-#ifdef CONFIG_PREEMPT_RT_BASE
3241
+
28693242 for_each_cpu(cpu, &cpus_with_pcps) {
2870
- if (zone)
2871
- drain_pages_zone(cpu, zone);
2872
- else
2873
- drain_pages(cpu);
2874
- }
2875
-#else
2876
- for_each_cpu(cpu, &cpus_with_pcps) {
2877
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2878
- INIT_WORK(work, drain_local_pages_wq);
2879
- queue_work_on(cpu, mm_percpu_wq, work);
3243
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3244
+
3245
+ drain->zone = zone;
3246
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3247
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28803248 }
28813249 for_each_cpu(cpu, &cpus_with_pcps)
2882
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2883
-#endif
3250
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28843251
28853252 mutex_unlock(&pcpu_drain_mutex);
28863253 }
....@@ -2952,12 +3319,12 @@
29523319 return true;
29533320 }
29543321
2955
-static void free_unref_page_commit(struct page *page, unsigned long pfn,
2956
- struct list_head *dst)
3322
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
29573323 {
29583324 struct zone *zone = page_zone(page);
29593325 struct per_cpu_pages *pcp;
29603326 int migratetype;
3327
+ bool pcp_skip_cma_pages = false;
29613328
29623329 migratetype = get_pcppage_migratetype(page);
29633330 __count_vm_event(PGFREE);
....@@ -2970,8 +3337,12 @@
29703337 * excessively into the page allocator
29713338 */
29723339 if (migratetype >= MIGRATE_PCPTYPES) {
2973
- if (unlikely(is_migrate_isolate(migratetype))) {
2974
- free_one_page(zone, page, pfn, 0, migratetype);
3340
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3341
+ &pcp_skip_cma_pages);
3342
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3343
+ pcp_skip_cma_pages) {
3344
+ free_one_page(zone, page, pfn, 0, migratetype,
3345
+ FPI_NONE);
29753346 return;
29763347 }
29773348 migratetype = MIGRATE_MOVABLE;
....@@ -2982,8 +3353,7 @@
29823353 pcp->count++;
29833354 if (pcp->count >= pcp->high) {
29843355 unsigned long batch = READ_ONCE(pcp->batch);
2985
-
2986
- isolate_pcp_pages(batch, pcp, dst);
3356
+ free_pcppages_bulk(zone, batch, pcp);
29873357 }
29883358 }
29893359
....@@ -2994,17 +3364,20 @@
29943364 {
29953365 unsigned long flags;
29963366 unsigned long pfn = page_to_pfn(page);
2997
- struct zone *zone = page_zone(page);
2998
- LIST_HEAD(dst);
3367
+ int migratetype;
3368
+ bool skip_free_unref_page = false;
29993369
30003370 if (!free_unref_page_prepare(page, pfn))
30013371 return;
30023372
3003
- local_lock_irqsave(pa_lock, flags);
3004
- free_unref_page_commit(page, pfn, &dst);
3005
- local_unlock_irqrestore(pa_lock, flags);
3006
- if (!list_empty(&dst))
3007
- free_pcppages_bulk(zone, &dst, false);
3373
+ migratetype = get_pfnblock_migratetype(page, pfn);
3374
+ trace_android_vh_free_unref_page_bypass(page, 0, migratetype, &skip_free_unref_page);
3375
+ if (skip_free_unref_page)
3376
+ return;
3377
+
3378
+ local_irq_save(flags);
3379
+ free_unref_page_commit(page, pfn);
3380
+ local_irq_restore(flags);
30083381 }
30093382
30103383 /*
....@@ -3015,11 +3388,6 @@
30153388 struct page *page, *next;
30163389 unsigned long flags, pfn;
30173390 int batch_count = 0;
3018
- struct list_head dsts[__MAX_NR_ZONES];
3019
- int i;
3020
-
3021
- for (i = 0; i < __MAX_NR_ZONES; i++)
3022
- INIT_LIST_HEAD(&dsts[i]);
30233391
30243392 /* Prepare pages for freeing */
30253393 list_for_each_entry_safe(page, next, list, lru) {
....@@ -3029,42 +3397,25 @@
30293397 set_page_private(page, pfn);
30303398 }
30313399
3032
- local_lock_irqsave(pa_lock, flags);
3400
+ local_irq_save(flags);
30333401 list_for_each_entry_safe(page, next, list, lru) {
30343402 unsigned long pfn = page_private(page);
3035
- enum zone_type type;
30363403
30373404 set_page_private(page, 0);
30383405 trace_mm_page_free_batched(page);
3039
- type = page_zonenum(page);
3040
- free_unref_page_commit(page, pfn, &dsts[type]);
3406
+ free_unref_page_commit(page, pfn);
30413407
30423408 /*
30433409 * Guard against excessive IRQ disabled times when we get
30443410 * a large list of pages to free.
30453411 */
30463412 if (++batch_count == SWAP_CLUSTER_MAX) {
3047
- local_unlock_irqrestore(pa_lock, flags);
3413
+ local_irq_restore(flags);
30483414 batch_count = 0;
3049
- local_lock_irqsave(pa_lock, flags);
3415
+ local_irq_save(flags);
30503416 }
30513417 }
3052
- local_unlock_irqrestore(pa_lock, flags);
3053
-
3054
- for (i = 0; i < __MAX_NR_ZONES; ) {
3055
- struct page *page;
3056
- struct zone *zone;
3057
-
3058
- if (list_empty(&dsts[i])) {
3059
- i++;
3060
- continue;
3061
- }
3062
-
3063
- page = list_first_entry(&dsts[i], struct page, lru);
3064
- zone = page_zone(page);
3065
-
3066
- free_pcppages_bulk(zone, &dsts[i], true);
3067
- }
3418
+ local_irq_restore(flags);
30683419 }
30693420
30703421 /*
....@@ -3084,7 +3435,8 @@
30843435
30853436 for (i = 1; i < (1 << order); i++)
30863437 set_page_refcounted(page + i);
3087
- split_page_owner(page, order);
3438
+ split_page_owner(page, 1 << order);
3439
+ split_page_memcg(page, 1 << order);
30883440 }
30893441 EXPORT_SYMBOL_GPL(split_page);
30903442
....@@ -3106,7 +3458,7 @@
31063458 * watermark, because we already know our high-order page
31073459 * exists.
31083460 */
3109
- watermark = min_wmark_pages(zone) + (1UL << order);
3461
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
31103462 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
31113463 return 0;
31123464
....@@ -3114,9 +3466,8 @@
31143466 }
31153467
31163468 /* Remove page from free list */
3117
- list_del(&page->lru);
3118
- zone->free_area[order].nr_free--;
3119
- rmv_page_order(page);
3469
+
3470
+ del_page_from_free_list(page, zone, order);
31203471
31213472 /*
31223473 * Set the pageblock if the isolated page is at least half of a
....@@ -3135,6 +3486,27 @@
31353486
31363487
31373488 return 1UL << order;
3489
+}
3490
+
3491
+/**
3492
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3493
+ * @page: Page that was isolated
3494
+ * @order: Order of the isolated page
3495
+ * @mt: The page's pageblock's migratetype
3496
+ *
3497
+ * This function is meant to return a page pulled from the free lists via
3498
+ * __isolate_free_page back to the free lists they were pulled from.
3499
+ */
3500
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3501
+{
3502
+ struct zone *zone = page_zone(page);
3503
+
3504
+ /* zone lock should be held when this function is called */
3505
+ lockdep_assert_held(&zone->lock);
3506
+
3507
+ /* Return isolated page to tail of freelist. */
3508
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3509
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
31383510 }
31393511
31403512 /*
....@@ -3166,6 +3538,7 @@
31663538
31673539 /* Remove page from the per-cpu list, caller must protect the list */
31683540 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3541
+ unsigned int alloc_flags,
31693542 struct per_cpu_pages *pcp,
31703543 gfp_t gfp_flags)
31713544 {
....@@ -3175,9 +3548,9 @@
31753548 do {
31763549 /* First try to get CMA pages */
31773550 if (migratetype == MIGRATE_MOVABLE &&
3178
- gfp_flags & __GFP_CMA) {
3551
+ alloc_flags & ALLOC_CMA) {
31793552 list = get_populated_pcp_list(zone, 0, pcp,
3180
- get_cma_migrate_type());
3553
+ get_cma_migrate_type(), alloc_flags);
31813554 }
31823555
31833556 if (list == NULL) {
....@@ -3186,7 +3559,7 @@
31863559 * free CMA pages.
31873560 */
31883561 list = get_populated_pcp_list(zone, 0, pcp,
3189
- migratetype);
3562
+ migratetype, alloc_flags);
31903563 if (unlikely(list == NULL) ||
31913564 unlikely(list_empty(list)))
31923565 return NULL;
....@@ -3202,22 +3575,22 @@
32023575
32033576 /* Lock and remove page from the per-cpu list */
32043577 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3205
- struct zone *zone, unsigned int order,
3206
- gfp_t gfp_flags, int migratetype)
3578
+ struct zone *zone, gfp_t gfp_flags,
3579
+ int migratetype, unsigned int alloc_flags)
32073580 {
32083581 struct per_cpu_pages *pcp;
32093582 struct page *page;
32103583 unsigned long flags;
32113584
3212
- local_lock_irqsave(pa_lock, flags);
3585
+ local_irq_save(flags);
32133586 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3214
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3587
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
32153588 gfp_flags);
32163589 if (page) {
3217
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3590
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
32183591 zone_statistics(preferred_zone, zone);
32193592 }
3220
- local_unlock_irqrestore(pa_lock, flags);
3593
+ local_irq_restore(flags);
32213594 return page;
32223595 }
32233596
....@@ -3234,8 +3607,8 @@
32343607 struct page *page;
32353608
32363609 if (likely(order == 0)) {
3237
- page = rmqueue_pcplist(preferred_zone, zone, order,
3238
- gfp_flags, migratetype);
3610
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3611
+ migratetype, alloc_flags);
32393612 goto out;
32403613 }
32413614
....@@ -3244,25 +3617,31 @@
32443617 * allocate greater than order-1 page units with __GFP_NOFAIL.
32453618 */
32463619 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3247
- local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
3620
+ spin_lock_irqsave(&zone->lock, flags);
32483621
32493622 do {
32503623 page = NULL;
3251
-
3252
- if (alloc_flags & ALLOC_HARDER) {
3624
+ /*
3625
+ * order-0 request can reach here when the pcplist is skipped
3626
+ * due to non-CMA allocation context. HIGHATOMIC area is
3627
+ * reserved for high-order atomic allocation, so order-0
3628
+ * request should skip it.
3629
+ */
3630
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
32533631 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
32543632 if (page)
32553633 trace_mm_page_alloc_zone_locked(page, order, migratetype);
32563634 }
3257
-
3258
- if (!page && migratetype == MIGRATE_MOVABLE &&
3259
- gfp_flags & __GFP_CMA)
3260
- page = __rmqueue_cma(zone, order);
3261
-
3262
- if (!page)
3263
- page = __rmqueue(zone, order, migratetype);
3635
+ if (!page) {
3636
+ if (migratetype == MIGRATE_MOVABLE &&
3637
+ alloc_flags & ALLOC_CMA)
3638
+ page = __rmqueue_cma(zone, order, migratetype,
3639
+ alloc_flags);
3640
+ if (!page)
3641
+ page = __rmqueue(zone, order, migratetype,
3642
+ alloc_flags);
3643
+ }
32643644 } while (page && check_new_pages(page, order));
3265
-
32663645 spin_unlock(&zone->lock);
32673646 if (!page)
32683647 goto failed;
....@@ -3271,14 +3650,22 @@
32713650
32723651 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
32733652 zone_statistics(preferred_zone, zone);
3274
- local_unlock_irqrestore(pa_lock, flags);
3653
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3654
+ gfp_flags, alloc_flags, migratetype);
3655
+ local_irq_restore(flags);
32753656
32763657 out:
3658
+ /* Separate test+clear to avoid unnecessary atomics */
3659
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3660
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3661
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3662
+ }
3663
+
32773664 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
32783665 return page;
32793666
32803667 failed:
3281
- local_unlock_irqrestore(pa_lock, flags);
3668
+ local_irq_restore(flags);
32823669 return NULL;
32833670 }
32843671
....@@ -3303,7 +3690,7 @@
33033690 }
33043691 __setup("fail_page_alloc=", setup_fail_page_alloc);
33053692
3306
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3693
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
33073694 {
33083695 if (order < fail_page_alloc.min_order)
33093696 return false;
....@@ -3327,24 +3714,14 @@
33273714
33283715 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
33293716 &fail_page_alloc.attr);
3330
- if (IS_ERR(dir))
3331
- return PTR_ERR(dir);
33323717
3333
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3334
- &fail_page_alloc.ignore_gfp_reclaim))
3335
- goto fail;
3336
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3337
- &fail_page_alloc.ignore_gfp_highmem))
3338
- goto fail;
3339
- if (!debugfs_create_u32("min-order", mode, dir,
3340
- &fail_page_alloc.min_order))
3341
- goto fail;
3718
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3719
+ &fail_page_alloc.ignore_gfp_reclaim);
3720
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3721
+ &fail_page_alloc.ignore_gfp_highmem);
3722
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
33423723
33433724 return 0;
3344
-fail:
3345
- debugfs_remove_recursive(dir);
3346
-
3347
- return -ENOMEM;
33483725 }
33493726
33503727 late_initcall(fail_page_alloc_debugfs);
....@@ -3353,12 +3730,41 @@
33533730
33543731 #else /* CONFIG_FAIL_PAGE_ALLOC */
33553732
3356
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3733
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
33573734 {
33583735 return false;
33593736 }
33603737
33613738 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3739
+
3740
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+{
3742
+ return __should_fail_alloc_page(gfp_mask, order);
3743
+}
3744
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3745
+
3746
+static inline long __zone_watermark_unusable_free(struct zone *z,
3747
+ unsigned int order, unsigned int alloc_flags)
3748
+{
3749
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3750
+ long unusable_free = (1 << order) - 1;
3751
+
3752
+ /*
3753
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3754
+ * the high-atomic reserves. This will over-estimate the size of the
3755
+ * atomic reserve but it avoids a search.
3756
+ */
3757
+ if (likely(!alloc_harder))
3758
+ unusable_free += z->nr_reserved_highatomic;
3759
+
3760
+#ifdef CONFIG_CMA
3761
+ /* If allocation can't use CMA areas don't use free CMA pages */
3762
+ if (!(alloc_flags & ALLOC_CMA))
3763
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3764
+#endif
3765
+
3766
+ return unusable_free;
3767
+}
33623768
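__zone_watermark_unusable_free() feeds the watermark checks below: an allocation only passes if free pages minus the unusable portion stay above mark plus the lowmem reserve. A standalone worked example with invented numbers, assuming the caller has neither ALLOC_HARDER nor ALLOC_CMA:

#include <stdio.h>

int main(void)
{
        long free_pages = 5000;
        long order = 3;
        long highatomic_reserve = 256;  /* unusable without ALLOC_HARDER */
        long free_cma = 1024;           /* unusable without ALLOC_CMA */
        long mark = 2048, lowmem_reserve = 512;

        long unusable = (1L << order) - 1;  /* partial buddy of the request */
        unusable += highatomic_reserve;
        unusable += free_cma;

        long usable = free_pages - unusable;
        printf("usable=%ld, needed>%ld -> %s\n", usable, mark + lowmem_reserve,
               usable > mark + lowmem_reserve ? "ok" : "fail");
        return 0;
}

Here 5000 - (7 + 256 + 1024) = 3713 usable pages clear the 2560-page bar, so the zone would pass the order-0 check even though a third of its free memory is off limits to this caller.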
33633769 /*
33643770 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3367,7 +3773,7 @@
33673773 * to check in the allocation paths if no pages are free.
33683774 */
33693775 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3370
- int classzone_idx, unsigned int alloc_flags,
3776
+ int highest_zoneidx, unsigned int alloc_flags,
33713777 long free_pages)
33723778 {
33733779 long min = mark;
....@@ -3375,19 +3781,12 @@
33753781 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
33763782
33773783 /* free_pages may go negative - that's OK */
3378
- free_pages -= (1 << order) - 1;
3784
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
33793785
33803786 if (alloc_flags & ALLOC_HIGH)
33813787 min -= min / 2;
33823788
3383
- /*
3384
- * If the caller does not have rights to ALLOC_HARDER then subtract
3385
- * the high-atomic reserves. This will over-estimate the size of the
3386
- * atomic reserve but it avoids a search.
3387
- */
3388
- if (likely(!alloc_harder)) {
3389
- free_pages -= z->nr_reserved_highatomic;
3390
- } else {
3789
+ if (unlikely(alloc_harder)) {
33913790 /*
33923791 * OOM victims can try even harder than normal ALLOC_HARDER
33933792 * users on the grounds that it's definitely going to be in
....@@ -3400,19 +3799,12 @@
34003799 min -= min / 4;
34013800 }
34023801
3403
-
3404
-#ifdef CONFIG_CMA
3405
- /* If allocation can't use CMA areas don't use free CMA pages */
3406
- if (!(alloc_flags & ALLOC_CMA))
3407
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3408
-#endif
3409
-
34103802 /*
34113803 * Check watermarks for an order-0 allocation request. If these
34123804 * are not met, then a high-order request also cannot go ahead
34133805 * even if a suitable page happened to be free.
34143806 */
3415
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3807
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
34163808 return false;
34173809
34183810 /* If this is an order-0 request then the watermark is fine */
....@@ -3436,65 +3828,83 @@
34363828 if (mt == MIGRATE_CMA)
34373829 continue;
34383830 #endif
3439
- if (!list_empty(&area->free_list[mt]))
3831
+ if (!free_area_empty(area, mt))
34403832 return true;
34413833 }
34423834
34433835 #ifdef CONFIG_CMA
34443836 if ((alloc_flags & ALLOC_CMA) &&
3445
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3837
+ !free_area_empty(area, MIGRATE_CMA)) {
34463838 return true;
34473839 }
34483840 #endif
3449
- if (alloc_harder &&
3450
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3841
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
34513842 return true;
34523843 }
34533844 return false;
34543845 }
34553846
34563847 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3457
- int classzone_idx, unsigned int alloc_flags)
3848
+ int highest_zoneidx, unsigned int alloc_flags)
34583849 {
3459
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3850
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
34603851 zone_page_state(z, NR_FREE_PAGES));
34613852 }
3853
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
34623854
34633855 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3464
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3856
+ unsigned long mark, int highest_zoneidx,
3857
+ unsigned int alloc_flags, gfp_t gfp_mask)
34653858 {
3466
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3467
- long cma_pages = 0;
3859
+ long free_pages;
34683860
3469
-#ifdef CONFIG_CMA
3470
- /* If allocation can't use CMA areas don't use free CMA pages */
3471
- if (!(alloc_flags & ALLOC_CMA))
3472
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3473
-#endif
3861
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
34743862
34753863 /*
34763864 * Fast check for order-0 only. If this fails then the reserves
3477
- * need to be calculated. There is a corner case where the check
3478
- * passes but only the high-order atomic reserve are free. If
3479
- * the caller is !atomic then it'll uselessly search the free
3480
- * list. That corner case is then slower but it is harmless.
3865
+ * need to be calculated.
34813866 */
3482
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3483
- return true;
3867
+ if (!order) {
3868
+ long usable_free;
3869
+ long reserved;
34843870
3485
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3486
- free_pages);
3871
+ usable_free = free_pages;
3872
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3873
+
3874
+ /* reserved may over estimate high-atomic reserves. */
3875
+ usable_free -= min(usable_free, reserved);
3876
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3877
+ return true;
3878
+ }
3879
+
3880
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3881
+ free_pages))
3882
+ return true;
3883
+ /*
3884
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3885
+ * when checking the min watermark. The min watermark is the
3886
+ * point where boosting is ignored so that kswapd is woken up
3887
+ * when below the low watermark.
3888
+ */
3889
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3890
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3891
+ mark = z->_watermark[WMARK_MIN];
3892
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3893
+ alloc_flags, free_pages);
3894
+ }
3895
+
3896
+ return false;
34873897 }
34883898
34893899 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3490
- unsigned long mark, int classzone_idx)
3900
+ unsigned long mark, int highest_zoneidx)
34913901 {
34923902 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34933903
34943904 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34953905 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34963906
3497
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3907
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34983908 free_pages);
34993909 }
35003910 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3503,7 +3913,7 @@
35033913 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
35043914 {
35053915 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3506
- RECLAIM_DISTANCE;
3916
+ node_reclaim_distance;
35073917 }
35083918 #else /* CONFIG_NUMA */
35093919 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3513,6 +3923,61 @@
35133923 #endif /* CONFIG_NUMA */
35143924
35153925 /*
3926
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3927
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3928
+ * premature use of a lower zone may cause lowmem pressure problems that
3929
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3930
+ * probably too small. It only makes sense to spread allocations to avoid
3931
+ * fragmentation between the Normal and DMA32 zones.
3932
+ */
3933
+static inline unsigned int
3934
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3935
+{
3936
+ unsigned int alloc_flags;
3937
+
3938
+ /*
3939
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3940
+ * to save a branch.
3941
+ */
3942
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3943
+
3944
+#ifdef CONFIG_ZONE_DMA32
3945
+ if (!zone)
3946
+ return alloc_flags;
3947
+
3948
+ if (zone_idx(zone) != ZONE_NORMAL)
3949
+ return alloc_flags;
3950
+
3951
+ /*
3952
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3953
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3954
+ * on UMA that if Normal is populated then so is DMA32.
3955
+ */
3956
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3957
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
3958
+ return alloc_flags;
3959
+
3960
+ alloc_flags |= ALLOC_NOFRAGMENT;
3961
+#endif /* CONFIG_ZONE_DMA32 */
3962
+ return alloc_flags;
3963
+}
3964
+
3965
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
3966
+ unsigned int alloc_flags)
3967
+{
3968
+#ifdef CONFIG_CMA
3969
+ unsigned int pflags = current->flags;
3970
+
3971
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
3972
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
3973
+ gfp_mask & __GFP_CMA)
3974
+ alloc_flags |= ALLOC_CMA;
3975
+
3976
+#endif
3977
+ return alloc_flags;
3978
+}
3979
+
3980
+/*
35163981 * get_page_from_freelist goes through the zonelist trying to allocate
35173982 * a page.
35183983 */
....@@ -3520,16 +3985,20 @@
35203985 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
35213986 const struct alloc_context *ac)
35223987 {
3523
- struct zoneref *z = ac->preferred_zoneref;
3988
+ struct zoneref *z;
35243989 struct zone *zone;
35253990 struct pglist_data *last_pgdat_dirty_limit = NULL;
3991
+ bool no_fallback;
35263992
3993
+retry:
35273994 /*
35283995 * Scan zonelist, looking for a zone with enough free.
35293996 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
35303997 */
3531
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3532
- ac->nodemask) {
3998
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3999
+ z = ac->preferred_zoneref;
4000
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4001
+ ac->nodemask) {
35334002 struct page *page;
35344003 unsigned long mark;
35354004
....@@ -3566,9 +4035,26 @@
35664035 }
35674036 }
35684037
3569
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4038
+ if (no_fallback && nr_online_nodes > 1 &&
4039
+ zone != ac->preferred_zoneref->zone) {
4040
+ int local_nid;
4041
+
4042
+ /*
4043
+ * If moving to a remote node, retry but allow
4044
+ * fragmenting fallbacks. Locality is more important
4045
+ * than fragmentation avoidance.
4046
+ */
4047
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4048
+ if (zone_to_nid(zone) != local_nid) {
4049
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4050
+ goto retry;
4051
+ }
4052
+ }
4053
+
4054
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
35704055 if (!zone_watermark_fast(zone, order, mark,
3571
- ac_classzone_idx(ac), alloc_flags)) {
4056
+ ac->highest_zoneidx, alloc_flags,
4057
+ gfp_mask)) {
35724058 int ret;
35734059
35744060 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3601,7 +4087,7 @@
36014087 default:
36024088 /* did we reclaim enough */
36034089 if (zone_watermark_ok(zone, order, mark,
3604
- ac_classzone_idx(ac), alloc_flags))
4090
+ ac->highest_zoneidx, alloc_flags))
36054091 goto try_this_zone;
36064092
36074093 continue;
....@@ -3633,30 +4119,21 @@
36334119 }
36344120 }
36354121
4122
+ /*
4123
+ * It's possible on a UMA machine to get through all zones that are
4124
+ * fragmented. If avoiding fragmentation, reset and try again.
4125
+ */
4126
+ if (no_fallback) {
4127
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4128
+ goto retry;
4129
+ }
4130
+
36364131 return NULL;
3637
-}
3638
-
3639
-/*
3640
- * Large machines with many possible nodes should not always dump per-node
3641
- * meminfo in irq context.
3642
- */
3643
-static inline bool should_suppress_show_mem(void)
3644
-{
3645
- bool ret = false;
3646
-
3647
-#if NODES_SHIFT > 8
3648
- ret = in_interrupt();
3649
-#endif
3650
- return ret;
36514132 }
36524133
36534134 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
36544135 {
36554136 unsigned int filter = SHOW_MEM_FILTER_NODES;
3656
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3657
-
3658
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3659
- return;
36604137
36614138 /*
36624139 * This documents exceptions given to allocations in certain
....@@ -3677,22 +4154,23 @@
36774154 {
36784155 struct va_format vaf;
36794156 va_list args;
3680
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3681
- DEFAULT_RATELIMIT_BURST);
4157
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
36824158
3683
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4159
+ if ((gfp_mask & __GFP_NOWARN) ||
4160
+ !__ratelimit(&nopage_rs) ||
4161
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
36844162 return;
36854163
36864164 va_start(args, fmt);
36874165 vaf.fmt = fmt;
36884166 vaf.va = &args;
3689
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4167
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36904168 current->comm, &vaf, gfp_mask, &gfp_mask,
36914169 nodemask_pr_args(nodemask));
36924170 va_end(args);
36934171
36944172 cpuset_print_current_mems_allowed();
3695
-
4173
+ pr_cont("\n");
36964174 dump_stack();
36974175 warn_alloc_show_mem(gfp_mask, nodemask);
36984176 }
....@@ -3766,11 +4244,13 @@
37664244 * success so it is time to admit defeat. We will skip the OOM killer
37674245 * because it is very likely that the caller has a more reasonable
37684246 * fallback than shooting a random task.
4247
+ *
4248
+ * The OOM killer may not free memory on a specific node.
37694249 */
3770
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4250
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
37714251 goto out;
37724252 /* The OOM killer does not needlessly kill tasks for lowmem */
3773
- if (ac->high_zoneidx < ZONE_NORMAL)
4253
+ if (ac->highest_zoneidx < ZONE_NORMAL)
37744254 goto out;
37754255 if (pm_suspended_storage())
37764256 goto out;
....@@ -3783,10 +4263,6 @@
37834263 * out_of_memory). Once filesystems are ready to handle allocation
37844264 * failures more gracefully we should just bail out here.
37854265 */
3786
-
3787
- /* The OOM killer may not free memory on a specific node */
3788
- if (gfp_mask & __GFP_THISNODE)
3789
- goto out;
37904266
37914267 /* Exhausted what can be done so it's blame time */
37924268 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3818,7 +4294,7 @@
38184294 unsigned int alloc_flags, const struct alloc_context *ac,
38194295 enum compact_priority prio, enum compact_result *compact_result)
38204296 {
3821
- struct page *page;
4297
+ struct page *page = NULL;
38224298 unsigned long pflags;
38234299 unsigned int noreclaim_flag;
38244300
....@@ -3829,13 +4305,10 @@
38294305 noreclaim_flag = memalloc_noreclaim_save();
38304306
38314307 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3832
- prio);
4308
+ prio, &page);
38334309
38344310 memalloc_noreclaim_restore(noreclaim_flag);
38354311 psi_memstall_leave(&pflags);
3836
-
3837
- if (*compact_result <= COMPACT_INACTIVE)
3838
- return NULL;
38394312
38404313 /*
38414314 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3843,7 +4316,13 @@
38434316 */
38444317 count_vm_event(COMPACTSTALL);
38454318
3846
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4319
+ /* Prep a captured page if available */
4320
+ if (page)
4321
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4322
+
4323
+ /* Try to get a page from the freelist if available */
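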
4324
+ if (!page)
4325
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
38474326
38484327 if (page) {
38494328 struct zone *zone = page_zone(page);
....@@ -3892,14 +4371,22 @@
38924371 goto check_priority;
38934372
38944373 /*
3895
- * make sure the compaction wasn't deferred or didn't bail out early
3896
- * due to locks contention before we declare that we should give up.
3897
- * But do not retry if the given zonelist is not suitable for
3898
- * compaction.
4374
+ * compaction was skipped because there are not enough order-0 pages
4375
+ * to work with, so we retry only if it looks like reclaim can help.
38994376 */
3900
- if (compaction_withdrawn(compact_result)) {
4377
+ if (compaction_needs_reclaim(compact_result)) {
39014378 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
39024379 goto out;
4380
+ }
4381
+
4382
+ /*
4383
+ * make sure the compaction wasn't deferred or didn't bail out early
4384
+ * due to locks contention before we declare that we should give up.
4385
+ * But the next retry should use a higher priority if allowed, so
4386
+ * we don't just keep bailing out endlessly.
4387
+ */
4388
+ if (compaction_withdrawn(compact_result)) {
4389
+ goto check_priority;
39034390 }
39044391
39054392 /*
....@@ -3962,10 +4449,10 @@
39624449 * Let's give them a good hope and keep retrying while the order-0
39634450 * watermarks are OK.
39644451 */
3965
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3966
- ac->nodemask) {
4452
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4453
+ ac->highest_zoneidx, ac->nodemask) {
39674454 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3968
- ac_classzone_idx(ac), alloc_flags))
4455
+ ac->highest_zoneidx, alloc_flags))
39694456 return true;
39704457 }
39714458 return false;
....@@ -4023,33 +4510,50 @@
40234510 EXPORT_SYMBOL_GPL(fs_reclaim_release);
40244511 #endif
40254512
4513
+/*
4514
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4515
+ * have been rebuilt so allocation retries. Reader side does not lock and
4516
+ * retries the allocation if zonelist changes. Writer side is protected by the
4517
+ * embedded spin_lock.
4518
+ */
4519
+static DEFINE_SEQLOCK(zonelist_update_seq);
4520
+
4521
+static unsigned int zonelist_iter_begin(void)
4522
+{
4523
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4524
+ return read_seqbegin(&zonelist_update_seq);
4525
+
4526
+ return 0;
4527
+}
4528
+
4529
+static unsigned int check_retry_zonelist(unsigned int seq)
4530
+{
4531
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4532
+ return read_seqretry(&zonelist_update_seq, seq);
4533
+
4534
+ return seq;
4535
+}
4536
+
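
zonelist_iter_begin()/check_retry_zonelist() give the allocator a lock-free way to notice that memory hotplug rebuilt the zonelists underneath it: snapshot a sequence count before iterating, and restart the slow path if the count has moved by the time a decision is made. A self-contained userspace sketch of the same read-retry idea, using a C11 atomic sequence counter instead of the kernel's seqlock, is shown below (single-threaded here, purely to show the shape of the loop).

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;       /* even = stable, odd = writer in progress */
static int shared_data;

static unsigned read_begin(void)
{
    unsigned s;
    while ((s = atomic_load(&seq)) & 1)   /* wait out an in-flight writer */
        ;
    return s;
}

static int read_retry(unsigned s)
{
    return atomic_load(&seq) != s;        /* changed: the snapshot is stale */
}

static void writer_update(int v)
{
    atomic_fetch_add(&seq, 1);            /* mark update in progress (odd)  */
    shared_data = v;
    atomic_fetch_add(&seq, 1);            /* publish (even again)           */
}

int main(void)
{
    int copy;
    unsigned s;

    writer_update(42);
    do {
        s = read_begin();
        copy = shared_data;               /* stands in for the zonelist walk */
    } while (read_retry(s));              /* stale? redo the whole walk      */

    printf("read %d at sequence %u\n", copy, s);
    return 0;
}
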
40264537 /* Perform direct synchronous page reclaim */
4027
-static int
4538
+static unsigned long
40284539 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
40294540 const struct alloc_context *ac)
40304541 {
4031
- struct reclaim_state reclaim_state;
4032
- int progress;
40334542 unsigned int noreclaim_flag;
4034
- unsigned long pflags;
4543
+ unsigned long progress;
40354544
40364545 cond_resched();
40374546
40384547 /* We now go into synchronous reclaim */
40394548 cpuset_memory_pressure_bump();
4040
- psi_memstall_enter(&pflags);
40414549 fs_reclaim_acquire(gfp_mask);
40424550 noreclaim_flag = memalloc_noreclaim_save();
4043
- reclaim_state.reclaimed_slab = 0;
4044
- current->reclaim_state = &reclaim_state;
40454551
40464552 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
40474553 ac->nodemask);
40484554
4049
- current->reclaim_state = NULL;
40504555 memalloc_noreclaim_restore(noreclaim_flag);
40514556 fs_reclaim_release(gfp_mask);
4052
- psi_memstall_leave(&pflags);
40534557
40544558 cond_resched();
40554559
....@@ -4063,11 +4567,14 @@
40634567 unsigned long *did_some_progress)
40644568 {
40654569 struct page *page = NULL;
4570
+ unsigned long pflags;
40664571 bool drained = false;
4572
+ bool skip_pcp_drain = false;
40674573
4574
+ psi_memstall_enter(&pflags);
40684575 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
40694576 if (unlikely(!(*did_some_progress)))
4070
- return NULL;
4577
+ goto out;
40714578
40724579 retry:
40734580 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -4075,14 +4582,19 @@
40754582 /*
40764583 * If an allocation failed after direct reclaim, it could be because
40774584 * pages are pinned on the per-cpu lists or in high alloc reserves.
4078
- * Shrink them them and try again
4585
+ * Shrink them and try again
40794586 */
40804587 if (!page && !drained) {
40814588 unreserve_highatomic_pageblock(ac, false);
4082
- drain_all_pages(NULL);
4589
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4590
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4591
+ if (!skip_pcp_drain)
4592
+ drain_all_pages(NULL);
40834593 drained = true;
40844594 goto retry;
40854595 }
4596
+out:
4597
+ psi_memstall_leave(&pflags);
40864598
40874599 return page;
40884600 }
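
After reclaim reports progress but the allocation attempt still fails, the function above assumes the freed pages may be stranded in per-CPU lists or the high-atomic reserve, so it releases those and tries exactly once more. The sketch below shows the same "try, flush side caches once, try again" shape with a dummy cache; all names and numbers are invented.

#include <stdbool.h>
#include <stdio.h>

static int central_pool;      /* pages visible to the allocator  */
static int side_cache = 3;    /* pages stranded in per-CPU lists */

static bool try_alloc(int want)
{
    if (central_pool >= want) {
        central_pool -= want;
        return true;
    }
    return false;
}

static void drain_side_cache(void)
{
    central_pool += side_cache;   /* give the stranded pages back */
    side_cache = 0;
}

static bool alloc_after_reclaim(int want)
{
    bool drained = false;
retry:
    if (try_alloc(want))
        return true;
    if (!drained) {               /* flush the caches only once */
        drain_side_cache();
        drained = true;
        goto retry;
    }
    return false;                 /* genuinely out of memory */
}

int main(void)
{
    printf("%s\n", alloc_after_reclaim(2) ? "succeeded after drain" : "failed");
    return 0;
}
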
....@@ -4093,12 +4605,12 @@
40934605 struct zoneref *z;
40944606 struct zone *zone;
40954607 pg_data_t *last_pgdat = NULL;
4096
- enum zone_type high_zoneidx = ac->high_zoneidx;
4608
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40974609
4098
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4610
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40994611 ac->nodemask) {
41004612 if (last_pgdat != zone->zone_pgdat)
4101
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4613
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
41024614 last_pgdat = zone->zone_pgdat;
41034615 }
41044616 }
....@@ -4108,8 +4620,13 @@
41084620 {
41094621 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
41104622
4111
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4623
+ /*
4624
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4625
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4626
+ * to save two branches.
4627
+ */
41124628 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4629
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
41134630
41144631 /*
41154632 * The caller may dip into page reserves a bit more if the caller
....@@ -4117,7 +4634,8 @@
41174634 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
41184635 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
41194636 */
4120
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4637
+ alloc_flags |= (__force int)
4638
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
41214639
41224640 if (gfp_mask & __GFP_ATOMIC) {
41234641 /*
....@@ -4134,10 +4652,8 @@
41344652 } else if (unlikely(rt_task(current)) && !in_interrupt())
41354653 alloc_flags |= ALLOC_HARDER;
41364654
4137
-#ifdef CONFIG_CMA
4138
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4139
- alloc_flags |= ALLOC_CMA;
4140
-#endif
4655
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4656
+
41414657 return alloc_flags;
41424658 }
41434659
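
The BUILD_BUG_ON() pair above documents why the gfp bits can be copied straight into alloc_flags: the translation is only branch-free because __GFP_HIGH and __GFP_KSWAPD_RECLAIM occupy exactly the same bit positions as ALLOC_HIGH and ALLOC_KSWAPD. A userspace equivalent using _Static_assert, with made-up flag values, looks like this.

#include <stdio.h>

/* Hypothetical request-side and allocator-side flags; the whole point of the
 * trick is that each pair shares one bit value. */
#define REQ_HIGH        0x20u
#define REQ_WAKE_KSWAPD 0x400u
#define ALLOC_HIGH      0x20u
#define ALLOC_KSWAPD    0x400u

/* If anyone ever renumbers one side, the build breaks instead of the logic. */
_Static_assert(REQ_HIGH == ALLOC_HIGH, "REQ_HIGH must equal ALLOC_HIGH");
_Static_assert(REQ_WAKE_KSWAPD == ALLOC_KSWAPD,
               "REQ_WAKE_KSWAPD must equal ALLOC_KSWAPD");

static unsigned int to_alloc_flags(unsigned int req)
{
    /* No branches: both bits pass straight through the mask. */
    return req & (ALLOC_HIGH | ALLOC_KSWAPD);
}

int main(void)
{
    printf("%#x\n", to_alloc_flags(REQ_HIGH | REQ_WAKE_KSWAPD | 0x1u)); /* 0x420 */
    return 0;
}
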
....@@ -4200,6 +4716,7 @@
42004716 {
42014717 struct zone *zone;
42024718 struct zoneref *z;
4719
+ bool ret = false;
42034720
42044721 /*
42054722 * Costly allocations might have made a progress but this doesn't mean
....@@ -4226,8 +4743,8 @@
42264743 * request even if all reclaimable pages are considered then we are
42274744 * screwed and have to go OOM.
42284745 */
4229
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4230
- ac->nodemask) {
4746
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4747
+ ac->highest_zoneidx, ac->nodemask) {
42314748 unsigned long available;
42324749 unsigned long reclaimable;
42334750 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4241,7 +4758,7 @@
42414758 * reclaimable pages?
42424759 */
42434760 wmark = __zone_watermark_ok(zone, order, min_wmark,
4244
- ac_classzone_idx(ac), alloc_flags, available);
4761
+ ac->highest_zoneidx, alloc_flags, available);
42454762 trace_reclaim_retry_zone(z, order, reclaimable,
42464763 available, min_wmark, *no_progress_loops, wmark);
42474764 if (wmark) {
....@@ -4263,25 +4780,24 @@
42634780 }
42644781 }
42654782
4266
- /*
4267
- * Memory allocation/reclaim might be called from a WQ
4268
- * context and the current implementation of the WQ
4269
- * concurrency control doesn't recognize that
4270
- * a particular WQ is congested if the worker thread is
4271
- * looping without ever sleeping. Therefore we have to
4272
- * do a short sleep here rather than calling
4273
- * cond_resched().
4274
- */
4275
- if (current->flags & PF_WQ_WORKER)
4276
- schedule_timeout_uninterruptible(1);
4277
- else
4278
- cond_resched();
4279
-
4280
- return true;
4783
+ ret = true;
4784
+ goto out;
42814785 }
42824786 }
42834787
4284
- return false;
4788
+out:
4789
+ /*
4790
+ * Memory allocation/reclaim might be called from a WQ context and the
4791
+ * current implementation of the WQ concurrency control doesn't
4792
+ * recognize that a particular WQ is congested if the worker thread is
4793
+ * looping without ever sleeping. Therefore we have to do a short sleep
4794
+ * here rather than calling cond_resched().
4795
+ */
4796
+ if (current->flags & PF_WQ_WORKER)
4797
+ schedule_timeout_uninterruptible(1);
4798
+ else
4799
+ cond_resched();
4800
+ return ret;
42854801 }
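
The retry test above is deliberately optimistic: it asks whether the zone could pass its minimum watermark if every reclaimable page were actually freed, and only keeps looping while that holds (with the extra twist that a workqueue worker must sleep briefly instead of calling cond_resched(), so the workqueue concurrency control notices it is blocked). A simplified version of the watermark side of that test, ignoring lowmem reserves and highatomic pages, might look as follows; all numbers are made up.

#include <stdbool.h>
#include <stdio.h>

/* Optimistic feasibility check: pretend all reclaimable pages were freed and
 * see whether an order-'order' request would then clear the min watermark. */
static bool worth_retrying(unsigned long free, unsigned long reclaimable,
                           unsigned long min_wmark, unsigned int order)
{
    unsigned long available = free + reclaimable;
    unsigned long need = min_wmark + (1UL << order);
    return available > need;
}

int main(void)
{
    /* 1,500 free + 10,000 reclaimable pages easily cover min=4,096 plus 4. */
    printf("%d\n", worth_retrying(1500, 10000, 4096, 2));  /* 1: keep retrying */
    /* Nothing left to reclaim and free is below the mark: give up. */
    printf("%d\n", worth_retrying(1500, 0, 4096, 2));      /* 0 */
    return 0;
}
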
42864802
42874803 static inline bool
....@@ -4331,8 +4847,12 @@
43314847 int compaction_retries;
43324848 int no_progress_loops;
43334849 unsigned int cpuset_mems_cookie;
4850
+ unsigned int zonelist_iter_cookie;
43344851 int reserve_flags;
4852
+ unsigned long vh_record;
4853
+ bool should_alloc_retry = false;
43354854
4855
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
43364856 /*
43374857 * We also sanity check to catch abuse of atomic reserves being used by
43384858 * callers that are not in atomic context.
....@@ -4341,11 +4861,12 @@
43414861 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
43424862 gfp_mask &= ~__GFP_ATOMIC;
43434863
4344
-retry_cpuset:
4864
+restart:
43454865 compaction_retries = 0;
43464866 no_progress_loops = 0;
43474867 compact_priority = DEF_COMPACT_PRIORITY;
43484868 cpuset_mems_cookie = read_mems_allowed_begin();
4869
+ zonelist_iter_cookie = zonelist_iter_begin();
43494870
43504871 /*
43514872 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4361,11 +4882,11 @@
43614882 * could end up iterating over non-eligible zones endlessly.
43624883 */
43634884 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4364
- ac->high_zoneidx, ac->nodemask);
4885
+ ac->highest_zoneidx, ac->nodemask);
43654886 if (!ac->preferred_zoneref->zone)
43664887 goto nopage;
43674888
4368
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4889
+ if (alloc_flags & ALLOC_KSWAPD)
43694890 wake_all_kswapds(order, gfp_mask, ac);
43704891
43714892 /*
....@@ -4398,18 +4919,28 @@
43984919
43994920 /*
44004921 * Checks for costly allocations with __GFP_NORETRY, which
4401
- * includes THP page fault allocations
4922
+ * includes some THP page fault allocations
44024923 */
44034924 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
44044925 /*
4405
- * If compaction is deferred for high-order allocations,
4406
- * it is because sync compaction recently failed. If
4407
- * this is the case and the caller requested a THP
4408
- * allocation, we do not want to heavily disrupt the
4409
- * system, so we fail the allocation instead of entering
4410
- * direct reclaim.
4926
+ * If allocating entire pageblock(s) and compaction
4927
+ * failed because all zones are below low watermarks
4928
+ * or is prohibited because it recently failed at this
4929
+ * order, fail immediately unless the allocator has
4930
+ * requested compaction and reclaim retry.
4931
+ *
4932
+ * Reclaim is
4933
+ * - potentially very expensive because zones are far
4934
+ * below their low watermarks or this is part of very
4935
+ * bursty high order allocations,
4936
+ * - not guaranteed to help because isolate_freepages()
4937
+ * may not iterate over freed pages as part of its
4938
+ * linear scan, and
4939
+ * - unlikely to make entire pageblocks free on its
4940
+ * own.
44114941 */
4412
- if (compact_result == COMPACT_DEFERRED)
4942
+ if (compact_result == COMPACT_SKIPPED ||
4943
+ compact_result == COMPACT_DEFERRED)
44134944 goto nopage;
44144945
44154946 /*
....@@ -4423,12 +4954,12 @@
44234954
44244955 retry:
44254956 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4426
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4957
+ if (alloc_flags & ALLOC_KSWAPD)
44274958 wake_all_kswapds(order, gfp_mask, ac);
44284959
44294960 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
44304961 if (reserve_flags)
4431
- alloc_flags = reserve_flags;
4962
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
44324963
44334964 /*
44344965 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4438,7 +4969,7 @@
44384969 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
44394970 ac->nodemask = NULL;
44404971 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4441
- ac->high_zoneidx, ac->nodemask);
4972
+ ac->highest_zoneidx, ac->nodemask);
44424973 }
44434974
44444975 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4453,6 +4984,18 @@
44534984 /* Avoid recursion of direct reclaim */
44544985 if (current->flags & PF_MEMALLOC)
44554986 goto nopage;
4987
+
4988
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
4989
+ alloc_flags, ac->migratetype, &page);
4990
+
4991
+ if (page)
4992
+ goto got_pg;
4993
+
4994
+ trace_android_vh_should_alloc_pages_retry(gfp_mask, order,
4995
+ &alloc_flags, ac->migratetype, ac->preferred_zoneref->zone,
4996
+ &page, &should_alloc_retry);
4997
+ if (should_alloc_retry)
4998
+ goto retry;
44564999
44575000 /* Try direct reclaim and then allocating */
44585001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4494,9 +5037,13 @@
44945037 goto retry;
44955038
44965039
4497
- /* Deal with possible cpuset update races before we start OOM killing */
4498
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4499
- goto retry_cpuset;
5040
+ /*
5041
+ * Deal with possible cpuset update races or zonelist updates to avoid
5042
+ * an unnecessary OOM kill.
5043
+ */
5044
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5045
+ check_retry_zonelist(zonelist_iter_cookie))
5046
+ goto restart;
45005047
45015048 /* Reclaim has failed us, start killing things */
45025049 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4505,7 +5052,7 @@
45055052
45065053 /* Avoid allocations with no watermarks from looping endlessly */
45075054 if (tsk_is_oom_victim(current) &&
4508
- (alloc_flags == ALLOC_OOM ||
5055
+ (alloc_flags & ALLOC_OOM ||
45095056 (gfp_mask & __GFP_NOMEMALLOC)))
45105057 goto nopage;
45115058
....@@ -4516,9 +5063,13 @@
45165063 }
45175064
45185065 nopage:
4519
- /* Deal with possible cpuset update races before we fail */
4520
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4521
- goto retry_cpuset;
5066
+ /*
5067
+ * Deal with possible cpuset update races or zonelist updates to avoid
5068
+ * an unnecessary OOM kill.
5069
+ */
5070
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5071
+ check_retry_zonelist(zonelist_iter_cookie))
5072
+ goto restart;
45225073
45235074 /*
45245075 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4561,9 +5112,15 @@
45615112 goto retry;
45625113 }
45635114 fail:
5115
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5116
+ alloc_flags, ac->migratetype, &page);
5117
+ if (page)
5118
+ goto got_pg;
5119
+
45645120 warn_alloc(gfp_mask, ac->nodemask,
45655121 "page allocation failure: order:%u", order);
45665122 got_pg:
5123
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
45675124 return page;
45685125 }
45695126
....@@ -4572,14 +5129,18 @@
45725129 struct alloc_context *ac, gfp_t *alloc_mask,
45735130 unsigned int *alloc_flags)
45745131 {
4575
- ac->high_zoneidx = gfp_zone(gfp_mask);
5132
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
45765133 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
45775134 ac->nodemask = nodemask;
4578
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5135
+ ac->migratetype = gfp_migratetype(gfp_mask);
45795136
45805137 if (cpusets_enabled()) {
45815138 *alloc_mask |= __GFP_HARDWALL;
4582
- if (!ac->nodemask)
5139
+ /*
5140
+ * When we are in the interrupt context, it is irrelevant
5141
+ * to the current task context. It means that any node is OK.
5142
+ */
5143
+ if (!in_interrupt() && !ac->nodemask)
45835144 ac->nodemask = &cpuset_current_mems_allowed;
45845145 else
45855146 *alloc_flags |= ALLOC_CPUSET;
....@@ -4593,15 +5154,8 @@
45935154 if (should_fail_alloc_page(gfp_mask, order))
45945155 return false;
45955156
4596
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4597
- *alloc_flags |= ALLOC_CMA;
5157
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45985158
4599
- return true;
4600
-}
4601
-
4602
-/* Determine whether to spread dirty pages and what the first usable zone */
4603
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4604
-{
46055159 /* Dirty zone balancing only done in the fast path */
46065160 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
46075161
....@@ -4611,7 +5165,9 @@
46115165 * may get reset for allocations that ignore memory policies.
46125166 */
46135167 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4614
- ac->high_zoneidx, ac->nodemask);
5168
+ ac->highest_zoneidx, ac->nodemask);
5169
+
5170
+ return true;
46155171 }
46165172
46175173 /*
....@@ -4640,7 +5196,11 @@
46405196 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
46415197 return NULL;
46425198
4643
- finalise_ac(gfp_mask, &ac);
5199
+ /*
5200
+ * Forbid the first pass from falling back to types that fragment
5201
+ * memory until all local zones are considered.
5202
+ */
5203
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
46445204
46455205 /* First allocation attempt */
46465206 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4660,14 +5220,13 @@
46605220 * Restore the original nodemask if it was potentially replaced with
46615221 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
46625222 */
4663
- if (unlikely(ac.nodemask != nodemask))
4664
- ac.nodemask = nodemask;
5223
+ ac.nodemask = nodemask;
46655224
46665225 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
46675226
46685227 out:
46695228 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4670
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5229
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
46715230 __free_pages(page, order);
46725231 page = NULL;
46735232 }
....@@ -4705,13 +5264,20 @@
47055264 if (order == 0) /* Via pcp? */
47065265 free_unref_page(page);
47075266 else
4708
- __free_pages_ok(page, order);
5267
+ __free_pages_ok(page, order, FPI_NONE);
47095268 }
47105269
47115270 void __free_pages(struct page *page, unsigned int order)
47125271 {
5272
+ /* get PageHead before we drop reference */
5273
+ int head = PageHead(page);
5274
+
5275
+ trace_android_vh_free_pages(page, order);
47135276 if (put_page_testzero(page))
47145277 free_the_page(page, order);
5278
+ else if (!head)
5279
+ while (order-- > 0)
5280
+ free_the_page(page + (1 << order), order);
47155281 }
47165282 EXPORT_SYMBOL(__free_pages);
47175283
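
The new else-branch matters for non-compound higher-order pages: if put_page_testzero() did not drop the last reference, someone else still holds page 0, but the rest of the block would otherwise leak, so it is handed back as progressively smaller power-of-two chunks. A standalone sketch that just prints which sub-blocks that loop would free:

#include <stdio.h>

/* For an order-N block whose head page is still referenced, release every
 * page except page 0: one chunk of order N-1, then N-2, ... down to order 0,
 * mirroring the "while (order-- > 0) free(page + (1 << order), order)" loop. */
static void free_tail_chunks(unsigned long base_pfn, unsigned int order)
{
    while (order-- > 0)
        printf("free pfn %lu as an order-%u chunk (%u pages)\n",
               base_pfn + (1UL << order), order, 1U << order);
}

int main(void)
{
    /* order-3 block at pfn 1024: frees 1028/order-2, 1026/order-1, 1025/order-0 */
    free_tail_chunks(1024, 3);
    return 0;
}
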
....@@ -4816,6 +5382,18 @@
48165382 /* reset page count bias and offset to start of new frag */
48175383 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
48185384 offset = size - fragsz;
5385
+ if (unlikely(offset < 0)) {
5386
+ /*
5387
+ * The caller is trying to allocate a fragment
5388
+ * with fragsz > PAGE_SIZE but the cache isn't big
5389
+ * enough to satisfy the request, this may
5390
+ * happen in low memory conditions.
5391
+ * We don't release the cache page because
5392
+ * it could make memory pressure worse
5393
+ * so we simply return NULL here.
5394
+ */
5395
+ return NULL;
5396
+ }
48195397 }
48205398
48215399 nc->pagecnt_bias--;
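
page_frag_alloc() hands out fragments by walking an offset downwards from the end of a cached page, so the new check simply refuses a request larger than what is left instead of letting the offset go negative under memory pressure. A tiny bump-down allocator with the same underflow guard, using an invented cache size:

#include <stdio.h>

#define CACHE_SIZE 4096L

/* Minimal bump-down fragment cache: fragments are carved from the end of a
 * fixed buffer, and a request bigger than the remaining space is refused
 * rather than letting the offset underflow. */
struct frag_cache {
    char buf[CACHE_SIZE];
    long offset;             /* the first 'offset' bytes are still unused */
};

static void *frag_alloc(struct frag_cache *c, long fragsz)
{
    long off = c->offset - fragsz;
    if (off < 0)
        return NULL;         /* caller asked for more than is left */
    c->offset = off;
    return c->buf + off;
}

int main(void)
{
    struct frag_cache c = { .offset = CACHE_SIZE };

    printf("1000B: %s\n", frag_alloc(&c, 1000) ? "ok" : "refused");
    printf("3000B: %s\n", frag_alloc(&c, 3000) ? "ok" : "refused");
    printf(" 200B: %s\n", frag_alloc(&c, 200)  ? "ok" : "refused"); /* only 96 left */
    return 0;
}
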
....@@ -4856,7 +5434,7 @@
48565434 /**
48575435 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
48585436 * @size: the number of bytes to allocate
4859
- * @gfp_mask: GFP flags for the allocation
5437
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48605438 *
48615439 * This function is similar to alloc_pages(), except that it allocates the
48625440 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4865,11 +5443,16 @@
48655443 * This function is also limited by MAX_ORDER.
48665444 *
48675445 * Memory allocated by this function must be released by free_pages_exact().
5446
+ *
5447
+ * Return: pointer to the allocated area or %NULL in case of error.
48685448 */
48695449 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
48705450 {
48715451 unsigned int order = get_order(size);
48725452 unsigned long addr;
5453
+
5454
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5455
+ gfp_mask &= ~__GFP_COMP;
48735456
48745457 addr = __get_free_pages(gfp_mask, order);
48755458 return make_alloc_exact(addr, order, size);
....@@ -4881,15 +5464,22 @@
48815464 * pages on a node.
48825465 * @nid: the preferred node ID where memory should be allocated
48835466 * @size: the number of bytes to allocate
4884
- * @gfp_mask: GFP flags for the allocation
5467
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48855468 *
48865469 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48875470 * back.
5471
+ *
5472
+ * Return: pointer to the allocated area or %NULL in case of error.
48885473 */
48895474 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48905475 {
48915476 unsigned int order = get_order(size);
4892
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5477
+ struct page *p;
5478
+
5479
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5480
+ gfp_mask &= ~__GFP_COMP;
5481
+
5482
+ p = alloc_pages_node(nid, gfp_mask, order);
48935483 if (!p)
48945484 return NULL;
48955485 return make_alloc_exact((unsigned long)page_address(p), order, size);
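
Both helpers work the same way: round the byte count up to whole pages, grab the smallest power-of-two block that covers them, and keep only the exact number requested, which is presumably also why __GFP_COMP is rejected, since the unused tail pages get returned individually. The arithmetic, with a hypothetical 4 KiB page size:

#include <stdio.h>

#define PAGE_SZ 4096UL   /* hypothetical page size for this sketch */

/* Smallest order whose 2^order pages cover the request (like get_order()). */
static unsigned int order_for(unsigned long pages)
{
    unsigned int order = 0;
    while ((1UL << order) < pages)
        order++;
    return order;
}

int main(void)
{
    unsigned long bytes = 5 * PAGE_SZ + 123;                 /* a bit over 5 pages */
    unsigned long pages = (bytes + PAGE_SZ - 1) / PAGE_SZ;   /* 6 pages needed     */
    unsigned int order  = order_for(pages);                  /* order-3 = 8 pages  */

    printf("allocate 2^%u = %lu pages, keep %lu, free the %lu tail pages\n",
           order, 1UL << order, pages, (1UL << order) - pages);
    return 0;
}
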
....@@ -4918,11 +5508,13 @@
49185508 * nr_free_zone_pages - count number of pages beyond high watermark
49195509 * @offset: The zone index of the highest zone
49205510 *
4921
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5511
+ * nr_free_zone_pages() counts the number of pages which are beyond the
49225512 * high watermark within all zones at or below a given zone index. For each
49235513 * zone, the number of pages is calculated as:
49245514 *
49255515 * nr_free_zone_pages = managed_pages - high_pages
5516
+ *
5517
+ * Return: number of pages beyond high watermark.
49265518 */
49275519 static unsigned long nr_free_zone_pages(int offset)
49285520 {
....@@ -4935,7 +5527,7 @@
49355527 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
49365528
49375529 for_each_zone_zonelist(zone, z, zonelist, offset) {
4938
- unsigned long size = zone->managed_pages;
5530
+ unsigned long size = zone_managed_pages(zone);
49395531 unsigned long high = high_wmark_pages(zone);
49405532 if (size > high)
49415533 sum += size - high;
....@@ -4949,23 +5541,15 @@
49495541 *
49505542 * nr_free_buffer_pages() counts the number of pages which are beyond the high
49515543 * watermark within ZONE_DMA and ZONE_NORMAL.
5544
+ *
5545
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5546
+ * ZONE_NORMAL.
49525547 */
49535548 unsigned long nr_free_buffer_pages(void)
49545549 {
49555550 return nr_free_zone_pages(gfp_zone(GFP_USER));
49565551 }
49575552 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4958
-
4959
-/**
4960
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4961
- *
4962
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4963
- * high watermark within all zones.
4964
- */
4965
-unsigned long nr_free_pagecache_pages(void)
4966
-{
4967
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4968
-}
49695553
49705554 static inline void show_node(struct zone *zone)
49715555 {
....@@ -4987,7 +5571,7 @@
49875571 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49885572
49895573 for_each_zone(zone)
4990
- wmark_low += zone->watermark[WMARK_LOW];
5574
+ wmark_low += low_wmark_pages(zone);
49915575
49925576 /*
49935577 * Estimate the amount of memory available for userspace allocations,
....@@ -5009,8 +5593,8 @@
50095593 * items that are in use, and cannot be freed. Cap this estimate at the
50105594 * low watermark.
50115595 */
5012
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
5013
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5596
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5597
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
50145598 available += reclaimable - min(reclaimable / 2, wmark_low);
50155599
50165600 if (available < 0)
....@@ -5021,11 +5605,11 @@
50215605
50225606 void si_meminfo(struct sysinfo *val)
50235607 {
5024
- val->totalram = totalram_pages;
5608
+ val->totalram = totalram_pages();
50255609 val->sharedram = global_node_page_state(NR_SHMEM);
50265610 val->freeram = global_zone_page_state(NR_FREE_PAGES);
50275611 val->bufferram = nr_blockdev_pages();
5028
- val->totalhigh = totalhigh_pages;
5612
+ val->totalhigh = totalhigh_pages();
50295613 val->freehigh = nr_free_highpages();
50305614 val->mem_unit = PAGE_SIZE;
50315615 }
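
Stepping back to the si_mem_available() arithmetic assembled a little above: each pool that could in principle be reclaimed is discounted by roughly half of itself, capped at the summed low watermarks, before being added to the free pages. A worked version of that discount with made-up numbers (only the reclaimable pool shown here):

#include <stdio.h>

/* Count 'pages' as available minus whatever we expect to be unusable:
 * half of the pool, capped at the low watermark. */
static unsigned long discounted(unsigned long pages, unsigned long wmark_low)
{
    unsigned long unusable = pages / 2 < wmark_low ? pages / 2 : wmark_low;
    return pages - unusable;
}

int main(void)
{
    /* All figures are invented and expressed in pages. */
    unsigned long free = 200000, totalreserve = 16000, wmark_low = 12000;
    unsigned long reclaimable = 50000;    /* reclaimable slab + misc */

    long available = (long)(free - totalreserve)
                   + (long)discounted(reclaimable, wmark_low);
    if (available < 0)
        available = 0;

    printf("estimated available: %ld pages\n", available);
    return 0;
}
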
....@@ -5042,7 +5626,7 @@
50425626 pg_data_t *pgdat = NODE_DATA(nid);
50435627
50445628 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5045
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5629
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
50465630 val->totalram = managed_pages;
50475631 val->sharedram = node_page_state(pgdat, NR_SHMEM);
50485632 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -5051,7 +5635,7 @@
50515635 struct zone *zone = &pgdat->node_zones[zone_type];
50525636
50535637 if (is_highmem(zone)) {
5054
- managed_highpages += zone->managed_pages;
5638
+ managed_highpages += zone_managed_pages(zone);
50555639 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
50565640 }
50575641 }
....@@ -5140,7 +5724,7 @@
51405724
51415725 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
51425726 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5143
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5727
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
51445728 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
51455729 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
51465730 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5153,9 +5737,8 @@
51535737 global_node_page_state(NR_UNEVICTABLE),
51545738 global_node_page_state(NR_FILE_DIRTY),
51555739 global_node_page_state(NR_WRITEBACK),
5156
- global_node_page_state(NR_UNSTABLE_NFS),
5157
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5158
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5740
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5741
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
51595742 global_node_page_state(NR_FILE_MAPPED),
51605743 global_node_page_state(NR_SHMEM),
51615744 global_zone_page_state(NR_PAGETABLE),
....@@ -5164,6 +5747,7 @@
51645747 free_pcp,
51655748 global_zone_page_state(NR_FREE_CMA_PAGES));
51665749
5750
+ trace_android_vh_show_mapcount_pages(NULL);
51675751 for_each_online_pgdat(pgdat) {
51685752 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
51695753 continue;
....@@ -5186,7 +5770,10 @@
51865770 " anon_thp: %lukB"
51875771 #endif
51885772 " writeback_tmp:%lukB"
5189
- " unstable:%lukB"
5773
+ " kernel_stack:%lukB"
5774
+#ifdef CONFIG_SHADOW_CALL_STACK
5775
+ " shadow_call_stack:%lukB"
5776
+#endif
51905777 " all_unreclaimable? %s"
51915778 "\n",
51925779 pgdat->node_id,
....@@ -5208,7 +5795,10 @@
52085795 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
52095796 #endif
52105797 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5211
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5798
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5799
+#ifdef CONFIG_SHADOW_CALL_STACK
5800
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5801
+#endif
52125802 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
52135803 "yes" : "no");
52145804 }
....@@ -5230,6 +5820,7 @@
52305820 " min:%lukB"
52315821 " low:%lukB"
52325822 " high:%lukB"
5823
+ " reserved_highatomic:%luKB"
52335824 " active_anon:%lukB"
52345825 " inactive_anon:%lukB"
52355826 " active_file:%lukB"
....@@ -5239,10 +5830,6 @@
52395830 " present:%lukB"
52405831 " managed:%lukB"
52415832 " mlocked:%lukB"
5242
- " kernel_stack:%lukB"
5243
-#ifdef CONFIG_SHADOW_CALL_STACK
5244
- " shadow_call_stack:%lukB"
5245
-#endif
52465833 " pagetables:%lukB"
52475834 " bounce:%lukB"
52485835 " free_pcp:%lukB"
....@@ -5254,6 +5841,7 @@
52545841 K(min_wmark_pages(zone)),
52555842 K(low_wmark_pages(zone)),
52565843 K(high_wmark_pages(zone)),
5844
+ K(zone->nr_reserved_highatomic),
52575845 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
52585846 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
52595847 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5261,12 +5849,8 @@
52615849 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
52625850 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
52635851 K(zone->present_pages),
5264
- K(zone->managed_pages),
5852
+ K(zone_managed_pages(zone)),
52655853 K(zone_page_state(zone, NR_MLOCK)),
5266
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5267
-#ifdef CONFIG_SHADOW_CALL_STACK
5268
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5269
-#endif
52705854 K(zone_page_state(zone, NR_PAGETABLE)),
52715855 K(zone_page_state(zone, NR_BOUNCE)),
52725856 K(free_pcp),
....@@ -5298,7 +5882,7 @@
52985882
52995883 types[order] = 0;
53005884 for (type = 0; type < MIGRATE_TYPES; type++) {
5301
- if (!list_empty(&area->free_list[type]))
5885
+ if (!free_area_empty(area, type))
53025886 types[order] |= 1 << type;
53035887 }
53045888 }
....@@ -5339,7 +5923,7 @@
53395923 do {
53405924 zone_type--;
53415925 zone = pgdat->node_zones + zone_type;
5342
- if (managed_zone(zone)) {
5926
+ if (populated_zone(zone)) {
53435927 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
53445928 check_highest_zone(zone_type);
53455929 }
....@@ -5365,36 +5949,17 @@
53655949 return 0;
53665950 }
53675951
5368
-static __init int setup_numa_zonelist_order(char *s)
5369
-{
5370
- if (!s)
5371
- return 0;
5372
-
5373
- return __parse_numa_zonelist_order(s);
5374
-}
5375
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5376
-
53775952 char numa_zonelist_order[] = "Node";
53785953
53795954 /*
53805955 * sysctl handler for numa_zonelist_order
53815956 */
53825957 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5383
- void __user *buffer, size_t *length,
5384
- loff_t *ppos)
5958
+ void *buffer, size_t *length, loff_t *ppos)
53855959 {
5386
- char *str;
5387
- int ret;
5388
-
5389
- if (!write)
5390
- return proc_dostring(table, write, buffer, length, ppos);
5391
- str = memdup_user_nul(buffer, 16);
5392
- if (IS_ERR(str))
5393
- return PTR_ERR(str);
5394
-
5395
- ret = __parse_numa_zonelist_order(str);
5396
- kfree(str);
5397
- return ret;
5960
+ if (write)
5961
+ return __parse_numa_zonelist_order(buffer);
5962
+ return proc_dostring(table, write, buffer, length, ppos);
53985963 }
53995964
54005965
....@@ -5413,14 +5978,14 @@
54135978 * from each node to each node in the system), and should also prefer nodes
54145979 * with no CPUs, since presumably they'll have very little allocation pressure
54155980 * on them otherwise.
5416
- * It returns -1 if no node is found.
5981
+ *
5982
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
54175983 */
54185984 static int find_next_best_node(int node, nodemask_t *used_node_mask)
54195985 {
54205986 int n, val;
54215987 int min_val = INT_MAX;
54225988 int best_node = NUMA_NO_NODE;
5423
- const struct cpumask *tmp = cpumask_of_node(0);
54245989
54255990 /* Use the local node if we haven't already */
54265991 if (!node_isset(node, *used_node_mask)) {
....@@ -5441,8 +6006,7 @@
54416006 val += (n < node);
54426007
54436008 /* Give preference to headless and unused nodes */
5444
- tmp = cpumask_of_node(n);
5445
- if (!cpumask_empty(tmp))
6009
+ if (!cpumask_empty(cpumask_of_node(n)))
54466010 val += PENALTY_FOR_NODE_WITH_CPUS;
54476011
54486012 /* Slight preference for less loaded node */
....@@ -5513,14 +6077,13 @@
55136077 {
55146078 static int node_order[MAX_NUMNODES];
55156079 int node, load, nr_nodes = 0;
5516
- nodemask_t used_mask;
6080
+ nodemask_t used_mask = NODE_MASK_NONE;
55176081 int local_node, prev_node;
55186082
55196083 /* NUMA-aware ordering of nodes */
55206084 local_node = pgdat->node_id;
55216085 load = nr_online_nodes;
55226086 prev_node = local_node;
5523
- nodes_clear(used_mask);
55246087
55256088 memset(node_order, 0, sizeof(node_order));
55266089 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5627,9 +6190,22 @@
56276190 int nid;
56286191 int __maybe_unused cpu;
56296192 pg_data_t *self = data;
5630
- static DEFINE_SPINLOCK(lock);
6193
+ unsigned long flags;
56316194
5632
- spin_lock(&lock);
6195
+ /*
6196
+ * Explicitly disable this CPU's interrupts before taking seqlock
6197
+ * to prevent any IRQ handler from calling into the page allocator
6198
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
6199
+ */
6200
+ local_irq_save(flags);
6201
+ /*
6202
+ * Explicitly disable this CPU's synchronous printk() before taking
6203
+ * seqlock to prevent any printk() from trying to hold port->lock, for
6204
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
6205
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
6206
+ */
6207
+ printk_deferred_enter();
6208
+ write_seqlock(&zonelist_update_seq);
56336209
56346210 #ifdef CONFIG_NUMA
56356211 memset(node_load, 0, sizeof(node_load));
....@@ -5662,7 +6238,9 @@
56626238 #endif
56636239 }
56646240
5665
- spin_unlock(&lock);
6241
+ write_sequnlock(&zonelist_update_seq);
6242
+ printk_deferred_exit();
6243
+ local_irq_restore(flags);
56666244 }
56676245
56686246 static noinline void __init
....@@ -5700,13 +6278,16 @@
57006278 */
57016279 void __ref build_all_zonelists(pg_data_t *pgdat)
57026280 {
6281
+ unsigned long vm_total_pages;
6282
+
57036283 if (system_state == SYSTEM_BOOTING) {
57046284 build_all_zonelists_init();
57056285 } else {
57066286 __build_all_zonelists(pgdat);
57076287 /* cpuset refresh routine should be here */
57086288 }
5709
- vm_total_pages = nr_free_pagecache_pages();
6289
+ /* Get the number of free pages beyond high watermark in all zones. */
6290
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
57106291 /*
57116292 * Disable grouping by mobility if the number of pages in the
57126293 * system is too low to allow the mechanism to work. It would be
....@@ -5719,7 +6300,7 @@
57196300 else
57206301 page_group_by_mobility_disabled = 0;
57216302
5722
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6303
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
57236304 nr_online_nodes,
57246305 page_group_by_mobility_disabled ? "off" : "on",
57256306 vm_total_pages);
....@@ -5728,81 +6309,148 @@
57286309 #endif
57296310 }
57306311
6312
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6313
+static bool __meminit
6314
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6315
+{
6316
+ static struct memblock_region *r;
6317
+
6318
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6319
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6320
+ for_each_mem_region(r) {
6321
+ if (*pfn < memblock_region_memory_end_pfn(r))
6322
+ break;
6323
+ }
6324
+ }
6325
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6326
+ memblock_is_mirror(r)) {
6327
+ *pfn = memblock_region_memory_end_pfn(r);
6328
+ return true;
6329
+ }
6330
+ }
6331
+ return false;
6332
+}
6333
+
57316334 /*
57326335 * Initially all pages are reserved - free ones are freed
5733
- * up by free_all_bootmem() once the early boot process is
6336
+ * up by memblock_free_all() once the early boot process is
57346337 * done. Non-atomic initialization, single-pass.
6338
+ *
6339
+ * All aligned pageblocks are initialized to the specified migratetype
6340
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6341
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
57356342 */
57366343 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5737
- unsigned long start_pfn, enum meminit_context context,
5738
- struct vmem_altmap *altmap)
6344
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6345
+ enum meminit_context context,
6346
+ struct vmem_altmap *altmap, int migratetype)
57396347 {
5740
- unsigned long end_pfn = start_pfn + size;
5741
- pg_data_t *pgdat = NODE_DATA(nid);
5742
- unsigned long pfn;
5743
- unsigned long nr_initialised = 0;
6348
+ unsigned long pfn, end_pfn = start_pfn + size;
57446349 struct page *page;
5745
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5746
- struct memblock_region *r = NULL, *tmp;
5747
-#endif
57486350
57496351 if (highest_memmap_pfn < end_pfn - 1)
57506352 highest_memmap_pfn = end_pfn - 1;
6353
+
6354
+#ifdef CONFIG_ZONE_DEVICE
6355
+ /*
6356
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6357
+ * memory. We limit the total number of pages to initialize to just
6358
+ * those that might contain the memory mapping. We will defer the
6359
+ * ZONE_DEVICE page initialization until after we have released
6360
+ * the hotplug lock.
6361
+ */
6362
+ if (zone == ZONE_DEVICE) {
6363
+ if (!altmap)
6364
+ return;
6365
+
6366
+ if (start_pfn == altmap->base_pfn)
6367
+ start_pfn += altmap->reserve;
6368
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6369
+ }
6370
+#endif
57516371
57526372 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
57536373 /* Zero all page struct in advance */
57546374 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
57556375 #endif
57566376
5757
- /*
5758
- * Honor reservation requested by the driver for this ZONE_DEVICE
5759
- * memory
5760
- */
5761
- if (altmap && start_pfn == altmap->base_pfn)
5762
- start_pfn += altmap->reserve;
5763
-
5764
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6377
+ for (pfn = start_pfn; pfn < end_pfn; ) {
57656378 /*
57666379 * There can be holes in boot-time mem_map[]s handed to this
57676380 * function. They do not exist on hotplugged memory.
57686381 */
5769
- if (context != MEMINIT_EARLY)
5770
- goto not_early;
5771
-
5772
- if (!early_pfn_valid(pfn))
5773
- continue;
5774
- if (!early_pfn_in_nid(pfn, nid))
5775
- continue;
5776
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5777
- break;
5778
-
5779
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5780
- /*
5781
- * Check given memblock attribute by firmware which can affect
5782
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5783
- * mirrored, it's an overlapped memmap init. skip it.
5784
- */
5785
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5786
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5787
- for_each_memblock(memory, tmp)
5788
- if (pfn < memblock_region_memory_end_pfn(tmp))
5789
- break;
5790
- r = tmp;
5791
- }
5792
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5793
- memblock_is_mirror(r)) {
5794
- /* already initialized as NORMAL */
5795
- pfn = memblock_region_memory_end_pfn(r);
6382
+ if (context == MEMINIT_EARLY) {
6383
+ if (overlap_memmap_init(zone, &pfn))
57966384 continue;
5797
- }
6385
+ if (defer_init(nid, pfn, zone_end_pfn))
6386
+ break;
57986387 }
5799
-#endif
58006388
5801
-not_early:
58026389 page = pfn_to_page(pfn);
58036390 __init_single_page(page, pfn, zone, nid, false);
58046391 if (context == MEMINIT_HOTPLUG)
5805
- SetPageReserved(page);
6392
+ __SetPageReserved(page);
6393
+
6394
+ /*
6395
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6396
+ * such that unmovable allocations won't be scattered all
6397
+ * over the place during system boot.
6398
+ */
6399
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6400
+ set_pageblock_migratetype(page, migratetype);
6401
+ cond_resched();
6402
+ }
6403
+ pfn++;
6404
+ }
6405
+}
6406
+
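
One detail worth noting in the loop above: every pfn gets its struct page initialised, but the migratetype write and the cond_resched() happen only when the pfn lands exactly on a pageblock boundary, so per-pageblock state is touched once per block rather than once per page. The alignment test reduces to a power-of-two mask, as in this sketch (the pageblock size here is invented):

#include <stdio.h>

#define PAGEBLOCK_PAGES 512UL   /* hypothetical pageblock size, a power of two */

static int pageblock_aligned(unsigned long pfn)
{
    return (pfn & (PAGEBLOCK_PAGES - 1)) == 0;
}

int main(void)
{
    unsigned long start = 1000, end = 2100, pfn, blocks = 0;

    for (pfn = start; pfn < end; pfn++) {
        /* ... per-page initialisation would happen here for every pfn ... */
        if (pageblock_aligned(pfn)) {
            blocks++;           /* per-pageblock work: set migratetype, yield */
            printf("pageblock starts at pfn %lu\n", pfn);
        }
    }
    printf("%lu pageblock boundaries in [%lu, %lu)\n", blocks, start, end);
    return 0;
}
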
6407
+#ifdef CONFIG_ZONE_DEVICE
6408
+void __ref memmap_init_zone_device(struct zone *zone,
6409
+ unsigned long start_pfn,
6410
+ unsigned long nr_pages,
6411
+ struct dev_pagemap *pgmap)
6412
+{
6413
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6414
+ struct pglist_data *pgdat = zone->zone_pgdat;
6415
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6416
+ unsigned long zone_idx = zone_idx(zone);
6417
+ unsigned long start = jiffies;
6418
+ int nid = pgdat->node_id;
6419
+
6420
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6421
+ return;
6422
+
6423
+ /*
6424
+ * The call to memmap_init should have already taken care
6425
+ * of the pages reserved for the memmap, so we can just jump to
6426
+ * the end of that region and start processing the device pages.
6427
+ */
6428
+ if (altmap) {
6429
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6430
+ nr_pages = end_pfn - start_pfn;
6431
+ }
6432
+
6433
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6434
+ struct page *page = pfn_to_page(pfn);
6435
+
6436
+ __init_single_page(page, pfn, zone_idx, nid, true);
6437
+
6438
+ /*
6439
+ * Mark page reserved as it will need to wait for onlining
6440
+ * phase for it to be fully associated with a zone.
6441
+ *
6442
+ * We can use the non-atomic __set_bit operation for setting
6443
+ * the flag as we are still initializing the pages.
6444
+ */
6445
+ __SetPageReserved(page);
6446
+
6447
+ /*
6448
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6449
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6450
+ * ever freed or placed on a driver-private list.
6451
+ */
6452
+ page->pgmap = pgmap;
6453
+ page->zone_device_data = NULL;
58066454
58076455 /*
58086456 * Mark the block movable so that blocks are reserved for
....@@ -5811,21 +6459,20 @@
58116459 * the address space during boot when many long-lived
58126460 * kernel allocations are made.
58136461 *
5814
- * bitmap is created for zone's valid pfn range. but memmap
5815
- * can be created for invalid pages (for alignment)
5816
- * check here not to call set_pageblock_migratetype() against
5817
- * pfn out of zone.
5818
- *
58196462 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5820
- * because this is done early in sparse_add_one_section
6463
+ * because this is done early in section_activate()
58216464 */
5822
- if (!(pfn & (pageblock_nr_pages - 1))) {
6465
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
58236466 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
58246467 cond_resched();
58256468 }
58266469 }
6470
+
6471
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6472
+ nr_pages, jiffies_to_msecs(jiffies - start));
58276473 }
58286474
6475
+#endif
58296476 static void __meminit zone_init_free_lists(struct zone *zone)
58306477 {
58316478 unsigned int order, t;
....@@ -5835,11 +6482,118 @@
58356482 }
58366483 }
58376484
5838
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5839
-#define memmap_init(size, nid, zone, start_pfn) \
5840
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5841
- MEMINIT_EARLY, NULL)
6485
+/*
6486
+ * Only struct pages that correspond to ranges defined by memblock.memory
6487
+ * are zeroed and initialized by going through __init_single_page() during
6488
+ * memmap_init_zone_range().
6489
+ *
6490
+ * But, there could be struct pages that correspond to holes in
6491
+ * memblock.memory. This can happen because of the following reasons:
6492
+ * - physical memory bank size is not necessarily the exact multiple of the
6493
+ * arbitrary section size
6494
+ * - early reserved memory may not be listed in memblock.memory
6495
+ * - memory layouts defined with memmap= kernel parameter may not align
6496
+ * nicely with memmap sections
6497
+ *
6498
+ * Explicitly initialize those struct pages so that:
6499
+ * - PG_Reserved is set
6500
+ * - zone and node links point to zone and node that span the page if the
6501
+ * hole is in the middle of a zone
6502
+ * - zone and node links point to adjacent zone/node if the hole falls on
6503
+ * the zone boundary; the pages in such holes will be prepended to the
6504
+ * zone/node above the hole except for the trailing pages in the last
6505
+ * section that will be appended to the zone/node below.
6506
+ */
6507
+static void __init init_unavailable_range(unsigned long spfn,
6508
+ unsigned long epfn,
6509
+ int zone, int node)
6510
+{
6511
+ unsigned long pfn;
6512
+ u64 pgcnt = 0;
6513
+
6514
+ for (pfn = spfn; pfn < epfn; pfn++) {
6515
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6516
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6517
+ + pageblock_nr_pages - 1;
6518
+ continue;
6519
+ }
6520
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6521
+ __SetPageReserved(pfn_to_page(pfn));
6522
+ pgcnt++;
6523
+ }
6524
+
6525
+ if (pgcnt)
6526
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6527
+ node, zone_names[zone], pgcnt);
6528
+}
6529
+
6530
+static void __init memmap_init_zone_range(struct zone *zone,
6531
+ unsigned long start_pfn,
6532
+ unsigned long end_pfn,
6533
+ unsigned long *hole_pfn)
6534
+{
6535
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6536
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6537
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6538
+
6539
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6540
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6541
+
6542
+ if (start_pfn >= end_pfn)
6543
+ return;
6544
+
6545
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6546
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6547
+
6548
+ if (*hole_pfn < start_pfn)
6549
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6550
+
6551
+ *hole_pfn = end_pfn;
6552
+}
6553
+
6554
+void __init __weak memmap_init(void)
6555
+{
6556
+ unsigned long start_pfn, end_pfn;
6557
+ unsigned long hole_pfn = 0;
6558
+ int i, j, zone_id, nid;
6559
+
6560
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6561
+ struct pglist_data *node = NODE_DATA(nid);
6562
+
6563
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6564
+ struct zone *zone = node->node_zones + j;
6565
+
6566
+ if (!populated_zone(zone))
6567
+ continue;
6568
+
6569
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6570
+ &hole_pfn);
6571
+ zone_id = j;
6572
+ }
6573
+ }
6574
+
6575
+#ifdef CONFIG_SPARSEMEM
6576
+ /*
6577
+ * Initialize the memory map for hole in the range [memory_end,
6578
+ * section_end].
6579
+ * Append the pages in this hole to the highest zone in the last
6580
+ * node.
6581
+ * The call to init_unavailable_range() is outside the ifdef to
6582
+ * silence the compiler warining about zone_id set but not used;
6583
+ * for FLATMEM it is a nop anyway
6584
+ */
6585
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6586
+ if (hole_pfn < end_pfn)
58426587 #endif
6588
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6589
+}
6590
+
6591
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6592
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6593
+ unsigned long zone,
6594
+ unsigned long range_start_pfn)
6595
+{
6596
+}
58436597
58446598 static int zone_batchsize(struct zone *zone)
58456599 {
....@@ -5850,7 +6604,7 @@
58506604 * The per-cpu-pages pools are set to around 1000th of the
58516605 * size of the zone.
58526606 */
5853
- batch = zone->managed_pages / 1024;
6607
+ batch = zone_managed_pages(zone) / 1024;
58546608 /* But no more than a meg. */
58556609 if (batch * PAGE_SIZE > 1024 * 1024)
58566610 batch = (1024 * 1024) / PAGE_SIZE;
....@@ -5897,7 +6651,7 @@
58976651 * locking.
58986652 *
58996653 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5900
- * those fields changing asynchronously (acording the the above rule).
6654
+ * those fields changing asynchronously (acording to the above rule).
59016655 *
59026656 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
59036657 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5906,6 +6660,7 @@
59066660 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
59076661 unsigned long batch)
59086662 {
6663
+ trace_android_vh_pageset_update(&high, &batch);
59096664 /* start with a fail safe value for batch */
59106665 pcp->batch = 1;
59116666 smp_wmb();
....@@ -5931,7 +6686,6 @@
59316686 memset(p, 0, sizeof(*p));
59326687
59336688 pcp = &p->pcp;
5934
- pcp->count = 0;
59356689 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
59366690 INIT_LIST_HEAD(&pcp->lists[migratetype]);
59376691 }
....@@ -5961,7 +6715,7 @@
59616715 {
59626716 if (percpu_pagelist_fraction)
59636717 pageset_set_high(pcp,
5964
- (zone->managed_pages /
6718
+ (zone_managed_pages(zone) /
59656719 percpu_pagelist_fraction));
59666720 else
59676721 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5991,9 +6745,24 @@
59916745 {
59926746 struct pglist_data *pgdat;
59936747 struct zone *zone;
6748
+ int __maybe_unused cpu;
59946749
59956750 for_each_populated_zone(zone)
59966751 setup_zone_pageset(zone);
6752
+
6753
+#ifdef CONFIG_NUMA
6754
+ /*
6755
+ * Unpopulated zones continue using the boot pagesets.
6756
+ * The numa stats for these pagesets need to be reset.
6757
+ * Otherwise, they will end up skewing the stats of
6758
+ * the nodes these zones are associated with.
6759
+ */
6760
+ for_each_possible_cpu(cpu) {
6761
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6762
+ memset(pcp->vm_numa_stat_diff, 0,
6763
+ sizeof(pcp->vm_numa_stat_diff));
6764
+ }
6765
+#endif
59976766
59986767 for_each_online_pgdat(pgdat)
59996768 pgdat->per_cpu_nodestats =
....@@ -6037,73 +6806,6 @@
60376806 zone->initialized = 1;
60386807 }
60396808
6040
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6041
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
6042
-
6043
-/*
6044
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
6045
- */
6046
-int __meminit __early_pfn_to_nid(unsigned long pfn,
6047
- struct mminit_pfnnid_cache *state)
6048
-{
6049
- unsigned long start_pfn, end_pfn;
6050
- int nid;
6051
-
6052
- if (state->last_start <= pfn && pfn < state->last_end)
6053
- return state->last_nid;
6054
-
6055
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
6056
- if (nid != -1) {
6057
- state->last_start = start_pfn;
6058
- state->last_end = end_pfn;
6059
- state->last_nid = nid;
6060
- }
6061
-
6062
- return nid;
6063
-}
6064
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
6065
-
6066
-/**
6067
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
6068
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
6069
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
6070
- *
6071
- * If an architecture guarantees that all ranges registered contain no holes
6072
- * and may be freed, this this function may be used instead of calling
6073
- * memblock_free_early_nid() manually.
6074
- */
6075
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
6076
-{
6077
- unsigned long start_pfn, end_pfn;
6078
- int i, this_nid;
6079
-
6080
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
6081
- start_pfn = min(start_pfn, max_low_pfn);
6082
- end_pfn = min(end_pfn, max_low_pfn);
6083
-
6084
- if (start_pfn < end_pfn)
6085
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6086
- (end_pfn - start_pfn) << PAGE_SHIFT,
6087
- this_nid);
6088
- }
6089
-}
6090
-
6091
-/**
6092
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6093
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6094
- *
6095
- * If an architecture guarantees that all ranges registered contain no holes and may
6096
- * be freed, this function may be used instead of calling memory_present() manually.
6097
- */
6098
-void __init sparse_memory_present_with_active_regions(int nid)
6099
-{
6100
- unsigned long start_pfn, end_pfn;
6101
- int i, this_nid;
6102
-
6103
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6104
- memory_present(this_nid, start_pfn, end_pfn);
6105
-}
6106
-
61076809 /**
61086810 * get_pfn_range_for_nid - Return the start and end page frames for a node
61096811 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6115,7 +6817,7 @@
61156817 * with no available memory, a warning is printed and the start and end
61166818 * PFNs will be 0.
61176819 */
6118
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6820
+void __init get_pfn_range_for_nid(unsigned int nid,
61196821 unsigned long *start_pfn, unsigned long *end_pfn)
61206822 {
61216823 unsigned long this_start_pfn, this_end_pfn;
....@@ -6164,7 +6866,7 @@
61646866 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
61656867 * zones within a node are in order of monotonically increasing memory addresses
61666868 */
6167
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6869
+static void __init adjust_zone_range_for_zone_movable(int nid,
61686870 unsigned long zone_type,
61696871 unsigned long node_start_pfn,
61706872 unsigned long node_end_pfn,
....@@ -6195,13 +6897,12 @@
61956897 * Return the number of pages a zone spans in a node, including holes
61966898 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61976899 */
6198
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6900
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61996901 unsigned long zone_type,
62006902 unsigned long node_start_pfn,
62016903 unsigned long node_end_pfn,
62026904 unsigned long *zone_start_pfn,
6203
- unsigned long *zone_end_pfn,
6204
- unsigned long *ignored)
6905
+ unsigned long *zone_end_pfn)
62056906 {
62066907 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
62076908 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6232,7 +6933,7 @@
62326933 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
62336934 * then all holes in the requested range will be accounted for.
62346935 */
6235
-unsigned long __meminit __absent_pages_in_range(int nid,
6936
+unsigned long __init __absent_pages_in_range(int nid,
62366937 unsigned long range_start_pfn,
62376938 unsigned long range_end_pfn)
62386939 {
....@@ -6253,7 +6954,7 @@
62536954 * @start_pfn: The start PFN to start searching for holes
62546955 * @end_pfn: The end PFN to stop searching for holes
62556956 *
6256
- * It returns the number of pages frames in memory holes within a range.
6957
+ * Return: the number of page frames in memory holes within a range.
62576958 */
62586959 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
62596960 unsigned long end_pfn)
....@@ -6262,11 +6963,10 @@
62626963 }
62636964
62646965 /* Return the number of page frames in holes in a zone on a node */
6265
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6966
+static unsigned long __init zone_absent_pages_in_node(int nid,
62666967 unsigned long zone_type,
62676968 unsigned long node_start_pfn,
6268
- unsigned long node_end_pfn,
6269
- unsigned long *ignored)
6969
+ unsigned long node_end_pfn)
62706970 {
62716971 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
62726972 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6294,7 +6994,7 @@
62946994 unsigned long start_pfn, end_pfn;
62956995 struct memblock_region *r;
62966996
6297
- for_each_memblock(memory, r) {
6997
+ for_each_mem_region(r) {
62986998 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62996999 zone_start_pfn, zone_end_pfn);
63007000 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6313,45 +7013,9 @@
63137013 return nr_absent;
63147014 }
63157015
6316
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6317
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6318
- unsigned long zone_type,
6319
- unsigned long node_start_pfn,
6320
- unsigned long node_end_pfn,
6321
- unsigned long *zone_start_pfn,
6322
- unsigned long *zone_end_pfn,
6323
- unsigned long *zones_size)
6324
-{
6325
- unsigned int zone;
6326
-
6327
- *zone_start_pfn = node_start_pfn;
6328
- for (zone = 0; zone < zone_type; zone++)
6329
- *zone_start_pfn += zones_size[zone];
6330
-
6331
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6332
-
6333
- return zones_size[zone_type];
6334
-}
6335
-
6336
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6337
- unsigned long zone_type,
7016
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
63387017 unsigned long node_start_pfn,
6339
- unsigned long node_end_pfn,
6340
- unsigned long *zholes_size)
6341
-{
6342
- if (!zholes_size)
6343
- return 0;
6344
-
6345
- return zholes_size[zone_type];
6346
-}
6347
-
6348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6349
-
6350
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6351
- unsigned long node_start_pfn,
6352
- unsigned long node_end_pfn,
6353
- unsigned long *zones_size,
6354
- unsigned long *zholes_size)
7018
+ unsigned long node_end_pfn)
63557019 {
63567020 unsigned long realtotalpages = 0, totalpages = 0;
63577021 enum zone_type i;
....@@ -6359,17 +7023,21 @@
63597023 for (i = 0; i < MAX_NR_ZONES; i++) {
63607024 struct zone *zone = pgdat->node_zones + i;
63617025 unsigned long zone_start_pfn, zone_end_pfn;
7026
+ unsigned long spanned, absent;
63627027 unsigned long size, real_size;
63637028
6364
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6365
- node_start_pfn,
6366
- node_end_pfn,
6367
- &zone_start_pfn,
6368
- &zone_end_pfn,
6369
- zones_size);
6370
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6371
- node_start_pfn, node_end_pfn,
6372
- zholes_size);
7029
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7030
+ node_start_pfn,
7031
+ node_end_pfn,
7032
+ &zone_start_pfn,
7033
+ &zone_end_pfn);
7034
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7035
+ node_start_pfn,
7036
+ node_end_pfn);
7037
+
7038
+ size = spanned;
7039
+ real_size = size - absent;
7040
+
63737041 if (size)
63747042 zone->zone_start_pfn = zone_start_pfn;
63757043 else
....@@ -6415,10 +7083,14 @@
64157083 {
64167084 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
64177085 zone->pageblock_flags = NULL;
6418
- if (usemapsize)
7086
+ if (usemapsize) {
64197087 zone->pageblock_flags =
6420
- memblock_virt_alloc_node_nopanic(usemapsize,
6421
- pgdat->node_id);
7088
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7089
+ pgdat->node_id);
7090
+ if (!zone->pageblock_flags)
7091
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7092
+ usemapsize, zone->name, pgdat->node_id);
7093
+ }
64227094 }
64237095 #else
64247096 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6485,9 +7157,11 @@
64857157 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64867158 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64877159 {
6488
- spin_lock_init(&pgdat->split_queue_lock);
6489
- INIT_LIST_HEAD(&pgdat->split_queue);
6490
- pgdat->split_queue_len = 0;
7160
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7161
+
7162
+ spin_lock_init(&ds_queue->split_queue_lock);
7163
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7164
+ ds_queue->split_queue_len = 0;
64917165 }
64927166 #else
64937167 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6514,13 +7188,13 @@
65147188
65157189 pgdat_page_ext_init(pgdat);
65167190 spin_lock_init(&pgdat->lru_lock);
6517
- lruvec_init(node_lruvec(pgdat));
7191
+ lruvec_init(&pgdat->__lruvec);
65187192 }
65197193
65207194 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
65217195 unsigned long remaining_pages)
65227196 {
6523
- zone->managed_pages = remaining_pages;
7197
+ atomic_long_set(&zone->managed_pages, remaining_pages);
65247198 zone_set_nid(zone, nid);
65257199 zone->name = zone_names[idx];
65267200 zone->zone_pgdat = NODE_DATA(nid);
....@@ -6618,7 +7292,7 @@
66187292 set_pageblock_order();
66197293 setup_usemap(pgdat, zone, zone_start_pfn, size);
66207294 init_currently_empty_zone(zone, zone_start_pfn, size);
6621
- memmap_init(size, nid, j, zone_start_pfn);
7295
+ arch_memmap_init(size, nid, j, zone_start_pfn);
66227296 }
66237297 }
66247298
....@@ -6647,7 +7321,11 @@
66477321 end = pgdat_end_pfn(pgdat);
66487322 end = ALIGN(end, MAX_ORDER_NR_PAGES);
66497323 size = (end - start) * sizeof(struct page);
6650
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7324
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7325
+ pgdat->node_id);
7326
+ if (!map)
7327
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7328
+ size, pgdat->node_id);
66517329 pgdat->node_mem_map = map + offset;
66527330 }
66537331 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6659,10 +7337,8 @@
66597337 */
66607338 if (pgdat == NODE_DATA(0)) {
66617339 mem_map = NODE_DATA(0)->node_mem_map;
6662
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
66637340 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
66647341 mem_map -= offset;
6665
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
66667342 }
66677343 #endif
66687344 }
....@@ -6673,42 +7349,31 @@
66737349 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
66747350 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
66757351 {
6676
- /*
6677
- * We start only with one section of pages, more pages are added as
6678
- * needed until the rest of deferred pages are initialized.
6679
- */
6680
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6681
- pgdat->node_spanned_pages);
66827352 pgdat->first_deferred_pfn = ULONG_MAX;
66837353 }
66847354 #else
66857355 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66867356 #endif
66877357
6688
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6689
- unsigned long node_start_pfn,
6690
- unsigned long *zholes_size)
7358
+static void __init free_area_init_node(int nid)
66917359 {
66927360 pg_data_t *pgdat = NODE_DATA(nid);
66937361 unsigned long start_pfn = 0;
66947362 unsigned long end_pfn = 0;
66957363
66967364 /* pg_data_t should be reset to zero when it's allocated */
6697
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7365
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7366
+
7367
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66987368
66997369 pgdat->node_id = nid;
6700
- pgdat->node_start_pfn = node_start_pfn;
7370
+ pgdat->node_start_pfn = start_pfn;
67017371 pgdat->per_cpu_nodestats = NULL;
6702
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6703
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7372
+
67047373 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
67057374 (u64)start_pfn << PAGE_SHIFT,
67067375 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6707
-#else
6708
- start_pfn = node_start_pfn;
6709
-#endif
6710
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6711
- zones_size, zholes_size);
7376
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
67127377
67137378 alloc_node_mem_map(pgdat);
67147379 pgdat_set_deferred_range(pgdat);
....@@ -6716,80 +7381,10 @@
67167381 free_area_init_core(pgdat);
67177382 }
67187383
6719
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6720
-
6721
-/*
6722
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6723
- * pages zeroed
6724
- */
6725
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7384
+void __init free_area_init_memoryless_node(int nid)
67267385 {
6727
- unsigned long pfn;
6728
- u64 pgcnt = 0;
6729
-
6730
- for (pfn = spfn; pfn < epfn; pfn++) {
6731
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6732
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6733
- + pageblock_nr_pages - 1;
6734
- continue;
6735
- }
6736
- mm_zero_struct_page(pfn_to_page(pfn));
6737
- pgcnt++;
6738
- }
6739
-
6740
- return pgcnt;
7386
+ free_area_init_node(nid);
67417387 }
6742
-
6743
-/*
6744
- * Only struct pages that are backed by physical memory are zeroed and
6745
- * initialized by going through __init_single_page(). But, there are some
6746
- * struct pages which are reserved in memblock allocator and their fields
6747
- * may be accessed (for example page_to_pfn() on some configuration accesses
6748
- * flags). We must explicitly zero those struct pages.
6749
- *
6750
- * This function also addresses a similar issue where struct pages are left
6751
- * uninitialized because the physical address range is not covered by
6752
- * memblock.memory or memblock.reserved. That could happen when memblock
6753
- * layout is manually configured via memmap=, or when the highest physical
6754
- * address (max_pfn) does not end on a section boundary.
6755
- */
6756
-void __init zero_resv_unavail(void)
6757
-{
6758
- phys_addr_t start, end;
6759
- u64 i, pgcnt;
6760
- phys_addr_t next = 0;
6761
-
6762
- /*
6763
- * Loop through unavailable ranges not covered by memblock.memory.
6764
- */
6765
- pgcnt = 0;
6766
- for_each_mem_range(i, &memblock.memory, NULL,
6767
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6768
- if (next < start)
6769
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6770
- next = end;
6771
- }
6772
-
6773
- /*
6774
- * Early sections always have a fully populated memmap for the whole
6775
- * section - see pfn_valid(). If the last section has holes at the
6776
- * end and that section is marked "online", the memmap will be
6777
- * considered initialized. Make sure that memmap has a well defined
6778
- * state.
6779
- */
6780
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6781
- round_up(max_pfn, PAGES_PER_SECTION));
6782
-
6783
- /*
6784
- * Struct pages that do not have backing memory. This could be because
6785
- * firmware is using some of this memory, or for some other reasons.
6786
- */
6787
- if (pgcnt)
6788
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6789
-}
6790
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6791
-
6792
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67937388
67947389 #if MAX_NUMNODES > 1
67957390 /*
....@@ -6820,14 +7415,14 @@
68207415 * model has fine enough granularity to avoid incorrect mapping for the
68217416 * populated node map.
68227417 *
6823
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7418
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
68247419 * requirement (single node).
68257420 */
68267421 unsigned long __init node_map_pfn_alignment(void)
68277422 {
68287423 unsigned long accl_mask = 0, last_end = 0;
68297424 unsigned long start, end, mask;
6830
- int last_nid = -1;
7425
+ int last_nid = NUMA_NO_NODE;
68317426 int i, nid;
68327427
68337428 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6854,33 +7449,15 @@
68547449 return ~accl_mask + 1;
68557450 }
68567451
6857
-/* Find the lowest pfn for a node */
6858
-static unsigned long __init find_min_pfn_for_node(int nid)
6859
-{
6860
- unsigned long min_pfn = ULONG_MAX;
6861
- unsigned long start_pfn;
6862
- int i;
6863
-
6864
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6865
- min_pfn = min(min_pfn, start_pfn);
6866
-
6867
- if (min_pfn == ULONG_MAX) {
6868
- pr_warn("Could not find start_pfn for node %d\n", nid);
6869
- return 0;
6870
- }
6871
-
6872
- return min_pfn;
6873
-}
6874
-
68757452 /**
68767453 * find_min_pfn_with_active_regions - Find the minimum PFN registered
68777454 *
6878
- * It returns the minimum PFN based on information provided via
7455
+ * Return: the minimum PFN based on information provided via
68797456 * memblock_set_node().
68807457 */
68817458 unsigned long __init find_min_pfn_with_active_regions(void)
68827459 {
6883
- return find_min_pfn_for_node(MAX_NUMNODES);
7460
+ return PHYS_PFN(memblock_start_of_DRAM());
68847461 }
68857462
68867463 /*
....@@ -6929,11 +7506,11 @@
69297506 * options.
69307507 */
69317508 if (movable_node_is_enabled()) {
6932
- for_each_memblock(memory, r) {
7509
+ for_each_mem_region(r) {
69337510 if (!memblock_is_hotpluggable(r))
69347511 continue;
69357512
6936
- nid = r->nid;
7513
+ nid = memblock_get_region_node(r);
69377514
69387515 usable_startpfn = PFN_DOWN(r->base);
69397516 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6950,11 +7527,11 @@
69507527 if (mirrored_kernelcore) {
69517528 bool mem_below_4gb_not_mirrored = false;
69527529
6953
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
69547531 if (memblock_is_mirror(r))
69557532 continue;
69567533
6957
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
69587535
69597536 usable_startpfn = memblock_region_memory_base_pfn(r);
69607537
....@@ -6969,7 +7546,7 @@
69697546 }
69707547
69717548 if (mem_below_4gb_not_mirrored)
6972
- pr_warn("This configuration results in unmirrored kernel memory.");
7549
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
69737550
69747551 goto out2;
69757552 }
....@@ -7108,9 +7685,16 @@
71087685
71097686 out2:
71107687 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7111
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7688
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7689
+ unsigned long start_pfn, end_pfn;
7690
+
71127691 zone_movable_pfn[nid] =
71137692 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7693
+
7694
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7695
+ if (zone_movable_pfn[nid] >= end_pfn)
7696
+ zone_movable_pfn[nid] = 0;
7697
+ }
71147698
71157699 out:
71167700 /* restore the node_state */
....@@ -7122,23 +7706,29 @@
71227706 {
71237707 enum zone_type zone_type;
71247708
7125
- if (N_MEMORY == N_NORMAL_MEMORY)
7126
- return;
7127
-
71287709 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
71297710 struct zone *zone = &pgdat->node_zones[zone_type];
71307711 if (populated_zone(zone)) {
7131
- node_set_state(nid, N_HIGH_MEMORY);
7132
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7133
- zone_type <= ZONE_NORMAL)
7712
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7713
+ node_set_state(nid, N_HIGH_MEMORY);
7714
+ if (zone_type <= ZONE_NORMAL)
71347715 node_set_state(nid, N_NORMAL_MEMORY);
71357716 break;
71367717 }
71377718 }
71387719 }
71397720
7721
+/*
7722
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7723
+ * such cases we allow max_zone_pfn to be sorted in descending order.
7724
+ */
7725
+bool __weak arch_has_descending_max_zone_pfns(void)
7726
+{
7727
+ return false;
7728
+}
7729
+
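As a rough illustration of how the new __weak hook above is meant to be used, an architecture whose ZONE_HIGHMEM really does sit below ZONE_NORMAL could supply its own definition; the file placement and the unconditional return value are assumptions for this sketch, not taken from the patch:

/* Hypothetical override in arch/<arch>/mm/init.c */
bool arch_has_descending_max_zone_pfns(void)
{
	/* Ask free_area_init() to walk max_zone_pfn[] from the top down. */
	return true;
}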
71407730 /**
7141
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7731
+ * free_area_init - Initialise all pg_data_t and zone data
71427732 * @max_zone_pfn: an array of max PFNs for each zone
71437733 *
71447734 * This will call free_area_init_node() for each active node in the system.
....@@ -7150,10 +7740,11 @@
71507740 * starts where the previous one ended. For example, ZONE_DMA32 starts
71517741 * at arch_max_dma_pfn.
71527742 */
7153
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7743
+void __init free_area_init(unsigned long *max_zone_pfn)
71547744 {
71557745 unsigned long start_pfn, end_pfn;
7156
- int i, nid;
7746
+ int i, nid, zone;
7747
+ bool descending;
71577748
71587749 /* Record where the zone boundaries are */
71597750 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7162,14 +7753,20 @@
71627753 sizeof(arch_zone_highest_possible_pfn));
71637754
71647755 start_pfn = find_min_pfn_with_active_regions();
7756
+ descending = arch_has_descending_max_zone_pfns();
71657757
71667758 for (i = 0; i < MAX_NR_ZONES; i++) {
7167
- if (i == ZONE_MOVABLE)
7759
+ if (descending)
7760
+ zone = MAX_NR_ZONES - i - 1;
7761
+ else
7762
+ zone = i;
7763
+
7764
+ if (zone == ZONE_MOVABLE)
71687765 continue;
71697766
7170
- end_pfn = max(max_zone_pfn[i], start_pfn);
7171
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7172
- arch_zone_highest_possible_pfn[i] = end_pfn;
7767
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7768
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7769
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
71737770
71747771 start_pfn = end_pfn;
71757772 }
....@@ -7203,27 +7800,33 @@
72037800 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
72047801 }
72057802
7206
- /* Print out the early node map */
7803
+ /*
7804
+ * Print out the early node map, and initialize the
7805
+ * subsection-map relative to active online memory ranges to
7806
+ * enable future "sub-section" extensions of the memory map.
7807
+ */
72077808 pr_info("Early memory node ranges\n");
7208
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7809
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
72097810 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
72107811 (u64)start_pfn << PAGE_SHIFT,
72117812 ((u64)end_pfn << PAGE_SHIFT) - 1);
7813
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7814
+ }
72127815
72137816 /* Initialise every node */
72147817 mminit_verify_pageflags_layout();
72157818 setup_nr_node_ids();
7216
- zero_resv_unavail();
72177819 for_each_online_node(nid) {
72187820 pg_data_t *pgdat = NODE_DATA(nid);
7219
- free_area_init_node(nid, NULL,
7220
- find_min_pfn_for_node(nid), NULL);
7821
+ free_area_init_node(nid);
72217822
72227823 /* Any memory on that node */
72237824 if (pgdat->node_present_pages)
72247825 node_set_state(nid, N_MEMORY);
72257826 check_for_memory(pgdat, nid);
72267827 }
7828
+
7829
+ memmap_init();
72277830 }
72287831
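For context on the renamed entry point, here is a minimal sketch of how architecture setup code typically hands its zone limits to free_area_init(); the function name and the use of max_low_pfn/max_pfn as per-zone limits are assumptions about a generic architecture, not part of this patch:

static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/* Highest PFN each zone may cover; zones left at 0 stay empty. */
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = min(max_low_pfn,
					1UL << (32 - PAGE_SHIFT)); /* 4 GiB */
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif

	/* Builds every online node's pg_data_t and zones, then runs memmap_init(). */
	free_area_init(max_zone_pfns);
}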
72297832 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7282,22 +7885,18 @@
72827885 early_param("kernelcore", cmdline_parse_kernelcore);
72837886 early_param("movablecore", cmdline_parse_movablecore);
72847887
7285
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7286
-
72877888 void adjust_managed_page_count(struct page *page, long count)
72887889 {
7289
- spin_lock(&managed_page_count_lock);
7290
- page_zone(page)->managed_pages += count;
7291
- totalram_pages += count;
7890
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7891
+ totalram_pages_add(count);
72927892 #ifdef CONFIG_HIGHMEM
72937893 if (PageHighMem(page))
7294
- totalhigh_pages += count;
7894
+ totalhigh_pages_add(count);
72957895 #endif
7296
- spin_unlock(&managed_page_count_lock);
72977896 }
72987897 EXPORT_SYMBOL(adjust_managed_page_count);
72997898
7300
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7899
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
73017900 {
73027901 void *pos;
73037902 unsigned long pages = 0;
....@@ -7316,6 +7915,11 @@
73167915 * alias for the memset().
73177916 */
73187917 direct_map_addr = page_address(page);
7918
+ /*
7919
+ * Perform a kasan-unchecked memset() since this memory
7920
+ * has not been initialized.
7921
+ */
7922
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
73197923 if ((unsigned int)poison <= 0xFF)
73207924 memset(direct_map_addr, poison, PAGE_SIZE);
73217925
....@@ -7328,15 +7932,14 @@
73287932
73297933 return pages;
73307934 }
7331
-EXPORT_SYMBOL(free_reserved_area);
73327935
73337936 #ifdef CONFIG_HIGHMEM
73347937 void free_highmem_page(struct page *page)
73357938 {
73367939 __free_reserved_page(page);
7337
- totalram_pages++;
7338
- page_zone(page)->managed_pages++;
7339
- totalhigh_pages++;
7940
+ totalram_pages_inc();
7941
+ atomic_long_inc(&page_zone(page)->managed_pages);
7942
+ totalhigh_pages_inc();
73407943 }
73417944 #endif
73427945
....@@ -7363,7 +7966,7 @@
73637966 */
73647967 #define adj_init_size(start, end, size, pos, adj) \
73657968 do { \
7366
- if (start <= pos && pos < end && size > adj) \
7969
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
73677970 size -= adj; \
73687971 } while (0)
73697972
....@@ -7385,10 +7988,10 @@
73857988 physpages << (PAGE_SHIFT - 10),
73867989 codesize >> 10, datasize >> 10, rosize >> 10,
73877990 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7388
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
7991
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73897992 totalcma_pages << (PAGE_SHIFT - 10),
73907993 #ifdef CONFIG_HIGHMEM
7391
- totalhigh_pages << (PAGE_SHIFT - 10),
7994
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73927995 #endif
73937996 str ? ", " : "", str ? str : "");
73947997 }
....@@ -7409,18 +8012,10 @@
74098012 dma_reserve = new_dma_reserve;
74108013 }
74118014
7412
-void __init free_area_init(unsigned long *zones_size)
7413
-{
7414
- zero_resv_unavail();
7415
- free_area_init_node(0, zones_size,
7416
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
7417
-}
7418
-
74198015 static int page_alloc_cpu_dead(unsigned int cpu)
74208016 {
7421
- local_lock_irq_on(swapvec_lock, cpu);
8017
+
74228018 lru_add_drain_cpu(cpu);
7423
- local_unlock_irq_on(swapvec_lock, cpu);
74248019 drain_pages(cpu);
74258020
74268021 /*
....@@ -7442,9 +8037,27 @@
74428037 return 0;
74438038 }
74448039
8040
+#ifdef CONFIG_NUMA
8041
+int hashdist = HASHDIST_DEFAULT;
8042
+
8043
+static int __init set_hashdist(char *str)
8044
+{
8045
+ if (!str)
8046
+ return 0;
8047
+ hashdist = simple_strtoul(str, &str, 0);
8048
+ return 1;
8049
+}
8050
+__setup("hashdist=", set_hashdist);
8051
+#endif
8052
+
74458053 void __init page_alloc_init(void)
74468054 {
74478055 int ret;
8056
+
8057
+#ifdef CONFIG_NUMA
8058
+ if (num_node_state(N_MEMORY) == 1)
8059
+ hashdist = 0;
8060
+#endif
74488061
74498062 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
74508063 "mm/page_alloc:dead", NULL,
....@@ -7469,6 +8082,7 @@
74698082 for (i = 0; i < MAX_NR_ZONES; i++) {
74708083 struct zone *zone = pgdat->node_zones + i;
74718084 long max = 0;
8085
+ unsigned long managed_pages = zone_managed_pages(zone);
74728086
74738087 /* Find valid and maximum lowmem_reserve in the zone */
74748088 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7479,8 +8093,8 @@
74798093 /* we treat the high watermark as reserved pages. */
74808094 max += high_wmark_pages(zone);
74818095
7482
- if (max > zone->managed_pages)
7483
- max = zone->managed_pages;
8096
+ if (max > managed_pages)
8097
+ max = managed_pages;
74848098
74858099 pgdat->totalreserve_pages += max;
74868100
....@@ -7499,30 +8113,24 @@
74998113 static void setup_per_zone_lowmem_reserve(void)
75008114 {
75018115 struct pglist_data *pgdat;
7502
- enum zone_type j, idx;
8116
+ enum zone_type i, j;
75038117
75048118 for_each_online_pgdat(pgdat) {
7505
- for (j = 0; j < MAX_NR_ZONES; j++) {
7506
- struct zone *zone = pgdat->node_zones + j;
7507
- unsigned long managed_pages = zone->managed_pages;
8119
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8120
+ struct zone *zone = &pgdat->node_zones[i];
8121
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8122
+ bool clear = !ratio || !zone_managed_pages(zone);
8123
+ unsigned long managed_pages = 0;
75088124
7509
- zone->lowmem_reserve[j] = 0;
8125
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8126
+ struct zone *upper_zone = &pgdat->node_zones[j];
75108127
7511
- idx = j;
7512
- while (idx) {
7513
- struct zone *lower_zone;
8128
+ managed_pages += zone_managed_pages(upper_zone);
75148129
7515
- idx--;
7516
- lower_zone = pgdat->node_zones + idx;
7517
-
7518
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7519
- sysctl_lowmem_reserve_ratio[idx] = 0;
7520
- lower_zone->lowmem_reserve[j] = 0;
7521
- } else {
7522
- lower_zone->lowmem_reserve[j] =
7523
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7524
- }
7525
- managed_pages += lower_zone->managed_pages;
8130
+ if (clear)
8131
+ zone->lowmem_reserve[j] = 0;
8132
+ else
8133
+ zone->lowmem_reserve[j] = managed_pages / ratio;
75268134 }
75278135 }
75288136 }
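To make the rewritten accumulation concrete, a worked example with assumed sizes (4 KiB pages): a node with 1 GiB of ZONE_DMA32 and 3 GiB of ZONE_NORMAL, and sysctl_lowmem_reserve_ratio[ZONE_DMA32] = 256:

    managed_pages accumulated at j = ZONE_NORMAL:        786432 pages (3 GiB)
    DMA32 lowmem_reserve[ZONE_NORMAL] = 786432 / 256   = 3072 pages (12 MiB)

so an allocation that targets ZONE_NORMAL but falls back into DMA32 must leave roughly 3072 extra pages free there.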
....@@ -7542,18 +8150,17 @@
75428150 /* Calculate total number of !ZONE_HIGHMEM pages */
75438151 for_each_zone(zone) {
75448152 if (!is_highmem(zone))
7545
- lowmem_pages += zone->managed_pages;
8153
+ lowmem_pages += zone_managed_pages(zone);
75468154 }
75478155
75488156 for_each_zone(zone) {
7549
- u64 min, low;
8157
+ u64 tmp, low;
75508158
75518159 spin_lock_irqsave(&zone->lock, flags);
7552
- min = (u64)pages_min * zone->managed_pages;
7553
- do_div(min, lowmem_pages);
7554
- low = (u64)pages_low * zone->managed_pages;
7555
- do_div(low, vm_total_pages);
7556
-
8160
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8161
+ do_div(tmp, lowmem_pages);
8162
+ low = (u64)pages_low * zone_managed_pages(zone);
8163
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
75578164 if (is_highmem(zone)) {
75588165 /*
75598166 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7561,20 +8168,20 @@
75618168 * value here.
75628169 *
75638170 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7564
- * deltas control asynch page reclaim, and so should
8171
+ * deltas control async page reclaim, and so should
75658172 * not be capped for highmem.
75668173 */
75678174 unsigned long min_pages;
75688175
7569
- min_pages = zone->managed_pages / 1024;
8176
+ min_pages = zone_managed_pages(zone) / 1024;
75708177 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7571
- zone->watermark[WMARK_MIN] = min_pages;
8178
+ zone->_watermark[WMARK_MIN] = min_pages;
75728179 } else {
75738180 /*
75748181 * If it's a lowmem zone, reserve a number of pages
75758182 * proportionate to the zone's size.
75768183 */
7577
- zone->watermark[WMARK_MIN] = min;
8184
+ zone->_watermark[WMARK_MIN] = tmp;
75788185 }
75798186
75808187 /*
....@@ -7582,14 +8189,13 @@
75828189 * scale factor in proportion to available memory, but
75838190 * ensure a minimum size on small systems.
75848191 */
7585
- min = max_t(u64, min >> 2,
7586
- mult_frac(zone->managed_pages,
8192
+ tmp = max_t(u64, tmp >> 2,
8193
+ mult_frac(zone_managed_pages(zone),
75878194 watermark_scale_factor, 10000));
75888195
7589
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7590
- low + min;
7591
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7592
- low + min * 2;
8196
+ zone->watermark_boost = 0;
8197
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8198
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75938199
75948200 spin_unlock_irqrestore(&zone->lock, flags);
75958201 }
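A worked pass through the calculation above, assuming a single 4 GiB zone (1048576 pages of 4 KiB), min_free_kbytes = 16384 (so pages_min = 4096), extra_free_kbytes = 0 and watermark_scale_factor = 10; the figures are illustrative only:

    WMARK_MIN  = 4096 * 1048576 / 1048576              = 4096 pages
    tmp        = max(4096 >> 2, 1048576 * 10 / 10000)  = max(1024, 1048) = 1048
    WMARK_LOW  = 4096 + 0 + 1048                       = 5144 pages (~20 MiB)
    WMARK_HIGH = 4096 + 0 + 2 * 1048                   = 6192 pages (~24 MiB)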
....@@ -7618,7 +8224,7 @@
76188224 * Initialise min_free_kbytes.
76198225 *
76208226 * For small machines we want it small (128k min). For large machines
7621
- * we want it large (64MB max). But it is not linear, because network
8227
+ * we want it large (256MB max). But it is not linear, because network
76228228 * bandwidth does not increase linearly with machine size. We use
76238229 *
76248230 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
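As a worked instance of that sizing rule (illustrative figures, 4 KiB pages), before the 128..262144 kB clamp applied below:

    16 GiB lowmem:  lowmem_kbytes = 16777216,  min_free_kbytes = 4 * 4096  = 16384 (~16 MB)
    128 GiB lowmem: lowmem_kbytes = 134217728, min_free_kbytes ~= 4 * 11585 = 46341 (~45 MB)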
....@@ -7650,8 +8256,8 @@
76508256 min_free_kbytes = new_min_free_kbytes;
76518257 if (min_free_kbytes < 128)
76528258 min_free_kbytes = 128;
7653
- if (min_free_kbytes > 65536)
7654
- min_free_kbytes = 65536;
8259
+ if (min_free_kbytes > 262144)
8260
+ min_free_kbytes = 262144;
76558261 } else {
76568262 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
76578263 new_min_free_kbytes, user_min_free_kbytes);
....@@ -7677,7 +8283,7 @@
76778283 * or extra_free_kbytes changes.
76788284 */
76798285 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7680
- void __user *buffer, size_t *length, loff_t *ppos)
8286
+ void *buffer, size_t *length, loff_t *ppos)
76818287 {
76828288 int rc;
76838289
....@@ -7693,7 +8299,7 @@
76938299 }
76948300
76958301 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7696
- void __user *buffer, size_t *length, loff_t *ppos)
8302
+ void *buffer, size_t *length, loff_t *ppos)
76978303 {
76988304 int rc;
76998305
....@@ -7717,13 +8323,13 @@
77178323 pgdat->min_unmapped_pages = 0;
77188324
77198325 for_each_zone(zone)
7720
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7721
- sysctl_min_unmapped_ratio) / 100;
8326
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8327
+ sysctl_min_unmapped_ratio) / 100;
77228328 }
77238329
77248330
77258331 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7726
- void __user *buffer, size_t *length, loff_t *ppos)
8332
+ void *buffer, size_t *length, loff_t *ppos)
77278333 {
77288334 int rc;
77298335
....@@ -7745,12 +8351,12 @@
77458351 pgdat->min_slab_pages = 0;
77468352
77478353 for_each_zone(zone)
7748
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7749
- sysctl_min_slab_ratio) / 100;
8354
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8355
+ sysctl_min_slab_ratio) / 100;
77508356 }
77518357
77528358 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7753
- void __user *buffer, size_t *length, loff_t *ppos)
8359
+ void *buffer, size_t *length, loff_t *ppos)
77548360 {
77558361 int rc;
77568362
....@@ -7774,11 +8380,28 @@
77748380 * if in function of the boot time zone sizes.
77758381 */
77768382 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7777
- void __user *buffer, size_t *length, loff_t *ppos)
8383
+ void *buffer, size_t *length, loff_t *ppos)
77788384 {
8385
+ int i;
8386
+
77798387 proc_dointvec_minmax(table, write, buffer, length, ppos);
8388
+
8389
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8390
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8391
+ sysctl_lowmem_reserve_ratio[i] = 0;
8392
+ }
8393
+
77808394 setup_per_zone_lowmem_reserve();
77818395 return 0;
8396
+}
8397
+
8398
+static void __zone_pcp_update(struct zone *zone)
8399
+{
8400
+ unsigned int cpu;
8401
+
8402
+ for_each_possible_cpu(cpu)
8403
+ pageset_set_high_and_batch(zone,
8404
+ per_cpu_ptr(zone->pageset, cpu));
77828405 }
77838406
77848407 /*
....@@ -7787,7 +8410,7 @@
77878410 * pagelist can have before it gets flushed back to buddy allocator.
77888411 */
77898412 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7790
- void __user *buffer, size_t *length, loff_t *ppos)
8413
+ void *buffer, size_t *length, loff_t *ppos)
77918414 {
77928415 struct zone *zone;
77938416 int old_percpu_pagelist_fraction;
....@@ -7812,30 +8435,12 @@
78128435 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
78138436 goto out;
78148437
7815
- for_each_populated_zone(zone) {
7816
- unsigned int cpu;
7817
-
7818
- for_each_possible_cpu(cpu)
7819
- pageset_set_high_and_batch(zone,
7820
- per_cpu_ptr(zone->pageset, cpu));
7821
- }
8438
+ for_each_populated_zone(zone)
8439
+ __zone_pcp_update(zone);
78228440 out:
78238441 mutex_unlock(&pcp_batch_high_lock);
78248442 return ret;
78258443 }
7826
-
7827
-#ifdef CONFIG_NUMA
7828
-int hashdist = HASHDIST_DEFAULT;
7829
-
7830
-static int __init set_hashdist(char *str)
7831
-{
7832
- if (!str)
7833
- return 0;
7834
- hashdist = simple_strtoul(str, &str, 0);
7835
- return 1;
7836
-}
7837
-__setup("hashdist=", set_hashdist);
7838
-#endif
78398444
78408445 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
78418446 /*
....@@ -7883,6 +8488,7 @@
78838488 unsigned long log2qty, size;
78848489 void *table = NULL;
78858490 gfp_t gfp_flags;
8491
+ bool virt;
78868492
78878493 /* allow the kernel cmdline to have a say */
78888494 if (!numentries) {
....@@ -7939,32 +8545,34 @@
79398545
79408546 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
79418547 do {
8548
+ virt = false;
79428549 size = bucketsize << log2qty;
79438550 if (flags & HASH_EARLY) {
79448551 if (flags & HASH_ZERO)
7945
- table = memblock_virt_alloc_nopanic(size, 0);
8552
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
79468553 else
7947
- table = memblock_virt_alloc_raw(size, 0);
7948
- } else if (hashdist) {
7949
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8554
+ table = memblock_alloc_raw(size,
8555
+ SMP_CACHE_BYTES);
8556
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
8557
+ table = __vmalloc(size, gfp_flags);
8558
+ virt = true;
79508559 } else {
79518560 /*
79528561 * If bucketsize is not a power-of-two, we may free
79538562 * some pages at the end of hash table which
79548563 * alloc_pages_exact() automatically does
79558564 */
7956
- if (get_order(size) < MAX_ORDER) {
7957
- table = alloc_pages_exact(size, gfp_flags);
7958
- kmemleak_alloc(table, size, 1, gfp_flags);
7959
- }
8565
+ table = alloc_pages_exact(size, gfp_flags);
8566
+ kmemleak_alloc(table, size, 1, gfp_flags);
79608567 }
79618568 } while (!table && size > PAGE_SIZE && --log2qty);
79628569
79638570 if (!table)
79648571 panic("Failed to allocate %s hash table\n", tablename);
79658572
7966
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7967
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8573
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8574
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8575
+ virt ? "vmalloc" : "linear");
79688576
79698577 if (_hash_shift)
79708578 *_hash_shift = log2qty;
....@@ -7976,47 +8584,50 @@
79768584
79778585 /*
79788586 * This function checks whether pageblock includes unmovable pages or not.
7979
- * If @count is not zero, it is okay to include less @count unmovable pages
79808587 *
79818588 * PageLRU check without isolation or lru_lock could race so that
79828589 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
79838590 * check without lock_page also may miss some movable non-lru pages at
79848591 * race condition. So you can't expect this function should be exact.
8592
+ *
8593
+ * Returns a page without holding a reference. If the caller wants to
8594
+ * dereference that page (e.g., dumping), it has to make sure that it
8595
+ * cannot get removed (e.g., via memory unplug) concurrently.
8596
+ *
79858597 */
7986
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7987
- int migratetype,
7988
- bool skip_hwpoisoned_pages)
8598
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8599
+ int migratetype, int flags)
79898600 {
7990
- unsigned long pfn, iter, found;
8601
+ unsigned long iter = 0;
8602
+ unsigned long pfn = page_to_pfn(page);
8603
+ unsigned long offset = pfn % pageblock_nr_pages;
79918604
7992
- /*
7993
- * TODO we could make this much more efficient by not checking every
7994
- * page in the range if we know all of them are in MOVABLE_ZONE and
7995
- * that the movable zone guarantees that pages are migratable but
7996
- * the later is not the case right now unfortunatelly. E.g. movablecore
7997
- * can still lead to having bootmem allocations in zone_movable.
7998
- */
8605
+ if (is_migrate_cma_page(page)) {
8606
+ /*
8607
+ * CMA allocations (alloc_contig_range) really need to mark
8608
+ * isolate CMA pageblocks even when they are not movable in fact
8609
+ * so consider them movable here.
8610
+ */
8611
+ if (is_migrate_cma(migratetype))
8612
+ return NULL;
79998613
8000
- /*
8001
- * CMA allocations (alloc_contig_range) really need to mark isolate
8002
- * CMA pageblocks even when they are not movable in fact so consider
8003
- * them movable here.
8004
- */
8005
- if (is_migrate_cma(migratetype) &&
8006
- is_migrate_cma(get_pageblock_migratetype(page)))
8007
- return false;
8614
+ return page;
8615
+ }
80088616
8009
- pfn = page_to_pfn(page);
8010
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
8011
- unsigned long check = pfn + iter;
8012
-
8013
- if (!pfn_valid_within(check))
8617
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8618
+ if (!pfn_valid_within(pfn + iter))
80148619 continue;
80158620
8016
- page = pfn_to_page(check);
8621
+ page = pfn_to_page(pfn + iter);
80178622
8623
+ /*
8624
+ * Both, bootmem allocations and memory holes are marked
8625
+ * PG_reserved and are unmovable. We can even have unmovable
8626
+ * allocations inside ZONE_MOVABLE, for example when
8627
+ * specifying "movablecore".
8628
+ */
80188629 if (PageReserved(page))
8019
- goto unmovable;
8630
+ return page;
80208631
80218632 /*
80228633 * If the zone is movable and we have ruled out all reserved
....@@ -8028,17 +8639,22 @@
80288639
80298640 /*
80308641 * Hugepages are not in LRU lists, but they're movable.
8031
- * We need not scan over tail pages bacause we don't
8642
+ * THPs are on the LRU, but need to be counted as #small pages.
8643
+ * We need not scan over tail pages because we don't
80328644 * handle each tail page individually in migration.
80338645 */
8034
- if (PageHuge(page)) {
8646
+ if (PageHuge(page) || PageTransCompound(page)) {
80358647 struct page *head = compound_head(page);
80368648 unsigned int skip_pages;
80378649
8038
- if (!hugepage_migration_supported(page_hstate(head)))
8039
- goto unmovable;
8650
+ if (PageHuge(page)) {
8651
+ if (!hugepage_migration_supported(page_hstate(head)))
8652
+ return page;
8653
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8654
+ return page;
8655
+ }
80408656
8041
- skip_pages = (1 << compound_order(head)) - (page - head);
8657
+ skip_pages = compound_nr(head) - (page - head);
80428658 iter += skip_pages - 1;
80438659 continue;
80448660 }
....@@ -8051,7 +8667,7 @@
80518667 */
80528668 if (!page_ref_count(page)) {
80538669 if (PageBuddy(page))
8054
- iter += (1 << page_order(page)) - 1;
8670
+ iter += (1 << buddy_order(page)) - 1;
80558671 continue;
80568672 }
80578673
....@@ -8059,61 +8675,100 @@
80598675 * The HWPoisoned page may be not in buddy system, and
80608676 * page_count() is not 0.
80618677 */
8062
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8678
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
80638679 continue;
80648680
8065
- if (__PageMovable(page))
8681
+ /*
8682
+ * We treat all PageOffline() pages as movable when offlining
8683
+ * to give drivers a chance to decrement their reference count
8684
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8685
+ * can be offlined as there are no direct references anymore.
8686
+ * For actually unmovable PageOffline() where the driver does
8687
+ * not support this, we will fail later when trying to actually
8688
+ * move these pages that still have a reference count > 0.
8689
+ * (false negatives in this function only)
8690
+ */
8691
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
80668692 continue;
80678693
8068
- if (!PageLRU(page))
8069
- found++;
8694
+ if (__PageMovable(page) || PageLRU(page))
8695
+ continue;
8696
+
80708697 /*
80718698 * If there are RECLAIMABLE pages, we need to check
80728699 * it. But now, memory offline itself doesn't call
80738700 * shrink_node_slabs() and this still needs to be fixed.
80748701 */
8075
- /*
8076
- * If the page is not RAM, page_count()should be 0.
8077
- * we don't need more check. This is an _used_ not-movable page.
8078
- *
8079
- * The problematic thing here is PG_reserved pages. PG_reserved
8080
- * is set to both of a memory hole page and a _used_ kernel
8081
- * page at boot.
8082
- */
8083
- if (found > count)
8084
- goto unmovable;
8702
+ return page;
80858703 }
8086
- return false;
8087
-unmovable:
8088
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8089
- return true;
8704
+ return NULL;
80908705 }
80918706
8092
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8093
-
8707
+#ifdef CONFIG_CONTIG_ALLOC
80948708 static unsigned long pfn_max_align_down(unsigned long pfn)
80958709 {
80968710 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80978711 pageblock_nr_pages) - 1);
80988712 }
80998713
8100
-static unsigned long pfn_max_align_up(unsigned long pfn)
8714
+unsigned long pfn_max_align_up(unsigned long pfn)
81018715 {
81028716 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
81038717 pageblock_nr_pages));
81048718 }
81058719
8720
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8721
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8722
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8723
+static void alloc_contig_dump_pages(struct list_head *page_list)
8724
+{
8725
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8726
+
8727
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8728
+ struct page *page;
8729
+ unsigned long nr_skip = 0;
8730
+ unsigned long nr_pages = 0;
8731
+
8732
+ dump_stack();
8733
+ list_for_each_entry(page, page_list, lru) {
8734
+ nr_pages++;
8735
+ /* The page will be freed by putback_movable_pages soon */
8736
+ if (page_count(page) == 1) {
8737
+ nr_skip++;
8738
+ continue;
8739
+ }
8740
+ dump_page(page, "migration failure");
8741
+ }
8742
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8743
+ }
8744
+}
8745
+#else
8746
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8747
+{
8748
+}
8749
+#endif
8750
+
81068751 /* [start, end) must belong to a single zone. */
81078752 static int __alloc_contig_migrate_range(struct compact_control *cc,
8108
- unsigned long start, unsigned long end)
8753
+ unsigned long start, unsigned long end,
8754
+ struct acr_info *info)
81098755 {
81108756 /* This function is based on compact_zone() from compaction.c. */
8111
- unsigned long nr_reclaimed;
8757
+ unsigned int nr_reclaimed;
81128758 unsigned long pfn = start;
81138759 unsigned int tries = 0;
8760
+ unsigned int max_tries = 5;
81148761 int ret = 0;
8762
+ struct page *page;
8763
+ struct migration_target_control mtc = {
8764
+ .nid = zone_to_nid(cc->zone),
8765
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8766
+ };
81158767
8116
- migrate_prep();
8768
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8769
+ max_tries = 1;
8770
+
8771
+ lru_cache_disable();
81178772
81188773 while (pfn < end || !list_empty(&cc->migratepages)) {
81198774 if (fatal_signal_pending(current)) {
....@@ -8129,20 +8784,39 @@
81298784 break;
81308785 }
81318786 tries = 0;
8132
- } else if (++tries == 5) {
8787
+ } else if (++tries == max_tries) {
81338788 ret = ret < 0 ? ret : -EBUSY;
81348789 break;
81358790 }
81368791
81378792 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
81388793 &cc->migratepages);
8794
+ info->nr_reclaimed += nr_reclaimed;
81398795 cc->nr_migratepages -= nr_reclaimed;
81408796
8141
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8142
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8797
+ list_for_each_entry(page, &cc->migratepages, lru)
8798
+ info->nr_mapped += page_mapcount(page);
8799
+
8800
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8801
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8802
+ if (!ret)
8803
+ info->nr_migrated += cc->nr_migratepages;
81438804 }
8805
+
8806
+ lru_cache_enable();
81448807 if (ret < 0) {
8808
+ if (ret == -EBUSY) {
8809
+ alloc_contig_dump_pages(&cc->migratepages);
8810
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8811
+ }
8812
+
8813
+ if (!list_empty(&cc->migratepages)) {
8814
+ page = list_first_entry(&cc->migratepages, struct page , lru);
8815
+ info->failed_pfn = page_to_pfn(page);
8816
+ }
8817
+
81458818 putback_movable_pages(&cc->migratepages);
8819
+ info->err |= ACR_ERR_MIGRATE;
81468820 return ret;
81478821 }
81488822 return 0;
....@@ -8165,25 +8839,28 @@
81658839 * pageblocks in the range. Once isolated, the pageblocks should not
81668840 * be modified by others.
81678841 *
8168
- * Returns zero on success or negative error code. On success all
8842
+ * Return: zero on success or negative error code. On success all
81698843 * pages which PFN is in [start, end) are allocated for the caller and
81708844 * need to be freed with free_contig_range().
81718845 */
81728846 int alloc_contig_range(unsigned long start, unsigned long end,
8173
- unsigned migratetype, gfp_t gfp_mask)
8847
+ unsigned migratetype, gfp_t gfp_mask,
8848
+ struct acr_info *info)
81748849 {
81758850 unsigned long outer_start, outer_end;
81768851 unsigned int order;
81778852 int ret = 0;
8853
+ bool skip_drain_all_pages = false;
81788854
81798855 struct compact_control cc = {
81808856 .nr_migratepages = 0,
81818857 .order = -1,
81828858 .zone = page_zone(pfn_to_page(start)),
8183
- .mode = MIGRATE_SYNC,
8859
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
81848860 .ignore_skip_hint = true,
81858861 .no_set_skip_hint = true,
81868862 .gfp_mask = current_gfp_context(gfp_mask),
8863
+ .alloc_contig = true,
81878864 };
81888865 INIT_LIST_HEAD(&cc.migratepages);
81898866
....@@ -8212,14 +8889,18 @@
82128889 */
82138890
82148891 ret = start_isolate_page_range(pfn_max_align_down(start),
8215
- pfn_max_align_up(end), migratetype,
8216
- false);
8217
- if (ret)
8892
+ pfn_max_align_up(end), migratetype, 0,
8893
+ &info->failed_pfn);
8894
+ if (ret) {
8895
+ info->err |= ACR_ERR_ISOLATE;
82188896 return ret;
8897
+ }
82198898
8220
-#ifdef CONFIG_CMA
8221
- cc.zone->cma_alloc = 1;
8222
-#endif
8899
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8900
+ &skip_drain_all_pages);
8901
+ if (!skip_drain_all_pages)
8902
+ drain_all_pages(cc.zone);
8903
+
82238904 /*
82248905 * In case of -EBUSY, we'd like to know which page causes problem.
82258906 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8230,8 +8911,8 @@
82308911 * allocated. So, if we fall through be sure to clear ret so that
82318912 * -EBUSY is not accidentally used or returned to caller.
82328913 */
8233
- ret = __alloc_contig_migrate_range(&cc, start, end);
8234
- if (ret && ret != -EBUSY)
8914
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8915
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
82358916 goto done;
82368917 ret =0;
82378918
....@@ -8252,9 +8933,6 @@
82528933 * isolated thus they won't get removed from buddy.
82538934 */
82548935
8255
- lru_add_drain_all();
8256
- drain_all_pages(cc.zone);
8257
-
82588936 order = 0;
82598937 outer_start = start;
82608938 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8266,7 +8944,7 @@
82668944 }
82678945
82688946 if (outer_start != start) {
8269
- order = page_order(pfn_to_page(outer_start));
8947
+ order = buddy_order(pfn_to_page(outer_start));
82708948
82718949 /*
82728950 * outer_start page could be small order buddy page and
....@@ -8279,10 +8957,11 @@
82798957 }
82808958
82818959 /* Make sure the range is really isolated. */
8282
- if (test_pages_isolated(outer_start, end, false)) {
8960
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
82838961 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
82848962 __func__, outer_start, end);
82858963 ret = -EBUSY;
8964
+ info->err |= ACR_ERR_TEST;
82868965 goto done;
82878966 }
82888967
....@@ -8302,13 +8981,114 @@
83028981 done:
83038982 undo_isolate_page_range(pfn_max_align_down(start),
83048983 pfn_max_align_up(end), migratetype);
8305
-#ifdef CONFIG_CMA
8306
- cc.zone->cma_alloc = 0;
8307
-#endif
83088984 return ret;
83098985 }
8986
+EXPORT_SYMBOL(alloc_contig_range);
83108987
8311
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
8988
+static int __alloc_contig_pages(unsigned long start_pfn,
8989
+ unsigned long nr_pages, gfp_t gfp_mask)
8990
+{
8991
+ struct acr_info dummy;
8992
+ unsigned long end_pfn = start_pfn + nr_pages;
8993
+
8994
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
8995
+ gfp_mask, &dummy);
8996
+}
8997
+
8998
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
8999
+ unsigned long nr_pages)
9000
+{
9001
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9002
+ struct page *page;
9003
+
9004
+ for (i = start_pfn; i < end_pfn; i++) {
9005
+ page = pfn_to_online_page(i);
9006
+ if (!page)
9007
+ return false;
9008
+
9009
+ if (page_zone(page) != z)
9010
+ return false;
9011
+
9012
+ if (PageReserved(page))
9013
+ return false;
9014
+
9015
+ if (page_count(page) > 0)
9016
+ return false;
9017
+
9018
+ if (PageHuge(page))
9019
+ return false;
9020
+ }
9021
+ return true;
9022
+}
9023
+
9024
+static bool zone_spans_last_pfn(const struct zone *zone,
9025
+ unsigned long start_pfn, unsigned long nr_pages)
9026
+{
9027
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9028
+
9029
+ return zone_spans_pfn(zone, last_pfn);
9030
+}
9031
+
9032
+/**
9033
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9034
+ * @nr_pages: Number of contiguous pages to allocate
9035
+ * @gfp_mask: GFP mask to limit search and used during compaction
9036
+ * @nid: Target node
9037
+ * @nodemask: Mask for other possible nodes
9038
+ *
9039
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9040
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9041
+ * tried for allocation with alloc_contig_range(). This routine is intended
9042
+ * for allocation requests which can not be fulfilled with the buddy allocator.
9043
+ *
9044
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9045
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9046
+ * (e.g. 1GB request would be aligned to 1GB).
9047
+ *
9048
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9049
+ * __free_page() on each allocated page.
9050
+ *
9051
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9052
+ */
9053
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9054
+ int nid, nodemask_t *nodemask)
9055
+{
9056
+ unsigned long ret, pfn, flags;
9057
+ struct zonelist *zonelist;
9058
+ struct zone *zone;
9059
+ struct zoneref *z;
9060
+
9061
+ zonelist = node_zonelist(nid, gfp_mask);
9062
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9063
+ gfp_zone(gfp_mask), nodemask) {
9064
+ spin_lock_irqsave(&zone->lock, flags);
9065
+
9066
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9067
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9068
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9069
+ /*
9070
+ * We release the zone lock here because
9071
+ * alloc_contig_range() will also lock the zone
9072
+ * at some point. If there's an allocation
9073
+ * spinning on this lock, it may win the race
9074
+ * and cause alloc_contig_range() to fail...
9075
+ */
9076
+ spin_unlock_irqrestore(&zone->lock, flags);
9077
+ ret = __alloc_contig_pages(pfn, nr_pages,
9078
+ gfp_mask);
9079
+ if (!ret)
9080
+ return pfn_to_page(pfn);
9081
+ spin_lock_irqsave(&zone->lock, flags);
9082
+ }
9083
+ pfn += nr_pages;
9084
+ }
9085
+ spin_unlock_irqrestore(&zone->lock, flags);
9086
+ }
9087
+ return NULL;
9088
+}
9089
+#endif /* CONFIG_CONTIG_ALLOC */
9090
+
9091
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
83129092 {
83139093 unsigned int count = 0;
83149094
....@@ -8320,7 +9100,7 @@
83209100 }
83219101 WARN(count != 0, "%d pages are still in use!\n", count);
83229102 }
8323
-#endif
9103
+EXPORT_SYMBOL(free_contig_range);
83249104
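A minimal usage sketch for the two entry points exported above (assuming CONFIG_CONTIG_ALLOC); the wrapper names, the GFP choice and the absence of error handling are assumptions for illustration, not taken from a real caller:

static struct page *example_grab_contig(unsigned long nr_pages, int nid)
{
	/* Finds a suitable PFN range on @nid and migrates existing users out. */
	return alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
}

static void example_release_contig(struct page *page, unsigned long nr_pages)
{
	/* nr_pages must match what was passed at allocation time. */
	free_contig_range(page_to_pfn(page), nr_pages);
}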
83259105 /*
83269106 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8328,11 +9108,8 @@
83289108 */
83299109 void __meminit zone_pcp_update(struct zone *zone)
83309110 {
8331
- unsigned cpu;
83329111 mutex_lock(&pcp_batch_high_lock);
8333
- for_each_possible_cpu(cpu)
8334
- pageset_set_high_and_batch(zone,
8335
- per_cpu_ptr(zone->pageset, cpu));
9112
+ __zone_pcp_update(zone);
83369113 mutex_unlock(&pcp_batch_high_lock);
83379114 }
83389115
....@@ -8343,7 +9120,7 @@
83439120 struct per_cpu_pageset *pset;
83449121
83459122 /* avoid races with drain_pages() */
8346
- local_lock_irqsave(pa_lock, flags);
9123
+ local_irq_save(flags);
83479124 if (zone->pageset != &boot_pageset) {
83489125 for_each_online_cpu(cpu) {
83499126 pset = per_cpu_ptr(zone->pageset, cpu);
....@@ -8352,37 +9129,26 @@
83529129 free_percpu(zone->pageset);
83539130 zone->pageset = &boot_pageset;
83549131 }
8355
- local_unlock_irqrestore(pa_lock, flags);
9132
+ local_irq_restore(flags);
83569133 }
83579134
83589135 #ifdef CONFIG_MEMORY_HOTREMOVE
83599136 /*
8360
- * All pages in the range must be in a single zone and isolated
8361
- * before calling this.
9137
+ * All pages in the range must be in a single zone, must not contain holes,
9138
+ * must span full sections, and must be isolated before calling this function.
83629139 */
8363
-void
8364
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9140
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
83659141 {
9142
+ unsigned long pfn = start_pfn;
83669143 struct page *page;
83679144 struct zone *zone;
8368
- unsigned int order, i;
8369
- unsigned long pfn;
9145
+ unsigned int order;
83709146 unsigned long flags;
8371
- /* find the first valid pfn */
8372
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8373
- if (pfn_valid(pfn))
8374
- break;
8375
- if (pfn == end_pfn)
8376
- return;
9147
+
83779148 offline_mem_sections(pfn, end_pfn);
83789149 zone = page_zone(pfn_to_page(pfn));
83799150 spin_lock_irqsave(&zone->lock, flags);
8380
- pfn = start_pfn;
83819151 while (pfn < end_pfn) {
8382
- if (!pfn_valid(pfn)) {
8383
- pfn++;
8384
- continue;
8385
- }
83869152 page = pfn_to_page(pfn);
83879153 /*
83889154 * The HWPoisoned page may be not in buddy system, and
....@@ -8390,22 +9156,23 @@
83909156 */
83919157 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83929158 pfn++;
8393
- SetPageReserved(page);
9159
+ continue;
9160
+ }
9161
+ /*
9162
+ * At this point all remaining PageOffline() pages have a
9163
+ * reference count of 0 and can simply be skipped.
9164
+ */
9165
+ if (PageOffline(page)) {
9166
+ BUG_ON(page_count(page));
9167
+ BUG_ON(PageBuddy(page));
9168
+ pfn++;
83949169 continue;
83959170 }
83969171
83979172 BUG_ON(page_count(page));
83989173 BUG_ON(!PageBuddy(page));
8399
- order = page_order(page);
8400
-#ifdef CONFIG_DEBUG_VM
8401
- pr_info("remove from free list %lx %d %lx\n",
8402
- pfn, 1 << order, end_pfn);
8403
-#endif
8404
- list_del(&page->lru);
8405
- rmv_page_order(page);
8406
- zone->free_area[order].nr_free--;
8407
- for (i = 0; i < (1 << order); i++)
8408
- SetPageReserved((page+i));
9174
+ order = buddy_order(page);
9175
+ del_page_from_free_list(page, zone, order);
84099176 pfn += (1 << order);
84109177 }
84119178 spin_unlock_irqrestore(&zone->lock, flags);
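A hedged sketch of the call ordering the comment above requires. The real hot-remove path lives in mm/memory_hotplug.c and does far more (migration, notifiers, accounting); the function name and the isolation arguments below are illustrative assumptions, not taken from this patch.

static int offline_range_sketch(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	/* Stop the allocator from handing out pages in this range. */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, MEMORY_OFFLINE);
	if (ret)
		return ret;

	/* ... migrate away any pages still in use (omitted here) ... */

	/* Every page is now free and isolated: pull them off the buddy lists. */
	__offline_isolated_pages(start_pfn, end_pfn);
	return 0;
}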
....@@ -8423,7 +9190,7 @@
84239190 for (order = 0; order < MAX_ORDER; order++) {
84249191 struct page *page_head = page - (pfn & ((1 << order) - 1));
84259192
8426
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9193
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
84279194 break;
84289195 }
84299196 spin_unlock_irqrestore(&zone->lock, flags);
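The page_head computation in this hunk rounds a pfn down to the start of its order-aligned block. A tiny stand-alone illustration with made-up values (user-space C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x12345;
	unsigned int order = 3;
	/* Clear the low 'order' bits: 0x12345 & ~7 == 0x12340. */
	unsigned long head_pfn = pfn - (pfn & ((1UL << order) - 1));

	printf("pfn 0x%lx -> order-%u head 0x%lx\n", pfn, order, head_pfn);
	return 0;
}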
....@@ -8433,30 +9200,87 @@
84339200
84349201 #ifdef CONFIG_MEMORY_FAILURE
84359202 /*
8436
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8437
- * test is performed under the zone lock to prevent a race against page
8438
- * allocation.
9203
+ * Break a higher-order page down into sub-pages, keeping our target page
9204
+ * out of the buddy allocator.
84399205 */
8440
-bool set_hwpoison_free_buddy_page(struct page *page)
9206
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9207
+ struct page *target, int low, int high,
9208
+ int migratetype)
9209
+{
9210
+ unsigned long size = 1 << high;
9211
+ struct page *current_buddy, *next_page;
9212
+
9213
+ while (high > low) {
9214
+ high--;
9215
+ size >>= 1;
9216
+
9217
+ if (target >= &page[size]) {
9218
+ next_page = page + size;
9219
+ current_buddy = page;
9220
+ } else {
9221
+ next_page = page;
9222
+ current_buddy = page + size;
9223
+ }
9224
+
9225
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9226
+ continue;
9227
+
9228
+ if (current_buddy != target) {
9229
+ add_to_free_list(current_buddy, zone, high, migratetype);
9230
+ set_buddy_order(current_buddy, high);
9231
+ page = next_page;
9232
+ }
9233
+ }
9234
+}
9235
+
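To see what the splitting loop above does, here is a small user-space model of the same halving decision, run for an order-3 block with the target at index 5. Page guards are ignored and array indices stand in for struct page pointers; this illustrates the algorithm only and is not kernel code.

#include <stdio.h>

int main(void)
{
	unsigned long page = 0, target = 5;	/* indices stand in for struct page * */
	int low = 0, high = 3;			/* split an order-3 block down to order-0 */
	unsigned long size = 1UL << high;

	while (high > low) {
		unsigned long next_page, buddy;

		high--;
		size >>= 1;

		if (target >= page + size) {	/* target lives in the upper half */
			next_page = page + size;
			buddy = page;		/* lower half goes back on a free list */
		} else {			/* target lives in the lower half */
			next_page = page;
			buddy = page + size;	/* upper half goes back on a free list */
		}

		printf("free block [%lu..%lu] at order %d\n",
		       buddy, buddy + size - 1, high);
		page = next_page;
	}
	printf("page %lu stays out of the free lists\n", target);
	return 0;
}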
9236
+/*
9237
+ * Remove a soon-to-be-poisoned page from the buddy allocator.
9238
+ */
9239
+bool take_page_off_buddy(struct page *page)
84419240 {
84429241 struct zone *zone = page_zone(page);
84439242 unsigned long pfn = page_to_pfn(page);
84449243 unsigned long flags;
84459244 unsigned int order;
8446
- bool hwpoisoned = false;
9245
+ bool ret = false;
84479246
84489247 spin_lock_irqsave(&zone->lock, flags);
84499248 for (order = 0; order < MAX_ORDER; order++) {
84509249 struct page *page_head = page - (pfn & ((1 << order) - 1));
9250
+ int page_order = buddy_order(page_head);
84519251
8452
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8453
- if (!TestSetPageHWPoison(page))
8454
- hwpoisoned = true;
9252
+ if (PageBuddy(page_head) && page_order >= order) {
9253
+ unsigned long pfn_head = page_to_pfn(page_head);
9254
+ int migratetype = get_pfnblock_migratetype(page_head,
9255
+ pfn_head);
9256
+
9257
+ del_page_from_free_list(page_head, zone, page_order);
9258
+ break_down_buddy_pages(zone, page_head, page, 0,
9259
+ page_order, migratetype);
9260
+ if (!is_migrate_isolate(migratetype))
9261
+ __mod_zone_freepage_state(zone, -1, migratetype);
9262
+ ret = true;
84559263 break;
84569264 }
9265
+ if (page_count(page_head) > 0)
9266
+ break;
84579267 }
84589268 spin_unlock_irqrestore(&zone->lock, flags);
8459
-
8460
- return hwpoisoned;
9269
+ return ret;
84619270 }
84629271 #endif
9272
+
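A hedged sketch of how a memory-failure path might use the new helper; the actual caller is in mm/memory-failure.c and differs in detail, and the function name below is invented for illustration.

static bool poison_free_page_sketch(struct page *page)
{
	/* Pull the free page out of the allocator first ... */
	if (!take_page_off_buddy(page))
		return false;	/* lost a race with an allocation */

	/* ... so that, once poisoned, it can never be handed out again. */
	SetPageHWPoison(page);
	return true;
}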
9273
+#ifdef CONFIG_ZONE_DMA
9274
+bool has_managed_dma(void)
9275
+{
9276
+ struct pglist_data *pgdat;
9277
+
9278
+ for_each_online_pgdat(pgdat) {
9279
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9280
+
9281
+ if (managed_zone(zone))
9282
+ return true;
9283
+ }
9284
+ return false;
9285
+}
9286
+#endif /* CONFIG_ZONE_DMA */
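Finally, a hypothetical caller sketch for has_managed_dma(). The helper name and policy are assumptions for illustration only, and the sketch presumes it is built only when ZONE_DMA is configured (a !CONFIG_ZONE_DMA build would need a stub returning false).

static gfp_t sanitize_dma_gfp(gfp_t flags)
{
	/* __GFP_DMA is pointless if no DMA zone has any managed pages. */
	if ((flags & __GFP_DMA) && !has_managed_dma())
		flags &= ~__GFP_DMA;
	return flags;
}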