2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/page_alloc.c
34 *
@@ -16,11 +17,11 @@
1617
1718 #include <linux/stddef.h>
1819 #include <linux/mm.h>
20
+#include <linux/highmem.h>
1921 #include <linux/swap.h>
2022 #include <linux/interrupt.h>
2123 #include <linux/pagemap.h>
2224 #include <linux/jiffies.h>
23
-#include <linux/bootmem.h>
2425 #include <linux/memblock.h>
2526 #include <linux/compiler.h>
2627 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
4344 #include <linux/mempolicy.h>
4445 #include <linux/memremap.h>
4546 #include <linux/stop_machine.h>
47
+#include <linux/random.h>
4648 #include <linux/sort.h>
4749 #include <linux/pfn.h>
4850 #include <linux/backing-dev.h>
4951 #include <linux/fault-inject.h>
5052 #include <linux/page-isolation.h>
51
-#include <linux/page_ext.h>
5253 #include <linux/debugobjects.h>
5354 #include <linux/kmemleak.h>
5455 #include <linux/compaction.h>
@@ -60,20 +61,65 @@
6061 #include <linux/hugetlb.h>
6162 #include <linux/sched/rt.h>
6263 #include <linux/sched/mm.h>
63
-#include <linux/locallock.h>
64
+#include <linux/local_lock.h>
6465 #include <linux/page_owner.h>
66
+#include <linux/page_pinner.h>
6567 #include <linux/kthread.h>
6668 #include <linux/memcontrol.h>
6769 #include <linux/ftrace.h>
6870 #include <linux/lockdep.h>
6971 #include <linux/nmi.h>
70
-#include <linux/khugepaged.h>
7172 #include <linux/psi.h>
73
+#include <linux/padata.h>
74
+#include <linux/khugepaged.h>
75
+#include <trace/hooks/mm.h>
76
+#include <trace/hooks/vmscan.h>
7277
7378 #include <asm/sections.h>
7479 #include <asm/tlbflush.h>
7580 #include <asm/div64.h>
7681 #include "internal.h"
82
+#include "shuffle.h"
83
+#include "page_reporting.h"
84
+
85
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
86
+typedef int __bitwise fpi_t;
87
+
88
+/* No special request */
89
+#define FPI_NONE ((__force fpi_t)0)
90
+
91
+/*
92
+ * Skip free page reporting notification for the (possibly merged) page.
93
+ * This does not hinder free page reporting from grabbing the page,
94
+ * reporting it and marking it "reported" - it only skips notifying
95
+ * the free page reporting infrastructure about a newly freed page. For
96
+ * example, used when temporarily pulling a page from a freelist and
97
+ * putting it back unmodified.
98
+ */
99
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
100
+
101
+/*
102
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
103
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
104
+ * shuffle the whole zone).
105
+ *
106
+ * Note: No code should rely on this flag for correctness - it's purely
107
+ * to allow for optimizations when handing back either fresh pages
108
+ * (memory onlining) or untouched pages (page isolation, free page
109
+ * reporting).
110
+ */
111
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
112
+
113
+/*
114
+ * Don't poison memory with KASAN (only for the tag-based modes).
115
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
116
+ * Poisoning all that memory lengthens boot time, especially on systems with
117
+ * large amount of RAM. This flag is used to skip that poisoning.
118
+ * This is only done for the tag-based KASAN modes, as those are able to
119
+ * detect memory corruptions with the memory tags assigned by default.
120
+ * All memory allocated normally after boot gets poisoned as usual.
121
+ */
122
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
77123
78124 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
79125 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -95,12 +141,15 @@
95141 */
96142 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
97143 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
98
-int _node_numa_mem_[MAX_NUMNODES];
99144 #endif
100145
101146 /* work_structs for global per-cpu drains */
102
-DEFINE_MUTEX(pcpu_drain_mutex);
103
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
147
+struct pcpu_drain {
148
+ struct zone *zone;
149
+ struct work_struct work;
150
+};
151
+static DEFINE_MUTEX(pcpu_drain_mutex);
152
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
104153
105154 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
106155 volatile unsigned long latent_entropy __latent_entropy;
@@ -124,62 +173,33 @@
124173 };
125174 EXPORT_SYMBOL(node_states);
126175
127
-/* Protect totalram_pages and zone->managed_pages */
128
-static DEFINE_SPINLOCK(managed_page_count_lock);
129
-
130
-unsigned long totalram_pages __read_mostly;
176
+atomic_long_t _totalram_pages __read_mostly;
177
+EXPORT_SYMBOL(_totalram_pages);
131178 unsigned long totalreserve_pages __read_mostly;
132179 unsigned long totalcma_pages __read_mostly;
133180
134181 int percpu_pagelist_fraction;
135182 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
136
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
137
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
138
-#else
139183 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
140
-#endif
141184 EXPORT_SYMBOL(init_on_alloc);
142185
143
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
144
-DEFINE_STATIC_KEY_TRUE(init_on_free);
145
-#else
146186 DEFINE_STATIC_KEY_FALSE(init_on_free);
147
-#endif
148187 EXPORT_SYMBOL(init_on_free);
149188
189
+static bool _init_on_alloc_enabled_early __read_mostly
190
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
150191 static int __init early_init_on_alloc(char *buf)
151192 {
152
- int ret;
153
- bool bool_result;
154193
155
- if (!buf)
156
- return -EINVAL;
157
- ret = kstrtobool(buf, &bool_result);
158
- if (bool_result && page_poisoning_enabled())
159
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
160
- if (bool_result)
161
- static_branch_enable(&init_on_alloc);
162
- else
163
- static_branch_disable(&init_on_alloc);
164
- return ret;
194
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
165195 }
166196 early_param("init_on_alloc", early_init_on_alloc);
167197
198
+static bool _init_on_free_enabled_early __read_mostly
199
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
168200 static int __init early_init_on_free(char *buf)
169201 {
170
- int ret;
171
- bool bool_result;
172
-
173
- if (!buf)
174
- return -EINVAL;
175
- ret = kstrtobool(buf, &bool_result);
176
- if (bool_result && page_poisoning_enabled())
177
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
178
- if (bool_result)
179
- static_branch_enable(&init_on_free);
180
- else
181
- static_branch_disable(&init_on_free);
182
- return ret;
202
+ return kstrtobool(buf, &_init_on_free_enabled_early);
183203 }
184204 early_param("init_on_free", early_init_on_free);
185205
@@ -243,7 +263,8 @@
243263 unsigned int pageblock_order __read_mostly;
244264 #endif
245265
246
-static void __free_pages_ok(struct page *page, unsigned int order);
266
+static void __free_pages_ok(struct page *page, unsigned int order,
267
+ fpi_t fpi_flags);
247268
248269 /*
249270 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -270,8 +291,6 @@
270291 [ZONE_MOVABLE] = 0,
271292 };
272293
273
-EXPORT_SYMBOL(totalram_pages);
274
-
275294 static char * const zone_names[MAX_NR_ZONES] = {
276295 #ifdef CONFIG_ZONE_DMA
277296 "DMA",
@@ -289,7 +308,7 @@
289308 #endif
290309 };
291310
292
-char * const migratetype_names[MIGRATE_TYPES] = {
311
+const char * const migratetype_names[MIGRATE_TYPES] = {
293312 "Unmovable",
294313 "Movable",
295314 "Reclaimable",
@@ -302,14 +321,14 @@
302321 #endif
303322 };
304323
305
-compound_page_dtor * const compound_page_dtors[] = {
306
- NULL,
307
- free_compound_page,
324
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
325
+ [NULL_COMPOUND_DTOR] = NULL,
326
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
308327 #ifdef CONFIG_HUGETLB_PAGE
309
- free_huge_page,
328
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
310329 #endif
311330 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
312
- free_transhuge_page,
331
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
313332 #endif
314333 };
315334
@@ -320,6 +339,20 @@
320339 */
321340 int min_free_kbytes = 1024;
322341 int user_min_free_kbytes = -1;
342
+#ifdef CONFIG_DISCONTIGMEM
343
+/*
344
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
345
+ * are not on separate NUMA nodes. Functionally this works but with
346
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
347
+ * quite small. By default, do not boost watermarks on discontigmem as in
348
+ * many cases very high-order allocations like THP are likely to be
349
+ * unsupported and the premature reclaim offsets the advantage of long-term
350
+ * fragmentation avoidance.
351
+ */
352
+int watermark_boost_factor __read_mostly;
353
+#else
354
+int watermark_boost_factor __read_mostly = 15000;
355
+#endif
323356 int watermark_scale_factor = 10;
324357
325358 /*
@@ -329,43 +362,36 @@
329362 */
330363 int extra_free_kbytes = 0;
331364
332
-static unsigned long nr_kernel_pages __meminitdata;
333
-static unsigned long nr_all_pages __meminitdata;
334
-static unsigned long dma_reserve __meminitdata;
365
+static unsigned long nr_kernel_pages __initdata;
366
+static unsigned long nr_all_pages __initdata;
367
+static unsigned long dma_reserve __initdata;
335368
336
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
337
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
338
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
369
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
370
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
339371 static unsigned long required_kernelcore __initdata;
340372 static unsigned long required_kernelcore_percent __initdata;
341373 static unsigned long required_movablecore __initdata;
342374 static unsigned long required_movablecore_percent __initdata;
343
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
375
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
344376 static bool mirrored_kernelcore __meminitdata;
345377
346378 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
347379 int movable_zone;
348380 EXPORT_SYMBOL(movable_zone);
349
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
350381
351382 #if MAX_NUMNODES > 1
352
-int nr_node_ids __read_mostly = MAX_NUMNODES;
353
-int nr_online_nodes __read_mostly = 1;
383
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
384
+unsigned int nr_online_nodes __read_mostly = 1;
354385 EXPORT_SYMBOL(nr_node_ids);
355386 EXPORT_SYMBOL(nr_online_nodes);
356387 #endif
357388
358
-static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
359
-
360
-#ifdef CONFIG_PREEMPT_RT_BASE
361
-# define cpu_lock_irqsave(cpu, flags) \
362
- local_lock_irqsave_on(pa_lock, flags, cpu)
363
-# define cpu_unlock_irqrestore(cpu, flags) \
364
- local_unlock_irqrestore_on(pa_lock, flags, cpu)
365
-#else
366
-# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
367
-# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
368
-#endif
389
+struct pa_lock {
390
+ local_lock_t l;
391
+};
392
+static DEFINE_PER_CPU(struct pa_lock, pa_lock) = {
393
+ .l = INIT_LOCAL_LOCK(l),
394
+};
369395
370396 int page_group_by_mobility_disabled __read_mostly;
371397
@@ -378,7 +404,7 @@
378404 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
379405
380406 /*
381
- * Calling kasan_free_pages() only after deferred memory initialization
407
+ * Calling kasan_poison_pages() only after deferred memory initialization
382408 * has completed. Poisoning pages during deferred memory init will greatly
383409 * lengthen the process and cause problem in large memory systems as the
384410 * deferred pages initialization is done with interrupt disabled.
@@ -390,10 +416,12 @@
390416 * on-demand allocation and then freed again before the deferred pages
391417 * initialization is done, but this is not likely to happen.
392418 */
393
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
419
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
394420 {
395
- if (!static_branch_unlikely(&deferred_pages))
396
- kasan_free_pages(page, order);
421
+ return static_branch_unlikely(&deferred_pages) ||
422
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
423
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
424
+ PageSkipKASanPoison(page);
397425 }
398426
399427 /* Returns true if the struct page for the pfn is uninitialised */
@@ -408,38 +436,57 @@
408436 }
409437
410438 /*
411
- * Returns false when the remaining initialisation should be deferred until
439
+ * Returns true when the remaining initialisation should be deferred until
412440 * later in the boot cycle when it can be parallelised.
413441 */
414
-static inline bool update_defer_init(pg_data_t *pgdat,
415
- unsigned long pfn, unsigned long zone_end,
416
- unsigned long *nr_initialised)
442
+static bool __meminit
443
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
417444 {
418
- /* Always populate low zones for address-constrained allocations */
419
- if (zone_end < pgdat_end_pfn(pgdat))
420
- return true;
421
- (*nr_initialised)++;
422
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
423
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
424
- pgdat->first_deferred_pfn = pfn;
425
- return false;
445
+ static unsigned long prev_end_pfn, nr_initialised;
446
+
447
+ /*
448
+ * prev_end_pfn static that contains the end of previous zone
449
+ * No need to protect because called very early in boot before smp_init.
450
+ */
451
+ if (prev_end_pfn != end_pfn) {
452
+ prev_end_pfn = end_pfn;
453
+ nr_initialised = 0;
426454 }
427455
428
- return true;
456
+ /* Always populate low zones for address-constrained allocations */
457
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
458
+ return false;
459
+
460
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
461
+ return true;
462
+ /*
463
+ * We start only with one section of pages, more pages are added as
464
+ * needed until the rest of deferred pages are initialized.
465
+ */
466
+ nr_initialised++;
467
+ if ((nr_initialised > PAGES_PER_SECTION) &&
468
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
469
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
470
+ return true;
471
+ }
472
+ return false;
429473 }
430474 #else
431
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
475
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
476
+{
477
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
478
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
479
+ PageSkipKASanPoison(page);
480
+}
432481
433482 static inline bool early_page_uninitialised(unsigned long pfn)
434483 {
435484 return false;
436485 }
437486
438
-static inline bool update_defer_init(pg_data_t *pgdat,
439
- unsigned long pfn, unsigned long zone_end,
440
- unsigned long *nr_initialised)
487
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
441488 {
442
- return true;
489
+ return false;
443490 }
444491 #endif
445492
@@ -448,7 +495,7 @@
448495 unsigned long pfn)
449496 {
450497 #ifdef CONFIG_SPARSEMEM
451
- return __pfn_to_section(pfn)->pageblock_flags;
498
+ return section_to_usemap(__pfn_to_section(pfn));
452499 #else
453500 return page_zone(page)->pageblock_flags;
454501 #endif /* CONFIG_SPARSEMEM */
@@ -458,25 +505,23 @@
458505 {
459506 #ifdef CONFIG_SPARSEMEM
460507 pfn &= (PAGES_PER_SECTION-1);
461
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
462508 #else
463509 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
464
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
465510 #endif /* CONFIG_SPARSEMEM */
511
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
466512 }
467513
468514 /**
469515 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
470516 * @page: The page within the block of interest
471517 * @pfn: The target page frame number
472
- * @end_bitidx: The last bit of interest to retrieve
473518 * @mask: mask of bits that the caller is interested in
474519 *
475520 * Return: pageblock_bits flags
476521 */
477
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
522
+static __always_inline
523
+unsigned long __get_pfnblock_flags_mask(struct page *page,
478524 unsigned long pfn,
479
- unsigned long end_bitidx,
480525 unsigned long mask)
481526 {
482527 unsigned long *bitmap;
@@ -489,20 +534,36 @@
489534 bitidx &= (BITS_PER_LONG-1);
490535
491536 word = bitmap[word_bitidx];
492
- bitidx += end_bitidx;
493
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
537
+ return (word >> bitidx) & mask;
494538 }
495539
496540 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
497
- unsigned long end_bitidx,
498541 unsigned long mask)
499542 {
500
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
543
+ return __get_pfnblock_flags_mask(page, pfn, mask);
501544 }
545
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
546
+
547
+int isolate_anon_lru_page(struct page *page)
548
+{
549
+ int ret;
550
+
551
+ if (!PageLRU(page) || !PageAnon(page))
552
+ return -EINVAL;
553
+
554
+ if (!get_page_unless_zero(page))
555
+ return -EINVAL;
556
+
557
+ ret = isolate_lru_page(page);
558
+ put_page(page);
559
+
560
+ return ret;
561
+}
562
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
502563
503564 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
504565 {
505
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
566
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
506567 }
507568
508569 /**
@@ -510,12 +571,10 @@
510571 * @page: The page within the block of interest
511572 * @flags: The flags to set
512573 * @pfn: The target page frame number
513
- * @end_bitidx: The last bit of interest
514574 * @mask: mask of bits that the caller is interested in
515575 */
516576 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
517577 unsigned long pfn,
518
- unsigned long end_bitidx,
519578 unsigned long mask)
520579 {
521580 unsigned long *bitmap;
@@ -523,6 +582,7 @@
523582 unsigned long old_word, word;
524583
525584 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
585
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
526586
527587 bitmap = get_pageblock_bitmap(page, pfn);
528588 bitidx = pfn_to_bitidx(page, pfn);
@@ -531,9 +591,8 @@
531591
532592 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
533593
534
- bitidx += end_bitidx;
535
- mask <<= (BITS_PER_LONG - bitidx - 1);
536
- flags <<= (BITS_PER_LONG - bitidx - 1);
594
+ mask <<= bitidx;
595
+ flags <<= bitidx;
537596
538597 word = READ_ONCE(bitmap[word_bitidx]);
539598 for (;;) {
@@ -550,8 +609,8 @@
550609 migratetype < MIGRATE_PCPTYPES))
551610 migratetype = MIGRATE_UNMOVABLE;
552611
553
- set_pageblock_flags_group(page, (unsigned long)migratetype,
554
- PB_migrate, PB_migrate_end);
612
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
613
+ page_to_pfn(page), MIGRATETYPE_MASK);
555614 }
556615
557616 #ifdef CONFIG_DEBUG_VM
@@ -606,8 +665,7 @@
606665 }
607666 #endif
608667
609
-static void bad_page(struct page *page, const char *reason,
610
- unsigned long bad_flags)
668
+static void bad_page(struct page *page, const char *reason)
611669 {
612670 static unsigned long resume;
613671 static unsigned long nr_shown;
@@ -636,10 +694,6 @@
636694 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
637695 current->comm, page_to_pfn(page));
638696 __dump_page(page, reason);
639
- bad_flags &= page->flags;
640
- if (bad_flags)
641
- pr_alert("bad because of flags: %#lx(%pGp)\n",
642
- bad_flags, &bad_flags);
643697 dump_page_owner(page);
644698
645699 print_modules();
@@ -667,7 +721,8 @@
667721
668722 void free_compound_page(struct page *page)
669723 {
670
- __free_pages_ok(page, compound_order(page));
724
+ mem_cgroup_uncharge(page);
725
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
671726 }
672727
673728 void prep_compound_page(struct page *page, unsigned int order)
@@ -675,8 +730,6 @@
675730 int i;
676731 int nr_pages = 1 << order;
677732
678
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
679
- set_compound_order(page, order);
680733 __SetPageHead(page);
681734 for (i = 1; i < nr_pages; i++) {
682735 struct page *p = page + i;
@@ -684,51 +737,30 @@
684737 p->mapping = TAIL_MAPPING;
685738 set_compound_head(p, page);
686739 }
740
+
741
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
742
+ set_compound_order(page, order);
687743 atomic_set(compound_mapcount_ptr(page), -1);
744
+ if (hpage_pincount_available(page))
745
+ atomic_set(compound_pincount_ptr(page), 0);
688746 }
689747
690748 #ifdef CONFIG_DEBUG_PAGEALLOC
691749 unsigned int _debug_guardpage_minorder;
692
-bool _debug_pagealloc_enabled __read_mostly
750
+
751
+bool _debug_pagealloc_enabled_early __read_mostly
693752 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
753
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
754
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
694755 EXPORT_SYMBOL(_debug_pagealloc_enabled);
695
-bool _debug_guardpage_enabled __read_mostly;
756
+
757
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
696758
697759 static int __init early_debug_pagealloc(char *buf)
698760 {
699
- if (!buf)
700
- return -EINVAL;
701
- return kstrtobool(buf, &_debug_pagealloc_enabled);
761
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
702762 }
703763 early_param("debug_pagealloc", early_debug_pagealloc);
704
-
705
-static bool need_debug_guardpage(void)
706
-{
707
- /* If we don't use debug_pagealloc, we don't need guard page */
708
- if (!debug_pagealloc_enabled())
709
- return false;
710
-
711
- if (!debug_guardpage_minorder())
712
- return false;
713
-
714
- return true;
715
-}
716
-
717
-static void init_debug_guardpage(void)
718
-{
719
- if (!debug_pagealloc_enabled())
720
- return;
721
-
722
- if (!debug_guardpage_minorder())
723
- return;
724
-
725
- _debug_guardpage_enabled = true;
726
-}
727
-
728
-struct page_ext_operations debug_guardpage_ops = {
729
- .need = need_debug_guardpage,
730
- .init = init_debug_guardpage,
731
-};
732764
733765 static int __init debug_guardpage_minorder_setup(char *buf)
734766 {
@@ -747,20 +779,13 @@
747779 static inline bool set_page_guard(struct zone *zone, struct page *page,
748780 unsigned int order, int migratetype)
749781 {
750
- struct page_ext *page_ext;
751
-
752782 if (!debug_guardpage_enabled())
753783 return false;
754784
755785 if (order >= debug_guardpage_minorder())
756786 return false;
757787
758
- page_ext = lookup_page_ext(page);
759
- if (unlikely(!page_ext))
760
- return false;
761
-
762
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
763
-
788
+ __SetPageGuard(page);
764789 INIT_LIST_HEAD(&page->lru);
765790 set_page_private(page, order);
766791 /* Guard pages are not available for any usage */
@@ -772,39 +797,77 @@
772797 static inline void clear_page_guard(struct zone *zone, struct page *page,
773798 unsigned int order, int migratetype)
774799 {
775
- struct page_ext *page_ext;
776
-
777800 if (!debug_guardpage_enabled())
778801 return;
779802
780
- page_ext = lookup_page_ext(page);
781
- if (unlikely(!page_ext))
782
- return;
783
-
784
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
803
+ __ClearPageGuard(page);
785804
786805 set_page_private(page, 0);
787806 if (!is_migrate_isolate(migratetype))
788807 __mod_zone_freepage_state(zone, (1 << order), migratetype);
789808 }
790809 #else
791
-struct page_ext_operations debug_guardpage_ops;
792810 static inline bool set_page_guard(struct zone *zone, struct page *page,
793811 unsigned int order, int migratetype) { return false; }
794812 static inline void clear_page_guard(struct zone *zone, struct page *page,
795813 unsigned int order, int migratetype) {}
796814 #endif
797815
798
-static inline void set_page_order(struct page *page, unsigned int order)
816
+/*
817
+ * Enable static keys related to various memory debugging and hardening options.
818
+ * Some override others, and depend on early params that are evaluated in the
819
+ * order of appearance. So we need to first gather the full picture of what was
820
+ * enabled, and then make decisions.
821
+ */
822
+void init_mem_debugging_and_hardening(void)
823
+{
824
+ bool page_poisoning_requested = false;
825
+
826
+#ifdef CONFIG_PAGE_POISONING
827
+ /*
828
+ * Page poisoning is debug page alloc for some arches. If
829
+ * either of those options are enabled, enable poisoning.
830
+ */
831
+ if (page_poisoning_enabled() ||
832
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
833
+ debug_pagealloc_enabled())) {
834
+ static_branch_enable(&_page_poisoning_enabled);
835
+ page_poisoning_requested = true;
836
+ }
837
+#endif
838
+
839
+ if (_init_on_alloc_enabled_early) {
840
+ if (page_poisoning_requested)
841
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
842
+ "will take precedence over init_on_alloc\n");
843
+ else
844
+ static_branch_enable(&init_on_alloc);
845
+ }
846
+ if (_init_on_free_enabled_early) {
847
+ if (page_poisoning_requested)
848
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
849
+ "will take precedence over init_on_free\n");
850
+ else
851
+ static_branch_enable(&init_on_free);
852
+ }
853
+
854
+#ifdef CONFIG_DEBUG_PAGEALLOC
855
+ if (!debug_pagealloc_enabled())
856
+ return;
857
+
858
+ static_branch_enable(&_debug_pagealloc_enabled);
859
+
860
+ if (!debug_guardpage_minorder())
861
+ return;
862
+
863
+ static_branch_enable(&_debug_guardpage_enabled);
864
+#endif
865
+}
866
+
867
+static inline void set_buddy_order(struct page *page, unsigned int order)
799868 {
800869 set_page_private(page, order);
801870 __SetPageBuddy(page);
802
-}
803
-
804
-static inline void rmv_page_order(struct page *page)
805
-{
806
- __ClearPageBuddy(page);
807
- set_page_private(page, 0);
808871 }
809872
810873 /*
@@ -820,32 +883,151 @@
820883 *
821884 * For recording page's order, we use page_private(page).
822885 */
823
-static inline int page_is_buddy(struct page *page, struct page *buddy,
886
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
824887 unsigned int order)
825888 {
826
- if (page_is_guard(buddy) && page_order(buddy) == order) {
827
- if (page_zone_id(page) != page_zone_id(buddy))
828
- return 0;
889
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
890
+ return false;
829891
830
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
892
+ if (buddy_order(buddy) != order)
893
+ return false;
831894
832
- return 1;
833
- }
895
+ /*
896
+ * zone check is done late to avoid uselessly calculating
897
+ * zone/node ids for pages that could never merge.
898
+ */
899
+ if (page_zone_id(page) != page_zone_id(buddy))
900
+ return false;
834901
835
- if (PageBuddy(buddy) && page_order(buddy) == order) {
836
- /*
837
- * zone check is done late to avoid uselessly
838
- * calculating zone/node ids for pages that could
839
- * never merge.
840
- */
841
- if (page_zone_id(page) != page_zone_id(buddy))
842
- return 0;
902
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
843903
844
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
904
+ return true;
905
+}
845906
846
- return 1;
847
- }
848
- return 0;
907
+#ifdef CONFIG_COMPACTION
908
+static inline struct capture_control *task_capc(struct zone *zone)
909
+{
910
+ struct capture_control *capc = current->capture_control;
911
+
912
+ return unlikely(capc) &&
913
+ !(current->flags & PF_KTHREAD) &&
914
+ !capc->page &&
915
+ capc->cc->zone == zone ? capc : NULL;
916
+}
917
+
918
+static inline bool
919
+compaction_capture(struct capture_control *capc, struct page *page,
920
+ int order, int migratetype)
921
+{
922
+ if (!capc || order != capc->cc->order)
923
+ return false;
924
+
925
+ /* Do not accidentally pollute CMA or isolated regions*/
926
+ if (is_migrate_cma(migratetype) ||
927
+ is_migrate_isolate(migratetype))
928
+ return false;
929
+
930
+ /*
931
+ * Do not let lower order allocations pollute a movable pageblock.
932
+ * This might let an unmovable request use a reclaimable pageblock
933
+ * and vice-versa but no more than normal fallback logic which can
934
+ * have trouble finding a high-order free page.
935
+ */
936
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
937
+ return false;
938
+
939
+ capc->page = page;
940
+ return true;
941
+}
942
+
943
+#else
944
+static inline struct capture_control *task_capc(struct zone *zone)
945
+{
946
+ return NULL;
947
+}
948
+
949
+static inline bool
950
+compaction_capture(struct capture_control *capc, struct page *page,
951
+ int order, int migratetype)
952
+{
953
+ return false;
954
+}
955
+#endif /* CONFIG_COMPACTION */
956
+
957
+/* Used for pages not on another list */
958
+static inline void add_to_free_list(struct page *page, struct zone *zone,
959
+ unsigned int order, int migratetype)
960
+{
961
+ struct free_area *area = &zone->free_area[order];
962
+
963
+ list_add(&page->lru, &area->free_list[migratetype]);
964
+ area->nr_free++;
965
+}
966
+
967
+/* Used for pages not on another list */
968
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
969
+ unsigned int order, int migratetype)
970
+{
971
+ struct free_area *area = &zone->free_area[order];
972
+
973
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
974
+ area->nr_free++;
975
+}
976
+
977
+/*
978
+ * Used for pages which are on another list. Move the pages to the tail
979
+ * of the list - so the moved pages won't immediately be considered for
980
+ * allocation again (e.g., optimization for memory onlining).
981
+ */
982
+static inline void move_to_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order, int migratetype)
984
+{
985
+ struct free_area *area = &zone->free_area[order];
986
+
987
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
988
+}
989
+
990
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
991
+ unsigned int order)
992
+{
993
+ /* clear reported state and update reported page count */
994
+ if (page_reported(page))
995
+ __ClearPageReported(page);
996
+
997
+ list_del(&page->lru);
998
+ __ClearPageBuddy(page);
999
+ set_page_private(page, 0);
1000
+ zone->free_area[order].nr_free--;
1001
+}
1002
+
1003
+/*
1004
+ * If this is not the largest possible page, check if the buddy
1005
+ * of the next-highest order is free. If it is, it's possible
1006
+ * that pages are being freed that will coalesce soon. In case,
1007
+ * that is happening, add the free page to the tail of the list
1008
+ * so it's less likely to be used soon and more likely to be merged
1009
+ * as a higher order page
1010
+ */
1011
+static inline bool
1012
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1013
+ struct page *page, unsigned int order)
1014
+{
1015
+ struct page *higher_page, *higher_buddy;
1016
+ unsigned long combined_pfn;
1017
+
1018
+ if (order >= MAX_ORDER - 2)
1019
+ return false;
1020
+
1021
+ if (!pfn_valid_within(buddy_pfn))
1022
+ return false;
1023
+
1024
+ combined_pfn = buddy_pfn & pfn;
1025
+ higher_page = page + (combined_pfn - pfn);
1026
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1027
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1028
+
1029
+ return pfn_valid_within(buddy_pfn) &&
1030
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8491031 }
8501032
8511033 /*
@@ -875,12 +1057,14 @@
8751057 static inline void __free_one_page(struct page *page,
8761058 unsigned long pfn,
8771059 struct zone *zone, unsigned int order,
878
- int migratetype)
1060
+ int migratetype, fpi_t fpi_flags)
8791061 {
1062
+ struct capture_control *capc = task_capc(zone);
1063
+ unsigned long buddy_pfn;
8801064 unsigned long combined_pfn;
881
- unsigned long uninitialized_var(buddy_pfn);
882
- struct page *buddy;
8831065 unsigned int max_order;
1066
+ struct page *buddy;
1067
+ bool to_tail;
8841068
8851069 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8861070
@@ -896,6 +1080,11 @@
8961080
8971081 continue_merging:
8981082 while (order < max_order) {
1083
+ if (compaction_capture(capc, page, order, migratetype)) {
1084
+ __mod_zone_freepage_state(zone, -(1 << order),
1085
+ migratetype);
1086
+ return;
1087
+ }
8991088 buddy_pfn = __find_buddy_pfn(pfn, order);
9001089 buddy = page + (buddy_pfn - pfn);
9011090
@@ -907,13 +1096,10 @@
9071096 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
9081097 * merge with it and move up one order.
9091098 */
910
- if (page_is_guard(buddy)) {
1099
+ if (page_is_guard(buddy))
9111100 clear_page_guard(zone, buddy, order, migratetype);
912
- } else {
913
- list_del(&buddy->lru);
914
- zone->free_area[order].nr_free--;
915
- rmv_page_order(buddy);
916
- }
1101
+ else
1102
+ del_page_from_free_list(buddy, zone, order);
9171103 combined_pfn = buddy_pfn & pfn;
9181104 page = page + (combined_pfn - pfn);
9191105 pfn = combined_pfn;
@@ -945,33 +1131,23 @@
9451131 }
9461132
9471133 done_merging:
948
- set_page_order(page, order);
1134
+ set_buddy_order(page, order);
9491135
950
- /*
951
- * If this is not the largest possible page, check if the buddy
952
- * of the next-highest order is free. If it is, it's possible
953
- * that pages are being freed that will coalesce soon. In case,
954
- * that is happening, add the free page to the tail of the list
955
- * so it's less likely to be used soon and more likely to be merged
956
- * as a higher order page
957
- */
958
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
959
- struct page *higher_page, *higher_buddy;
960
- combined_pfn = buddy_pfn & pfn;
961
- higher_page = page + (combined_pfn - pfn);
962
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
963
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
964
- if (pfn_valid_within(buddy_pfn) &&
965
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
966
- list_add_tail(&page->lru,
967
- &zone->free_area[order].free_list[migratetype]);
968
- goto out;
969
- }
970
- }
1136
+ if (fpi_flags & FPI_TO_TAIL)
1137
+ to_tail = true;
1138
+ else if (is_shuffle_order(order))
1139
+ to_tail = shuffle_pick_tail();
1140
+ else
1141
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9711142
972
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
973
-out:
974
- zone->free_area[order].nr_free++;
1143
+ if (to_tail)
1144
+ add_to_free_list_tail(page, zone, order, migratetype);
1145
+ else
1146
+ add_to_free_list(page, zone, order, migratetype);
1147
+
1148
+ /* Notify page reporting subsystem of freed page */
1149
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1150
+ page_reporting_notify_free(order);
9751151 }
9761152
9771153 /*
@@ -996,13 +1172,9 @@
9961172 return true;
9971173 }
9981174
999
-static void free_pages_check_bad(struct page *page)
1175
+static const char *page_bad_reason(struct page *page, unsigned long flags)
10001176 {
1001
- const char *bad_reason;
1002
- unsigned long bad_flags;
1003
-
1004
- bad_reason = NULL;
1005
- bad_flags = 0;
1177
+ const char *bad_reason = NULL;
10061178
10071179 if (unlikely(atomic_read(&page->_mapcount) != -1))
10081180 bad_reason = "nonzero mapcount";
@@ -1010,24 +1182,32 @@
10101182 bad_reason = "non-NULL mapping";
10111183 if (unlikely(page_ref_count(page) != 0))
10121184 bad_reason = "nonzero _refcount";
1013
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1014
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1015
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1185
+ if (unlikely(page->flags & flags)) {
1186
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1187
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1188
+ else
1189
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10161190 }
10171191 #ifdef CONFIG_MEMCG
10181192 if (unlikely(page->mem_cgroup))
10191193 bad_reason = "page still charged to cgroup";
10201194 #endif
1021
- bad_page(page, bad_reason, bad_flags);
1195
+ return bad_reason;
10221196 }
10231197
1024
-static inline int free_pages_check(struct page *page)
1198
+static void check_free_page_bad(struct page *page)
1199
+{
1200
+ bad_page(page,
1201
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1202
+}
1203
+
1204
+static inline int check_free_page(struct page *page)
10251205 {
10261206 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10271207 return 0;
10281208
10291209 /* Something has gone sideways, find it */
1030
- free_pages_check_bad(page);
1210
+ check_free_page_bad(page);
10311211 return 1;
10321212 }
10331213
@@ -1049,7 +1229,7 @@
10491229 case 1:
10501230 /* the first tail page: ->mapping may be compound_mapcount() */
10511231 if (unlikely(compound_mapcount(page))) {
1052
- bad_page(page, "nonzero compound_mapcount", 0);
1232
+ bad_page(page, "nonzero compound_mapcount");
10531233 goto out;
10541234 }
10551235 break;
@@ -1061,17 +1241,17 @@
10611241 break;
10621242 default:
10631243 if (page->mapping != TAIL_MAPPING) {
1064
- bad_page(page, "corrupted mapping in tail page", 0);
1244
+ bad_page(page, "corrupted mapping in tail page");
10651245 goto out;
10661246 }
10671247 break;
10681248 }
10691249 if (unlikely(!PageTail(page))) {
1070
- bad_page(page, "PageTail not set", 0);
1250
+ bad_page(page, "PageTail not set");
10711251 goto out;
10721252 }
10731253 if (unlikely(compound_head(page) != head_page)) {
1074
- bad_page(page, "compound_head not consistent", 0);
1254
+ bad_page(page, "compound_head not consistent");
10751255 goto out;
10761256 }
10771257 ret = 0;
@@ -1081,25 +1261,48 @@
10811261 return ret;
10821262 }
10831263
1084
-static void kernel_init_free_pages(struct page *page, int numpages)
1264
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10851265 {
10861266 int i;
10871267
1268
+ if (zero_tags) {
1269
+ for (i = 0; i < numpages; i++)
1270
+ tag_clear_highpage(page + i);
1271
+ return;
1272
+ }
1273
+
10881274 /* s390's use of memset() could override KASAN redzones. */
10891275 kasan_disable_current();
1090
- for (i = 0; i < numpages; i++)
1276
+ for (i = 0; i < numpages; i++) {
1277
+ u8 tag = page_kasan_tag(page + i);
1278
+ page_kasan_tag_reset(page + i);
10911279 clear_highpage(page + i);
1280
+ page_kasan_tag_set(page + i, tag);
1281
+ }
10921282 kasan_enable_current();
10931283 }
10941284
10951285 static __always_inline bool free_pages_prepare(struct page *page,
1096
- unsigned int order, bool check_free)
1286
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10971287 {
10981288 int bad = 0;
1289
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10991290
11001291 VM_BUG_ON_PAGE(PageTail(page), page);
11011292
11021293 trace_mm_page_free(page, order);
1294
+
1295
+ if (unlikely(PageHWPoison(page)) && !order) {
1296
+ /*
1297
+ * Do not let hwpoison pages hit pcplists/buddy
1298
+ * Untie memcg state and reset page's owner
1299
+ */
1300
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1301
+ __memcg_kmem_uncharge_page(page, order);
1302
+ reset_page_owner(page, order);
1303
+ free_page_pinner(page, order);
1304
+ return false;
1305
+ }
11031306
11041307 /*
11051308 * Check tail pages before head page information is cleared to
@@ -1116,7 +1319,7 @@
11161319 for (i = 1; i < (1 << order); i++) {
11171320 if (compound)
11181321 bad += free_tail_pages_check(page, page + i);
1119
- if (unlikely(free_pages_check(page + i))) {
1322
+ if (unlikely(check_free_page(page + i))) {
11201323 bad++;
11211324 continue;
11221325 }
@@ -1126,15 +1329,16 @@
11261329 if (PageMappingFlags(page))
11271330 page->mapping = NULL;
11281331 if (memcg_kmem_enabled() && PageKmemcg(page))
1129
- memcg_kmem_uncharge(page, order);
1332
+ __memcg_kmem_uncharge_page(page, order);
11301333 if (check_free)
1131
- bad += free_pages_check(page);
1334
+ bad += check_free_page(page);
11321335 if (bad)
11331336 return false;
11341337
11351338 page_cpupid_reset_last(page);
11361339 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11371340 reset_page_owner(page, order);
1341
+ free_page_pinner(page, order);
11381342
11391343 if (!PageHighMem(page)) {
11401344 debug_check_no_locks_freed(page_address(page),
@@ -1142,36 +1346,77 @@
11421346 debug_check_no_obj_freed(page_address(page),
11431347 PAGE_SIZE << order);
11441348 }
1145
- arch_free_page(page, order);
1146
- if (want_init_on_free())
1147
- kernel_init_free_pages(page, 1 << order);
11481349
1149
- kernel_poison_pages(page, 1 << order, 0);
1150
- kernel_map_pages(page, 1 << order, 0);
1151
- kasan_free_nondeferred_pages(page, order);
1350
+ kernel_poison_pages(page, 1 << order);
1351
+
1352
+ /*
1353
+ * As memory initialization might be integrated into KASAN,
1354
+ * kasan_free_pages and kernel_init_free_pages must be
1355
+ * kept together to avoid discrepancies in behavior.
1356
+ *
1357
+ * With hardware tag-based KASAN, memory tags must be set before the
1358
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1359
+ */
1360
+ if (kasan_has_integrated_init()) {
1361
+ if (!skip_kasan_poison)
1362
+ kasan_free_pages(page, order);
1363
+ } else {
1364
+ bool init = want_init_on_free();
1365
+
1366
+ if (init)
1367
+ kernel_init_free_pages(page, 1 << order, false);
1368
+ if (!skip_kasan_poison)
1369
+ kasan_poison_pages(page, order, init);
1370
+ }
1371
+
1372
+ /*
1373
+ * arch_free_page() can make the page's contents inaccessible. s390
1374
+ * does this. So nothing which can access the page's contents should
1375
+ * happen after this.
1376
+ */
1377
+ arch_free_page(page, order);
1378
+
1379
+ debug_pagealloc_unmap_pages(page, 1 << order);
11521380
11531381 return true;
11541382 }
11551383
11561384 #ifdef CONFIG_DEBUG_VM
1157
-static inline bool free_pcp_prepare(struct page *page)
1158
-{
1159
- return free_pages_prepare(page, 0, true);
1160
-}
1161
-
1162
-static inline bool bulkfree_pcp_prepare(struct page *page)
1163
-{
1164
- return false;
1165
-}
1166
-#else
1385
+/*
1386
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1387
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1388
+ * moved from pcp lists to free lists.
1389
+ */
11671390 static bool free_pcp_prepare(struct page *page)
11681391 {
1169
- return free_pages_prepare(page, 0, false);
1392
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11701393 }
11711394
11721395 static bool bulkfree_pcp_prepare(struct page *page)
11731396 {
1174
- return free_pages_check(page);
1397
+ if (debug_pagealloc_enabled_static())
1398
+ return check_free_page(page);
1399
+ else
1400
+ return false;
1401
+}
1402
+#else
1403
+/*
1404
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1405
+ * moving from pcp lists to free list in order to reduce overhead. With
1406
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1407
+ * to the pcp lists.
1408
+ */
1409
+static bool free_pcp_prepare(struct page *page)
1410
+{
1411
+ if (debug_pagealloc_enabled_static())
1412
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1413
+ else
1414
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1415
+}
1416
+
1417
+static bool bulkfree_pcp_prepare(struct page *page)
1418
+{
1419
+ return check_free_page(page);
11751420 }
11761421 #endif /* CONFIG_DEBUG_VM */
11771422
@@ -1232,7 +1477,7 @@
12321477 mt = get_pageblock_migratetype(page);
12331478
12341479 list_del(&page->lru);
1235
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1480
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
12361481 trace_mm_page_pcpu_drain(page, 0, mt);
12371482 }
12381483 spin_unlock_irqrestore(&zone->lock, flags);
@@ -1240,7 +1485,6 @@
12401485
12411486 static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
12421487 struct list_head *dst)
1243
-
12441488 {
12451489 int migratetype = 0;
12461490 int batch_free = 0;
@@ -1302,14 +1546,14 @@
13021546 static void free_one_page(struct zone *zone,
13031547 struct page *page, unsigned long pfn,
13041548 unsigned int order,
1305
- int migratetype)
1549
+ int migratetype, fpi_t fpi_flags)
13061550 {
13071551 spin_lock(&zone->lock);
13081552 if (unlikely(has_isolate_pageblock(zone) ||
13091553 is_migrate_isolate(migratetype))) {
13101554 migratetype = get_pfnblock_migratetype(page, pfn);
13111555 }
1312
- __free_one_page(page, pfn, zone, order, migratetype);
1556
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
13131557 spin_unlock(&zone->lock);
13141558 }
13151559
@@ -1383,33 +1627,45 @@
13831627 /* Avoid false-positive PageTail() */
13841628 INIT_LIST_HEAD(&page->lru);
13851629
1386
- SetPageReserved(page);
1630
+ /*
1631
+ * no need for atomic set_bit because the struct
1632
+ * page is not visible yet so nobody should
1633
+ * access it yet.
1634
+ */
1635
+ __SetPageReserved(page);
13871636 }
13881637 }
13891638 }
13901639
1391
-static void __free_pages_ok(struct page *page, unsigned int order)
1640
+static void __free_pages_ok(struct page *page, unsigned int order,
1641
+ fpi_t fpi_flags)
13921642 {
13931643 unsigned long flags;
13941644 int migratetype;
13951645 unsigned long pfn = page_to_pfn(page);
13961646
1397
- if (!free_pages_prepare(page, order, true))
1647
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13981648 return;
13991649
14001650 migratetype = get_pfnblock_migratetype(page, pfn);
1401
- local_lock_irqsave(pa_lock, flags);
1651
+ local_lock_irqsave(&pa_lock.l, flags);
14021652 __count_vm_events(PGFREE, 1 << order);
1403
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1404
- local_unlock_irqrestore(pa_lock, flags);
1653
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1654
+ fpi_flags);
1655
+ local_unlock_irqrestore(&pa_lock.l, flags);
14051656 }
14061657
1407
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1658
+void __free_pages_core(struct page *page, unsigned int order)
14081659 {
14091660 unsigned int nr_pages = 1 << order;
14101661 struct page *p = page;
14111662 unsigned int loop;
14121663
1664
+ /*
1665
+ * When initializing the memmap, __init_single_page() sets the refcount
1666
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1667
+ * refcount of all involved pages to 0.
1668
+ */
14131669 prefetchw(p);
14141670 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
14151671 prefetchw(p + 1);
@@ -1419,15 +1675,43 @@
14191675 __ClearPageReserved(p);
14201676 set_page_count(p, 0);
14211677
1422
- page_zone(page)->managed_pages += nr_pages;
1423
- set_page_refcounted(page);
1424
- __free_pages(page, order);
1678
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1679
+
1680
+ /*
1681
+ * Bypass PCP and place fresh pages right to the tail, primarily
1682
+ * relevant for memory onlining.
1683
+ */
1684
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
14251685 }
14261686
1427
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1428
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1687
+#ifdef CONFIG_NEED_MULTIPLE_NODES
14291688
14301689 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1690
+
1691
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1692
+
1693
+/*
1694
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1695
+ */
1696
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1697
+ struct mminit_pfnnid_cache *state)
1698
+{
1699
+ unsigned long start_pfn, end_pfn;
1700
+ int nid;
1701
+
1702
+ if (state->last_start <= pfn && pfn < state->last_end)
1703
+ return state->last_nid;
1704
+
1705
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1706
+ if (nid != NUMA_NO_NODE) {
1707
+ state->last_start = start_pfn;
1708
+ state->last_end = end_pfn;
1709
+ state->last_nid = nid;
1710
+ }
1711
+
1712
+ return nid;
1713
+}
1714
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
14311715
14321716 int __meminit early_pfn_to_nid(unsigned long pfn)
14331717 {
@@ -1442,48 +1726,14 @@
14421726
14431727 return nid;
14441728 }
1445
-#endif
1729
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14461730
1447
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1448
-static inline bool __meminit __maybe_unused
1449
-meminit_pfn_in_nid(unsigned long pfn, int node,
1450
- struct mminit_pfnnid_cache *state)
1451
-{
1452
- int nid;
1453
-
1454
- nid = __early_pfn_to_nid(pfn, state);
1455
- if (nid >= 0 && nid != node)
1456
- return false;
1457
- return true;
1458
-}
1459
-
1460
-/* Only safe to use early in boot when initialisation is single-threaded */
1461
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1462
-{
1463
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1464
-}
1465
-
1466
-#else
1467
-
1468
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1469
-{
1470
- return true;
1471
-}
1472
-static inline bool __meminit __maybe_unused
1473
-meminit_pfn_in_nid(unsigned long pfn, int node,
1474
- struct mminit_pfnnid_cache *state)
1475
-{
1476
- return true;
1477
-}
1478
-#endif
1479
-
1480
-
1481
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1731
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14821732 unsigned int order)
14831733 {
14841734 if (early_page_uninitialised(pfn))
14851735 return;
1486
- return __free_pages_boot_core(page, order);
1736
+ __free_pages_core(page, order);
14871737 }
14881738
14891739 /*
....@@ -1574,14 +1824,14 @@
15741824 if (nr_pages == pageblock_nr_pages &&
15751825 (pfn & (pageblock_nr_pages - 1)) == 0) {
15761826 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1577
- __free_pages_boot_core(page, pageblock_order);
1827
+ __free_pages_core(page, pageblock_order);
15781828 return;
15791829 }
15801830
15811831 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15821832 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15831833 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1584
- __free_pages_boot_core(page, 0);
1834
+ __free_pages_core(page, 0);
15851835 }
15861836 }
15871837
@@ -1604,20 +1854,12 @@
16041854 *
16051855 * Then, we check if a current large page is valid by only checking the validity
16061856 * of the head pfn.
1607
- *
1608
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1609
- * within a node: a pfn is between start and end of a node, but does not belong
1610
- * to this memory node.
16111857 */
1612
-static inline bool __init
1613
-deferred_pfn_valid(int nid, unsigned long pfn,
1614
- struct mminit_pfnnid_cache *nid_init_state)
1858
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
16151859 {
16161860 if (!pfn_valid_within(pfn))
16171861 return false;
16181862 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1619
- return false;
1620
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
16211863 return false;
16221864 return true;
16231865 }
@@ -1626,21 +1868,19 @@
16261868 * Free pages to buddy allocator. Try to free aligned pages in
16271869 * pageblock_nr_pages sizes.
16281870 */
1629
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1871
+static void __init deferred_free_pages(unsigned long pfn,
16301872 unsigned long end_pfn)
16311873 {
1632
- struct mminit_pfnnid_cache nid_init_state = { };
16331874 unsigned long nr_pgmask = pageblock_nr_pages - 1;
16341875 unsigned long nr_free = 0;
16351876
16361877 for (; pfn < end_pfn; pfn++) {
1637
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1878
+ if (!deferred_pfn_valid(pfn)) {
16381879 deferred_free_range(pfn - nr_free, nr_free);
16391880 nr_free = 0;
16401881 } else if (!(pfn & nr_pgmask)) {
16411882 deferred_free_range(pfn - nr_free, nr_free);
16421883 nr_free = 1;
1643
- touch_nmi_watchdog();
16441884 } else {
16451885 nr_free++;
16461886 }
@@ -1654,22 +1894,22 @@
16541894 * by performing it only once every pageblock_nr_pages.
16551895 * Return number of pages initialized.
16561896 */
1657
-static unsigned long __init deferred_init_pages(int nid, int zid,
1897
+static unsigned long __init deferred_init_pages(struct zone *zone,
16581898 unsigned long pfn,
16591899 unsigned long end_pfn)
16601900 {
1661
- struct mminit_pfnnid_cache nid_init_state = { };
16621901 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1902
+ int nid = zone_to_nid(zone);
16631903 unsigned long nr_pages = 0;
1904
+ int zid = zone_idx(zone);
16641905 struct page *page = NULL;
16651906
16661907 for (; pfn < end_pfn; pfn++) {
1667
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1908
+ if (!deferred_pfn_valid(pfn)) {
16681909 page = NULL;
16691910 continue;
16701911 } else if (!page || !(pfn & nr_pgmask)) {
16711912 page = pfn_to_page(pfn);
1672
- touch_nmi_watchdog();
16731913 } else {
16741914 page++;
16751915 }
@@ -1679,18 +1919,127 @@
16791919 return (nr_pages);
16801920 }
16811921
1922
+/*
1923
+ * This function is meant to pre-load the iterator for the zone init.
1924
+ * Specifically it walks through the ranges until we are caught up to the
1925
+ * first_init_pfn value and exits there. If we never encounter the value we
1926
+ * return false indicating there are no valid ranges left.
1927
+ */
1928
+static bool __init
1929
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1930
+ unsigned long *spfn, unsigned long *epfn,
1931
+ unsigned long first_init_pfn)
1932
+{
1933
+ u64 j;
1934
+
1935
+ /*
1936
+ * Start out by walking through the ranges in this zone that have
1937
+ * already been initialized. We don't need to do anything with them
1938
+ * so we just need to flush them out of the system.
1939
+ */
1940
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1941
+ if (*epfn <= first_init_pfn)
1942
+ continue;
1943
+ if (*spfn < first_init_pfn)
1944
+ *spfn = first_init_pfn;
1945
+ *i = j;
1946
+ return true;
1947
+ }
1948
+
1949
+ return false;
1950
+}
1951
+
1952
+/*
1953
+ * Initialize and free pages. We do it in two loops: first we initialize
1954
+ * struct page, then free to buddy allocator, because while we are
1955
+ * freeing pages we can access pages that are ahead (computing buddy
1956
+ * page in __free_one_page()).
1957
+ *
1958
+ * In order to try and keep some memory in the cache we have the loop
1959
+ * broken along max page order boundaries. This way we will not cause
1960
+ * any issues with the buddy page computation.
1961
+ */
1962
+static unsigned long __init
1963
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1964
+ unsigned long *end_pfn)
1965
+{
1966
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1967
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1968
+ unsigned long nr_pages = 0;
1969
+ u64 j = *i;
1970
+
1971
+ /* First we loop through and initialize the page values */
1972
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1973
+ unsigned long t;
1974
+
1975
+ if (mo_pfn <= *start_pfn)
1976
+ break;
1977
+
1978
+ t = min(mo_pfn, *end_pfn);
1979
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1980
+
1981
+ if (mo_pfn < *end_pfn) {
1982
+ *start_pfn = mo_pfn;
1983
+ break;
1984
+ }
1985
+ }
1986
+
1987
+ /* Reset values and now loop through freeing pages as needed */
1988
+ swap(j, *i);
1989
+
1990
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1991
+ unsigned long t;
1992
+
1993
+ if (mo_pfn <= spfn)
1994
+ break;
1995
+
1996
+ t = min(mo_pfn, epfn);
1997
+ deferred_free_pages(spfn, t);
1998
+
1999
+ if (mo_pfn <= epfn)
2000
+ break;
2001
+ }
2002
+
2003
+ return nr_pages;
2004
+}
2005
+
2006
+static void __init
2007
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2008
+ void *arg)
2009
+{
2010
+ unsigned long spfn, epfn;
2011
+ struct zone *zone = arg;
2012
+ u64 i;
2013
+
2014
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2015
+
2016
+ /*
2017
+ * Initialize and free pages in MAX_ORDER sized increments so that we
2018
+ * can avoid introducing any issues with the buddy allocator.
2019
+ */
2020
+ while (spfn < end_pfn) {
2021
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
2022
+ cond_resched();
2023
+ }
2024
+}
2025
+
2026
+/* An arch may override for more concurrency. */
2027
+__weak int __init
2028
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2029
+{
2030
+ return 1;
2031
+}
2032
+
16822033 /* Initialise remaining memory on a node */
16832034 static int __init deferred_init_memmap(void *data)
16842035 {
16852036 pg_data_t *pgdat = data;
1686
- int nid = pgdat->node_id;
1687
- unsigned long start = jiffies;
1688
- unsigned long nr_pages = 0;
1689
- unsigned long spfn, epfn, first_init_pfn, flags;
1690
- phys_addr_t spa, epa;
1691
- int zid;
1692
- struct zone *zone;
16932037 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2038
+ unsigned long spfn = 0, epfn = 0;
2039
+ unsigned long first_init_pfn, flags;
2040
+ unsigned long start = jiffies;
2041
+ struct zone *zone;
2042
+ int zid, max_threads;
16942043 u64 i;
16952044
16962045 /* Bind memory initialisation thread to a local node if possible */
....@@ -1723,30 +2072,36 @@
17232072 if (first_init_pfn < zone_end_pfn(zone))
17242073 break;
17252074 }
1726
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
17272075
1728
- /*
1729
- * Initialize and free pages. We do it in two loops: first we initialize
1730
- * struct page, than free to buddy allocator, because while we are
1731
- * freeing pages we can access pages that are ahead (computing buddy
1732
- * page in __free_one_page()).
1733
- */
1734
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1735
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1736
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1737
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1738
- }
1739
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1740
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1741
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1742
- deferred_free_pages(nid, zid, spfn, epfn);
1743
- }
2076
+ /* If the zone is empty somebody else may have cleared out the zone */
2077
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2078
+ first_init_pfn))
2079
+ goto zone_empty;
17442080
2081
+ max_threads = deferred_page_init_max_threads(cpumask);
2082
+
2083
+ while (spfn < epfn) {
2084
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2085
+ struct padata_mt_job job = {
2086
+ .thread_fn = deferred_init_memmap_chunk,
2087
+ .fn_arg = zone,
2088
+ .start = spfn,
2089
+ .size = epfn_align - spfn,
2090
+ .align = PAGES_PER_SECTION,
2091
+ .min_chunk = PAGES_PER_SECTION,
2092
+ .max_threads = max_threads,
2093
+ };
2094
+
2095
+ padata_do_multithreaded(&job);
2096
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2097
+ epfn_align);
2098
+ }
2099
+zone_empty:
17452100 /* Sanity check that the next zone really is unpopulated */
17462101 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17472102
1748
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1749
- jiffies_to_msecs(jiffies - start));
2103
+ pr_info("node %d deferred pages initialised in %ums\n",
2104
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17502105
17512106 pgdat_init_report_one_done();
17522107 return 0;
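/*
 * Sketch of the chunking idea behind the padata_mt_job above: a PFN range is
 * carved into section-aligned slices and handed to up to max_threads workers.
 * Plain pthreads stand in for padata_do_multithreaded() purely for
 * illustration; TOY_SECTION, toy_chunk_fn and friends are invented names.
 */
#include <pthread.h>
#include <stdio.h>

#define TOY_SECTION 8UL                 /* stand-in for PAGES_PER_SECTION */

struct toy_chunk { unsigned long start, end; };

static void *toy_chunk_fn(void *arg)
{
    struct toy_chunk *c = arg;

    printf("worker inits pfns [%lu, %lu)\n", c->start, c->end);
    return NULL;
}

int main(void)
{
    unsigned long spfn = 5, epfn = 53;
    unsigned long aligned_start = spfn & ~(TOY_SECTION - 1);
    unsigned long aligned_end = (epfn + TOY_SECTION - 1) & ~(TOY_SECTION - 1);
    unsigned long nr_sections = (aligned_end - aligned_start) / TOY_SECTION;
    int max_threads = 4, nr = 0;
    unsigned long per_thread = (nr_sections + max_threads - 1) / max_threads;
    pthread_t tid[4];
    struct toy_chunk chunk[4];

    while (spfn < epfn && nr < max_threads) {
        /* each slice ends on a section boundary, like .align/.min_chunk */
        unsigned long cend = aligned_start +
                             (nr + 1) * per_thread * TOY_SECTION;

        chunk[nr].start = spfn;
        chunk[nr].end = cend < epfn ? cend : epfn;
        pthread_create(&tid[nr], NULL, toy_chunk_fn, &chunk[nr]);
        spfn = chunk[nr].end;
        nr++;
    }
    for (int i = 0; i < nr; i++)
        pthread_join(tid[i], NULL);
    return 0;
}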
....@@ -1770,14 +2125,11 @@
17702125 static noinline bool __init
17712126 deferred_grow_zone(struct zone *zone, unsigned int order)
17722127 {
1773
- int zid = zone_idx(zone);
1774
- int nid = zone_to_nid(zone);
1775
- pg_data_t *pgdat = NODE_DATA(nid);
17762128 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1777
- unsigned long nr_pages = 0;
1778
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2129
+ pg_data_t *pgdat = zone->zone_pgdat;
17792130 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1780
- phys_addr_t spa, epa;
2131
+ unsigned long spfn, epfn, flags;
2132
+ unsigned long nr_pages = 0;
17812133 u64 i;
17822134
17832135 /* Only the last zone may have deferred pages */
....@@ -1795,38 +2147,37 @@
17952147 return true;
17962148 }
17972149
1798
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1799
-
1800
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2150
+ /* If the zone is empty somebody else may have cleared out the zone */
2151
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2152
+ first_deferred_pfn)) {
2153
+ pgdat->first_deferred_pfn = ULONG_MAX;
18012154 pgdat_resize_unlock(pgdat, &flags);
1802
- return false;
2155
+ /* Retry only once. */
2156
+ return first_deferred_pfn != ULONG_MAX;
18032157 }
18042158
1805
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1806
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1807
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2159
+ /*
2160
+ * Initialize and free pages in MAX_ORDER sized increments so
2161
+ * that we can avoid introducing any issues with the buddy
2162
+ * allocator.
2163
+ */
2164
+ while (spfn < epfn) {
2165
+ /* update our first deferred PFN for this section */
2166
+ first_deferred_pfn = spfn;
18082167
1809
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1810
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1811
- first_deferred_pfn = min(t, epfn);
1812
- nr_pages += deferred_init_pages(nid, zid, spfn,
1813
- first_deferred_pfn);
1814
- spfn = first_deferred_pfn;
1815
- }
2168
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2169
+ touch_nmi_watchdog();
18162170
2171
+ /* We should only stop along section boundaries */
2172
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2173
+ continue;
2174
+
2175
+ /* If our quota has been met we can stop here */
18172176 if (nr_pages >= nr_pages_needed)
18182177 break;
18192178 }
18202179
1821
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1822
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1823
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1824
- deferred_free_pages(nid, zid, spfn, epfn);
1825
-
1826
- if (first_deferred_pfn == epfn)
1827
- break;
1828
- }
1829
- pgdat->first_deferred_pfn = first_deferred_pfn;
2180
+ pgdat->first_deferred_pfn = spfn;
18302181 pgdat_resize_unlock(pgdat, &flags);
18312182
18322183 return nr_pages > 0;
....@@ -1849,9 +2200,9 @@
18492200 void __init page_alloc_init_late(void)
18502201 {
18512202 struct zone *zone;
2203
+ int nid;
18522204
18532205 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1854
- int nid;
18552206
18562207 /* There will be num_node_state(N_MEMORY) threads */
18572208 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1879,10 +2230,12 @@
18792230 /* Reinit limits that are based on free pages after the kernel is up */
18802231 files_maxfiles_init();
18812232 #endif
1882
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2233
+
18832234 /* Discard memblock private memory */
18842235 memblock_discard();
1885
-#endif
2236
+
2237
+ for_each_node_state(nid, N_MEMORY)
2238
+ shuffle_free_memory(NODE_DATA(nid));
18862239
18872240 for_each_populated_zone(zone)
18882241 set_zone_contiguous(zone);
....@@ -1916,6 +2269,7 @@
19162269 }
19172270
19182271 adjust_managed_page_count(page, pageblock_nr_pages);
2272
+ page_zone(page)->cma_pages += pageblock_nr_pages;
19192273 }
19202274 #endif
19212275
....@@ -1934,13 +2288,11 @@
19342288 * -- nyc
19352289 */
19362290 static inline void expand(struct zone *zone, struct page *page,
1937
- int low, int high, struct free_area *area,
1938
- int migratetype)
2291
+ int low, int high, int migratetype)
19392292 {
19402293 unsigned long size = 1 << high;
19412294
19422295 while (high > low) {
1943
- area--;
19442296 high--;
19452297 size >>= 1;
19462298 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1954,39 +2306,21 @@
19542306 if (set_page_guard(zone, &page[size], high, migratetype))
19552307 continue;
19562308
1957
- list_add(&page[size].lru, &area->free_list[migratetype]);
1958
- area->nr_free++;
1959
- set_page_order(&page[size], high);
2309
+ add_to_free_list(&page[size], zone, high, migratetype);
2310
+ set_buddy_order(&page[size], high);
19602311 }
19612312 }
19622313
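/*
 * A small sketch of what expand() above does: when an order-`high` block is
 * taken from the free lists to satisfy an order-`low` request, the unused
 * upper halves are handed back one order at a time. toy_add_free() is an
 * invented placeholder for add_to_free_list()/set_buddy_order().
 */
#include <stdio.h>

static void toy_add_free(unsigned long base, unsigned int order)
{
    printf("return pfns [%lu, %lu) to the order-%u free list\n",
           base, base + (1UL << order), order);
}

static void toy_expand(unsigned long base, unsigned int low, unsigned int high)
{
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        /* the upper half goes back, the lower half keeps splitting */
        toy_add_free(base + size, high);
    }
    printf("pfns [%lu, %lu) satisfy the order-%u request\n",
           base, base + (1UL << low), low);
}

int main(void)
{
    toy_expand(0, 2, 5);    /* split an order-5 block for an order-2 need */
    return 0;
}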
19632314 static void check_new_page_bad(struct page *page)
19642315 {
1965
- const char *bad_reason = NULL;
1966
- unsigned long bad_flags = 0;
1967
-
1968
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1969
- bad_reason = "nonzero mapcount";
1970
- if (unlikely(page->mapping != NULL))
1971
- bad_reason = "non-NULL mapping";
1972
- if (unlikely(page_ref_count(page) != 0))
1973
- bad_reason = "nonzero _count";
19742316 if (unlikely(page->flags & __PG_HWPOISON)) {
1975
- bad_reason = "HWPoisoned (hardware-corrupted)";
1976
- bad_flags = __PG_HWPOISON;
19772317 /* Don't complain about hwpoisoned pages */
19782318 page_mapcount_reset(page); /* remove PageBuddy */
19792319 return;
19802320 }
1981
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1982
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1983
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1984
- }
1985
-#ifdef CONFIG_MEMCG
1986
- if (unlikely(page->mem_cgroup))
1987
- bad_reason = "page still charged to cgroup";
1988
-#endif
1989
- bad_page(page, bad_reason, bad_flags);
2321
+
2322
+ bad_page(page,
2323
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19902324 }
19912325
19922326 /*
....@@ -2002,30 +2336,40 @@
20022336 return 1;
20032337 }
20042338
2005
-static inline bool free_pages_prezeroed(void)
2006
-{
2007
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
2008
- page_poisoning_enabled()) || want_init_on_free();
2009
-}
2010
-
20112339 #ifdef CONFIG_DEBUG_VM
2012
-static bool check_pcp_refill(struct page *page)
2340
+/*
2341
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2342
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2343
+ * also checked when pcp lists are refilled from the free lists.
2344
+ */
2345
+static inline bool check_pcp_refill(struct page *page)
20132346 {
2014
- return false;
2347
+ if (debug_pagealloc_enabled_static())
2348
+ return check_new_page(page);
2349
+ else
2350
+ return false;
20152351 }
20162352
2017
-static bool check_new_pcp(struct page *page)
2353
+static inline bool check_new_pcp(struct page *page)
20182354 {
20192355 return check_new_page(page);
20202356 }
20212357 #else
2022
-static bool check_pcp_refill(struct page *page)
2358
+/*
2359
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2360
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2361
+ * enabled, they are also checked when being allocated from the pcp lists.
2362
+ */
2363
+static inline bool check_pcp_refill(struct page *page)
20232364 {
20242365 return check_new_page(page);
20252366 }
2026
-static bool check_new_pcp(struct page *page)
2367
+static inline bool check_new_pcp(struct page *page)
20272368 {
2028
- return false;
2369
+ if (debug_pagealloc_enabled_static())
2370
+ return check_new_page(page);
2371
+ else
2372
+ return false;
20292373 }
20302374 #endif /* CONFIG_DEBUG_VM */
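/*
 * The two #ifdef branches above form a small matrix; this throwaway sketch
 * only restates when an order-0 page gets checked, with DEBUG_VM and
 * debug_pagealloc as plain booleans (the toy_* names are invented).
 */
#include <stdbool.h>
#include <stdio.h>

/* checked while refilling a pcp list from the free lists? */
static bool toy_checked_on_refill(bool debug_vm, bool debug_pagealloc)
{
    return debug_vm ? debug_pagealloc : true;
}

/* checked while allocating from a pcp list? */
static bool toy_checked_on_alloc(bool debug_vm, bool debug_pagealloc)
{
    return debug_vm ? true : debug_pagealloc;
}

int main(void)
{
    for (int vm = 0; vm <= 1; vm++)
        for (int dpa = 0; dpa <= 1; dpa++)
            printf("DEBUG_VM=%d debug_pagealloc=%d: refill=%d alloc=%d\n",
                   vm, dpa,
                   toy_checked_on_refill(vm, dpa),
                   toy_checked_on_alloc(vm, dpa));
    return 0;
}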
20312375
....@@ -2049,9 +2393,31 @@
20492393 set_page_refcounted(page);
20502394
20512395 arch_alloc_page(page, order);
2052
- kernel_map_pages(page, 1 << order, 1);
2053
- kasan_alloc_pages(page, order);
2054
- kernel_poison_pages(page, 1 << order, 1);
2396
+ debug_pagealloc_map_pages(page, 1 << order);
2397
+
2398
+ /*
2399
+ * Page unpoisoning must happen before memory initialization.
2400
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2401
+ * allocations and the page unpoisoning code will complain.
2402
+ */
2403
+ kernel_unpoison_pages(page, 1 << order);
2404
+
2405
+ /*
2406
+ * As memory initialization might be integrated into KASAN,
2407
+ * kasan_alloc_pages and kernel_init_free_pages must be
2408
+ * kept together to avoid discrepancies in behavior.
2409
+ */
2410
+ if (kasan_has_integrated_init()) {
2411
+ kasan_alloc_pages(page, order, gfp_flags);
2412
+ } else {
2413
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2414
+
2415
+ kasan_unpoison_pages(page, order, init);
2416
+ if (init)
2417
+ kernel_init_free_pages(page, 1 << order,
2418
+ gfp_flags & __GFP_ZEROTAGS);
2419
+ }
2420
+
20552421 set_page_owner(page, order, gfp_flags);
20562422 }
20572423
....@@ -2059,9 +2425,6 @@
20592425 unsigned int alloc_flags)
20602426 {
20612427 post_alloc_hook(page, order, gfp_flags);
2062
-
2063
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2064
- kernel_init_free_pages(page, 1 << order);
20652428
20662429 if (order && (gfp_flags & __GFP_COMP))
20672430 prep_compound_page(page, order);
....@@ -2076,6 +2439,7 @@
20762439 set_page_pfmemalloc(page);
20772440 else
20782441 clear_page_pfmemalloc(page);
2442
+ trace_android_vh_test_clear_look_around_ref(page);
20792443 }
20802444
20812445 /*
....@@ -2093,14 +2457,11 @@
20932457 /* Find a page of the appropriate size in the preferred list */
20942458 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20952459 area = &(zone->free_area[current_order]);
2096
- page = list_first_entry_or_null(&area->free_list[migratetype],
2097
- struct page, lru);
2460
+ page = get_page_from_free_area(area, migratetype);
20982461 if (!page)
20992462 continue;
2100
- list_del(&page->lru);
2101
- rmv_page_order(page);
2102
- area->nr_free--;
2103
- expand(zone, page, order, current_order, area, migratetype);
2463
+ del_page_from_free_list(page, zone, current_order);
2464
+ expand(zone, page, order, current_order, migratetype);
21042465 set_pcppage_migratetype(page, migratetype);
21052466 return page;
21062467 }
....@@ -2113,10 +2474,10 @@
21132474 * This array describes the order lists are fallen back to when
21142475 * the free lists for the desirable migrate type are depleted
21152476 */
2116
-static int fallbacks[MIGRATE_TYPES][4] = {
2477
+static int fallbacks[MIGRATE_TYPES][3] = {
21172478 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2118
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
21192479 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2480
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
21202481 #ifdef CONFIG_CMA
21212482 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
21222483 #endif
....@@ -2137,7 +2498,7 @@
21372498 #endif
21382499
21392500 /*
2140
- * Move the free pages in a range to the free lists of the requested type.
2501
+ * Move the free pages in a range to the freelist tail of the requested type.
21412502 * Note that start_page and end_pages are not aligned on a pageblock
21422503 * boundary. If alignment is required, use move_freepages_block()
21432504 */
....@@ -2149,30 +2510,11 @@
21492510 unsigned int order;
21502511 int pages_moved = 0;
21512512
2152
-#ifndef CONFIG_HOLES_IN_ZONE
2153
- /*
2154
- * page_zone is not safe to call in this context when
2155
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2156
- * anyway as we check zone boundaries in move_freepages_block().
2157
- * Remove at a later date when no bug reports exist related to
2158
- * grouping pages by mobility
2159
- */
2160
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2161
- pfn_valid(page_to_pfn(end_page)) &&
2162
- page_zone(start_page) != page_zone(end_page));
2163
-#endif
2164
-
2165
- if (num_movable)
2166
- *num_movable = 0;
2167
-
21682513 for (page = start_page; page <= end_page;) {
21692514 if (!pfn_valid_within(page_to_pfn(page))) {
21702515 page++;
21712516 continue;
21722517 }
2173
-
2174
- /* Make sure we are not inadvertently changing nodes */
2175
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21762518
21772519 if (!PageBuddy(page)) {
21782520 /*
....@@ -2188,9 +2530,12 @@
21882530 continue;
21892531 }
21902532
2191
- order = page_order(page);
2192
- list_move(&page->lru,
2193
- &zone->free_area[order].free_list[migratetype]);
2533
+ /* Make sure we are not inadvertently changing nodes */
2534
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2535
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2536
+
2537
+ order = buddy_order(page);
2538
+ move_to_free_list(page, zone, order, migratetype);
21942539 page += 1 << order;
21952540 pages_moved += 1 << order;
21962541 }
....@@ -2203,6 +2548,9 @@
22032548 {
22042549 unsigned long start_pfn, end_pfn;
22052550 struct page *start_page, *end_page;
2551
+
2552
+ if (num_movable)
2553
+ *num_movable = 0;
22062554
22072555 start_pfn = page_to_pfn(page);
22082556 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2264,6 +2612,43 @@
22642612 return false;
22652613 }
22662614
2615
+static inline bool boost_watermark(struct zone *zone)
2616
+{
2617
+ unsigned long max_boost;
2618
+
2619
+ if (!watermark_boost_factor)
2620
+ return false;
2621
+ /*
2622
+ * Don't bother in zones that are unlikely to produce results.
2623
+ * On small machines, including kdump capture kernels running
2624
+ * in a small area, boosting the watermark can cause an out of
2625
+ * memory situation immediately.
2626
+ */
2627
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2628
+ return false;
2629
+
2630
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2631
+ watermark_boost_factor, 10000);
2632
+
2633
+ /*
2634
+ * high watermark may be uninitialised if fragmentation occurs
2635
+ * very early in boot so do not boost. We do not fall
2636
+ * through and boost by pageblock_nr_pages as failing
2637
+ * allocations that early means that reclaim is not going
2638
+ * to help and it may even be impossible to reclaim the
2639
+ * boosted watermark resulting in a hang.
2640
+ */
2641
+ if (!max_boost)
2642
+ return false;
2643
+
2644
+ max_boost = max(pageblock_nr_pages, max_boost);
2645
+
2646
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2647
+ max_boost);
2648
+
2649
+ return true;
2650
+}
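/*
 * The arithmetic in boost_watermark() above, pulled out into a standalone
 * sketch (the small-zone early return is omitted): the boost grows by one
 * pageblock per fallback event and is capped at boost_factor/10000 of the
 * high watermark, but never less than one pageblock once boosting applies.
 * The toy_* names and sample numbers are invented.
 */
#include <stdio.h>

#define TOY_PAGEBLOCK_NR_PAGES 512UL

static unsigned long toy_boost(unsigned long cur_boost, unsigned long wmark_high,
                               unsigned long boost_factor)
{
    unsigned long max_boost;

    if (!boost_factor)
        return cur_boost;

    /* mult_frac(wmark_high, boost_factor, 10000), ignoring overflow care */
    max_boost = wmark_high * boost_factor / 10000;
    if (!max_boost)
        return cur_boost;       /* high watermark not initialised yet */

    if (max_boost < TOY_PAGEBLOCK_NR_PAGES)
        max_boost = TOY_PAGEBLOCK_NR_PAGES;

    cur_boost += TOY_PAGEBLOCK_NR_PAGES;
    return cur_boost < max_boost ? cur_boost : max_boost;
}

int main(void)
{
    unsigned long boost = 0;

    /* a few fallback events against e.g. a high watermark of 20000 pages */
    for (int i = 0; i < 5; i++) {
        boost = toy_boost(boost, 20000, 15000);
        printf("boost after event %d: %lu pages\n", i + 1, boost);
    }
    return 0;
}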
2651
+
22672652 /*
22682653 * This function implements actual steal behaviour. If order is large enough,
22692654 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2273,10 +2658,9 @@
22732658 * itself, so pages freed in the future will be put on the correct free list.
22742659 */
22752660 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2276
- int start_type, bool whole_block)
2661
+ unsigned int alloc_flags, int start_type, bool whole_block)
22772662 {
2278
- unsigned int current_order = page_order(page);
2279
- struct free_area *area;
2663
+ unsigned int current_order = buddy_order(page);
22802664 int free_pages, movable_pages, alike_pages;
22812665 int old_block_type;
22822666
....@@ -2294,6 +2678,14 @@
22942678 change_pageblock_range(page, current_order, start_type);
22952679 goto single_page;
22962680 }
2681
+
2682
+ /*
2683
+ * Boost watermarks to increase reclaim pressure to reduce the
2684
+ * likelihood of future fallbacks. Wake kswapd now as the node
2685
+ * may be balanced overall and kswapd will not wake naturally.
2686
+ */
2687
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2688
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22972689
22982690 /* We are not allowed to try stealing from the whole block */
22992691 if (!whole_block)
....@@ -2338,8 +2730,7 @@
23382730 return;
23392731
23402732 single_page:
2341
- area = &zone->free_area[current_order];
2342
- list_move(&page->lru, &area->free_list[start_type]);
2733
+ move_to_free_list(page, zone, current_order, start_type);
23432734 }
23442735
23452736 /*
....@@ -2363,7 +2754,7 @@
23632754 if (fallback_mt == MIGRATE_TYPES)
23642755 break;
23652756
2366
- if (list_empty(&area->free_list[fallback_mt]))
2757
+ if (free_area_empty(area, fallback_mt))
23672758 continue;
23682759
23692760 if (can_steal_fallback(order, migratetype))
....@@ -2393,7 +2784,7 @@
23932784 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23942785 * Check is race-prone but harmless.
23952786 */
2396
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2787
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23972788 if (zone->nr_reserved_highatomic >= max_managed)
23982789 return;
23992790
....@@ -2436,7 +2827,7 @@
24362827 int order;
24372828 bool ret;
24382829
2439
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2830
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24402831 ac->nodemask) {
24412832 /*
24422833 * Preserve at least one pageblock unless memory pressure
....@@ -2450,9 +2841,7 @@
24502841 for (order = 0; order < MAX_ORDER; order++) {
24512842 struct free_area *area = &(zone->free_area[order]);
24522843
2453
- page = list_first_entry_or_null(
2454
- &area->free_list[MIGRATE_HIGHATOMIC],
2455
- struct page, lru);
2844
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24562845 if (!page)
24572846 continue;
24582847
....@@ -2510,20 +2899,30 @@
25102899 * condition simpler.
25112900 */
25122901 static __always_inline bool
2513
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2902
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2903
+ unsigned int alloc_flags)
25142904 {
25152905 struct free_area *area;
25162906 int current_order;
2907
+ int min_order = order;
25172908 struct page *page;
25182909 int fallback_mt;
25192910 bool can_steal;
2911
+
2912
+ /*
2913
+ * Do not steal pages from freelists belonging to other pageblocks
2914
+ * i.e. orders < pageblock_order. If there are no local zones free,
2915
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2916
+ */
2917
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2918
+ min_order = pageblock_order;
25202919
25212920 /*
25222921 * Find the largest available free page in the other list. This roughly
25232922 * approximates finding the pageblock with the most free pages, which
25242923 * would be too costly to do exactly.
25252924 */
2526
- for (current_order = MAX_ORDER - 1; current_order >= order;
2925
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
25272926 --current_order) {
25282927 area = &(zone->free_area[current_order]);
25292928 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2565,10 +2964,10 @@
25652964 VM_BUG_ON(current_order == MAX_ORDER);
25662965
25672966 do_steal:
2568
- page = list_first_entry(&area->free_list[fallback_mt],
2569
- struct page, lru);
2967
+ page = get_page_from_free_area(area, fallback_mt);
25702968
2571
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2969
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2970
+ can_steal);
25722971
25732972 trace_mm_page_alloc_extfrag(page, order, current_order,
25742973 start_migratetype, fallback_mt);
....@@ -2582,14 +2981,16 @@
25822981 * Call me with the zone->lock already held.
25832982 */
25842983 static __always_inline struct page *
2585
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2984
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2985
+ unsigned int alloc_flags)
25862986 {
25872987 struct page *page;
25882988
25892989 retry:
25902990 page = __rmqueue_smallest(zone, order, migratetype);
25912991
2592
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2992
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2993
+ alloc_flags))
25932994 goto retry;
25942995
25952996 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2597,18 +2998,18 @@
25972998 }
25982999
25993000 #ifdef CONFIG_CMA
2600
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3001
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3002
+ int migratetype,
3003
+ unsigned int alloc_flags)
26013004 {
2602
- struct page *page = 0;
2603
-
2604
- if (IS_ENABLED(CONFIG_CMA))
2605
- if (!zone->cma_alloc)
2606
- page = __rmqueue_cma_fallback(zone, order);
3005
+ struct page *page = __rmqueue_cma_fallback(zone, order);
26073006 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
26083007 return page;
26093008 }
26103009 #else
2611
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3010
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3011
+ int migratetype,
3012
+ unsigned int alloc_flags)
26123013 {
26133014 return NULL;
26143015 }
....@@ -2621,7 +3022,7 @@
26213022 */
26223023 static int rmqueue_bulk(struct zone *zone, unsigned int order,
26233024 unsigned long count, struct list_head *list,
2624
- int migratetype)
3025
+ int migratetype, unsigned int alloc_flags)
26253026 {
26263027 int i, alloced = 0;
26273028
....@@ -2629,15 +3030,11 @@
26293030 for (i = 0; i < count; ++i) {
26303031 struct page *page;
26313032
2632
- /*
2633
- * If migrate type CMA is being requested only try to
2634
- * satisfy the request with CMA pages to try and increase
2635
- * CMA utlization.
2636
- */
26373033 if (is_migrate_cma(migratetype))
2638
- page = __rmqueue_cma(zone, order);
3034
+ page = __rmqueue_cma(zone, order, migratetype,
3035
+ alloc_flags);
26393036 else
2640
- page = __rmqueue(zone, order, migratetype);
3037
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26413038
26423039 if (unlikely(page == NULL))
26433040 break;
....@@ -2680,14 +3077,14 @@
26803077 */
26813078 static struct list_head *get_populated_pcp_list(struct zone *zone,
26823079 unsigned int order, struct per_cpu_pages *pcp,
2683
- int migratetype)
3080
+ int migratetype, unsigned int alloc_flags)
26843081 {
26853082 struct list_head *list = &pcp->lists[migratetype];
26863083
26873084 if (list_empty(list)) {
26883085 pcp->count += rmqueue_bulk(zone, order,
26893086 pcp->batch, list,
2690
- migratetype);
3087
+ migratetype, alloc_flags);
26913088
26923089 if (list_empty(list))
26933090 list = NULL;
....@@ -2710,13 +3107,13 @@
27103107 int to_drain, batch;
27113108 LIST_HEAD(dst);
27123109
2713
- local_lock_irqsave(pa_lock, flags);
3110
+ local_lock_irqsave(&pa_lock.l, flags);
27143111 batch = READ_ONCE(pcp->batch);
27153112 to_drain = min(pcp->count, batch);
27163113 if (to_drain > 0)
27173114 isolate_pcp_pages(to_drain, pcp, &dst);
27183115
2719
- local_unlock_irqrestore(pa_lock, flags);
3116
+ local_unlock_irqrestore(&pa_lock.l, flags);
27203117
27213118 if (to_drain > 0)
27223119 free_pcppages_bulk(zone, &dst, false);
....@@ -2738,7 +3135,7 @@
27383135 LIST_HEAD(dst);
27393136 int count;
27403137
2741
- cpu_lock_irqsave(cpu, flags);
3138
+ local_lock_irqsave(&pa_lock.l, flags);
27423139 pset = per_cpu_ptr(zone->pageset, cpu);
27433140
27443141 pcp = &pset->pcp;
....@@ -2746,7 +3143,7 @@
27463143 if (count)
27473144 isolate_pcp_pages(count, pcp, &dst);
27483145
2749
- cpu_unlock_irqrestore(cpu, flags);
3146
+ local_unlock_irqrestore(&pa_lock.l, flags);
27503147
27513148 if (count)
27523149 free_pcppages_bulk(zone, &dst, false);
....@@ -2784,9 +3181,12 @@
27843181 drain_pages(cpu);
27853182 }
27863183
2787
-#ifndef CONFIG_PREEMPT_RT_BASE
27883184 static void drain_local_pages_wq(struct work_struct *work)
27893185 {
3186
+ struct pcpu_drain *drain;
3187
+
3188
+ drain = container_of(work, struct pcpu_drain, work);
3189
+
27903190 /*
27913191 * drain_all_pages doesn't use proper cpu hotplug protection so
27923192 * we can race with cpu offline when the WQ can move this from
....@@ -2794,11 +3194,10 @@
27943194 * cpu which is allright but we also have to make sure to not move to
27953195 * a different one.
27963196 */
2797
- preempt_disable();
2798
- drain_local_pages(NULL);
2799
- preempt_enable();
3197
+ migrate_disable();
3198
+ drain_local_pages(drain->zone);
3199
+ migrate_enable();
28003200 }
2801
-#endif
28023201
28033202 /*
28043203 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
....@@ -2865,22 +3264,16 @@
28653264 else
28663265 cpumask_clear_cpu(cpu, &cpus_with_pcps);
28673266 }
2868
-#ifdef CONFIG_PREEMPT_RT_BASE
3267
+
28693268 for_each_cpu(cpu, &cpus_with_pcps) {
2870
- if (zone)
2871
- drain_pages_zone(cpu, zone);
2872
- else
2873
- drain_pages(cpu);
2874
- }
2875
-#else
2876
- for_each_cpu(cpu, &cpus_with_pcps) {
2877
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2878
- INIT_WORK(work, drain_local_pages_wq);
2879
- queue_work_on(cpu, mm_percpu_wq, work);
3269
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3270
+
3271
+ drain->zone = zone;
3272
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3273
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28803274 }
28813275 for_each_cpu(cpu, &cpus_with_pcps)
2882
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2883
-#endif
3276
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28843277
28853278 mutex_unlock(&pcpu_drain_mutex);
28863279 }
....@@ -2958,6 +3351,7 @@
29583351 struct zone *zone = page_zone(page);
29593352 struct per_cpu_pages *pcp;
29603353 int migratetype;
3354
+ bool pcp_skip_cma_pages = false;
29613355
29623356 migratetype = get_pcppage_migratetype(page);
29633357 __count_vm_event(PGFREE);
....@@ -2970,8 +3364,12 @@
29703364 * excessively into the page allocator
29713365 */
29723366 if (migratetype >= MIGRATE_PCPTYPES) {
2973
- if (unlikely(is_migrate_isolate(migratetype))) {
2974
- free_one_page(zone, page, pfn, 0, migratetype);
3367
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3368
+ &pcp_skip_cma_pages);
3369
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3370
+ pcp_skip_cma_pages) {
3371
+ free_one_page(zone, page, pfn, 0, migratetype,
3372
+ FPI_NONE);
29753373 return;
29763374 }
29773375 migratetype = MIGRATE_MOVABLE;
....@@ -3000,9 +3398,9 @@
30003398 if (!free_unref_page_prepare(page, pfn))
30013399 return;
30023400
3003
- local_lock_irqsave(pa_lock, flags);
3401
+ local_lock_irqsave(&pa_lock.l, flags);
30043402 free_unref_page_commit(page, pfn, &dst);
3005
- local_unlock_irqrestore(pa_lock, flags);
3403
+ local_unlock_irqrestore(&pa_lock.l, flags);
30063404 if (!list_empty(&dst))
30073405 free_pcppages_bulk(zone, &dst, false);
30083406 }
....@@ -3029,7 +3427,7 @@
30293427 set_page_private(page, pfn);
30303428 }
30313429
3032
- local_lock_irqsave(pa_lock, flags);
3430
+ local_lock_irqsave(&pa_lock.l, flags);
30333431 list_for_each_entry_safe(page, next, list, lru) {
30343432 unsigned long pfn = page_private(page);
30353433 enum zone_type type;
....@@ -3044,12 +3442,12 @@
30443442 * a large list of pages to free.
30453443 */
30463444 if (++batch_count == SWAP_CLUSTER_MAX) {
3047
- local_unlock_irqrestore(pa_lock, flags);
3445
+ local_unlock_irqrestore(&pa_lock.l, flags);
30483446 batch_count = 0;
3049
- local_lock_irqsave(pa_lock, flags);
3447
+ local_lock_irqsave(&pa_lock.l, flags);
30503448 }
30513449 }
3052
- local_unlock_irqrestore(pa_lock, flags);
3450
+ local_unlock_irqrestore(&pa_lock.l, flags);
30533451
30543452 for (i = 0; i < __MAX_NR_ZONES; ) {
30553453 struct page *page;
....@@ -3084,7 +3482,8 @@
30843482
30853483 for (i = 1; i < (1 << order); i++)
30863484 set_page_refcounted(page + i);
3087
- split_page_owner(page, order);
3485
+ split_page_owner(page, 1 << order);
3486
+ split_page_memcg(page, 1 << order);
30883487 }
30893488 EXPORT_SYMBOL_GPL(split_page);
30903489
....@@ -3106,7 +3505,7 @@
31063505 * watermark, because we already know our high-order page
31073506 * exists.
31083507 */
3109
- watermark = min_wmark_pages(zone) + (1UL << order);
3508
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
31103509 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
31113510 return 0;
31123511
....@@ -3114,9 +3513,8 @@
31143513 }
31153514
31163515 /* Remove page from free list */
3117
- list_del(&page->lru);
3118
- zone->free_area[order].nr_free--;
3119
- rmv_page_order(page);
3516
+
3517
+ del_page_from_free_list(page, zone, order);
31203518
31213519 /*
31223520 * Set the pageblock if the isolated page is at least half of a
....@@ -3135,6 +3533,27 @@
31353533
31363534
31373535 return 1UL << order;
3536
+}
3537
+
3538
+/**
3539
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3540
+ * @page: Page that was isolated
3541
+ * @order: Order of the isolated page
3542
+ * @mt: The page's pageblock's migratetype
3543
+ *
3544
+ * This function is meant to return a page pulled from the free lists via
3545
+ * __isolate_free_page back to the free lists they were pulled from.
3546
+ */
3547
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3548
+{
3549
+ struct zone *zone = page_zone(page);
3550
+
3551
+ /* zone lock should be held when this function is called */
3552
+ lockdep_assert_held(&zone->lock);
3553
+
3554
+ /* Return isolated page to tail of freelist. */
3555
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3556
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
31383557 }
31393558
31403559 /*
....@@ -3166,6 +3585,7 @@
31663585
31673586 /* Remove page from the per-cpu list, caller must protect the list */
31683587 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3588
+ unsigned int alloc_flags,
31693589 struct per_cpu_pages *pcp,
31703590 gfp_t gfp_flags)
31713591 {
....@@ -3175,9 +3595,9 @@
31753595 do {
31763596 /* First try to get CMA pages */
31773597 if (migratetype == MIGRATE_MOVABLE &&
3178
- gfp_flags & __GFP_CMA) {
3598
+ alloc_flags & ALLOC_CMA) {
31793599 list = get_populated_pcp_list(zone, 0, pcp,
3180
- get_cma_migrate_type());
3600
+ get_cma_migrate_type(), alloc_flags);
31813601 }
31823602
31833603 if (list == NULL) {
....@@ -3186,7 +3606,7 @@
31863606 * free CMA pages.
31873607 */
31883608 list = get_populated_pcp_list(zone, 0, pcp,
3189
- migratetype);
3609
+ migratetype, alloc_flags);
31903610 if (unlikely(list == NULL) ||
31913611 unlikely(list_empty(list)))
31923612 return NULL;
....@@ -3202,22 +3622,22 @@
32023622
32033623 /* Lock and remove page from the per-cpu list */
32043624 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3205
- struct zone *zone, unsigned int order,
3206
- gfp_t gfp_flags, int migratetype)
3625
+ struct zone *zone, gfp_t gfp_flags,
3626
+ int migratetype, unsigned int alloc_flags)
32073627 {
32083628 struct per_cpu_pages *pcp;
32093629 struct page *page;
32103630 unsigned long flags;
32113631
3212
- local_lock_irqsave(pa_lock, flags);
3632
+ local_lock_irqsave(&pa_lock.l, flags);
32133633 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3214
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3634
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
32153635 gfp_flags);
32163636 if (page) {
3217
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3637
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
32183638 zone_statistics(preferred_zone, zone);
32193639 }
3220
- local_unlock_irqrestore(pa_lock, flags);
3640
+ local_unlock_irqrestore(&pa_lock.l, flags);
32213641 return page;
32223642 }
32233643
....@@ -3234,8 +3654,8 @@
32343654 struct page *page;
32353655
32363656 if (likely(order == 0)) {
3237
- page = rmqueue_pcplist(preferred_zone, zone, order,
3238
- gfp_flags, migratetype);
3657
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3658
+ migratetype, alloc_flags);
32393659 goto out;
32403660 }
32413661
....@@ -3244,25 +3664,32 @@
32443664 * allocate greater than order-1 page units with __GFP_NOFAIL.
32453665 */
32463666 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3247
- local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
3667
+ local_lock_irqsave(&pa_lock.l, flags);
3668
+ spin_lock(&zone->lock);
32483669
32493670 do {
32503671 page = NULL;
3251
-
3252
- if (alloc_flags & ALLOC_HARDER) {
3672
+ /*
3673
+ * order-0 request can reach here when the pcplist is skipped
3674
+ * due to non-CMA allocation context. HIGHATOMIC area is
3675
+ * reserved for high-order atomic allocation, so order-0
3676
+ * request should skip it.
3677
+ */
3678
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
32533679 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
32543680 if (page)
32553681 trace_mm_page_alloc_zone_locked(page, order, migratetype);
32563682 }
3257
-
3258
- if (!page && migratetype == MIGRATE_MOVABLE &&
3259
- gfp_flags & __GFP_CMA)
3260
- page = __rmqueue_cma(zone, order);
3261
-
3262
- if (!page)
3263
- page = __rmqueue(zone, order, migratetype);
3683
+ if (!page) {
3684
+ if (migratetype == MIGRATE_MOVABLE &&
3685
+ alloc_flags & ALLOC_CMA)
3686
+ page = __rmqueue_cma(zone, order, migratetype,
3687
+ alloc_flags);
3688
+ if (!page)
3689
+ page = __rmqueue(zone, order, migratetype,
3690
+ alloc_flags);
3691
+ }
32643692 } while (page && check_new_pages(page, order));
3265
-
32663693 spin_unlock(&zone->lock);
32673694 if (!page)
32683695 goto failed;
....@@ -3271,14 +3698,22 @@
32713698
32723699 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
32733700 zone_statistics(preferred_zone, zone);
3274
- local_unlock_irqrestore(pa_lock, flags);
3701
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3702
+ gfp_flags, alloc_flags, migratetype);
3703
+ local_unlock_irqrestore(&pa_lock.l, flags);
32753704
32763705 out:
3706
+ /* Separate test+clear to avoid unnecessary atomics */
3707
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3708
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3709
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3710
+ }
3711
+
32773712 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
32783713 return page;
32793714
32803715 failed:
3281
- local_unlock_irqrestore(pa_lock, flags);
3716
+ local_unlock_irqrestore(&pa_lock.l, flags);
32823717 return NULL;
32833718 }
32843719
....@@ -3303,7 +3738,7 @@
33033738 }
33043739 __setup("fail_page_alloc=", setup_fail_page_alloc);
33053740
3306
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
33073742 {
33083743 if (order < fail_page_alloc.min_order)
33093744 return false;
....@@ -3327,24 +3762,14 @@
33273762
33283763 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
33293764 &fail_page_alloc.attr);
3330
- if (IS_ERR(dir))
3331
- return PTR_ERR(dir);
33323765
3333
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3334
- &fail_page_alloc.ignore_gfp_reclaim))
3335
- goto fail;
3336
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3337
- &fail_page_alloc.ignore_gfp_highmem))
3338
- goto fail;
3339
- if (!debugfs_create_u32("min-order", mode, dir,
3340
- &fail_page_alloc.min_order))
3341
- goto fail;
3766
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3767
+ &fail_page_alloc.ignore_gfp_reclaim);
3768
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3769
+ &fail_page_alloc.ignore_gfp_highmem);
3770
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
33423771
33433772 return 0;
3344
-fail:
3345
- debugfs_remove_recursive(dir);
3346
-
3347
- return -ENOMEM;
33483773 }
33493774
33503775 late_initcall(fail_page_alloc_debugfs);
....@@ -3353,12 +3778,41 @@
33533778
33543779 #else /* CONFIG_FAIL_PAGE_ALLOC */
33553780
3356
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3781
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
33573782 {
33583783 return false;
33593784 }
33603785
33613786 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3787
+
3788
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3789
+{
3790
+ return __should_fail_alloc_page(gfp_mask, order);
3791
+}
3792
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3793
+
3794
+static inline long __zone_watermark_unusable_free(struct zone *z,
3795
+ unsigned int order, unsigned int alloc_flags)
3796
+{
3797
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3798
+ long unusable_free = (1 << order) - 1;
3799
+
3800
+ /*
3801
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3802
+ * the high-atomic reserves. This will over-estimate the size of the
3803
+ * atomic reserve but it avoids a search.
3804
+ */
3805
+ if (likely(!alloc_harder))
3806
+ unusable_free += z->nr_reserved_highatomic;
3807
+
3808
+#ifdef CONFIG_CMA
3809
+ /* If allocation can't use CMA areas don't use free CMA pages */
3810
+ if (!(alloc_flags & ALLOC_CMA))
3811
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3812
+#endif
3813
+
3814
+ return unusable_free;
3815
+}
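/*
 * Standalone restatement of __zone_watermark_unusable_free() above: the
 * (1 << order) - 1 pages that cannot complete a block of the requested
 * order, plus the high-atomic reserve for callers without ALLOC_HARDER or
 * ALLOC_OOM rights, plus free CMA pages for callers that may not use CMA.
 * The flag values and toy_* names are invented for the sketch.
 */
#include <stdio.h>

#define TOY_ALLOC_HARDER 0x1u
#define TOY_ALLOC_OOM    0x2u
#define TOY_ALLOC_CMA    0x4u

static long toy_unusable_free(unsigned int order, unsigned int alloc_flags,
                              long nr_highatomic, long nr_free_cma)
{
    long unusable = (1L << order) - 1;

    if (!(alloc_flags & (TOY_ALLOC_HARDER | TOY_ALLOC_OOM)))
        unusable += nr_highatomic;
    if (!(alloc_flags & TOY_ALLOC_CMA))
        unusable += nr_free_cma;
    return unusable;
}

int main(void)
{
    /* an order-3 request that may not dip into CMA or high-atomic reserves */
    printf("unusable: %ld pages\n", toy_unusable_free(3, 0, 2048, 8192));
    return 0;
}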
33623816
33633817 /*
33643818 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3367,7 +3821,7 @@
33673821 * to check in the allocation paths if no pages are free.
33683822 */
33693823 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3370
- int classzone_idx, unsigned int alloc_flags,
3824
+ int highest_zoneidx, unsigned int alloc_flags,
33713825 long free_pages)
33723826 {
33733827 long min = mark;
....@@ -3375,19 +3829,12 @@
33753829 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
33763830
33773831 /* free_pages may go negative - that's OK */
3378
- free_pages -= (1 << order) - 1;
3832
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
33793833
33803834 if (alloc_flags & ALLOC_HIGH)
33813835 min -= min / 2;
33823836
3383
- /*
3384
- * If the caller does not have rights to ALLOC_HARDER then subtract
3385
- * the high-atomic reserves. This will over-estimate the size of the
3386
- * atomic reserve but it avoids a search.
3387
- */
3388
- if (likely(!alloc_harder)) {
3389
- free_pages -= z->nr_reserved_highatomic;
3390
- } else {
3837
+ if (unlikely(alloc_harder)) {
33913838 /*
33923839 * OOM victims can try even harder than normal ALLOC_HARDER
33933840 * users on the grounds that it's definitely going to be in
....@@ -3400,19 +3847,12 @@
34003847 min -= min / 4;
34013848 }
34023849
3403
-
3404
-#ifdef CONFIG_CMA
3405
- /* If allocation can't use CMA areas don't use free CMA pages */
3406
- if (!(alloc_flags & ALLOC_CMA))
3407
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3408
-#endif
3409
-
34103850 /*
34113851 * Check watermarks for an order-0 allocation request. If these
34123852 * are not met, then a high-order request also cannot go ahead
34133853 * even if a suitable page happened to be free.
34143854 */
3415
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3855
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
34163856 return false;
34173857
34183858 /* If this is an order-0 request then the watermark is fine */
....@@ -3436,65 +3876,83 @@
34363876 if (mt == MIGRATE_CMA)
34373877 continue;
34383878 #endif
3439
- if (!list_empty(&area->free_list[mt]))
3879
+ if (!free_area_empty(area, mt))
34403880 return true;
34413881 }
34423882
34433883 #ifdef CONFIG_CMA
34443884 if ((alloc_flags & ALLOC_CMA) &&
3445
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3885
+ !free_area_empty(area, MIGRATE_CMA)) {
34463886 return true;
34473887 }
34483888 #endif
3449
- if (alloc_harder &&
3450
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3889
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
34513890 return true;
34523891 }
34533892 return false;
34543893 }
34553894
34563895 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3457
- int classzone_idx, unsigned int alloc_flags)
3896
+ int highest_zoneidx, unsigned int alloc_flags)
34583897 {
3459
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3898
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
34603899 zone_page_state(z, NR_FREE_PAGES));
34613900 }
3901
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
34623902
34633903 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3464
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3904
+ unsigned long mark, int highest_zoneidx,
3905
+ unsigned int alloc_flags, gfp_t gfp_mask)
34653906 {
3466
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3467
- long cma_pages = 0;
3907
+ long free_pages;
34683908
3469
-#ifdef CONFIG_CMA
3470
- /* If allocation can't use CMA areas don't use free CMA pages */
3471
- if (!(alloc_flags & ALLOC_CMA))
3472
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3473
-#endif
3909
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
34743910
34753911 /*
34763912 * Fast check for order-0 only. If this fails then the reserves
3477
- * need to be calculated. There is a corner case where the check
3478
- * passes but only the high-order atomic reserve are free. If
3479
- * the caller is !atomic then it'll uselessly search the free
3480
- * list. That corner case is then slower but it is harmless.
3913
+ * need to be calculated.
34813914 */
3482
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3483
- return true;
3915
+ if (!order) {
3916
+ long usable_free;
3917
+ long reserved;
34843918
3485
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3486
- free_pages);
3919
+ usable_free = free_pages;
3920
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3921
+
3922
+ /* reserved may over estimate high-atomic reserves. */
3923
+ usable_free -= min(usable_free, reserved);
3924
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3925
+ return true;
3926
+ }
3927
+
3928
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3929
+ free_pages))
3930
+ return true;
3931
+ /*
3932
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3933
+ * when checking the min watermark. The min watermark is the
3934
+ * point where boosting is ignored so that kswapd is woken up
3935
+ * when below the low watermark.
3936
+ */
3937
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3938
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3939
+ mark = z->_watermark[WMARK_MIN];
3940
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3941
+ alloc_flags, free_pages);
3942
+ }
3943
+
3944
+ return false;
34873945 }
34883946
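/*
 * Sketch of the zone_watermark_fast() flow above: try a cheap order-0 check
 * against the (possibly boosted) watermark first, fall back to the full
 * check, and for order-0 __GFP_ATOMIC requests made against WMARK_MIN retry
 * once with the unboosted min watermark. toy_full_check() stands in for
 * __zone_watermark_ok() and is deliberately simplified.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_full_check(long free, long unusable, long mark, long reserve)
{
    return free - unusable > mark + reserve;
}

static bool toy_watermark_fast(unsigned int order, bool atomic,
                               long free, long unusable,
                               long wmark_min, long boost, long reserve)
{
    long mark = wmark_min + boost;  /* assume WMARK_MIN was requested */

    /* order-0 fast path: usable free pages vs. mark + lowmem reserve */
    if (order == 0 && free - unusable > mark + reserve)
        return true;

    if (toy_full_check(free, unusable, mark, reserve))
        return true;

    /* atomic order-0: ignore the boost so kswapd still gets woken */
    if (order == 0 && atomic && boost)
        return toy_full_check(free, unusable, wmark_min, reserve);

    return false;
}

int main(void)
{
    /* boosted mark fails, the unboosted min still passes for GFP_ATOMIC */
    printf("%d\n", toy_watermark_fast(0, true, 1200, 50, 1000, 400, 0));
    return 0;
}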
34893947 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3490
- unsigned long mark, int classzone_idx)
3948
+ unsigned long mark, int highest_zoneidx)
34913949 {
34923950 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34933951
34943952 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34953953 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34963954
3497
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3955
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34983956 free_pages);
34993957 }
35003958 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3503,7 +3961,7 @@
35033961 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
35043962 {
35053963 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3506
- RECLAIM_DISTANCE;
3964
+ node_reclaim_distance;
35073965 }
35083966 #else /* CONFIG_NUMA */
35093967 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3513,6 +3971,61 @@
35133971 #endif /* CONFIG_NUMA */
35143972
35153973 /*
3974
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3975
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3976
+ * premature use of a lower zone may cause lowmem pressure problems that
3977
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3978
+ * probably too small. It only makes sense to spread allocations to avoid
3979
+ * fragmentation between the Normal and DMA32 zones.
3980
+ */
3981
+static inline unsigned int
3982
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3983
+{
3984
+ unsigned int alloc_flags;
3985
+
3986
+ /*
3987
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3988
+ * to save a branch.
3989
+ */
3990
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3991
+
3992
+#ifdef CONFIG_ZONE_DMA32
3993
+ if (!zone)
3994
+ return alloc_flags;
3995
+
3996
+ if (zone_idx(zone) != ZONE_NORMAL)
3997
+ return alloc_flags;
3998
+
3999
+ /*
4000
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
4001
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
4002
+ * on UMA that if Normal is populated then so is DMA32.
4003
+ */
4004
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
4005
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
4006
+ return alloc_flags;
4007
+
4008
+ alloc_flags |= ALLOC_NOFRAGMENT;
4009
+#endif /* CONFIG_ZONE_DMA32 */
4010
+ return alloc_flags;
4011
+}
4012
+
4013
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
4014
+ unsigned int alloc_flags)
4015
+{
4016
+#ifdef CONFIG_CMA
4017
+ unsigned int pflags = current->flags;
4018
+
4019
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
4020
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
4021
+ gfp_mask & __GFP_CMA)
4022
+ alloc_flags |= ALLOC_CMA;
4023
+
4024
+#endif
4025
+ return alloc_flags;
4026
+}
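/*
 * Condensed model of the two helpers above: gfp bits are translated into
 * allocation flags once, up front. ALLOC_KSWAPD mirrors __GFP_KSWAPD_RECLAIM,
 * ALLOC_NOFRAGMENT is set when the preferred zone is Normal and DMA32 can
 * absorb the spillover (on UMA, DMA32 is assumed populated whenever Normal
 * is), and ALLOC_CMA needs a movable, __GFP_CMA request made outside
 * PF_MEMALLOC_NOCMA. The flag values here are invented; only the decision
 * logic mirrors the patch.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_ALLOC_KSWAPD     0x1u
#define TOY_ALLOC_NOFRAGMENT 0x2u
#define TOY_ALLOC_CMA        0x4u

static unsigned int toy_alloc_flags(bool gfp_kswapd_reclaim,
                                    bool preferred_is_normal,
                                    bool dma32_populated,
                                    int nr_online_nodes,
                                    bool movable, bool gfp_cma,
                                    bool pf_memalloc_nocma)
{
    unsigned int flags = 0;

    if (gfp_kswapd_reclaim)
        flags |= TOY_ALLOC_KSWAPD;

    /* skip only when a multi-node system really has an empty DMA32 */
    if (preferred_is_normal && !(nr_online_nodes > 1 && !dma32_populated))
        flags |= TOY_ALLOC_NOFRAGMENT;

    if (movable && gfp_cma && !pf_memalloc_nocma)
        flags |= TOY_ALLOC_CMA;

    return flags;
}

int main(void)
{
    printf("flags: %#x\n",
           toy_alloc_flags(true, true, true, 2, true, true, false));
    return 0;
}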
4027
+
4028
+/*
35164029 * get_page_from_freelist goes through the zonelist trying to allocate
35174030 * a page.
35184031 */
....@@ -3520,16 +4033,20 @@
35204033 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
35214034 const struct alloc_context *ac)
35224035 {
3523
- struct zoneref *z = ac->preferred_zoneref;
4036
+ struct zoneref *z;
35244037 struct zone *zone;
35254038 struct pglist_data *last_pgdat_dirty_limit = NULL;
4039
+ bool no_fallback;
35264040
4041
+retry:
35274042 /*
35284043 * Scan zonelist, looking for a zone with enough free.
35294044 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
35304045 */
3531
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3532
- ac->nodemask) {
4046
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
4047
+ z = ac->preferred_zoneref;
4048
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4049
+ ac->nodemask) {
35334050 struct page *page;
35344051 unsigned long mark;
35354052
....@@ -3566,9 +4083,26 @@
35664083 }
35674084 }
35684085
3569
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4086
+ if (no_fallback && nr_online_nodes > 1 &&
4087
+ zone != ac->preferred_zoneref->zone) {
4088
+ int local_nid;
4089
+
4090
+ /*
4091
+ * If moving to a remote node, retry but allow
4092
+ * fragmenting fallbacks. Locality is more important
4093
+ * than fragmentation avoidance.
4094
+ */
4095
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4096
+ if (zone_to_nid(zone) != local_nid) {
4097
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4098
+ goto retry;
4099
+ }
4100
+ }
4101
+
4102
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
35704103 if (!zone_watermark_fast(zone, order, mark,
3571
- ac_classzone_idx(ac), alloc_flags)) {
4104
+ ac->highest_zoneidx, alloc_flags,
4105
+ gfp_mask)) {
35724106 int ret;
35734107
35744108 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3601,7 +4135,7 @@
36014135 default:
36024136 /* did we reclaim enough */
36034137 if (zone_watermark_ok(zone, order, mark,
3604
- ac_classzone_idx(ac), alloc_flags))
4138
+ ac->highest_zoneidx, alloc_flags))
36054139 goto try_this_zone;
36064140
36074141 continue;
....@@ -3633,30 +4167,21 @@
36334167 }
36344168 }
36354169
4170
+ /*
4171
+ * It's possible on a UMA machine to get through all zones that are
4172
+ * fragmented. If avoiding fragmentation, reset and try again.
4173
+ */
4174
+ if (no_fallback) {
4175
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4176
+ goto retry;
4177
+ }
4178
+
36364179 return NULL;
3637
-}
3638
-
3639
-/*
3640
- * Large machines with many possible nodes should not always dump per-node
3641
- * meminfo in irq context.
3642
- */
3643
-static inline bool should_suppress_show_mem(void)
3644
-{
3645
- bool ret = false;
3646
-
3647
-#if NODES_SHIFT > 8
3648
- ret = in_interrupt();
3649
-#endif
3650
- return ret;
36514180 }
36524181
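/*
 * Control-flow sketch of the retry in get_page_from_freelist() above: the
 * first pass keeps ALLOC_NOFRAGMENT so spillover stays on the local node,
 * and the flag is dropped for a second pass as soon as the walk would leave
 * the local node or finds every local zone "fragmented".
 * toy_try_zone()/toy_zone are invented placeholders.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_ALLOC_NOFRAGMENT 0x1u

struct toy_zone { int nid; bool has_page; };

static bool toy_try_zone(const struct toy_zone *z, unsigned int flags)
{
    /* pretend fragmented zones only produce pages without NOFRAGMENT */
    return z->has_page || !(flags & TOY_ALLOC_NOFRAGMENT);
}

static const struct toy_zone *toy_get_page(const struct toy_zone *zl, int nr,
                                           int local_nid, unsigned int flags)
{
retry:
    for (int i = 0; i < nr; i++) {
        if ((flags & TOY_ALLOC_NOFRAGMENT) && zl[i].nid != local_nid) {
            /* locality beats fragmentation avoidance */
            flags &= ~TOY_ALLOC_NOFRAGMENT;
            goto retry;
        }
        if (toy_try_zone(&zl[i], flags))
            return &zl[i];
    }
    if (flags & TOY_ALLOC_NOFRAGMENT) {
        /* every local zone was fragmented: retry without the flag */
        flags &= ~TOY_ALLOC_NOFRAGMENT;
        goto retry;
    }
    return NULL;
}

int main(void)
{
    struct toy_zone zl[] = { { 0, false }, { 1, true } };
    const struct toy_zone *z = toy_get_page(zl, 2, 0, TOY_ALLOC_NOFRAGMENT);

    printf("allocated from node %d\n", z ? z->nid : -1);
    return 0;
}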
36534182 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
36544183 {
36554184 unsigned int filter = SHOW_MEM_FILTER_NODES;
3656
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3657
-
3658
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3659
- return;
36604185
36614186 /*
36624187 * This documents exceptions given to allocations in certain
....@@ -3677,22 +4202,23 @@
36774202 {
36784203 struct va_format vaf;
36794204 va_list args;
3680
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3681
- DEFAULT_RATELIMIT_BURST);
4205
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
36824206
3683
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4207
+ if ((gfp_mask & __GFP_NOWARN) ||
4208
+ !__ratelimit(&nopage_rs) ||
4209
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
36844210 return;
36854211
36864212 va_start(args, fmt);
36874213 vaf.fmt = fmt;
36884214 vaf.va = &args;
3689
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4215
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36904216 current->comm, &vaf, gfp_mask, &gfp_mask,
36914217 nodemask_pr_args(nodemask));
36924218 va_end(args);
36934219
36944220 cpuset_print_current_mems_allowed();
3695
-
4221
+ pr_cont("\n");
36964222 dump_stack();
36974223 warn_alloc_show_mem(gfp_mask, nodemask);
36984224 }
....@@ -3766,11 +4292,13 @@
37664292 * success so it is time to admit defeat. We will skip the OOM killer
37674293 * because it is very likely that the caller has a more reasonable
37684294 * fallback than shooting a random task.
4295
+ *
4296
+ * The OOM killer may not free memory on a specific node.
37694297 */
3770
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4298
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
37714299 goto out;
37724300 /* The OOM killer does not needlessly kill tasks for lowmem */
3773
- if (ac->high_zoneidx < ZONE_NORMAL)
4301
+ if (ac->highest_zoneidx < ZONE_NORMAL)
37744302 goto out;
37754303 if (pm_suspended_storage())
37764304 goto out;
....@@ -3783,10 +4311,6 @@
37834311 * out_of_memory). Once filesystems are ready to handle allocation
37844312 * failures more gracefully we should just bail out here.
37854313 */
3786
-
3787
- /* The OOM killer may not free memory on a specific node */
3788
- if (gfp_mask & __GFP_THISNODE)
3789
- goto out;
37904314
37914315 /* Exhausted what can be done so it's blame time */
37924316 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3818,7 +4342,7 @@
38184342 unsigned int alloc_flags, const struct alloc_context *ac,
38194343 enum compact_priority prio, enum compact_result *compact_result)
38204344 {
3821
- struct page *page;
4345
+ struct page *page = NULL;
38224346 unsigned long pflags;
38234347 unsigned int noreclaim_flag;
38244348
....@@ -3829,13 +4353,10 @@
38294353 noreclaim_flag = memalloc_noreclaim_save();
38304354
38314355 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3832
- prio);
4356
+ prio, &page);
38334357
38344358 memalloc_noreclaim_restore(noreclaim_flag);
38354359 psi_memstall_leave(&pflags);
3836
-
3837
- if (*compact_result <= COMPACT_INACTIVE)
3838
- return NULL;
38394360
38404361 /*
38414362 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3843,7 +4364,13 @@
38434364 */
38444365 count_vm_event(COMPACTSTALL);
38454366
3846
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4367
+ /* Prep a captured page if available */
4368
+ if (page)
4369
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4370
+
4371
+ /* Try get a page from the freelist if available */
4372
+ if (!page)
4373
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
38474374
38484375 if (page) {
38494376 struct zone *zone = page_zone(page);
....@@ -3892,14 +4419,22 @@
38924419 goto check_priority;
38934420
38944421 /*
3895
- * make sure the compaction wasn't deferred or didn't bail out early
3896
- * due to locks contention before we declare that we should give up.
3897
- * But do not retry if the given zonelist is not suitable for
3898
- * compaction.
4422
+ * compaction was skipped because there are not enough order-0 pages
4423
+ * to work with, so we retry only if it looks like reclaim can help.
38994424 */
3900
- if (compaction_withdrawn(compact_result)) {
4425
+ if (compaction_needs_reclaim(compact_result)) {
39014426 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
39024427 goto out;
4428
+ }
4429
+
4430
+ /*
4431
+ * make sure the compaction wasn't deferred or didn't bail out early
4432
+ * due to lock contention before we declare that we should give up.
4433
+ * But the next retry should use a higher priority if allowed, so
4434
+ * we don't just keep bailing out endlessly.
4435
+ */
4436
+ if (compaction_withdrawn(compact_result)) {
4437
+ goto check_priority;
39034438 }
39044439
39054440 /*
....@@ -3962,10 +4497,10 @@
39624497 * Let's give them a good hope and keep retrying while the order-0
39634498 * watermarks are OK.
39644499 */
3965
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3966
- ac->nodemask) {
4500
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4501
+ ac->highest_zoneidx, ac->nodemask) {
39674502 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3968
- ac_classzone_idx(ac), alloc_flags))
4503
+ ac->highest_zoneidx, alloc_flags))
39694504 return true;
39704505 }
39714506 return false;
....@@ -4023,33 +4558,50 @@
40234558 EXPORT_SYMBOL_GPL(fs_reclaim_release);
40244559 #endif
40254560
4561
+/*
4562
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4563
+ * have been rebuilt so that the allocation can be retried. The reader side does
4564
+ * not lock and simply retries the allocation if the zonelist changes. The
4565
+ * writer side is protected by the embedded spin_lock.
4566
+ */
4567
+static DEFINE_SEQLOCK(zonelist_update_seq);
4568
+
4569
+static unsigned int zonelist_iter_begin(void)
4570
+{
4571
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4572
+ return read_seqbegin(&zonelist_update_seq);
4573
+
4574
+ return 0;
4575
+}
4576
+
4577
+static unsigned int check_retry_zonelist(unsigned int seq)
4578
+{
4579
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4580
+ return read_seqretry(&zonelist_update_seq, seq);
4581
+
4582
+ return seq;
4583
+}
4584
+
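/*
 * [Illustration, not part of the commit] The two helpers above are the read
 * side of a classic seqlock retry; the write side is taken by
 * __build_all_zonelists() further down in this patch (write_seqlock /
 * write_sequnlock on zonelist_update_seq). A minimal sketch of how a
 * lockless consumer uses them - the loop body stands in for the allocator's
 * zonelist walk:
 */
static void zonelist_read_retry_sketch(void)
{
	unsigned int seq;

	do {
		seq = zonelist_iter_begin();
		/* ... lockless zonelist walk, may observe a stale zonelist ... */
	} while (check_retry_zonelist(seq));	/* rebuilt meanwhile? start over */
}
/*
 * With CONFIG_MEMORY_HOTREMOVE=n the cookie is always 0 and the retry check
 * never fires, so the loop body runs exactly once at no extra cost.
 */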
40264585 /* Perform direct synchronous page reclaim */
4027
-static int
4586
+static unsigned long
40284587 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
40294588 const struct alloc_context *ac)
40304589 {
4031
- struct reclaim_state reclaim_state;
4032
- int progress;
40334590 unsigned int noreclaim_flag;
4034
- unsigned long pflags;
4591
+ unsigned long progress;
40354592
40364593 cond_resched();
40374594
40384595 /* We now go into synchronous reclaim */
40394596 cpuset_memory_pressure_bump();
4040
- psi_memstall_enter(&pflags);
40414597 fs_reclaim_acquire(gfp_mask);
40424598 noreclaim_flag = memalloc_noreclaim_save();
4043
- reclaim_state.reclaimed_slab = 0;
4044
- current->reclaim_state = &reclaim_state;
40454599
40464600 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
40474601 ac->nodemask);
40484602
4049
- current->reclaim_state = NULL;
40504603 memalloc_noreclaim_restore(noreclaim_flag);
40514604 fs_reclaim_release(gfp_mask);
4052
- psi_memstall_leave(&pflags);
40534605
40544606 cond_resched();
40554607
....@@ -4063,11 +4615,14 @@
40634615 unsigned long *did_some_progress)
40644616 {
40654617 struct page *page = NULL;
4618
+ unsigned long pflags;
40664619 bool drained = false;
4620
+ bool skip_pcp_drain = false;
40674621
4622
+ psi_memstall_enter(&pflags);
40684623 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
40694624 if (unlikely(!(*did_some_progress)))
4070
- return NULL;
4625
+ goto out;
40714626
40724627 retry:
40734628 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -4075,14 +4630,19 @@
40754630 /*
40764631 * If an allocation failed after direct reclaim, it could be because
40774632 * pages are pinned on the per-cpu lists or in high alloc reserves.
4078
- * Shrink them them and try again
4633
+ * Shrink them and try again
40794634 */
40804635 if (!page && !drained) {
40814636 unreserve_highatomic_pageblock(ac, false);
4082
- drain_all_pages(NULL);
4637
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4638
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4639
+ if (!skip_pcp_drain)
4640
+ drain_all_pages(NULL);
40834641 drained = true;
40844642 goto retry;
40854643 }
4644
+out:
4645
+ psi_memstall_leave(&pflags);
40864646
40874647 return page;
40884648 }
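/*
 * [Illustration, not part of the commit] Note the scope change above:
 * psi_memstall_enter()/leave() used to sit inside __perform_reclaim() and
 * covered only try_to_free_pages(); wrapping them around this whole function
 * means the post-reclaim freelist retry, the highatomic unreserve and the
 * per-cpu drain are now accounted as memory stall time as well. The
 * reclaim_state bookkeeping removed from __perform_reclaim() is presumably
 * handled inside the reclaim core itself in this kernel.
 */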
....@@ -4093,12 +4653,12 @@
40934653 struct zoneref *z;
40944654 struct zone *zone;
40954655 pg_data_t *last_pgdat = NULL;
4096
- enum zone_type high_zoneidx = ac->high_zoneidx;
4656
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40974657
4098
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4658
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40994659 ac->nodemask) {
41004660 if (last_pgdat != zone->zone_pgdat)
4101
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4661
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
41024662 last_pgdat = zone->zone_pgdat;
41034663 }
41044664 }
....@@ -4108,8 +4668,13 @@
41084668 {
41094669 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
41104670
4111
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4671
+ /*
4672
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4673
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4674
+ * to save two branches.
4675
+ */
41124676 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4677
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
41134678
41144679 /*
41154680 * The caller may dip into page reserves a bit more if the caller
....@@ -4117,7 +4682,8 @@
41174682 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
41184683 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
41194684 */
4120
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4685
+ alloc_flags |= (__force int)
4686
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
41214687
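/*
 * [Illustration, not part of the commit] The BUILD_BUG_ON()s earlier in this
 * function guarantee that __GFP_HIGH has the same bit value as ALLOC_HIGH and
 * __GFP_KSWAPD_RECLAIM the same as ALLOC_KSWAPD, so the single mask-and-OR
 * above transfers both flags with no branches. A self-contained toy version
 * of the trick (bit values are made up):
 */
#define TOY_GFP_HIGH		0x20u
#define TOY_GFP_KSWAPD		0x400u
#define TOY_ALLOC_HIGH		TOY_GFP_HIGH	/* must be bit-identical */
#define TOY_ALLOC_KSWAPD	TOY_GFP_KSWAPD	/* must be bit-identical */

static unsigned int toy_gfp_to_alloc_flags(unsigned int gfp)
{
	unsigned int alloc_flags = 0;

	/* one AND and one OR replace two conditional tests */
	alloc_flags |= gfp & (TOY_GFP_HIGH | TOY_GFP_KSWAPD);
	return alloc_flags;
}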
41224688 if (gfp_mask & __GFP_ATOMIC) {
41234689 /*
....@@ -4134,10 +4700,8 @@
41344700 } else if (unlikely(rt_task(current)) && !in_interrupt())
41354701 alloc_flags |= ALLOC_HARDER;
41364702
4137
-#ifdef CONFIG_CMA
4138
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4139
- alloc_flags |= ALLOC_CMA;
4140
-#endif
4703
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4704
+
41414705 return alloc_flags;
41424706 }
41434707
....@@ -4200,6 +4764,7 @@
42004764 {
42014765 struct zone *zone;
42024766 struct zoneref *z;
4767
+ bool ret = false;
42034768
42044769 /*
42054770 * Costly allocations might have made a progress but this doesn't mean
....@@ -4226,8 +4791,8 @@
42264791 * request even if all reclaimable pages are considered then we are
42274792 * screwed and have to go OOM.
42284793 */
4229
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4230
- ac->nodemask) {
4794
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4795
+ ac->highest_zoneidx, ac->nodemask) {
42314796 unsigned long available;
42324797 unsigned long reclaimable;
42334798 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4241,7 +4806,7 @@
42414806 * reclaimable pages?
42424807 */
42434808 wmark = __zone_watermark_ok(zone, order, min_wmark,
4244
- ac_classzone_idx(ac), alloc_flags, available);
4809
+ ac->highest_zoneidx, alloc_flags, available);
42454810 trace_reclaim_retry_zone(z, order, reclaimable,
42464811 available, min_wmark, *no_progress_loops, wmark);
42474812 if (wmark) {
....@@ -4263,25 +4828,24 @@
42634828 }
42644829 }
42654830
4266
- /*
4267
- * Memory allocation/reclaim might be called from a WQ
4268
- * context and the current implementation of the WQ
4269
- * concurrency control doesn't recognize that
4270
- * a particular WQ is congested if the worker thread is
4271
- * looping without ever sleeping. Therefore we have to
4272
- * do a short sleep here rather than calling
4273
- * cond_resched().
4274
- */
4275
- if (current->flags & PF_WQ_WORKER)
4276
- schedule_timeout_uninterruptible(1);
4277
- else
4278
- cond_resched();
4279
-
4280
- return true;
4831
+ ret = true;
4832
+ goto out;
42814833 }
42824834 }
42834835
4284
- return false;
4836
+out:
4837
+ /*
4838
+ * Memory allocation/reclaim might be called from a WQ context and the
4839
+ * current implementation of the WQ concurrency control doesn't
4840
+ * recognize that a particular WQ is congested if the worker thread is
4841
+ * looping without ever sleeping. Therefore we have to do a short sleep
4842
+ * here rather than calling cond_resched().
4843
+ */
4844
+ if (current->flags & PF_WQ_WORKER)
4845
+ schedule_timeout_uninterruptible(1);
4846
+ else
4847
+ cond_resched();
4848
+ return ret;
42854849 }
42864850
42874851 static inline bool
....@@ -4331,8 +4895,11 @@
43314895 int compaction_retries;
43324896 int no_progress_loops;
43334897 unsigned int cpuset_mems_cookie;
4898
+ unsigned int zonelist_iter_cookie;
43344899 int reserve_flags;
4900
+ unsigned long vh_record;
43354901
4902
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
43364903 /*
43374904 * We also sanity check to catch abuse of atomic reserves being used by
43384905 * callers that are not in atomic context.
....@@ -4341,11 +4908,12 @@
43414908 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
43424909 gfp_mask &= ~__GFP_ATOMIC;
43434910
4344
-retry_cpuset:
4911
+restart:
43454912 compaction_retries = 0;
43464913 no_progress_loops = 0;
43474914 compact_priority = DEF_COMPACT_PRIORITY;
43484915 cpuset_mems_cookie = read_mems_allowed_begin();
4916
+ zonelist_iter_cookie = zonelist_iter_begin();
43494917
43504918 /*
43514919 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4361,11 +4929,11 @@
43614929 * could end up iterating over non-eligible zones endlessly.
43624930 */
43634931 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4364
- ac->high_zoneidx, ac->nodemask);
4932
+ ac->highest_zoneidx, ac->nodemask);
43654933 if (!ac->preferred_zoneref->zone)
43664934 goto nopage;
43674935
4368
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4936
+ if (alloc_flags & ALLOC_KSWAPD)
43694937 wake_all_kswapds(order, gfp_mask, ac);
43704938
43714939 /*
....@@ -4398,18 +4966,28 @@
43984966
43994967 /*
44004968 * Checks for costly allocations with __GFP_NORETRY, which
4401
- * includes THP page fault allocations
4969
+ * includes some THP page fault allocations
44024970 */
44034971 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
44044972 /*
4405
- * If compaction is deferred for high-order allocations,
4406
- * it is because sync compaction recently failed. If
4407
- * this is the case and the caller requested a THP
4408
- * allocation, we do not want to heavily disrupt the
4409
- * system, so we fail the allocation instead of entering
4410
- * direct reclaim.
4973
+ * If allocating entire pageblock(s) and compaction
4974
+ * failed because all zones are below low watermarks
4975
+ * or is prohibited because it recently failed at this
4976
+ * order, fail immediately unless the allocator has
4977
+ * requested compaction and reclaim retry.
4978
+ *
4979
+ * Reclaim is
4980
+ * - potentially very expensive because zones are far
4981
+ * below their low watermarks or this is part of very
4982
+ * bursty high order allocations,
4983
+ * - not guaranteed to help because isolate_freepages()
4984
+ * may not iterate over freed pages as part of its
4985
+ * linear scan, and
4986
+ * - unlikely to make entire pageblocks free on its
4987
+ * own.
44114988 */
4412
- if (compact_result == COMPACT_DEFERRED)
4989
+ if (compact_result == COMPACT_SKIPPED ||
4990
+ compact_result == COMPACT_DEFERRED)
44134991 goto nopage;
44144992
44154993 /*
....@@ -4423,12 +5001,12 @@
44235001
44245002 retry:
44255003 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4426
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
5004
+ if (alloc_flags & ALLOC_KSWAPD)
44275005 wake_all_kswapds(order, gfp_mask, ac);
44285006
44295007 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
44305008 if (reserve_flags)
4431
- alloc_flags = reserve_flags;
5009
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
44325010
44335011 /*
44345012 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4438,7 +5016,7 @@
44385016 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
44395017 ac->nodemask = NULL;
44405018 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4441
- ac->high_zoneidx, ac->nodemask);
5019
+ ac->highest_zoneidx, ac->nodemask);
44425020 }
44435021
44445022 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4453,6 +5031,12 @@
44535031 /* Avoid recursion of direct reclaim */
44545032 if (current->flags & PF_MEMALLOC)
44555033 goto nopage;
5034
+
5035
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
5036
+ alloc_flags, ac->migratetype, &page);
5037
+
5038
+ if (page)
5039
+ goto got_pg;
44565040
44575041 /* Try direct reclaim and then allocating */
44585042 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4494,9 +5078,13 @@
44945078 goto retry;
44955079
44965080
4497
- /* Deal with possible cpuset update races before we start OOM killing */
4498
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4499
- goto retry_cpuset;
5081
+ /*
5082
+ * Deal with possible cpuset update races or zonelist updates to avoid
5083
+ * an unnecessary OOM kill.
5084
+ */
5085
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5086
+ check_retry_zonelist(zonelist_iter_cookie))
5087
+ goto restart;
45005088
45015089 /* Reclaim has failed us, start killing things */
45025090 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4505,7 +5093,7 @@
45055093
45065094 /* Avoid allocations with no watermarks from looping endlessly */
45075095 if (tsk_is_oom_victim(current) &&
4508
- (alloc_flags == ALLOC_OOM ||
5096
+ (alloc_flags & ALLOC_OOM ||
45095097 (gfp_mask & __GFP_NOMEMALLOC)))
45105098 goto nopage;
45115099
....@@ -4516,9 +5104,13 @@
45165104 }
45175105
45185106 nopage:
4519
- /* Deal with possible cpuset update races before we fail */
4520
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4521
- goto retry_cpuset;
5107
+ /*
5108
+ * Deal with possible cpuset update races or zonelist updates to avoid
5109
+ * an unnecessary OOM kill.
5110
+ */
5111
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5112
+ check_retry_zonelist(zonelist_iter_cookie))
5113
+ goto restart;
45225114
45235115 /*
45245116 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4561,9 +5153,15 @@
45615153 goto retry;
45625154 }
45635155 fail:
5156
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5157
+ alloc_flags, ac->migratetype, &page);
5158
+ if (page)
5159
+ goto got_pg;
5160
+
45645161 warn_alloc(gfp_mask, ac->nodemask,
45655162 "page allocation failure: order:%u", order);
45665163 got_pg:
5164
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
45675165 return page;
45685166 }
45695167
....@@ -4572,14 +5170,18 @@
45725170 struct alloc_context *ac, gfp_t *alloc_mask,
45735171 unsigned int *alloc_flags)
45745172 {
4575
- ac->high_zoneidx = gfp_zone(gfp_mask);
5173
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
45765174 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
45775175 ac->nodemask = nodemask;
4578
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5176
+ ac->migratetype = gfp_migratetype(gfp_mask);
45795177
45805178 if (cpusets_enabled()) {
45815179 *alloc_mask |= __GFP_HARDWALL;
4582
- if (!ac->nodemask)
5180
+ /*
5181
+ * When we are in interrupt context, the allocation is not tied to the
5182
+ * current task's context, so any node is OK.
5183
+ */
5184
+ if (!in_interrupt() && !ac->nodemask)
45835185 ac->nodemask = &cpuset_current_mems_allowed;
45845186 else
45855187 *alloc_flags |= ALLOC_CPUSET;
....@@ -4593,15 +5195,8 @@
45935195 if (should_fail_alloc_page(gfp_mask, order))
45945196 return false;
45955197
4596
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4597
- *alloc_flags |= ALLOC_CMA;
5198
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45985199
4599
- return true;
4600
-}
4601
-
4602
-/* Determine whether to spread dirty pages and what the first usable zone */
4603
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4604
-{
46055200 /* Dirty zone balancing only done in the fast path */
46065201 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
46075202
....@@ -4611,7 +5206,9 @@
46115206 * may get reset for allocations that ignore memory policies.
46125207 */
46135208 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4614
- ac->high_zoneidx, ac->nodemask);
5209
+ ac->highest_zoneidx, ac->nodemask);
5210
+
5211
+ return true;
46155212 }
46165213
46175214 /*
....@@ -4640,7 +5237,11 @@
46405237 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
46415238 return NULL;
46425239
4643
- finalise_ac(gfp_mask, &ac);
5240
+ /*
5241
+ * Forbid the first pass from falling back to types that fragment
5242
+ * memory until all local zones are considered.
5243
+ */
5244
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
46445245
46455246 /* First allocation attempt */
46465247 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4660,14 +5261,13 @@
46605261 * Restore the original nodemask if it was potentially replaced with
46615262 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
46625263 */
4663
- if (unlikely(ac.nodemask != nodemask))
4664
- ac.nodemask = nodemask;
5264
+ ac.nodemask = nodemask;
46655265
46665266 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
46675267
46685268 out:
46695269 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4670
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5270
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
46715271 __free_pages(page, order);
46725272 page = NULL;
46735273 }
....@@ -4705,13 +5305,17 @@
47055305 if (order == 0) /* Via pcp? */
47065306 free_unref_page(page);
47075307 else
4708
- __free_pages_ok(page, order);
5308
+ __free_pages_ok(page, order, FPI_NONE);
47095309 }
47105310
47115311 void __free_pages(struct page *page, unsigned int order)
47125312 {
5313
+ trace_android_vh_free_pages(page, order);
47135314 if (put_page_testzero(page))
47145315 free_the_page(page, order);
5316
+ else if (!PageHead(page))
5317
+ while (order-- > 0)
5318
+ free_the_page(page + (1 << order), order);
47155319 }
47165320 EXPORT_SYMBOL(__free_pages);
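/*
 * [Illustration, not part of the commit] The new else-branch handles a
 * non-compound high-order page whose refcount did not drop to zero here,
 * e.g. because a speculative pagecache lookup still holds a reference. That
 * reference only pins the first (order-0) page, so the remaining
 * 2^order - 1 pages are peeled off as progressively smaller blocks. For
 * order = 3 the loop performs:
 *
 *	free_the_page(page + 4, 2);	// pages 4..7
 *	free_the_page(page + 2, 1);	// pages 2..3
 *	free_the_page(page + 1, 0);	// page 1
 *
 * and page 0 is freed later by whoever drops the last reference.
 */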
47175321
....@@ -4816,6 +5420,18 @@
48165420 /* reset page count bias and offset to start of new frag */
48175421 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
48185422 offset = size - fragsz;
5423
+ if (unlikely(offset < 0)) {
5424
+ /*
5425
+ * The caller is trying to allocate a fragment
5426
+ * with fragsz > PAGE_SIZE but the cache isn't big
5427
+ * enough to satisfy the request; this may
5428
+ * happen in low memory conditions.
5429
+ * We don't release the cache page because
5430
+ * it could make memory pressure worse
5431
+ * so we simply return NULL here.
5432
+ */
5433
+ return NULL;
5434
+ }
48195435 }
48205436
48215437 nc->pagecnt_bias--;
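/*
 * [Illustration, not part of the commit] Worked example for the offset check
 * added above, with a single-page cache (size == PAGE_SIZE == 4096): a
 * request of fragsz = 8192 gives offset = 4096 - 8192 = -4096, so the
 * function now fails with NULL instead of handing the caller a pointer
 * computed from a negative offset, i.e. memory below the start of the cache
 * page. The cache page itself is kept to avoid adding more memory pressure,
 * as the comment above explains.
 */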
....@@ -4856,7 +5472,7 @@
48565472 /**
48575473 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
48585474 * @size: the number of bytes to allocate
4859
- * @gfp_mask: GFP flags for the allocation
5475
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48605476 *
48615477 * This function is similar to alloc_pages(), except that it allocates the
48625478 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4865,11 +5481,16 @@
48655481 * This function is also limited by MAX_ORDER.
48665482 *
48675483 * Memory allocated by this function must be released by free_pages_exact().
5484
+ *
5485
+ * Return: pointer to the allocated area or %NULL in case of error.
48685486 */
48695487 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
48705488 {
48715489 unsigned int order = get_order(size);
48725490 unsigned long addr;
5491
+
5492
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5493
+ gfp_mask &= ~__GFP_COMP;
48735494
48745495 addr = __get_free_pages(gfp_mask, order);
48755496 return make_alloc_exact(addr, order, size);
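/*
 * [Illustration, not part of the commit] alloc_pages_exact() allocates a
 * power-of-two order internally, then splits the block and frees the unused
 * tail pages; that split cannot be done on a compound page, hence the new
 * WARN_ON_ONCE() that strips __GFP_COMP. A usage sketch, assuming kernel
 * context (example_exact_user() is made up):
 */
static int example_exact_user(void)
{
	size_t sz = 5 * PAGE_SIZE;	/* rounded up to order 3 internally */
	void *buf = alloc_pages_exact(sz, GFP_KERNEL);	/* no __GFP_COMP here */

	if (!buf)
		return -ENOMEM;
	/* ... use the 5-page buffer; the 3 surplus pages were already freed ... */
	free_pages_exact(buf, sz);	/* size must match the allocation */
	return 0;
}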
....@@ -4881,15 +5502,22 @@
48815502 * pages on a node.
48825503 * @nid: the preferred node ID where memory should be allocated
48835504 * @size: the number of bytes to allocate
4884
- * @gfp_mask: GFP flags for the allocation
5505
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48855506 *
48865507 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48875508 * back.
5509
+ *
5510
+ * Return: pointer to the allocated area or %NULL in case of error.
48885511 */
48895512 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48905513 {
48915514 unsigned int order = get_order(size);
4892
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5515
+ struct page *p;
5516
+
5517
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5518
+ gfp_mask &= ~__GFP_COMP;
5519
+
5520
+ p = alloc_pages_node(nid, gfp_mask, order);
48935521 if (!p)
48945522 return NULL;
48955523 return make_alloc_exact((unsigned long)page_address(p), order, size);
....@@ -4918,11 +5546,13 @@
49185546 * nr_free_zone_pages - count number of pages beyond high watermark
49195547 * @offset: The zone index of the highest zone
49205548 *
4921
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5549
+ * nr_free_zone_pages() counts the number of pages which are beyond the
49225550 * high watermark within all zones at or below a given zone index. For each
49235551 * zone, the number of pages is calculated as:
49245552 *
49255553 * nr_free_zone_pages = managed_pages - high_pages
5554
+ *
5555
+ * Return: number of pages beyond high watermark.
49265556 */
49275557 static unsigned long nr_free_zone_pages(int offset)
49285558 {
....@@ -4935,7 +5565,7 @@
49355565 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
49365566
49375567 for_each_zone_zonelist(zone, z, zonelist, offset) {
4938
- unsigned long size = zone->managed_pages;
5568
+ unsigned long size = zone_managed_pages(zone);
49395569 unsigned long high = high_wmark_pages(zone);
49405570 if (size > high)
49415571 sum += size - high;
....@@ -4949,23 +5579,15 @@
49495579 *
49505580 * nr_free_buffer_pages() counts the number of pages which are beyond the high
49515581 * watermark within ZONE_DMA and ZONE_NORMAL.
5582
+ *
5583
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5584
+ * ZONE_NORMAL.
49525585 */
49535586 unsigned long nr_free_buffer_pages(void)
49545587 {
49555588 return nr_free_zone_pages(gfp_zone(GFP_USER));
49565589 }
49575590 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4958
-
4959
-/**
4960
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4961
- *
4962
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4963
- * high watermark within all zones.
4964
- */
4965
-unsigned long nr_free_pagecache_pages(void)
4966
-{
4967
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4968
-}
49695591
49705592 static inline void show_node(struct zone *zone)
49715593 {
....@@ -4987,7 +5609,7 @@
49875609 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49885610
49895611 for_each_zone(zone)
4990
- wmark_low += zone->watermark[WMARK_LOW];
5612
+ wmark_low += low_wmark_pages(zone);
49915613
49925614 /*
49935615 * Estimate the amount of memory available for userspace allocations,
....@@ -5009,8 +5631,8 @@
50095631 * items that are in use, and cannot be freed. Cap this estimate at the
50105632 * low watermark.
50115633 */
5012
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
5013
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5634
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5635
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
50145636 available += reclaimable - min(reclaimable / 2, wmark_low);
50155637
50165638 if (available < 0)
....@@ -5021,11 +5643,11 @@
50215643
50225644 void si_meminfo(struct sysinfo *val)
50235645 {
5024
- val->totalram = totalram_pages;
5646
+ val->totalram = totalram_pages();
50255647 val->sharedram = global_node_page_state(NR_SHMEM);
50265648 val->freeram = global_zone_page_state(NR_FREE_PAGES);
50275649 val->bufferram = nr_blockdev_pages();
5028
- val->totalhigh = totalhigh_pages;
5650
+ val->totalhigh = totalhigh_pages();
50295651 val->freehigh = nr_free_highpages();
50305652 val->mem_unit = PAGE_SIZE;
50315653 }
....@@ -5042,7 +5664,7 @@
50425664 pg_data_t *pgdat = NODE_DATA(nid);
50435665
50445666 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5045
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5667
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
50465668 val->totalram = managed_pages;
50475669 val->sharedram = node_page_state(pgdat, NR_SHMEM);
50485670 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -5051,7 +5673,7 @@
50515673 struct zone *zone = &pgdat->node_zones[zone_type];
50525674
50535675 if (is_highmem(zone)) {
5054
- managed_highpages += zone->managed_pages;
5676
+ managed_highpages += zone_managed_pages(zone);
50555677 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
50565678 }
50575679 }
....@@ -5140,7 +5762,7 @@
51405762
51415763 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
51425764 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5143
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5765
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
51445766 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
51455767 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
51465768 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5153,9 +5775,8 @@
51535775 global_node_page_state(NR_UNEVICTABLE),
51545776 global_node_page_state(NR_FILE_DIRTY),
51555777 global_node_page_state(NR_WRITEBACK),
5156
- global_node_page_state(NR_UNSTABLE_NFS),
5157
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5158
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5778
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5779
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
51595780 global_node_page_state(NR_FILE_MAPPED),
51605781 global_node_page_state(NR_SHMEM),
51615782 global_zone_page_state(NR_PAGETABLE),
....@@ -5164,6 +5785,7 @@
51645785 free_pcp,
51655786 global_zone_page_state(NR_FREE_CMA_PAGES));
51665787
5788
+ trace_android_vh_show_mapcount_pages(NULL);
51675789 for_each_online_pgdat(pgdat) {
51685790 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
51695791 continue;
....@@ -5186,7 +5808,10 @@
51865808 " anon_thp: %lukB"
51875809 #endif
51885810 " writeback_tmp:%lukB"
5189
- " unstable:%lukB"
5811
+ " kernel_stack:%lukB"
5812
+#ifdef CONFIG_SHADOW_CALL_STACK
5813
+ " shadow_call_stack:%lukB"
5814
+#endif
51905815 " all_unreclaimable? %s"
51915816 "\n",
51925817 pgdat->node_id,
....@@ -5208,7 +5833,10 @@
52085833 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
52095834 #endif
52105835 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5211
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5836
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5837
+#ifdef CONFIG_SHADOW_CALL_STACK
5838
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5839
+#endif
52125840 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
52135841 "yes" : "no");
52145842 }
....@@ -5230,6 +5858,7 @@
52305858 " min:%lukB"
52315859 " low:%lukB"
52325860 " high:%lukB"
5861
+ " reserved_highatomic:%luKB"
52335862 " active_anon:%lukB"
52345863 " inactive_anon:%lukB"
52355864 " active_file:%lukB"
....@@ -5239,10 +5868,6 @@
52395868 " present:%lukB"
52405869 " managed:%lukB"
52415870 " mlocked:%lukB"
5242
- " kernel_stack:%lukB"
5243
-#ifdef CONFIG_SHADOW_CALL_STACK
5244
- " shadow_call_stack:%lukB"
5245
-#endif
52465871 " pagetables:%lukB"
52475872 " bounce:%lukB"
52485873 " free_pcp:%lukB"
....@@ -5254,6 +5879,7 @@
52545879 K(min_wmark_pages(zone)),
52555880 K(low_wmark_pages(zone)),
52565881 K(high_wmark_pages(zone)),
5882
+ K(zone->nr_reserved_highatomic),
52575883 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
52585884 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
52595885 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5261,12 +5887,8 @@
52615887 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
52625888 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
52635889 K(zone->present_pages),
5264
- K(zone->managed_pages),
5890
+ K(zone_managed_pages(zone)),
52655891 K(zone_page_state(zone, NR_MLOCK)),
5266
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5267
-#ifdef CONFIG_SHADOW_CALL_STACK
5268
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5269
-#endif
52705892 K(zone_page_state(zone, NR_PAGETABLE)),
52715893 K(zone_page_state(zone, NR_BOUNCE)),
52725894 K(free_pcp),
....@@ -5298,7 +5920,7 @@
52985920
52995921 types[order] = 0;
53005922 for (type = 0; type < MIGRATE_TYPES; type++) {
5301
- if (!list_empty(&area->free_list[type]))
5923
+ if (!free_area_empty(area, type))
53025924 types[order] |= 1 << type;
53035925 }
53045926 }
....@@ -5339,7 +5961,7 @@
53395961 do {
53405962 zone_type--;
53415963 zone = pgdat->node_zones + zone_type;
5342
- if (managed_zone(zone)) {
5964
+ if (populated_zone(zone)) {
53435965 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
53445966 check_highest_zone(zone_type);
53455967 }
....@@ -5365,36 +5987,17 @@
53655987 return 0;
53665988 }
53675989
5368
-static __init int setup_numa_zonelist_order(char *s)
5369
-{
5370
- if (!s)
5371
- return 0;
5372
-
5373
- return __parse_numa_zonelist_order(s);
5374
-}
5375
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5376
-
53775990 char numa_zonelist_order[] = "Node";
53785991
53795992 /*
53805993 * sysctl handler for numa_zonelist_order
53815994 */
53825995 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5383
- void __user *buffer, size_t *length,
5384
- loff_t *ppos)
5996
+ void *buffer, size_t *length, loff_t *ppos)
53855997 {
5386
- char *str;
5387
- int ret;
5388
-
5389
- if (!write)
5390
- return proc_dostring(table, write, buffer, length, ppos);
5391
- str = memdup_user_nul(buffer, 16);
5392
- if (IS_ERR(str))
5393
- return PTR_ERR(str);
5394
-
5395
- ret = __parse_numa_zonelist_order(str);
5396
- kfree(str);
5397
- return ret;
5998
+ if (write)
5999
+ return __parse_numa_zonelist_order(buffer);
6000
+ return proc_dostring(table, write, buffer, length, ppos);
53986001 }
53996002
54006003
....@@ -5413,14 +6016,14 @@
54136016 * from each node to each node in the system), and should also prefer nodes
54146017 * with no CPUs, since presumably they'll have very little allocation pressure
54156018 * on them otherwise.
5416
- * It returns -1 if no node is found.
6019
+ *
6020
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
54176021 */
54186022 static int find_next_best_node(int node, nodemask_t *used_node_mask)
54196023 {
54206024 int n, val;
54216025 int min_val = INT_MAX;
54226026 int best_node = NUMA_NO_NODE;
5423
- const struct cpumask *tmp = cpumask_of_node(0);
54246027
54256028 /* Use the local node if we haven't already */
54266029 if (!node_isset(node, *used_node_mask)) {
....@@ -5441,8 +6044,7 @@
54416044 val += (n < node);
54426045
54436046 /* Give preference to headless and unused nodes */
5444
- tmp = cpumask_of_node(n);
5445
- if (!cpumask_empty(tmp))
6047
+ if (!cpumask_empty(cpumask_of_node(n)))
54466048 val += PENALTY_FOR_NODE_WITH_CPUS;
54476049
54486050 /* Slight preference for less loaded node */
....@@ -5513,14 +6115,13 @@
55136115 {
55146116 static int node_order[MAX_NUMNODES];
55156117 int node, load, nr_nodes = 0;
5516
- nodemask_t used_mask;
6118
+ nodemask_t used_mask = NODE_MASK_NONE;
55176119 int local_node, prev_node;
55186120
55196121 /* NUMA-aware ordering of nodes */
55206122 local_node = pgdat->node_id;
55216123 load = nr_online_nodes;
55226124 prev_node = local_node;
5523
- nodes_clear(used_mask);
55246125
55256126 memset(node_order, 0, sizeof(node_order));
55266127 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5627,9 +6228,8 @@
56276228 int nid;
56286229 int __maybe_unused cpu;
56296230 pg_data_t *self = data;
5630
- static DEFINE_SPINLOCK(lock);
56316231
5632
- spin_lock(&lock);
6232
+ write_seqlock(&zonelist_update_seq);
56336233
56346234 #ifdef CONFIG_NUMA
56356235 memset(node_load, 0, sizeof(node_load));
....@@ -5662,7 +6262,7 @@
56626262 #endif
56636263 }
56646264
5665
- spin_unlock(&lock);
6265
+ write_sequnlock(&zonelist_update_seq);
56666266 }
56676267
56686268 static noinline void __init
....@@ -5700,13 +6300,16 @@
57006300 */
57016301 void __ref build_all_zonelists(pg_data_t *pgdat)
57026302 {
6303
+ unsigned long vm_total_pages;
6304
+
57036305 if (system_state == SYSTEM_BOOTING) {
57046306 build_all_zonelists_init();
57056307 } else {
57066308 __build_all_zonelists(pgdat);
57076309 /* cpuset refresh routine should be here */
57086310 }
5709
- vm_total_pages = nr_free_pagecache_pages();
6311
+ /* Get the number of free pages beyond high watermark in all zones. */
6312
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
57106313 /*
57116314 * Disable grouping by mobility if the number of pages in the
57126315 * system is too low to allow the mechanism to work. It would be
....@@ -5719,7 +6322,7 @@
57196322 else
57206323 page_group_by_mobility_disabled = 0;
57216324
5722
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6325
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
57236326 nr_online_nodes,
57246327 page_group_by_mobility_disabled ? "off" : "on",
57256328 vm_total_pages);
....@@ -5728,81 +6331,148 @@
57286331 #endif
57296332 }
57306333
6334
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6335
+static bool __meminit
6336
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6337
+{
6338
+ static struct memblock_region *r;
6339
+
6340
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6341
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6342
+ for_each_mem_region(r) {
6343
+ if (*pfn < memblock_region_memory_end_pfn(r))
6344
+ break;
6345
+ }
6346
+ }
6347
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6348
+ memblock_is_mirror(r)) {
6349
+ *pfn = memblock_region_memory_end_pfn(r);
6350
+ return true;
6351
+ }
6352
+ }
6353
+ return false;
6354
+}
6355
+
57316356 /*
57326357 * Initially all pages are reserved - free ones are freed
5733
- * up by free_all_bootmem() once the early boot process is
6358
+ * up by memblock_free_all() once the early boot process is
57346359 * done. Non-atomic initialization, single-pass.
6360
+ *
6361
+ * All aligned pageblocks are initialized to the specified migratetype
6362
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6363
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
57356364 */
57366365 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5737
- unsigned long start_pfn, enum meminit_context context,
5738
- struct vmem_altmap *altmap)
6366
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6367
+ enum meminit_context context,
6368
+ struct vmem_altmap *altmap, int migratetype)
57396369 {
5740
- unsigned long end_pfn = start_pfn + size;
5741
- pg_data_t *pgdat = NODE_DATA(nid);
5742
- unsigned long pfn;
5743
- unsigned long nr_initialised = 0;
6370
+ unsigned long pfn, end_pfn = start_pfn + size;
57446371 struct page *page;
5745
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5746
- struct memblock_region *r = NULL, *tmp;
5747
-#endif
57486372
57496373 if (highest_memmap_pfn < end_pfn - 1)
57506374 highest_memmap_pfn = end_pfn - 1;
6375
+
6376
+#ifdef CONFIG_ZONE_DEVICE
6377
+ /*
6378
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6379
+ * memory. We limit the total number of pages to initialize to just
6380
+ * those that might contain the memory mapping. We will defer the
6381
+ * ZONE_DEVICE page initialization until after we have released
6382
+ * the hotplug lock.
6383
+ */
6384
+ if (zone == ZONE_DEVICE) {
6385
+ if (!altmap)
6386
+ return;
6387
+
6388
+ if (start_pfn == altmap->base_pfn)
6389
+ start_pfn += altmap->reserve;
6390
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6391
+ }
6392
+#endif
57516393
57526394 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
57536395 /* Zero all page struct in advance */
57546396 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
57556397 #endif
57566398
5757
- /*
5758
- * Honor reservation requested by the driver for this ZONE_DEVICE
5759
- * memory
5760
- */
5761
- if (altmap && start_pfn == altmap->base_pfn)
5762
- start_pfn += altmap->reserve;
5763
-
5764
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6399
+ for (pfn = start_pfn; pfn < end_pfn; ) {
57656400 /*
57666401 * There can be holes in boot-time mem_map[]s handed to this
57676402 * function. They do not exist on hotplugged memory.
57686403 */
5769
- if (context != MEMINIT_EARLY)
5770
- goto not_early;
5771
-
5772
- if (!early_pfn_valid(pfn))
5773
- continue;
5774
- if (!early_pfn_in_nid(pfn, nid))
5775
- continue;
5776
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5777
- break;
5778
-
5779
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5780
- /*
5781
- * Check given memblock attribute by firmware which can affect
5782
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5783
- * mirrored, it's an overlapped memmap init. skip it.
5784
- */
5785
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5786
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5787
- for_each_memblock(memory, tmp)
5788
- if (pfn < memblock_region_memory_end_pfn(tmp))
5789
- break;
5790
- r = tmp;
5791
- }
5792
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5793
- memblock_is_mirror(r)) {
5794
- /* already initialized as NORMAL */
5795
- pfn = memblock_region_memory_end_pfn(r);
6404
+ if (context == MEMINIT_EARLY) {
6405
+ if (overlap_memmap_init(zone, &pfn))
57966406 continue;
5797
- }
6407
+ if (defer_init(nid, pfn, zone_end_pfn))
6408
+ break;
57986409 }
5799
-#endif
58006410
5801
-not_early:
58026411 page = pfn_to_page(pfn);
58036412 __init_single_page(page, pfn, zone, nid, false);
58046413 if (context == MEMINIT_HOTPLUG)
5805
- SetPageReserved(page);
6414
+ __SetPageReserved(page);
6415
+
6416
+ /*
6417
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6418
+ * such that unmovable allocations won't be scattered all
6419
+ * over the place during system boot.
6420
+ */
6421
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6422
+ set_pageblock_migratetype(page, migratetype);
6423
+ cond_resched();
6424
+ }
6425
+ pfn++;
6426
+ }
6427
+}
6428
+
6429
+#ifdef CONFIG_ZONE_DEVICE
6430
+void __ref memmap_init_zone_device(struct zone *zone,
6431
+ unsigned long start_pfn,
6432
+ unsigned long nr_pages,
6433
+ struct dev_pagemap *pgmap)
6434
+{
6435
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6436
+ struct pglist_data *pgdat = zone->zone_pgdat;
6437
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6438
+ unsigned long zone_idx = zone_idx(zone);
6439
+ unsigned long start = jiffies;
6440
+ int nid = pgdat->node_id;
6441
+
6442
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6443
+ return;
6444
+
6445
+ /*
6446
+ * The call to memmap_init should have already taken care
6447
+ * of the pages reserved for the memmap, so we can just jump to
6448
+ * the end of that region and start processing the device pages.
6449
+ */
6450
+ if (altmap) {
6451
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6452
+ nr_pages = end_pfn - start_pfn;
6453
+ }
6454
+
6455
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6456
+ struct page *page = pfn_to_page(pfn);
6457
+
6458
+ __init_single_page(page, pfn, zone_idx, nid, true);
6459
+
6460
+ /*
6461
+ * Mark page reserved as it will need to wait for onlining
6462
+ * phase for it to be fully associated with a zone.
6463
+ *
6464
+ * We can use the non-atomic __set_bit operation for setting
6465
+ * the flag as we are still initializing the pages.
6466
+ */
6467
+ __SetPageReserved(page);
6468
+
6469
+ /*
6470
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6471
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6472
+ * ever freed or placed on a driver-private list.
6473
+ */
6474
+ page->pgmap = pgmap;
6475
+ page->zone_device_data = NULL;
58066476
58076477 /*
58086478 * Mark the block movable so that blocks are reserved for
....@@ -5811,21 +6481,20 @@
58116481 * the address space during boot when many long-lived
58126482 * kernel allocations are made.
58136483 *
5814
- * bitmap is created for zone's valid pfn range. but memmap
5815
- * can be created for invalid pages (for alignment)
5816
- * check here not to call set_pageblock_migratetype() against
5817
- * pfn out of zone.
5818
- *
58196484 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5820
- * because this is done early in sparse_add_one_section
6485
+ * because this is done early in section_activate()
58216486 */
5822
- if (!(pfn & (pageblock_nr_pages - 1))) {
6487
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
58236488 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
58246489 cond_resched();
58256490 }
58266491 }
6492
+
6493
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6494
+ nr_pages, jiffies_to_msecs(jiffies - start));
58276495 }
58286496
6497
+#endif
58296498 static void __meminit zone_init_free_lists(struct zone *zone)
58306499 {
58316500 unsigned int order, t;
....@@ -5835,11 +6504,118 @@
58356504 }
58366505 }
58376506
5838
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5839
-#define memmap_init(size, nid, zone, start_pfn) \
5840
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5841
- MEMINIT_EARLY, NULL)
6507
+/*
6508
+ * Only struct pages that correspond to ranges defined by memblock.memory
6509
+ * are zeroed and initialized by going through __init_single_page() during
6510
+ * memmap_init_zone_range().
6511
+ *
6512
+ * But, there could be struct pages that correspond to holes in
6513
+ * memblock.memory. This can happen because of the following reasons:
6514
+ * - physical memory bank size is not necessarily the exact multiple of the
6515
+ * arbitrary section size
6516
+ * - early reserved memory may not be listed in memblock.memory
6517
+ * - memory layouts defined with memmap= kernel parameter may not align
6518
+ * nicely with memmap sections
6519
+ *
6520
+ * Explicitly initialize those struct pages so that:
6521
+ * - PG_Reserved is set
6522
+ * - zone and node links point to zone and node that span the page if the
6523
+ * hole is in the middle of a zone
6524
+ * - zone and node links point to adjacent zone/node if the hole falls on
6525
+ * the zone boundary; the pages in such holes will be prepended to the
6526
+ * zone/node above the hole except for the trailing pages in the last
6527
+ * section that will be appended to the zone/node below.
6528
+ */
6529
+static void __init init_unavailable_range(unsigned long spfn,
6530
+ unsigned long epfn,
6531
+ int zone, int node)
6532
+{
6533
+ unsigned long pfn;
6534
+ u64 pgcnt = 0;
6535
+
6536
+ for (pfn = spfn; pfn < epfn; pfn++) {
6537
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6538
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6539
+ + pageblock_nr_pages - 1;
6540
+ continue;
6541
+ }
6542
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6543
+ __SetPageReserved(pfn_to_page(pfn));
6544
+ pgcnt++;
6545
+ }
6546
+
6547
+ if (pgcnt)
6548
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6549
+ node, zone_names[zone], pgcnt);
6550
+}
6551
+
6552
+static void __init memmap_init_zone_range(struct zone *zone,
6553
+ unsigned long start_pfn,
6554
+ unsigned long end_pfn,
6555
+ unsigned long *hole_pfn)
6556
+{
6557
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6558
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6559
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6560
+
6561
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6562
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6563
+
6564
+ if (start_pfn >= end_pfn)
6565
+ return;
6566
+
6567
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6568
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6569
+
6570
+ if (*hole_pfn < start_pfn)
6571
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6572
+
6573
+ *hole_pfn = end_pfn;
6574
+}
6575
+
6576
+void __init __weak memmap_init(void)
6577
+{
6578
+ unsigned long start_pfn, end_pfn;
6579
+ unsigned long hole_pfn = 0;
6580
+ int i, j, zone_id, nid;
6581
+
6582
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6583
+ struct pglist_data *node = NODE_DATA(nid);
6584
+
6585
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6586
+ struct zone *zone = node->node_zones + j;
6587
+
6588
+ if (!populated_zone(zone))
6589
+ continue;
6590
+
6591
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6592
+ &hole_pfn);
6593
+ zone_id = j;
6594
+ }
6595
+ }
6596
+
6597
+#ifdef CONFIG_SPARSEMEM
6598
+ /*
6599
+ * Initialize the memory map for hole in the range [memory_end,
6600
+ * section_end].
6601
+ * Append the pages in this hole to the highest zone in the last
6602
+ * node.
6603
+ * The call to init_unavailable_range() is outside the ifdef to
6604
+ * silence the compiler warining about zone_id set but not used;
6605
+ * for FLATMEM it is a nop anyway
6606
+ */
6607
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6608
+ if (hole_pfn < end_pfn)
58426609 #endif
6610
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6611
+}
6612
+
6613
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6614
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6615
+ unsigned long zone,
6616
+ unsigned long range_start_pfn)
6617
+{
6618
+}
58436619
58446620 static int zone_batchsize(struct zone *zone)
58456621 {
....@@ -5850,7 +6626,7 @@
58506626 * The per-cpu-pages pools are set to around 1000th of the
58516627 * size of the zone.
58526628 */
5853
- batch = zone->managed_pages / 1024;
6629
+ batch = zone_managed_pages(zone) / 1024;
58546630 /* But no more than a meg. */
58556631 if (batch * PAGE_SIZE > 1024 * 1024)
58566632 batch = (1024 * 1024) / PAGE_SIZE;
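/*
 * [Illustration, not part of the commit] Worked example with 4 KiB pages: a
 * zone with 2 GiB managed has 524288 pages, so 524288 / 1024 = 512, but
 * 512 * 4 KiB = 2 MiB exceeds the 1 MiB cap, so batch is clamped to
 * (1024 * 1024) / 4096 = 256 pages. Zones of about 1 GiB or less keep the
 * uncapped ~0.1% value. (The rest of the function, outside this hunk, scales
 * and rounds this figure further.)
 */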
....@@ -5897,7 +6673,7 @@
58976673 * locking.
58986674 *
58996675 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5900
- * those fields changing asynchronously (acording the the above rule).
6676
+ * those fields changing asynchronously (according to the above rule).
59016677 *
59026678 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
59036679 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5931,7 +6707,6 @@
59316707 memset(p, 0, sizeof(*p));
59326708
59336709 pcp = &p->pcp;
5934
- pcp->count = 0;
59356710 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
59366711 INIT_LIST_HEAD(&pcp->lists[migratetype]);
59376712 }
....@@ -5961,7 +6736,7 @@
59616736 {
59626737 if (percpu_pagelist_fraction)
59636738 pageset_set_high(pcp,
5964
- (zone->managed_pages /
6739
+ (zone_managed_pages(zone) /
59656740 percpu_pagelist_fraction));
59666741 else
59676742 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5991,9 +6766,24 @@
59916766 {
59926767 struct pglist_data *pgdat;
59936768 struct zone *zone;
6769
+ int __maybe_unused cpu;
59946770
59956771 for_each_populated_zone(zone)
59966772 setup_zone_pageset(zone);
6773
+
6774
+#ifdef CONFIG_NUMA
6775
+ /*
6776
+ * Unpopulated zones continue using the boot pagesets.
6777
+ * The numa stats for these pagesets need to be reset.
6778
+ * Otherwise, they will end up skewing the stats of
6779
+ * the nodes these zones are associated with.
6780
+ */
6781
+ for_each_possible_cpu(cpu) {
6782
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6783
+ memset(pcp->vm_numa_stat_diff, 0,
6784
+ sizeof(pcp->vm_numa_stat_diff));
6785
+ }
6786
+#endif
59976787
59986788 for_each_online_pgdat(pgdat)
59996789 pgdat->per_cpu_nodestats =
....@@ -6037,73 +6827,6 @@
60376827 zone->initialized = 1;
60386828 }
60396829
6040
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6041
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
6042
-
6043
-/*
6044
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
6045
- */
6046
-int __meminit __early_pfn_to_nid(unsigned long pfn,
6047
- struct mminit_pfnnid_cache *state)
6048
-{
6049
- unsigned long start_pfn, end_pfn;
6050
- int nid;
6051
-
6052
- if (state->last_start <= pfn && pfn < state->last_end)
6053
- return state->last_nid;
6054
-
6055
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
6056
- if (nid != -1) {
6057
- state->last_start = start_pfn;
6058
- state->last_end = end_pfn;
6059
- state->last_nid = nid;
6060
- }
6061
-
6062
- return nid;
6063
-}
6064
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
6065
-
6066
-/**
6067
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
6068
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
6069
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
6070
- *
6071
- * If an architecture guarantees that all ranges registered contain no holes
6072
- * and may be freed, this this function may be used instead of calling
6073
- * memblock_free_early_nid() manually.
6074
- */
6075
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
6076
-{
6077
- unsigned long start_pfn, end_pfn;
6078
- int i, this_nid;
6079
-
6080
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
6081
- start_pfn = min(start_pfn, max_low_pfn);
6082
- end_pfn = min(end_pfn, max_low_pfn);
6083
-
6084
- if (start_pfn < end_pfn)
6085
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6086
- (end_pfn - start_pfn) << PAGE_SHIFT,
6087
- this_nid);
6088
- }
6089
-}
6090
-
6091
-/**
6092
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6093
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6094
- *
6095
- * If an architecture guarantees that all ranges registered contain no holes and may
6096
- * be freed, this function may be used instead of calling memory_present() manually.
6097
- */
6098
-void __init sparse_memory_present_with_active_regions(int nid)
6099
-{
6100
- unsigned long start_pfn, end_pfn;
6101
- int i, this_nid;
6102
-
6103
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6104
- memory_present(this_nid, start_pfn, end_pfn);
6105
-}
6106
-
61076830 /**
61086831 * get_pfn_range_for_nid - Return the start and end page frames for a node
61096832 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6115,7 +6838,7 @@
61156838 * with no available memory, a warning is printed and the start and end
61166839 * PFNs will be 0.
61176840 */
6118
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6841
+void __init get_pfn_range_for_nid(unsigned int nid,
61196842 unsigned long *start_pfn, unsigned long *end_pfn)
61206843 {
61216844 unsigned long this_start_pfn, this_end_pfn;
....@@ -6164,7 +6887,7 @@
61646887 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
61656888 * zones within a node are in order of monotonic increases memory addresses
61666889 */
6167
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6890
+static void __init adjust_zone_range_for_zone_movable(int nid,
61686891 unsigned long zone_type,
61696892 unsigned long node_start_pfn,
61706893 unsigned long node_end_pfn,
....@@ -6195,13 +6918,12 @@
61956918 * Return the number of pages a zone spans in a node, including holes
61966919 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61976920 */
6198
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6921
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61996922 unsigned long zone_type,
62006923 unsigned long node_start_pfn,
62016924 unsigned long node_end_pfn,
62026925 unsigned long *zone_start_pfn,
6203
- unsigned long *zone_end_pfn,
6204
- unsigned long *ignored)
6926
+ unsigned long *zone_end_pfn)
62056927 {
62066928 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
62076929 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6232,7 +6954,7 @@
62326954 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
62336955 * then all holes in the requested range will be accounted for.
62346956 */
6235
-unsigned long __meminit __absent_pages_in_range(int nid,
6957
+unsigned long __init __absent_pages_in_range(int nid,
62366958 unsigned long range_start_pfn,
62376959 unsigned long range_end_pfn)
62386960 {
....@@ -6253,7 +6975,7 @@
62536975 * @start_pfn: The start PFN to start searching for holes
62546976 * @end_pfn: The end PFN to stop searching for holes
62556977 *
6256
- * It returns the number of pages frames in memory holes within a range.
6978
+ * Return: the number of page frames in memory holes within a range.
62576979 */
62586980 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
62596981 unsigned long end_pfn)
....@@ -6262,11 +6984,10 @@
62626984 }
62636985
62646986 /* Return the number of page frames in holes in a zone on a node */
6265
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6987
+static unsigned long __init zone_absent_pages_in_node(int nid,
62666988 unsigned long zone_type,
62676989 unsigned long node_start_pfn,
6268
- unsigned long node_end_pfn,
6269
- unsigned long *ignored)
6990
+ unsigned long node_end_pfn)
62706991 {
62716992 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
62726993 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6294,7 +7015,7 @@
62947015 unsigned long start_pfn, end_pfn;
62957016 struct memblock_region *r;
62967017
6297
- for_each_memblock(memory, r) {
7018
+ for_each_mem_region(r) {
62987019 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62997020 zone_start_pfn, zone_end_pfn);
63007021 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6313,45 +7034,9 @@
63137034 return nr_absent;
63147035 }
63157036
6316
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6317
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6318
- unsigned long zone_type,
6319
- unsigned long node_start_pfn,
6320
- unsigned long node_end_pfn,
6321
- unsigned long *zone_start_pfn,
6322
- unsigned long *zone_end_pfn,
6323
- unsigned long *zones_size)
6324
-{
6325
- unsigned int zone;
6326
-
6327
- *zone_start_pfn = node_start_pfn;
6328
- for (zone = 0; zone < zone_type; zone++)
6329
- *zone_start_pfn += zones_size[zone];
6330
-
6331
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6332
-
6333
- return zones_size[zone_type];
6334
-}
6335
-
6336
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6337
- unsigned long zone_type,
7037
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
63387038 unsigned long node_start_pfn,
6339
- unsigned long node_end_pfn,
6340
- unsigned long *zholes_size)
6341
-{
6342
- if (!zholes_size)
6343
- return 0;
6344
-
6345
- return zholes_size[zone_type];
6346
-}
6347
-
6348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6349
-
6350
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6351
- unsigned long node_start_pfn,
6352
- unsigned long node_end_pfn,
6353
- unsigned long *zones_size,
6354
- unsigned long *zholes_size)
7039
+ unsigned long node_end_pfn)
63557040 {
63567041 unsigned long realtotalpages = 0, totalpages = 0;
63577042 enum zone_type i;
....@@ -6359,17 +7044,21 @@
63597044 for (i = 0; i < MAX_NR_ZONES; i++) {
63607045 struct zone *zone = pgdat->node_zones + i;
63617046 unsigned long zone_start_pfn, zone_end_pfn;
7047
+ unsigned long spanned, absent;
63627048 unsigned long size, real_size;
63637049
6364
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6365
- node_start_pfn,
6366
- node_end_pfn,
6367
- &zone_start_pfn,
6368
- &zone_end_pfn,
6369
- zones_size);
6370
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6371
- node_start_pfn, node_end_pfn,
6372
- zholes_size);
7050
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7051
+ node_start_pfn,
7052
+ node_end_pfn,
7053
+ &zone_start_pfn,
7054
+ &zone_end_pfn);
7055
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7056
+ node_start_pfn,
7057
+ node_end_pfn);
7058
+
7059
+ size = spanned;
7060
+ real_size = size - absent;
7061
+
63737062 if (size)
63747063 zone->zone_start_pfn = zone_start_pfn;
63757064 else
....@@ -6415,10 +7104,14 @@
64157104 {
64167105 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
64177106 zone->pageblock_flags = NULL;
6418
- if (usemapsize)
7107
+ if (usemapsize) {
64197108 zone->pageblock_flags =
6420
- memblock_virt_alloc_node_nopanic(usemapsize,
6421
- pgdat->node_id);
7109
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7110
+ pgdat->node_id);
7111
+ if (!zone->pageblock_flags)
7112
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7113
+ usemapsize, zone->name, pgdat->node_id);
7114
+ }
64227115 }
64237116 #else
64247117 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6485,9 +7178,11 @@
64857178 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64867179 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64877180 {
6488
- spin_lock_init(&pgdat->split_queue_lock);
6489
- INIT_LIST_HEAD(&pgdat->split_queue);
6490
- pgdat->split_queue_len = 0;
7181
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7182
+
7183
+ spin_lock_init(&ds_queue->split_queue_lock);
7184
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7185
+ ds_queue->split_queue_len = 0;
64917186 }
64927187 #else
64937188 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6514,13 +7209,13 @@
65147209
65157210 pgdat_page_ext_init(pgdat);
65167211 spin_lock_init(&pgdat->lru_lock);
6517
- lruvec_init(node_lruvec(pgdat));
7212
+ lruvec_init(&pgdat->__lruvec);
65187213 }
65197214
65207215 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
65217216 unsigned long remaining_pages)
65227217 {
6523
- zone->managed_pages = remaining_pages;
7218
+ atomic_long_set(&zone->managed_pages, remaining_pages);
65247219 zone_set_nid(zone, nid);
65257220 zone->name = zone_names[idx];
65267221 zone->zone_pgdat = NODE_DATA(nid);
....@@ -6618,7 +7313,7 @@
66187313 set_pageblock_order();
66197314 setup_usemap(pgdat, zone, zone_start_pfn, size);
66207315 init_currently_empty_zone(zone, zone_start_pfn, size);
6621
- memmap_init(size, nid, j, zone_start_pfn);
7316
+ arch_memmap_init(size, nid, j, zone_start_pfn);
66227317 }
66237318 }
66247319
....@@ -6647,7 +7342,11 @@
66477342 end = pgdat_end_pfn(pgdat);
66487343 end = ALIGN(end, MAX_ORDER_NR_PAGES);
66497344 size = (end - start) * sizeof(struct page);
6650
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7345
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7346
+ pgdat->node_id);
7347
+ if (!map)
7348
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7349
+ size, pgdat->node_id);
66517350 pgdat->node_mem_map = map + offset;
66527351 }
66537352 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6659,10 +7358,8 @@
66597358 */
66607359 if (pgdat == NODE_DATA(0)) {
66617360 mem_map = NODE_DATA(0)->node_mem_map;
6662
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
66637361 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
66647362 mem_map -= offset;
6665
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
66667363 }
66677364 #endif
66687365 }
....@@ -6673,42 +7370,31 @@
66737370 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
66747371 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
66757372 {
6676
- /*
6677
- * We start only with one section of pages, more pages are added as
6678
- * needed until the rest of deferred pages are initialized.
6679
- */
6680
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6681
- pgdat->node_spanned_pages);
66827373 pgdat->first_deferred_pfn = ULONG_MAX;
66837374 }
66847375 #else
66857376 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66867377 #endif
66877378
6688
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6689
- unsigned long node_start_pfn,
6690
- unsigned long *zholes_size)
7379
+static void __init free_area_init_node(int nid)
66917380 {
66927381 pg_data_t *pgdat = NODE_DATA(nid);
66937382 unsigned long start_pfn = 0;
66947383 unsigned long end_pfn = 0;
66957384
66967385 /* pg_data_t should be reset to zero when it's allocated */
6697
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7386
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7387
+
7388
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66987389
66997390 pgdat->node_id = nid;
6700
- pgdat->node_start_pfn = node_start_pfn;
7391
+ pgdat->node_start_pfn = start_pfn;
67017392 pgdat->per_cpu_nodestats = NULL;
6702
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6703
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7393
+
67047394 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
67057395 (u64)start_pfn << PAGE_SHIFT,
67067396 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6707
-#else
6708
- start_pfn = node_start_pfn;
6709
-#endif
6710
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6711
- zones_size, zholes_size);
7397
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
67127398
67137399 alloc_node_mem_map(pgdat);
67147400 pgdat_set_deferred_range(pgdat);
....@@ -6716,80 +7402,10 @@
67167402 free_area_init_core(pgdat);
67177403 }
67187404
6719
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6720
-
6721
-/*
6722
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6723
- * pages zeroed
6724
- */
6725
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7405
+void __init free_area_init_memoryless_node(int nid)
67267406 {
6727
- unsigned long pfn;
6728
- u64 pgcnt = 0;
6729
-
6730
- for (pfn = spfn; pfn < epfn; pfn++) {
6731
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6732
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6733
- + pageblock_nr_pages - 1;
6734
- continue;
6735
- }
6736
- mm_zero_struct_page(pfn_to_page(pfn));
6737
- pgcnt++;
6738
- }
6739
-
6740
- return pgcnt;
7407
+ free_area_init_node(nid);
67417408 }
6742
-
6743
-/*
6744
- * Only struct pages that are backed by physical memory are zeroed and
6745
- * initialized by going through __init_single_page(). But, there are some
6746
- * struct pages which are reserved in memblock allocator and their fields
6747
- * may be accessed (for example page_to_pfn() on some configuration accesses
6748
- * flags). We must explicitly zero those struct pages.
6749
- *
6750
- * This function also addresses a similar issue where struct pages are left
6751
- * uninitialized because the physical address range is not covered by
6752
- * memblock.memory or memblock.reserved. That could happen when memblock
6753
- * layout is manually configured via memmap=, or when the highest physical
6754
- * address (max_pfn) does not end on a section boundary.
6755
- */
6756
-void __init zero_resv_unavail(void)
6757
-{
6758
- phys_addr_t start, end;
6759
- u64 i, pgcnt;
6760
- phys_addr_t next = 0;
6761
-
6762
- /*
6763
- * Loop through unavailable ranges not covered by memblock.memory.
6764
- */
6765
- pgcnt = 0;
6766
- for_each_mem_range(i, &memblock.memory, NULL,
6767
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6768
- if (next < start)
6769
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6770
- next = end;
6771
- }
6772
-
6773
- /*
6774
- * Early sections always have a fully populated memmap for the whole
6775
- * section - see pfn_valid(). If the last section has holes at the
6776
- * end and that section is marked "online", the memmap will be
6777
- * considered initialized. Make sure that memmap has a well defined
6778
- * state.
6779
- */
6780
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6781
- round_up(max_pfn, PAGES_PER_SECTION));
6782
-
6783
- /*
6784
- * Struct pages that do not have backing memory. This could be because
6785
- * firmware is using some of this memory, or for some other reasons.
6786
- */
6787
- if (pgcnt)
6788
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6789
-}
6790
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6791
-
6792
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67937409
67947410 #if MAX_NUMNODES > 1
67957411 /*
....@@ -6820,14 +7436,14 @@
68207436 * model has fine enough granularity to avoid incorrect mapping for the
68217437 * populated node map.
68227438 *
6823
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7439
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
68247440 * requirement (single node).
68257441 */
68267442 unsigned long __init node_map_pfn_alignment(void)
68277443 {
68287444 unsigned long accl_mask = 0, last_end = 0;
68297445 unsigned long start, end, mask;
6830
- int last_nid = -1;
7446
+ int last_nid = NUMA_NO_NODE;
68317447 int i, nid;
68327448
68337449 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6854,33 +7470,15 @@
68547470 return ~accl_mask + 1;
68557471 }
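A rough worked example of the return value (my own numbers, assuming 4 KiB pages): if node 0 owns pfns [0, 0x40000) and node 1 starts exactly at pfn 0x40000, the nodes meet on a 1 GiB boundary and the function returns 0x40000 pfns, meaning the memory model must be able to resolve node ownership at 1 GiB granularity or finer; nodes interleaved on a smaller boundary would shrink the returned alignment accordingly.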
68567472
6857
-/* Find the lowest pfn for a node */
6858
-static unsigned long __init find_min_pfn_for_node(int nid)
6859
-{
6860
- unsigned long min_pfn = ULONG_MAX;
6861
- unsigned long start_pfn;
6862
- int i;
6863
-
6864
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6865
- min_pfn = min(min_pfn, start_pfn);
6866
-
6867
- if (min_pfn == ULONG_MAX) {
6868
- pr_warn("Could not find start_pfn for node %d\n", nid);
6869
- return 0;
6870
- }
6871
-
6872
- return min_pfn;
6873
-}
6874
-
68757473 /**
68767474 * find_min_pfn_with_active_regions - Find the minimum PFN registered
68777475 *
6878
- * It returns the minimum PFN based on information provided via
7476
+ * Return: the minimum PFN based on information provided via
68797477 * memblock_set_node().
68807478 */
68817479 unsigned long __init find_min_pfn_with_active_regions(void)
68827480 {
6883
- return find_min_pfn_for_node(MAX_NUMNODES);
7481
+ return PHYS_PFN(memblock_start_of_DRAM());
68847482 }
68857483
68867484 /*
....@@ -6929,11 +7527,11 @@
69297527 * options.
69307528 */
69317529 if (movable_node_is_enabled()) {
6932
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
69337531 if (!memblock_is_hotpluggable(r))
69347532 continue;
69357533
6936
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
69377535
69387536 usable_startpfn = PFN_DOWN(r->base);
69397537 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6950,11 +7548,11 @@
69507548 if (mirrored_kernelcore) {
69517549 bool mem_below_4gb_not_mirrored = false;
69527550
6953
- for_each_memblock(memory, r) {
7551
+ for_each_mem_region(r) {
69547552 if (memblock_is_mirror(r))
69557553 continue;
69567554
6957
- nid = r->nid;
7555
+ nid = memblock_get_region_node(r);
69587556
69597557 usable_startpfn = memblock_region_memory_base_pfn(r);
69607558
....@@ -6969,7 +7567,7 @@
69697567 }
69707568
69717569 if (mem_below_4gb_not_mirrored)
6972
- pr_warn("This configuration results in unmirrored kernel memory.");
7570
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
69737571
69747572 goto out2;
69757573 }
....@@ -7108,9 +7706,16 @@
71087706
71097707 out2:
71107708 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7111
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7709
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7710
+ unsigned long start_pfn, end_pfn;
7711
+
71127712 zone_movable_pfn[nid] =
71137713 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7714
+
7715
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7716
+ if (zone_movable_pfn[nid] >= end_pfn)
7717
+ zone_movable_pfn[nid] = 0;
7718
+ }
71147719
71157720 out:
71167721 /* restore the node_state */
....@@ -7122,23 +7727,29 @@
71227727 {
71237728 enum zone_type zone_type;
71247729
7125
- if (N_MEMORY == N_NORMAL_MEMORY)
7126
- return;
7127
-
71287730 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
71297731 struct zone *zone = &pgdat->node_zones[zone_type];
71307732 if (populated_zone(zone)) {
7131
- node_set_state(nid, N_HIGH_MEMORY);
7132
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7133
- zone_type <= ZONE_NORMAL)
7733
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7734
+ node_set_state(nid, N_HIGH_MEMORY);
7735
+ if (zone_type <= ZONE_NORMAL)
71347736 node_set_state(nid, N_NORMAL_MEMORY);
71357737 break;
71367738 }
71377739 }
71387740 }
71397741
7742
+/*
7743
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7744
+ * such cases we allow max_zone_pfn sorted in descending order.
7745
+ */
7746
+bool __weak arch_has_descending_max_zone_pfns(void)
7747
+{
7748
+ return false;
7749
+}
7750
+
71407751 /**
7141
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7752
+ * free_area_init - Initialise all pg_data_t and zone data
71427753 * @max_zone_pfn: an array of max PFNs for each zone
71437754 *
71447755 * This will call free_area_init_node() for each active node in the system.
....@@ -7150,10 +7761,11 @@
71507761 * starts where the previous one ended. For example, ZONE_DMA32 starts
71517762 * at arch_max_dma_pfn.
71527763 */
7153
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7764
+void __init free_area_init(unsigned long *max_zone_pfn)
71547765 {
71557766 unsigned long start_pfn, end_pfn;
7156
- int i, nid;
7767
+ int i, nid, zone;
7768
+ bool descending;
71577769
71587770 /* Record where the zone boundaries are */
71597771 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7162,14 +7774,20 @@
71627774 sizeof(arch_zone_highest_possible_pfn));
71637775
71647776 start_pfn = find_min_pfn_with_active_regions();
7777
+ descending = arch_has_descending_max_zone_pfns();
71657778
71667779 for (i = 0; i < MAX_NR_ZONES; i++) {
7167
- if (i == ZONE_MOVABLE)
7780
+ if (descending)
7781
+ zone = MAX_NR_ZONES - i - 1;
7782
+ else
7783
+ zone = i;
7784
+
7785
+ if (zone == ZONE_MOVABLE)
71687786 continue;
71697787
7170
- end_pfn = max(max_zone_pfn[i], start_pfn);
7171
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7172
- arch_zone_highest_possible_pfn[i] = end_pfn;
7788
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7789
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7790
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
71737791
71747792 start_pfn = end_pfn;
71757793 }
....@@ -7203,27 +7821,33 @@
72037821 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
72047822 }
72057823
7206
- /* Print out the early node map */
7824
+ /*
7825
+ * Print out the early node map, and initialize the
7826
+ * subsection-map relative to active online memory ranges to
7827
+ * enable future "sub-section" extensions of the memory map.
7828
+ */
72077829 pr_info("Early memory node ranges\n");
7208
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7830
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
72097831 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
72107832 (u64)start_pfn << PAGE_SHIFT,
72117833 ((u64)end_pfn << PAGE_SHIFT) - 1);
7834
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7835
+ }
72127836
72137837 /* Initialise every node */
72147838 mminit_verify_pageflags_layout();
72157839 setup_nr_node_ids();
7216
- zero_resv_unavail();
72177840 for_each_online_node(nid) {
72187841 pg_data_t *pgdat = NODE_DATA(nid);
7219
- free_area_init_node(nid, NULL,
7220
- find_min_pfn_for_node(nid), NULL);
7842
+ free_area_init_node(nid);
72217843
72227844 /* Any memory on that node */
72237845 if (pgdat->node_present_pages)
72247846 node_set_state(nid, N_MEMORY);
72257847 check_for_memory(pgdat, nid);
72267848 }
7849
+
7850
+ memmap_init();
72277851 }
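For orientation, the typical caller is an architecture's paging_init() path. A minimal sketch of that call, assuming an x86-like layout (the helper name, MAX_DMA32_PFN and the exact zone set are illustrative and vary per architecture):

void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
        /* cap ZONE_DMA32 at 4 GiB or at the end of lowmem, whichever is lower */
        max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif

        /* node extents come from memblock; only the per-zone limits are passed */
        free_area_init(max_zone_pfns);
}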
72287852
72297853 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7282,22 +7906,18 @@
72827906 early_param("kernelcore", cmdline_parse_kernelcore);
72837907 early_param("movablecore", cmdline_parse_movablecore);
72847908
7285
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7286
-
72877909 void adjust_managed_page_count(struct page *page, long count)
72887910 {
7289
- spin_lock(&managed_page_count_lock);
7290
- page_zone(page)->managed_pages += count;
7291
- totalram_pages += count;
7911
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7912
+ totalram_pages_add(count);
72927913 #ifdef CONFIG_HIGHMEM
72937914 if (PageHighMem(page))
7294
- totalhigh_pages += count;
7915
+ totalhigh_pages_add(count);
72957916 #endif
7296
- spin_unlock(&managed_page_count_lock);
72977917 }
72987918 EXPORT_SYMBOL(adjust_managed_page_count);
72997919
7300
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7920
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
73017921 {
73027922 void *pos;
73037923 unsigned long pages = 0;
....@@ -7316,6 +7936,11 @@
73167936 * alias for the memset().
73177937 */
73187938 direct_map_addr = page_address(page);
7939
+ /*
7940
+ * Perform a kasan-unchecked memset() since this memory
7941
+ * has not been initialized.
7942
+ */
7943
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
73197944 if ((unsigned int)poison <= 0xFF)
73207945 memset(direct_map_addr, poison, PAGE_SIZE);
73217946
....@@ -7328,15 +7953,14 @@
73287953
73297954 return pages;
73307955 }
7331
-EXPORT_SYMBOL(free_reserved_area);
73327956
73337957 #ifdef CONFIG_HIGHMEM
73347958 void free_highmem_page(struct page *page)
73357959 {
73367960 __free_reserved_page(page);
7337
- totalram_pages++;
7338
- page_zone(page)->managed_pages++;
7339
- totalhigh_pages++;
7961
+ totalram_pages_inc();
7962
+ atomic_long_inc(&page_zone(page)->managed_pages);
7963
+ totalhigh_pages_inc();
73407964 }
73417965 #endif
73427966
....@@ -7363,7 +7987,7 @@
73637987 */
73647988 #define adj_init_size(start, end, size, pos, adj) \
73657989 do { \
7366
- if (start <= pos && pos < end && size > adj) \
7990
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
73677991 size -= adj; \
73687992 } while (0)
73697993
....@@ -7385,10 +8009,10 @@
73858009 physpages << (PAGE_SHIFT - 10),
73868010 codesize >> 10, datasize >> 10, rosize >> 10,
73878011 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7388
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
8012
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73898013 totalcma_pages << (PAGE_SHIFT - 10),
73908014 #ifdef CONFIG_HIGHMEM
7391
- totalhigh_pages << (PAGE_SHIFT - 10),
8015
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73928016 #endif
73938017 str ? ", " : "", str ? str : "");
73948018 }
....@@ -7409,18 +8033,10 @@
74098033 dma_reserve = new_dma_reserve;
74108034 }
74118035
7412
-void __init free_area_init(unsigned long *zones_size)
7413
-{
7414
- zero_resv_unavail();
7415
- free_area_init_node(0, zones_size,
7416
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
7417
-}
7418
-
74198036 static int page_alloc_cpu_dead(unsigned int cpu)
74208037 {
7421
- local_lock_irq_on(swapvec_lock, cpu);
8038
+
74228039 lru_add_drain_cpu(cpu);
7423
- local_unlock_irq_on(swapvec_lock, cpu);
74248040 drain_pages(cpu);
74258041
74268042 /*
....@@ -7442,9 +8058,27 @@
74428058 return 0;
74438059 }
74448060
8061
+#ifdef CONFIG_NUMA
8062
+int hashdist = HASHDIST_DEFAULT;
8063
+
8064
+static int __init set_hashdist(char *str)
8065
+{
8066
+ if (!str)
8067
+ return 0;
8068
+ hashdist = simple_strtoul(str, &str, 0);
8069
+ return 1;
8070
+}
8071
+__setup("hashdist=", set_hashdist);
8072
+#endif
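For reference, hashdist can also be forced from the command line: booting with hashdist=0 makes alloc_large_system_hash() later in this file prefer a node-local linear allocation over a vmalloc()-interleaved table on NUMA machines, and the hunk below additionally defaults it to 0 when only one node actually has memory.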
8073
+
74458074 void __init page_alloc_init(void)
74468075 {
74478076 int ret;
8077
+
8078
+#ifdef CONFIG_NUMA
8079
+ if (num_node_state(N_MEMORY) == 1)
8080
+ hashdist = 0;
8081
+#endif
74488082
74498083 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
74508084 "mm/page_alloc:dead", NULL,
....@@ -7469,6 +8103,7 @@
74698103 for (i = 0; i < MAX_NR_ZONES; i++) {
74708104 struct zone *zone = pgdat->node_zones + i;
74718105 long max = 0;
8106
+ unsigned long managed_pages = zone_managed_pages(zone);
74728107
74738108 /* Find valid and maximum lowmem_reserve in the zone */
74748109 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7479,8 +8114,8 @@
74798114 /* we treat the high watermark as reserved pages. */
74808115 max += high_wmark_pages(zone);
74818116
7482
- if (max > zone->managed_pages)
7483
- max = zone->managed_pages;
8117
+ if (max > managed_pages)
8118
+ max = managed_pages;
74848119
74858120 pgdat->totalreserve_pages += max;
74868121
....@@ -7499,30 +8134,24 @@
74998134 static void setup_per_zone_lowmem_reserve(void)
75008135 {
75018136 struct pglist_data *pgdat;
7502
- enum zone_type j, idx;
8137
+ enum zone_type i, j;
75038138
75048139 for_each_online_pgdat(pgdat) {
7505
- for (j = 0; j < MAX_NR_ZONES; j++) {
7506
- struct zone *zone = pgdat->node_zones + j;
7507
- unsigned long managed_pages = zone->managed_pages;
8140
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8141
+ struct zone *zone = &pgdat->node_zones[i];
8142
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8143
+ bool clear = !ratio || !zone_managed_pages(zone);
8144
+ unsigned long managed_pages = 0;
75088145
7509
- zone->lowmem_reserve[j] = 0;
8146
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8147
+ struct zone *upper_zone = &pgdat->node_zones[j];
75108148
7511
- idx = j;
7512
- while (idx) {
7513
- struct zone *lower_zone;
8149
+ managed_pages += zone_managed_pages(upper_zone);
75148150
7515
- idx--;
7516
- lower_zone = pgdat->node_zones + idx;
7517
-
7518
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7519
- sysctl_lowmem_reserve_ratio[idx] = 0;
7520
- lower_zone->lowmem_reserve[j] = 0;
7521
- } else {
7522
- lower_zone->lowmem_reserve[j] =
7523
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7524
- }
7525
- managed_pages += lower_zone->managed_pages;
8151
+ if (clear)
8152
+ zone->lowmem_reserve[j] = 0;
8153
+ else
8154
+ zone->lowmem_reserve[j] = managed_pages / ratio;
75268155 }
75278156 }
75288157 }
....@@ -7542,18 +8171,17 @@
75428171 /* Calculate total number of !ZONE_HIGHMEM pages */
75438172 for_each_zone(zone) {
75448173 if (!is_highmem(zone))
7545
- lowmem_pages += zone->managed_pages;
8174
+ lowmem_pages += zone_managed_pages(zone);
75468175 }
75478176
75488177 for_each_zone(zone) {
7549
- u64 min, low;
8178
+ u64 tmp, low;
75508179
75518180 spin_lock_irqsave(&zone->lock, flags);
7552
- min = (u64)pages_min * zone->managed_pages;
7553
- do_div(min, lowmem_pages);
7554
- low = (u64)pages_low * zone->managed_pages;
7555
- do_div(low, vm_total_pages);
7556
-
8181
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8182
+ do_div(tmp, lowmem_pages);
8183
+ low = (u64)pages_low * zone_managed_pages(zone);
8184
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
75578185 if (is_highmem(zone)) {
75588186 /*
75598187 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7561,20 +8189,20 @@
75618189 * value here.
75628190 *
75638191 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7564
- * deltas control asynch page reclaim, and so should
8192
+ * deltas control async page reclaim, and so should
75658193 * not be capped for highmem.
75668194 */
75678195 unsigned long min_pages;
75688196
7569
- min_pages = zone->managed_pages / 1024;
8197
+ min_pages = zone_managed_pages(zone) / 1024;
75708198 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7571
- zone->watermark[WMARK_MIN] = min_pages;
8199
+ zone->_watermark[WMARK_MIN] = min_pages;
75728200 } else {
75738201 /*
75748202 * If it's a lowmem zone, reserve a number of pages
75758203 * proportionate to the zone's size.
75768204 */
7577
- zone->watermark[WMARK_MIN] = min;
8205
+ zone->_watermark[WMARK_MIN] = tmp;
75788206 }
75798207
75808208 /*
....@@ -7582,14 +8210,13 @@
75828210 * scale factor in proportion to available memory, but
75838211 * ensure a minimum size on small systems.
75848212 */
7585
- min = max_t(u64, min >> 2,
7586
- mult_frac(zone->managed_pages,
8213
+ tmp = max_t(u64, tmp >> 2,
8214
+ mult_frac(zone_managed_pages(zone),
75878215 watermark_scale_factor, 10000));
75888216
7589
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7590
- low + min;
7591
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7592
- low + min * 2;
8217
+ zone->watermark_boost = 0;
8218
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8219
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75938220
75948221 spin_unlock_irqrestore(&zone->lock, flags);
75958222 }
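For a sense of scale (a worked example of mine, using the default watermark_scale_factor of 10): a zone with 1048576 managed 4 KiB pages (4 GiB) gets mult_frac(1048576, 10, 10000) = 1048 pages, roughly 4 MiB, added on top of the min watermark (plus the pages_low term) at WMARK_LOW, and twice that at WMARK_HIGH, unless a quarter of the zone's min share is already larger.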
....@@ -7618,7 +8245,7 @@
76188245 * Initialise min_free_kbytes.
76198246 *
76208247 * For small machines we want it small (128k min). For large machines
7621
- * we want it large (64MB max). But it is not linear, because network
8248
+ * we want it large (256MB max). But it is not linear, because network
76228249 * bandwidth does not increase linearly with machine size. We use
76238250 *
76248251 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
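A quick worked check of this formula (my own arithmetic, not from the patch): with 16 GiB of lowmem, lowmem_kbytes = 16 * 1024 * 1024 = 16777216, sqrt(16777216) = 4096, so min_free_kbytes = 4 * 4096 = 16384 kB (16 MiB), comfortably inside the 128 kB to 262144 kB clamp applied further down.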
....@@ -7650,8 +8277,8 @@
76508277 min_free_kbytes = new_min_free_kbytes;
76518278 if (min_free_kbytes < 128)
76528279 min_free_kbytes = 128;
7653
- if (min_free_kbytes > 65536)
7654
- min_free_kbytes = 65536;
8280
+ if (min_free_kbytes > 262144)
8281
+ min_free_kbytes = 262144;
76558282 } else {
76568283 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
76578284 new_min_free_kbytes, user_min_free_kbytes);
....@@ -7677,7 +8304,7 @@
76778304 * or extra_free_kbytes changes.
76788305 */
76798306 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7680
- void __user *buffer, size_t *length, loff_t *ppos)
8307
+ void *buffer, size_t *length, loff_t *ppos)
76818308 {
76828309 int rc;
76838310
....@@ -7693,7 +8320,7 @@
76938320 }
76948321
76958322 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7696
- void __user *buffer, size_t *length, loff_t *ppos)
8323
+ void *buffer, size_t *length, loff_t *ppos)
76978324 {
76988325 int rc;
76998326
....@@ -7717,13 +8344,13 @@
77178344 pgdat->min_unmapped_pages = 0;
77188345
77198346 for_each_zone(zone)
7720
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7721
- sysctl_min_unmapped_ratio) / 100;
8347
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8348
+ sysctl_min_unmapped_ratio) / 100;
77228349 }
77238350
77248351
77258352 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7726
- void __user *buffer, size_t *length, loff_t *ppos)
8353
+ void *buffer, size_t *length, loff_t *ppos)
77278354 {
77288355 int rc;
77298356
....@@ -7745,12 +8372,12 @@
77458372 pgdat->min_slab_pages = 0;
77468373
77478374 for_each_zone(zone)
7748
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7749
- sysctl_min_slab_ratio) / 100;
8375
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8376
+ sysctl_min_slab_ratio) / 100;
77508377 }
77518378
77528379 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7753
- void __user *buffer, size_t *length, loff_t *ppos)
8380
+ void *buffer, size_t *length, loff_t *ppos)
77548381 {
77558382 int rc;
77568383
....@@ -7774,11 +8401,28 @@
77748401 * if in function of the boot time zone sizes.
77758402 */
77768403 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7777
- void __user *buffer, size_t *length, loff_t *ppos)
8404
+ void *buffer, size_t *length, loff_t *ppos)
77788405 {
8406
+ int i;
8407
+
77798408 proc_dointvec_minmax(table, write, buffer, length, ppos);
8409
+
8410
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8411
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8412
+ sysctl_lowmem_reserve_ratio[i] = 0;
8413
+ }
8414
+
77808415 setup_per_zone_lowmem_reserve();
77818416 return 0;
8417
+}
8418
+
8419
+static void __zone_pcp_update(struct zone *zone)
8420
+{
8421
+ unsigned int cpu;
8422
+
8423
+ for_each_possible_cpu(cpu)
8424
+ pageset_set_high_and_batch(zone,
8425
+ per_cpu_ptr(zone->pageset, cpu));
77828426 }
77838427
77848428 /*
....@@ -7787,7 +8431,7 @@
77878431 * pagelist can have before it gets flushed back to buddy allocator.
77888432 */
77898433 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7790
- void __user *buffer, size_t *length, loff_t *ppos)
8434
+ void *buffer, size_t *length, loff_t *ppos)
77918435 {
77928436 struct zone *zone;
77938437 int old_percpu_pagelist_fraction;
....@@ -7812,30 +8456,12 @@
78128456 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
78138457 goto out;
78148458
7815
- for_each_populated_zone(zone) {
7816
- unsigned int cpu;
7817
-
7818
- for_each_possible_cpu(cpu)
7819
- pageset_set_high_and_batch(zone,
7820
- per_cpu_ptr(zone->pageset, cpu));
7821
- }
8459
+ for_each_populated_zone(zone)
8460
+ __zone_pcp_update(zone);
78228461 out:
78238462 mutex_unlock(&pcp_batch_high_lock);
78248463 return ret;
78258464 }
7826
-
7827
-#ifdef CONFIG_NUMA
7828
-int hashdist = HASHDIST_DEFAULT;
7829
-
7830
-static int __init set_hashdist(char *str)
7831
-{
7832
- if (!str)
7833
- return 0;
7834
- hashdist = simple_strtoul(str, &str, 0);
7835
- return 1;
7836
-}
7837
-__setup("hashdist=", set_hashdist);
7838
-#endif
78398465
78408466 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
78418467 /*
....@@ -7883,6 +8509,7 @@
78838509 unsigned long log2qty, size;
78848510 void *table = NULL;
78858511 gfp_t gfp_flags;
8512
+ bool virt;
78868513
78878514 /* allow the kernel cmdline to have a say */
78888515 if (!numentries) {
....@@ -7939,32 +8566,34 @@
79398566
79408567 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
79418568 do {
8569
+ virt = false;
79428570 size = bucketsize << log2qty;
79438571 if (flags & HASH_EARLY) {
79448572 if (flags & HASH_ZERO)
7945
- table = memblock_virt_alloc_nopanic(size, 0);
8573
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
79468574 else
7947
- table = memblock_virt_alloc_raw(size, 0);
7948
- } else if (hashdist) {
7949
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8575
+ table = memblock_alloc_raw(size,
8576
+ SMP_CACHE_BYTES);
8577
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
8578
+ table = __vmalloc(size, gfp_flags);
8579
+ virt = true;
79508580 } else {
79518581 /*
79528582 * If bucketsize is not a power-of-two, we may free
79538583 * some pages at the end of hash table which
79548584 * alloc_pages_exact() automatically does
79558585 */
7956
- if (get_order(size) < MAX_ORDER) {
7957
- table = alloc_pages_exact(size, gfp_flags);
7958
- kmemleak_alloc(table, size, 1, gfp_flags);
7959
- }
8586
+ table = alloc_pages_exact(size, gfp_flags);
8587
+ kmemleak_alloc(table, size, 1, gfp_flags);
79608588 }
79618589 } while (!table && size > PAGE_SIZE && --log2qty);
79628590
79638591 if (!table)
79648592 panic("Failed to allocate %s hash table\n", tablename);
79658593
7966
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7967
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8594
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8595
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8596
+ virt ? "vmalloc" : "linear");
79688597
79698598 if (_hash_shift)
79708599 *_hash_shift = log2qty;
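To make the parameters above concrete, a minimal sketch of a typical early caller (the table name, bucket type and scale are made up for illustration, not taken from this file):

static struct hlist_head *example_table;
static unsigned int example_shift;

static void __init example_table_init(void)
{
        /* roughly one zeroed bucket per 16 KiB of kernel memory (scale = 14) */
        example_table = alloc_large_system_hash("Example cache",
                                                sizeof(struct hlist_head),
                                                0,            /* size from memory */
                                                14,           /* scale */
                                                HASH_ZERO,
                                                &example_shift,
                                                NULL,         /* no mask needed */
                                                0, 0);        /* no entry limits */
}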
....@@ -7976,47 +8605,50 @@
79768605
79778606 /*
79788607 * This function checks whether pageblock includes unmovable pages or not.
7979
- * If @count is not zero, it is okay to include less @count unmovable pages
79808608 *
79818609 * PageLRU check without isolation or lru_lock could race so that
79828610 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
79838611 * check without lock_page also may miss some movable non-lru pages at
79848612 * race condition. So you can't expect this function should be exact.
8613
+ *
8614
+ * Returns a page without holding a reference. If the caller wants to
8615
+ * dereference that page (e.g., dumping), it has to make sure that it
8616
+ * cannot get removed (e.g., via memory unplug) concurrently.
8617
+ *
79858618 */
7986
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7987
- int migratetype,
7988
- bool skip_hwpoisoned_pages)
8619
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8620
+ int migratetype, int flags)
79898621 {
7990
- unsigned long pfn, iter, found;
8622
+ unsigned long iter = 0;
8623
+ unsigned long pfn = page_to_pfn(page);
8624
+ unsigned long offset = pfn % pageblock_nr_pages;
79918625
7992
- /*
7993
- * TODO we could make this much more efficient by not checking every
7994
- * page in the range if we know all of them are in MOVABLE_ZONE and
7995
- * that the movable zone guarantees that pages are migratable but
7996
- * the later is not the case right now unfortunatelly. E.g. movablecore
7997
- * can still lead to having bootmem allocations in zone_movable.
7998
- */
8626
+ if (is_migrate_cma_page(page)) {
8627
+ /*
8628
+ * CMA allocations (alloc_contig_range) really need to mark
8629
+ * isolate CMA pageblocks even when they are not movable in fact
8630
+ * so consider them movable here.
8631
+ */
8632
+ if (is_migrate_cma(migratetype))
8633
+ return NULL;
79998634
8000
- /*
8001
- * CMA allocations (alloc_contig_range) really need to mark isolate
8002
- * CMA pageblocks even when they are not movable in fact so consider
8003
- * them movable here.
8004
- */
8005
- if (is_migrate_cma(migratetype) &&
8006
- is_migrate_cma(get_pageblock_migratetype(page)))
8007
- return false;
8635
+ return page;
8636
+ }
80088637
8009
- pfn = page_to_pfn(page);
8010
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
8011
- unsigned long check = pfn + iter;
8012
-
8013
- if (!pfn_valid_within(check))
8638
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8639
+ if (!pfn_valid_within(pfn + iter))
80148640 continue;
80158641
8016
- page = pfn_to_page(check);
8642
+ page = pfn_to_page(pfn + iter);
80178643
8644
+ /*
8645
+ * Both, bootmem allocations and memory holes are marked
8646
+ * PG_reserved and are unmovable. We can even have unmovable
8647
+ * allocations inside ZONE_MOVABLE, for example when
8648
+ * specifying "movablecore".
8649
+ */
80188650 if (PageReserved(page))
8019
- goto unmovable;
8651
+ return page;
80208652
80218653 /*
80228654 * If the zone is movable and we have ruled out all reserved
....@@ -8028,17 +8660,22 @@
80288660
80298661 /*
80308662 * Hugepages are not in LRU lists, but they're movable.
8031
- * We need not scan over tail pages bacause we don't
8663
+ * THPs are on the LRU, but need to be counted as #small pages.
8664
+ * We need not scan over tail pages because we don't
80328665 * handle each tail page individually in migration.
80338666 */
8034
- if (PageHuge(page)) {
8667
+ if (PageHuge(page) || PageTransCompound(page)) {
80358668 struct page *head = compound_head(page);
80368669 unsigned int skip_pages;
80378670
8038
- if (!hugepage_migration_supported(page_hstate(head)))
8039
- goto unmovable;
8671
+ if (PageHuge(page)) {
8672
+ if (!hugepage_migration_supported(page_hstate(head)))
8673
+ return page;
8674
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8675
+ return page;
8676
+ }
80408677
8041
- skip_pages = (1 << compound_order(head)) - (page - head);
8678
+ skip_pages = compound_nr(head) - (page - head);
80428679 iter += skip_pages - 1;
80438680 continue;
80448681 }
....@@ -8051,7 +8688,7 @@
80518688 */
80528689 if (!page_ref_count(page)) {
80538690 if (PageBuddy(page))
8054
- iter += (1 << page_order(page)) - 1;
8691
+ iter += (1 << buddy_order(page)) - 1;
80558692 continue;
80568693 }
80578694
....@@ -8059,61 +8696,100 @@
80598696 * The HWPoisoned page may be not in buddy system, and
80608697 * page_count() is not 0.
80618698 */
8062
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8699
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
80638700 continue;
80648701
8065
- if (__PageMovable(page))
8702
+ /*
8703
+ * We treat all PageOffline() pages as movable when offlining
8704
+ * to give drivers a chance to decrement their reference count
8705
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8706
+ * can be offlined as there are no direct references anymore.
8707
+ * For actually unmovable PageOffline() where the driver does
8708
+ * not support this, we will fail later when trying to actually
8709
+ * move these pages that still have a reference count > 0.
8710
+ * (false negatives in this function only)
8711
+ */
8712
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
80668713 continue;
80678714
8068
- if (!PageLRU(page))
8069
- found++;
8715
+ if (__PageMovable(page) || PageLRU(page))
8716
+ continue;
8717
+
80708718 /*
80718719 * If there are RECLAIMABLE pages, we need to check
80728720 * it. But now, memory offline itself doesn't call
80738721 * shrink_node_slabs() and this still needs to be fixed.
80748722 */
8075
- /*
8076
- * If the page is not RAM, page_count()should be 0.
8077
- * we don't need more check. This is an _used_ not-movable page.
8078
- *
8079
- * The problematic thing here is PG_reserved pages. PG_reserved
8080
- * is set to both of a memory hole page and a _used_ kernel
8081
- * page at boot.
8082
- */
8083
- if (found > count)
8084
- goto unmovable;
8723
+ return page;
80858724 }
8086
- return false;
8087
-unmovable:
8088
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8089
- return true;
8725
+ return NULL;
80908726 }
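A sketch of how a caller is expected to consume the new return value (a hypothetical helper; the real users live in memory offlining and page isolation):

/*
 * Refuse a pageblock that still holds unmovable pages. Dumping the page is
 * only safe while the caller keeps the memory from being unplugged, as the
 * comment above has_unmovable_pages() notes.
 */
static int example_check_pageblock(struct zone *zone, struct page *page)
{
        struct page *unmovable;

        unmovable = has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
                                        MEMORY_OFFLINE);
        if (unmovable) {
                dump_page(unmovable, "unmovable page in range");
                return -EBUSY;
        }
        return 0;
}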
80918727
8092
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8093
-
8728
+#ifdef CONFIG_CONTIG_ALLOC
80948729 static unsigned long pfn_max_align_down(unsigned long pfn)
80958730 {
80968731 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80978732 pageblock_nr_pages) - 1);
80988733 }
80998734
8100
-static unsigned long pfn_max_align_up(unsigned long pfn)
8735
+unsigned long pfn_max_align_up(unsigned long pfn)
81018736 {
81028737 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
81038738 pageblock_nr_pages));
81048739 }
81058740
8741
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8742
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8743
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8744
+static void alloc_contig_dump_pages(struct list_head *page_list)
8745
+{
8746
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8747
+
8748
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8749
+ struct page *page;
8750
+ unsigned long nr_skip = 0;
8751
+ unsigned long nr_pages = 0;
8752
+
8753
+ dump_stack();
8754
+ list_for_each_entry(page, page_list, lru) {
8755
+ nr_pages++;
8756
+ /* The page will be freed by putback_movable_pages soon */
8757
+ if (page_count(page) == 1) {
8758
+ nr_skip++;
8759
+ continue;
8760
+ }
8761
+ dump_page(page, "migration failure");
8762
+ }
8763
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8764
+ }
8765
+}
8766
+#else
8767
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8768
+{
8769
+}
8770
+#endif
8771
+
81068772 /* [start, end) must belong to a single zone. */
81078773 static int __alloc_contig_migrate_range(struct compact_control *cc,
8108
- unsigned long start, unsigned long end)
8774
+ unsigned long start, unsigned long end,
8775
+ struct acr_info *info)
81098776 {
81108777 /* This function is based on compact_zone() from compaction.c. */
8111
- unsigned long nr_reclaimed;
8778
+ unsigned int nr_reclaimed;
81128779 unsigned long pfn = start;
81138780 unsigned int tries = 0;
8781
+ unsigned int max_tries = 5;
81148782 int ret = 0;
8783
+ struct page *page;
8784
+ struct migration_target_control mtc = {
8785
+ .nid = zone_to_nid(cc->zone),
8786
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8787
+ };
81158788
8116
- migrate_prep();
8789
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8790
+ max_tries = 1;
8791
+
8792
+ lru_cache_disable();
81178793
81188794 while (pfn < end || !list_empty(&cc->migratepages)) {
81198795 if (fatal_signal_pending(current)) {
....@@ -8129,20 +8805,39 @@
81298805 break;
81308806 }
81318807 tries = 0;
8132
- } else if (++tries == 5) {
8808
+ } else if (++tries == max_tries) {
81338809 ret = ret < 0 ? ret : -EBUSY;
81348810 break;
81358811 }
81368812
81378813 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
81388814 &cc->migratepages);
8815
+ info->nr_reclaimed += nr_reclaimed;
81398816 cc->nr_migratepages -= nr_reclaimed;
81408817
8141
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8142
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8818
+ list_for_each_entry(page, &cc->migratepages, lru)
8819
+ info->nr_mapped += page_mapcount(page);
8820
+
8821
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8822
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8823
+ if (!ret)
8824
+ info->nr_migrated += cc->nr_migratepages;
81438825 }
8826
+
8827
+ lru_cache_enable();
81448828 if (ret < 0) {
8829
+ if (ret == -EBUSY) {
8830
+ alloc_contig_dump_pages(&cc->migratepages);
8831
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8832
+ }
8833
+
8834
+ if (!list_empty(&cc->migratepages)) {
8835
+ page = list_first_entry(&cc->migratepages, struct page, lru);
8836
+ info->failed_pfn = page_to_pfn(page);
8837
+ }
8838
+
81458839 putback_movable_pages(&cc->migratepages);
8840
+ info->err |= ACR_ERR_MIGRATE;
81468841 return ret;
81478842 }
81488843 return 0;
....@@ -8165,25 +8860,28 @@
81658860 * pageblocks in the range. Once isolated, the pageblocks should not
81668861 * be modified by others.
81678862 *
8168
- * Returns zero on success or negative error code. On success all
8863
+ * Return: zero on success or negative error code. On success all
81698864 * pages which PFN is in [start, end) are allocated for the caller and
81708865 * need to be freed with free_contig_range().
81718866 */
81728867 int alloc_contig_range(unsigned long start, unsigned long end,
8173
- unsigned migratetype, gfp_t gfp_mask)
8868
+ unsigned migratetype, gfp_t gfp_mask,
8869
+ struct acr_info *info)
81748870 {
81758871 unsigned long outer_start, outer_end;
81768872 unsigned int order;
81778873 int ret = 0;
8874
+ bool skip_drain_all_pages = false;
81788875
81798876 struct compact_control cc = {
81808877 .nr_migratepages = 0,
81818878 .order = -1,
81828879 .zone = page_zone(pfn_to_page(start)),
8183
- .mode = MIGRATE_SYNC,
8880
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
81848881 .ignore_skip_hint = true,
81858882 .no_set_skip_hint = true,
81868883 .gfp_mask = current_gfp_context(gfp_mask),
8884
+ .alloc_contig = true,
81878885 };
81888886 INIT_LIST_HEAD(&cc.migratepages);
81898887
....@@ -8212,14 +8910,18 @@
82128910 */
82138911
82148912 ret = start_isolate_page_range(pfn_max_align_down(start),
8215
- pfn_max_align_up(end), migratetype,
8216
- false);
8217
- if (ret)
8913
+ pfn_max_align_up(end), migratetype, 0,
8914
+ &info->failed_pfn);
8915
+ if (ret) {
8916
+ info->err |= ACR_ERR_ISOLATE;
82188917 return ret;
8918
+ }
82198919
8220
-#ifdef CONFIG_CMA
8221
- cc.zone->cma_alloc = 1;
8222
-#endif
8920
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8921
+ &skip_drain_all_pages);
8922
+ if (!skip_drain_all_pages)
8923
+ drain_all_pages(cc.zone);
8924
+
82238925 /*
82248926 * In case of -EBUSY, we'd like to know which page causes problem.
82258927 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8230,8 +8932,8 @@
82308932 * allocated. So, if we fall through be sure to clear ret so that
82318933 * -EBUSY is not accidentally used or returned to caller.
82328934 */
8233
- ret = __alloc_contig_migrate_range(&cc, start, end);
8234
- if (ret && ret != -EBUSY)
8935
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8936
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
82358937 goto done;
82368938 ret = 0;
82378939
....@@ -8252,9 +8954,6 @@
82528954 * isolated thus they won't get removed from buddy.
82538955 */
82548956
8255
- lru_add_drain_all();
8256
- drain_all_pages(cc.zone);
8257
-
82588957 order = 0;
82598958 outer_start = start;
82608959 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8266,7 +8965,7 @@
82668965 }
82678966
82688967 if (outer_start != start) {
8269
- order = page_order(pfn_to_page(outer_start));
8968
+ order = buddy_order(pfn_to_page(outer_start));
82708969
82718970 /*
82728971 * outer_start page could be small order buddy page and
....@@ -8279,10 +8978,11 @@
82798978 }
82808979
82818980 /* Make sure the range is really isolated. */
8282
- if (test_pages_isolated(outer_start, end, false)) {
8981
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
82838982 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
82848983 __func__, outer_start, end);
82858984 ret = -EBUSY;
8985
+ info->err |= ACR_ERR_TEST;
82868986 goto done;
82878987 }
82888988
....@@ -8302,13 +9002,114 @@
83029002 done:
83039003 undo_isolate_page_range(pfn_max_align_down(start),
83049004 pfn_max_align_up(end), migratetype);
8305
-#ifdef CONFIG_CMA
8306
- cc.zone->cma_alloc = 0;
8307
-#endif
83089005 return ret;
83099006 }
9007
+EXPORT_SYMBOL(alloc_contig_range);
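A minimal sketch of driving this variant of the interface directly (the range, migratetype and reporting are illustrative; struct acr_info is the extended result structure used by this tree):

static int example_claim_range(unsigned long start_pfn, unsigned long nr_pages)
{
        struct acr_info info = {};
        int ret;

        /* only meaningful on pageblocks that were set aside as MIGRATE_CMA */
        ret = alloc_contig_range(start_pfn, start_pfn + nr_pages, MIGRATE_CMA,
                                 GFP_KERNEL, &info);
        if (ret)
                pr_debug("contig claim failed (%d), first busy pfn %lu\n",
                         ret, info.failed_pfn);
        return ret;
}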
83109008
8311
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
9009
+static int __alloc_contig_pages(unsigned long start_pfn,
9010
+ unsigned long nr_pages, gfp_t gfp_mask)
9011
+{
9012
+ struct acr_info dummy;
9013
+ unsigned long end_pfn = start_pfn + nr_pages;
9014
+
9015
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
9016
+ gfp_mask, &dummy);
9017
+}
9018
+
9019
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
9020
+ unsigned long nr_pages)
9021
+{
9022
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9023
+ struct page *page;
9024
+
9025
+ for (i = start_pfn; i < end_pfn; i++) {
9026
+ page = pfn_to_online_page(i);
9027
+ if (!page)
9028
+ return false;
9029
+
9030
+ if (page_zone(page) != z)
9031
+ return false;
9032
+
9033
+ if (PageReserved(page))
9034
+ return false;
9035
+
9036
+ if (page_count(page) > 0)
9037
+ return false;
9038
+
9039
+ if (PageHuge(page))
9040
+ return false;
9041
+ }
9042
+ return true;
9043
+}
9044
+
9045
+static bool zone_spans_last_pfn(const struct zone *zone,
9046
+ unsigned long start_pfn, unsigned long nr_pages)
9047
+{
9048
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9049
+
9050
+ return zone_spans_pfn(zone, last_pfn);
9051
+}
9052
+
9053
+/**
9054
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9055
+ * @nr_pages: Number of contiguous pages to allocate
9056
+ * @gfp_mask: GFP mask to limit search and used during compaction
9057
+ * @nid: Target node
9058
+ * @nodemask: Mask for other possible nodes
9059
+ *
9060
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9061
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9062
+ * tried for allocation with alloc_contig_range(). This routine is intended
9063
+ * for allocation requests which can not be fulfilled with the buddy allocator.
9064
+ *
9065
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9066
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9067
+ * (e.g. 1GB request would be aligned to 1GB).
9068
+ *
9069
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9070
+ * __free_page() on each allocated page.
9071
+ *
9072
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9073
+ */
9074
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9075
+ int nid, nodemask_t *nodemask)
9076
+{
9077
+ unsigned long ret, pfn, flags;
9078
+ struct zonelist *zonelist;
9079
+ struct zone *zone;
9080
+ struct zoneref *z;
9081
+
9082
+ zonelist = node_zonelist(nid, gfp_mask);
9083
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9084
+ gfp_zone(gfp_mask), nodemask) {
9085
+ spin_lock_irqsave(&zone->lock, flags);
9086
+
9087
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9088
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9089
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9090
+ /*
9091
+ * We release the zone lock here because
9092
+ * alloc_contig_range() will also lock the zone
9093
+ * at some point. If there's an allocation
9094
+ * spinning on this lock, it may win the race
9095
+ * and cause alloc_contig_range() to fail...
9096
+ */
9097
+ spin_unlock_irqrestore(&zone->lock, flags);
9098
+ ret = __alloc_contig_pages(pfn, nr_pages,
9099
+ gfp_mask);
9100
+ if (!ret)
9101
+ return pfn_to_page(pfn);
9102
+ spin_lock_irqsave(&zone->lock, flags);
9103
+ }
9104
+ pfn += nr_pages;
9105
+ }
9106
+ spin_unlock_irqrestore(&zone->lock, flags);
9107
+ }
9108
+ return NULL;
9109
+}
9110
+#endif /* CONFIG_CONTIG_ALLOC */
9111
+
9112
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
83129113 {
83139114 unsigned int count = 0;
83149115
....@@ -8320,7 +9121,7 @@
83209121 }
83219122 WARN(count != 0, "%d pages are still in use!\n", count);
83229123 }
8323
-#endif
9124
+EXPORT_SYMBOL(free_contig_range);
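A sketch of the higher-level pair, for callers that just want nr_pages physically contiguous pages on a given node (function and variable names are illustrative):

static int example_use_contig(unsigned long nr_pages, int nid)
{
        struct page *pages;

        pages = alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
        if (!pages)
                return -ENOMEM;

        /* ... use the physically contiguous region ... */

        free_contig_range(page_to_pfn(pages), nr_pages);
        return 0;
}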
83249125
83259126 /*
83269127 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8328,11 +9129,8 @@
83289129 */
83299130 void __meminit zone_pcp_update(struct zone *zone)
83309131 {
8331
- unsigned cpu;
83329132 mutex_lock(&pcp_batch_high_lock);
8333
- for_each_possible_cpu(cpu)
8334
- pageset_set_high_and_batch(zone,
8335
- per_cpu_ptr(zone->pageset, cpu));
9133
+ __zone_pcp_update(zone);
83369134 mutex_unlock(&pcp_batch_high_lock);
83379135 }
83389136
....@@ -8343,7 +9141,7 @@
83439141 struct per_cpu_pageset *pset;
83449142
83459143 /* avoid races with drain_pages() */
8346
- local_lock_irqsave(pa_lock, flags);
9144
+ local_lock_irqsave(&pa_lock.l, flags);
83479145 if (zone->pageset != &boot_pageset) {
83489146 for_each_online_cpu(cpu) {
83499147 pset = per_cpu_ptr(zone->pageset, cpu);
....@@ -8352,37 +9150,26 @@
83529150 free_percpu(zone->pageset);
83539151 zone->pageset = &boot_pageset;
83549152 }
8355
- local_unlock_irqrestore(pa_lock, flags);
9153
+ local_unlock_irqrestore(&pa_lock.l, flags);
83569154 }
83579155
83589156 #ifdef CONFIG_MEMORY_HOTREMOVE
83599157 /*
8360
- * All pages in the range must be in a single zone and isolated
8361
- * before calling this.
9158
+ * All pages in the range must be in a single zone, must not contain holes,
9159
+ * must span full sections, and must be isolated before calling this function.
83629160 */
8363
-void
8364
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9161
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
83659162 {
9163
+ unsigned long pfn = start_pfn;
83669164 struct page *page;
83679165 struct zone *zone;
8368
- unsigned int order, i;
8369
- unsigned long pfn;
9166
+ unsigned int order;
83709167 unsigned long flags;
8371
- /* find the first valid pfn */
8372
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8373
- if (pfn_valid(pfn))
8374
- break;
8375
- if (pfn == end_pfn)
8376
- return;
9168
+
83779169 offline_mem_sections(pfn, end_pfn);
83789170 zone = page_zone(pfn_to_page(pfn));
83799171 spin_lock_irqsave(&zone->lock, flags);
8380
- pfn = start_pfn;
83819172 while (pfn < end_pfn) {
8382
- if (!pfn_valid(pfn)) {
8383
- pfn++;
8384
- continue;
8385
- }
83869173 page = pfn_to_page(pfn);
83879174 /*
83889175 * The HWPoisoned page may be not in buddy system, and
....@@ -8390,22 +9177,23 @@
83909177 */
83919178 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83929179 pfn++;
8393
- SetPageReserved(page);
9180
+ continue;
9181
+ }
9182
+ /*
9183
+ * At this point all remaining PageOffline() pages have a
9184
+ * reference count of 0 and can simply be skipped.
9185
+ */
9186
+ if (PageOffline(page)) {
9187
+ BUG_ON(page_count(page));
9188
+ BUG_ON(PageBuddy(page));
9189
+ pfn++;
83949190 continue;
83959191 }
83969192
83979193 BUG_ON(page_count(page));
83989194 BUG_ON(!PageBuddy(page));
8399
- order = page_order(page);
8400
-#ifdef CONFIG_DEBUG_VM
8401
- pr_info("remove from free list %lx %d %lx\n",
8402
- pfn, 1 << order, end_pfn);
8403
-#endif
8404
- list_del(&page->lru);
8405
- rmv_page_order(page);
8406
- zone->free_area[order].nr_free--;
8407
- for (i = 0; i < (1 << order); i++)
8408
- SetPageReserved((page+i));
9195
+ order = buddy_order(page);
9196
+ del_page_from_free_list(page, zone, order);
84099197 pfn += (1 << order);
84109198 }
84119199 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8423,7 +9211,7 @@
84239211 for (order = 0; order < MAX_ORDER; order++) {
84249212 struct page *page_head = page - (pfn & ((1 << order) - 1));
84259213
8426
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9214
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
84279215 break;
84289216 }
84299217 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8433,30 +9221,87 @@
84339221
84349222 #ifdef CONFIG_MEMORY_FAILURE
84359223 /*
8436
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8437
- * test is performed under the zone lock to prevent a race against page
8438
- * allocation.
9224
+ * Break down a higher-order page in sub-pages, and keep our target out of
9225
+ * buddy allocator.
84399226 */
8440
-bool set_hwpoison_free_buddy_page(struct page *page)
9227
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9228
+ struct page *target, int low, int high,
9229
+ int migratetype)
9230
+{
9231
+ unsigned long size = 1 << high;
9232
+ struct page *current_buddy, *next_page;
9233
+
9234
+ while (high > low) {
9235
+ high--;
9236
+ size >>= 1;
9237
+
9238
+ if (target >= &page[size]) {
9239
+ next_page = page + size;
9240
+ current_buddy = page;
9241
+ } else {
9242
+ next_page = page;
9243
+ current_buddy = page + size;
9244
+ }
9245
+
9246
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9247
+ continue;
9248
+
9249
+ if (current_buddy != target) {
9250
+ add_to_free_list(current_buddy, zone, high, migratetype);
9251
+ set_buddy_order(current_buddy, high);
9252
+ page = next_page;
9253
+ }
9254
+ }
9255
+}
9256
+
9257
+/*
9258
+ * Take a page that will be marked as poisoned off the buddy allocator.
9259
+ */
9260
+bool take_page_off_buddy(struct page *page)
84419261 {
84429262 struct zone *zone = page_zone(page);
84439263 unsigned long pfn = page_to_pfn(page);
84449264 unsigned long flags;
84459265 unsigned int order;
8446
- bool hwpoisoned = false;
9266
+ bool ret = false;
84479267
84489268 spin_lock_irqsave(&zone->lock, flags);
84499269 for (order = 0; order < MAX_ORDER; order++) {
84509270 struct page *page_head = page - (pfn & ((1 << order) - 1));
9271
+ int page_order = buddy_order(page_head);
84519272
8452
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8453
- if (!TestSetPageHWPoison(page))
8454
- hwpoisoned = true;
9273
+ if (PageBuddy(page_head) && page_order >= order) {
9274
+ unsigned long pfn_head = page_to_pfn(page_head);
9275
+ int migratetype = get_pfnblock_migratetype(page_head,
9276
+ pfn_head);
9277
+
9278
+ del_page_from_free_list(page_head, zone, page_order);
9279
+ break_down_buddy_pages(zone, page_head, page, 0,
9280
+ page_order, migratetype);
9281
+ if (!is_migrate_isolate(migratetype))
9282
+ __mod_zone_freepage_state(zone, -1, migratetype);
9283
+ ret = true;
84559284 break;
84569285 }
9286
+ if (page_count(page_head) > 0)
9287
+ break;
84579288 }
84589289 spin_unlock_irqrestore(&zone->lock, flags);
8459
-
8460
- return hwpoisoned;
9290
+ return ret;
84619291 }
84629292 #endif
9293
+
9294
+#ifdef CONFIG_ZONE_DMA
9295
+bool has_managed_dma(void)
9296
+{
9297
+ struct pglist_data *pgdat;
9298
+
9299
+ for_each_online_pgdat(pgdat) {
9300
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9301
+
9302
+ if (managed_zone(zone))
9303
+ return true;
9304
+ }
9305
+ return false;
9306
+}
9307
+#endif /* CONFIG_ZONE_DMA */