2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/page_alloc.c
34 *
@@ -16,11 +17,11 @@
1617
1718 #include <linux/stddef.h>
1819 #include <linux/mm.h>
20
+#include <linux/highmem.h>
1921 #include <linux/swap.h>
2022 #include <linux/interrupt.h>
2123 #include <linux/pagemap.h>
2224 #include <linux/jiffies.h>
23
-#include <linux/bootmem.h>
2425 #include <linux/memblock.h>
2526 #include <linux/compiler.h>
2627 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
4344 #include <linux/mempolicy.h>
4445 #include <linux/memremap.h>
4546 #include <linux/stop_machine.h>
47
+#include <linux/random.h>
4648 #include <linux/sort.h>
4749 #include <linux/pfn.h>
4850 #include <linux/backing-dev.h>
4951 #include <linux/fault-inject.h>
5052 #include <linux/page-isolation.h>
51
-#include <linux/page_ext.h>
5253 #include <linux/debugobjects.h>
5354 #include <linux/kmemleak.h>
5455 #include <linux/compaction.h>
@@ -61,18 +62,63 @@
6162 #include <linux/sched/rt.h>
6263 #include <linux/sched/mm.h>
6364 #include <linux/page_owner.h>
65
+#include <linux/page_pinner.h>
6466 #include <linux/kthread.h>
6567 #include <linux/memcontrol.h>
6668 #include <linux/ftrace.h>
6769 #include <linux/lockdep.h>
6870 #include <linux/nmi.h>
69
-#include <linux/khugepaged.h>
7071 #include <linux/psi.h>
72
+#include <linux/padata.h>
73
+#include <linux/khugepaged.h>
74
+#include <trace/hooks/mm.h>
75
+#include <trace/hooks/vmscan.h>
7176
7277 #include <asm/sections.h>
7378 #include <asm/tlbflush.h>
7479 #include <asm/div64.h>
7580 #include "internal.h"
81
+#include "shuffle.h"
82
+#include "page_reporting.h"
83
+
84
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
85
+typedef int __bitwise fpi_t;
86
+
87
+/* No special request */
88
+#define FPI_NONE ((__force fpi_t)0)
89
+
90
+/*
91
+ * Skip free page reporting notification for the (possibly merged) page.
92
+ * This does not hinder free page reporting from grabbing the page,
93
+ * reporting it and marking it "reported" - it only skips notifying
94
+ * the free page reporting infrastructure about a newly freed page. For
95
+ * example, used when temporarily pulling a page from a freelist and
96
+ * putting it back unmodified.
97
+ */
98
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
99
+
100
+/*
101
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
102
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
103
+ * shuffle the whole zone).
104
+ *
105
+ * Note: No code should rely on this flag for correctness - it's purely
106
+ * to allow for optimizations when handing back either fresh pages
107
+ * (memory onlining) or untouched pages (page isolation, free page
108
+ * reporting).
109
+ */
110
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
111
+
112
+/*
113
+ * Don't poison memory with KASAN (only for the tag-based modes).
114
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
115
+ * Poisoning all that memory lengthens boot time, especially on systems with
116
+ * large amount of RAM. This flag is used to skip that poisoning.
117
+ * This is only done for the tag-based KASAN modes, as those are able to
118
+ * detect memory corruptions with the memory tags assigned by default.
119
+ * All memory allocated normally after boot gets poisoned as usual.
120
+ */
121
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
76122
77123 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
78124 static DEFINE_MUTEX(pcp_batch_high_lock);
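Illustrative sketch (not part of the patch): the fpi_t flags defined above are OR-ed together by the internal, non-pcp free paths; a hypothetical caller using the new __free_pages_ok() signature introduced later in this change could combine them like so:

/*
 * Illustration only: return an untouched page to the tail of the freelist
 * without re-notifying the free page reporting infrastructure.
 */
static void example_putback_free_page(struct page *page, unsigned int order)
{
	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY);
}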
@@ -94,12 +140,15 @@
94140 */
95141 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
96142 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
97
-int _node_numa_mem_[MAX_NUMNODES];
98143 #endif
99144
100145 /* work_structs for global per-cpu drains */
101
-DEFINE_MUTEX(pcpu_drain_mutex);
102
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
146
+struct pcpu_drain {
147
+ struct zone *zone;
148
+ struct work_struct work;
149
+};
150
+static DEFINE_MUTEX(pcpu_drain_mutex);
151
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
103152
104153 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
105154 volatile unsigned long latent_entropy __latent_entropy;
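Illustrative sketch (not part of the patch): the per-CPU pcpu_drain wrapper introduced above lets each drain work item remember its target zone. Modelled loosely on drain_all_pages() elsewhere in this file, with drain_local_pages_wq and mm_percpu_wq assumed from the surrounding code:

/* Illustration only: queue the zone drain work on one CPU. */
static void example_queue_drain(struct zone *zone, int cpu)
{
	struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);

	drain->zone = zone;
	INIT_WORK(&drain->work, drain_local_pages_wq);
	queue_work_on(cpu, mm_percpu_wq, &drain->work);
}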
@@ -123,62 +172,33 @@
123172 };
124173 EXPORT_SYMBOL(node_states);
125174
126
-/* Protect totalram_pages and zone->managed_pages */
127
-static DEFINE_SPINLOCK(managed_page_count_lock);
128
-
129
-unsigned long totalram_pages __read_mostly;
175
+atomic_long_t _totalram_pages __read_mostly;
176
+EXPORT_SYMBOL(_totalram_pages);
130177 unsigned long totalreserve_pages __read_mostly;
131178 unsigned long totalcma_pages __read_mostly;
132179
133180 int percpu_pagelist_fraction;
134181 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
135
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
136
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
137
-#else
138182 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
139
-#endif
140183 EXPORT_SYMBOL(init_on_alloc);
141184
142
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
143
-DEFINE_STATIC_KEY_TRUE(init_on_free);
144
-#else
145185 DEFINE_STATIC_KEY_FALSE(init_on_free);
146
-#endif
147186 EXPORT_SYMBOL(init_on_free);
148187
188
+static bool _init_on_alloc_enabled_early __read_mostly
189
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
149190 static int __init early_init_on_alloc(char *buf)
150191 {
151
- int ret;
152
- bool bool_result;
153192
154
- if (!buf)
155
- return -EINVAL;
156
- ret = kstrtobool(buf, &bool_result);
157
- if (bool_result && page_poisoning_enabled())
158
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
159
- if (bool_result)
160
- static_branch_enable(&init_on_alloc);
161
- else
162
- static_branch_disable(&init_on_alloc);
163
- return ret;
193
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
164194 }
165195 early_param("init_on_alloc", early_init_on_alloc);
166196
197
+static bool _init_on_free_enabled_early __read_mostly
198
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
167199 static int __init early_init_on_free(char *buf)
168200 {
169
- int ret;
170
- bool bool_result;
171
-
172
- if (!buf)
173
- return -EINVAL;
174
- ret = kstrtobool(buf, &bool_result);
175
- if (bool_result && page_poisoning_enabled())
176
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
177
- if (bool_result)
178
- static_branch_enable(&init_on_free);
179
- else
180
- static_branch_disable(&init_on_free);
181
- return ret;
201
+ return kstrtobool(buf, &_init_on_free_enabled_early);
182202 }
183203 early_param("init_on_free", early_init_on_free);
184204
@@ -242,7 +262,8 @@
242262 unsigned int pageblock_order __read_mostly;
243263 #endif
244264
245
-static void __free_pages_ok(struct page *page, unsigned int order);
265
+static void __free_pages_ok(struct page *page, unsigned int order,
266
+ fpi_t fpi_flags);
246267
247268 /*
248269 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -269,8 +290,6 @@
269290 [ZONE_MOVABLE] = 0,
270291 };
271292
272
-EXPORT_SYMBOL(totalram_pages);
273
-
274293 static char * const zone_names[MAX_NR_ZONES] = {
275294 #ifdef CONFIG_ZONE_DMA
276295 "DMA",
@@ -288,7 +307,7 @@
288307 #endif
289308 };
290309
291
-char * const migratetype_names[MIGRATE_TYPES] = {
310
+const char * const migratetype_names[MIGRATE_TYPES] = {
292311 "Unmovable",
293312 "Movable",
294313 "Reclaimable",
@@ -301,14 +320,14 @@
301320 #endif
302321 };
303322
304
-compound_page_dtor * const compound_page_dtors[] = {
305
- NULL,
306
- free_compound_page,
323
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
324
+ [NULL_COMPOUND_DTOR] = NULL,
325
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
307326 #ifdef CONFIG_HUGETLB_PAGE
308
- free_huge_page,
327
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
309328 #endif
310329 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
311
- free_transhuge_page,
330
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
312331 #endif
313332 };
314333
@@ -319,6 +338,20 @@
319338 */
320339 int min_free_kbytes = 1024;
321340 int user_min_free_kbytes = -1;
341
+#ifdef CONFIG_DISCONTIGMEM
342
+/*
343
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
344
+ * are not on separate NUMA nodes. Functionally this works but with
345
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
346
+ * quite small. By default, do not boost watermarks on discontigmem as in
347
+ * many cases very high-order allocations like THP are likely to be
348
+ * unsupported and the premature reclaim offsets the advantage of long-term
349
+ * fragmentation avoidance.
350
+ */
351
+int watermark_boost_factor __read_mostly;
352
+#else
353
+int watermark_boost_factor __read_mostly = 15000;
354
+#endif
322355 int watermark_scale_factor = 10;
323356
324357 /*
@@ -328,28 +361,26 @@
328361 */
329362 int extra_free_kbytes = 0;
330363
331
-static unsigned long nr_kernel_pages __meminitdata;
332
-static unsigned long nr_all_pages __meminitdata;
333
-static unsigned long dma_reserve __meminitdata;
364
+static unsigned long nr_kernel_pages __initdata;
365
+static unsigned long nr_all_pages __initdata;
366
+static unsigned long dma_reserve __initdata;
334367
335
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
336
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
337
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
368
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
369
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
338370 static unsigned long required_kernelcore __initdata;
339371 static unsigned long required_kernelcore_percent __initdata;
340372 static unsigned long required_movablecore __initdata;
341373 static unsigned long required_movablecore_percent __initdata;
342
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
374
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
343375 static bool mirrored_kernelcore __meminitdata;
344376
345377 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
346378 int movable_zone;
347379 EXPORT_SYMBOL(movable_zone);
348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
349380
350381 #if MAX_NUMNODES > 1
351
-int nr_node_ids __read_mostly = MAX_NUMNODES;
352
-int nr_online_nodes __read_mostly = 1;
382
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
383
+unsigned int nr_online_nodes __read_mostly = 1;
353384 EXPORT_SYMBOL(nr_node_ids);
354385 EXPORT_SYMBOL(nr_online_nodes);
355386 #endif
@@ -365,7 +396,7 @@
365396 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
366397
367398 /*
368
- * Calling kasan_free_pages() only after deferred memory initialization
399
+ * Calling kasan_poison_pages() only after deferred memory initialization
369400 * has completed. Poisoning pages during deferred memory init will greatly
370401 * lengthen the process and cause problem in large memory systems as the
371402 * deferred pages initialization is done with interrupt disabled.
@@ -377,10 +408,12 @@
377408 * on-demand allocation and then freed again before the deferred pages
378409 * initialization is done, but this is not likely to happen.
379410 */
380
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
411
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
381412 {
382
- if (!static_branch_unlikely(&deferred_pages))
383
- kasan_free_pages(page, order);
413
+ return static_branch_unlikely(&deferred_pages) ||
414
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
415
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
416
+ PageSkipKASanPoison(page);
384417 }
385418
386419 /* Returns true if the struct page for the pfn is uninitialised */
@@ -395,38 +428,57 @@
395428 }
396429
397430 /*
398
- * Returns false when the remaining initialisation should be deferred until
431
+ * Returns true when the remaining initialisation should be deferred until
399432 * later in the boot cycle when it can be parallelised.
400433 */
401
-static inline bool update_defer_init(pg_data_t *pgdat,
402
- unsigned long pfn, unsigned long zone_end,
403
- unsigned long *nr_initialised)
434
+static bool __meminit
435
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
404436 {
405
- /* Always populate low zones for address-constrained allocations */
406
- if (zone_end < pgdat_end_pfn(pgdat))
407
- return true;
408
- (*nr_initialised)++;
409
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
410
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
411
- pgdat->first_deferred_pfn = pfn;
412
- return false;
437
+ static unsigned long prev_end_pfn, nr_initialised;
438
+
439
+ /*
440
+ * prev_end_pfn static that contains the end of previous zone
441
+ * No need to protect because called very early in boot before smp_init.
442
+ */
443
+ if (prev_end_pfn != end_pfn) {
444
+ prev_end_pfn = end_pfn;
445
+ nr_initialised = 0;
413446 }
414447
415
- return true;
448
+ /* Always populate low zones for address-constrained allocations */
449
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
450
+ return false;
451
+
452
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
453
+ return true;
454
+ /*
455
+ * We start only with one section of pages, more pages are added as
456
+ * needed until the rest of deferred pages are initialized.
457
+ */
458
+ nr_initialised++;
459
+ if ((nr_initialised > PAGES_PER_SECTION) &&
460
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
461
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
462
+ return true;
463
+ }
464
+ return false;
416465 }
417466 #else
418
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
467
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
468
+{
469
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
470
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
471
+ PageSkipKASanPoison(page);
472
+}
419473
420474 static inline bool early_page_uninitialised(unsigned long pfn)
421475 {
422476 return false;
423477 }
424478
425
-static inline bool update_defer_init(pg_data_t *pgdat,
426
- unsigned long pfn, unsigned long zone_end,
427
- unsigned long *nr_initialised)
479
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
428480 {
429
- return true;
481
+ return false;
430482 }
431483 #endif
432484
@@ -435,7 +487,7 @@
435487 unsigned long pfn)
436488 {
437489 #ifdef CONFIG_SPARSEMEM
438
- return __pfn_to_section(pfn)->pageblock_flags;
490
+ return section_to_usemap(__pfn_to_section(pfn));
439491 #else
440492 return page_zone(page)->pageblock_flags;
441493 #endif /* CONFIG_SPARSEMEM */
@@ -445,25 +497,23 @@
445497 {
446498 #ifdef CONFIG_SPARSEMEM
447499 pfn &= (PAGES_PER_SECTION-1);
448
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
449500 #else
450501 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
451
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
452502 #endif /* CONFIG_SPARSEMEM */
503
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
453504 }
454505
455506 /**
456507 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
457508 * @page: The page within the block of interest
458509 * @pfn: The target page frame number
459
- * @end_bitidx: The last bit of interest to retrieve
460510 * @mask: mask of bits that the caller is interested in
461511 *
462512 * Return: pageblock_bits flags
463513 */
464
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
514
+static __always_inline
515
+unsigned long __get_pfnblock_flags_mask(struct page *page,
465516 unsigned long pfn,
466
- unsigned long end_bitidx,
467517 unsigned long mask)
468518 {
469519 unsigned long *bitmap;
@@ -476,20 +526,36 @@
476526 bitidx &= (BITS_PER_LONG-1);
477527
478528 word = bitmap[word_bitidx];
479
- bitidx += end_bitidx;
480
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
529
+ return (word >> bitidx) & mask;
481530 }
482531
483532 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
484
- unsigned long end_bitidx,
485533 unsigned long mask)
486534 {
487
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
535
+ return __get_pfnblock_flags_mask(page, pfn, mask);
488536 }
537
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
538
+
539
+int isolate_anon_lru_page(struct page *page)
540
+{
541
+ int ret;
542
+
543
+ if (!PageLRU(page) || !PageAnon(page))
544
+ return -EINVAL;
545
+
546
+ if (!get_page_unless_zero(page))
547
+ return -EINVAL;
548
+
549
+ ret = isolate_lru_page(page);
550
+ put_page(page);
551
+
552
+ return ret;
553
+}
554
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
489555
490556 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
491557 {
492
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
558
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
493559 }
494560
495561 /**
@@ -497,12 +563,10 @@
497563 * @page: The page within the block of interest
498564 * @flags: The flags to set
499565 * @pfn: The target page frame number
500
- * @end_bitidx: The last bit of interest
501566 * @mask: mask of bits that the caller is interested in
502567 */
503568 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
504569 unsigned long pfn,
505
- unsigned long end_bitidx,
506570 unsigned long mask)
507571 {
508572 unsigned long *bitmap;
@@ -510,6 +574,7 @@
510574 unsigned long old_word, word;
511575
512576 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
577
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
513578
514579 bitmap = get_pageblock_bitmap(page, pfn);
515580 bitidx = pfn_to_bitidx(page, pfn);
@@ -518,9 +583,8 @@
518583
519584 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
520585
521
- bitidx += end_bitidx;
522
- mask <<= (BITS_PER_LONG - bitidx - 1);
523
- flags <<= (BITS_PER_LONG - bitidx - 1);
586
+ mask <<= bitidx;
587
+ flags <<= bitidx;
524588
525589 word = READ_ONCE(bitmap[word_bitidx]);
526590 for (;;) {
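Illustrative sketch (not part of the patch): the rewritten helpers now index pageblock bits from the low end of the bitmap word instead of mirroring them from the top with end_bitidx. Assuming NR_PAGEBLOCK_BITS == 4 (as the BUILD_BUG_ON above enforces) and MIGRATETYPE_MASK == 0x7:

/*
 * Illustration only: the pageblock at slot 5 of a bitmap word sits at
 * bitidx = 5 * NR_PAGEBLOCK_BITS = 20, so its migratetype is read as
 * (word >> 20) & 0x7, and writers shift mask/flags left by the same
 * bitidx - no more "BITS_PER_LONG - bitidx - 1" arithmetic.
 */
static unsigned long example_read_migratetype(unsigned long word)
{
	unsigned long bitidx = 5 * NR_PAGEBLOCK_BITS;

	return (word >> bitidx) & MIGRATETYPE_MASK;
}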
@@ -537,8 +601,8 @@
537601 migratetype < MIGRATE_PCPTYPES))
538602 migratetype = MIGRATE_UNMOVABLE;
539603
540
- set_pageblock_flags_group(page, (unsigned long)migratetype,
541
- PB_migrate, PB_migrate_end);
604
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
605
+ page_to_pfn(page), MIGRATETYPE_MASK);
542606 }
543607
544608 #ifdef CONFIG_DEBUG_VM
@@ -593,8 +657,7 @@
593657 }
594658 #endif
595659
596
-static void bad_page(struct page *page, const char *reason,
597
- unsigned long bad_flags)
660
+static void bad_page(struct page *page, const char *reason)
598661 {
599662 static unsigned long resume;
600663 static unsigned long nr_shown;
@@ -623,10 +686,6 @@
623686 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
624687 current->comm, page_to_pfn(page));
625688 __dump_page(page, reason);
626
- bad_flags &= page->flags;
627
- if (bad_flags)
628
- pr_alert("bad because of flags: %#lx(%pGp)\n",
629
- bad_flags, &bad_flags);
630689 dump_page_owner(page);
631690
632691 print_modules();
@@ -654,7 +713,8 @@
654713
655714 void free_compound_page(struct page *page)
656715 {
657
- __free_pages_ok(page, compound_order(page));
716
+ mem_cgroup_uncharge(page);
717
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
658718 }
659719
660720 void prep_compound_page(struct page *page, unsigned int order)
@@ -662,8 +722,6 @@
662722 int i;
663723 int nr_pages = 1 << order;
664724
665
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
666
- set_compound_order(page, order);
667725 __SetPageHead(page);
668726 for (i = 1; i < nr_pages; i++) {
669727 struct page *p = page + i;
@@ -671,51 +729,30 @@
671729 p->mapping = TAIL_MAPPING;
672730 set_compound_head(p, page);
673731 }
732
+
733
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
734
+ set_compound_order(page, order);
674735 atomic_set(compound_mapcount_ptr(page), -1);
736
+ if (hpage_pincount_available(page))
737
+ atomic_set(compound_pincount_ptr(page), 0);
675738 }
676739
677740 #ifdef CONFIG_DEBUG_PAGEALLOC
678741 unsigned int _debug_guardpage_minorder;
679
-bool _debug_pagealloc_enabled __read_mostly
742
+
743
+bool _debug_pagealloc_enabled_early __read_mostly
680744 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
745
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
746
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
681747 EXPORT_SYMBOL(_debug_pagealloc_enabled);
682
-bool _debug_guardpage_enabled __read_mostly;
748
+
749
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
683750
684751 static int __init early_debug_pagealloc(char *buf)
685752 {
686
- if (!buf)
687
- return -EINVAL;
688
- return kstrtobool(buf, &_debug_pagealloc_enabled);
753
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
689754 }
690755 early_param("debug_pagealloc", early_debug_pagealloc);
691
-
692
-static bool need_debug_guardpage(void)
693
-{
694
- /* If we don't use debug_pagealloc, we don't need guard page */
695
- if (!debug_pagealloc_enabled())
696
- return false;
697
-
698
- if (!debug_guardpage_minorder())
699
- return false;
700
-
701
- return true;
702
-}
703
-
704
-static void init_debug_guardpage(void)
705
-{
706
- if (!debug_pagealloc_enabled())
707
- return;
708
-
709
- if (!debug_guardpage_minorder())
710
- return;
711
-
712
- _debug_guardpage_enabled = true;
713
-}
714
-
715
-struct page_ext_operations debug_guardpage_ops = {
716
- .need = need_debug_guardpage,
717
- .init = init_debug_guardpage,
718
-};
719756
720757 static int __init debug_guardpage_minorder_setup(char *buf)
721758 {
@@ -734,20 +771,13 @@
734771 static inline bool set_page_guard(struct zone *zone, struct page *page,
735772 unsigned int order, int migratetype)
736773 {
737
- struct page_ext *page_ext;
738
-
739774 if (!debug_guardpage_enabled())
740775 return false;
741776
742777 if (order >= debug_guardpage_minorder())
743778 return false;
744779
745
- page_ext = lookup_page_ext(page);
746
- if (unlikely(!page_ext))
747
- return false;
748
-
749
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
750
-
780
+ __SetPageGuard(page);
751781 INIT_LIST_HEAD(&page->lru);
752782 set_page_private(page, order);
753783 /* Guard pages are not available for any usage */
@@ -759,39 +789,77 @@
759789 static inline void clear_page_guard(struct zone *zone, struct page *page,
760790 unsigned int order, int migratetype)
761791 {
762
- struct page_ext *page_ext;
763
-
764792 if (!debug_guardpage_enabled())
765793 return;
766794
767
- page_ext = lookup_page_ext(page);
768
- if (unlikely(!page_ext))
769
- return;
770
-
771
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
795
+ __ClearPageGuard(page);
772796
773797 set_page_private(page, 0);
774798 if (!is_migrate_isolate(migratetype))
775799 __mod_zone_freepage_state(zone, (1 << order), migratetype);
776800 }
777801 #else
778
-struct page_ext_operations debug_guardpage_ops;
779802 static inline bool set_page_guard(struct zone *zone, struct page *page,
780803 unsigned int order, int migratetype) { return false; }
781804 static inline void clear_page_guard(struct zone *zone, struct page *page,
782805 unsigned int order, int migratetype) {}
783806 #endif
784807
785
-static inline void set_page_order(struct page *page, unsigned int order)
808
+/*
809
+ * Enable static keys related to various memory debugging and hardening options.
810
+ * Some override others, and depend on early params that are evaluated in the
811
+ * order of appearance. So we need to first gather the full picture of what was
812
+ * enabled, and then make decisions.
813
+ */
814
+void init_mem_debugging_and_hardening(void)
815
+{
816
+ bool page_poisoning_requested = false;
817
+
818
+#ifdef CONFIG_PAGE_POISONING
819
+ /*
820
+ * Page poisoning is debug page alloc for some arches. If
821
+ * either of those options are enabled, enable poisoning.
822
+ */
823
+ if (page_poisoning_enabled() ||
824
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
825
+ debug_pagealloc_enabled())) {
826
+ static_branch_enable(&_page_poisoning_enabled);
827
+ page_poisoning_requested = true;
828
+ }
829
+#endif
830
+
831
+ if (_init_on_alloc_enabled_early) {
832
+ if (page_poisoning_requested)
833
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
834
+ "will take precedence over init_on_alloc\n");
835
+ else
836
+ static_branch_enable(&init_on_alloc);
837
+ }
838
+ if (_init_on_free_enabled_early) {
839
+ if (page_poisoning_requested)
840
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
841
+ "will take precedence over init_on_free\n");
842
+ else
843
+ static_branch_enable(&init_on_free);
844
+ }
845
+
846
+#ifdef CONFIG_DEBUG_PAGEALLOC
847
+ if (!debug_pagealloc_enabled())
848
+ return;
849
+
850
+ static_branch_enable(&_debug_pagealloc_enabled);
851
+
852
+ if (!debug_guardpage_minorder())
853
+ return;
854
+
855
+ static_branch_enable(&_debug_guardpage_enabled);
856
+#endif
857
+}
858
+
859
+static inline void set_buddy_order(struct page *page, unsigned int order)
786860 {
787861 set_page_private(page, order);
788862 __SetPageBuddy(page);
789
-}
790
-
791
-static inline void rmv_page_order(struct page *page)
792
-{
793
- __ClearPageBuddy(page);
794
- set_page_private(page, 0);
795863 }
796864
797865 /*
@@ -807,32 +875,151 @@
807875 *
808876 * For recording page's order, we use page_private(page).
809877 */
810
-static inline int page_is_buddy(struct page *page, struct page *buddy,
878
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
811879 unsigned int order)
812880 {
813
- if (page_is_guard(buddy) && page_order(buddy) == order) {
814
- if (page_zone_id(page) != page_zone_id(buddy))
815
- return 0;
881
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
882
+ return false;
816883
817
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
884
+ if (buddy_order(buddy) != order)
885
+ return false;
818886
819
- return 1;
820
- }
887
+ /*
888
+ * zone check is done late to avoid uselessly calculating
889
+ * zone/node ids for pages that could never merge.
890
+ */
891
+ if (page_zone_id(page) != page_zone_id(buddy))
892
+ return false;
821893
822
- if (PageBuddy(buddy) && page_order(buddy) == order) {
823
- /*
824
- * zone check is done late to avoid uselessly
825
- * calculating zone/node ids for pages that could
826
- * never merge.
827
- */
828
- if (page_zone_id(page) != page_zone_id(buddy))
829
- return 0;
894
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
830895
831
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
896
+ return true;
897
+}
832898
833
- return 1;
834
- }
835
- return 0;
899
+#ifdef CONFIG_COMPACTION
900
+static inline struct capture_control *task_capc(struct zone *zone)
901
+{
902
+ struct capture_control *capc = current->capture_control;
903
+
904
+ return unlikely(capc) &&
905
+ !(current->flags & PF_KTHREAD) &&
906
+ !capc->page &&
907
+ capc->cc->zone == zone ? capc : NULL;
908
+}
909
+
910
+static inline bool
911
+compaction_capture(struct capture_control *capc, struct page *page,
912
+ int order, int migratetype)
913
+{
914
+ if (!capc || order != capc->cc->order)
915
+ return false;
916
+
917
+ /* Do not accidentally pollute CMA or isolated regions*/
918
+ if (is_migrate_cma(migratetype) ||
919
+ is_migrate_isolate(migratetype))
920
+ return false;
921
+
922
+ /*
923
+ * Do not let lower order allocations polluate a movable pageblock.
924
+ * This might let an unmovable request use a reclaimable pageblock
925
+ * and vice-versa but no more than normal fallback logic which can
926
+ * have trouble finding a high-order free page.
927
+ */
928
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
929
+ return false;
930
+
931
+ capc->page = page;
932
+ return true;
933
+}
934
+
935
+#else
936
+static inline struct capture_control *task_capc(struct zone *zone)
937
+{
938
+ return NULL;
939
+}
940
+
941
+static inline bool
942
+compaction_capture(struct capture_control *capc, struct page *page,
943
+ int order, int migratetype)
944
+{
945
+ return false;
946
+}
947
+#endif /* CONFIG_COMPACTION */
948
+
949
+/* Used for pages not on another list */
950
+static inline void add_to_free_list(struct page *page, struct zone *zone,
951
+ unsigned int order, int migratetype)
952
+{
953
+ struct free_area *area = &zone->free_area[order];
954
+
955
+ list_add(&page->lru, &area->free_list[migratetype]);
956
+ area->nr_free++;
957
+}
958
+
959
+/* Used for pages not on another list */
960
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
961
+ unsigned int order, int migratetype)
962
+{
963
+ struct free_area *area = &zone->free_area[order];
964
+
965
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
966
+ area->nr_free++;
967
+}
968
+
969
+/*
970
+ * Used for pages which are on another list. Move the pages to the tail
971
+ * of the list - so the moved pages won't immediately be considered for
972
+ * allocation again (e.g., optimization for memory onlining).
973
+ */
974
+static inline void move_to_free_list(struct page *page, struct zone *zone,
975
+ unsigned int order, int migratetype)
976
+{
977
+ struct free_area *area = &zone->free_area[order];
978
+
979
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
980
+}
981
+
982
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order)
984
+{
985
+ /* clear reported state and update reported page count */
986
+ if (page_reported(page))
987
+ __ClearPageReported(page);
988
+
989
+ list_del(&page->lru);
990
+ __ClearPageBuddy(page);
991
+ set_page_private(page, 0);
992
+ zone->free_area[order].nr_free--;
993
+}
994
+
995
+/*
996
+ * If this is not the largest possible page, check if the buddy
997
+ * of the next-highest order is free. If it is, it's possible
998
+ * that pages are being freed that will coalesce soon. In case,
999
+ * that is happening, add the free page to the tail of the list
1000
+ * so it's less likely to be used soon and more likely to be merged
1001
+ * as a higher order page
1002
+ */
1003
+static inline bool
1004
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1005
+ struct page *page, unsigned int order)
1006
+{
1007
+ struct page *higher_page, *higher_buddy;
1008
+ unsigned long combined_pfn;
1009
+
1010
+ if (order >= MAX_ORDER - 2)
1011
+ return false;
1012
+
1013
+ if (!pfn_valid_within(buddy_pfn))
1014
+ return false;
1015
+
1016
+ combined_pfn = buddy_pfn & pfn;
1017
+ higher_page = page + (combined_pfn - pfn);
1018
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1019
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1020
+
1021
+ return pfn_valid_within(buddy_pfn) &&
1022
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8361023 }
8371024
8381025 /*
@@ -862,12 +1049,14 @@
8621049 static inline void __free_one_page(struct page *page,
8631050 unsigned long pfn,
8641051 struct zone *zone, unsigned int order,
865
- int migratetype)
1052
+ int migratetype, fpi_t fpi_flags)
8661053 {
1054
+ struct capture_control *capc = task_capc(zone);
1055
+ unsigned long buddy_pfn;
8671056 unsigned long combined_pfn;
868
- unsigned long uninitialized_var(buddy_pfn);
869
- struct page *buddy;
8701057 unsigned int max_order;
1058
+ struct page *buddy;
1059
+ bool to_tail;
8711060
8721061 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8731062
@@ -883,6 +1072,11 @@
8831072
8841073 continue_merging:
8851074 while (order < max_order) {
1075
+ if (compaction_capture(capc, page, order, migratetype)) {
1076
+ __mod_zone_freepage_state(zone, -(1 << order),
1077
+ migratetype);
1078
+ return;
1079
+ }
8861080 buddy_pfn = __find_buddy_pfn(pfn, order);
8871081 buddy = page + (buddy_pfn - pfn);
8881082
@@ -894,13 +1088,10 @@
8941088 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
8951089 * merge with it and move up one order.
8961090 */
897
- if (page_is_guard(buddy)) {
1091
+ if (page_is_guard(buddy))
8981092 clear_page_guard(zone, buddy, order, migratetype);
899
- } else {
900
- list_del(&buddy->lru);
901
- zone->free_area[order].nr_free--;
902
- rmv_page_order(buddy);
903
- }
1093
+ else
1094
+ del_page_from_free_list(buddy, zone, order);
9041095 combined_pfn = buddy_pfn & pfn;
9051096 page = page + (combined_pfn - pfn);
9061097 pfn = combined_pfn;
@@ -932,33 +1123,23 @@
9321123 }
9331124
9341125 done_merging:
935
- set_page_order(page, order);
1126
+ set_buddy_order(page, order);
9361127
937
- /*
938
- * If this is not the largest possible page, check if the buddy
939
- * of the next-highest order is free. If it is, it's possible
940
- * that pages are being freed that will coalesce soon. In case,
941
- * that is happening, add the free page to the tail of the list
942
- * so it's less likely to be used soon and more likely to be merged
943
- * as a higher order page
944
- */
945
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
946
- struct page *higher_page, *higher_buddy;
947
- combined_pfn = buddy_pfn & pfn;
948
- higher_page = page + (combined_pfn - pfn);
949
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
950
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
951
- if (pfn_valid_within(buddy_pfn) &&
952
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
953
- list_add_tail(&page->lru,
954
- &zone->free_area[order].free_list[migratetype]);
955
- goto out;
956
- }
957
- }
1128
+ if (fpi_flags & FPI_TO_TAIL)
1129
+ to_tail = true;
1130
+ else if (is_shuffle_order(order))
1131
+ to_tail = shuffle_pick_tail();
1132
+ else
1133
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9581134
959
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
960
-out:
961
- zone->free_area[order].nr_free++;
1135
+ if (to_tail)
1136
+ add_to_free_list_tail(page, zone, order, migratetype);
1137
+ else
1138
+ add_to_free_list(page, zone, order, migratetype);
1139
+
1140
+ /* Notify page reporting subsystem of freed page */
1141
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1142
+ page_reporting_notify_free(order);
9621143 }
9631144
9641145 /*
@@ -983,13 +1164,9 @@
9831164 return true;
9841165 }
9851166
986
-static void free_pages_check_bad(struct page *page)
1167
+static const char *page_bad_reason(struct page *page, unsigned long flags)
9871168 {
988
- const char *bad_reason;
989
- unsigned long bad_flags;
990
-
991
- bad_reason = NULL;
992
- bad_flags = 0;
1169
+ const char *bad_reason = NULL;
9931170
9941171 if (unlikely(atomic_read(&page->_mapcount) != -1))
9951172 bad_reason = "nonzero mapcount";
@@ -997,24 +1174,32 @@
9971174 bad_reason = "non-NULL mapping";
9981175 if (unlikely(page_ref_count(page) != 0))
9991176 bad_reason = "nonzero _refcount";
1000
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1001
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1002
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1177
+ if (unlikely(page->flags & flags)) {
1178
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1179
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1180
+ else
1181
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10031182 }
10041183 #ifdef CONFIG_MEMCG
10051184 if (unlikely(page->mem_cgroup))
10061185 bad_reason = "page still charged to cgroup";
10071186 #endif
1008
- bad_page(page, bad_reason, bad_flags);
1187
+ return bad_reason;
10091188 }
10101189
1011
-static inline int free_pages_check(struct page *page)
1190
+static void check_free_page_bad(struct page *page)
1191
+{
1192
+ bad_page(page,
1193
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1194
+}
1195
+
1196
+static inline int check_free_page(struct page *page)
10121197 {
10131198 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10141199 return 0;
10151200
10161201 /* Something has gone sideways, find it */
1017
- free_pages_check_bad(page);
1202
+ check_free_page_bad(page);
10181203 return 1;
10191204 }
10201205
@@ -1036,7 +1221,7 @@
10361221 case 1:
10371222 /* the first tail page: ->mapping may be compound_mapcount() */
10381223 if (unlikely(compound_mapcount(page))) {
1039
- bad_page(page, "nonzero compound_mapcount", 0);
1224
+ bad_page(page, "nonzero compound_mapcount");
10401225 goto out;
10411226 }
10421227 break;
@@ -1048,17 +1233,17 @@
10481233 break;
10491234 default:
10501235 if (page->mapping != TAIL_MAPPING) {
1051
- bad_page(page, "corrupted mapping in tail page", 0);
1236
+ bad_page(page, "corrupted mapping in tail page");
10521237 goto out;
10531238 }
10541239 break;
10551240 }
10561241 if (unlikely(!PageTail(page))) {
1057
- bad_page(page, "PageTail not set", 0);
1242
+ bad_page(page, "PageTail not set");
10581243 goto out;
10591244 }
10601245 if (unlikely(compound_head(page) != head_page)) {
1061
- bad_page(page, "compound_head not consistent", 0);
1246
+ bad_page(page, "compound_head not consistent");
10621247 goto out;
10631248 }
10641249 ret = 0;
@@ -1068,25 +1253,48 @@
10681253 return ret;
10691254 }
10701255
1071
-static void kernel_init_free_pages(struct page *page, int numpages)
1256
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10721257 {
10731258 int i;
10741259
1260
+ if (zero_tags) {
1261
+ for (i = 0; i < numpages; i++)
1262
+ tag_clear_highpage(page + i);
1263
+ return;
1264
+ }
1265
+
10751266 /* s390's use of memset() could override KASAN redzones. */
10761267 kasan_disable_current();
1077
- for (i = 0; i < numpages; i++)
1268
+ for (i = 0; i < numpages; i++) {
1269
+ u8 tag = page_kasan_tag(page + i);
1270
+ page_kasan_tag_reset(page + i);
10781271 clear_highpage(page + i);
1272
+ page_kasan_tag_set(page + i, tag);
1273
+ }
10791274 kasan_enable_current();
10801275 }
10811276
10821277 static __always_inline bool free_pages_prepare(struct page *page,
1083
- unsigned int order, bool check_free)
1278
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10841279 {
10851280 int bad = 0;
1281
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10861282
10871283 VM_BUG_ON_PAGE(PageTail(page), page);
10881284
10891285 trace_mm_page_free(page, order);
1286
+
1287
+ if (unlikely(PageHWPoison(page)) && !order) {
1288
+ /*
1289
+ * Do not let hwpoison pages hit pcplists/buddy
1290
+ * Untie memcg state and reset page's owner
1291
+ */
1292
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1293
+ __memcg_kmem_uncharge_page(page, order);
1294
+ reset_page_owner(page, order);
1295
+ free_page_pinner(page, order);
1296
+ return false;
1297
+ }
10901298
10911299 /*
10921300 * Check tail pages before head page information is cleared to
@@ -1103,7 +1311,7 @@
11031311 for (i = 1; i < (1 << order); i++) {
11041312 if (compound)
11051313 bad += free_tail_pages_check(page, page + i);
1106
- if (unlikely(free_pages_check(page + i))) {
1314
+ if (unlikely(check_free_page(page + i))) {
11071315 bad++;
11081316 continue;
11091317 }
@@ -1113,15 +1321,16 @@
11131321 if (PageMappingFlags(page))
11141322 page->mapping = NULL;
11151323 if (memcg_kmem_enabled() && PageKmemcg(page))
1116
- memcg_kmem_uncharge(page, order);
1324
+ __memcg_kmem_uncharge_page(page, order);
11171325 if (check_free)
1118
- bad += free_pages_check(page);
1326
+ bad += check_free_page(page);
11191327 if (bad)
11201328 return false;
11211329
11221330 page_cpupid_reset_last(page);
11231331 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11241332 reset_page_owner(page, order);
1333
+ free_page_pinner(page, order);
11251334
11261335 if (!PageHighMem(page)) {
11271336 debug_check_no_locks_freed(page_address(page),
@@ -1129,36 +1338,77 @@
11291338 debug_check_no_obj_freed(page_address(page),
11301339 PAGE_SIZE << order);
11311340 }
1132
- arch_free_page(page, order);
1133
- if (want_init_on_free())
1134
- kernel_init_free_pages(page, 1 << order);
11351341
1136
- kernel_poison_pages(page, 1 << order, 0);
1137
- kernel_map_pages(page, 1 << order, 0);
1138
- kasan_free_nondeferred_pages(page, order);
1342
+ kernel_poison_pages(page, 1 << order);
1343
+
1344
+ /*
1345
+ * As memory initialization might be integrated into KASAN,
1346
+ * kasan_free_pages and kernel_init_free_pages must be
1347
+ * kept together to avoid discrepancies in behavior.
1348
+ *
1349
+ * With hardware tag-based KASAN, memory tags must be set before the
1350
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1351
+ */
1352
+ if (kasan_has_integrated_init()) {
1353
+ if (!skip_kasan_poison)
1354
+ kasan_free_pages(page, order);
1355
+ } else {
1356
+ bool init = want_init_on_free();
1357
+
1358
+ if (init)
1359
+ kernel_init_free_pages(page, 1 << order, false);
1360
+ if (!skip_kasan_poison)
1361
+ kasan_poison_pages(page, order, init);
1362
+ }
1363
+
1364
+ /*
1365
+ * arch_free_page() can make the page's contents inaccessible. s390
1366
+ * does this. So nothing which can access the page's contents should
1367
+ * happen after this.
1368
+ */
1369
+ arch_free_page(page, order);
1370
+
1371
+ debug_pagealloc_unmap_pages(page, 1 << order);
11391372
11401373 return true;
11411374 }
11421375
11431376 #ifdef CONFIG_DEBUG_VM
1144
-static inline bool free_pcp_prepare(struct page *page)
1145
-{
1146
- return free_pages_prepare(page, 0, true);
1147
-}
1148
-
1149
-static inline bool bulkfree_pcp_prepare(struct page *page)
1150
-{
1151
- return false;
1152
-}
1153
-#else
1377
+/*
1378
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1379
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1380
+ * moved from pcp lists to free lists.
1381
+ */
11541382 static bool free_pcp_prepare(struct page *page)
11551383 {
1156
- return free_pages_prepare(page, 0, false);
1384
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11571385 }
11581386
11591387 static bool bulkfree_pcp_prepare(struct page *page)
11601388 {
1161
- return free_pages_check(page);
1389
+ if (debug_pagealloc_enabled_static())
1390
+ return check_free_page(page);
1391
+ else
1392
+ return false;
1393
+}
1394
+#else
1395
+/*
1396
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1397
+ * moving from pcp lists to free list in order to reduce overhead. With
1398
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1399
+ * to the pcp lists.
1400
+ */
1401
+static bool free_pcp_prepare(struct page *page)
1402
+{
1403
+ if (debug_pagealloc_enabled_static())
1404
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1405
+ else
1406
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1407
+}
1408
+
1409
+static bool bulkfree_pcp_prepare(struct page *page)
1410
+{
1411
+ return check_free_page(page);
11621412 }
11631413 #endif /* CONFIG_DEBUG_VM */
11641414
@@ -1258,7 +1508,7 @@
12581508 if (unlikely(isolated_pageblocks))
12591509 mt = get_pageblock_migratetype(page);
12601510
1261
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1511
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
12621512 trace_mm_page_pcpu_drain(page, 0, mt);
12631513 }
12641514 spin_unlock(&zone->lock);
@@ -1267,14 +1517,14 @@
12671517 static void free_one_page(struct zone *zone,
12681518 struct page *page, unsigned long pfn,
12691519 unsigned int order,
1270
- int migratetype)
1520
+ int migratetype, fpi_t fpi_flags)
12711521 {
12721522 spin_lock(&zone->lock);
12731523 if (unlikely(has_isolate_pageblock(zone) ||
12741524 is_migrate_isolate(migratetype))) {
12751525 migratetype = get_pfnblock_migratetype(page, pfn);
12761526 }
1277
- __free_one_page(page, pfn, zone, order, migratetype);
1527
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
12781528 spin_unlock(&zone->lock);
12791529 }
12801530
....@@ -1348,33 +1598,50 @@
13481598 /* Avoid false-positive PageTail() */
13491599 INIT_LIST_HEAD(&page->lru);
13501600
1351
- SetPageReserved(page);
1601
+ /*
1602
+ * no need for atomic set_bit because the struct
1603
+ * page is not visible yet so nobody should
1604
+ * access it yet.
1605
+ */
1606
+ __SetPageReserved(page);
13521607 }
13531608 }
13541609 }
13551610
1356
-static void __free_pages_ok(struct page *page, unsigned int order)
1611
+static void __free_pages_ok(struct page *page, unsigned int order,
1612
+ fpi_t fpi_flags)
13571613 {
13581614 unsigned long flags;
13591615 int migratetype;
13601616 unsigned long pfn = page_to_pfn(page);
1617
+ bool skip_free_unref_page = false;
13611618
1362
- if (!free_pages_prepare(page, order, true))
1619
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13631620 return;
13641621
13651622 migratetype = get_pfnblock_migratetype(page, pfn);
1623
+ trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
1624
+ if (skip_free_unref_page)
1625
+ return;
1626
+
13661627 local_irq_save(flags);
13671628 __count_vm_events(PGFREE, 1 << order);
1368
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1629
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1630
+ fpi_flags);
13691631 local_irq_restore(flags);
13701632 }
13711633
1372
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1634
+void __free_pages_core(struct page *page, unsigned int order)
13731635 {
13741636 unsigned int nr_pages = 1 << order;
13751637 struct page *p = page;
13761638 unsigned int loop;
13771639
1640
+ /*
1641
+ * When initializing the memmap, __init_single_page() sets the refcount
1642
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1643
+ * refcount of all involved pages to 0.
1644
+ */
13781645 prefetchw(p);
13791646 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
13801647 prefetchw(p + 1);
@@ -1384,15 +1651,43 @@
13841651 __ClearPageReserved(p);
13851652 set_page_count(p, 0);
13861653
1387
- page_zone(page)->managed_pages += nr_pages;
1388
- set_page_refcounted(page);
1389
- __free_pages(page, order);
1654
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1655
+
1656
+ /*
1657
+ * Bypass PCP and place fresh pages right to the tail, primarily
1658
+ * relevant for memory onlining.
1659
+ */
1660
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
13901661 }
13911662
1392
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1393
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1663
+#ifdef CONFIG_NEED_MULTIPLE_NODES
13941664
13951665 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1666
+
1667
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1668
+
1669
+/*
1670
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1671
+ */
1672
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1673
+ struct mminit_pfnnid_cache *state)
1674
+{
1675
+ unsigned long start_pfn, end_pfn;
1676
+ int nid;
1677
+
1678
+ if (state->last_start <= pfn && pfn < state->last_end)
1679
+ return state->last_nid;
1680
+
1681
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1682
+ if (nid != NUMA_NO_NODE) {
1683
+ state->last_start = start_pfn;
1684
+ state->last_end = end_pfn;
1685
+ state->last_nid = nid;
1686
+ }
1687
+
1688
+ return nid;
1689
+}
1690
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
13961691
13971692 int __meminit early_pfn_to_nid(unsigned long pfn)
13981693 {
@@ -1407,48 +1702,14 @@
14071702
14081703 return nid;
14091704 }
1410
-#endif
1705
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14111706
1412
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1413
-static inline bool __meminit __maybe_unused
1414
-meminit_pfn_in_nid(unsigned long pfn, int node,
1415
- struct mminit_pfnnid_cache *state)
1416
-{
1417
- int nid;
1418
-
1419
- nid = __early_pfn_to_nid(pfn, state);
1420
- if (nid >= 0 && nid != node)
1421
- return false;
1422
- return true;
1423
-}
1424
-
1425
-/* Only safe to use early in boot when initialisation is single-threaded */
1426
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1427
-{
1428
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1429
-}
1430
-
1431
-#else
1432
-
1433
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1434
-{
1435
- return true;
1436
-}
1437
-static inline bool __meminit __maybe_unused
1438
-meminit_pfn_in_nid(unsigned long pfn, int node,
1439
- struct mminit_pfnnid_cache *state)
1440
-{
1441
- return true;
1442
-}
1443
-#endif
1444
-
1445
-
1446
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1707
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14471708 unsigned int order)
14481709 {
14491710 if (early_page_uninitialised(pfn))
14501711 return;
1451
- return __free_pages_boot_core(page, order);
1712
+ __free_pages_core(page, order);
14521713 }
14531714
14541715 /*
@@ -1539,14 +1800,14 @@
15391800 if (nr_pages == pageblock_nr_pages &&
15401801 (pfn & (pageblock_nr_pages - 1)) == 0) {
15411802 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1542
- __free_pages_boot_core(page, pageblock_order);
1803
+ __free_pages_core(page, pageblock_order);
15431804 return;
15441805 }
15451806
15461807 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15471808 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15481809 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1549
- __free_pages_boot_core(page, 0);
1810
+ __free_pages_core(page, 0);
15501811 }
15511812 }
15521813
@@ -1569,20 +1830,12 @@
15691830 *
15701831 * Then, we check if a current large page is valid by only checking the validity
15711832 * of the head pfn.
1572
- *
1573
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1574
- * within a node: a pfn is between start and end of a node, but does not belong
1575
- * to this memory node.
15761833 */
1577
-static inline bool __init
1578
-deferred_pfn_valid(int nid, unsigned long pfn,
1579
- struct mminit_pfnnid_cache *nid_init_state)
1834
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
15801835 {
15811836 if (!pfn_valid_within(pfn))
15821837 return false;
15831838 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1584
- return false;
1585
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
15861839 return false;
15871840 return true;
15881841 }
@@ -1591,21 +1844,19 @@
15911844 * Free pages to buddy allocator. Try to free aligned pages in
15921845 * pageblock_nr_pages sizes.
15931846 */
1594
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1847
+static void __init deferred_free_pages(unsigned long pfn,
15951848 unsigned long end_pfn)
15961849 {
1597
- struct mminit_pfnnid_cache nid_init_state = { };
15981850 unsigned long nr_pgmask = pageblock_nr_pages - 1;
15991851 unsigned long nr_free = 0;
16001852
16011853 for (; pfn < end_pfn; pfn++) {
1602
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1854
+ if (!deferred_pfn_valid(pfn)) {
16031855 deferred_free_range(pfn - nr_free, nr_free);
16041856 nr_free = 0;
16051857 } else if (!(pfn & nr_pgmask)) {
16061858 deferred_free_range(pfn - nr_free, nr_free);
16071859 nr_free = 1;
1608
- touch_nmi_watchdog();
16091860 } else {
16101861 nr_free++;
16111862 }
@@ -1619,22 +1870,22 @@
16191870 * by performing it only once every pageblock_nr_pages.
16201871 * Return number of pages initialized.
16211872 */
1622
-static unsigned long __init deferred_init_pages(int nid, int zid,
1873
+static unsigned long __init deferred_init_pages(struct zone *zone,
16231874 unsigned long pfn,
16241875 unsigned long end_pfn)
16251876 {
1626
- struct mminit_pfnnid_cache nid_init_state = { };
16271877 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1878
+ int nid = zone_to_nid(zone);
16281879 unsigned long nr_pages = 0;
1880
+ int zid = zone_idx(zone);
16291881 struct page *page = NULL;
16301882
16311883 for (; pfn < end_pfn; pfn++) {
1632
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1884
+ if (!deferred_pfn_valid(pfn)) {
16331885 page = NULL;
16341886 continue;
16351887 } else if (!page || !(pfn & nr_pgmask)) {
16361888 page = pfn_to_page(pfn);
1637
- touch_nmi_watchdog();
16381889 } else {
16391890 page++;
16401891 }
@@ -1644,18 +1895,127 @@
16441895 return (nr_pages);
16451896 }
16461897
1898
+/*
1899
+ * This function is meant to pre-load the iterator for the zone init.
1900
+ * Specifically it walks through the ranges until we are caught up to the
1901
+ * first_init_pfn value and exits there. If we never encounter the value we
1902
+ * return false indicating there are no valid ranges left.
1903
+ */
1904
+static bool __init
1905
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1906
+ unsigned long *spfn, unsigned long *epfn,
1907
+ unsigned long first_init_pfn)
1908
+{
1909
+ u64 j;
1910
+
1911
+ /*
1912
+ * Start out by walking through the ranges in this zone that have
1913
+ * already been initialized. We don't need to do anything with them
1914
+ * so we just need to flush them out of the system.
1915
+ */
1916
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1917
+ if (*epfn <= first_init_pfn)
1918
+ continue;
1919
+ if (*spfn < first_init_pfn)
1920
+ *spfn = first_init_pfn;
1921
+ *i = j;
1922
+ return true;
1923
+ }
1924
+
1925
+ return false;
1926
+}
1927
+
1928
+/*
1929
+ * Initialize and free pages. We do it in two loops: first we initialize
1930
+ * struct page, then free to buddy allocator, because while we are
1931
+ * freeing pages we can access pages that are ahead (computing buddy
1932
+ * page in __free_one_page()).
1933
+ *
1934
+ * In order to try and keep some memory in the cache we have the loop
1935
+ * broken along max page order boundaries. This way we will not cause
1936
+ * any issues with the buddy page computation.
1937
+ */
1938
+static unsigned long __init
1939
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1940
+ unsigned long *end_pfn)
1941
+{
1942
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1943
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1944
+ unsigned long nr_pages = 0;
1945
+ u64 j = *i;
1946
+
1947
+ /* First we loop through and initialize the page values */
1948
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1949
+ unsigned long t;
1950
+
1951
+ if (mo_pfn <= *start_pfn)
1952
+ break;
1953
+
1954
+ t = min(mo_pfn, *end_pfn);
1955
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1956
+
1957
+ if (mo_pfn < *end_pfn) {
1958
+ *start_pfn = mo_pfn;
1959
+ break;
1960
+ }
1961
+ }
1962
+
1963
+ /* Reset values and now loop through freeing pages as needed */
1964
+ swap(j, *i);
1965
+
1966
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1967
+ unsigned long t;
1968
+
1969
+ if (mo_pfn <= spfn)
1970
+ break;
1971
+
1972
+ t = min(mo_pfn, epfn);
1973
+ deferred_free_pages(spfn, t);
1974
+
1975
+ if (mo_pfn <= epfn)
1976
+ break;
1977
+ }
1978
+
1979
+ return nr_pages;
1980
+}
1981
+
1982
+static void __init
1983
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
1984
+ void *arg)
1985
+{
1986
+ unsigned long spfn, epfn;
1987
+ struct zone *zone = arg;
1988
+ u64 i;
1989
+
1990
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
1991
+
1992
+ /*
1993
+ * Initialize and free pages in MAX_ORDER sized increments so that we
1994
+ * can avoid introducing any issues with the buddy allocator.
1995
+ */
1996
+ while (spfn < end_pfn) {
1997
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
1998
+ cond_resched();
1999
+ }
2000
+}
2001
+
2002
+/* An arch may override for more concurrency. */
2003
+__weak int __init
2004
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2005
+{
2006
+ return 1;
2007
+}
2008
+
16472009 /* Initialise remaining memory on a node */
16482010 static int __init deferred_init_memmap(void *data)
16492011 {
16502012 pg_data_t *pgdat = data;
1651
- int nid = pgdat->node_id;
1652
- unsigned long start = jiffies;
1653
- unsigned long nr_pages = 0;
1654
- unsigned long spfn, epfn, first_init_pfn, flags;
1655
- phys_addr_t spa, epa;
1656
- int zid;
1657
- struct zone *zone;
16582013 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2014
+ unsigned long spfn = 0, epfn = 0;
2015
+ unsigned long first_init_pfn, flags;
2016
+ unsigned long start = jiffies;
2017
+ struct zone *zone;
2018
+ int zid, max_threads;
16592019 u64 i;
16602020
16612021 /* Bind memory initialisation thread to a local node if possible */
....@@ -1688,30 +2048,36 @@
16882048 if (first_init_pfn < zone_end_pfn(zone))
16892049 break;
16902050 }
1691
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
16922051
1693
- /*
1694
- * Initialize and free pages. We do it in two loops: first we initialize
1695
- * struct page, than free to buddy allocator, because while we are
1696
- * freeing pages we can access pages that are ahead (computing buddy
1697
- * page in __free_one_page()).
1698
- */
1699
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1700
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1701
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1702
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1703
- }
1704
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1705
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1706
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1707
- deferred_free_pages(nid, zid, spfn, epfn);
1708
- }
2052
+ /* If the zone is empty somebody else may have cleared out the zone */
2053
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2054
+ first_init_pfn))
2055
+ goto zone_empty;
17092056
2057
+ max_threads = deferred_page_init_max_threads(cpumask);
2058
+
2059
+ while (spfn < epfn) {
2060
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2061
+ struct padata_mt_job job = {
2062
+ .thread_fn = deferred_init_memmap_chunk,
2063
+ .fn_arg = zone,
2064
+ .start = spfn,
2065
+ .size = epfn_align - spfn,
2066
+ .align = PAGES_PER_SECTION,
2067
+ .min_chunk = PAGES_PER_SECTION,
2068
+ .max_threads = max_threads,
2069
+ };
2070
+
2071
+ padata_do_multithreaded(&job);
2072
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2073
+ epfn_align);
2074
+ }
2075
+zone_empty:
17102076 /* Sanity check that the next zone really is unpopulated */
17112077 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17122078
1713
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1714
- jiffies_to_msecs(jiffies - start));
2079
+ pr_info("node %d deferred pages initialised in %ums\n",
2080
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17152081
17162082 pgdat_init_report_one_done();
17172083 return 0;
....@@ -1735,14 +2101,11 @@
17352101 static noinline bool __init
17362102 deferred_grow_zone(struct zone *zone, unsigned int order)
17372103 {
1738
- int zid = zone_idx(zone);
1739
- int nid = zone_to_nid(zone);
1740
- pg_data_t *pgdat = NODE_DATA(nid);
17412104 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1742
- unsigned long nr_pages = 0;
1743
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2105
+ pg_data_t *pgdat = zone->zone_pgdat;
17442106 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1745
- phys_addr_t spa, epa;
2107
+ unsigned long spfn, epfn, flags;
2108
+ unsigned long nr_pages = 0;
17462109 u64 i;
17472110
17482111 /* Only the last zone may have deferred pages */
....@@ -1760,38 +2123,37 @@
17602123 return true;
17612124 }
17622125
1763
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1764
-
1765
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2126
+ /* If the zone is empty somebody else may have cleared out the zone */
2127
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2128
+ first_deferred_pfn)) {
2129
+ pgdat->first_deferred_pfn = ULONG_MAX;
17662130 pgdat_resize_unlock(pgdat, &flags);
1767
- return false;
2131
+ /* Retry only once. */
2132
+ return first_deferred_pfn != ULONG_MAX;
17682133 }
17692134
1770
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1771
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1772
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2135
+ /*
2136
+ * Initialize and free pages in MAX_ORDER sized increments so
2137
+ * that we can avoid introducing any issues with the buddy
2138
+ * allocator.
2139
+ */
2140
+ while (spfn < epfn) {
2141
+ /* update our first deferred PFN for this section */
2142
+ first_deferred_pfn = spfn;
17732143
1774
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1775
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1776
- first_deferred_pfn = min(t, epfn);
1777
- nr_pages += deferred_init_pages(nid, zid, spfn,
1778
- first_deferred_pfn);
1779
- spfn = first_deferred_pfn;
1780
- }
2144
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2145
+ touch_nmi_watchdog();
17812146
2147
+ /* We should only stop along section boundaries */
2148
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2149
+ continue;
2150
+
2151
+ /* If our quota has been met we can stop here */
17822152 if (nr_pages >= nr_pages_needed)
17832153 break;
17842154 }
17852155
1786
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1787
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1788
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1789
- deferred_free_pages(nid, zid, spfn, epfn);
1790
-
1791
- if (first_deferred_pfn == epfn)
1792
- break;
1793
- }
1794
- pgdat->first_deferred_pfn = first_deferred_pfn;
2156
+ pgdat->first_deferred_pfn = spfn;
17952157 pgdat_resize_unlock(pgdat, &flags);
17962158
17972159 return nr_pages > 0;
....@@ -1814,9 +2176,9 @@
18142176 void __init page_alloc_init_late(void)
18152177 {
18162178 struct zone *zone;
2179
+ int nid;
18172180
18182181 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1819
- int nid;
18202182
18212183 /* There will be num_node_state(N_MEMORY) threads */
18222184 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1844,10 +2206,12 @@
18442206 /* Reinit limits that are based on free pages after the kernel is up */
18452207 files_maxfiles_init();
18462208 #endif
1847
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2209
+
18482210 /* Discard memblock private memory */
18492211 memblock_discard();
1850
-#endif
2212
+
2213
+ for_each_node_state(nid, N_MEMORY)
2214
+ shuffle_free_memory(NODE_DATA(nid));
18512215
18522216 for_each_populated_zone(zone)
18532217 set_zone_contiguous(zone);
....@@ -1881,6 +2245,7 @@
18812245 }
18822246
18832247 adjust_managed_page_count(page, pageblock_nr_pages);
2248
+ page_zone(page)->cma_pages += pageblock_nr_pages;
18842249 }
18852250 #endif
18862251
....@@ -1899,13 +2264,11 @@
18992264 * -- nyc
19002265 */
19012266 static inline void expand(struct zone *zone, struct page *page,
1902
- int low, int high, struct free_area *area,
1903
- int migratetype)
2267
+ int low, int high, int migratetype)
19042268 {
19052269 unsigned long size = 1 << high;
19062270
19072271 while (high > low) {
1908
- area--;
19092272 high--;
19102273 size >>= 1;
19112274 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1919,39 +2282,21 @@
19192282 if (set_page_guard(zone, &page[size], high, migratetype))
19202283 continue;
19212284
1922
- list_add(&page[size].lru, &area->free_list[migratetype]);
1923
- area->nr_free++;
1924
- set_page_order(&page[size], high);
2285
+ add_to_free_list(&page[size], zone, high, migratetype);
2286
+ set_buddy_order(&page[size], high);
19252287 }
19262288 }
19272289
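
To make the split arithmetic in expand() concrete: serving an order-0 request out of an order-3 block returns the upper halves to the free lists as one order-2, one order-1 and one order-0 buddy. A standalone sketch of the same loop (userspace illustration only, not kernel code):

#include <stdio.h>

/* Mirrors expand()'s while loop: report which buddies go back on free lists. */
static void sketch_expand(unsigned int low, unsigned int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put %lu pages back at order %u\n", size, high);
	}
}

int main(void)
{
	sketch_expand(0, 3);	/* prints 4 pages @ order 2, 2 @ order 1, 1 @ order 0 */
	return 0;
}
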
19282290 static void check_new_page_bad(struct page *page)
19292291 {
1930
- const char *bad_reason = NULL;
1931
- unsigned long bad_flags = 0;
1932
-
1933
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1934
- bad_reason = "nonzero mapcount";
1935
- if (unlikely(page->mapping != NULL))
1936
- bad_reason = "non-NULL mapping";
1937
- if (unlikely(page_ref_count(page) != 0))
1938
- bad_reason = "nonzero _count";
19392292 if (unlikely(page->flags & __PG_HWPOISON)) {
1940
- bad_reason = "HWPoisoned (hardware-corrupted)";
1941
- bad_flags = __PG_HWPOISON;
19422293 /* Don't complain about hwpoisoned pages */
19432294 page_mapcount_reset(page); /* remove PageBuddy */
19442295 return;
19452296 }
1946
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1947
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1948
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1949
- }
1950
-#ifdef CONFIG_MEMCG
1951
- if (unlikely(page->mem_cgroup))
1952
- bad_reason = "page still charged to cgroup";
1953
-#endif
1954
- bad_page(page, bad_reason, bad_flags);
2297
+
2298
+ bad_page(page,
2299
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19552300 }
19562301
19572302 /*
....@@ -1967,30 +2312,40 @@
19672312 return 1;
19682313 }
19692314
1970
-static inline bool free_pages_prezeroed(void)
1971
-{
1972
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1973
- page_poisoning_enabled()) || want_init_on_free();
1974
-}
1975
-
19762315 #ifdef CONFIG_DEBUG_VM
1977
-static bool check_pcp_refill(struct page *page)
2316
+/*
2317
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2318
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2319
+ * also checked when pcp lists are refilled from the free lists.
2320
+ */
2321
+static inline bool check_pcp_refill(struct page *page)
19782322 {
1979
- return false;
2323
+ if (debug_pagealloc_enabled_static())
2324
+ return check_new_page(page);
2325
+ else
2326
+ return false;
19802327 }
19812328
1982
-static bool check_new_pcp(struct page *page)
2329
+static inline bool check_new_pcp(struct page *page)
19832330 {
19842331 return check_new_page(page);
19852332 }
19862333 #else
1987
-static bool check_pcp_refill(struct page *page)
2334
+/*
2335
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2336
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2337
+ * enabled, they are also checked when being allocated from the pcp lists.
2338
+ */
2339
+static inline bool check_pcp_refill(struct page *page)
19882340 {
19892341 return check_new_page(page);
19902342 }
1991
-static bool check_new_pcp(struct page *page)
2343
+static inline bool check_new_pcp(struct page *page)
19922344 {
1993
- return false;
2345
+ if (debug_pagealloc_enabled_static())
2346
+ return check_new_page(page);
2347
+ else
2348
+ return false;
19942349 }
19952350 #endif /* CONFIG_DEBUG_VM */
19962351
....@@ -2014,9 +2369,31 @@
20142369 set_page_refcounted(page);
20152370
20162371 arch_alloc_page(page, order);
2017
- kernel_map_pages(page, 1 << order, 1);
2018
- kasan_alloc_pages(page, order);
2019
- kernel_poison_pages(page, 1 << order, 1);
2372
+ debug_pagealloc_map_pages(page, 1 << order);
2373
+
2374
+ /*
2375
+ * Page unpoisoning must happen before memory initialization.
2376
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2377
+ * allocations and the page unpoisoning code will complain.
2378
+ */
2379
+ kernel_unpoison_pages(page, 1 << order);
2380
+
2381
+ /*
2382
+ * As memory initialization might be integrated into KASAN,
2383
+ * kasan_alloc_pages and kernel_init_free_pages must be
2384
+ * kept together to avoid discrepancies in behavior.
2385
+ */
2386
+ if (kasan_has_integrated_init()) {
2387
+ kasan_alloc_pages(page, order, gfp_flags);
2388
+ } else {
2389
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2390
+
2391
+ kasan_unpoison_pages(page, order, init);
2392
+ if (init)
2393
+ kernel_init_free_pages(page, 1 << order,
2394
+ gfp_flags & __GFP_ZEROTAGS);
2395
+ }
2396
+
20202397 set_page_owner(page, order, gfp_flags);
20212398 }
20222399
....@@ -2024,9 +2401,6 @@
20242401 unsigned int alloc_flags)
20252402 {
20262403 post_alloc_hook(page, order, gfp_flags);
2027
-
2028
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2029
- kernel_init_free_pages(page, 1 << order);
20302404
20312405 if (order && (gfp_flags & __GFP_COMP))
20322406 prep_compound_page(page, order);
....@@ -2041,6 +2415,7 @@
20412415 set_page_pfmemalloc(page);
20422416 else
20432417 clear_page_pfmemalloc(page);
2418
+ trace_android_vh_test_clear_look_around_ref(page);
20442419 }
20452420
20462421 /*
....@@ -2058,14 +2433,11 @@
20582433 /* Find a page of the appropriate size in the preferred list */
20592434 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20602435 area = &(zone->free_area[current_order]);
2061
- page = list_first_entry_or_null(&area->free_list[migratetype],
2062
- struct page, lru);
2436
+ page = get_page_from_free_area(area, migratetype);
20632437 if (!page)
20642438 continue;
2065
- list_del(&page->lru);
2066
- rmv_page_order(page);
2067
- area->nr_free--;
2068
- expand(zone, page, order, current_order, area, migratetype);
2439
+ del_page_from_free_list(page, zone, current_order);
2440
+ expand(zone, page, order, current_order, migratetype);
20692441 set_pcppage_migratetype(page, migratetype);
20702442 return page;
20712443 }
....@@ -2078,10 +2450,10 @@
20782450 * This array describes the order lists are fallen back to when
20792451 * the free lists for the desirable migrate type are depleted
20802452 */
2081
-static int fallbacks[MIGRATE_TYPES][4] = {
2453
+static int fallbacks[MIGRATE_TYPES][3] = {
20822454 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2083
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20842455 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2456
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20852457 #ifdef CONFIG_CMA
20862458 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
20872459 #endif
....@@ -2102,7 +2474,7 @@
21022474 #endif
21032475
21042476 /*
2105
- * Move the free pages in a range to the free lists of the requested type.
2477
+ * Move the free pages in a range to the freelist tail of the requested type.
21062478 * Note that start_page and end_pages are not aligned on a pageblock
21072479 * boundary. If alignment is required, use move_freepages_block()
21082480 */
....@@ -2114,30 +2486,11 @@
21142486 unsigned int order;
21152487 int pages_moved = 0;
21162488
2117
-#ifndef CONFIG_HOLES_IN_ZONE
2118
- /*
2119
- * page_zone is not safe to call in this context when
2120
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2121
- * anyway as we check zone boundaries in move_freepages_block().
2122
- * Remove at a later date when no bug reports exist related to
2123
- * grouping pages by mobility
2124
- */
2125
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2126
- pfn_valid(page_to_pfn(end_page)) &&
2127
- page_zone(start_page) != page_zone(end_page));
2128
-#endif
2129
-
2130
- if (num_movable)
2131
- *num_movable = 0;
2132
-
21332489 for (page = start_page; page <= end_page;) {
21342490 if (!pfn_valid_within(page_to_pfn(page))) {
21352491 page++;
21362492 continue;
21372493 }
2138
-
2139
- /* Make sure we are not inadvertently changing nodes */
2140
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21412494
21422495 if (!PageBuddy(page)) {
21432496 /*
....@@ -2153,9 +2506,12 @@
21532506 continue;
21542507 }
21552508
2156
- order = page_order(page);
2157
- list_move(&page->lru,
2158
- &zone->free_area[order].free_list[migratetype]);
2509
+ /* Make sure we are not inadvertently changing nodes */
2510
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2511
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2512
+
2513
+ order = buddy_order(page);
2514
+ move_to_free_list(page, zone, order, migratetype);
21592515 page += 1 << order;
21602516 pages_moved += 1 << order;
21612517 }
....@@ -2168,6 +2524,9 @@
21682524 {
21692525 unsigned long start_pfn, end_pfn;
21702526 struct page *start_page, *end_page;
2527
+
2528
+ if (num_movable)
2529
+ *num_movable = 0;
21712530
21722531 start_pfn = page_to_pfn(page);
21732532 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2229,6 +2588,43 @@
22292588 return false;
22302589 }
22312590
2591
+static inline bool boost_watermark(struct zone *zone)
2592
+{
2593
+ unsigned long max_boost;
2594
+
2595
+ if (!watermark_boost_factor)
2596
+ return false;
2597
+ /*
2598
+ * Don't bother in zones that are unlikely to produce results.
2599
+ * On small machines, including kdump capture kernels running
2600
+ * in a small area, boosting the watermark can cause an out of
2601
+ * memory situation immediately.
2602
+ */
2603
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2604
+ return false;
2605
+
2606
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2607
+ watermark_boost_factor, 10000);
2608
+
2609
+ /*
2610
+ * high watermark may be uninitialised if fragmentation occurs
2611
+ * very early in boot so do not boost. We do not fall
2612
+ * through and boost by pageblock_nr_pages as failing
2613
+ * allocations that early means that reclaim is not going
2614
+ * to help and it may even be impossible to reclaim the
2615
+ * boosted watermark resulting in a hang.
2616
+ */
2617
+ if (!max_boost)
2618
+ return false;
2619
+
2620
+ max_boost = max(pageblock_nr_pages, max_boost);
2621
+
2622
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2623
+ max_boost);
2624
+
2625
+ return true;
2626
+}
2627
+
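
A worked example of the arithmetic in boost_watermark(), using hypothetical numbers (a zone with WMARK_HIGH of 10000 pages, the default watermark_boost_factor of 15000, and 512-page pageblocks): max_boost works out to 10000 * 15000 / 10000 = 15000 pages, and each fragmentation event raises watermark_boost by one pageblock until that cap is hit. The same sum as a standalone sketch:

#include <stdio.h>

/* Sketch of boost_watermark()'s arithmetic with made-up zone values. */
int main(void)
{
	unsigned long wmark_high = 10000;    /* pages; hypothetical zone */
	unsigned long boost_factor = 15000;  /* default watermark_boost_factor */
	unsigned long pageblock_pages = 512; /* typical 4K-page x86-64 pageblock */
	unsigned long max_boost = wmark_high * boost_factor / 10000;
	unsigned long boost = 0;

	if (max_boost < pageblock_pages)
		max_boost = pageblock_pages;

	/* one fallback event: bump by a pageblock, capped at max_boost */
	boost += pageblock_pages;
	if (boost > max_boost)
		boost = max_boost;

	printf("max_boost=%lu pages, boost after one event=%lu pages\n",
	       max_boost, boost);
	return 0;
}
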
22322628 /*
22332629 * This function implements actual steal behaviour. If order is large enough,
22342630 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2238,10 +2634,9 @@
22382634 * itself, so pages freed in the future will be put on the correct free list.
22392635 */
22402636 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2241
- int start_type, bool whole_block)
2637
+ unsigned int alloc_flags, int start_type, bool whole_block)
22422638 {
2243
- unsigned int current_order = page_order(page);
2244
- struct free_area *area;
2639
+ unsigned int current_order = buddy_order(page);
22452640 int free_pages, movable_pages, alike_pages;
22462641 int old_block_type;
22472642
....@@ -2259,6 +2654,14 @@
22592654 change_pageblock_range(page, current_order, start_type);
22602655 goto single_page;
22612656 }
2657
+
2658
+ /*
2659
+ * Boost watermarks to increase reclaim pressure to reduce the
2660
+ * likelihood of future fallbacks. Wake kswapd now as the node
2661
+ * may be balanced overall and kswapd will not wake naturally.
2662
+ */
2663
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2664
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22622665
22632666 /* We are not allowed to try stealing from the whole block */
22642667 if (!whole_block)
....@@ -2303,8 +2706,7 @@
23032706 return;
23042707
23052708 single_page:
2306
- area = &zone->free_area[current_order];
2307
- list_move(&page->lru, &area->free_list[start_type]);
2709
+ move_to_free_list(page, zone, current_order, start_type);
23082710 }
23092711
23102712 /*
....@@ -2328,7 +2730,7 @@
23282730 if (fallback_mt == MIGRATE_TYPES)
23292731 break;
23302732
2331
- if (list_empty(&area->free_list[fallback_mt]))
2733
+ if (free_area_empty(area, fallback_mt))
23322734 continue;
23332735
23342736 if (can_steal_fallback(order, migratetype))
....@@ -2358,7 +2760,7 @@
23582760 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23592761 * Check is race-prone but harmless.
23602762 */
2361
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2763
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23622764 if (zone->nr_reserved_highatomic >= max_managed)
23632765 return;
23642766
....@@ -2400,8 +2802,9 @@
24002802 struct page *page;
24012803 int order;
24022804 bool ret;
2805
+ bool skip_unreserve_highatomic = false;
24032806
2404
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2807
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24052808 ac->nodemask) {
24062809 /*
24072810 * Preserve at least one pageblock unless memory pressure
....@@ -2411,13 +2814,16 @@
24112814 pageblock_nr_pages)
24122815 continue;
24132816
2817
+ trace_android_vh_unreserve_highatomic_bypass(force, zone,
2818
+ &skip_unreserve_highatomic);
2819
+ if (skip_unreserve_highatomic)
2820
+ continue;
2821
+
24142822 spin_lock_irqsave(&zone->lock, flags);
24152823 for (order = 0; order < MAX_ORDER; order++) {
24162824 struct free_area *area = &(zone->free_area[order]);
24172825
2418
- page = list_first_entry_or_null(
2419
- &area->free_list[MIGRATE_HIGHATOMIC],
2420
- struct page, lru);
2826
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24212827 if (!page)
24222828 continue;
24232829
....@@ -2475,20 +2881,30 @@
24752881 * condition simpler.
24762882 */
24772883 static __always_inline bool
2478
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2884
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2885
+ unsigned int alloc_flags)
24792886 {
24802887 struct free_area *area;
24812888 int current_order;
2889
+ int min_order = order;
24822890 struct page *page;
24832891 int fallback_mt;
24842892 bool can_steal;
2893
+
2894
+ /*
2895
+ * Do not steal pages from freelists belonging to other pageblocks
2896
+ * i.e. orders < pageblock_order. If there are no local zones free,
2897
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2898
+ */
2899
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2900
+ min_order = pageblock_order;
24852901
24862902 /*
24872903 * Find the largest available free page in the other list. This roughly
24882904 * approximates finding the pageblock with the most free pages, which
24892905 * would be too costly to do exactly.
24902906 */
2491
- for (current_order = MAX_ORDER - 1; current_order >= order;
2907
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
24922908 --current_order) {
24932909 area = &(zone->free_area[current_order]);
24942910 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2530,10 +2946,10 @@
25302946 VM_BUG_ON(current_order == MAX_ORDER);
25312947
25322948 do_steal:
2533
- page = list_first_entry(&area->free_list[fallback_mt],
2534
- struct page, lru);
2949
+ page = get_page_from_free_area(area, fallback_mt);
25352950
2536
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2951
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2952
+ can_steal);
25372953
25382954 trace_mm_page_alloc_extfrag(page, order, current_order,
25392955 start_migratetype, fallback_mt);
....@@ -2547,14 +2963,16 @@
25472963 * Call me with the zone->lock already held.
25482964 */
25492965 static __always_inline struct page *
2550
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2966
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2967
+ unsigned int alloc_flags)
25512968 {
25522969 struct page *page;
25532970
25542971 retry:
25552972 page = __rmqueue_smallest(zone, order, migratetype);
25562973
2557
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2974
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2975
+ alloc_flags))
25582976 goto retry;
25592977
25602978 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2562,18 +2980,18 @@
25622980 }
25632981
25642982 #ifdef CONFIG_CMA
2565
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2983
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2984
+ int migratetype,
2985
+ unsigned int alloc_flags)
25662986 {
2567
- struct page *page = 0;
2568
-
2569
- if (IS_ENABLED(CONFIG_CMA))
2570
- if (!zone->cma_alloc)
2571
- page = __rmqueue_cma_fallback(zone, order);
2987
+ struct page *page = __rmqueue_cma_fallback(zone, order);
25722988 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
25732989 return page;
25742990 }
25752991 #else
2576
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
2992
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
2993
+ int migratetype,
2994
+ unsigned int alloc_flags)
25772995 {
25782996 return NULL;
25792997 }
....@@ -2586,7 +3004,7 @@
25863004 */
25873005 static int rmqueue_bulk(struct zone *zone, unsigned int order,
25883006 unsigned long count, struct list_head *list,
2589
- int migratetype)
3007
+ int migratetype, unsigned int alloc_flags)
25903008 {
25913009 int i, alloced = 0;
25923010
....@@ -2594,15 +3012,11 @@
25943012 for (i = 0; i < count; ++i) {
25953013 struct page *page;
25963014
2597
- /*
2598
- * If migrate type CMA is being requested only try to
2599
- * satisfy the request with CMA pages to try and increase
2600
- * CMA utlization.
2601
- */
26023015 if (is_migrate_cma(migratetype))
2603
- page = __rmqueue_cma(zone, order);
3016
+ page = __rmqueue_cma(zone, order, migratetype,
3017
+ alloc_flags);
26043018 else
2605
- page = __rmqueue(zone, order, migratetype);
3019
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26063020
26073021 if (unlikely(page == NULL))
26083022 break;
....@@ -2645,14 +3059,18 @@
26453059 */
26463060 static struct list_head *get_populated_pcp_list(struct zone *zone,
26473061 unsigned int order, struct per_cpu_pages *pcp,
2648
- int migratetype)
3062
+ int migratetype, unsigned int alloc_flags)
26493063 {
26503064 struct list_head *list = &pcp->lists[migratetype];
26513065
26523066 if (list_empty(list)) {
3067
+ trace_android_vh_rmqueue_bulk_bypass(order, pcp, migratetype, list);
3068
+ if (!list_empty(list))
3069
+ return list;
3070
+
26533071 pcp->count += rmqueue_bulk(zone, order,
26543072 pcp->batch, list,
2655
- migratetype);
3073
+ migratetype, alloc_flags);
26563074
26573075 if (list_empty(list))
26583076 list = NULL;
....@@ -2739,6 +3157,10 @@
27393157
27403158 static void drain_local_pages_wq(struct work_struct *work)
27413159 {
3160
+ struct pcpu_drain *drain;
3161
+
3162
+ drain = container_of(work, struct pcpu_drain, work);
3163
+
27423164 /*
27433165 * drain_all_pages doesn't use proper cpu hotplug protection so
27443166 * we can race with cpu offline when the WQ can move this from
....@@ -2747,7 +3169,7 @@
27473169 * a different one.
27483170 */
27493171 preempt_disable();
2750
- drain_local_pages(NULL);
3172
+ drain_local_pages(drain->zone);
27513173 preempt_enable();
27523174 }
27533175
....@@ -2818,12 +3240,14 @@
28183240 }
28193241
28203242 for_each_cpu(cpu, &cpus_with_pcps) {
2821
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2822
- INIT_WORK(work, drain_local_pages_wq);
2823
- queue_work_on(cpu, mm_percpu_wq, work);
3243
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3244
+
3245
+ drain->zone = zone;
3246
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3247
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28243248 }
28253249 for_each_cpu(cpu, &cpus_with_pcps)
2826
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
3250
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28273251
28283252 mutex_unlock(&pcpu_drain_mutex);
28293253 }
....@@ -2900,6 +3324,7 @@
29003324 struct zone *zone = page_zone(page);
29013325 struct per_cpu_pages *pcp;
29023326 int migratetype;
3327
+ bool pcp_skip_cma_pages = false;
29033328
29043329 migratetype = get_pcppage_migratetype(page);
29053330 __count_vm_event(PGFREE);
....@@ -2912,8 +3337,12 @@
29123337 * excessively into the page allocator
29133338 */
29143339 if (migratetype >= MIGRATE_PCPTYPES) {
2915
- if (unlikely(is_migrate_isolate(migratetype))) {
2916
- free_one_page(zone, page, pfn, 0, migratetype);
3340
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3341
+ &pcp_skip_cma_pages);
3342
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3343
+ pcp_skip_cma_pages) {
3344
+ free_one_page(zone, page, pfn, 0, migratetype,
3345
+ FPI_NONE);
29173346 return;
29183347 }
29193348 migratetype = MIGRATE_MOVABLE;
....@@ -2935,8 +3364,15 @@
29353364 {
29363365 unsigned long flags;
29373366 unsigned long pfn = page_to_pfn(page);
3367
+ int migratetype;
3368
+ bool skip_free_unref_page = false;
29383369
29393370 if (!free_unref_page_prepare(page, pfn))
3371
+ return;
3372
+
3373
+ migratetype = get_pfnblock_migratetype(page, pfn);
3374
+ trace_android_vh_free_unref_page_bypass(page, 0, migratetype, &skip_free_unref_page);
3375
+ if (skip_free_unref_page)
29403376 return;
29413377
29423378 local_irq_save(flags);
....@@ -2999,7 +3435,8 @@
29993435
30003436 for (i = 1; i < (1 << order); i++)
30013437 set_page_refcounted(page + i);
3002
- split_page_owner(page, order);
3438
+ split_page_owner(page, 1 << order);
3439
+ split_page_memcg(page, 1 << order);
30033440 }
30043441 EXPORT_SYMBOL_GPL(split_page);
30053442
....@@ -3021,7 +3458,7 @@
30213458 * watermark, because we already know our high-order page
30223459 * exists.
30233460 */
3024
- watermark = min_wmark_pages(zone) + (1UL << order);
3461
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
30253462 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
30263463 return 0;
30273464
....@@ -3029,9 +3466,8 @@
30293466 }
30303467
30313468 /* Remove page from free list */
3032
- list_del(&page->lru);
3033
- zone->free_area[order].nr_free--;
3034
- rmv_page_order(page);
3469
+
3470
+ del_page_from_free_list(page, zone, order);
30353471
30363472 /*
30373473 * Set the pageblock if the isolated page is at least half of a
....@@ -3050,6 +3486,27 @@
30503486
30513487
30523488 return 1UL << order;
3489
+}
3490
+
3491
+/**
3492
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3493
+ * @page: Page that was isolated
3494
+ * @order: Order of the isolated page
3495
+ * @mt: The page's pageblock's migratetype
3496
+ *
3497
+ * This function is meant to return a page pulled from the free lists via
3498
+ * __isolate_free_page back to the free lists they were pulled from.
3499
+ */
3500
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3501
+{
3502
+ struct zone *zone = page_zone(page);
3503
+
3504
+ /* zone lock should be held when this function is called */
3505
+ lockdep_assert_held(&zone->lock);
3506
+
3507
+ /* Return isolated page to tail of freelist. */
3508
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3509
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
30533510 }
30543511
30553512 /*
....@@ -3081,6 +3538,7 @@
30813538
30823539 /* Remove page from the per-cpu list, caller must protect the list */
30833540 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3541
+ unsigned int alloc_flags,
30843542 struct per_cpu_pages *pcp,
30853543 gfp_t gfp_flags)
30863544 {
....@@ -3090,9 +3548,9 @@
30903548 do {
30913549 /* First try to get CMA pages */
30923550 if (migratetype == MIGRATE_MOVABLE &&
3093
- gfp_flags & __GFP_CMA) {
3551
+ alloc_flags & ALLOC_CMA) {
30943552 list = get_populated_pcp_list(zone, 0, pcp,
3095
- get_cma_migrate_type());
3553
+ get_cma_migrate_type(), alloc_flags);
30963554 }
30973555
30983556 if (list == NULL) {
....@@ -3101,7 +3559,7 @@
31013559 * free CMA pages.
31023560 */
31033561 list = get_populated_pcp_list(zone, 0, pcp,
3104
- migratetype);
3562
+ migratetype, alloc_flags);
31053563 if (unlikely(list == NULL) ||
31063564 unlikely(list_empty(list)))
31073565 return NULL;
....@@ -3117,8 +3575,8 @@
31173575
31183576 /* Lock and remove page from the per-cpu list */
31193577 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3120
- struct zone *zone, unsigned int order,
3121
- gfp_t gfp_flags, int migratetype)
3578
+ struct zone *zone, gfp_t gfp_flags,
3579
+ int migratetype, unsigned int alloc_flags)
31223580 {
31233581 struct per_cpu_pages *pcp;
31243582 struct page *page;
....@@ -3126,10 +3584,10 @@
31263584
31273585 local_irq_save(flags);
31283586 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3129
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3587
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
31303588 gfp_flags);
31313589 if (page) {
3132
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3590
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
31333591 zone_statistics(preferred_zone, zone);
31343592 }
31353593 local_irq_restore(flags);
....@@ -3149,8 +3607,8 @@
31493607 struct page *page;
31503608
31513609 if (likely(order == 0)) {
3152
- page = rmqueue_pcplist(preferred_zone, zone, order,
3153
- gfp_flags, migratetype);
3610
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3611
+ migratetype, alloc_flags);
31543612 goto out;
31553613 }
31563614
....@@ -3163,21 +3621,27 @@
31633621
31643622 do {
31653623 page = NULL;
3166
-
3167
- if (alloc_flags & ALLOC_HARDER) {
3624
+ /*
3625
+ * order-0 request can reach here when the pcplist is skipped
3626
+ * due to non-CMA allocation context. HIGHATOMIC area is
3627
+ * reserved for high-order atomic allocation, so order-0
3628
+ * request should skip it.
3629
+ */
3630
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
31683631 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
31693632 if (page)
31703633 trace_mm_page_alloc_zone_locked(page, order, migratetype);
31713634 }
3172
-
3173
- if (!page && migratetype == MIGRATE_MOVABLE &&
3174
- gfp_flags & __GFP_CMA)
3175
- page = __rmqueue_cma(zone, order);
3176
-
3177
- if (!page)
3178
- page = __rmqueue(zone, order, migratetype);
3635
+ if (!page) {
3636
+ if (migratetype == MIGRATE_MOVABLE &&
3637
+ alloc_flags & ALLOC_CMA)
3638
+ page = __rmqueue_cma(zone, order, migratetype,
3639
+ alloc_flags);
3640
+ if (!page)
3641
+ page = __rmqueue(zone, order, migratetype,
3642
+ alloc_flags);
3643
+ }
31793644 } while (page && check_new_pages(page, order));
3180
-
31813645 spin_unlock(&zone->lock);
31823646 if (!page)
31833647 goto failed;
....@@ -3186,9 +3650,17 @@
31863650
31873651 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
31883652 zone_statistics(preferred_zone, zone);
3653
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3654
+ gfp_flags, alloc_flags, migratetype);
31893655 local_irq_restore(flags);
31903656
31913657 out:
3658
+ /* Separate test+clear to avoid unnecessary atomics */
3659
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3660
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3661
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3662
+ }
3663
+
31923664 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
31933665 return page;
31943666
....@@ -3218,7 +3690,7 @@
32183690 }
32193691 __setup("fail_page_alloc=", setup_fail_page_alloc);
32203692
3221
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3693
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32223694 {
32233695 if (order < fail_page_alloc.min_order)
32243696 return false;
....@@ -3242,24 +3714,14 @@
32423714
32433715 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
32443716 &fail_page_alloc.attr);
3245
- if (IS_ERR(dir))
3246
- return PTR_ERR(dir);
32473717
3248
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3249
- &fail_page_alloc.ignore_gfp_reclaim))
3250
- goto fail;
3251
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3252
- &fail_page_alloc.ignore_gfp_highmem))
3253
- goto fail;
3254
- if (!debugfs_create_u32("min-order", mode, dir,
3255
- &fail_page_alloc.min_order))
3256
- goto fail;
3718
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3719
+ &fail_page_alloc.ignore_gfp_reclaim);
3720
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3721
+ &fail_page_alloc.ignore_gfp_highmem);
3722
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
32573723
32583724 return 0;
3259
-fail:
3260
- debugfs_remove_recursive(dir);
3261
-
3262
- return -ENOMEM;
32633725 }
32643726
32653727 late_initcall(fail_page_alloc_debugfs);
....@@ -3268,12 +3730,41 @@
32683730
32693731 #else /* CONFIG_FAIL_PAGE_ALLOC */
32703732
3271
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3733
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32723734 {
32733735 return false;
32743736 }
32753737
32763738 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3739
+
3740
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+{
3742
+ return __should_fail_alloc_page(gfp_mask, order);
3743
+}
3744
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3745
+
3746
+static inline long __zone_watermark_unusable_free(struct zone *z,
3747
+ unsigned int order, unsigned int alloc_flags)
3748
+{
3749
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3750
+ long unusable_free = (1 << order) - 1;
3751
+
3752
+ /*
3753
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3754
+ * the high-atomic reserves. This will over-estimate the size of the
3755
+ * atomic reserve but it avoids a search.
3756
+ */
3757
+ if (likely(!alloc_harder))
3758
+ unusable_free += z->nr_reserved_highatomic;
3759
+
3760
+#ifdef CONFIG_CMA
3761
+ /* If allocation can't use CMA areas don't use free CMA pages */
3762
+ if (!(alloc_flags & ALLOC_CMA))
3763
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3764
+#endif
3765
+
3766
+ return unusable_free;
3767
+}
32773768
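
As a worked example of the helper above: for an order-3 request that has neither ALLOC_HARDER/ALLOC_OOM nor ALLOC_CMA, the unusable total is (1 << 3) - 1 = 7 pages of the block being carved out, plus the whole high-atomic reserve, plus every free CMA page; the watermark checks below subtract this before comparing against the mark. The same sum as a standalone sketch (all counters are hypothetical):

#include <stdio.h>

/* Sketch of __zone_watermark_unusable_free() with made-up zone counters. */
int main(void)
{
	unsigned int order = 3;
	long nr_reserved_highatomic = 1024; /* hypothetical reserve */
	long nr_free_cma = 8192;            /* hypothetical free CMA pages */
	int alloc_harder = 0;               /* no ALLOC_HARDER/ALLOC_OOM */
	int alloc_cma = 0;                  /* no ALLOC_CMA */
	long unusable = (1L << order) - 1;

	if (!alloc_harder)
		unusable += nr_reserved_highatomic;
	if (!alloc_cma)
		unusable += nr_free_cma;

	printf("unusable_free = %ld pages\n", unusable); /* 7 + 1024 + 8192 = 9223 */
	return 0;
}
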
32783769 /*
32793770 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3282,7 +3773,7 @@
32823773 * to check in the allocation paths if no pages are free.
32833774 */
32843775 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3285
- int classzone_idx, unsigned int alloc_flags,
3776
+ int highest_zoneidx, unsigned int alloc_flags,
32863777 long free_pages)
32873778 {
32883779 long min = mark;
....@@ -3290,19 +3781,12 @@
32903781 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
32913782
32923783 /* free_pages may go negative - that's OK */
3293
- free_pages -= (1 << order) - 1;
3784
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
32943785
32953786 if (alloc_flags & ALLOC_HIGH)
32963787 min -= min / 2;
32973788
3298
- /*
3299
- * If the caller does not have rights to ALLOC_HARDER then subtract
3300
- * the high-atomic reserves. This will over-estimate the size of the
3301
- * atomic reserve but it avoids a search.
3302
- */
3303
- if (likely(!alloc_harder)) {
3304
- free_pages -= z->nr_reserved_highatomic;
3305
- } else {
3789
+ if (unlikely(alloc_harder)) {
33063790 /*
33073791 * OOM victims can try even harder than normal ALLOC_HARDER
33083792 * users on the grounds that it's definitely going to be in
....@@ -3315,19 +3799,12 @@
33153799 min -= min / 4;
33163800 }
33173801
3318
-
3319
-#ifdef CONFIG_CMA
3320
- /* If allocation can't use CMA areas don't use free CMA pages */
3321
- if (!(alloc_flags & ALLOC_CMA))
3322
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3323
-#endif
3324
-
33253802 /*
33263803 * Check watermarks for an order-0 allocation request. If these
33273804 * are not met, then a high-order request also cannot go ahead
33283805 * even if a suitable page happened to be free.
33293806 */
3330
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3807
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
33313808 return false;
33323809
33333810 /* If this is an order-0 request then the watermark is fine */
....@@ -3351,65 +3828,83 @@
33513828 if (mt == MIGRATE_CMA)
33523829 continue;
33533830 #endif
3354
- if (!list_empty(&area->free_list[mt]))
3831
+ if (!free_area_empty(area, mt))
33553832 return true;
33563833 }
33573834
33583835 #ifdef CONFIG_CMA
33593836 if ((alloc_flags & ALLOC_CMA) &&
3360
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3837
+ !free_area_empty(area, MIGRATE_CMA)) {
33613838 return true;
33623839 }
33633840 #endif
3364
- if (alloc_harder &&
3365
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3841
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
33663842 return true;
33673843 }
33683844 return false;
33693845 }
33703846
33713847 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3372
- int classzone_idx, unsigned int alloc_flags)
3848
+ int highest_zoneidx, unsigned int alloc_flags)
33733849 {
3374
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3850
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
33753851 zone_page_state(z, NR_FREE_PAGES));
33763852 }
3853
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
33773854
33783855 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3379
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3856
+ unsigned long mark, int highest_zoneidx,
3857
+ unsigned int alloc_flags, gfp_t gfp_mask)
33803858 {
3381
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3382
- long cma_pages = 0;
3859
+ long free_pages;
33833860
3384
-#ifdef CONFIG_CMA
3385
- /* If allocation can't use CMA areas don't use free CMA pages */
3386
- if (!(alloc_flags & ALLOC_CMA))
3387
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3388
-#endif
3861
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
33893862
33903863 /*
33913864 * Fast check for order-0 only. If this fails then the reserves
3392
- * need to be calculated. There is a corner case where the check
3393
- * passes but only the high-order atomic reserve are free. If
3394
- * the caller is !atomic then it'll uselessly search the free
3395
- * list. That corner case is then slower but it is harmless.
3865
+ * need to be calculated.
33963866 */
3397
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3398
- return true;
3867
+ if (!order) {
3868
+ long usable_free;
3869
+ long reserved;
33993870
3400
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3401
- free_pages);
3871
+ usable_free = free_pages;
3872
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3873
+
3874
+ /* reserved may over estimate high-atomic reserves. */
3875
+ usable_free -= min(usable_free, reserved);
3876
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3877
+ return true;
3878
+ }
3879
+
3880
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3881
+ free_pages))
3882
+ return true;
3883
+ /*
3884
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3885
+ * when checking the min watermark. The min watermark is the
3886
+ * point where boosting is ignored so that kswapd is woken up
3887
+ * when below the low watermark.
3888
+ */
3889
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3890
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3891
+ mark = z->_watermark[WMARK_MIN];
3892
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3893
+ alloc_flags, free_pages);
3894
+ }
3895
+
3896
+ return false;
34023897 }
34033898
34043899 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3405
- unsigned long mark, int classzone_idx)
3900
+ unsigned long mark, int highest_zoneidx)
34063901 {
34073902 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34083903
34093904 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34103905 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34113906
3412
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3907
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34133908 free_pages);
34143909 }
34153910 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3418,7 +3913,7 @@
34183913 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
34193914 {
34203915 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3421
- RECLAIM_DISTANCE;
3916
+ node_reclaim_distance;
34223917 }
34233918 #else /* CONFIG_NUMA */
34243919 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3428,6 +3923,61 @@
34283923 #endif /* CONFIG_NUMA */
34293924
34303925 /*
3926
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3927
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3928
+ * premature use of a lower zone may cause lowmem pressure problems that
3929
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3930
+ * probably too small. It only makes sense to spread allocations to avoid
3931
+ * fragmentation between the Normal and DMA32 zones.
3932
+ */
3933
+static inline unsigned int
3934
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3935
+{
3936
+ unsigned int alloc_flags;
3937
+
3938
+ /*
3939
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3940
+ * to save a branch.
3941
+ */
3942
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3943
+
3944
+#ifdef CONFIG_ZONE_DMA32
3945
+ if (!zone)
3946
+ return alloc_flags;
3947
+
3948
+ if (zone_idx(zone) != ZONE_NORMAL)
3949
+ return alloc_flags;
3950
+
3951
+ /*
3952
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3953
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3954
+ * on UMA that if Normal is populated then so is DMA32.
3955
+ */
3956
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3957
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
3958
+ return alloc_flags;
3959
+
3960
+ alloc_flags |= ALLOC_NOFRAGMENT;
3961
+#endif /* CONFIG_ZONE_DMA32 */
3962
+ return alloc_flags;
3963
+}
3964
+
3965
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
3966
+ unsigned int alloc_flags)
3967
+{
3968
+#ifdef CONFIG_CMA
3969
+ unsigned int pflags = current->flags;
3970
+
3971
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
3972
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
3973
+ gfp_mask & __GFP_CMA)
3974
+ alloc_flags |= ALLOC_CMA;
3975
+
3976
+#endif
3977
+ return alloc_flags;
3978
+}
3979
+
3980
+/*
34313981 * get_page_from_freelist goes through the zonelist trying to allocate
34323982 * a page.
34333983 */
....@@ -3435,16 +3985,20 @@
34353985 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
34363986 const struct alloc_context *ac)
34373987 {
3438
- struct zoneref *z = ac->preferred_zoneref;
3988
+ struct zoneref *z;
34393989 struct zone *zone;
34403990 struct pglist_data *last_pgdat_dirty_limit = NULL;
3991
+ bool no_fallback;
34413992
3993
+retry:
34423994 /*
34433995 * Scan zonelist, looking for a zone with enough free.
34443996 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
34453997 */
3446
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3447
- ac->nodemask) {
3998
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3999
+ z = ac->preferred_zoneref;
4000
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4001
+ ac->nodemask) {
34484002 struct page *page;
34494003 unsigned long mark;
34504004
....@@ -3481,9 +4035,26 @@
34814035 }
34824036 }
34834037
3484
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4038
+ if (no_fallback && nr_online_nodes > 1 &&
4039
+ zone != ac->preferred_zoneref->zone) {
4040
+ int local_nid;
4041
+
4042
+ /*
4043
+ * If moving to a remote node, retry but allow
4044
+ * fragmenting fallbacks. Locality is more important
4045
+ * than fragmentation avoidance.
4046
+ */
4047
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4048
+ if (zone_to_nid(zone) != local_nid) {
4049
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4050
+ goto retry;
4051
+ }
4052
+ }
4053
+
4054
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
34854055 if (!zone_watermark_fast(zone, order, mark,
3486
- ac_classzone_idx(ac), alloc_flags)) {
4056
+ ac->highest_zoneidx, alloc_flags,
4057
+ gfp_mask)) {
34874058 int ret;
34884059
34894060 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3516,7 +4087,7 @@
35164087 default:
35174088 /* did we reclaim enough */
35184089 if (zone_watermark_ok(zone, order, mark,
3519
- ac_classzone_idx(ac), alloc_flags))
4090
+ ac->highest_zoneidx, alloc_flags))
35204091 goto try_this_zone;
35214092
35224093 continue;
....@@ -3548,30 +4119,21 @@
35484119 }
35494120 }
35504121
4122
+ /*
4123
+ * It's possible on a UMA machine to get through all zones that are
4124
+ * fragmented. If avoiding fragmentation, reset and try again.
4125
+ */
4126
+ if (no_fallback) {
4127
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4128
+ goto retry;
4129
+ }
4130
+
35514131 return NULL;
3552
-}
3553
-
3554
-/*
3555
- * Large machines with many possible nodes should not always dump per-node
3556
- * meminfo in irq context.
3557
- */
3558
-static inline bool should_suppress_show_mem(void)
3559
-{
3560
- bool ret = false;
3561
-
3562
-#if NODES_SHIFT > 8
3563
- ret = in_interrupt();
3564
-#endif
3565
- return ret;
35664132 }
35674133
35684134 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
35694135 {
35704136 unsigned int filter = SHOW_MEM_FILTER_NODES;
3571
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3572
-
3573
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3574
- return;
35754137
35764138 /*
35774139 * This documents exceptions given to allocations in certain
....@@ -3592,22 +4154,23 @@
35924154 {
35934155 struct va_format vaf;
35944156 va_list args;
3595
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3596
- DEFAULT_RATELIMIT_BURST);
4157
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
35974158
3598
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4159
+ if ((gfp_mask & __GFP_NOWARN) ||
4160
+ !__ratelimit(&nopage_rs) ||
4161
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
35994162 return;
36004163
36014164 va_start(args, fmt);
36024165 vaf.fmt = fmt;
36034166 vaf.va = &args;
3604
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4167
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36054168 current->comm, &vaf, gfp_mask, &gfp_mask,
36064169 nodemask_pr_args(nodemask));
36074170 va_end(args);
36084171
36094172 cpuset_print_current_mems_allowed();
3610
-
4173
+ pr_cont("\n");
36114174 dump_stack();
36124175 warn_alloc_show_mem(gfp_mask, nodemask);
36134176 }
....@@ -3681,11 +4244,13 @@
36814244 * success so it is time to admit defeat. We will skip the OOM killer
36824245 * because it is very likely that the caller has a more reasonable
36834246 * fallback than shooting a random task.
4247
+ *
4248
+ * The OOM killer may not free memory on a specific node.
36844249 */
3685
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4250
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
36864251 goto out;
36874252 /* The OOM killer does not needlessly kill tasks for lowmem */
3688
- if (ac->high_zoneidx < ZONE_NORMAL)
4253
+ if (ac->highest_zoneidx < ZONE_NORMAL)
36894254 goto out;
36904255 if (pm_suspended_storage())
36914256 goto out;
....@@ -3698,10 +4263,6 @@
36984263 * out_of_memory). Once filesystems are ready to handle allocation
36994264 * failures more gracefully we should just bail out here.
37004265 */
3701
-
3702
- /* The OOM killer may not free memory on a specific node */
3703
- if (gfp_mask & __GFP_THISNODE)
3704
- goto out;
37054266
37064267 /* Exhausted what can be done so it's blame time */
37074268 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3733,7 +4294,7 @@
37334294 unsigned int alloc_flags, const struct alloc_context *ac,
37344295 enum compact_priority prio, enum compact_result *compact_result)
37354296 {
3736
- struct page *page;
4297
+ struct page *page = NULL;
37374298 unsigned long pflags;
37384299 unsigned int noreclaim_flag;
37394300
....@@ -3744,13 +4305,10 @@
37444305 noreclaim_flag = memalloc_noreclaim_save();
37454306
37464307 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3747
- prio);
4308
+ prio, &page);
37484309
37494310 memalloc_noreclaim_restore(noreclaim_flag);
37504311 psi_memstall_leave(&pflags);
3751
-
3752
- if (*compact_result <= COMPACT_INACTIVE)
3753
- return NULL;
37544312
37554313 /*
37564314 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3758,7 +4316,13 @@
37584316 */
37594317 count_vm_event(COMPACTSTALL);
37604318
3761
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4319
+ /* Prep a captured page if available */
4320
+ if (page)
4321
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4322
+
4323
+ /* Try get a page from the freelist if available */
4324
+ if (!page)
4325
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
37624326
37634327 if (page) {
37644328 struct zone *zone = page_zone(page);
....@@ -3807,14 +4371,22 @@
38074371 goto check_priority;
38084372
38094373 /*
3810
- * make sure the compaction wasn't deferred or didn't bail out early
3811
- * due to locks contention before we declare that we should give up.
3812
- * But do not retry if the given zonelist is not suitable for
3813
- * compaction.
4374
+ * compaction was skipped because there are not enough order-0 pages
4375
+ * to work with, so we retry only if it looks like reclaim can help.
38144376 */
3815
- if (compaction_withdrawn(compact_result)) {
4377
+ if (compaction_needs_reclaim(compact_result)) {
38164378 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
38174379 goto out;
4380
+ }
4381
+
4382
+ /*
4383
+ * make sure the compaction wasn't deferred or didn't bail out early
4384
+ * due to locks contention before we declare that we should give up.
4385
+ * But the next retry should use a higher priority if allowed, so
4386
+ * we don't just keep bailing out endlessly.
4387
+ */
4388
+ if (compaction_withdrawn(compact_result)) {
4389
+ goto check_priority;
38184390 }
38194391
38204392 /*
....@@ -3877,10 +4449,10 @@
38774449 * Let's give them a good hope and keep retrying while the order-0
38784450 * watermarks are OK.
38794451 */
3880
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3881
- ac->nodemask) {
4452
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4453
+ ac->highest_zoneidx, ac->nodemask) {
38824454 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3883
- ac_classzone_idx(ac), alloc_flags))
4455
+ ac->highest_zoneidx, alloc_flags))
38844456 return true;
38854457 }
38864458 return false;
....@@ -3938,33 +4510,50 @@
39384510 EXPORT_SYMBOL_GPL(fs_reclaim_release);
39394511 #endif
39404512
4513
+/*
4514
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4515
+ * have been rebuilt so allocations can be retried. Reader side does not lock and
4516
+ * retries the allocation if zonelist changes. Writer side is protected by the
4517
+ * embedded spin_lock.
4518
+ */
4519
+static DEFINE_SEQLOCK(zonelist_update_seq);
4520
+
4521
+static unsigned int zonelist_iter_begin(void)
4522
+{
4523
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4524
+ return read_seqbegin(&zonelist_update_seq);
4525
+
4526
+ return 0;
4527
+}
4528
+
4529
+static unsigned int check_retry_zonelist(unsigned int seq)
4530
+{
4531
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4532
+ return read_seqretry(&zonelist_update_seq, seq);
4533
+
4534
+ return seq;
4535
+}
4536
+
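
The two helpers above are meant to bracket one pass over the zonelists: sample the sequence count first, and if the walk came up empty, retry only when the count shows a concurrent rebuild. A simplified sketch of that reader-side pattern, loosely modelled on how the allocation slowpath is expected to use it (the wrapper function itself is hypothetical):

/* Sketch only: caller-side use of zonelist_iter_begin()/check_retry_zonelist(). */
static struct page *alloc_with_zonelist_retry(gfp_t gfp_mask, unsigned int order,
					      const struct alloc_context *ac)
{
	struct page *page;
	unsigned int cookie;

restart:
	cookie = zonelist_iter_begin();
	page = get_page_from_freelist(gfp_mask, order, ALLOC_WMARK_LOW, ac);
	if (!page && check_retry_zonelist(cookie))
		goto restart;	/* the zonelists were rebuilt underneath us */
	return page;
}
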
39414537 /* Perform direct synchronous page reclaim */
3942
-static int
4538
+static unsigned long
39434539 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
39444540 const struct alloc_context *ac)
39454541 {
3946
- struct reclaim_state reclaim_state;
3947
- int progress;
39484542 unsigned int noreclaim_flag;
3949
- unsigned long pflags;
4543
+ unsigned long progress;
39504544
39514545 cond_resched();
39524546
39534547 /* We now go into synchronous reclaim */
39544548 cpuset_memory_pressure_bump();
3955
- psi_memstall_enter(&pflags);
39564549 fs_reclaim_acquire(gfp_mask);
39574550 noreclaim_flag = memalloc_noreclaim_save();
3958
- reclaim_state.reclaimed_slab = 0;
3959
- current->reclaim_state = &reclaim_state;
39604551
39614552 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
39624553 ac->nodemask);
39634554
3964
- current->reclaim_state = NULL;
39654555 memalloc_noreclaim_restore(noreclaim_flag);
39664556 fs_reclaim_release(gfp_mask);
3967
- psi_memstall_leave(&pflags);
39684557
39694558 cond_resched();
39704559
....@@ -3978,11 +4567,14 @@
39784567 unsigned long *did_some_progress)
39794568 {
39804569 struct page *page = NULL;
4570
+ unsigned long pflags;
39814571 bool drained = false;
4572
+ bool skip_pcp_drain = false;
39824573
4574
+ psi_memstall_enter(&pflags);
39834575 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
39844576 if (unlikely(!(*did_some_progress)))
3985
- return NULL;
4577
+ goto out;
39864578
39874579 retry:
39884580 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -3990,14 +4582,19 @@
39904582 /*
39914583 * If an allocation failed after direct reclaim, it could be because
39924584 * pages are pinned on the per-cpu lists or in high alloc reserves.
3993
- * Shrink them them and try again
4585
+ * Shrink them and try again
39944586 */
39954587 if (!page && !drained) {
39964588 unreserve_highatomic_pageblock(ac, false);
3997
- drain_all_pages(NULL);
4589
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4590
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4591
+ if (!skip_pcp_drain)
4592
+ drain_all_pages(NULL);
39984593 drained = true;
39994594 goto retry;
40004595 }
4596
+out:
4597
+ psi_memstall_leave(&pflags);
40014598
40024599 return page;
40034600 }
....@@ -4008,12 +4605,12 @@
40084605 struct zoneref *z;
40094606 struct zone *zone;
40104607 pg_data_t *last_pgdat = NULL;
4011
- enum zone_type high_zoneidx = ac->high_zoneidx;
4608
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40124609
4013
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4610
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40144611 ac->nodemask) {
40154612 if (last_pgdat != zone->zone_pgdat)
4016
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4613
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
40174614 last_pgdat = zone->zone_pgdat;
40184615 }
40194616 }
....@@ -4023,8 +4620,13 @@
40234620 {
40244621 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
40254622
4026
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4623
+ /*
4624
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4625
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4626
+ * to save two branches.
4627
+ */
40274628 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4629
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
40284630
40294631 /*
40304632 * The caller may dip into page reserves a bit more if the caller
....@@ -4032,7 +4634,8 @@
40324634 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
40334635 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
40344636 */
4035
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4637
+ alloc_flags |= (__force int)
4638
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
40364639
40374640 if (gfp_mask & __GFP_ATOMIC) {
40384641 /*
....@@ -4049,10 +4652,8 @@
40494652 } else if (unlikely(rt_task(current)) && !in_interrupt())
40504653 alloc_flags |= ALLOC_HARDER;
40514654
4052
-#ifdef CONFIG_CMA
4053
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4054
- alloc_flags |= ALLOC_CMA;
4055
-#endif
4655
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4656
+
40564657 return alloc_flags;
40574658 }
40584659
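The two BUILD_BUG_ON()s above are what make the single mask-and-OR legal: the gfp bits and the internal ALLOC_* bits are asserted to have identical values at compile time, so no per-flag conditional is needed. The same idiom in isolation, with made-up flag names and values (nothing below exists in the kernel):

#include <linux/build_bug.h>

#define REQ_URGENT	0x08u	/* hypothetical request-side flag */
#define REQ_BACKGROUND	0x10u

#define CTL_URGENT	0x08u	/* hypothetical internal flag, deliberately equal */
#define CTL_BACKGROUND	0x10u

static unsigned int req_to_ctl_flags(unsigned int req)
{
	/* Compile-time proof that the two encodings really do match. */
	BUILD_BUG_ON(REQ_URGENT != CTL_URGENT);
	BUILD_BUG_ON(REQ_BACKGROUND != CTL_BACKGROUND);

	/* One mask-and-OR instead of two conditional tests. */
	return req & (REQ_URGENT | REQ_BACKGROUND);
}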
....@@ -4115,6 +4716,7 @@
41154716 {
41164717 struct zone *zone;
41174718 struct zoneref *z;
4719
+ bool ret = false;
41184720
41194721 /*
41204722 * Costly allocations might have made a progress but this doesn't mean
....@@ -4141,8 +4743,8 @@
41414743 * request even if all reclaimable pages are considered then we are
41424744 * screwed and have to go OOM.
41434745 */
4144
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4145
- ac->nodemask) {
4746
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4747
+ ac->highest_zoneidx, ac->nodemask) {
41464748 unsigned long available;
41474749 unsigned long reclaimable;
41484750 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4156,7 +4758,7 @@
41564758 * reclaimable pages?
41574759 */
41584760 wmark = __zone_watermark_ok(zone, order, min_wmark,
4159
- ac_classzone_idx(ac), alloc_flags, available);
4761
+ ac->highest_zoneidx, alloc_flags, available);
41604762 trace_reclaim_retry_zone(z, order, reclaimable,
41614763 available, min_wmark, *no_progress_loops, wmark);
41624764 if (wmark) {
....@@ -4178,25 +4780,24 @@
41784780 }
41794781 }
41804782
4181
- /*
4182
- * Memory allocation/reclaim might be called from a WQ
4183
- * context and the current implementation of the WQ
4184
- * concurrency control doesn't recognize that
4185
- * a particular WQ is congested if the worker thread is
4186
- * looping without ever sleeping. Therefore we have to
4187
- * do a short sleep here rather than calling
4188
- * cond_resched().
4189
- */
4190
- if (current->flags & PF_WQ_WORKER)
4191
- schedule_timeout_uninterruptible(1);
4192
- else
4193
- cond_resched();
4194
-
4195
- return true;
4783
+ ret = true;
4784
+ goto out;
41964785 }
41974786 }
41984787
4199
- return false;
4788
+out:
4789
+ /*
4790
+ * Memory allocation/reclaim might be called from a WQ context and the
4791
+ * current implementation of the WQ concurrency control doesn't
4792
+ * recognize that a particular WQ is congested if the worker thread is
4793
+ * looping without ever sleeping. Therefore we have to do a short sleep
4794
+ * here rather than calling cond_resched().
4795
+ */
4796
+ if (current->flags & PF_WQ_WORKER)
4797
+ schedule_timeout_uninterruptible(1);
4798
+ else
4799
+ cond_resched();
4800
+ return ret;
42004801 }
42014802
42024803 static inline bool
....@@ -4246,8 +4847,12 @@
42464847 int compaction_retries;
42474848 int no_progress_loops;
42484849 unsigned int cpuset_mems_cookie;
4850
+ unsigned int zonelist_iter_cookie;
42494851 int reserve_flags;
4852
+ unsigned long vh_record;
4853
+ bool should_alloc_retry = false;
42504854
4855
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
42514856 /*
42524857 * We also sanity check to catch abuse of atomic reserves being used by
42534858 * callers that are not in atomic context.
....@@ -4256,11 +4861,12 @@
42564861 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
42574862 gfp_mask &= ~__GFP_ATOMIC;
42584863
4259
-retry_cpuset:
4864
+restart:
42604865 compaction_retries = 0;
42614866 no_progress_loops = 0;
42624867 compact_priority = DEF_COMPACT_PRIORITY;
42634868 cpuset_mems_cookie = read_mems_allowed_begin();
4869
+ zonelist_iter_cookie = zonelist_iter_begin();
42644870
42654871 /*
42664872 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4276,11 +4882,11 @@
42764882 * could end up iterating over non-eligible zones endlessly.
42774883 */
42784884 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4279
- ac->high_zoneidx, ac->nodemask);
4885
+ ac->highest_zoneidx, ac->nodemask);
42804886 if (!ac->preferred_zoneref->zone)
42814887 goto nopage;
42824888
4283
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4889
+ if (alloc_flags & ALLOC_KSWAPD)
42844890 wake_all_kswapds(order, gfp_mask, ac);
42854891
42864892 /*
....@@ -4313,18 +4919,28 @@
43134919
43144920 /*
43154921 * Checks for costly allocations with __GFP_NORETRY, which
4316
- * includes THP page fault allocations
4922
+ * includes some THP page fault allocations
43174923 */
43184924 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
43194925 /*
4320
- * If compaction is deferred for high-order allocations,
4321
- * it is because sync compaction recently failed. If
4322
- * this is the case and the caller requested a THP
4323
- * allocation, we do not want to heavily disrupt the
4324
- * system, so we fail the allocation instead of entering
4325
- * direct reclaim.
4926
+ * If allocating entire pageblock(s) and compaction
4927
+ * failed because all zones are below low watermarks
4928
+ * or is prohibited because it recently failed at this
4929
+ * order, fail immediately unless the allocator has
4930
+ * requested compaction and reclaim retry.
4931
+ *
4932
+ * Reclaim is
4933
+ * - potentially very expensive because zones are far
4934
+ * below their low watermarks or this is part of very
4935
+ * bursty high order allocations,
4936
+ * - not guaranteed to help because isolate_freepages()
4937
+ * may not iterate over freed pages as part of its
4938
+ * linear scan, and
4939
+ * - unlikely to make entire pageblocks free on its
4940
+ * own.
43264941 */
4327
- if (compact_result == COMPACT_DEFERRED)
4942
+ if (compact_result == COMPACT_SKIPPED ||
4943
+ compact_result == COMPACT_DEFERRED)
43284944 goto nopage;
43294945
43304946 /*
....@@ -4338,12 +4954,12 @@
43384954
43394955 retry:
43404956 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4341
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4957
+ if (alloc_flags & ALLOC_KSWAPD)
43424958 wake_all_kswapds(order, gfp_mask, ac);
43434959
43444960 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
43454961 if (reserve_flags)
4346
- alloc_flags = reserve_flags;
4962
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
43474963
43484964 /*
43494965 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4353,7 +4969,7 @@
43534969 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
43544970 ac->nodemask = NULL;
43554971 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4356
- ac->high_zoneidx, ac->nodemask);
4972
+ ac->highest_zoneidx, ac->nodemask);
43574973 }
43584974
43594975 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4368,6 +4984,18 @@
43684984 /* Avoid recursion of direct reclaim */
43694985 if (current->flags & PF_MEMALLOC)
43704986 goto nopage;
4987
+
4988
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
4989
+ alloc_flags, ac->migratetype, &page);
4990
+
4991
+ if (page)
4992
+ goto got_pg;
4993
+
4994
+ trace_android_vh_should_alloc_pages_retry(gfp_mask, order,
4995
+ &alloc_flags, ac->migratetype, ac->preferred_zoneref->zone,
4996
+ &page, &should_alloc_retry);
4997
+ if (should_alloc_retry)
4998
+ goto retry;
43714999
43725000 /* Try direct reclaim and then allocating */
43735001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4409,9 +5037,13 @@
44095037 goto retry;
44105038
44115039
4412
- /* Deal with possible cpuset update races before we start OOM killing */
4413
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4414
- goto retry_cpuset;
5040
+ /*
5041
+ * Deal with possible cpuset update races or zonelist updates to avoid
5042
+ * an unnecessary OOM kill.
5043
+ */
5044
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5045
+ check_retry_zonelist(zonelist_iter_cookie))
5046
+ goto restart;
44155047
44165048 /* Reclaim has failed us, start killing things */
44175049 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4420,7 +5052,7 @@
44205052
44215053 /* Avoid allocations with no watermarks from looping endlessly */
44225054 if (tsk_is_oom_victim(current) &&
4423
- (alloc_flags == ALLOC_OOM ||
5055
+ (alloc_flags & ALLOC_OOM ||
44245056 (gfp_mask & __GFP_NOMEMALLOC)))
44255057 goto nopage;
44265058
....@@ -4431,9 +5063,13 @@
44315063 }
44325064
44335065 nopage:
4434
- /* Deal with possible cpuset update races before we fail */
4435
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4436
- goto retry_cpuset;
5066
+ /*
5067
+ * Deal with possible cpuset update races or zonelist updates to avoid
5068
+ * an unnecessary OOM kill.
5069
+ */
5070
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5071
+ check_retry_zonelist(zonelist_iter_cookie))
5072
+ goto restart;
44375073
44385074 /*
44395075 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4476,9 +5112,15 @@
44765112 goto retry;
44775113 }
44785114 fail:
5115
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5116
+ alloc_flags, ac->migratetype, &page);
5117
+ if (page)
5118
+ goto got_pg;
5119
+
44795120 warn_alloc(gfp_mask, ac->nodemask,
44805121 "page allocation failure: order:%u", order);
44815122 got_pg:
5123
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
44825124 return page;
44835125 }
44845126
....@@ -4487,14 +5129,18 @@
44875129 struct alloc_context *ac, gfp_t *alloc_mask,
44885130 unsigned int *alloc_flags)
44895131 {
4490
- ac->high_zoneidx = gfp_zone(gfp_mask);
5132
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
44915133 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
44925134 ac->nodemask = nodemask;
4493
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5135
+ ac->migratetype = gfp_migratetype(gfp_mask);
44945136
44955137 if (cpusets_enabled()) {
44965138 *alloc_mask |= __GFP_HARDWALL;
4497
- if (!ac->nodemask)
5139
+ /*
5140
+ * When we are in interrupt context, the cpuset of the current
5141
+ * task is irrelevant, so any node is ok.
5142
+ */
5143
+ if (!in_interrupt() && !ac->nodemask)
44985144 ac->nodemask = &cpuset_current_mems_allowed;
44995145 else
45005146 *alloc_flags |= ALLOC_CPUSET;
....@@ -4508,15 +5154,8 @@
45085154 if (should_fail_alloc_page(gfp_mask, order))
45095155 return false;
45105156
4511
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4512
- *alloc_flags |= ALLOC_CMA;
5157
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45135158
4514
- return true;
4515
-}
4516
-
4517
-/* Determine whether to spread dirty pages and what the first usable zone */
4518
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4519
-{
45205159 /* Dirty zone balancing only done in the fast path */
45215160 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
45225161
....@@ -4526,7 +5165,9 @@
45265165 * may get reset for allocations that ignore memory policies.
45275166 */
45285167 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4529
- ac->high_zoneidx, ac->nodemask);
5168
+ ac->highest_zoneidx, ac->nodemask);
5169
+
5170
+ return true;
45305171 }
45315172
45325173 /*
....@@ -4555,7 +5196,11 @@
45555196 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
45565197 return NULL;
45575198
4558
- finalise_ac(gfp_mask, &ac);
5199
+ /*
5200
+ * Forbid the first pass from falling back to types that fragment
5201
+ * memory until all local zones are considered.
5202
+ */
5203
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
45595204
45605205 /* First allocation attempt */
45615206 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4575,14 +5220,13 @@
45755220 * Restore the original nodemask if it was potentially replaced with
45765221 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
45775222 */
4578
- if (unlikely(ac.nodemask != nodemask))
4579
- ac.nodemask = nodemask;
5223
+ ac.nodemask = nodemask;
45805224
45815225 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
45825226
45835227 out:
45845228 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4585
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5229
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
45865230 __free_pages(page, order);
45875231 page = NULL;
45885232 }
....@@ -4620,13 +5264,20 @@
46205264 if (order == 0) /* Via pcp? */
46215265 free_unref_page(page);
46225266 else
4623
- __free_pages_ok(page, order);
5267
+ __free_pages_ok(page, order, FPI_NONE);
46245268 }
46255269
46265270 void __free_pages(struct page *page, unsigned int order)
46275271 {
5272
+ /* get PageHead before we drop reference */
5273
+ int head = PageHead(page);
5274
+
5275
+ trace_android_vh_free_pages(page, order);
46285276 if (put_page_testzero(page))
46295277 free_the_page(page, order);
5278
+ else if (!head)
5279
+ while (order-- > 0)
5280
+ free_the_page(page + (1 << order), order);
46305281 }
46315282 EXPORT_SYMBOL(__free_pages);
46325283
....@@ -4731,6 +5382,18 @@
47315382 /* reset page count bias and offset to start of new frag */
47325383 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
47335384 offset = size - fragsz;
5385
+ if (unlikely(offset < 0)) {
5386
+ /*
5387
+ * The caller is trying to allocate a fragment
5388
+ * with fragsz > PAGE_SIZE but the cache isn't big
5389
+ * enough to satisfy the request; this may
5390
+ * happen in low memory conditions.
5391
+ * We don't release the cache page because
5392
+ * that could make memory pressure worse,
5393
+ * so we simply return NULL here.
5394
+ */
5395
+ return NULL;
5396
+ }
47345397 }
47355398
47365399 nc->pagecnt_bias--;
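With this hunk, a page_frag_alloc() caller that asks for fragsz > PAGE_SIZE has to be ready for a NULL return even though the cache page itself was refilled. A minimal caller-side sketch, assuming a hypothetical driver-owned cache (rx_frag_cache and rx_alloc_buffer() are made up):

#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/string.h>

static struct page_frag_cache rx_frag_cache;

static void *rx_alloc_buffer(unsigned int fragsz)
{
	void *buf = page_frag_alloc(&rx_frag_cache, fragsz, GFP_ATOMIC);

	/* May be NULL under memory pressure when fragsz > PAGE_SIZE. */
	if (!buf)
		return NULL;

	memset(buf, 0, fragsz);	/* use the fragment; freed later via page_frag_free(buf) */
	return buf;
}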
....@@ -4771,7 +5434,7 @@
47715434 /**
47725435 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
47735436 * @size: the number of bytes to allocate
4774
- * @gfp_mask: GFP flags for the allocation
5437
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
47755438 *
47765439 * This function is similar to alloc_pages(), except that it allocates the
47775440 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4780,11 +5443,16 @@
47805443 * This function is also limited by MAX_ORDER.
47815444 *
47825445 * Memory allocated by this function must be released by free_pages_exact().
5446
+ *
5447
+ * Return: pointer to the allocated area or %NULL in case of error.
47835448 */
47845449 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
47855450 {
47865451 unsigned int order = get_order(size);
47875452 unsigned long addr;
5453
+
5454
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5455
+ gfp_mask &= ~__GFP_COMP;
47885456
47895457 addr = __get_free_pages(gfp_mask, order);
47905458 return make_alloc_exact(addr, order, size);
....@@ -4796,15 +5464,22 @@
47965464 * pages on a node.
47975465 * @nid: the preferred node ID where memory should be allocated
47985466 * @size: the number of bytes to allocate
4799
- * @gfp_mask: GFP flags for the allocation
5467
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48005468 *
48015469 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48025470 * back.
5471
+ *
5472
+ * Return: pointer to the allocated area or %NULL in case of error.
48035473 */
48045474 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48055475 {
48065476 unsigned int order = get_order(size);
4807
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5477
+ struct page *p;
5478
+
5479
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5480
+ gfp_mask &= ~__GFP_COMP;
5481
+
5482
+ p = alloc_pages_node(nid, gfp_mask, order);
48085483 if (!p)
48095484 return NULL;
48105485 return make_alloc_exact((unsigned long)page_address(p), order, size);
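The updated kernel-doc for these two helpers pins down the contract: no __GFP_COMP in the mask, a %NULL return on failure, and release strictly through free_pages_exact(). A small usage sketch under those rules (the scratch-buffer helpers and the 100 KiB size are made up):

#include <linux/gfp.h>

#define SCRATCH_BYTES	(100 * 1024)

static void *alloc_scratch(void)
{
	/* Plain GFP_KERNEL - adding __GFP_COMP would now trigger the WARN above. */
	void *buf = alloc_pages_exact(SCRATCH_BYTES, GFP_KERNEL | __GFP_ZERO);

	return buf;	/* NULL on failure, per the Return: line */
}

static void free_scratch(void *buf)
{
	if (buf)
		free_pages_exact(buf, SCRATCH_BYTES);	/* not free_pages() */
}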
....@@ -4833,11 +5508,13 @@
48335508 * nr_free_zone_pages - count number of pages beyond high watermark
48345509 * @offset: The zone index of the highest zone
48355510 *
4836
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5511
+ * nr_free_zone_pages() counts the number of pages which are beyond the
48375512 * high watermark within all zones at or below a given zone index. For each
48385513 * zone, the number of pages is calculated as:
48395514 *
48405515 * nr_free_zone_pages = managed_pages - high_pages
5516
+ *
5517
+ * Return: number of pages beyond high watermark.
48415518 */
48425519 static unsigned long nr_free_zone_pages(int offset)
48435520 {
....@@ -4850,7 +5527,7 @@
48505527 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
48515528
48525529 for_each_zone_zonelist(zone, z, zonelist, offset) {
4853
- unsigned long size = zone->managed_pages;
5530
+ unsigned long size = zone_managed_pages(zone);
48545531 unsigned long high = high_wmark_pages(zone);
48555532 if (size > high)
48565533 sum += size - high;
....@@ -4864,23 +5541,15 @@
48645541 *
48655542 * nr_free_buffer_pages() counts the number of pages which are beyond the high
48665543 * watermark within ZONE_DMA and ZONE_NORMAL.
5544
+ *
5545
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5546
+ * ZONE_NORMAL.
48675547 */
48685548 unsigned long nr_free_buffer_pages(void)
48695549 {
48705550 return nr_free_zone_pages(gfp_zone(GFP_USER));
48715551 }
48725552 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4873
-
4874
-/**
4875
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4876
- *
4877
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4878
- * high watermark within all zones.
4879
- */
4880
-unsigned long nr_free_pagecache_pages(void)
4881
-{
4882
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4883
-}
48845553
48855554 static inline void show_node(struct zone *zone)
48865555 {
....@@ -4902,7 +5571,7 @@
49025571 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49035572
49045573 for_each_zone(zone)
4905
- wmark_low += zone->watermark[WMARK_LOW];
5574
+ wmark_low += low_wmark_pages(zone);
49065575
49075576 /*
49085577 * Estimate the amount of memory available for userspace allocations,
....@@ -4924,8 +5593,8 @@
49245593 * items that are in use, and cannot be freed. Cap this estimate at the
49255594 * low watermark.
49265595 */
4927
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
4928
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5596
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5597
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
49295598 available += reclaimable - min(reclaimable / 2, wmark_low);
49305599
49315600 if (available < 0)
....@@ -4936,11 +5605,11 @@
49365605
49375606 void si_meminfo(struct sysinfo *val)
49385607 {
4939
- val->totalram = totalram_pages;
5608
+ val->totalram = totalram_pages();
49405609 val->sharedram = global_node_page_state(NR_SHMEM);
49415610 val->freeram = global_zone_page_state(NR_FREE_PAGES);
49425611 val->bufferram = nr_blockdev_pages();
4943
- val->totalhigh = totalhigh_pages;
5612
+ val->totalhigh = totalhigh_pages();
49445613 val->freehigh = nr_free_highpages();
49455614 val->mem_unit = PAGE_SIZE;
49465615 }
....@@ -4957,7 +5626,7 @@
49575626 pg_data_t *pgdat = NODE_DATA(nid);
49585627
49595628 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4960
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5629
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
49615630 val->totalram = managed_pages;
49625631 val->sharedram = node_page_state(pgdat, NR_SHMEM);
49635632 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -4966,7 +5635,7 @@
49665635 struct zone *zone = &pgdat->node_zones[zone_type];
49675636
49685637 if (is_highmem(zone)) {
4969
- managed_highpages += zone->managed_pages;
5638
+ managed_highpages += zone_managed_pages(zone);
49705639 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
49715640 }
49725641 }
....@@ -5055,7 +5724,7 @@
50555724
50565725 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
50575726 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5058
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5727
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
50595728 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
50605729 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
50615730 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5068,9 +5737,8 @@
50685737 global_node_page_state(NR_UNEVICTABLE),
50695738 global_node_page_state(NR_FILE_DIRTY),
50705739 global_node_page_state(NR_WRITEBACK),
5071
- global_node_page_state(NR_UNSTABLE_NFS),
5072
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5073
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5740
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5741
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
50745742 global_node_page_state(NR_FILE_MAPPED),
50755743 global_node_page_state(NR_SHMEM),
50765744 global_zone_page_state(NR_PAGETABLE),
....@@ -5079,6 +5747,7 @@
50795747 free_pcp,
50805748 global_zone_page_state(NR_FREE_CMA_PAGES));
50815749
5750
+ trace_android_vh_show_mapcount_pages(NULL);
50825751 for_each_online_pgdat(pgdat) {
50835752 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
50845753 continue;
....@@ -5101,7 +5770,10 @@
51015770 " anon_thp: %lukB"
51025771 #endif
51035772 " writeback_tmp:%lukB"
5104
- " unstable:%lukB"
5773
+ " kernel_stack:%lukB"
5774
+#ifdef CONFIG_SHADOW_CALL_STACK
5775
+ " shadow_call_stack:%lukB"
5776
+#endif
51055777 " all_unreclaimable? %s"
51065778 "\n",
51075779 pgdat->node_id,
....@@ -5123,7 +5795,10 @@
51235795 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
51245796 #endif
51255797 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5126
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5798
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5799
+#ifdef CONFIG_SHADOW_CALL_STACK
5800
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5801
+#endif
51275802 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
51285803 "yes" : "no");
51295804 }
....@@ -5145,6 +5820,7 @@
51455820 " min:%lukB"
51465821 " low:%lukB"
51475822 " high:%lukB"
5823
+ " reserved_highatomic:%luKB"
51485824 " active_anon:%lukB"
51495825 " inactive_anon:%lukB"
51505826 " active_file:%lukB"
....@@ -5154,10 +5830,6 @@
51545830 " present:%lukB"
51555831 " managed:%lukB"
51565832 " mlocked:%lukB"
5157
- " kernel_stack:%lukB"
5158
-#ifdef CONFIG_SHADOW_CALL_STACK
5159
- " shadow_call_stack:%lukB"
5160
-#endif
51615833 " pagetables:%lukB"
51625834 " bounce:%lukB"
51635835 " free_pcp:%lukB"
....@@ -5169,6 +5841,7 @@
51695841 K(min_wmark_pages(zone)),
51705842 K(low_wmark_pages(zone)),
51715843 K(high_wmark_pages(zone)),
5844
+ K(zone->nr_reserved_highatomic),
51725845 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
51735846 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
51745847 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5176,12 +5849,8 @@
51765849 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
51775850 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
51785851 K(zone->present_pages),
5179
- K(zone->managed_pages),
5852
+ K(zone_managed_pages(zone)),
51805853 K(zone_page_state(zone, NR_MLOCK)),
5181
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5182
-#ifdef CONFIG_SHADOW_CALL_STACK
5183
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5184
-#endif
51855854 K(zone_page_state(zone, NR_PAGETABLE)),
51865855 K(zone_page_state(zone, NR_BOUNCE)),
51875856 K(free_pcp),
....@@ -5213,7 +5882,7 @@
52135882
52145883 types[order] = 0;
52155884 for (type = 0; type < MIGRATE_TYPES; type++) {
5216
- if (!list_empty(&area->free_list[type]))
5885
+ if (!free_area_empty(area, type))
52175886 types[order] |= 1 << type;
52185887 }
52195888 }
....@@ -5254,7 +5923,7 @@
52545923 do {
52555924 zone_type--;
52565925 zone = pgdat->node_zones + zone_type;
5257
- if (managed_zone(zone)) {
5926
+ if (populated_zone(zone)) {
52585927 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
52595928 check_highest_zone(zone_type);
52605929 }
....@@ -5280,36 +5949,17 @@
52805949 return 0;
52815950 }
52825951
5283
-static __init int setup_numa_zonelist_order(char *s)
5284
-{
5285
- if (!s)
5286
- return 0;
5287
-
5288
- return __parse_numa_zonelist_order(s);
5289
-}
5290
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5291
-
52925952 char numa_zonelist_order[] = "Node";
52935953
52945954 /*
52955955 * sysctl handler for numa_zonelist_order
52965956 */
52975957 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5298
- void __user *buffer, size_t *length,
5299
- loff_t *ppos)
5958
+ void *buffer, size_t *length, loff_t *ppos)
53005959 {
5301
- char *str;
5302
- int ret;
5303
-
5304
- if (!write)
5305
- return proc_dostring(table, write, buffer, length, ppos);
5306
- str = memdup_user_nul(buffer, 16);
5307
- if (IS_ERR(str))
5308
- return PTR_ERR(str);
5309
-
5310
- ret = __parse_numa_zonelist_order(str);
5311
- kfree(str);
5312
- return ret;
5960
+ if (write)
5961
+ return __parse_numa_zonelist_order(buffer);
5962
+ return proc_dostring(table, write, buffer, length, ppos);
53135963 }
53145964
53155965
....@@ -5328,14 +5978,14 @@
53285978 * from each node to each node in the system), and should also prefer nodes
53295979 * with no CPUs, since presumably they'll have very little allocation pressure
53305980 * on them otherwise.
5331
- * It returns -1 if no node is found.
5981
+ *
5982
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
53325983 */
53335984 static int find_next_best_node(int node, nodemask_t *used_node_mask)
53345985 {
53355986 int n, val;
53365987 int min_val = INT_MAX;
53375988 int best_node = NUMA_NO_NODE;
5338
- const struct cpumask *tmp = cpumask_of_node(0);
53395989
53405990 /* Use the local node if we haven't already */
53415991 if (!node_isset(node, *used_node_mask)) {
....@@ -5356,8 +6006,7 @@
53566006 val += (n < node);
53576007
53586008 /* Give preference to headless and unused nodes */
5359
- tmp = cpumask_of_node(n);
5360
- if (!cpumask_empty(tmp))
6009
+ if (!cpumask_empty(cpumask_of_node(n)))
53616010 val += PENALTY_FOR_NODE_WITH_CPUS;
53626011
53636012 /* Slight preference for less loaded node */
....@@ -5428,14 +6077,13 @@
54286077 {
54296078 static int node_order[MAX_NUMNODES];
54306079 int node, load, nr_nodes = 0;
5431
- nodemask_t used_mask;
6080
+ nodemask_t used_mask = NODE_MASK_NONE;
54326081 int local_node, prev_node;
54336082
54346083 /* NUMA-aware ordering of nodes */
54356084 local_node = pgdat->node_id;
54366085 load = nr_online_nodes;
54376086 prev_node = local_node;
5438
- nodes_clear(used_mask);
54396087
54406088 memset(node_order, 0, sizeof(node_order));
54416089 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5542,9 +6190,22 @@
55426190 int nid;
55436191 int __maybe_unused cpu;
55446192 pg_data_t *self = data;
5545
- static DEFINE_SPINLOCK(lock);
6193
+ unsigned long flags;
55466194
5547
- spin_lock(&lock);
6195
+ /*
6196
+ * Explicitly disable this CPU's interrupts before taking seqlock
6197
+ * to prevent any IRQ handler from calling into the page allocator
6198
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
6199
+ */
6200
+ local_irq_save(flags);
6201
+ /*
6202
+ * Explicitly disable this CPU's synchronous printk() before taking
6203
+ * seqlock to prevent any printk() from trying to hold port->lock, for
6204
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
6205
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
6206
+ */
6207
+ printk_deferred_enter();
6208
+ write_seqlock(&zonelist_update_seq);
55486209
55496210 #ifdef CONFIG_NUMA
55506211 memset(node_load, 0, sizeof(node_load));
....@@ -5577,7 +6238,9 @@
55776238 #endif
55786239 }
55796240
5580
- spin_unlock(&lock);
6241
+ write_sequnlock(&zonelist_update_seq);
6242
+ printk_deferred_exit();
6243
+ local_irq_restore(flags);
55816244 }
55826245
55836246 static noinline void __init
....@@ -5615,13 +6278,16 @@
56156278 */
56166279 void __ref build_all_zonelists(pg_data_t *pgdat)
56176280 {
6281
+ unsigned long vm_total_pages;
6282
+
56186283 if (system_state == SYSTEM_BOOTING) {
56196284 build_all_zonelists_init();
56206285 } else {
56216286 __build_all_zonelists(pgdat);
56226287 /* cpuset refresh routine should be here */
56236288 }
5624
- vm_total_pages = nr_free_pagecache_pages();
6289
+ /* Get the number of free pages beyond high watermark in all zones. */
6290
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
56256291 /*
56266292 * Disable grouping by mobility if the number of pages in the
56276293 * system is too low to allow the mechanism to work. It would be
....@@ -5634,7 +6300,7 @@
56346300 else
56356301 page_group_by_mobility_disabled = 0;
56366302
5637
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6303
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
56386304 nr_online_nodes,
56396305 page_group_by_mobility_disabled ? "off" : "on",
56406306 vm_total_pages);
....@@ -5643,81 +6309,148 @@
56436309 #endif
56446310 }
56456311
6312
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6313
+static bool __meminit
6314
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6315
+{
6316
+ static struct memblock_region *r;
6317
+
6318
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6319
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6320
+ for_each_mem_region(r) {
6321
+ if (*pfn < memblock_region_memory_end_pfn(r))
6322
+ break;
6323
+ }
6324
+ }
6325
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6326
+ memblock_is_mirror(r)) {
6327
+ *pfn = memblock_region_memory_end_pfn(r);
6328
+ return true;
6329
+ }
6330
+ }
6331
+ return false;
6332
+}
6333
+
56466334 /*
56476335 * Initially all pages are reserved - free ones are freed
5648
- * up by free_all_bootmem() once the early boot process is
6336
+ * up by memblock_free_all() once the early boot process is
56496337 * done. Non-atomic initialization, single-pass.
6338
+ *
6339
+ * All aligned pageblocks are initialized to the specified migratetype
6340
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6341
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
56506342 */
56516343 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5652
- unsigned long start_pfn, enum meminit_context context,
5653
- struct vmem_altmap *altmap)
6344
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6345
+ enum meminit_context context,
6346
+ struct vmem_altmap *altmap, int migratetype)
56546347 {
5655
- unsigned long end_pfn = start_pfn + size;
5656
- pg_data_t *pgdat = NODE_DATA(nid);
5657
- unsigned long pfn;
5658
- unsigned long nr_initialised = 0;
6348
+ unsigned long pfn, end_pfn = start_pfn + size;
56596349 struct page *page;
5660
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5661
- struct memblock_region *r = NULL, *tmp;
5662
-#endif
56636350
56646351 if (highest_memmap_pfn < end_pfn - 1)
56656352 highest_memmap_pfn = end_pfn - 1;
6353
+
6354
+#ifdef CONFIG_ZONE_DEVICE
6355
+ /*
6356
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6357
+ * memory. We limit the total number of pages to initialize to just
6358
+ * those that might contain the memory mapping. We will defer the
6359
+ * ZONE_DEVICE page initialization until after we have released
6360
+ * the hotplug lock.
6361
+ */
6362
+ if (zone == ZONE_DEVICE) {
6363
+ if (!altmap)
6364
+ return;
6365
+
6366
+ if (start_pfn == altmap->base_pfn)
6367
+ start_pfn += altmap->reserve;
6368
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6369
+ }
6370
+#endif
56666371
56676372 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
56686373 /* Zero all page struct in advance */
56696374 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
56706375 #endif
56716376
5672
- /*
5673
- * Honor reservation requested by the driver for this ZONE_DEVICE
5674
- * memory
5675
- */
5676
- if (altmap && start_pfn == altmap->base_pfn)
5677
- start_pfn += altmap->reserve;
5678
-
5679
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6377
+ for (pfn = start_pfn; pfn < end_pfn; ) {
56806378 /*
56816379 * There can be holes in boot-time mem_map[]s handed to this
56826380 * function. They do not exist on hotplugged memory.
56836381 */
5684
- if (context != MEMINIT_EARLY)
5685
- goto not_early;
5686
-
5687
- if (!early_pfn_valid(pfn))
5688
- continue;
5689
- if (!early_pfn_in_nid(pfn, nid))
5690
- continue;
5691
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5692
- break;
5693
-
5694
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5695
- /*
5696
- * Check given memblock attribute by firmware which can affect
5697
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5698
- * mirrored, it's an overlapped memmap init. skip it.
5699
- */
5700
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5701
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5702
- for_each_memblock(memory, tmp)
5703
- if (pfn < memblock_region_memory_end_pfn(tmp))
5704
- break;
5705
- r = tmp;
5706
- }
5707
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5708
- memblock_is_mirror(r)) {
5709
- /* already initialized as NORMAL */
5710
- pfn = memblock_region_memory_end_pfn(r);
6382
+ if (context == MEMINIT_EARLY) {
6383
+ if (overlap_memmap_init(zone, &pfn))
57116384 continue;
5712
- }
6385
+ if (defer_init(nid, pfn, zone_end_pfn))
6386
+ break;
57136387 }
5714
-#endif
57156388
5716
-not_early:
57176389 page = pfn_to_page(pfn);
57186390 __init_single_page(page, pfn, zone, nid, false);
57196391 if (context == MEMINIT_HOTPLUG)
5720
- SetPageReserved(page);
6392
+ __SetPageReserved(page);
6393
+
6394
+ /*
6395
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6396
+ * such that unmovable allocations won't be scattered all
6397
+ * over the place during system boot.
6398
+ */
6399
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6400
+ set_pageblock_migratetype(page, migratetype);
6401
+ cond_resched();
6402
+ }
6403
+ pfn++;
6404
+ }
6405
+}
6406
+
6407
+#ifdef CONFIG_ZONE_DEVICE
6408
+void __ref memmap_init_zone_device(struct zone *zone,
6409
+ unsigned long start_pfn,
6410
+ unsigned long nr_pages,
6411
+ struct dev_pagemap *pgmap)
6412
+{
6413
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6414
+ struct pglist_data *pgdat = zone->zone_pgdat;
6415
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6416
+ unsigned long zone_idx = zone_idx(zone);
6417
+ unsigned long start = jiffies;
6418
+ int nid = pgdat->node_id;
6419
+
6420
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6421
+ return;
6422
+
6423
+ /*
6424
+ * The call to memmap_init should have already taken care
6425
+ * of the pages reserved for the memmap, so we can just jump to
6426
+ * the end of that region and start processing the device pages.
6427
+ */
6428
+ if (altmap) {
6429
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6430
+ nr_pages = end_pfn - start_pfn;
6431
+ }
6432
+
6433
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6434
+ struct page *page = pfn_to_page(pfn);
6435
+
6436
+ __init_single_page(page, pfn, zone_idx, nid, true);
6437
+
6438
+ /*
6439
+ * Mark page reserved as it will need to wait for onlining
6440
+ * phase for it to be fully associated with a zone.
6441
+ *
6442
+ * We can use the non-atomic __set_bit operation for setting
6443
+ * the flag as we are still initializing the pages.
6444
+ */
6445
+ __SetPageReserved(page);
6446
+
6447
+ /*
6448
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6449
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6450
+ * ever freed or placed on a driver-private list.
6451
+ */
6452
+ page->pgmap = pgmap;
6453
+ page->zone_device_data = NULL;
57216454
57226455 /*
57236456 * Mark the block movable so that blocks are reserved for
....@@ -5726,21 +6459,20 @@
57266459 * the address space during boot when many long-lived
57276460 * kernel allocations are made.
57286461 *
5729
- * bitmap is created for zone's valid pfn range. but memmap
5730
- * can be created for invalid pages (for alignment)
5731
- * check here not to call set_pageblock_migratetype() against
5732
- * pfn out of zone.
5733
- *
57346462 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5735
- * because this is done early in sparse_add_one_section
6463
+ * because this is done early in section_activate()
57366464 */
5737
- if (!(pfn & (pageblock_nr_pages - 1))) {
6465
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
57386466 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
57396467 cond_resched();
57406468 }
57416469 }
6470
+
6471
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6472
+ nr_pages, jiffies_to_msecs(jiffies - start));
57426473 }
57436474
6475
+#endif
57446476 static void __meminit zone_init_free_lists(struct zone *zone)
57456477 {
57466478 unsigned int order, t;
....@@ -5750,11 +6482,118 @@
57506482 }
57516483 }
57526484
5753
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5754
-#define memmap_init(size, nid, zone, start_pfn) \
5755
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5756
- MEMINIT_EARLY, NULL)
6485
+/*
6486
+ * Only struct pages that correspond to ranges defined by memblock.memory
6487
+ * are zeroed and initialized by going through __init_single_page() during
6488
+ * memmap_init_zone_range().
6489
+ *
6490
+ * But, there could be struct pages that correspond to holes in
6491
+ * memblock.memory. This can happen because of the following reasons:
6492
+ * - physical memory bank size is not necessarily the exact multiple of the
6493
+ * arbitrary section size
6494
+ * - early reserved memory may not be listed in memblock.memory
6495
+ * - memory layouts defined with memmap= kernel parameter may not align
6496
+ * nicely with memmap sections
6497
+ *
6498
+ * Explicitly initialize those struct pages so that:
6499
+ * - PG_Reserved is set
6500
+ * - zone and node links point to zone and node that span the page if the
6501
+ * hole is in the middle of a zone
6502
+ * - zone and node links point to adjacent zone/node if the hole falls on
6503
+ * the zone boundary; the pages in such holes will be prepended to the
6504
+ * zone/node above the hole except for the trailing pages in the last
6505
+ * section that will be appended to the zone/node below.
6506
+ */
6507
+static void __init init_unavailable_range(unsigned long spfn,
6508
+ unsigned long epfn,
6509
+ int zone, int node)
6510
+{
6511
+ unsigned long pfn;
6512
+ u64 pgcnt = 0;
6513
+
6514
+ for (pfn = spfn; pfn < epfn; pfn++) {
6515
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6516
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6517
+ + pageblock_nr_pages - 1;
6518
+ continue;
6519
+ }
6520
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6521
+ __SetPageReserved(pfn_to_page(pfn));
6522
+ pgcnt++;
6523
+ }
6524
+
6525
+ if (pgcnt)
6526
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6527
+ node, zone_names[zone], pgcnt);
6528
+}
6529
+
6530
+static void __init memmap_init_zone_range(struct zone *zone,
6531
+ unsigned long start_pfn,
6532
+ unsigned long end_pfn,
6533
+ unsigned long *hole_pfn)
6534
+{
6535
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6536
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6537
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6538
+
6539
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6540
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6541
+
6542
+ if (start_pfn >= end_pfn)
6543
+ return;
6544
+
6545
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6546
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6547
+
6548
+ if (*hole_pfn < start_pfn)
6549
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6550
+
6551
+ *hole_pfn = end_pfn;
6552
+}
6553
+
6554
+void __init __weak memmap_init(void)
6555
+{
6556
+ unsigned long start_pfn, end_pfn;
6557
+ unsigned long hole_pfn = 0;
6558
+ int i, j, zone_id, nid;
6559
+
6560
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6561
+ struct pglist_data *node = NODE_DATA(nid);
6562
+
6563
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6564
+ struct zone *zone = node->node_zones + j;
6565
+
6566
+ if (!populated_zone(zone))
6567
+ continue;
6568
+
6569
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6570
+ &hole_pfn);
6571
+ zone_id = j;
6572
+ }
6573
+ }
6574
+
6575
+#ifdef CONFIG_SPARSEMEM
6576
+ /*
6577
+ * Initialize the memory map for hole in the range [memory_end,
6578
+ * section_end].
6579
+ * Append the pages in this hole to the highest zone in the last
6580
+ * node.
6581
+ * The call to init_unavailable_range() is outside the ifdef to
6582
+ * silence the compiler warining about zone_id set but not used;
6583
+ * for FLATMEM it is a nop anyway
6584
+ */
6585
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6586
+ if (hole_pfn < end_pfn)
57576587 #endif
6588
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6589
+}
6590
+
6591
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6592
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6593
+ unsigned long zone,
6594
+ unsigned long range_start_pfn)
6595
+{
6596
+}
57586597
57596598 static int zone_batchsize(struct zone *zone)
57606599 {
....@@ -5765,7 +6604,7 @@
57656604 * The per-cpu-pages pools are set to around 1000th of the
57666605 * size of the zone.
57676606 */
5768
- batch = zone->managed_pages / 1024;
6607
+ batch = zone_managed_pages(zone) / 1024;
57696608 /* But no more than a meg. */
57706609 if (batch * PAGE_SIZE > 1024 * 1024)
57716610 batch = (1024 * 1024) / PAGE_SIZE;
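As a worked example of the two lines above, assuming 4 KiB pages: a zone with 4 GiB managed is 1,048,576 pages, so batch starts at 1,048,576 / 1024 = 1024 pages; 1024 pages is 4 MiB, which exceeds the 1 MiB cap, so batch is clamped to (1024 * 1024) / 4096 = 256 pages. The remainder of zone_batchsize(), not shown in this hunk, adjusts the value further.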
....@@ -5812,7 +6651,7 @@
58126651 * locking.
58136652 *
58146653 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5815
- * those fields changing asynchronously (acording the the above rule).
6654
+ * those fields changing asynchronously (according to the above rule).
58166655 *
58176656 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
58186657 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5821,6 +6660,7 @@
58216660 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
58226661 unsigned long batch)
58236662 {
6663
+ trace_android_vh_pageset_update(&high, &batch);
58246664 /* start with a fail safe value for batch */
58256665 pcp->batch = 1;
58266666 smp_wmb();
....@@ -5846,7 +6686,6 @@
58466686 memset(p, 0, sizeof(*p));
58476687
58486688 pcp = &p->pcp;
5849
- pcp->count = 0;
58506689 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
58516690 INIT_LIST_HEAD(&pcp->lists[migratetype]);
58526691 }
....@@ -5876,7 +6715,7 @@
58766715 {
58776716 if (percpu_pagelist_fraction)
58786717 pageset_set_high(pcp,
5879
- (zone->managed_pages /
6718
+ (zone_managed_pages(zone) /
58806719 percpu_pagelist_fraction));
58816720 else
58826721 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5906,9 +6745,24 @@
59066745 {
59076746 struct pglist_data *pgdat;
59086747 struct zone *zone;
6748
+ int __maybe_unused cpu;
59096749
59106750 for_each_populated_zone(zone)
59116751 setup_zone_pageset(zone);
6752
+
6753
+#ifdef CONFIG_NUMA
6754
+ /*
6755
+ * Unpopulated zones continue using the boot pagesets.
6756
+ * The numa stats for these pagesets need to be reset.
6757
+ * Otherwise, they will end up skewing the stats of
6758
+ * the nodes these zones are associated with.
6759
+ */
6760
+ for_each_possible_cpu(cpu) {
6761
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6762
+ memset(pcp->vm_numa_stat_diff, 0,
6763
+ sizeof(pcp->vm_numa_stat_diff));
6764
+ }
6765
+#endif
59126766
59136767 for_each_online_pgdat(pgdat)
59146768 pgdat->per_cpu_nodestats =
....@@ -5952,73 +6806,6 @@
59526806 zone->initialized = 1;
59536807 }
59546808
5955
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5956
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5957
-
5958
-/*
5959
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5960
- */
5961
-int __meminit __early_pfn_to_nid(unsigned long pfn,
5962
- struct mminit_pfnnid_cache *state)
5963
-{
5964
- unsigned long start_pfn, end_pfn;
5965
- int nid;
5966
-
5967
- if (state->last_start <= pfn && pfn < state->last_end)
5968
- return state->last_nid;
5969
-
5970
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5971
- if (nid != -1) {
5972
- state->last_start = start_pfn;
5973
- state->last_end = end_pfn;
5974
- state->last_nid = nid;
5975
- }
5976
-
5977
- return nid;
5978
-}
5979
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5980
-
5981
-/**
5982
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5983
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5984
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5985
- *
5986
- * If an architecture guarantees that all ranges registered contain no holes
5987
- * and may be freed, this this function may be used instead of calling
5988
- * memblock_free_early_nid() manually.
5989
- */
5990
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5991
-{
5992
- unsigned long start_pfn, end_pfn;
5993
- int i, this_nid;
5994
-
5995
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5996
- start_pfn = min(start_pfn, max_low_pfn);
5997
- end_pfn = min(end_pfn, max_low_pfn);
5998
-
5999
- if (start_pfn < end_pfn)
6000
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6001
- (end_pfn - start_pfn) << PAGE_SHIFT,
6002
- this_nid);
6003
- }
6004
-}
6005
-
6006
-/**
6007
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6008
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6009
- *
6010
- * If an architecture guarantees that all ranges registered contain no holes and may
6011
- * be freed, this function may be used instead of calling memory_present() manually.
6012
- */
6013
-void __init sparse_memory_present_with_active_regions(int nid)
6014
-{
6015
- unsigned long start_pfn, end_pfn;
6016
- int i, this_nid;
6017
-
6018
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6019
- memory_present(this_nid, start_pfn, end_pfn);
6020
-}
6021
-
60226809 /**
60236810 * get_pfn_range_for_nid - Return the start and end page frames for a node
60246811 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6030,7 +6817,7 @@
60306817 * with no available memory, a warning is printed and the start and end
60316818 * PFNs will be 0.
60326819 */
6033
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6820
+void __init get_pfn_range_for_nid(unsigned int nid,
60346821 unsigned long *start_pfn, unsigned long *end_pfn)
60356822 {
60366823 unsigned long this_start_pfn, this_end_pfn;
....@@ -6079,7 +6866,7 @@
60796866 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
60806867 * zones within a node are in order of monotonically increasing memory addresses
60816868 */
6082
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6869
+static void __init adjust_zone_range_for_zone_movable(int nid,
60836870 unsigned long zone_type,
60846871 unsigned long node_start_pfn,
60856872 unsigned long node_end_pfn,
....@@ -6110,13 +6897,12 @@
61106897 * Return the number of pages a zone spans in a node, including holes
61116898 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61126899 */
6113
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6900
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61146901 unsigned long zone_type,
61156902 unsigned long node_start_pfn,
61166903 unsigned long node_end_pfn,
61176904 unsigned long *zone_start_pfn,
6118
- unsigned long *zone_end_pfn,
6119
- unsigned long *ignored)
6905
+ unsigned long *zone_end_pfn)
61206906 {
61216907 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61226908 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6147,7 +6933,7 @@
61476933 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
61486934 * then all holes in the requested range will be accounted for.
61496935 */
6150
-unsigned long __meminit __absent_pages_in_range(int nid,
6936
+unsigned long __init __absent_pages_in_range(int nid,
61516937 unsigned long range_start_pfn,
61526938 unsigned long range_end_pfn)
61536939 {
....@@ -6168,7 +6954,7 @@
61686954 * @start_pfn: The start PFN to start searching for holes
61696955 * @end_pfn: The end PFN to stop searching for holes
61706956 *
6171
- * It returns the number of pages frames in memory holes within a range.
6957
+ * Return: the number of page frames in memory holes within a range.
61726958 */
61736959 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
61746960 unsigned long end_pfn)
....@@ -6177,11 +6963,10 @@
61776963 }
61786964
61796965 /* Return the number of page frames in holes in a zone on a node */
6180
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6966
+static unsigned long __init zone_absent_pages_in_node(int nid,
61816967 unsigned long zone_type,
61826968 unsigned long node_start_pfn,
6183
- unsigned long node_end_pfn,
6184
- unsigned long *ignored)
6969
+ unsigned long node_end_pfn)
61856970 {
61866971 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61876972 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6209,7 +6994,7 @@
62096994 unsigned long start_pfn, end_pfn;
62106995 struct memblock_region *r;
62116996
6212
- for_each_memblock(memory, r) {
6997
+ for_each_mem_region(r) {
62136998 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62146999 zone_start_pfn, zone_end_pfn);
62157000 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6228,45 +7013,9 @@
62287013 return nr_absent;
62297014 }
62307015
6231
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6232
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6233
- unsigned long zone_type,
6234
- unsigned long node_start_pfn,
6235
- unsigned long node_end_pfn,
6236
- unsigned long *zone_start_pfn,
6237
- unsigned long *zone_end_pfn,
6238
- unsigned long *zones_size)
6239
-{
6240
- unsigned int zone;
6241
-
6242
- *zone_start_pfn = node_start_pfn;
6243
- for (zone = 0; zone < zone_type; zone++)
6244
- *zone_start_pfn += zones_size[zone];
6245
-
6246
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6247
-
6248
- return zones_size[zone_type];
6249
-}
6250
-
6251
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6252
- unsigned long zone_type,
7016
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
62537017 unsigned long node_start_pfn,
6254
- unsigned long node_end_pfn,
6255
- unsigned long *zholes_size)
6256
-{
6257
- if (!zholes_size)
6258
- return 0;
6259
-
6260
- return zholes_size[zone_type];
6261
-}
6262
-
6263
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6264
-
6265
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6266
- unsigned long node_start_pfn,
6267
- unsigned long node_end_pfn,
6268
- unsigned long *zones_size,
6269
- unsigned long *zholes_size)
7018
+ unsigned long node_end_pfn)
62707019 {
62717020 unsigned long realtotalpages = 0, totalpages = 0;
62727021 enum zone_type i;
....@@ -6274,17 +7023,21 @@
62747023 for (i = 0; i < MAX_NR_ZONES; i++) {
62757024 struct zone *zone = pgdat->node_zones + i;
62767025 unsigned long zone_start_pfn, zone_end_pfn;
7026
+ unsigned long spanned, absent;
62777027 unsigned long size, real_size;
62787028
6279
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6280
- node_start_pfn,
6281
- node_end_pfn,
6282
- &zone_start_pfn,
6283
- &zone_end_pfn,
6284
- zones_size);
6285
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6286
- node_start_pfn, node_end_pfn,
6287
- zholes_size);
7029
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7030
+ node_start_pfn,
7031
+ node_end_pfn,
7032
+ &zone_start_pfn,
7033
+ &zone_end_pfn);
7034
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7035
+ node_start_pfn,
7036
+ node_end_pfn);
7037
+
7038
+ size = spanned;
7039
+ real_size = size - absent;
7040
+
62887041 if (size)
62897042 zone->zone_start_pfn = zone_start_pfn;
62907043 else
....@@ -6330,10 +7083,14 @@
63307083 {
63317084 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
63327085 zone->pageblock_flags = NULL;
6333
- if (usemapsize)
7086
+ if (usemapsize) {
63347087 zone->pageblock_flags =
6335
- memblock_virt_alloc_node_nopanic(usemapsize,
6336
- pgdat->node_id);
7088
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7089
+ pgdat->node_id);
7090
+ if (!zone->pageblock_flags)
7091
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7092
+ usemapsize, zone->name, pgdat->node_id);
7093
+ }
63377094 }
63387095 #else
63397096 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6400,9 +7157,11 @@
64007157 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64017158 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64027159 {
6403
- spin_lock_init(&pgdat->split_queue_lock);
6404
- INIT_LIST_HEAD(&pgdat->split_queue);
6405
- pgdat->split_queue_len = 0;
7160
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7161
+
7162
+ spin_lock_init(&ds_queue->split_queue_lock);
7163
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7164
+ ds_queue->split_queue_len = 0;
64067165 }
64077166 #else
64087167 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6429,13 +7188,13 @@
64297188
64307189 pgdat_page_ext_init(pgdat);
64317190 spin_lock_init(&pgdat->lru_lock);
6432
- lruvec_init(node_lruvec(pgdat));
7191
+ lruvec_init(&pgdat->__lruvec);
64337192 }
64347193
64357194 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
64367195 unsigned long remaining_pages)
64377196 {
6438
- zone->managed_pages = remaining_pages;
7197
+ atomic_long_set(&zone->managed_pages, remaining_pages);
64397198 zone_set_nid(zone, nid);
64407199 zone->name = zone_names[idx];
64417200 zone->zone_pgdat = NODE_DATA(nid);
....@@ -6533,7 +7292,7 @@
65337292 set_pageblock_order();
65347293 setup_usemap(pgdat, zone, zone_start_pfn, size);
65357294 init_currently_empty_zone(zone, zone_start_pfn, size);
6536
- memmap_init(size, nid, j, zone_start_pfn);
7295
+ arch_memmap_init(size, nid, j, zone_start_pfn);
65377296 }
65387297 }
65397298
....@@ -6562,7 +7321,11 @@
65627321 end = pgdat_end_pfn(pgdat);
65637322 end = ALIGN(end, MAX_ORDER_NR_PAGES);
65647323 size = (end - start) * sizeof(struct page);
6565
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7324
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7325
+ pgdat->node_id);
7326
+ if (!map)
7327
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7328
+ size, pgdat->node_id);
65667329 pgdat->node_mem_map = map + offset;
65677330 }
65687331 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6574,10 +7337,8 @@
65747337 */
65757338 if (pgdat == NODE_DATA(0)) {
65767339 mem_map = NODE_DATA(0)->node_mem_map;
6577
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
65787340 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
65797341 mem_map -= offset;
6580
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
65817342 }
65827343 #endif
65837344 }
....@@ -6588,42 +7349,31 @@
65887349 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
65897350 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
65907351 {
6591
- /*
6592
- * We start only with one section of pages, more pages are added as
6593
- * needed until the rest of deferred pages are initialized.
6594
- */
6595
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6596
- pgdat->node_spanned_pages);
65977352 pgdat->first_deferred_pfn = ULONG_MAX;
65987353 }
65997354 #else
66007355 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66017356 #endif
66027357
6603
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6604
- unsigned long node_start_pfn,
6605
- unsigned long *zholes_size)
7358
+static void __init free_area_init_node(int nid)
66067359 {
66077360 pg_data_t *pgdat = NODE_DATA(nid);
66087361 unsigned long start_pfn = 0;
66097362 unsigned long end_pfn = 0;
66107363
66117364 /* pg_data_t should be reset to zero when it's allocated */
6612
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7365
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7366
+
7367
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66137368
66147369 pgdat->node_id = nid;
6615
- pgdat->node_start_pfn = node_start_pfn;
7370
+ pgdat->node_start_pfn = start_pfn;
66167371 pgdat->per_cpu_nodestats = NULL;
6617
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6618
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7372
+
66197373 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
66207374 (u64)start_pfn << PAGE_SHIFT,
66217375 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6622
-#else
6623
- start_pfn = node_start_pfn;
6624
-#endif
6625
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6626
- zones_size, zholes_size);
7376
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
66277377
66287378 alloc_node_mem_map(pgdat);
66297379 pgdat_set_deferred_range(pgdat);
....@@ -6631,80 +7381,10 @@
66317381 free_area_init_core(pgdat);
66327382 }
66337383
6634
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6635
-
6636
-/*
6637
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6638
- * pages zeroed
6639
- */
6640
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7384
+void __init free_area_init_memoryless_node(int nid)
66417385 {
6642
- unsigned long pfn;
6643
- u64 pgcnt = 0;
6644
-
6645
- for (pfn = spfn; pfn < epfn; pfn++) {
6646
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6647
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6648
- + pageblock_nr_pages - 1;
6649
- continue;
6650
- }
6651
- mm_zero_struct_page(pfn_to_page(pfn));
6652
- pgcnt++;
6653
- }
6654
-
6655
- return pgcnt;
7386
+ free_area_init_node(nid);
66567387 }
6657
-
6658
-/*
6659
- * Only struct pages that are backed by physical memory are zeroed and
6660
- * initialized by going through __init_single_page(). But, there are some
6661
- * struct pages which are reserved in memblock allocator and their fields
6662
- * may be accessed (for example page_to_pfn() on some configuration accesses
6663
- * flags). We must explicitly zero those struct pages.
6664
- *
6665
- * This function also addresses a similar issue where struct pages are left
6666
- * uninitialized because the physical address range is not covered by
6667
- * memblock.memory or memblock.reserved. That could happen when memblock
6668
- * layout is manually configured via memmap=, or when the highest physical
6669
- * address (max_pfn) does not end on a section boundary.
6670
- */
6671
-void __init zero_resv_unavail(void)
6672
-{
6673
- phys_addr_t start, end;
6674
- u64 i, pgcnt;
6675
- phys_addr_t next = 0;
6676
-
6677
- /*
6678
- * Loop through unavailable ranges not covered by memblock.memory.
6679
- */
6680
- pgcnt = 0;
6681
- for_each_mem_range(i, &memblock.memory, NULL,
6682
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6683
- if (next < start)
6684
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6685
- next = end;
6686
- }
6687
-
6688
- /*
6689
- * Early sections always have a fully populated memmap for the whole
6690
- * section - see pfn_valid(). If the last section has holes at the
6691
- * end and that section is marked "online", the memmap will be
6692
- * considered initialized. Make sure that memmap has a well defined
6693
- * state.
6694
- */
6695
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6696
- round_up(max_pfn, PAGES_PER_SECTION));
6697
-
6698
- /*
6699
- * Struct pages that do not have backing memory. This could be because
6700
- * firmware is using some of this memory, or for some other reasons.
6701
- */
6702
- if (pgcnt)
6703
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6704
-}
6705
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6706
-
6707
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67087388
67097389 #if MAX_NUMNODES > 1
67107390 /*
....@@ -6735,14 +7415,14 @@
67357415 * model has fine enough granularity to avoid incorrect mapping for the
67367416 * populated node map.
67377417 *
6738
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7418
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
67397419 * requirement (single node).
67407420 */
67417421 unsigned long __init node_map_pfn_alignment(void)
67427422 {
67437423 unsigned long accl_mask = 0, last_end = 0;
67447424 unsigned long start, end, mask;
6745
- int last_nid = -1;
7425
+ int last_nid = NUMA_NO_NODE;
67467426 int i, nid;
67477427
67487428 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6769,33 +7449,15 @@
67697449 return ~accl_mask + 1;
67707450 }
67717451
6772
-/* Find the lowest pfn for a node */
6773
-static unsigned long __init find_min_pfn_for_node(int nid)
6774
-{
6775
- unsigned long min_pfn = ULONG_MAX;
6776
- unsigned long start_pfn;
6777
- int i;
6778
-
6779
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6780
- min_pfn = min(min_pfn, start_pfn);
6781
-
6782
- if (min_pfn == ULONG_MAX) {
6783
- pr_warn("Could not find start_pfn for node %d\n", nid);
6784
- return 0;
6785
- }
6786
-
6787
- return min_pfn;
6788
-}
6789
-
67907452 /**
67917453 * find_min_pfn_with_active_regions - Find the minimum PFN registered
67927454 *
6793
- * It returns the minimum PFN based on information provided via
7455
+ * Return: the minimum PFN based on information provided via
67947456 * memblock_set_node().
67957457 */
67967458 unsigned long __init find_min_pfn_with_active_regions(void)
67977459 {
6798
- return find_min_pfn_for_node(MAX_NUMNODES);
7460
+ return PHYS_PFN(memblock_start_of_DRAM());
67997461 }
68007462
68017463 /*
....@@ -6844,11 +7506,11 @@
68447506 * options.
68457507 */
68467508 if (movable_node_is_enabled()) {
6847
- for_each_memblock(memory, r) {
7509
+ for_each_mem_region(r) {
68487510 if (!memblock_is_hotpluggable(r))
68497511 continue;
68507512
6851
- nid = r->nid;
7513
+ nid = memblock_get_region_node(r);
68527514
68537515 usable_startpfn = PFN_DOWN(r->base);
68547516 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6865,11 +7527,11 @@
68657527 if (mirrored_kernelcore) {
68667528 bool mem_below_4gb_not_mirrored = false;
68677529
6868
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
68697531 if (memblock_is_mirror(r))
68707532 continue;
68717533
6872
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
68737535
68747536 usable_startpfn = memblock_region_memory_base_pfn(r);
68757537
....@@ -6884,7 +7546,7 @@
68847546 }
68857547
68867548 if (mem_below_4gb_not_mirrored)
6887
- pr_warn("This configuration results in unmirrored kernel memory.");
7549
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
68887550
68897551 goto out2;
68907552 }
....@@ -7023,9 +7685,16 @@
70237685
70247686 out2:
70257687 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7026
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7688
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7689
+ unsigned long start_pfn, end_pfn;
7690
+
70277691 zone_movable_pfn[nid] =
70287692 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7693
+
7694
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
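+ /* Rounding can push the start of ZONE_MOVABLE past the node's last pfn; such a node gets no movable zone. */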
7695
+ if (zone_movable_pfn[nid] >= end_pfn)
7696
+ zone_movable_pfn[nid] = 0;
7697
+ }
70297698
70307699 out:
70317700 /* restore the node_state */
....@@ -7037,23 +7706,29 @@
70377706 {
70387707 enum zone_type zone_type;
70397708
7040
- if (N_MEMORY == N_NORMAL_MEMORY)
7041
- return;
7042
-
70437709 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
70447710 struct zone *zone = &pgdat->node_zones[zone_type];
70457711 if (populated_zone(zone)) {
7046
- node_set_state(nid, N_HIGH_MEMORY);
7047
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7048
- zone_type <= ZONE_NORMAL)
7712
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7713
+ node_set_state(nid, N_HIGH_MEMORY);
7714
+ if (zone_type <= ZONE_NORMAL)
70497715 node_set_state(nid, N_NORMAL_MEMORY);
70507716 break;
70517717 }
70527718 }
70537719 }
70547720
7721
+/*
7722
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7723
+ * such cases we allow max_zone_pfn to be sorted in descending order.
7724
+ */
7725
+bool __weak arch_has_descending_max_zone_pfns(void)
7726
+{
7727
+ return false;
7728
+}
7729
+
70557730 /**
7056
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7731
+ * free_area_init - Initialise all pg_data_t and zone data
70577732 * @max_zone_pfn: an array of max PFNs for each zone
70587733 *
70597734 * This will call free_area_init_node() for each active node in the system.
....@@ -7065,10 +7740,11 @@
70657740 * starts where the previous one ended. For example, ZONE_DMA32 starts
70667741 * at arch_max_dma_pfn.
70677742 */
7068
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7743
+void __init free_area_init(unsigned long *max_zone_pfn)
70697744 {
70707745 unsigned long start_pfn, end_pfn;
7071
- int i, nid;
7746
+ int i, nid, zone;
7747
+ bool descending;
70727748
70737749 /* Record where the zone boundaries are */
70747750 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7077,14 +7753,20 @@
70777753 sizeof(arch_zone_highest_possible_pfn));
70787754
70797755 start_pfn = find_min_pfn_with_active_regions();
7756
+ descending = arch_has_descending_max_zone_pfns();
70807757
70817758 for (i = 0; i < MAX_NR_ZONES; i++) {
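+ /*
+ * With descending max_zone_pfns the zone index is walked from highest
+ * to lowest, so the pfn boundaries still stack upward from start_pfn.
+ */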
7082
- if (i == ZONE_MOVABLE)
7759
+ if (descending)
7760
+ zone = MAX_NR_ZONES - i - 1;
7761
+ else
7762
+ zone = i;
7763
+
7764
+ if (zone == ZONE_MOVABLE)
70837765 continue;
70847766
7085
- end_pfn = max(max_zone_pfn[i], start_pfn);
7086
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7087
- arch_zone_highest_possible_pfn[i] = end_pfn;
7767
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7768
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7769
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
70887770
70897771 start_pfn = end_pfn;
70907772 }
....@@ -7118,27 +7800,33 @@
71187800 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
71197801 }
71207802
7121
- /* Print out the early node map */
7803
+ /*
7804
+ * Print out the early node map, and initialize the
7805
+ * subsection-map relative to active online memory ranges to
7806
+ * enable future "sub-section" extensions of the memory map.
7807
+ */
71227808 pr_info("Early memory node ranges\n");
7123
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7809
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
71247810 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
71257811 (u64)start_pfn << PAGE_SHIFT,
71267812 ((u64)end_pfn << PAGE_SHIFT) - 1);
7813
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7814
+ }
71277815
71287816 /* Initialise every node */
71297817 mminit_verify_pageflags_layout();
71307818 setup_nr_node_ids();
7131
- zero_resv_unavail();
71327819 for_each_online_node(nid) {
71337820 pg_data_t *pgdat = NODE_DATA(nid);
7134
- free_area_init_node(nid, NULL,
7135
- find_min_pfn_for_node(nid), NULL);
7821
+ free_area_init_node(nid);
71367822
71377823 /* Any memory on that node */
71387824 if (pgdat->node_present_pages)
71397825 node_set_state(nid, N_MEMORY);
71407826 check_for_memory(pgdat, nid);
71417827 }
7828
+
7829
+ memmap_init();
71427830 }
71437831
71447832 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7197,22 +7885,18 @@
71977885 early_param("kernelcore", cmdline_parse_kernelcore);
71987886 early_param("movablecore", cmdline_parse_movablecore);
71997887
7200
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7201
-
72027888 void adjust_managed_page_count(struct page *page, long count)
72037889 {
7204
- spin_lock(&managed_page_count_lock);
7205
- page_zone(page)->managed_pages += count;
7206
- totalram_pages += count;
7890
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7891
+ totalram_pages_add(count);
72077892 #ifdef CONFIG_HIGHMEM
72087893 if (PageHighMem(page))
7209
- totalhigh_pages += count;
7894
+ totalhigh_pages_add(count);
72107895 #endif
7211
- spin_unlock(&managed_page_count_lock);
72127896 }
72137897 EXPORT_SYMBOL(adjust_managed_page_count);
72147898
7215
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7899
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
72167900 {
72177901 void *pos;
72187902 unsigned long pages = 0;
....@@ -7231,6 +7915,11 @@
72317915 * alias for the memset().
72327916 */
72337917 direct_map_addr = page_address(page);
7918
+ /*
7919
+ * Perform a kasan-unchecked memset() since this memory
7920
+ * has not been initialized.
7921
+ */
7922
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
72347923 if ((unsigned int)poison <= 0xFF)
72357924 memset(direct_map_addr, poison, PAGE_SIZE);
72367925
....@@ -7243,15 +7932,14 @@
72437932
72447933 return pages;
72457934 }
7246
-EXPORT_SYMBOL(free_reserved_area);
72477935
72487936 #ifdef CONFIG_HIGHMEM
72497937 void free_highmem_page(struct page *page)
72507938 {
72517939 __free_reserved_page(page);
7252
- totalram_pages++;
7253
- page_zone(page)->managed_pages++;
7254
- totalhigh_pages++;
7940
+ totalram_pages_inc();
7941
+ atomic_long_inc(&page_zone(page)->managed_pages);
7942
+ totalhigh_pages_inc();
72557943 }
72567944 #endif
72577945
....@@ -7278,7 +7966,7 @@
72787966 */
72797967 #define adj_init_size(start, end, size, pos, adj) \
72807968 do { \
7281
- if (start <= pos && pos < end && size > adj) \
7969
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
72827970 size -= adj; \
72837971 } while (0)
72847972
....@@ -7300,10 +7988,10 @@
73007988 physpages << (PAGE_SHIFT - 10),
73017989 codesize >> 10, datasize >> 10, rosize >> 10,
73027990 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7303
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
7991
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73047992 totalcma_pages << (PAGE_SHIFT - 10),
73057993 #ifdef CONFIG_HIGHMEM
7306
- totalhigh_pages << (PAGE_SHIFT - 10),
7994
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73077995 #endif
73087996 str ? ", " : "", str ? str : "");
73097997 }
....@@ -7322,13 +8010,6 @@
73228010 void __init set_dma_reserve(unsigned long new_dma_reserve)
73238011 {
73248012 dma_reserve = new_dma_reserve;
7325
-}
7326
-
7327
-void __init free_area_init(unsigned long *zones_size)
7328
-{
7329
- zero_resv_unavail();
7330
- free_area_init_node(0, zones_size,
7331
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
73328013 }
73338014
73348015 static int page_alloc_cpu_dead(unsigned int cpu)
....@@ -7356,9 +8037,27 @@
73568037 return 0;
73578038 }
73588039
8040
+#ifdef CONFIG_NUMA
8041
+int hashdist = HASHDIST_DEFAULT;
8042
+
8043
+static int __init set_hashdist(char *str)
8044
+{
8045
+ if (!str)
8046
+ return 0;
8047
+ hashdist = simple_strtoul(str, &str, 0);
8048
+ return 1;
8049
+}
8050
+__setup("hashdist=", set_hashdist);
8051
+#endif
8052
+
73598053 void __init page_alloc_init(void)
73608054 {
73618055 int ret;
8056
+
8057
+#ifdef CONFIG_NUMA
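+ /* With a single memory node there is nothing to spread the hash tables over. */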
8058
+ if (num_node_state(N_MEMORY) == 1)
8059
+ hashdist = 0;
8060
+#endif
73628061
73638062 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
73648063 "mm/page_alloc:dead", NULL,
....@@ -7383,6 +8082,7 @@
73838082 for (i = 0; i < MAX_NR_ZONES; i++) {
73848083 struct zone *zone = pgdat->node_zones + i;
73858084 long max = 0;
8085
+ unsigned long managed_pages = zone_managed_pages(zone);
73868086
73878087 /* Find valid and maximum lowmem_reserve in the zone */
73888088 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7393,8 +8093,8 @@
73938093 /* we treat the high watermark as reserved pages. */
73948094 max += high_wmark_pages(zone);
73958095
7396
- if (max > zone->managed_pages)
7397
- max = zone->managed_pages;
8096
+ if (max > managed_pages)
8097
+ max = managed_pages;
73988098
73998099 pgdat->totalreserve_pages += max;
74008100
....@@ -7413,30 +8113,24 @@
74138113 static void setup_per_zone_lowmem_reserve(void)
74148114 {
74158115 struct pglist_data *pgdat;
7416
- enum zone_type j, idx;
8116
+ enum zone_type i, j;
74178117
74188118 for_each_online_pgdat(pgdat) {
7419
- for (j = 0; j < MAX_NR_ZONES; j++) {
7420
- struct zone *zone = pgdat->node_zones + j;
7421
- unsigned long managed_pages = zone->managed_pages;
8119
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8120
+ struct zone *zone = &pgdat->node_zones[i];
8121
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8122
+ bool clear = !ratio || !zone_managed_pages(zone);
8123
+ unsigned long managed_pages = 0;
74228124
7423
- zone->lowmem_reserve[j] = 0;
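+ /*
+ * lowmem_reserve[j] withholds part of zone i from allocations that
+ * could also have been satisfied from the higher zones up to j: the
+ * managed pages of zones (i, j] divided by sysctl_lowmem_reserve_ratio[i].
+ */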
8125
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8126
+ struct zone *upper_zone = &pgdat->node_zones[j];
74248127
7425
- idx = j;
7426
- while (idx) {
7427
- struct zone *lower_zone;
8128
+ managed_pages += zone_managed_pages(upper_zone);
74288129
7429
- idx--;
7430
- lower_zone = pgdat->node_zones + idx;
7431
-
7432
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7433
- sysctl_lowmem_reserve_ratio[idx] = 0;
7434
- lower_zone->lowmem_reserve[j] = 0;
7435
- } else {
7436
- lower_zone->lowmem_reserve[j] =
7437
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7438
- }
7439
- managed_pages += lower_zone->managed_pages;
8130
+ if (clear)
8131
+ zone->lowmem_reserve[j] = 0;
8132
+ else
8133
+ zone->lowmem_reserve[j] = managed_pages / ratio;
74408134 }
74418135 }
74428136 }
....@@ -7456,18 +8150,17 @@
74568150 /* Calculate total number of !ZONE_HIGHMEM pages */
74578151 for_each_zone(zone) {
74588152 if (!is_highmem(zone))
7459
- lowmem_pages += zone->managed_pages;
8153
+ lowmem_pages += zone_managed_pages(zone);
74608154 }
74618155
74628156 for_each_zone(zone) {
7463
- u64 min, low;
8157
+ u64 tmp, low;
74648158
74658159 spin_lock_irqsave(&zone->lock, flags);
7466
- min = (u64)pages_min * zone->managed_pages;
7467
- do_div(min, lowmem_pages);
7468
- low = (u64)pages_low * zone->managed_pages;
7469
- do_div(low, vm_total_pages);
7470
-
8160
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8161
+ do_div(tmp, lowmem_pages);
8162
+ low = (u64)pages_low * zone_managed_pages(zone);
8163
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
74718164 if (is_highmem(zone)) {
74728165 /*
74738166 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7475,20 +8168,20 @@
74758168 * value here.
74768169 *
74778170 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7478
- * deltas control asynch page reclaim, and so should
8171
+ * deltas control async page reclaim, and so should
74798172 * not be capped for highmem.
74808173 */
74818174 unsigned long min_pages;
74828175
7483
- min_pages = zone->managed_pages / 1024;
8176
+ min_pages = zone_managed_pages(zone) / 1024;
74848177 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7485
- zone->watermark[WMARK_MIN] = min_pages;
8178
+ zone->_watermark[WMARK_MIN] = min_pages;
74868179 } else {
74878180 /*
74888181 * If it's a lowmem zone, reserve a number of pages
74898182 * proportionate to the zone's size.
74908183 */
7491
- zone->watermark[WMARK_MIN] = min;
8184
+ zone->_watermark[WMARK_MIN] = tmp;
74928185 }
74938186
74948187 /*
....@@ -7496,14 +8189,13 @@
74968189 * scale factor in proportion to available memory, but
74978190 * ensure a minimum size on small systems.
74988191 */
7499
- min = max_t(u64, min >> 2,
7500
- mult_frac(zone->managed_pages,
8192
+ tmp = max_t(u64, tmp >> 2,
8193
+ mult_frac(zone_managed_pages(zone),
75018194 watermark_scale_factor, 10000));
75028195
7503
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7504
- low + min;
7505
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7506
- low + min * 2;
8196
+ zone->watermark_boost = 0;
8197
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8198
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75078199
75088200 spin_unlock_irqrestore(&zone->lock, flags);
75098201 }
....@@ -7532,7 +8224,7 @@
75328224 * Initialise min_free_kbytes.
75338225 *
75348226 * For small machines we want it small (128k min). For large machines
7535
- * we want it large (64MB max). But it is not linear, because network
8227
+ * we want it large (256MB max). But it is not linear, because network
75368228 * bandwidth does not increase linearly with machine size. We use
75378229 *
75388230 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
....@@ -7564,8 +8256,8 @@
75648256 min_free_kbytes = new_min_free_kbytes;
75658257 if (min_free_kbytes < 128)
75668258 min_free_kbytes = 128;
7567
- if (min_free_kbytes > 65536)
7568
- min_free_kbytes = 65536;
8259
+ if (min_free_kbytes > 262144)
8260
+ min_free_kbytes = 262144;
75698261 } else {
75708262 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
75718263 new_min_free_kbytes, user_min_free_kbytes);
....@@ -7591,7 +8283,7 @@
75918283 * or extra_free_kbytes changes.
75928284 */
75938285 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7594
- void __user *buffer, size_t *length, loff_t *ppos)
8286
+ void *buffer, size_t *length, loff_t *ppos)
75958287 {
75968288 int rc;
75978289
....@@ -7607,7 +8299,7 @@
76078299 }
76088300
76098301 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7610
- void __user *buffer, size_t *length, loff_t *ppos)
8302
+ void *buffer, size_t *length, loff_t *ppos)
76118303 {
76128304 int rc;
76138305
....@@ -7631,13 +8323,13 @@
76318323 pgdat->min_unmapped_pages = 0;
76328324
76338325 for_each_zone(zone)
7634
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7635
- sysctl_min_unmapped_ratio) / 100;
8326
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8327
+ sysctl_min_unmapped_ratio) / 100;
76368328 }
76378329
76388330
76398331 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7640
- void __user *buffer, size_t *length, loff_t *ppos)
8332
+ void *buffer, size_t *length, loff_t *ppos)
76418333 {
76428334 int rc;
76438335
....@@ -7659,12 +8351,12 @@
76598351 pgdat->min_slab_pages = 0;
76608352
76618353 for_each_zone(zone)
7662
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7663
- sysctl_min_slab_ratio) / 100;
8354
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8355
+ sysctl_min_slab_ratio) / 100;
76648356 }
76658357
76668358 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7667
- void __user *buffer, size_t *length, loff_t *ppos)
8359
+ void *buffer, size_t *length, loff_t *ppos)
76688360 {
76698361 int rc;
76708362
....@@ -7688,11 +8380,28 @@
76888380 * if in function of the boot time zone sizes.
76898381 */
76908382 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7691
- void __user *buffer, size_t *length, loff_t *ppos)
8383
+ void *buffer, size_t *length, loff_t *ppos)
76928384 {
8385
+ int i;
8386
+
76938387 proc_dointvec_minmax(table, write, buffer, length, ppos);
8388
+
8389
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8390
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8391
+ sysctl_lowmem_reserve_ratio[i] = 0;
8392
+ }
8393
+
76948394 setup_per_zone_lowmem_reserve();
76958395 return 0;
8396
+}
8397
+
8398
+static void __zone_pcp_update(struct zone *zone)
8399
+{
8400
+ unsigned int cpu;
8401
+
8402
+ for_each_possible_cpu(cpu)
8403
+ pageset_set_high_and_batch(zone,
8404
+ per_cpu_ptr(zone->pageset, cpu));
76968405 }
76978406
76988407 /*
....@@ -7701,7 +8410,7 @@
77018410 * pagelist can have before it gets flushed back to buddy allocator.
77028411 */
77038412 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7704
- void __user *buffer, size_t *length, loff_t *ppos)
8413
+ void *buffer, size_t *length, loff_t *ppos)
77058414 {
77068415 struct zone *zone;
77078416 int old_percpu_pagelist_fraction;
....@@ -7726,30 +8435,12 @@
77268435 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
77278436 goto out;
77288437
7729
- for_each_populated_zone(zone) {
7730
- unsigned int cpu;
7731
-
7732
- for_each_possible_cpu(cpu)
7733
- pageset_set_high_and_batch(zone,
7734
- per_cpu_ptr(zone->pageset, cpu));
7735
- }
8438
+ for_each_populated_zone(zone)
8439
+ __zone_pcp_update(zone);
77368440 out:
77378441 mutex_unlock(&pcp_batch_high_lock);
77388442 return ret;
77398443 }
7740
-
7741
-#ifdef CONFIG_NUMA
7742
-int hashdist = HASHDIST_DEFAULT;
7743
-
7744
-static int __init set_hashdist(char *str)
7745
-{
7746
- if (!str)
7747
- return 0;
7748
- hashdist = simple_strtoul(str, &str, 0);
7749
- return 1;
7750
-}
7751
-__setup("hashdist=", set_hashdist);
7752
-#endif
77538444
77548445 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
77558446 /*
....@@ -7797,6 +8488,7 @@
77978488 unsigned long log2qty, size;
77988489 void *table = NULL;
77998490 gfp_t gfp_flags;
8491
+ bool virt;
78008492
78018493 /* allow the kernel cmdline to have a say */
78028494 if (!numentries) {
....@@ -7853,32 +8545,34 @@
78538545
78548546 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
78558547 do {
8548
+ virt = false;
78568549 size = bucketsize << log2qty;
78578550 if (flags & HASH_EARLY) {
78588551 if (flags & HASH_ZERO)
7859
- table = memblock_virt_alloc_nopanic(size, 0);
8552
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
78608553 else
7861
- table = memblock_virt_alloc_raw(size, 0);
7862
- } else if (hashdist) {
7863
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8554
+ table = memblock_alloc_raw(size,
8555
+ SMP_CACHE_BYTES);
8556
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
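+ /* Tables too large for the page allocator, or hashdist=1, fall back to vmalloc. */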
8557
+ table = __vmalloc(size, gfp_flags);
8558
+ virt = true;
78648559 } else {
78658560 /*
78668561 * If bucketsize is not a power-of-two, we may free
78678562 * some pages at the end of hash table which
78688563 * alloc_pages_exact() automatically does
78698564 */
7870
- if (get_order(size) < MAX_ORDER) {
7871
- table = alloc_pages_exact(size, gfp_flags);
7872
- kmemleak_alloc(table, size, 1, gfp_flags);
7873
- }
8565
+ table = alloc_pages_exact(size, gfp_flags);
8566
+ kmemleak_alloc(table, size, 1, gfp_flags);
78748567 }
78758568 } while (!table && size > PAGE_SIZE && --log2qty);
78768569
78778570 if (!table)
78788571 panic("Failed to allocate %s hash table\n", tablename);
78798572
7880
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7881
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8573
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8574
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8575
+ virt ? "vmalloc" : "linear");
78828576
78838577 if (_hash_shift)
78848578 *_hash_shift = log2qty;
....@@ -7890,47 +8584,50 @@
78908584
78918585 /*
78928586 * This function checks whether pageblock includes unmovable pages or not.
7893
- * If @count is not zero, it is okay to include less @count unmovable pages
78948587 *
78958588 * PageLRU check without isolation or lru_lock could race so that
78968589 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
78978590 * check without lock_page also may miss some movable non-lru pages at
78988591 * race condition. So you can't expect this function should be exact.
8592
+ *
8593
+ * Returns a page without holding a reference. If the caller wants to
8594
+ * dereference that page (e.g., dumping), it has to make sure that it
8595
+ * cannot get removed (e.g., via memory unplug) concurrently.
8596
+ *
78998597 */
7900
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7901
- int migratetype,
7902
- bool skip_hwpoisoned_pages)
8598
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8599
+ int migratetype, int flags)
79038600 {
7904
- unsigned long pfn, iter, found;
8601
+ unsigned long iter = 0;
8602
+ unsigned long pfn = page_to_pfn(page);
8603
+ unsigned long offset = pfn % pageblock_nr_pages;
79058604
7906
- /*
7907
- * TODO we could make this much more efficient by not checking every
7908
- * page in the range if we know all of them are in MOVABLE_ZONE and
7909
- * that the movable zone guarantees that pages are migratable but
7910
- * the later is not the case right now unfortunatelly. E.g. movablecore
7911
- * can still lead to having bootmem allocations in zone_movable.
7912
- */
8605
+ if (is_migrate_cma_page(page)) {
8606
+ /*
8607
+ * CMA allocations (alloc_contig_range) really need to mark
8608
+ * isolate CMA pageblocks even when they are not movable in fact
8609
+ * so consider them movable here.
8610
+ */
8611
+ if (is_migrate_cma(migratetype))
8612
+ return NULL;
79138613
7914
- /*
7915
- * CMA allocations (alloc_contig_range) really need to mark isolate
7916
- * CMA pageblocks even when they are not movable in fact so consider
7917
- * them movable here.
7918
- */
7919
- if (is_migrate_cma(migratetype) &&
7920
- is_migrate_cma(get_pageblock_migratetype(page)))
7921
- return false;
8614
+ return page;
8615
+ }
79228616
7923
- pfn = page_to_pfn(page);
7924
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7925
- unsigned long check = pfn + iter;
7926
-
7927
- if (!pfn_valid_within(check))
8617
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8618
+ if (!pfn_valid_within(pfn + iter))
79288619 continue;
79298620
7930
- page = pfn_to_page(check);
8621
+ page = pfn_to_page(pfn + iter);
79318622
8623
+ /*
8624
+ * Both bootmem allocations and memory holes are marked
8625
+ * PG_reserved and are unmovable. We can even have unmovable
8626
+ * allocations inside ZONE_MOVABLE, for example when
8627
+ * specifying "movablecore".
8628
+ */
79328629 if (PageReserved(page))
7933
- goto unmovable;
8630
+ return page;
79348631
79358632 /*
79368633 * If the zone is movable and we have ruled out all reserved
....@@ -7942,17 +8639,22 @@
79428639
79438640 /*
79448641 * Hugepages are not in LRU lists, but they're movable.
7945
- * We need not scan over tail pages bacause we don't
8642
+ * THPs are on the LRU, but need to be counted as #small pages.
8643
+ * We need not scan over tail pages because we don't
79468644 * handle each tail page individually in migration.
79478645 */
7948
- if (PageHuge(page)) {
8646
+ if (PageHuge(page) || PageTransCompound(page)) {
79498647 struct page *head = compound_head(page);
79508648 unsigned int skip_pages;
79518649
7952
- if (!hugepage_migration_supported(page_hstate(head)))
7953
- goto unmovable;
8650
+ if (PageHuge(page)) {
8651
+ if (!hugepage_migration_supported(page_hstate(head)))
8652
+ return page;
8653
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8654
+ return page;
8655
+ }
79548656
7955
- skip_pages = (1 << compound_order(head)) - (page - head);
8657
+ skip_pages = compound_nr(head) - (page - head);
79568658 iter += skip_pages - 1;
79578659 continue;
79588660 }
....@@ -7965,7 +8667,7 @@
79658667 */
79668668 if (!page_ref_count(page)) {
79678669 if (PageBuddy(page))
7968
- iter += (1 << page_order(page)) - 1;
8670
+ iter += (1 << buddy_order(page)) - 1;
79698671 continue;
79708672 }
79718673
....@@ -7973,61 +8675,100 @@
79738675 * The HWPoisoned page may be not in buddy system, and
79748676 * page_count() is not 0.
79758677 */
7976
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8678
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
79778679 continue;
79788680
7979
- if (__PageMovable(page))
8681
+ /*
8682
+ * We treat all PageOffline() pages as movable when offlining
8683
+ * to give drivers a chance to decrement their reference count
8684
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8685
+ * can be offlined as there are no direct references anymore.
8686
+ * For actually unmovable PageOffline() where the driver does
8687
+ * not support this, we will fail later when trying to actually
8688
+ * move these pages that still have a reference count > 0.
8689
+ * (false negatives in this function only)
8690
+ */
8691
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
79808692 continue;
79818693
7982
- if (!PageLRU(page))
7983
- found++;
8694
+ if (__PageMovable(page) || PageLRU(page))
8695
+ continue;
8696
+
79848697 /*
79858698 * If there are RECLAIMABLE pages, we need to check
79868699 * it. But now, memory offline itself doesn't call
79878700 * shrink_node_slabs() and it still to be fixed.
79888701 */
7989
- /*
7990
- * If the page is not RAM, page_count()should be 0.
7991
- * we don't need more check. This is an _used_ not-movable page.
7992
- *
7993
- * The problematic thing here is PG_reserved pages. PG_reserved
7994
- * is set to both of a memory hole page and a _used_ kernel
7995
- * page at boot.
7996
- */
7997
- if (found > count)
7998
- goto unmovable;
8702
+ return page;
79998703 }
8000
- return false;
8001
-unmovable:
8002
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8003
- return true;
8704
+ return NULL;
80048705 }
80058706
8006
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8007
-
8707
+#ifdef CONFIG_CONTIG_ALLOC
80088708 static unsigned long pfn_max_align_down(unsigned long pfn)
80098709 {
80108710 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80118711 pageblock_nr_pages) - 1);
80128712 }
80138713
8014
-static unsigned long pfn_max_align_up(unsigned long pfn)
8714
+unsigned long pfn_max_align_up(unsigned long pfn)
80158715 {
80168716 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
80178717 pageblock_nr_pages));
80188718 }
80198719
8720
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8721
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8722
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8723
+static void alloc_contig_dump_pages(struct list_head *page_list)
8724
+{
8725
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8726
+
8727
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8728
+ struct page *page;
8729
+ unsigned long nr_skip = 0;
8730
+ unsigned long nr_pages = 0;
8731
+
8732
+ dump_stack();
8733
+ list_for_each_entry(page, page_list, lru) {
8734
+ nr_pages++;
8735
+ /* The page will be freed by putback_movable_pages soon */
8736
+ if (page_count(page) == 1) {
8737
+ nr_skip++;
8738
+ continue;
8739
+ }
8740
+ dump_page(page, "migration failure");
8741
+ }
8742
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8743
+ }
8744
+}
8745
+#else
8746
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8747
+{
8748
+}
8749
+#endif
8750
+
80208751 /* [start, end) must belong to a single zone. */
80218752 static int __alloc_contig_migrate_range(struct compact_control *cc,
8022
- unsigned long start, unsigned long end)
8753
+ unsigned long start, unsigned long end,
8754
+ struct acr_info *info)
80238755 {
80248756 /* This function is based on compact_zone() from compaction.c. */
8025
- unsigned long nr_reclaimed;
8757
+ unsigned int nr_reclaimed;
80268758 unsigned long pfn = start;
80278759 unsigned int tries = 0;
8760
+ unsigned int max_tries = 5;
80288761 int ret = 0;
8762
+ struct page *page;
8763
+ struct migration_target_control mtc = {
8764
+ .nid = zone_to_nid(cc->zone),
8765
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8766
+ };
80298767
8030
- migrate_prep();
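+ /* Opportunistic (async) contiguous allocations get a single migration pass. */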
8768
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8769
+ max_tries = 1;
8770
+
8771
+ lru_cache_disable();
80318772
80328773 while (pfn < end || !list_empty(&cc->migratepages)) {
80338774 if (fatal_signal_pending(current)) {
....@@ -8043,20 +8784,39 @@
80438784 break;
80448785 }
80458786 tries = 0;
8046
- } else if (++tries == 5) {
8787
+ } else if (++tries == max_tries) {
80478788 ret = ret < 0 ? ret : -EBUSY;
80488789 break;
80498790 }
80508791
80518792 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
80528793 &cc->migratepages);
8794
+ info->nr_reclaimed += nr_reclaimed;
80538795 cc->nr_migratepages -= nr_reclaimed;
80548796
8055
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8056
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8797
+ list_for_each_entry(page, &cc->migratepages, lru)
8798
+ info->nr_mapped += page_mapcount(page);
8799
+
8800
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8801
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8802
+ if (!ret)
8803
+ info->nr_migrated += cc->nr_migratepages;
80578804 }
8805
+
8806
+ lru_cache_enable();
80588807 if (ret < 0) {
8808
+ if (ret == -EBUSY) {
8809
+ alloc_contig_dump_pages(&cc->migratepages);
8810
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8811
+ }
8812
+
8813
+ if (!list_empty(&cc->migratepages)) {
8814
+ page = list_first_entry(&cc->migratepages, struct page, lru);
8815
+ info->failed_pfn = page_to_pfn(page);
8816
+ }
8817
+
80598818 putback_movable_pages(&cc->migratepages);
8819
+ info->err |= ACR_ERR_MIGRATE;
80608820 return ret;
80618821 }
80628822 return 0;
....@@ -8079,25 +8839,28 @@
80798839 * pageblocks in the range. Once isolated, the pageblocks should not
80808840 * be modified by others.
80818841 *
8082
- * Returns zero on success or negative error code. On success all
8842
+ * Return: zero on success or negative error code. On success all
80838843 * pages which PFN is in [start, end) are allocated for the caller and
80848844 * need to be freed with free_contig_range().
80858845 */
80868846 int alloc_contig_range(unsigned long start, unsigned long end,
8087
- unsigned migratetype, gfp_t gfp_mask)
8847
+ unsigned migratetype, gfp_t gfp_mask,
8848
+ struct acr_info *info)
80888849 {
80898850 unsigned long outer_start, outer_end;
80908851 unsigned int order;
80918852 int ret = 0;
8853
+ bool skip_drain_all_pages = false;
80928854
80938855 struct compact_control cc = {
80948856 .nr_migratepages = 0,
80958857 .order = -1,
80968858 .zone = page_zone(pfn_to_page(start)),
8097
- .mode = MIGRATE_SYNC,
8859
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
80988860 .ignore_skip_hint = true,
80998861 .no_set_skip_hint = true,
81008862 .gfp_mask = current_gfp_context(gfp_mask),
8863
+ .alloc_contig = true,
81018864 };
81028865 INIT_LIST_HEAD(&cc.migratepages);
81038866
....@@ -8126,14 +8889,18 @@
81268889 */
81278890
81288891 ret = start_isolate_page_range(pfn_max_align_down(start),
8129
- pfn_max_align_up(end), migratetype,
8130
- false);
8131
- if (ret)
8892
+ pfn_max_align_up(end), migratetype, 0,
8893
+ &info->failed_pfn);
8894
+ if (ret) {
8895
+ info->err |= ACR_ERR_ISOLATE;
81328896 return ret;
8897
+ }
81338898
8134
-#ifdef CONFIG_CMA
8135
- cc.zone->cma_alloc = 1;
8136
-#endif
8899
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8900
+ &skip_drain_all_pages);
8901
+ if (!skip_drain_all_pages)
8902
+ drain_all_pages(cc.zone);
8903
+
81378904 /*
81388905 * In case of -EBUSY, we'd like to know which page causes problem.
81398906 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8144,8 +8911,8 @@
81448911 * allocated. So, if we fall through be sure to clear ret so that
81458912 * -EBUSY is not accidentally used or returned to caller.
81468913 */
8147
- ret = __alloc_contig_migrate_range(&cc, start, end);
8148
- if (ret && ret != -EBUSY)
8914
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8915
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
81498916 goto done;
81508917 ret =0;
81518918
....@@ -8166,9 +8933,6 @@
81668933 * isolated thus they won't get removed from buddy.
81678934 */
81688935
8169
- lru_add_drain_all();
8170
- drain_all_pages(cc.zone);
8171
-
81728936 order = 0;
81738937 outer_start = start;
81748938 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8180,7 +8944,7 @@
81808944 }
81818945
81828946 if (outer_start != start) {
8183
- order = page_order(pfn_to_page(outer_start));
8947
+ order = buddy_order(pfn_to_page(outer_start));
81848948
81858949 /*
81868950 * outer_start page could be small order buddy page and
....@@ -8193,10 +8957,11 @@
81938957 }
81948958
81958959 /* Make sure the range is really isolated. */
8196
- if (test_pages_isolated(outer_start, end, false)) {
8960
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
81978961 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
81988962 __func__, outer_start, end);
81998963 ret = -EBUSY;
8964
+ info->err |= ACR_ERR_TEST;
82008965 goto done;
82018966 }
82028967
....@@ -8216,13 +8981,114 @@
82168981 done:
82178982 undo_isolate_page_range(pfn_max_align_down(start),
82188983 pfn_max_align_up(end), migratetype);
8219
-#ifdef CONFIG_CMA
8220
- cc.zone->cma_alloc = 0;
8221
-#endif
82228984 return ret;
82238985 }
8986
+EXPORT_SYMBOL(alloc_contig_range);
82248987
8225
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
8988
+static int __alloc_contig_pages(unsigned long start_pfn,
8989
+ unsigned long nr_pages, gfp_t gfp_mask)
8990
+{
8991
+ struct acr_info dummy;
8992
+ unsigned long end_pfn = start_pfn + nr_pages;
8993
+
8994
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
8995
+ gfp_mask, &dummy);
8996
+}
8997
+
8998
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
8999
+ unsigned long nr_pages)
9000
+{
9001
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9002
+ struct page *page;
9003
+
9004
+ for (i = start_pfn; i < end_pfn; i++) {
9005
+ page = pfn_to_online_page(i);
9006
+ if (!page)
9007
+ return false;
9008
+
9009
+ if (page_zone(page) != z)
9010
+ return false;
9011
+
9012
+ if (PageReserved(page))
9013
+ return false;
9014
+
9015
+ if (page_count(page) > 0)
9016
+ return false;
9017
+
9018
+ if (PageHuge(page))
9019
+ return false;
9020
+ }
9021
+ return true;
9022
+}
9023
+
9024
+static bool zone_spans_last_pfn(const struct zone *zone,
9025
+ unsigned long start_pfn, unsigned long nr_pages)
9026
+{
9027
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9028
+
9029
+ return zone_spans_pfn(zone, last_pfn);
9030
+}
9031
+
9032
+/**
9033
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9034
+ * @nr_pages: Number of contiguous pages to allocate
9035
+ * @gfp_mask: GFP mask to limit search and used during compaction
9036
+ * @nid: Target node
9037
+ * @nodemask: Mask for other possible nodes
9038
+ *
9039
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9040
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9041
+ * tried for allocation with alloc_contig_range(). This routine is intended
9042
+ * for allocation requests which can not be fulfilled with the buddy allocator.
9043
+ *
9044
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9045
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9046
+ * (e.g. 1GB request would be aligned to 1GB).
9047
+ *
9048
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9049
+ * __free_page() on each allocated page.
9050
+ *
9051
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9052
+ */
9053
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9054
+ int nid, nodemask_t *nodemask)
9055
+{
9056
+ unsigned long ret, pfn, flags;
9057
+ struct zonelist *zonelist;
9058
+ struct zone *zone;
9059
+ struct zoneref *z;
9060
+
9061
+ zonelist = node_zonelist(nid, gfp_mask);
9062
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9063
+ gfp_zone(gfp_mask), nodemask) {
9064
+ spin_lock_irqsave(&zone->lock, flags);
9065
+
9066
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9067
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9068
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9069
+ /*
9070
+ * We release the zone lock here because
9071
+ * alloc_contig_range() will also lock the zone
9072
+ * at some point. If there's an allocation
9073
+ * spinning on this lock, it may win the race
9074
+ * and cause alloc_contig_range() to fail...
9075
+ */
9076
+ spin_unlock_irqrestore(&zone->lock, flags);
9077
+ ret = __alloc_contig_pages(pfn, nr_pages,
9078
+ gfp_mask);
9079
+ if (!ret)
9080
+ return pfn_to_page(pfn);
9081
+ spin_lock_irqsave(&zone->lock, flags);
9082
+ }
9083
+ pfn += nr_pages;
9084
+ }
9085
+ spin_unlock_irqrestore(&zone->lock, flags);
9086
+ }
9087
+ return NULL;
9088
+}
9089
+#endif /* CONFIG_CONTIG_ALLOC */
9090
+
9091
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
82269092 {
82279093 unsigned int count = 0;
82289094
....@@ -8234,7 +9100,7 @@
82349100 }
82359101 WARN(count != 0, "%d pages are still in use!\n", count);
82369102 }
8237
-#endif
9103
+EXPORT_SYMBOL(free_contig_range);
82389104
82399105 /*
82409106 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8242,11 +9108,8 @@
82429108 */
82439109 void __meminit zone_pcp_update(struct zone *zone)
82449110 {
8245
- unsigned cpu;
82469111 mutex_lock(&pcp_batch_high_lock);
8247
- for_each_possible_cpu(cpu)
8248
- pageset_set_high_and_batch(zone,
8249
- per_cpu_ptr(zone->pageset, cpu));
9112
+ __zone_pcp_update(zone);
82509113 mutex_unlock(&pcp_batch_high_lock);
82519114 }
82529115
....@@ -8271,32 +9134,21 @@
82719134
82729135 #ifdef CONFIG_MEMORY_HOTREMOVE
82739136 /*
8274
- * All pages in the range must be in a single zone and isolated
8275
- * before calling this.
9137
+ * All pages in the range must be in a single zone, must not contain holes,
9138
+ * must span full sections, and must be isolated before calling this function.
82769139 */
8277
-void
8278
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9140
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
82799141 {
9142
+ unsigned long pfn = start_pfn;
82809143 struct page *page;
82819144 struct zone *zone;
8282
- unsigned int order, i;
8283
- unsigned long pfn;
9145
+ unsigned int order;
82849146 unsigned long flags;
8285
- /* find the first valid pfn */
8286
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8287
- if (pfn_valid(pfn))
8288
- break;
8289
- if (pfn == end_pfn)
8290
- return;
9147
+
82919148 offline_mem_sections(pfn, end_pfn);
82929149 zone = page_zone(pfn_to_page(pfn));
82939150 spin_lock_irqsave(&zone->lock, flags);
8294
- pfn = start_pfn;
82959151 while (pfn < end_pfn) {
8296
- if (!pfn_valid(pfn)) {
8297
- pfn++;
8298
- continue;
8299
- }
83009152 page = pfn_to_page(pfn);
83019153 /*
83029154 * The HWPoisoned page may be not in buddy system, and
....@@ -8304,22 +9156,23 @@
83049156 */
83059157 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83069158 pfn++;
8307
- SetPageReserved(page);
9159
+ continue;
9160
+ }
9161
+ /*
9162
+ * At this point all remaining PageOffline() pages have a
9163
+ * reference count of 0 and can simply be skipped.
9164
+ */
9165
+ if (PageOffline(page)) {
9166
+ BUG_ON(page_count(page));
9167
+ BUG_ON(PageBuddy(page));
9168
+ pfn++;
83089169 continue;
83099170 }
83109171
83119172 BUG_ON(page_count(page));
83129173 BUG_ON(!PageBuddy(page));
8313
- order = page_order(page);
8314
-#ifdef CONFIG_DEBUG_VM
8315
- pr_info("remove from free list %lx %d %lx\n",
8316
- pfn, 1 << order, end_pfn);
8317
-#endif
8318
- list_del(&page->lru);
8319
- rmv_page_order(page);
8320
- zone->free_area[order].nr_free--;
8321
- for (i = 0; i < (1 << order); i++)
8322
- SetPageReserved((page+i));
9174
+ order = buddy_order(page);
9175
+ del_page_from_free_list(page, zone, order);
83239176 pfn += (1 << order);
83249177 }
83259178 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8337,7 +9190,7 @@
83379190 for (order = 0; order < MAX_ORDER; order++) {
83389191 struct page *page_head = page - (pfn & ((1 << order) - 1));
83399192
8340
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9193
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
83419194 break;
83429195 }
83439196 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8347,30 +9200,87 @@
83479200
83489201 #ifdef CONFIG_MEMORY_FAILURE
83499202 /*
8350
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8351
- * test is performed under the zone lock to prevent a race against page
8352
- * allocation.
9203
+ * Break down a higher-order page into sub-pages, and keep our target out of
9204
+ * the buddy allocator.
83539205 */
8354
-bool set_hwpoison_free_buddy_page(struct page *page)
9206
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9207
+ struct page *target, int low, int high,
9208
+ int migratetype)
9209
+{
9210
+ unsigned long size = 1 << high;
9211
+ struct page *current_buddy, *next_page;
9212
+
9213
+ while (high > low) {
9214
+ high--;
9215
+ size >>= 1;
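+ /*
+ * current_buddy is the half that does not contain the target; it is
+ * handed back to the free list (or marked as a guard page) while
+ * splitting continues in the half that does contain the target.
+ */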
9216
+
9217
+ if (target >= &page[size]) {
9218
+ next_page = page + size;
9219
+ current_buddy = page;
9220
+ } else {
9221
+ next_page = page;
9222
+ current_buddy = page + size;
9223
+ }
9224
+
9225
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9226
+ continue;
9227
+
9228
+ if (current_buddy != target) {
9229
+ add_to_free_list(current_buddy, zone, high, migratetype);
9230
+ set_buddy_order(current_buddy, high);
9231
+ page = next_page;
9232
+ }
9233
+ }
9234
+}
9235
+
9236
+/*
9237
+ * Take a page that will be marked as poisoned off the buddy allocator.
9238
+ */
9239
+bool take_page_off_buddy(struct page *page)
83559240 {
83569241 struct zone *zone = page_zone(page);
83579242 unsigned long pfn = page_to_pfn(page);
83589243 unsigned long flags;
83599244 unsigned int order;
8360
- bool hwpoisoned = false;
9245
+ bool ret = false;
83619246
83629247 spin_lock_irqsave(&zone->lock, flags);
83639248 for (order = 0; order < MAX_ORDER; order++) {
83649249 struct page *page_head = page - (pfn & ((1 << order) - 1));
9250
+ int page_order = buddy_order(page_head);
83659251
8366
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8367
- if (!TestSetPageHWPoison(page))
8368
- hwpoisoned = true;
9252
+ if (PageBuddy(page_head) && page_order >= order) {
9253
+ unsigned long pfn_head = page_to_pfn(page_head);
9254
+ int migratetype = get_pfnblock_migratetype(page_head,
9255
+ pfn_head);
9256
+
9257
+ del_page_from_free_list(page_head, zone, page_order);
9258
+ break_down_buddy_pages(zone, page_head, page, 0,
9259
+ page_order, migratetype);
9260
+ if (!is_migrate_isolate(migratetype))
9261
+ __mod_zone_freepage_state(zone, -1, migratetype);
9262
+ ret = true;
83699263 break;
83709264 }
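+ /* Stop once the candidate head is in use; the target cannot be part of a larger free buddy. */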
9265
+ if (page_count(page_head) > 0)
9266
+ break;
83719267 }
83729268 spin_unlock_irqrestore(&zone->lock, flags);
8373
-
8374
- return hwpoisoned;
9269
+ return ret;
83759270 }
83769271 #endif
9272
+
9273
+#ifdef CONFIG_ZONE_DMA
9274
+bool has_managed_dma(void)
9275
+{
9276
+ struct pglist_data *pgdat;
9277
+
9278
+ for_each_online_pgdat(pgdat) {
9279
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9280
+
9281
+ if (managed_zone(zone))
9282
+ return true;
9283
+ }
9284
+ return false;
9285
+}
9286
+#endif /* CONFIG_ZONE_DMA */