2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/mm/page_alloc.c
34 *
@@ -16,11 +17,11 @@
1617
1718 #include <linux/stddef.h>
1819 #include <linux/mm.h>
20
+#include <linux/highmem.h>
1921 #include <linux/swap.h>
2022 #include <linux/interrupt.h>
2123 #include <linux/pagemap.h>
2224 #include <linux/jiffies.h>
23
-#include <linux/bootmem.h>
2425 #include <linux/memblock.h>
2526 #include <linux/compiler.h>
2627 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
4344 #include <linux/mempolicy.h>
4445 #include <linux/memremap.h>
4546 #include <linux/stop_machine.h>
47
+#include <linux/random.h>
4648 #include <linux/sort.h>
4749 #include <linux/pfn.h>
4850 #include <linux/backing-dev.h>
4951 #include <linux/fault-inject.h>
5052 #include <linux/page-isolation.h>
51
-#include <linux/page_ext.h>
5253 #include <linux/debugobjects.h>
5354 #include <linux/kmemleak.h>
5455 #include <linux/compaction.h>
@@ -60,19 +61,65 @@
6061 #include <linux/hugetlb.h>
6162 #include <linux/sched/rt.h>
6263 #include <linux/sched/mm.h>
64
+#include <linux/local_lock.h>
6365 #include <linux/page_owner.h>
66
+#include <linux/page_pinner.h>
6467 #include <linux/kthread.h>
6568 #include <linux/memcontrol.h>
6669 #include <linux/ftrace.h>
6770 #include <linux/lockdep.h>
6871 #include <linux/nmi.h>
69
-#include <linux/khugepaged.h>
7072 #include <linux/psi.h>
73
+#include <linux/padata.h>
74
+#include <linux/khugepaged.h>
75
+#include <trace/hooks/mm.h>
76
+#include <trace/hooks/vmscan.h>
7177
7278 #include <asm/sections.h>
7379 #include <asm/tlbflush.h>
7480 #include <asm/div64.h>
7581 #include "internal.h"
82
+#include "shuffle.h"
83
+#include "page_reporting.h"
84
+
85
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
86
+typedef int __bitwise fpi_t;
87
+
88
+/* No special request */
89
+#define FPI_NONE ((__force fpi_t)0)
90
+
91
+/*
92
+ * Skip free page reporting notification for the (possibly merged) page.
93
+ * This does not hinder free page reporting from grabbing the page,
94
+ * reporting it and marking it "reported" - it only skips notifying
95
+ * the free page reporting infrastructure about a newly freed page. For
96
+ * example, used when temporarily pulling a page from a freelist and
97
+ * putting it back unmodified.
98
+ */
99
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
100
+
101
+/*
102
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
103
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
104
+ * shuffle the whole zone).
105
+ *
106
+ * Note: No code should rely on this flag for correctness - it's purely
107
+ * to allow for optimizations when handing back either fresh pages
108
+ * (memory onlining) or untouched pages (page isolation, free page
109
+ * reporting).
110
+ */
111
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
112
+
113
+/*
114
+ * Don't poison memory with KASAN (only for the tag-based modes).
115
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
116
+ * Poisoning all that memory lengthens boot time, especially on systems with
117
+ * large amount of RAM. This flag is used to skip that poisoning.
118
+ * This is only done for the tag-based KASAN modes, as those are able to
119
+ * detect memory corruptions with the memory tags assigned by default.
120
+ * All memory allocated normally after boot gets poisoned as usual.
121
+ */
122
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
76123
77124 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
78125 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -94,12 +141,15 @@
94141 */
95142 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
96143 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
97
-int _node_numa_mem_[MAX_NUMNODES];
98144 #endif
99145
100146 /* work_structs for global per-cpu drains */
101
-DEFINE_MUTEX(pcpu_drain_mutex);
102
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
147
+struct pcpu_drain {
148
+ struct zone *zone;
149
+ struct work_struct work;
150
+};
151
+static DEFINE_MUTEX(pcpu_drain_mutex);
152
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
103153
104154 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
105155 volatile unsigned long latent_entropy __latent_entropy;
@@ -123,62 +173,33 @@
123173 };
124174 EXPORT_SYMBOL(node_states);
125175
126
-/* Protect totalram_pages and zone->managed_pages */
127
-static DEFINE_SPINLOCK(managed_page_count_lock);
128
-
129
-unsigned long totalram_pages __read_mostly;
176
+atomic_long_t _totalram_pages __read_mostly;
177
+EXPORT_SYMBOL(_totalram_pages);
130178 unsigned long totalreserve_pages __read_mostly;
131179 unsigned long totalcma_pages __read_mostly;
132180
133181 int percpu_pagelist_fraction;
134182 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
135
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
136
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
137
-#else
138183 DEFINE_STATIC_KEY_FALSE(init_on_alloc);
139
-#endif
140184 EXPORT_SYMBOL(init_on_alloc);
141185
142
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
143
-DEFINE_STATIC_KEY_TRUE(init_on_free);
144
-#else
145186 DEFINE_STATIC_KEY_FALSE(init_on_free);
146
-#endif
147187 EXPORT_SYMBOL(init_on_free);
148188
189
+static bool _init_on_alloc_enabled_early __read_mostly
190
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
149191 static int __init early_init_on_alloc(char *buf)
150192 {
151
- int ret;
152
- bool bool_result;
153193
154
- if (!buf)
155
- return -EINVAL;
156
- ret = kstrtobool(buf, &bool_result);
157
- if (bool_result && page_poisoning_enabled())
158
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
159
- if (bool_result)
160
- static_branch_enable(&init_on_alloc);
161
- else
162
- static_branch_disable(&init_on_alloc);
163
- return ret;
194
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
164195 }
165196 early_param("init_on_alloc", early_init_on_alloc);
166197
198
+static bool _init_on_free_enabled_early __read_mostly
199
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
167200 static int __init early_init_on_free(char *buf)
168201 {
169
- int ret;
170
- bool bool_result;
171
-
172
- if (!buf)
173
- return -EINVAL;
174
- ret = kstrtobool(buf, &bool_result);
175
- if (bool_result && page_poisoning_enabled())
176
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
177
- if (bool_result)
178
- static_branch_enable(&init_on_free);
179
- else
180
- static_branch_disable(&init_on_free);
181
- return ret;
202
+ return kstrtobool(buf, &_init_on_free_enabled_early);
182203 }
183204 early_param("init_on_free", early_init_on_free);
184205
@@ -242,7 +263,8 @@
242263 unsigned int pageblock_order __read_mostly;
243264 #endif
244265
245
-static void __free_pages_ok(struct page *page, unsigned int order);
266
+static void __free_pages_ok(struct page *page, unsigned int order,
267
+ fpi_t fpi_flags);
246268
247269 /*
248270 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -269,8 +291,6 @@
269291 [ZONE_MOVABLE] = 0,
270292 };
271293
272
-EXPORT_SYMBOL(totalram_pages);
273
-
274294 static char * const zone_names[MAX_NR_ZONES] = {
275295 #ifdef CONFIG_ZONE_DMA
276296 "DMA",
@@ -288,7 +308,7 @@
288308 #endif
289309 };
290310
291
-char * const migratetype_names[MIGRATE_TYPES] = {
311
+const char * const migratetype_names[MIGRATE_TYPES] = {
292312 "Unmovable",
293313 "Movable",
294314 "Reclaimable",
@@ -301,14 +321,14 @@
301321 #endif
302322 };
303323
304
-compound_page_dtor * const compound_page_dtors[] = {
305
- NULL,
306
- free_compound_page,
324
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
325
+ [NULL_COMPOUND_DTOR] = NULL,
326
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
307327 #ifdef CONFIG_HUGETLB_PAGE
308
- free_huge_page,
328
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
309329 #endif
310330 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
311
- free_transhuge_page,
331
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
312332 #endif
313333 };
314334
@@ -319,6 +339,20 @@
319339 */
320340 int min_free_kbytes = 1024;
321341 int user_min_free_kbytes = -1;
342
+#ifdef CONFIG_DISCONTIGMEM
343
+/*
344
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
345
+ * are not on separate NUMA nodes. Functionally this works but with
346
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
347
+ * quite small. By default, do not boost watermarks on discontigmem as in
348
+ * many cases very high-order allocations like THP are likely to be
349
+ * unsupported and the premature reclaim offsets the advantage of long-term
350
+ * fragmentation avoidance.
351
+ */
352
+int watermark_boost_factor __read_mostly;
353
+#else
354
+int watermark_boost_factor __read_mostly = 15000;
355
+#endif
322356 int watermark_scale_factor = 10;
323357
324358 /*
@@ -328,31 +362,36 @@
328362 */
329363 int extra_free_kbytes = 0;
330364
331
-static unsigned long nr_kernel_pages __meminitdata;
332
-static unsigned long nr_all_pages __meminitdata;
333
-static unsigned long dma_reserve __meminitdata;
365
+static unsigned long nr_kernel_pages __initdata;
366
+static unsigned long nr_all_pages __initdata;
367
+static unsigned long dma_reserve __initdata;
334368
335
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
336
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
337
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
369
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
370
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
338371 static unsigned long required_kernelcore __initdata;
339372 static unsigned long required_kernelcore_percent __initdata;
340373 static unsigned long required_movablecore __initdata;
341374 static unsigned long required_movablecore_percent __initdata;
342
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
375
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
343376 static bool mirrored_kernelcore __meminitdata;
344377
345378 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
346379 int movable_zone;
347380 EXPORT_SYMBOL(movable_zone);
348
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
349381
350382 #if MAX_NUMNODES > 1
351
-int nr_node_ids __read_mostly = MAX_NUMNODES;
352
-int nr_online_nodes __read_mostly = 1;
383
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
384
+unsigned int nr_online_nodes __read_mostly = 1;
353385 EXPORT_SYMBOL(nr_node_ids);
354386 EXPORT_SYMBOL(nr_online_nodes);
355387 #endif
388
+
389
+struct pa_lock {
390
+ local_lock_t l;
391
+};
392
+static DEFINE_PER_CPU(struct pa_lock, pa_lock) = {
393
+ .l = INIT_LOCAL_LOCK(l),
394
+};
356395
357396 int page_group_by_mobility_disabled __read_mostly;
358397
@@ -365,7 +404,7 @@
365404 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
366405
367406 /*
368
- * Calling kasan_free_pages() only after deferred memory initialization
407
+ * Calling kasan_poison_pages() only after deferred memory initialization
369408 * has completed. Poisoning pages during deferred memory init will greatly
370409 * lengthen the process and cause problem in large memory systems as the
371410 * deferred pages initialization is done with interrupt disabled.
@@ -377,10 +416,12 @@
377416 * on-demand allocation and then freed again before the deferred pages
378417 * initialization is done, but this is not likely to happen.
379418 */
380
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
419
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
381420 {
382
- if (!static_branch_unlikely(&deferred_pages))
383
- kasan_free_pages(page, order);
421
+ return static_branch_unlikely(&deferred_pages) ||
422
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
423
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
424
+ PageSkipKASanPoison(page);
384425 }
385426
386427 /* Returns true if the struct page for the pfn is uninitialised */
@@ -395,38 +436,57 @@
395436 }
396437
397438 /*
398
- * Returns false when the remaining initialisation should be deferred until
439
+ * Returns true when the remaining initialisation should be deferred until
399440 * later in the boot cycle when it can be parallelised.
400441 */
401
-static inline bool update_defer_init(pg_data_t *pgdat,
402
- unsigned long pfn, unsigned long zone_end,
403
- unsigned long *nr_initialised)
442
+static bool __meminit
443
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
404444 {
405
- /* Always populate low zones for address-constrained allocations */
406
- if (zone_end < pgdat_end_pfn(pgdat))
407
- return true;
408
- (*nr_initialised)++;
409
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
410
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
411
- pgdat->first_deferred_pfn = pfn;
412
- return false;
445
+ static unsigned long prev_end_pfn, nr_initialised;
446
+
447
+ /*
448
+ * prev_end_pfn static that contains the end of previous zone
449
+ * No need to protect because called very early in boot before smp_init.
450
+ */
451
+ if (prev_end_pfn != end_pfn) {
452
+ prev_end_pfn = end_pfn;
453
+ nr_initialised = 0;
413454 }
414455
415
- return true;
456
+ /* Always populate low zones for address-constrained allocations */
457
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
458
+ return false;
459
+
460
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
461
+ return true;
462
+ /*
463
+ * We start only with one section of pages, more pages are added as
464
+ * needed until the rest of deferred pages are initialized.
465
+ */
466
+ nr_initialised++;
467
+ if ((nr_initialised > PAGES_PER_SECTION) &&
468
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
469
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
470
+ return true;
471
+ }
472
+ return false;
416473 }
417474 #else
418
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
475
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
476
+{
477
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
478
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
479
+ PageSkipKASanPoison(page);
480
+}
419481
420482 static inline bool early_page_uninitialised(unsigned long pfn)
421483 {
422484 return false;
423485 }
424486
425
-static inline bool update_defer_init(pg_data_t *pgdat,
426
- unsigned long pfn, unsigned long zone_end,
427
- unsigned long *nr_initialised)
487
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
428488 {
429
- return true;
489
+ return false;
430490 }
431491 #endif
432492
@@ -435,7 +495,7 @@
435495 unsigned long pfn)
436496 {
437497 #ifdef CONFIG_SPARSEMEM
438
- return __pfn_to_section(pfn)->pageblock_flags;
498
+ return section_to_usemap(__pfn_to_section(pfn));
439499 #else
440500 return page_zone(page)->pageblock_flags;
441501 #endif /* CONFIG_SPARSEMEM */
@@ -445,25 +505,23 @@
445505 {
446506 #ifdef CONFIG_SPARSEMEM
447507 pfn &= (PAGES_PER_SECTION-1);
448
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
449508 #else
450509 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
451
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
452510 #endif /* CONFIG_SPARSEMEM */
511
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
453512 }
454513
455514 /**
456515 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
457516 * @page: The page within the block of interest
458517 * @pfn: The target page frame number
459
- * @end_bitidx: The last bit of interest to retrieve
460518 * @mask: mask of bits that the caller is interested in
461519 *
462520 * Return: pageblock_bits flags
463521 */
464
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
522
+static __always_inline
523
+unsigned long __get_pfnblock_flags_mask(struct page *page,
465524 unsigned long pfn,
466
- unsigned long end_bitidx,
467525 unsigned long mask)
468526 {
469527 unsigned long *bitmap;
@@ -476,20 +534,36 @@
476534 bitidx &= (BITS_PER_LONG-1);
477535
478536 word = bitmap[word_bitidx];
479
- bitidx += end_bitidx;
480
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
537
+ return (word >> bitidx) & mask;
481538 }
482539
483540 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
484
- unsigned long end_bitidx,
485541 unsigned long mask)
486542 {
487
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
543
+ return __get_pfnblock_flags_mask(page, pfn, mask);
488544 }
545
+EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
546
+
547
+int isolate_anon_lru_page(struct page *page)
548
+{
549
+ int ret;
550
+
551
+ if (!PageLRU(page) || !PageAnon(page))
552
+ return -EINVAL;
553
+
554
+ if (!get_page_unless_zero(page))
555
+ return -EINVAL;
556
+
557
+ ret = isolate_lru_page(page);
558
+ put_page(page);
559
+
560
+ return ret;
561
+}
562
+EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
489563
490564 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
491565 {
492
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
566
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
493567 }
494568
495569 /**
@@ -497,12 +571,10 @@
497571 * @page: The page within the block of interest
498572 * @flags: The flags to set
499573 * @pfn: The target page frame number
500
- * @end_bitidx: The last bit of interest
501574 * @mask: mask of bits that the caller is interested in
502575 */
503576 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
504577 unsigned long pfn,
505
- unsigned long end_bitidx,
506578 unsigned long mask)
507579 {
508580 unsigned long *bitmap;
@@ -510,6 +582,7 @@
510582 unsigned long old_word, word;
511583
512584 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
585
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
513586
514587 bitmap = get_pageblock_bitmap(page, pfn);
515588 bitidx = pfn_to_bitidx(page, pfn);
@@ -518,9 +591,8 @@
518591
519592 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
520593
521
- bitidx += end_bitidx;
522
- mask <<= (BITS_PER_LONG - bitidx - 1);
523
- flags <<= (BITS_PER_LONG - bitidx - 1);
594
+ mask <<= bitidx;
595
+ flags <<= bitidx;
524596
525597 word = READ_ONCE(bitmap[word_bitidx]);
526598 for (;;) {
@@ -537,8 +609,8 @@
537609 migratetype < MIGRATE_PCPTYPES))
538610 migratetype = MIGRATE_UNMOVABLE;
539611
540
- set_pageblock_flags_group(page, (unsigned long)migratetype,
541
- PB_migrate, PB_migrate_end);
612
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
613
+ page_to_pfn(page), MIGRATETYPE_MASK);
542614 }
543615
544616 #ifdef CONFIG_DEBUG_VM
@@ -593,8 +665,7 @@
593665 }
594666 #endif
595667
596
-static void bad_page(struct page *page, const char *reason,
597
- unsigned long bad_flags)
668
+static void bad_page(struct page *page, const char *reason)
598669 {
599670 static unsigned long resume;
600671 static unsigned long nr_shown;
@@ -623,10 +694,6 @@
623694 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
624695 current->comm, page_to_pfn(page));
625696 __dump_page(page, reason);
626
- bad_flags &= page->flags;
627
- if (bad_flags)
628
- pr_alert("bad because of flags: %#lx(%pGp)\n",
629
- bad_flags, &bad_flags);
630697 dump_page_owner(page);
631698
632699 print_modules();
@@ -654,7 +721,8 @@
654721
655722 void free_compound_page(struct page *page)
656723 {
657
- __free_pages_ok(page, compound_order(page));
724
+ mem_cgroup_uncharge(page);
725
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
658726 }
659727
660728 void prep_compound_page(struct page *page, unsigned int order)
@@ -662,8 +730,6 @@
662730 int i;
663731 int nr_pages = 1 << order;
664732
665
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
666
- set_compound_order(page, order);
667733 __SetPageHead(page);
668734 for (i = 1; i < nr_pages; i++) {
669735 struct page *p = page + i;
@@ -671,51 +737,30 @@
671737 p->mapping = TAIL_MAPPING;
672738 set_compound_head(p, page);
673739 }
740
+
741
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
742
+ set_compound_order(page, order);
674743 atomic_set(compound_mapcount_ptr(page), -1);
744
+ if (hpage_pincount_available(page))
745
+ atomic_set(compound_pincount_ptr(page), 0);
675746 }
676747
677748 #ifdef CONFIG_DEBUG_PAGEALLOC
678749 unsigned int _debug_guardpage_minorder;
679
-bool _debug_pagealloc_enabled __read_mostly
750
+
751
+bool _debug_pagealloc_enabled_early __read_mostly
680752 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
753
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
754
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
681755 EXPORT_SYMBOL(_debug_pagealloc_enabled);
682
-bool _debug_guardpage_enabled __read_mostly;
756
+
757
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
683758
684759 static int __init early_debug_pagealloc(char *buf)
685760 {
686
- if (!buf)
687
- return -EINVAL;
688
- return kstrtobool(buf, &_debug_pagealloc_enabled);
761
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
689762 }
690763 early_param("debug_pagealloc", early_debug_pagealloc);
691
-
692
-static bool need_debug_guardpage(void)
693
-{
694
- /* If we don't use debug_pagealloc, we don't need guard page */
695
- if (!debug_pagealloc_enabled())
696
- return false;
697
-
698
- if (!debug_guardpage_minorder())
699
- return false;
700
-
701
- return true;
702
-}
703
-
704
-static void init_debug_guardpage(void)
705
-{
706
- if (!debug_pagealloc_enabled())
707
- return;
708
-
709
- if (!debug_guardpage_minorder())
710
- return;
711
-
712
- _debug_guardpage_enabled = true;
713
-}
714
-
715
-struct page_ext_operations debug_guardpage_ops = {
716
- .need = need_debug_guardpage,
717
- .init = init_debug_guardpage,
718
-};
719764
720765 static int __init debug_guardpage_minorder_setup(char *buf)
721766 {
....@@ -734,20 +779,13 @@
734779 static inline bool set_page_guard(struct zone *zone, struct page *page,
735780 unsigned int order, int migratetype)
736781 {
737
- struct page_ext *page_ext;
738
-
739782 if (!debug_guardpage_enabled())
740783 return false;
741784
742785 if (order >= debug_guardpage_minorder())
743786 return false;
744787
745
- page_ext = lookup_page_ext(page);
746
- if (unlikely(!page_ext))
747
- return false;
748
-
749
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
750
-
788
+ __SetPageGuard(page);
751789 INIT_LIST_HEAD(&page->lru);
752790 set_page_private(page, order);
753791 /* Guard pages are not available for any usage */
@@ -759,39 +797,77 @@
759797 static inline void clear_page_guard(struct zone *zone, struct page *page,
760798 unsigned int order, int migratetype)
761799 {
762
- struct page_ext *page_ext;
763
-
764800 if (!debug_guardpage_enabled())
765801 return;
766802
767
- page_ext = lookup_page_ext(page);
768
- if (unlikely(!page_ext))
769
- return;
770
-
771
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
803
+ __ClearPageGuard(page);
772804
773805 set_page_private(page, 0);
774806 if (!is_migrate_isolate(migratetype))
775807 __mod_zone_freepage_state(zone, (1 << order), migratetype);
776808 }
777809 #else
778
-struct page_ext_operations debug_guardpage_ops;
779810 static inline bool set_page_guard(struct zone *zone, struct page *page,
780811 unsigned int order, int migratetype) { return false; }
781812 static inline void clear_page_guard(struct zone *zone, struct page *page,
782813 unsigned int order, int migratetype) {}
783814 #endif
784815
785
-static inline void set_page_order(struct page *page, unsigned int order)
816
+/*
817
+ * Enable static keys related to various memory debugging and hardening options.
818
+ * Some override others, and depend on early params that are evaluated in the
819
+ * order of appearance. So we need to first gather the full picture of what was
820
+ * enabled, and then make decisions.
821
+ */
822
+void init_mem_debugging_and_hardening(void)
823
+{
824
+ bool page_poisoning_requested = false;
825
+
826
+#ifdef CONFIG_PAGE_POISONING
827
+ /*
828
+ * Page poisoning is debug page alloc for some arches. If
829
+ * either of those options are enabled, enable poisoning.
830
+ */
831
+ if (page_poisoning_enabled() ||
832
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
833
+ debug_pagealloc_enabled())) {
834
+ static_branch_enable(&_page_poisoning_enabled);
835
+ page_poisoning_requested = true;
836
+ }
837
+#endif
838
+
839
+ if (_init_on_alloc_enabled_early) {
840
+ if (page_poisoning_requested)
841
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
842
+ "will take precedence over init_on_alloc\n");
843
+ else
844
+ static_branch_enable(&init_on_alloc);
845
+ }
846
+ if (_init_on_free_enabled_early) {
847
+ if (page_poisoning_requested)
848
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
849
+ "will take precedence over init_on_free\n");
850
+ else
851
+ static_branch_enable(&init_on_free);
852
+ }
853
+
854
+#ifdef CONFIG_DEBUG_PAGEALLOC
855
+ if (!debug_pagealloc_enabled())
856
+ return;
857
+
858
+ static_branch_enable(&_debug_pagealloc_enabled);
859
+
860
+ if (!debug_guardpage_minorder())
861
+ return;
862
+
863
+ static_branch_enable(&_debug_guardpage_enabled);
864
+#endif
865
+}
866
+
867
+static inline void set_buddy_order(struct page *page, unsigned int order)
786868 {
787869 set_page_private(page, order);
788870 __SetPageBuddy(page);
789
-}
790
-
791
-static inline void rmv_page_order(struct page *page)
792
-{
793
- __ClearPageBuddy(page);
794
- set_page_private(page, 0);
795871 }
796872
797873 /*
@@ -807,32 +883,151 @@
807883 *
808884 * For recording page's order, we use page_private(page).
809885 */
810
-static inline int page_is_buddy(struct page *page, struct page *buddy,
886
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
811887 unsigned int order)
812888 {
813
- if (page_is_guard(buddy) && page_order(buddy) == order) {
814
- if (page_zone_id(page) != page_zone_id(buddy))
815
- return 0;
889
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
890
+ return false;
816891
817
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
892
+ if (buddy_order(buddy) != order)
893
+ return false;
818894
819
- return 1;
820
- }
895
+ /*
896
+ * zone check is done late to avoid uselessly calculating
897
+ * zone/node ids for pages that could never merge.
898
+ */
899
+ if (page_zone_id(page) != page_zone_id(buddy))
900
+ return false;
821901
822
- if (PageBuddy(buddy) && page_order(buddy) == order) {
823
- /*
824
- * zone check is done late to avoid uselessly
825
- * calculating zone/node ids for pages that could
826
- * never merge.
827
- */
828
- if (page_zone_id(page) != page_zone_id(buddy))
829
- return 0;
902
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
830903
831
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
904
+ return true;
905
+}
832906
833
- return 1;
834
- }
835
- return 0;
907
+#ifdef CONFIG_COMPACTION
908
+static inline struct capture_control *task_capc(struct zone *zone)
909
+{
910
+ struct capture_control *capc = current->capture_control;
911
+
912
+ return unlikely(capc) &&
913
+ !(current->flags & PF_KTHREAD) &&
914
+ !capc->page &&
915
+ capc->cc->zone == zone ? capc : NULL;
916
+}
917
+
918
+static inline bool
919
+compaction_capture(struct capture_control *capc, struct page *page,
920
+ int order, int migratetype)
921
+{
922
+ if (!capc || order != capc->cc->order)
923
+ return false;
924
+
925
+ /* Do not accidentally pollute CMA or isolated regions*/
926
+ if (is_migrate_cma(migratetype) ||
927
+ is_migrate_isolate(migratetype))
928
+ return false;
929
+
930
+ /*
931
+ * Do not let lower order allocations polluate a movable pageblock.
932
+ * This might let an unmovable request use a reclaimable pageblock
933
+ * and vice-versa but no more than normal fallback logic which can
934
+ * have trouble finding a high-order free page.
935
+ */
936
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
937
+ return false;
938
+
939
+ capc->page = page;
940
+ return true;
941
+}
942
+
943
+#else
944
+static inline struct capture_control *task_capc(struct zone *zone)
945
+{
946
+ return NULL;
947
+}
948
+
949
+static inline bool
950
+compaction_capture(struct capture_control *capc, struct page *page,
951
+ int order, int migratetype)
952
+{
953
+ return false;
954
+}
955
+#endif /* CONFIG_COMPACTION */
956
+
957
+/* Used for pages not on another list */
958
+static inline void add_to_free_list(struct page *page, struct zone *zone,
959
+ unsigned int order, int migratetype)
960
+{
961
+ struct free_area *area = &zone->free_area[order];
962
+
963
+ list_add(&page->lru, &area->free_list[migratetype]);
964
+ area->nr_free++;
965
+}
966
+
967
+/* Used for pages not on another list */
968
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
969
+ unsigned int order, int migratetype)
970
+{
971
+ struct free_area *area = &zone->free_area[order];
972
+
973
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
974
+ area->nr_free++;
975
+}
976
+
977
+/*
978
+ * Used for pages which are on another list. Move the pages to the tail
979
+ * of the list - so the moved pages won't immediately be considered for
980
+ * allocation again (e.g., optimization for memory onlining).
981
+ */
982
+static inline void move_to_free_list(struct page *page, struct zone *zone,
983
+ unsigned int order, int migratetype)
984
+{
985
+ struct free_area *area = &zone->free_area[order];
986
+
987
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
988
+}
989
+
990
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
991
+ unsigned int order)
992
+{
993
+ /* clear reported state and update reported page count */
994
+ if (page_reported(page))
995
+ __ClearPageReported(page);
996
+
997
+ list_del(&page->lru);
998
+ __ClearPageBuddy(page);
999
+ set_page_private(page, 0);
1000
+ zone->free_area[order].nr_free--;
1001
+}
1002
+
1003
+/*
1004
+ * If this is not the largest possible page, check if the buddy
1005
+ * of the next-highest order is free. If it is, it's possible
1006
+ * that pages are being freed that will coalesce soon. In case,
1007
+ * that is happening, add the free page to the tail of the list
1008
+ * so it's less likely to be used soon and more likely to be merged
1009
+ * as a higher order page
1010
+ */
1011
+static inline bool
1012
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1013
+ struct page *page, unsigned int order)
1014
+{
1015
+ struct page *higher_page, *higher_buddy;
1016
+ unsigned long combined_pfn;
1017
+
1018
+ if (order >= MAX_ORDER - 2)
1019
+ return false;
1020
+
1021
+ if (!pfn_valid_within(buddy_pfn))
1022
+ return false;
1023
+
1024
+ combined_pfn = buddy_pfn & pfn;
1025
+ higher_page = page + (combined_pfn - pfn);
1026
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
1027
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
1028
+
1029
+ return pfn_valid_within(buddy_pfn) &&
1030
+ page_is_buddy(higher_page, higher_buddy, order + 1);
8361031 }
8371032
8381033 /*
@@ -862,12 +1057,14 @@
8621057 static inline void __free_one_page(struct page *page,
8631058 unsigned long pfn,
8641059 struct zone *zone, unsigned int order,
865
- int migratetype)
1060
+ int migratetype, fpi_t fpi_flags)
8661061 {
1062
+ struct capture_control *capc = task_capc(zone);
1063
+ unsigned long buddy_pfn;
8671064 unsigned long combined_pfn;
868
- unsigned long uninitialized_var(buddy_pfn);
869
- struct page *buddy;
8701065 unsigned int max_order;
1066
+ struct page *buddy;
1067
+ bool to_tail;
8711068
8721069 max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
8731070
@@ -883,6 +1080,11 @@
8831080
8841081 continue_merging:
8851082 while (order < max_order) {
1083
+ if (compaction_capture(capc, page, order, migratetype)) {
1084
+ __mod_zone_freepage_state(zone, -(1 << order),
1085
+ migratetype);
1086
+ return;
1087
+ }
8861088 buddy_pfn = __find_buddy_pfn(pfn, order);
8871089 buddy = page + (buddy_pfn - pfn);
8881090
@@ -894,13 +1096,10 @@
8941096 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
8951097 * merge with it and move up one order.
8961098 */
897
- if (page_is_guard(buddy)) {
1099
+ if (page_is_guard(buddy))
8981100 clear_page_guard(zone, buddy, order, migratetype);
899
- } else {
900
- list_del(&buddy->lru);
901
- zone->free_area[order].nr_free--;
902
- rmv_page_order(buddy);
903
- }
1101
+ else
1102
+ del_page_from_free_list(buddy, zone, order);
9041103 combined_pfn = buddy_pfn & pfn;
9051104 page = page + (combined_pfn - pfn);
9061105 pfn = combined_pfn;
@@ -932,33 +1131,23 @@
9321131 }
9331132
9341133 done_merging:
935
- set_page_order(page, order);
1134
+ set_buddy_order(page, order);
9361135
937
- /*
938
- * If this is not the largest possible page, check if the buddy
939
- * of the next-highest order is free. If it is, it's possible
940
- * that pages are being freed that will coalesce soon. In case,
941
- * that is happening, add the free page to the tail of the list
942
- * so it's less likely to be used soon and more likely to be merged
943
- * as a higher order page
944
- */
945
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
946
- struct page *higher_page, *higher_buddy;
947
- combined_pfn = buddy_pfn & pfn;
948
- higher_page = page + (combined_pfn - pfn);
949
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
950
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
951
- if (pfn_valid_within(buddy_pfn) &&
952
- page_is_buddy(higher_page, higher_buddy, order + 1)) {
953
- list_add_tail(&page->lru,
954
- &zone->free_area[order].free_list[migratetype]);
955
- goto out;
956
- }
957
- }
1136
+ if (fpi_flags & FPI_TO_TAIL)
1137
+ to_tail = true;
1138
+ else if (is_shuffle_order(order))
1139
+ to_tail = shuffle_pick_tail();
1140
+ else
1141
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
9581142
959
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
960
-out:
961
- zone->free_area[order].nr_free++;
1143
+ if (to_tail)
1144
+ add_to_free_list_tail(page, zone, order, migratetype);
1145
+ else
1146
+ add_to_free_list(page, zone, order, migratetype);
1147
+
1148
+ /* Notify page reporting subsystem of freed page */
1149
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1150
+ page_reporting_notify_free(order);
9621151 }
9631152
9641153 /*
@@ -983,13 +1172,9 @@
9831172 return true;
9841173 }
9851174
986
-static void free_pages_check_bad(struct page *page)
1175
+static const char *page_bad_reason(struct page *page, unsigned long flags)
9871176 {
988
- const char *bad_reason;
989
- unsigned long bad_flags;
990
-
991
- bad_reason = NULL;
992
- bad_flags = 0;
1177
+ const char *bad_reason = NULL;
9931178
9941179 if (unlikely(atomic_read(&page->_mapcount) != -1))
9951180 bad_reason = "nonzero mapcount";
@@ -997,24 +1182,32 @@
9971182 bad_reason = "non-NULL mapping";
9981183 if (unlikely(page_ref_count(page) != 0))
9991184 bad_reason = "nonzero _refcount";
1000
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
1001
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1002
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
1185
+ if (unlikely(page->flags & flags)) {
1186
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1187
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1188
+ else
1189
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
10031190 }
10041191 #ifdef CONFIG_MEMCG
10051192 if (unlikely(page->mem_cgroup))
10061193 bad_reason = "page still charged to cgroup";
10071194 #endif
1008
- bad_page(page, bad_reason, bad_flags);
1195
+ return bad_reason;
10091196 }
10101197
1011
-static inline int free_pages_check(struct page *page)
1198
+static void check_free_page_bad(struct page *page)
1199
+{
1200
+ bad_page(page,
1201
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1202
+}
1203
+
1204
+static inline int check_free_page(struct page *page)
10121205 {
10131206 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
10141207 return 0;
10151208
10161209 /* Something has gone sideways, find it */
1017
- free_pages_check_bad(page);
1210
+ check_free_page_bad(page);
10181211 return 1;
10191212 }
10201213
@@ -1036,7 +1229,7 @@
10361229 case 1:
10371230 /* the first tail page: ->mapping may be compound_mapcount() */
10381231 if (unlikely(compound_mapcount(page))) {
1039
- bad_page(page, "nonzero compound_mapcount", 0);
1232
+ bad_page(page, "nonzero compound_mapcount");
10401233 goto out;
10411234 }
10421235 break;
@@ -1048,17 +1241,17 @@
10481241 break;
10491242 default:
10501243 if (page->mapping != TAIL_MAPPING) {
1051
- bad_page(page, "corrupted mapping in tail page", 0);
1244
+ bad_page(page, "corrupted mapping in tail page");
10521245 goto out;
10531246 }
10541247 break;
10551248 }
10561249 if (unlikely(!PageTail(page))) {
1057
- bad_page(page, "PageTail not set", 0);
1250
+ bad_page(page, "PageTail not set");
10581251 goto out;
10591252 }
10601253 if (unlikely(compound_head(page) != head_page)) {
1061
- bad_page(page, "compound_head not consistent", 0);
1254
+ bad_page(page, "compound_head not consistent");
10621255 goto out;
10631256 }
10641257 ret = 0;
@@ -1068,25 +1261,48 @@
10681261 return ret;
10691262 }
10701263
1071
-static void kernel_init_free_pages(struct page *page, int numpages)
1264
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
10721265 {
10731266 int i;
10741267
1268
+ if (zero_tags) {
1269
+ for (i = 0; i < numpages; i++)
1270
+ tag_clear_highpage(page + i);
1271
+ return;
1272
+ }
1273
+
10751274 /* s390's use of memset() could override KASAN redzones. */
10761275 kasan_disable_current();
1077
- for (i = 0; i < numpages; i++)
1276
+ for (i = 0; i < numpages; i++) {
1277
+ u8 tag = page_kasan_tag(page + i);
1278
+ page_kasan_tag_reset(page + i);
10781279 clear_highpage(page + i);
1280
+ page_kasan_tag_set(page + i, tag);
1281
+ }
10791282 kasan_enable_current();
10801283 }
10811284
10821285 static __always_inline bool free_pages_prepare(struct page *page,
1083
- unsigned int order, bool check_free)
1286
+ unsigned int order, bool check_free, fpi_t fpi_flags)
10841287 {
10851288 int bad = 0;
1289
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
10861290
10871291 VM_BUG_ON_PAGE(PageTail(page), page);
10881292
10891293 trace_mm_page_free(page, order);
1294
+
1295
+ if (unlikely(PageHWPoison(page)) && !order) {
1296
+ /*
1297
+ * Do not let hwpoison pages hit pcplists/buddy
1298
+ * Untie memcg state and reset page's owner
1299
+ */
1300
+ if (memcg_kmem_enabled() && PageKmemcg(page))
1301
+ __memcg_kmem_uncharge_page(page, order);
1302
+ reset_page_owner(page, order);
1303
+ free_page_pinner(page, order);
1304
+ return false;
1305
+ }
10901306
10911307 /*
10921308 * Check tail pages before head page information is cleared to
@@ -1103,7 +1319,7 @@
11031319 for (i = 1; i < (1 << order); i++) {
11041320 if (compound)
11051321 bad += free_tail_pages_check(page, page + i);
1106
- if (unlikely(free_pages_check(page + i))) {
1322
+ if (unlikely(check_free_page(page + i))) {
11071323 bad++;
11081324 continue;
11091325 }
@@ -1113,15 +1329,16 @@
11131329 if (PageMappingFlags(page))
11141330 page->mapping = NULL;
11151331 if (memcg_kmem_enabled() && PageKmemcg(page))
1116
- memcg_kmem_uncharge(page, order);
1332
+ __memcg_kmem_uncharge_page(page, order);
11171333 if (check_free)
1118
- bad += free_pages_check(page);
1334
+ bad += check_free_page(page);
11191335 if (bad)
11201336 return false;
11211337
11221338 page_cpupid_reset_last(page);
11231339 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
11241340 reset_page_owner(page, order);
1341
+ free_page_pinner(page, order);
11251342
11261343 if (!PageHighMem(page)) {
11271344 debug_check_no_locks_freed(page_address(page),
@@ -1129,36 +1346,77 @@
11291346 debug_check_no_obj_freed(page_address(page),
11301347 PAGE_SIZE << order);
11311348 }
1132
- arch_free_page(page, order);
1133
- if (want_init_on_free())
1134
- kernel_init_free_pages(page, 1 << order);
11351349
1136
- kernel_poison_pages(page, 1 << order, 0);
1137
- kernel_map_pages(page, 1 << order, 0);
1138
- kasan_free_nondeferred_pages(page, order);
1350
+ kernel_poison_pages(page, 1 << order);
1351
+
1352
+ /*
1353
+ * As memory initialization might be integrated into KASAN,
1354
+ * kasan_free_pages and kernel_init_free_pages must be
1355
+ * kept together to avoid discrepancies in behavior.
1356
+ *
1357
+ * With hardware tag-based KASAN, memory tags must be set before the
1358
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
1359
+ */
1360
+ if (kasan_has_integrated_init()) {
1361
+ if (!skip_kasan_poison)
1362
+ kasan_free_pages(page, order);
1363
+ } else {
1364
+ bool init = want_init_on_free();
1365
+
1366
+ if (init)
1367
+ kernel_init_free_pages(page, 1 << order, false);
1368
+ if (!skip_kasan_poison)
1369
+ kasan_poison_pages(page, order, init);
1370
+ }
1371
+
1372
+ /*
1373
+ * arch_free_page() can make the page's contents inaccessible. s390
1374
+ * does this. So nothing which can access the page's contents should
1375
+ * happen after this.
1376
+ */
1377
+ arch_free_page(page, order);
1378
+
1379
+ debug_pagealloc_unmap_pages(page, 1 << order);
11391380
11401381 return true;
11411382 }
11421383
11431384 #ifdef CONFIG_DEBUG_VM
1144
-static inline bool free_pcp_prepare(struct page *page)
1145
-{
1146
- return free_pages_prepare(page, 0, true);
1147
-}
1148
-
1149
-static inline bool bulkfree_pcp_prepare(struct page *page)
1150
-{
1151
- return false;
1152
-}
1153
-#else
1385
+/*
1386
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1387
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1388
+ * moved from pcp lists to free lists.
1389
+ */
11541390 static bool free_pcp_prepare(struct page *page)
11551391 {
1156
- return free_pages_prepare(page, 0, false);
1392
+ return free_pages_prepare(page, 0, true, FPI_NONE);
11571393 }
11581394
11591395 static bool bulkfree_pcp_prepare(struct page *page)
11601396 {
1161
- return free_pages_check(page);
1397
+ if (debug_pagealloc_enabled_static())
1398
+ return check_free_page(page);
1399
+ else
1400
+ return false;
1401
+}
1402
+#else
1403
+/*
1404
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1405
+ * moving from pcp lists to free list in order to reduce overhead. With
1406
+ * debug_pagealloc enabled, they are checked also immediately when being freed
1407
+ * to the pcp lists.
1408
+ */
1409
+static bool free_pcp_prepare(struct page *page)
1410
+{
1411
+ if (debug_pagealloc_enabled_static())
1412
+ return free_pages_prepare(page, 0, true, FPI_NONE);
1413
+ else
1414
+ return free_pages_prepare(page, 0, false, FPI_NONE);
1415
+}
1416
+
1417
+static bool bulkfree_pcp_prepare(struct page *page)
1418
+{
1419
+ return check_free_page(page);
11621420 }
11631421 #endif /* CONFIG_DEBUG_VM */
11641422
@@ -1172,7 +1430,7 @@
11721430 }
11731431
11741432 /*
1175
- * Frees a number of pages from the PCP lists
1433
+ * Frees a number of pages which have been collected from the pcp lists.
11761434 * Assumes all pages on list are in same zone, and of same order.
11771435 * count is the number of pages to free.
11781436 *
@@ -1182,15 +1440,56 @@
11821440 * And clear the zone's pages_scanned counter, to hold off the "all pages are
11831441 * pinned" detection logic.
11841442 */
1185
-static void free_pcppages_bulk(struct zone *zone, int count,
1186
- struct per_cpu_pages *pcp)
1443
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
1444
+ bool zone_retry)
1445
+{
1446
+ bool isolated_pageblocks;
1447
+ struct page *page, *tmp;
1448
+ unsigned long flags;
1449
+
1450
+ spin_lock_irqsave(&zone->lock, flags);
1451
+ isolated_pageblocks = has_isolate_pageblock(zone);
1452
+
1453
+ /*
1454
+ * Use safe version since after __free_one_page(),
1455
+ * page->lru.next will not point to original list.
1456
+ */
1457
+ list_for_each_entry_safe(page, tmp, head, lru) {
1458
+ int mt = get_pcppage_migratetype(page);
1459
+
1460
+ if (page_zone(page) != zone) {
1461
+ /*
1462
+ * free_unref_page_list() sorts pages by zone. If we end
1463
+ * up with pages from a different NUMA nodes belonging
1464
+ * to the same ZONE index then we need to redo with the
1465
+ * correct ZONE pointer. Skip the page for now, redo it
1466
+ * on the next iteration.
1467
+ */
1468
+ WARN_ON_ONCE(zone_retry == false);
1469
+ if (zone_retry)
1470
+ continue;
1471
+ }
1472
+
1473
+ /* MIGRATE_ISOLATE page should not go to pcplists */
1474
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1475
+ /* Pageblock could have been isolated meanwhile */
1476
+ if (unlikely(isolated_pageblocks))
1477
+ mt = get_pageblock_migratetype(page);
1478
+
1479
+ list_del(&page->lru);
1480
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
1481
+ trace_mm_page_pcpu_drain(page, 0, mt);
1482
+ }
1483
+ spin_unlock_irqrestore(&zone->lock, flags);
1484
+}
1485
+
1486
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
1487
+ struct list_head *dst)
11871488 {
11881489 int migratetype = 0;
11891490 int batch_free = 0;
11901491 int prefetch_nr = 0;
1191
- bool isolated_pageblocks;
1192
- struct page *page, *tmp;
1193
- LIST_HEAD(head);
1492
+ struct page *page;
11941493
11951494 /*
11961495 * Ensure proper count is passed which otherwise would stuck in the
@@ -1227,7 +1526,7 @@
12271526 if (bulkfree_pcp_prepare(page))
12281527 continue;
12291528
1230
- list_add_tail(&page->lru, &head);
1529
+ list_add_tail(&page->lru, dst);
12311530
12321531 /*
12331532 * We are going to put the page back to the global
@@ -1242,39 +1541,19 @@
12421541 prefetch_buddy(page);
12431542 } while (--count && --batch_free && !list_empty(list));
12441543 }
1245
-
1246
- spin_lock(&zone->lock);
1247
- isolated_pageblocks = has_isolate_pageblock(zone);
1248
-
1249
- /*
1250
- * Use safe version since after __free_one_page(),
1251
- * page->lru.next will not point to original list.
1252
- */
1253
- list_for_each_entry_safe(page, tmp, &head, lru) {
1254
- int mt = get_pcppage_migratetype(page);
1255
- /* MIGRATE_ISOLATE page should not go to pcplists */
1256
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1257
- /* Pageblock could have been isolated meanwhile */
1258
- if (unlikely(isolated_pageblocks))
1259
- mt = get_pageblock_migratetype(page);
1260
-
1261
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1262
- trace_mm_page_pcpu_drain(page, 0, mt);
1263
- }
1264
- spin_unlock(&zone->lock);
12651544 }
12661545
12671546 static void free_one_page(struct zone *zone,
12681547 struct page *page, unsigned long pfn,
12691548 unsigned int order,
1270
- int migratetype)
1549
+ int migratetype, fpi_t fpi_flags)
12711550 {
12721551 spin_lock(&zone->lock);
12731552 if (unlikely(has_isolate_pageblock(zone) ||
12741553 is_migrate_isolate(migratetype))) {
12751554 migratetype = get_pfnblock_migratetype(page, pfn);
12761555 }
1277
- __free_one_page(page, pfn, zone, order, migratetype);
1556
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
12781557 spin_unlock(&zone->lock);
12791558 }
12801559
@@ -1348,33 +1627,45 @@
13481627 /* Avoid false-positive PageTail() */
13491628 INIT_LIST_HEAD(&page->lru);
13501629
1351
- SetPageReserved(page);
1630
+ /*
1631
+ * no need for atomic set_bit because the struct
1632
+ * page is not visible yet so nobody should
1633
+ * access it yet.
1634
+ */
1635
+ __SetPageReserved(page);
13521636 }
13531637 }
13541638 }
13551639
1356
-static void __free_pages_ok(struct page *page, unsigned int order)
1640
+static void __free_pages_ok(struct page *page, unsigned int order,
1641
+ fpi_t fpi_flags)
13571642 {
13581643 unsigned long flags;
13591644 int migratetype;
13601645 unsigned long pfn = page_to_pfn(page);
13611646
1362
- if (!free_pages_prepare(page, order, true))
1647
+ if (!free_pages_prepare(page, order, true, fpi_flags))
13631648 return;
13641649
13651650 migratetype = get_pfnblock_migratetype(page, pfn);
1366
- local_irq_save(flags);
1651
+ local_lock_irqsave(&pa_lock.l, flags);
13671652 __count_vm_events(PGFREE, 1 << order);
1368
- free_one_page(page_zone(page), page, pfn, order, migratetype);
1369
- local_irq_restore(flags);
1653
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
1654
+ fpi_flags);
1655
+ local_unlock_irqrestore(&pa_lock.l, flags);
13701656 }
13711657
1372
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1658
+void __free_pages_core(struct page *page, unsigned int order)
13731659 {
13741660 unsigned int nr_pages = 1 << order;
13751661 struct page *p = page;
13761662 unsigned int loop;
13771663
1664
+ /*
1665
+ * When initializing the memmap, __init_single_page() sets the refcount
1666
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
1667
+ * refcount of all involved pages to 0.
1668
+ */
13781669 prefetchw(p);
13791670 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
13801671 prefetchw(p + 1);
@@ -1384,15 +1675,43 @@
13841675 __ClearPageReserved(p);
13851676 set_page_count(p, 0);
13861677
1387
- page_zone(page)->managed_pages += nr_pages;
1388
- set_page_refcounted(page);
1389
- __free_pages(page, order);
1678
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1679
+
1680
+ /*
1681
+ * Bypass PCP and place fresh pages right to the tail, primarily
1682
+ * relevant for memory onlining.
1683
+ */
1684
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
13901685 }
13911686
1392
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1393
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1687
+#ifdef CONFIG_NEED_MULTIPLE_NODES
13941688
13951689 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1690
+
1691
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1692
+
1693
+/*
1694
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1695
+ */
1696
+int __meminit __early_pfn_to_nid(unsigned long pfn,
1697
+ struct mminit_pfnnid_cache *state)
1698
+{
1699
+ unsigned long start_pfn, end_pfn;
1700
+ int nid;
1701
+
1702
+ if (state->last_start <= pfn && pfn < state->last_end)
1703
+ return state->last_nid;
1704
+
1705
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1706
+ if (nid != NUMA_NO_NODE) {
1707
+ state->last_start = start_pfn;
1708
+ state->last_end = end_pfn;
1709
+ state->last_nid = nid;
1710
+ }
1711
+
1712
+ return nid;
1713
+}
1714
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
13961715
13971716 int __meminit early_pfn_to_nid(unsigned long pfn)
13981717 {
@@ -1407,48 +1726,14 @@
14071726
14081727 return nid;
14091728 }
1410
-#endif
1729
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
14111730
1412
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1413
-static inline bool __meminit __maybe_unused
1414
-meminit_pfn_in_nid(unsigned long pfn, int node,
1415
- struct mminit_pfnnid_cache *state)
1416
-{
1417
- int nid;
1418
-
1419
- nid = __early_pfn_to_nid(pfn, state);
1420
- if (nid >= 0 && nid != node)
1421
- return false;
1422
- return true;
1423
-}
1424
-
1425
-/* Only safe to use early in boot when initialisation is single-threaded */
1426
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1427
-{
1428
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1429
-}
1430
-
1431
-#else
1432
-
1433
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1434
-{
1435
- return true;
1436
-}
1437
-static inline bool __meminit __maybe_unused
1438
-meminit_pfn_in_nid(unsigned long pfn, int node,
1439
- struct mminit_pfnnid_cache *state)
1440
-{
1441
- return true;
1442
-}
1443
-#endif
1444
-
1445
-
1446
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1731
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
14471732 unsigned int order)
14481733 {
14491734 if (early_page_uninitialised(pfn))
14501735 return;
1451
- return __free_pages_boot_core(page, order);
1736
+ __free_pages_core(page, order);
14521737 }
14531738
14541739 /*
@@ -1539,14 +1824,14 @@
15391824 if (nr_pages == pageblock_nr_pages &&
15401825 (pfn & (pageblock_nr_pages - 1)) == 0) {
15411826 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1542
- __free_pages_boot_core(page, pageblock_order);
1827
+ __free_pages_core(page, pageblock_order);
15431828 return;
15441829 }
15451830
15461831 for (i = 0; i < nr_pages; i++, page++, pfn++) {
15471832 if ((pfn & (pageblock_nr_pages - 1)) == 0)
15481833 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1549
- __free_pages_boot_core(page, 0);
1834
+ __free_pages_core(page, 0);
15501835 }
15511836 }
15521837
@@ -1569,20 +1854,12 @@
15691854 *
15701855 * Then, we check if a current large page is valid by only checking the validity
15711856 * of the head pfn.
1572
- *
1573
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
1574
- * within a node: a pfn is between start and end of a node, but does not belong
1575
- * to this memory node.
15761857 */
1577
-static inline bool __init
1578
-deferred_pfn_valid(int nid, unsigned long pfn,
1579
- struct mminit_pfnnid_cache *nid_init_state)
1858
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
15801859 {
15811860 if (!pfn_valid_within(pfn))
15821861 return false;
15831862 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
1584
- return false;
1585
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
15861863 return false;
15871864 return true;
15881865 }
@@ -1591,21 +1868,19 @@
15911868 * Free pages to buddy allocator. Try to free aligned pages in
15921869 * pageblock_nr_pages sizes.
15931870 */
1594
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1871
+static void __init deferred_free_pages(unsigned long pfn,
15951872 unsigned long end_pfn)
15961873 {
1597
- struct mminit_pfnnid_cache nid_init_state = { };
15981874 unsigned long nr_pgmask = pageblock_nr_pages - 1;
15991875 unsigned long nr_free = 0;
16001876
16011877 for (; pfn < end_pfn; pfn++) {
1602
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1878
+ if (!deferred_pfn_valid(pfn)) {
16031879 deferred_free_range(pfn - nr_free, nr_free);
16041880 nr_free = 0;
16051881 } else if (!(pfn & nr_pgmask)) {
16061882 deferred_free_range(pfn - nr_free, nr_free);
16071883 nr_free = 1;
1608
- touch_nmi_watchdog();
16091884 } else {
16101885 nr_free++;
16111886 }
@@ -1619,22 +1894,22 @@
16191894 * by performing it only once every pageblock_nr_pages.
16201895 * Return number of pages initialized.
16211896 */
1622
-static unsigned long __init deferred_init_pages(int nid, int zid,
1897
+static unsigned long __init deferred_init_pages(struct zone *zone,
16231898 unsigned long pfn,
16241899 unsigned long end_pfn)
16251900 {
1626
- struct mminit_pfnnid_cache nid_init_state = { };
16271901 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1902
+ int nid = zone_to_nid(zone);
16281903 unsigned long nr_pages = 0;
1904
+ int zid = zone_idx(zone);
16291905 struct page *page = NULL;
16301906
16311907 for (; pfn < end_pfn; pfn++) {
1632
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
1908
+ if (!deferred_pfn_valid(pfn)) {
16331909 page = NULL;
16341910 continue;
16351911 } else if (!page || !(pfn & nr_pgmask)) {
16361912 page = pfn_to_page(pfn);
1637
- touch_nmi_watchdog();
16381913 } else {
16391914 page++;
16401915 }
@@ -1644,18 +1919,127 @@
16441919 return (nr_pages);
16451920 }
16461921
1922
+/*
1923
+ * This function is meant to pre-load the iterator for the zone init.
1924
+ * Specifically it walks through the ranges until we are caught up to the
1925
+ * first_init_pfn value and exits there. If we never encounter the value we
1926
+ * return false indicating there are no valid ranges left.
1927
+ */
1928
+static bool __init
1929
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
1930
+ unsigned long *spfn, unsigned long *epfn,
1931
+ unsigned long first_init_pfn)
1932
+{
1933
+ u64 j;
1934
+
1935
+ /*
1936
+ * Start out by walking through the ranges in this zone that have
1937
+ * already been initialized. We don't need to do anything with them
1938
+ * so we just need to flush them out of the system.
1939
+ */
1940
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
1941
+ if (*epfn <= first_init_pfn)
1942
+ continue;
1943
+ if (*spfn < first_init_pfn)
1944
+ *spfn = first_init_pfn;
1945
+ *i = j;
1946
+ return true;
1947
+ }
1948
+
1949
+ return false;
1950
+}
1951
+
1952
+/*
1953
+ * Initialize and free pages. We do it in two loops: first we initialize
1954
+ * struct page, then free to buddy allocator, because while we are
1955
+ * freeing pages we can access pages that are ahead (computing buddy
1956
+ * page in __free_one_page()).
1957
+ *
1958
+ * In order to try and keep some memory in the cache we have the loop
1959
+ * broken along max page order boundaries. This way we will not cause
1960
+ * any issues with the buddy page computation.
1961
+ */
1962
+static unsigned long __init
1963
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
1964
+ unsigned long *end_pfn)
1965
+{
1966
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
1967
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
1968
+ unsigned long nr_pages = 0;
1969
+ u64 j = *i;
1970
+
1971
+ /* First we loop through and initialize the page values */
1972
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
1973
+ unsigned long t;
1974
+
1975
+ if (mo_pfn <= *start_pfn)
1976
+ break;
1977
+
1978
+ t = min(mo_pfn, *end_pfn);
1979
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
1980
+
1981
+ if (mo_pfn < *end_pfn) {
1982
+ *start_pfn = mo_pfn;
1983
+ break;
1984
+ }
1985
+ }
1986
+
1987
+ /* Reset values and now loop through freeing pages as needed */
1988
+ swap(j, *i);
1989
+
1990
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
1991
+ unsigned long t;
1992
+
1993
+ if (mo_pfn <= spfn)
1994
+ break;
1995
+
1996
+ t = min(mo_pfn, epfn);
1997
+ deferred_free_pages(spfn, t);
1998
+
1999
+ if (mo_pfn <= epfn)
2000
+ break;
2001
+ }
2002
+
2003
+ return nr_pages;
2004
+}
2005
+
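
deferred_init_maxorder() bounds each pass at mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES): struct pages are initialized up to that boundary first and only freed afterwards, so the buddy merging done while freeing never reads a page that has not been initialized yet, and the +1 guarantees forward progress when start_pfn is already aligned. A short runnable model of just the chunk arithmetic (MAX_ORDER_NR_PAGES here is an arbitrary power-of-two stand-in):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* arbitrary power-of-two stand-in */

/* same rounding the kernel's ALIGN() performs for a power-of-two alignment */
static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long spfn = 1000, epfn = 5000;

	while (spfn < epfn) {
		/* +1 keeps the loop moving when spfn is already aligned */
		unsigned long mo_pfn = align_up(spfn + 1, MAX_ORDER_NR_PAGES);
		unsigned long t = mo_pfn < epfn ? mo_pfn : epfn;

		printf("init+free chunk [%lu, %lu)\n", spfn, t);
		spfn = t;
	}
	return 0;
}
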
2006
+static void __init
2007
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2008
+ void *arg)
2009
+{
2010
+ unsigned long spfn, epfn;
2011
+ struct zone *zone = arg;
2012
+ u64 i;
2013
+
2014
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2015
+
2016
+ /*
2017
+ * Initialize and free pages in MAX_ORDER sized increments so that we
2018
+ * can avoid introducing any issues with the buddy allocator.
2019
+ */
2020
+ while (spfn < end_pfn) {
2021
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
2022
+ cond_resched();
2023
+ }
2024
+}
2025
+
2026
+/* An arch may override for more concurrency. */
2027
+__weak int __init
2028
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2029
+{
2030
+ return 1;
2031
+}
2032
+
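
The __weak stub above keeps memmap initialization single-threaded per node unless an architecture overrides it. Purely as an illustration of the hook, not a claim about any particular architecture's implementation, an override that lets padata use every CPU local to the node could look like this:

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* hypothetical arch override, matching the __weak prototype above */
int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	/* use every CPU on the node, but never report fewer than one */
	return max_t(int, cpumask_weight(node_cpumask), 1);
}
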
16472033 /* Initialise remaining memory on a node */
16482034 static int __init deferred_init_memmap(void *data)
16492035 {
16502036 pg_data_t *pgdat = data;
1651
- int nid = pgdat->node_id;
1652
- unsigned long start = jiffies;
1653
- unsigned long nr_pages = 0;
1654
- unsigned long spfn, epfn, first_init_pfn, flags;
1655
- phys_addr_t spa, epa;
1656
- int zid;
1657
- struct zone *zone;
16582037 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2038
+ unsigned long spfn = 0, epfn = 0;
2039
+ unsigned long first_init_pfn, flags;
2040
+ unsigned long start = jiffies;
2041
+ struct zone *zone;
2042
+ int zid, max_threads;
16592043 u64 i;
16602044
16612045 /* Bind memory initialisation thread to a local node if possible */
....@@ -1688,30 +2072,36 @@
16882072 if (first_init_pfn < zone_end_pfn(zone))
16892073 break;
16902074 }
1691
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
16922075
1693
- /*
1694
- * Initialize and free pages. We do it in two loops: first we initialize
1695
- * struct page, than free to buddy allocator, because while we are
1696
- * freeing pages we can access pages that are ahead (computing buddy
1697
- * page in __free_one_page()).
1698
- */
1699
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1700
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1701
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1702
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
1703
- }
1704
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1705
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1706
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1707
- deferred_free_pages(nid, zid, spfn, epfn);
1708
- }
2076
+ /* If the zone is empty somebody else may have cleared out the zone */
2077
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2078
+ first_init_pfn))
2079
+ goto zone_empty;
17092080
2081
+ max_threads = deferred_page_init_max_threads(cpumask);
2082
+
2083
+ while (spfn < epfn) {
2084
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2085
+ struct padata_mt_job job = {
2086
+ .thread_fn = deferred_init_memmap_chunk,
2087
+ .fn_arg = zone,
2088
+ .start = spfn,
2089
+ .size = epfn_align - spfn,
2090
+ .align = PAGES_PER_SECTION,
2091
+ .min_chunk = PAGES_PER_SECTION,
2092
+ .max_threads = max_threads,
2093
+ };
2094
+
2095
+ padata_do_multithreaded(&job);
2096
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2097
+ epfn_align);
2098
+ }
2099
+zone_empty:
17102100 /* Sanity check that the next zone really is unpopulated */
17112101 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
17122102
1713
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1714
- jiffies_to_msecs(jiffies - start));
2103
+ pr_info("node %d deferred pages initialised in %ums\n",
2104
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
17152105
17162106 pgdat_init_report_one_done();
17172107 return 0;
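
deferred_init_memmap() now feeds padata_do_multithreaded() section-aligned jobs: epfn_align rounds the current end up to PAGES_PER_SECTION, the job's align/min_chunk fields keep every worker's slice on section boundaries, and deferred_init_memmap_chunk() re-derives its own iterator state for whatever sub-range it is given. The sketch below shows one plausible way such a job could be carved up among worker threads; it is not padata's actual splitting policy, and SECTION, the range and the worker body are stand-ins:

#include <pthread.h>
#include <stdio.h>

#define SECTION 256UL		/* stand-in for PAGES_PER_SECTION */
#define MAX_THREADS 4

struct chunk { unsigned long start, end; };

static void *worker(void *arg)
{
	struct chunk *c = arg;

	/* stands in for deferred_init_memmap_chunk(start, end, zone) */
	printf("thread inits [%lu, %lu)\n", c->start, c->end);
	return NULL;
}

int main(void)
{
	unsigned long start = 0, size = 4096;
	unsigned long per, nthreads = MAX_THREADS;
	struct chunk chunks[MAX_THREADS];
	pthread_t tids[MAX_THREADS];
	unsigned long i;

	/* carve the range into section-aligned pieces, one per thread */
	per = ((size / nthreads + SECTION - 1) / SECTION) * SECTION;
	for (i = 0; i < nthreads; i++) {
		chunks[i].start = start + i * per;
		chunks[i].end = chunks[i].start + per;
		if (chunks[i].start > start + size)
			chunks[i].start = start + size;
		if (chunks[i].end > start + size)
			chunks[i].end = start + size;
		pthread_create(&tids[i], NULL, worker, &chunks[i]);
	}
	for (i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);
	return 0;
}
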
....@@ -1735,14 +2125,11 @@
17352125 static noinline bool __init
17362126 deferred_grow_zone(struct zone *zone, unsigned int order)
17372127 {
1738
- int zid = zone_idx(zone);
1739
- int nid = zone_to_nid(zone);
1740
- pg_data_t *pgdat = NODE_DATA(nid);
17412128 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1742
- unsigned long nr_pages = 0;
1743
- unsigned long first_init_pfn, spfn, epfn, t, flags;
2129
+ pg_data_t *pgdat = zone->zone_pgdat;
17442130 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1745
- phys_addr_t spa, epa;
2131
+ unsigned long spfn, epfn, flags;
2132
+ unsigned long nr_pages = 0;
17462133 u64 i;
17472134
17482135 /* Only the last zone may have deferred pages */
....@@ -1760,38 +2147,37 @@
17602147 return true;
17612148 }
17622149
1763
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1764
-
1765
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
2150
+ /* If the zone is empty somebody else may have cleared out the zone */
2151
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2152
+ first_deferred_pfn)) {
2153
+ pgdat->first_deferred_pfn = ULONG_MAX;
17662154 pgdat_resize_unlock(pgdat, &flags);
1767
- return false;
2155
+ /* Retry only once. */
2156
+ return first_deferred_pfn != ULONG_MAX;
17682157 }
17692158
1770
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1771
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1772
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
2159
+ /*
2160
+ * Initialize and free pages in MAX_ORDER sized increments so
2161
+ * that we can avoid introducing any issues with the buddy
2162
+ * allocator.
2163
+ */
2164
+ while (spfn < epfn) {
2165
+ /* update our first deferred PFN for this section */
2166
+ first_deferred_pfn = spfn;
17732167
1774
- while (spfn < epfn && nr_pages < nr_pages_needed) {
1775
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1776
- first_deferred_pfn = min(t, epfn);
1777
- nr_pages += deferred_init_pages(nid, zid, spfn,
1778
- first_deferred_pfn);
1779
- spfn = first_deferred_pfn;
1780
- }
2168
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2169
+ touch_nmi_watchdog();
17812170
2171
+ /* We should only stop along section boundaries */
2172
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2173
+ continue;
2174
+
2175
+ /* If our quota has been met we can stop here */
17822176 if (nr_pages >= nr_pages_needed)
17832177 break;
17842178 }
17852179
1786
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1787
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1788
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1789
- deferred_free_pages(nid, zid, spfn, epfn);
1790
-
1791
- if (first_deferred_pfn == epfn)
1792
- break;
1793
- }
1794
- pgdat->first_deferred_pfn = first_deferred_pfn;
2180
+ pgdat->first_deferred_pfn = spfn;
17952181 pgdat_resize_unlock(pgdat, &flags);
17962182
17972183 return nr_pages > 0;
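
deferred_grow_zone() above only stops on section boundaries, using (first_deferred_pfn ^ spfn) < PAGES_PER_SECTION as a cheap same-section test: for a power-of-two section size the XOR is below PAGES_PER_SECTION exactly when both pfns agree on every bit at or above the section shift, i.e. when they fall in the same section. A runnable check of that equivalence (the section size used here is a stand-in value):

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION 32768UL	/* must be a power of two */

int main(void)
{
	unsigned long a, b;

	for (a = 0; a < 4 * PAGES_PER_SECTION; a += 777)
		for (b = 0; b < 4 * PAGES_PER_SECTION; b += 333) {
			int same_xor = (a ^ b) < PAGES_PER_SECTION;
			int same_div = (a / PAGES_PER_SECTION) ==
				       (b / PAGES_PER_SECTION);

			assert(same_xor == same_div);
		}

	printf("XOR test matches the same-section test\n");
	return 0;
}
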
....@@ -1814,9 +2200,9 @@
18142200 void __init page_alloc_init_late(void)
18152201 {
18162202 struct zone *zone;
2203
+ int nid;
18172204
18182205 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1819
- int nid;
18202206
18212207 /* There will be num_node_state(N_MEMORY) threads */
18222208 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
....@@ -1844,10 +2230,12 @@
18442230 /* Reinit limits that are based on free pages after the kernel is up */
18452231 files_maxfiles_init();
18462232 #endif
1847
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
2233
+
18482234 /* Discard memblock private memory */
18492235 memblock_discard();
1850
-#endif
2236
+
2237
+ for_each_node_state(nid, N_MEMORY)
2238
+ shuffle_free_memory(NODE_DATA(nid));
18512239
18522240 for_each_populated_zone(zone)
18532241 set_zone_contiguous(zone);
....@@ -1881,6 +2269,7 @@
18812269 }
18822270
18832271 adjust_managed_page_count(page, pageblock_nr_pages);
2272
+ page_zone(page)->cma_pages += pageblock_nr_pages;
18842273 }
18852274 #endif
18862275
....@@ -1899,13 +2288,11 @@
18992288 * -- nyc
19002289 */
19012290 static inline void expand(struct zone *zone, struct page *page,
1902
- int low, int high, struct free_area *area,
1903
- int migratetype)
2291
+ int low, int high, int migratetype)
19042292 {
19052293 unsigned long size = 1 << high;
19062294
19072295 while (high > low) {
1908
- area--;
19092296 high--;
19102297 size >>= 1;
19112298 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
....@@ -1919,39 +2306,21 @@
19192306 if (set_page_guard(zone, &page[size], high, migratetype))
19202307 continue;
19212308
1922
- list_add(&page[size].lru, &area->free_list[migratetype]);
1923
- area->nr_free++;
1924
- set_page_order(&page[size], high);
2309
+ add_to_free_list(&page[size], zone, high, migratetype);
2310
+ set_buddy_order(&page[size], high);
19252311 }
19262312 }
19272313
19282314 static void check_new_page_bad(struct page *page)
19292315 {
1930
- const char *bad_reason = NULL;
1931
- unsigned long bad_flags = 0;
1932
-
1933
- if (unlikely(atomic_read(&page->_mapcount) != -1))
1934
- bad_reason = "nonzero mapcount";
1935
- if (unlikely(page->mapping != NULL))
1936
- bad_reason = "non-NULL mapping";
1937
- if (unlikely(page_ref_count(page) != 0))
1938
- bad_reason = "nonzero _count";
19392316 if (unlikely(page->flags & __PG_HWPOISON)) {
1940
- bad_reason = "HWPoisoned (hardware-corrupted)";
1941
- bad_flags = __PG_HWPOISON;
19422317 /* Don't complain about hwpoisoned pages */
19432318 page_mapcount_reset(page); /* remove PageBuddy */
19442319 return;
19452320 }
1946
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1947
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1948
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1949
- }
1950
-#ifdef CONFIG_MEMCG
1951
- if (unlikely(page->mem_cgroup))
1952
- bad_reason = "page still charged to cgroup";
1953
-#endif
1954
- bad_page(page, bad_reason, bad_flags);
2321
+
2322
+ bad_page(page,
2323
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
19552324 }
19562325
19572326 /*
....@@ -1967,30 +2336,40 @@
19672336 return 1;
19682337 }
19692338
1970
-static inline bool free_pages_prezeroed(void)
1971
-{
1972
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1973
- page_poisoning_enabled()) || want_init_on_free();
1974
-}
1975
-
19762339 #ifdef CONFIG_DEBUG_VM
1977
-static bool check_pcp_refill(struct page *page)
2340
+/*
2341
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2342
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2343
+ * also checked when pcp lists are refilled from the free lists.
2344
+ */
2345
+static inline bool check_pcp_refill(struct page *page)
19782346 {
1979
- return false;
2347
+ if (debug_pagealloc_enabled_static())
2348
+ return check_new_page(page);
2349
+ else
2350
+ return false;
19802351 }
19812352
1982
-static bool check_new_pcp(struct page *page)
2353
+static inline bool check_new_pcp(struct page *page)
19832354 {
19842355 return check_new_page(page);
19852356 }
19862357 #else
1987
-static bool check_pcp_refill(struct page *page)
2358
+/*
2359
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2360
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
2361
+ * enabled, they are also checked when being allocated from the pcp lists.
2362
+ */
2363
+static inline bool check_pcp_refill(struct page *page)
19882364 {
19892365 return check_new_page(page);
19902366 }
1991
-static bool check_new_pcp(struct page *page)
2367
+static inline bool check_new_pcp(struct page *page)
19922368 {
1993
- return false;
2369
+ if (debug_pagealloc_enabled_static())
2370
+ return check_new_page(page);
2371
+ else
2372
+ return false;
19942373 }
19952374 #endif /* CONFIG_DEBUG_VM */
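
The check_pcp_refill()/check_new_pcp() hunks above make the order-0 sanity checks symmetric: with CONFIG_DEBUG_VM the page is always checked at allocation time and additionally at pcp refill when debug_pagealloc is enabled; without CONFIG_DEBUG_VM it is always checked at refill and additionally at allocation when debug_pagealloc is enabled. A tiny runnable truth table of that policy:

#include <stdbool.h>
#include <stdio.h>

/* which of the two order-0 checks fire, per the comments in the hunk above */
static void show(bool debug_vm, bool debug_pagealloc)
{
	bool check_at_refill = debug_vm ? debug_pagealloc : true;
	bool check_at_alloc  = debug_vm ? true : debug_pagealloc;

	printf("DEBUG_VM=%d debug_pagealloc=%d -> refill:%d alloc:%d\n",
	       debug_vm, debug_pagealloc, check_at_refill, check_at_alloc);
}

int main(void)
{
	for (int vm = 0; vm <= 1; vm++)
		for (int pa = 0; pa <= 1; pa++)
			show(vm, pa);
	return 0;
}
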
19962375
....@@ -2014,9 +2393,31 @@
20142393 set_page_refcounted(page);
20152394
20162395 arch_alloc_page(page, order);
2017
- kernel_map_pages(page, 1 << order, 1);
2018
- kasan_alloc_pages(page, order);
2019
- kernel_poison_pages(page, 1 << order, 1);
2396
+ debug_pagealloc_map_pages(page, 1 << order);
2397
+
2398
+ /*
2399
+ * Page unpoisoning must happen before memory initialization.
2400
+ * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2401
+ * allocations and the page unpoisoning code will complain.
2402
+ */
2403
+ kernel_unpoison_pages(page, 1 << order);
2404
+
2405
+ /*
2406
+ * As memory initialization might be integrated into KASAN,
2407
+ * kasan_alloc_pages and kernel_init_free_pages must be
2408
+ * kept together to avoid discrepancies in behavior.
2409
+ */
2410
+ if (kasan_has_integrated_init()) {
2411
+ kasan_alloc_pages(page, order, gfp_flags);
2412
+ } else {
2413
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
2414
+
2415
+ kasan_unpoison_pages(page, order, init);
2416
+ if (init)
2417
+ kernel_init_free_pages(page, 1 << order,
2418
+ gfp_flags & __GFP_ZEROTAGS);
2419
+ }
2420
+
20202421 set_page_owner(page, order, gfp_flags);
20212422 }
20222423
....@@ -2024,9 +2425,6 @@
20242425 unsigned int alloc_flags)
20252426 {
20262427 post_alloc_hook(page, order, gfp_flags);
2027
-
2028
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
2029
- kernel_init_free_pages(page, 1 << order);
20302428
20312429 if (order && (gfp_flags & __GFP_COMP))
20322430 prep_compound_page(page, order);
....@@ -2041,6 +2439,7 @@
20412439 set_page_pfmemalloc(page);
20422440 else
20432441 clear_page_pfmemalloc(page);
2442
+ trace_android_vh_test_clear_look_around_ref(page);
20442443 }
20452444
20462445 /*
....@@ -2058,14 +2457,11 @@
20582457 /* Find a page of the appropriate size in the preferred list */
20592458 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
20602459 area = &(zone->free_area[current_order]);
2061
- page = list_first_entry_or_null(&area->free_list[migratetype],
2062
- struct page, lru);
2460
+ page = get_page_from_free_area(area, migratetype);
20632461 if (!page)
20642462 continue;
2065
- list_del(&page->lru);
2066
- rmv_page_order(page);
2067
- area->nr_free--;
2068
- expand(zone, page, order, current_order, area, migratetype);
2463
+ del_page_from_free_list(page, zone, current_order);
2464
+ expand(zone, page, order, current_order, migratetype);
20692465 set_pcppage_migratetype(page, migratetype);
20702466 return page;
20712467 }
....@@ -2078,10 +2474,10 @@
20782474 * This array describes the order lists are fallen back to when
20792475 * the free lists for the desirable migrate type are depleted
20802476 */
2081
-static int fallbacks[MIGRATE_TYPES][4] = {
2477
+static int fallbacks[MIGRATE_TYPES][3] = {
20822478 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
2083
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20842479 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2480
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
20852481 #ifdef CONFIG_CMA
20862482 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
20872483 #endif
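
The reordered fallbacks[] table above is what find_suitable_fallback() walks: starting from the requested migratetype's row, the allocator takes the first fallback type whose free area is not empty and stops at the MIGRATE_TYPES sentinel. A simplified runnable model of that walk; the enum covers only the pcp migratetypes and is not the kernel's full definition, and the free-list state is made up:

#include <stdbool.h>
#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NTYPES };	/* simplified */

static const int fallbacks[NTYPES][3] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE, NTYPES },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, NTYPES },
	[RECLAIMABLE] = { UNMOVABLE, MOVABLE, NTYPES },
};

/* pretend only the reclaimable free list has pages at this order */
static bool free_list_empty(int mt)
{
	return mt != RECLAIMABLE;
}

static int find_fallback(int start_mt)
{
	for (int i = 0; ; i++) {
		int mt = fallbacks[start_mt][i];

		if (mt == NTYPES)
			return -1;		/* nothing to steal */
		if (!free_list_empty(mt))
			return mt;
	}
}

static const char *const names[] = { "unmovable", "movable", "reclaimable" };

int main(void)
{
	int mt = find_fallback(UNMOVABLE);

	printf("unmovable request falls back to: %s\n",
	       mt >= 0 ? names[mt] : "none");
	return 0;
}
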
....@@ -2102,7 +2498,7 @@
21022498 #endif
21032499
21042500 /*
2105
- * Move the free pages in a range to the free lists of the requested type.
2501
+ * Move the free pages in a range to the freelist tail of the requested type.
21062502 * Note that start_page and end_pages are not aligned on a pageblock
21072503 * boundary. If alignment is required, use move_freepages_block()
21082504 */
....@@ -2114,30 +2510,11 @@
21142510 unsigned int order;
21152511 int pages_moved = 0;
21162512
2117
-#ifndef CONFIG_HOLES_IN_ZONE
2118
- /*
2119
- * page_zone is not safe to call in this context when
2120
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
2121
- * anyway as we check zone boundaries in move_freepages_block().
2122
- * Remove at a later date when no bug reports exist related to
2123
- * grouping pages by mobility
2124
- */
2125
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
2126
- pfn_valid(page_to_pfn(end_page)) &&
2127
- page_zone(start_page) != page_zone(end_page));
2128
-#endif
2129
-
2130
- if (num_movable)
2131
- *num_movable = 0;
2132
-
21332513 for (page = start_page; page <= end_page;) {
21342514 if (!pfn_valid_within(page_to_pfn(page))) {
21352515 page++;
21362516 continue;
21372517 }
2138
-
2139
- /* Make sure we are not inadvertently changing nodes */
2140
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
21412518
21422519 if (!PageBuddy(page)) {
21432520 /*
....@@ -2153,9 +2530,12 @@
21532530 continue;
21542531 }
21552532
2156
- order = page_order(page);
2157
- list_move(&page->lru,
2158
- &zone->free_area[order].free_list[migratetype]);
2533
+ /* Make sure we are not inadvertently changing nodes */
2534
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2535
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2536
+
2537
+ order = buddy_order(page);
2538
+ move_to_free_list(page, zone, order, migratetype);
21592539 page += 1 << order;
21602540 pages_moved += 1 << order;
21612541 }
....@@ -2168,6 +2548,9 @@
21682548 {
21692549 unsigned long start_pfn, end_pfn;
21702550 struct page *start_page, *end_page;
2551
+
2552
+ if (num_movable)
2553
+ *num_movable = 0;
21712554
21722555 start_pfn = page_to_pfn(page);
21732556 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
....@@ -2229,6 +2612,43 @@
22292612 return false;
22302613 }
22312614
2615
+static inline bool boost_watermark(struct zone *zone)
2616
+{
2617
+ unsigned long max_boost;
2618
+
2619
+ if (!watermark_boost_factor)
2620
+ return false;
2621
+ /*
2622
+ * Don't bother in zones that are unlikely to produce results.
2623
+ * On small machines, including kdump capture kernels running
2624
+ * in a small area, boosting the watermark can cause an out of
2625
+ * memory situation immediately.
2626
+ */
2627
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2628
+ return false;
2629
+
2630
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2631
+ watermark_boost_factor, 10000);
2632
+
2633
+ /*
2634
+ * high watermark may be uninitialised if fragmentation occurs
2635
+ * very early in boot so do not boost. We do not fall
2636
+ * through and boost by pageblock_nr_pages as failing
2637
+ * allocations that early means that reclaim is not going
2638
+ * to help and it may even be impossible to reclaim the
2639
+ * boosted watermark resulting in a hang.
2640
+ */
2641
+ if (!max_boost)
2642
+ return false;
2643
+
2644
+ max_boost = max(pageblock_nr_pages, max_boost);
2645
+
2646
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2647
+ max_boost);
2648
+
2649
+ return true;
2650
+}
2651
+
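
boost_watermark() above caps the boost at watermark_boost_factor/10000 of the high watermark, computed with the overflow-safe mult_frac(), and then raises watermark_boost by one pageblock each time the fallback path calls it, clamped to that cap. A runnable model of the arithmetic; the high watermark, the assumed default factor of 15000 and the 512-page pageblock are illustrative stand-ins:

#include <stdio.h>

/* same computation as the kernel's mult_frac(): avoids x * numer overflow */
static unsigned long mult_frac(unsigned long x, unsigned long numer,
			       unsigned long denom)
{
	unsigned long quot = x / denom;
	unsigned long rem = x % denom;

	return quot * numer + (rem * numer) / denom;
}

int main(void)
{
	unsigned long high_wmark = 12800;	/* pages; illustrative */
	unsigned long boost_factor = 15000;	/* assumed default, 150% */
	unsigned long pageblock_nr_pages = 512;	/* 2MB blocks with 4K pages */
	unsigned long boost = 0, max_boost;

	max_boost = mult_frac(high_wmark, boost_factor, 10000);	/* 19200 */
	if (max_boost < pageblock_nr_pages)
		max_boost = pageblock_nr_pages;

	/* each fallback event adds one pageblock, clamped to max_boost */
	for (int event = 0; event < 50; event++) {
		boost += pageblock_nr_pages;
		if (boost > max_boost)
			boost = max_boost;
	}
	printf("watermark_boost settles at %lu pages (cap %lu)\n",
	       boost, max_boost);
	return 0;
}
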
22322652 /*
22332653 * This function implements actual steal behaviour. If order is large enough,
22342654 * we can steal whole pageblock. If not, we first move freepages in this
....@@ -2238,10 +2658,9 @@
22382658 * itself, so pages freed in the future will be put on the correct free list.
22392659 */
22402660 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2241
- int start_type, bool whole_block)
2661
+ unsigned int alloc_flags, int start_type, bool whole_block)
22422662 {
2243
- unsigned int current_order = page_order(page);
2244
- struct free_area *area;
2663
+ unsigned int current_order = buddy_order(page);
22452664 int free_pages, movable_pages, alike_pages;
22462665 int old_block_type;
22472666
....@@ -2259,6 +2678,14 @@
22592678 change_pageblock_range(page, current_order, start_type);
22602679 goto single_page;
22612680 }
2681
+
2682
+ /*
2683
+ * Boost watermarks to increase reclaim pressure to reduce the
2684
+ * likelihood of future fallbacks. Wake kswapd now as the node
2685
+ * may be balanced overall and kswapd will not wake naturally.
2686
+ */
2687
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2688
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
22622689
22632690 /* We are not allowed to try stealing from the whole block */
22642691 if (!whole_block)
....@@ -2303,8 +2730,7 @@
23032730 return;
23042731
23052732 single_page:
2306
- area = &zone->free_area[current_order];
2307
- list_move(&page->lru, &area->free_list[start_type]);
2733
+ move_to_free_list(page, zone, current_order, start_type);
23082734 }
23092735
23102736 /*
....@@ -2328,7 +2754,7 @@
23282754 if (fallback_mt == MIGRATE_TYPES)
23292755 break;
23302756
2331
- if (list_empty(&area->free_list[fallback_mt]))
2757
+ if (free_area_empty(area, fallback_mt))
23322758 continue;
23332759
23342760 if (can_steal_fallback(order, migratetype))
....@@ -2358,7 +2784,7 @@
23582784 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
23592785 * Check is race-prone but harmless.
23602786 */
2361
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2787
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
23622788 if (zone->nr_reserved_highatomic >= max_managed)
23632789 return;
23642790
....@@ -2401,7 +2827,7 @@
24012827 int order;
24022828 bool ret;
24032829
2404
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2830
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
24052831 ac->nodemask) {
24062832 /*
24072833 * Preserve at least one pageblock unless memory pressure
....@@ -2415,9 +2841,7 @@
24152841 for (order = 0; order < MAX_ORDER; order++) {
24162842 struct free_area *area = &(zone->free_area[order]);
24172843
2418
- page = list_first_entry_or_null(
2419
- &area->free_list[MIGRATE_HIGHATOMIC],
2420
- struct page, lru);
2844
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
24212845 if (!page)
24222846 continue;
24232847
....@@ -2475,20 +2899,30 @@
24752899 * condition simpler.
24762900 */
24772901 static __always_inline bool
2478
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2902
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2903
+ unsigned int alloc_flags)
24792904 {
24802905 struct free_area *area;
24812906 int current_order;
2907
+ int min_order = order;
24822908 struct page *page;
24832909 int fallback_mt;
24842910 bool can_steal;
2911
+
2912
+ /*
2913
+ * Do not steal pages from freelists belonging to other pageblocks
2914
+ * i.e. orders < pageblock_order. If there are no local zones free,
2915
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2916
+ */
2917
+ if (alloc_flags & ALLOC_NOFRAGMENT)
2918
+ min_order = pageblock_order;
24852919
24862920 /*
24872921 * Find the largest available free page in the other list. This roughly
24882922 * approximates finding the pageblock with the most free pages, which
24892923 * would be too costly to do exactly.
24902924 */
2491
- for (current_order = MAX_ORDER - 1; current_order >= order;
2925
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
24922926 --current_order) {
24932927 area = &(zone->free_area[current_order]);
24942928 fallback_mt = find_suitable_fallback(area, current_order,
....@@ -2530,10 +2964,10 @@
25302964 VM_BUG_ON(current_order == MAX_ORDER);
25312965
25322966 do_steal:
2533
- page = list_first_entry(&area->free_list[fallback_mt],
2534
- struct page, lru);
2967
+ page = get_page_from_free_area(area, fallback_mt);
25352968
2536
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2969
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2970
+ can_steal);
25372971
25382972 trace_mm_page_alloc_extfrag(page, order, current_order,
25392973 start_migratetype, fallback_mt);
....@@ -2547,14 +2981,16 @@
25472981 * Call me with the zone->lock already held.
25482982 */
25492983 static __always_inline struct page *
2550
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2984
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2985
+ unsigned int alloc_flags)
25512986 {
25522987 struct page *page;
25532988
25542989 retry:
25552990 page = __rmqueue_smallest(zone, order, migratetype);
25562991
2557
- if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype))
2992
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
2993
+ alloc_flags))
25582994 goto retry;
25592995
25602996 trace_mm_page_alloc_zone_locked(page, order, migratetype);
....@@ -2562,18 +2998,18 @@
25622998 }
25632999
25643000 #ifdef CONFIG_CMA
2565
-static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3001
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3002
+ int migratetype,
3003
+ unsigned int alloc_flags)
25663004 {
2567
- struct page *page = 0;
2568
-
2569
- if (IS_ENABLED(CONFIG_CMA))
2570
- if (!zone->cma_alloc)
2571
- page = __rmqueue_cma_fallback(zone, order);
3005
+ struct page *page = __rmqueue_cma_fallback(zone, order);
25723006 trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
25733007 return page;
25743008 }
25753009 #else
2576
-static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
3010
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
3011
+ int migratetype,
3012
+ unsigned int alloc_flags)
25773013 {
25783014 return NULL;
25793015 }
....@@ -2586,7 +3022,7 @@
25863022 */
25873023 static int rmqueue_bulk(struct zone *zone, unsigned int order,
25883024 unsigned long count, struct list_head *list,
2589
- int migratetype)
3025
+ int migratetype, unsigned int alloc_flags)
25903026 {
25913027 int i, alloced = 0;
25923028
....@@ -2594,15 +3030,11 @@
25943030 for (i = 0; i < count; ++i) {
25953031 struct page *page;
25963032
2597
- /*
2598
- * If migrate type CMA is being requested only try to
2599
- * satisfy the request with CMA pages to try and increase
2600
- * CMA utlization.
2601
- */
26023033 if (is_migrate_cma(migratetype))
2603
- page = __rmqueue_cma(zone, order);
3034
+ page = __rmqueue_cma(zone, order, migratetype,
3035
+ alloc_flags);
26043036 else
2605
- page = __rmqueue(zone, order, migratetype);
3037
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
26063038
26073039 if (unlikely(page == NULL))
26083040 break;
....@@ -2645,14 +3077,14 @@
26453077 */
26463078 static struct list_head *get_populated_pcp_list(struct zone *zone,
26473079 unsigned int order, struct per_cpu_pages *pcp,
2648
- int migratetype)
3080
+ int migratetype, unsigned int alloc_flags)
26493081 {
26503082 struct list_head *list = &pcp->lists[migratetype];
26513083
26523084 if (list_empty(list)) {
26533085 pcp->count += rmqueue_bulk(zone, order,
26543086 pcp->batch, list,
2655
- migratetype);
3087
+ migratetype, alloc_flags);
26563088
26573089 if (list_empty(list))
26583090 list = NULL;
....@@ -2673,13 +3105,18 @@
26733105 {
26743106 unsigned long flags;
26753107 int to_drain, batch;
3108
+ LIST_HEAD(dst);
26763109
2677
- local_irq_save(flags);
3110
+ local_lock_irqsave(&pa_lock.l, flags);
26783111 batch = READ_ONCE(pcp->batch);
26793112 to_drain = min(pcp->count, batch);
26803113 if (to_drain > 0)
2681
- free_pcppages_bulk(zone, to_drain, pcp);
2682
- local_irq_restore(flags);
3114
+ isolate_pcp_pages(to_drain, pcp, &dst);
3115
+
3116
+ local_unlock_irqrestore(&pa_lock.l, flags);
3117
+
3118
+ if (to_drain > 0)
3119
+ free_pcppages_bulk(zone, &dst, false);
26833120 }
26843121 #endif
26853122
....@@ -2695,14 +3132,21 @@
26953132 unsigned long flags;
26963133 struct per_cpu_pageset *pset;
26973134 struct per_cpu_pages *pcp;
3135
+ LIST_HEAD(dst);
3136
+ int count;
26983137
2699
- local_irq_save(flags);
3138
+ local_lock_irqsave(&pa_lock.l, flags);
27003139 pset = per_cpu_ptr(zone->pageset, cpu);
27013140
27023141 pcp = &pset->pcp;
2703
- if (pcp->count)
2704
- free_pcppages_bulk(zone, pcp->count, pcp);
2705
- local_irq_restore(flags);
3142
+ count = pcp->count;
3143
+ if (count)
3144
+ isolate_pcp_pages(count, pcp, &dst);
3145
+
3146
+ local_unlock_irqrestore(&pa_lock.l, flags);
3147
+
3148
+ if (count)
3149
+ free_pcppages_bulk(zone, &dst, false);
27063150 }
27073151
27083152 /*
....@@ -2739,6 +3183,10 @@
27393183
27403184 static void drain_local_pages_wq(struct work_struct *work)
27413185 {
3186
+ struct pcpu_drain *drain;
3187
+
3188
+ drain = container_of(work, struct pcpu_drain, work);
3189
+
27423190 /*
27433191 * drain_all_pages doesn't use proper cpu hotplug protection so
27443192 * we can race with cpu offline when the WQ can move this from
....@@ -2746,9 +3194,9 @@
27463194 * cpu which is allright but we also have to make sure to not move to
27473195 * a different one.
27483196 */
2749
- preempt_disable();
2750
- drain_local_pages(NULL);
2751
- preempt_enable();
3197
+ migrate_disable();
3198
+ drain_local_pages(drain->zone);
3199
+ migrate_enable();
27523200 }
27533201
27543202 /*
....@@ -2818,12 +3266,14 @@
28183266 }
28193267
28203268 for_each_cpu(cpu, &cpus_with_pcps) {
2821
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2822
- INIT_WORK(work, drain_local_pages_wq);
2823
- queue_work_on(cpu, mm_percpu_wq, work);
3269
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
3270
+
3271
+ drain->zone = zone;
3272
+ INIT_WORK(&drain->work, drain_local_pages_wq);
3273
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
28243274 }
28253275 for_each_cpu(cpu, &cpus_with_pcps)
2826
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
3276
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
28273277
28283278 mutex_unlock(&pcpu_drain_mutex);
28293279 }
....@@ -2895,11 +3345,13 @@
28953345 return true;
28963346 }
28973347
2898
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
3348
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
3349
+ struct list_head *dst)
28993350 {
29003351 struct zone *zone = page_zone(page);
29013352 struct per_cpu_pages *pcp;
29023353 int migratetype;
3354
+ bool pcp_skip_cma_pages = false;
29033355
29043356 migratetype = get_pcppage_migratetype(page);
29053357 __count_vm_event(PGFREE);
....@@ -2912,8 +3364,12 @@
29123364 * excessively into the page allocator
29133365 */
29143366 if (migratetype >= MIGRATE_PCPTYPES) {
2915
- if (unlikely(is_migrate_isolate(migratetype))) {
2916
- free_one_page(zone, page, pfn, 0, migratetype);
3367
+ trace_android_vh_pcplist_add_cma_pages_bypass(migratetype,
3368
+ &pcp_skip_cma_pages);
3369
+ if (unlikely(is_migrate_isolate(migratetype)) ||
3370
+ pcp_skip_cma_pages) {
3371
+ free_one_page(zone, page, pfn, 0, migratetype,
3372
+ FPI_NONE);
29173373 return;
29183374 }
29193375 migratetype = MIGRATE_MOVABLE;
....@@ -2924,7 +3380,8 @@
29243380 pcp->count++;
29253381 if (pcp->count >= pcp->high) {
29263382 unsigned long batch = READ_ONCE(pcp->batch);
2927
- free_pcppages_bulk(zone, batch, pcp);
3383
+
3384
+ isolate_pcp_pages(batch, pcp, dst);
29283385 }
29293386 }
29303387
....@@ -2935,13 +3392,17 @@
29353392 {
29363393 unsigned long flags;
29373394 unsigned long pfn = page_to_pfn(page);
3395
+ struct zone *zone = page_zone(page);
3396
+ LIST_HEAD(dst);
29383397
29393398 if (!free_unref_page_prepare(page, pfn))
29403399 return;
29413400
2942
- local_irq_save(flags);
2943
- free_unref_page_commit(page, pfn);
2944
- local_irq_restore(flags);
3401
+ local_lock_irqsave(&pa_lock.l, flags);
3402
+ free_unref_page_commit(page, pfn, &dst);
3403
+ local_unlock_irqrestore(&pa_lock.l, flags);
3404
+ if (!list_empty(&dst))
3405
+ free_pcppages_bulk(zone, &dst, false);
29453406 }
29463407
29473408 /*
....@@ -2952,6 +3413,11 @@
29523413 struct page *page, *next;
29533414 unsigned long flags, pfn;
29543415 int batch_count = 0;
3416
+ struct list_head dsts[__MAX_NR_ZONES];
3417
+ int i;
3418
+
3419
+ for (i = 0; i < __MAX_NR_ZONES; i++)
3420
+ INIT_LIST_HEAD(&dsts[i]);
29553421
29563422 /* Prepare pages for freeing */
29573423 list_for_each_entry_safe(page, next, list, lru) {
....@@ -2961,25 +3427,42 @@
29613427 set_page_private(page, pfn);
29623428 }
29633429
2964
- local_irq_save(flags);
3430
+ local_lock_irqsave(&pa_lock.l, flags);
29653431 list_for_each_entry_safe(page, next, list, lru) {
29663432 unsigned long pfn = page_private(page);
3433
+ enum zone_type type;
29673434
29683435 set_page_private(page, 0);
29693436 trace_mm_page_free_batched(page);
2970
- free_unref_page_commit(page, pfn);
3437
+ type = page_zonenum(page);
3438
+ free_unref_page_commit(page, pfn, &dsts[type]);
29713439
29723440 /*
29733441 * Guard against excessive IRQ disabled times when we get
29743442 * a large list of pages to free.
29753443 */
29763444 if (++batch_count == SWAP_CLUSTER_MAX) {
2977
- local_irq_restore(flags);
3445
+ local_unlock_irqrestore(&pa_lock.l, flags);
29783446 batch_count = 0;
2979
- local_irq_save(flags);
3447
+ local_lock_irqsave(&pa_lock.l, flags);
29803448 }
29813449 }
2982
- local_irq_restore(flags);
3450
+ local_unlock_irqrestore(&pa_lock.l, flags);
3451
+
3452
+ for (i = 0; i < __MAX_NR_ZONES; ) {
3453
+ struct page *page;
3454
+ struct zone *zone;
3455
+
3456
+ if (list_empty(&dsts[i])) {
3457
+ i++;
3458
+ continue;
3459
+ }
3460
+
3461
+ page = list_first_entry(&dsts[i], struct page, lru);
3462
+ zone = page_zone(page);
3463
+
3464
+ free_pcppages_bulk(zone, &dsts[i], true);
3465
+ }
29833466 }
29843467
29853468 /*
....@@ -2999,7 +3482,8 @@
29993482
30003483 for (i = 1; i < (1 << order); i++)
30013484 set_page_refcounted(page + i);
3002
- split_page_owner(page, order);
3485
+ split_page_owner(page, 1 << order);
3486
+ split_page_memcg(page, 1 << order);
30033487 }
30043488 EXPORT_SYMBOL_GPL(split_page);
30053489
....@@ -3021,7 +3505,7 @@
30213505 * watermark, because we already know our high-order page
30223506 * exists.
30233507 */
3024
- watermark = min_wmark_pages(zone) + (1UL << order);
3508
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
30253509 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
30263510 return 0;
30273511
....@@ -3029,9 +3513,8 @@
30293513 }
30303514
30313515 /* Remove page from free list */
3032
- list_del(&page->lru);
3033
- zone->free_area[order].nr_free--;
3034
- rmv_page_order(page);
3516
+
3517
+ del_page_from_free_list(page, zone, order);
30353518
30363519 /*
30373520 * Set the pageblock if the isolated page is at least half of a
....@@ -3050,6 +3533,27 @@
30503533
30513534
30523535 return 1UL << order;
3536
+}
3537
+
3538
+/**
3539
+ * __putback_isolated_page - Return a now-isolated page back where we got it
3540
+ * @page: Page that was isolated
3541
+ * @order: Order of the isolated page
3542
+ * @mt: The page's pageblock's migratetype
3543
+ *
3544
+ * This function is meant to return a page pulled from the free lists via
3545
+ * __isolate_free_page back to the free lists they were pulled from.
3546
+ */
3547
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3548
+{
3549
+ struct zone *zone = page_zone(page);
3550
+
3551
+ /* zone lock should be held when this function is called */
3552
+ lockdep_assert_held(&zone->lock);
3553
+
3554
+ /* Return isolated page to tail of freelist. */
3555
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
3556
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
30533557 }
30543558
30553559 /*
....@@ -3081,6 +3585,7 @@
30813585
30823586 /* Remove page from the per-cpu list, caller must protect the list */
30833587 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3588
+ unsigned int alloc_flags,
30843589 struct per_cpu_pages *pcp,
30853590 gfp_t gfp_flags)
30863591 {
....@@ -3090,9 +3595,9 @@
30903595 do {
30913596 /* First try to get CMA pages */
30923597 if (migratetype == MIGRATE_MOVABLE &&
3093
- gfp_flags & __GFP_CMA) {
3598
+ alloc_flags & ALLOC_CMA) {
30943599 list = get_populated_pcp_list(zone, 0, pcp,
3095
- get_cma_migrate_type());
3600
+ get_cma_migrate_type(), alloc_flags);
30963601 }
30973602
30983603 if (list == NULL) {
....@@ -3101,7 +3606,7 @@
31013606 * free CMA pages.
31023607 */
31033608 list = get_populated_pcp_list(zone, 0, pcp,
3104
- migratetype);
3609
+ migratetype, alloc_flags);
31053610 if (unlikely(list == NULL) ||
31063611 unlikely(list_empty(list)))
31073612 return NULL;
....@@ -3117,22 +3622,22 @@
31173622
31183623 /* Lock and remove page from the per-cpu list */
31193624 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3120
- struct zone *zone, unsigned int order,
3121
- gfp_t gfp_flags, int migratetype)
3625
+ struct zone *zone, gfp_t gfp_flags,
3626
+ int migratetype, unsigned int alloc_flags)
31223627 {
31233628 struct per_cpu_pages *pcp;
31243629 struct page *page;
31253630 unsigned long flags;
31263631
3127
- local_irq_save(flags);
3632
+ local_lock_irqsave(&pa_lock.l, flags);
31283633 pcp = &this_cpu_ptr(zone->pageset)->pcp;
3129
- page = __rmqueue_pcplist(zone, migratetype, pcp,
3634
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
31303635 gfp_flags);
31313636 if (page) {
3132
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3637
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
31333638 zone_statistics(preferred_zone, zone);
31343639 }
3135
- local_irq_restore(flags);
3640
+ local_unlock_irqrestore(&pa_lock.l, flags);
31363641 return page;
31373642 }
31383643
....@@ -3149,8 +3654,8 @@
31493654 struct page *page;
31503655
31513656 if (likely(order == 0)) {
3152
- page = rmqueue_pcplist(preferred_zone, zone, order,
3153
- gfp_flags, migratetype);
3657
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
3658
+ migratetype, alloc_flags);
31543659 goto out;
31553660 }
31563661
....@@ -3159,25 +3664,32 @@
31593664 * allocate greater than order-1 page units with __GFP_NOFAIL.
31603665 */
31613666 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3162
- spin_lock_irqsave(&zone->lock, flags);
3667
+ local_lock_irqsave(&pa_lock.l, flags);
3668
+ spin_lock(&zone->lock);
31633669
31643670 do {
31653671 page = NULL;
3166
-
3167
- if (alloc_flags & ALLOC_HARDER) {
3672
+ /*
3673
+ * order-0 request can reach here when the pcplist is skipped
3674
+ * due to non-CMA allocation context. HIGHATOMIC area is
3675
+ * reserved for high-order atomic allocation, so order-0
3676
+ * request should skip it.
3677
+ */
3678
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
31683679 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
31693680 if (page)
31703681 trace_mm_page_alloc_zone_locked(page, order, migratetype);
31713682 }
3172
-
3173
- if (!page && migratetype == MIGRATE_MOVABLE &&
3174
- gfp_flags & __GFP_CMA)
3175
- page = __rmqueue_cma(zone, order);
3176
-
3177
- if (!page)
3178
- page = __rmqueue(zone, order, migratetype);
3683
+ if (!page) {
3684
+ if (migratetype == MIGRATE_MOVABLE &&
3685
+ alloc_flags & ALLOC_CMA)
3686
+ page = __rmqueue_cma(zone, order, migratetype,
3687
+ alloc_flags);
3688
+ if (!page)
3689
+ page = __rmqueue(zone, order, migratetype,
3690
+ alloc_flags);
3691
+ }
31793692 } while (page && check_new_pages(page, order));
3180
-
31813693 spin_unlock(&zone->lock);
31823694 if (!page)
31833695 goto failed;
....@@ -3186,14 +3698,22 @@
31863698
31873699 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
31883700 zone_statistics(preferred_zone, zone);
3189
- local_irq_restore(flags);
3701
+ trace_android_vh_rmqueue(preferred_zone, zone, order,
3702
+ gfp_flags, alloc_flags, migratetype);
3703
+ local_unlock_irqrestore(&pa_lock.l, flags);
31903704
31913705 out:
3706
+ /* Separate test+clear to avoid unnecessary atomics */
3707
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
3708
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3709
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3710
+ }
3711
+
31923712 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
31933713 return page;
31943714
31953715 failed:
3196
- local_irq_restore(flags);
3716
+ local_unlock_irqrestore(&pa_lock.l, flags);
31973717 return NULL;
31983718 }
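
The ZONE_BOOSTED_WATERMARK handling in rmqueue() above tests the bit with a plain read and only then clears it, as the comment says, so the common case where the bit is clear never performs an atomic read-modify-write. The small window this leaves (two CPUs both observing the bit set) is harmless, costing at most an extra kswapd wakeup. A runnable model of the same test-then-clear idea using C11 atomics:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ZONE_BOOSTED_WATERMARK 0

static atomic_ulong zone_flags;

/* plain read first; only pay for the atomic clear when the bit is set */
static bool test_and_clear_boosted(void)
{
	if (!(atomic_load_explicit(&zone_flags, memory_order_relaxed) &
	      (1UL << ZONE_BOOSTED_WATERMARK)))
		return false;
	atomic_fetch_and(&zone_flags, ~(1UL << ZONE_BOOSTED_WATERMARK));
	return true;
}

int main(void)
{
	atomic_fetch_or(&zone_flags, 1UL << ZONE_BOOSTED_WATERMARK);
	printf("first check: %d, second check: %d\n",
	       test_and_clear_boosted(), test_and_clear_boosted());
	return 0;
}
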
31993719
....@@ -3218,7 +3738,7 @@
32183738 }
32193739 __setup("fail_page_alloc=", setup_fail_page_alloc);
32203740
3221
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3741
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32223742 {
32233743 if (order < fail_page_alloc.min_order)
32243744 return false;
....@@ -3242,24 +3762,14 @@
32423762
32433763 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
32443764 &fail_page_alloc.attr);
3245
- if (IS_ERR(dir))
3246
- return PTR_ERR(dir);
32473765
3248
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
3249
- &fail_page_alloc.ignore_gfp_reclaim))
3250
- goto fail;
3251
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3252
- &fail_page_alloc.ignore_gfp_highmem))
3253
- goto fail;
3254
- if (!debugfs_create_u32("min-order", mode, dir,
3255
- &fail_page_alloc.min_order))
3256
- goto fail;
3766
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
3767
+ &fail_page_alloc.ignore_gfp_reclaim);
3768
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3769
+ &fail_page_alloc.ignore_gfp_highmem);
3770
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
32573771
32583772 return 0;
3259
-fail:
3260
- debugfs_remove_recursive(dir);
3261
-
3262
- return -ENOMEM;
32633773 }
32643774
32653775 late_initcall(fail_page_alloc_debugfs);
....@@ -3268,12 +3778,41 @@
32683778
32693779 #else /* CONFIG_FAIL_PAGE_ALLOC */
32703780
3271
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3781
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
32723782 {
32733783 return false;
32743784 }
32753785
32763786 #endif /* CONFIG_FAIL_PAGE_ALLOC */
3787
+
3788
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3789
+{
3790
+ return __should_fail_alloc_page(gfp_mask, order);
3791
+}
3792
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3793
+
3794
+static inline long __zone_watermark_unusable_free(struct zone *z,
3795
+ unsigned int order, unsigned int alloc_flags)
3796
+{
3797
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
3798
+ long unusable_free = (1 << order) - 1;
3799
+
3800
+ /*
3801
+ * If the caller does not have rights to ALLOC_HARDER then subtract
3802
+ * the high-atomic reserves. This will over-estimate the size of the
3803
+ * atomic reserve but it avoids a search.
3804
+ */
3805
+ if (likely(!alloc_harder))
3806
+ unusable_free += z->nr_reserved_highatomic;
3807
+
3808
+#ifdef CONFIG_CMA
3809
+ /* If allocation can't use CMA areas don't use free CMA pages */
3810
+ if (!(alloc_flags & ALLOC_CMA))
3811
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3812
+#endif
3813
+
3814
+ return unusable_free;
3815
+}
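
__zone_watermark_unusable_free() above folds everything that is free but unusable for the request into one number: a fixed (1 << order) - 1 deduction for the request being checked, the high-atomic reserve when the caller may not dip into it, and free CMA pages when the allocation cannot use CMA. A runnable worked example of the resulting order-0 comparison, ignoring the ALLOC_HIGH/OOM reductions to min; every figure is illustrative:

#include <stdio.h>

int main(void)
{
	long free_pages = 5000;		/* NR_FREE_PAGES, illustrative */
	long highatomic_reserve = 512;	/* zone->nr_reserved_highatomic */
	long free_cma = 1024;		/* NR_FREE_CMA_PAGES */
	long lowmem_reserve = 256;	/* z->lowmem_reserve[highest_zoneidx] */
	long mark = 2048;		/* the watermark being checked */
	unsigned int order = 3;
	int alloc_harder = 0, alloc_cma = 0;

	long unusable = (1L << order) - 1;

	if (!alloc_harder)
		unusable += highatomic_reserve;
	if (!alloc_cma)
		unusable += free_cma;

	long usable = free_pages - unusable;

	printf("usable=%ld, needs > %ld -> %s\n",
	       usable, mark + lowmem_reserve,
	       usable > mark + lowmem_reserve ? "order-0 check passes"
					      : "order-0 check fails");
	return 0;
}
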
32773816
32783817 /*
32793818 * Return true if free base pages are above 'mark'. For high-order checks it
....@@ -3282,7 +3821,7 @@
32823821 * to check in the allocation paths if no pages are free.
32833822 */
32843823 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3285
- int classzone_idx, unsigned int alloc_flags,
3824
+ int highest_zoneidx, unsigned int alloc_flags,
32863825 long free_pages)
32873826 {
32883827 long min = mark;
....@@ -3290,19 +3829,12 @@
32903829 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
32913830
32923831 /* free_pages may go negative - that's OK */
3293
- free_pages -= (1 << order) - 1;
3832
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
32943833
32953834 if (alloc_flags & ALLOC_HIGH)
32963835 min -= min / 2;
32973836
3298
- /*
3299
- * If the caller does not have rights to ALLOC_HARDER then subtract
3300
- * the high-atomic reserves. This will over-estimate the size of the
3301
- * atomic reserve but it avoids a search.
3302
- */
3303
- if (likely(!alloc_harder)) {
3304
- free_pages -= z->nr_reserved_highatomic;
3305
- } else {
3837
+ if (unlikely(alloc_harder)) {
33063838 /*
33073839 * OOM victims can try even harder than normal ALLOC_HARDER
33083840 * users on the grounds that it's definitely going to be in
....@@ -3315,19 +3847,12 @@
33153847 min -= min / 4;
33163848 }
33173849
3318
-
3319
-#ifdef CONFIG_CMA
3320
- /* If allocation can't use CMA areas don't use free CMA pages */
3321
- if (!(alloc_flags & ALLOC_CMA))
3322
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
3323
-#endif
3324
-
33253850 /*
33263851 * Check watermarks for an order-0 allocation request. If these
33273852 * are not met, then a high-order request also cannot go ahead
33283853 * even if a suitable page happened to be free.
33293854 */
3330
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3855
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
33313856 return false;
33323857
33333858 /* If this is an order-0 request then the watermark is fine */
....@@ -3351,65 +3876,83 @@
33513876 if (mt == MIGRATE_CMA)
33523877 continue;
33533878 #endif
3354
- if (!list_empty(&area->free_list[mt]))
3879
+ if (!free_area_empty(area, mt))
33553880 return true;
33563881 }
33573882
33583883 #ifdef CONFIG_CMA
33593884 if ((alloc_flags & ALLOC_CMA) &&
3360
- !list_empty(&area->free_list[MIGRATE_CMA])) {
3885
+ !free_area_empty(area, MIGRATE_CMA)) {
33613886 return true;
33623887 }
33633888 #endif
3364
- if (alloc_harder &&
3365
- !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3889
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
33663890 return true;
33673891 }
33683892 return false;
33693893 }
33703894
33713895 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3372
- int classzone_idx, unsigned int alloc_flags)
3896
+ int highest_zoneidx, unsigned int alloc_flags)
33733897 {
3374
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3898
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
33753899 zone_page_state(z, NR_FREE_PAGES));
33763900 }
3901
+EXPORT_SYMBOL_GPL(zone_watermark_ok);
33773902
33783903 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3379
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3904
+ unsigned long mark, int highest_zoneidx,
3905
+ unsigned int alloc_flags, gfp_t gfp_mask)
33803906 {
3381
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
3382
- long cma_pages = 0;
3907
+ long free_pages;
33833908
3384
-#ifdef CONFIG_CMA
3385
- /* If allocation can't use CMA areas don't use free CMA pages */
3386
- if (!(alloc_flags & ALLOC_CMA))
3387
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3388
-#endif
3909
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
33893910
33903911 /*
33913912 * Fast check for order-0 only. If this fails then the reserves
3392
- * need to be calculated. There is a corner case where the check
3393
- * passes but only the high-order atomic reserve are free. If
3394
- * the caller is !atomic then it'll uselessly search the free
3395
- * list. That corner case is then slower but it is harmless.
3913
+ * need to be calculated.
33963914 */
3397
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3398
- return true;
3915
+ if (!order) {
3916
+ long usable_free;
3917
+ long reserved;
33993918
3400
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3401
- free_pages);
3919
+ usable_free = free_pages;
3920
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3921
+
3922
+ /* reserved may over estimate high-atomic reserves. */
3923
+ usable_free -= min(usable_free, reserved);
3924
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3925
+ return true;
3926
+ }
3927
+
3928
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3929
+ free_pages))
3930
+ return true;
3931
+ /*
3932
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
3933
+ * when checking the min watermark. The min watermark is the
3934
+ * point where boosting is ignored so that kswapd is woken up
3935
+ * when below the low watermark.
3936
+ */
3937
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
3938
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3939
+ mark = z->_watermark[WMARK_MIN];
3940
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3941
+ alloc_flags, free_pages);
3942
+ }
3943
+
3944
+ return false;
34023945 }
34033946
34043947 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3405
- unsigned long mark, int classzone_idx)
3948
+ unsigned long mark, int highest_zoneidx)
34063949 {
34073950 long free_pages = zone_page_state(z, NR_FREE_PAGES);
34083951
34093952 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
34103953 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
34113954
3412
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3955
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
34133956 free_pages);
34143957 }
34153958 EXPORT_SYMBOL_GPL(zone_watermark_ok_safe);
....@@ -3418,7 +3961,7 @@
34183961 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
34193962 {
34203963 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3421
- RECLAIM_DISTANCE;
3964
+ node_reclaim_distance;
34223965 }
34233966 #else /* CONFIG_NUMA */
34243967 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
....@@ -3428,6 +3971,61 @@
34283971 #endif /* CONFIG_NUMA */
34293972
34303973 /*
3974
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3975
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
3976
+ * premature use of a lower zone may cause lowmem pressure problems that
3977
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3978
+ * probably too small. It only makes sense to spread allocations to avoid
3979
+ * fragmentation between the Normal and DMA32 zones.
3980
+ */
3981
+static inline unsigned int
3982
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3983
+{
3984
+ unsigned int alloc_flags;
3985
+
3986
+ /*
3987
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3988
+ * to save a branch.
3989
+ */
3990
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3991
+
3992
+#ifdef CONFIG_ZONE_DMA32
3993
+ if (!zone)
3994
+ return alloc_flags;
3995
+
3996
+ if (zone_idx(zone) != ZONE_NORMAL)
3997
+ return alloc_flags;
3998
+
3999
+ /*
4000
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
4001
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
4002
+ * on UMA that if Normal is populated then so is DMA32.
4003
+ */
4004
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
4005
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
4006
+ return alloc_flags;
4007
+
4008
+ alloc_flags |= ALLOC_NOFRAGMENT;
4009
+#endif /* CONFIG_ZONE_DMA32 */
4010
+ return alloc_flags;
4011
+}
4012
+
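
alloc_flags_nofragment() above leans on zone layout: node_zones[] is indexed by zone_type and ZONE_DMA32 sits directly before ZONE_NORMAL, which is exactly what the BUILD_BUG_ON asserts and what makes --zone from the Normal zone land on DMA32. A tiny runnable model of that pointer arithmetic; the enum and the populated flags are simplified stand-ins:

#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, NR_ZONES };

struct zone_model {
	enum zone_type idx;
	bool populated;
};

int main(void)
{
	/* stands in for pgdat->node_zones[]: indexed by zone_type */
	struct zone_model node_zones[NR_ZONES] = {
		[ZONE_DMA]     = { ZONE_DMA,     true },
		[ZONE_DMA32]   = { ZONE_DMA32,   true },
		[ZONE_NORMAL]  = { ZONE_NORMAL,  true },
		[ZONE_MOVABLE] = { ZONE_MOVABLE, false },
	};
	struct zone_model *zone = &node_zones[ZONE_NORMAL];

	/* the property the BUILD_BUG_ON() in the hunk above depends on */
	_Static_assert(ZONE_NORMAL - ZONE_DMA32 == 1,
		       "DMA32 must directly precede NORMAL");

	--zone;		/* from Normal, one element back is DMA32 */
	printf("zone below Normal: idx=%d populated=%d\n",
	       zone->idx, zone->populated);
	return 0;
}
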
4013
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
4014
+ unsigned int alloc_flags)
4015
+{
4016
+#ifdef CONFIG_CMA
4017
+ unsigned int pflags = current->flags;
4018
+
4019
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
4020
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
4021
+ gfp_mask & __GFP_CMA)
4022
+ alloc_flags |= ALLOC_CMA;
4023
+
4024
+#endif
4025
+ return alloc_flags;
4026
+}
4027
+
4028
+/*
34314029 * get_page_from_freelist goes through the zonelist trying to allocate
34324030 * a page.
34334031 */
....@@ -3435,16 +4033,20 @@
34354033 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
34364034 const struct alloc_context *ac)
34374035 {
3438
- struct zoneref *z = ac->preferred_zoneref;
4036
+ struct zoneref *z;
34394037 struct zone *zone;
34404038 struct pglist_data *last_pgdat_dirty_limit = NULL;
4039
+ bool no_fallback;
34414040
4041
+retry:
34424042 /*
34434043 * Scan zonelist, looking for a zone with enough free.
34444044 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
34454045 */
3446
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3447
- ac->nodemask) {
4046
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
4047
+ z = ac->preferred_zoneref;
4048
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4049
+ ac->nodemask) {
34484050 struct page *page;
34494051 unsigned long mark;
34504052
....@@ -3481,9 +4083,26 @@
34814083 }
34824084 }
34834085
3484
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
4086
+ if (no_fallback && nr_online_nodes > 1 &&
4087
+ zone != ac->preferred_zoneref->zone) {
4088
+ int local_nid;
4089
+
4090
+ /*
4091
+ * If moving to a remote node, retry but allow
4092
+ * fragmenting fallbacks. Locality is more important
4093
+ * than fragmentation avoidance.
4094
+ */
4095
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4096
+ if (zone_to_nid(zone) != local_nid) {
4097
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4098
+ goto retry;
4099
+ }
4100
+ }
4101
+
4102
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
34854103 if (!zone_watermark_fast(zone, order, mark,
3486
- ac_classzone_idx(ac), alloc_flags)) {
4104
+ ac->highest_zoneidx, alloc_flags,
4105
+ gfp_mask)) {
34874106 int ret;
34884107
34894108 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
....@@ -3516,7 +4135,7 @@
35164135 default:
35174136 /* did we reclaim enough */
35184137 if (zone_watermark_ok(zone, order, mark,
3519
- ac_classzone_idx(ac), alloc_flags))
4138
+ ac->highest_zoneidx, alloc_flags))
35204139 goto try_this_zone;
35214140
35224141 continue;
....@@ -3548,30 +4167,21 @@
35484167 }
35494168 }
35504169
4170
+ /*
4171
+ * It's possible on a UMA machine to get through all zones that are
4172
+ * fragmented. If avoiding fragmentation, reset and try again.
4173
+ */
4174
+ if (no_fallback) {
4175
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
4176
+ goto retry;
4177
+ }
4178
+
35514179 return NULL;
3552
-}
3553
-
3554
-/*
3555
- * Large machines with many possible nodes should not always dump per-node
3556
- * meminfo in irq context.
3557
- */
3558
-static inline bool should_suppress_show_mem(void)
3559
-{
3560
- bool ret = false;
3561
-
3562
-#if NODES_SHIFT > 8
3563
- ret = in_interrupt();
3564
-#endif
3565
- return ret;
35664180 }
35674181
35684182 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
35694183 {
35704184 unsigned int filter = SHOW_MEM_FILTER_NODES;
3571
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3572
-
3573
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3574
- return;
35754185
35764186 /*
35774187 * This documents exceptions given to allocations in certain
....@@ -3592,22 +4202,23 @@
35924202 {
35934203 struct va_format vaf;
35944204 va_list args;
3595
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3596
- DEFAULT_RATELIMIT_BURST);
4205
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
35974206
3598
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
4207
+ if ((gfp_mask & __GFP_NOWARN) ||
4208
+ !__ratelimit(&nopage_rs) ||
4209
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
35994210 return;
36004211
36014212 va_start(args, fmt);
36024213 vaf.fmt = fmt;
36034214 vaf.va = &args;
3604
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
4215
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
36054216 current->comm, &vaf, gfp_mask, &gfp_mask,
36064217 nodemask_pr_args(nodemask));
36074218 va_end(args);
36084219
36094220 cpuset_print_current_mems_allowed();
3610
-
4221
+ pr_cont("\n");
36114222 dump_stack();
36124223 warn_alloc_show_mem(gfp_mask, nodemask);
36134224 }
....@@ -3681,11 +4292,13 @@
36814292 * success so it is time to admit defeat. We will skip the OOM killer
36824293 * because it is very likely that the caller has a more reasonable
36834294 * fallback than shooting a random task.
4295
+ *
4296
+ * The OOM killer may not free memory on a specific node.
36844297 */
3685
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
4298
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
36864299 goto out;
36874300 /* The OOM killer does not needlessly kill tasks for lowmem */
3688
- if (ac->high_zoneidx < ZONE_NORMAL)
4301
+ if (ac->highest_zoneidx < ZONE_NORMAL)
36894302 goto out;
36904303 if (pm_suspended_storage())
36914304 goto out;
....@@ -3698,10 +4311,6 @@
36984311 * out_of_memory). Once filesystems are ready to handle allocation
36994312 * failures more gracefully we should just bail out here.
37004313 */
3701
-
3702
- /* The OOM killer may not free memory on a specific node */
3703
- if (gfp_mask & __GFP_THISNODE)
3704
- goto out;
37054314
37064315 /* Exhausted what can be done so it's blame time */
37074316 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
....@@ -3733,7 +4342,7 @@
37334342 unsigned int alloc_flags, const struct alloc_context *ac,
37344343 enum compact_priority prio, enum compact_result *compact_result)
37354344 {
3736
- struct page *page;
4345
+ struct page *page = NULL;
37374346 unsigned long pflags;
37384347 unsigned int noreclaim_flag;
37394348
....@@ -3744,13 +4353,10 @@
37444353 noreclaim_flag = memalloc_noreclaim_save();
37454354
37464355 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3747
- prio);
4356
+ prio, &page);
37484357
37494358 memalloc_noreclaim_restore(noreclaim_flag);
37504359 psi_memstall_leave(&pflags);
3751
-
3752
- if (*compact_result <= COMPACT_INACTIVE)
3753
- return NULL;
37544360
37554361 /*
37564362 * At least in one zone compaction wasn't deferred or skipped, so let's
....@@ -3758,7 +4364,13 @@
37584364 */
37594365 count_vm_event(COMPACTSTALL);
37604366
3761
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4367
+ /* Prep a captured page if available */
4368
+ if (page)
4369
+ prep_new_page(page, order, gfp_mask, alloc_flags);
4370
+
4371
+ /* Try get a page from the freelist if available */
4372
+ if (!page)
4373
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
37624374
37634375 if (page) {
37644376 struct zone *zone = page_zone(page);
....@@ -3807,14 +4419,22 @@
38074419 goto check_priority;
38084420
38094421 /*
3810
- * make sure the compaction wasn't deferred or didn't bail out early
3811
- * due to locks contention before we declare that we should give up.
3812
- * But do not retry if the given zonelist is not suitable for
3813
- * compaction.
4422
+ * compaction was skipped because there are not enough order-0 pages
4423
+ * to work with, so we retry only if it looks like reclaim can help.
38144424 */
3815
- if (compaction_withdrawn(compact_result)) {
4425
+ if (compaction_needs_reclaim(compact_result)) {
38164426 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
38174427 goto out;
4428
+ }
4429
+
4430
+ /*
4431
+ * make sure the compaction wasn't deferred or didn't bail out early
4432
+ * due to locks contention before we declare that we should give up.
4433
+ * But the next retry should use a higher priority if allowed, so
4434
+ * we don't just keep bailing out endlessly.
4435
+ */
4436
+ if (compaction_withdrawn(compact_result)) {
4437
+ goto check_priority;
38184438 }
38194439
38204440 /*
....@@ -3877,10 +4497,10 @@
38774497 * Let's give them a good hope and keep retrying while the order-0
38784498 * watermarks are OK.
38794499 */
3880
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3881
- ac->nodemask) {
4500
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4501
+ ac->highest_zoneidx, ac->nodemask) {
38824502 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3883
- ac_classzone_idx(ac), alloc_flags))
4503
+ ac->highest_zoneidx, alloc_flags))
38844504 return true;
38854505 }
38864506 return false;
....@@ -3938,33 +4558,50 @@
39384558 EXPORT_SYMBOL_GPL(fs_reclaim_release);
39394559 #endif
39404560
4561
+/*
4562
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
4563
+ * have been rebuilt so allocation retries. Reader side does not lock and
4564
+ * retries the allocation if zonelist changes. Writer side is protected by the
4565
+ * embedded spin_lock.
4566
+ */
4567
+static DEFINE_SEQLOCK(zonelist_update_seq);
4568
+
4569
+static unsigned int zonelist_iter_begin(void)
4570
+{
4571
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4572
+ return read_seqbegin(&zonelist_update_seq);
4573
+
4574
+ return 0;
4575
+}
4576
+
4577
+static unsigned int check_retry_zonelist(unsigned int seq)
4578
+{
4579
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4580
+ return read_seqretry(&zonelist_update_seq, seq);
4581
+
4582
+ return seq;
4583
+}
4584
+
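/*
 * Userspace sketch of the seqlock pattern behind zonelist_update_seq above:
 * the writer bumps a sequence counter around the update, readers snapshot it
 * before walking the zonelists and retry if it moved (or was odd, meaning a
 * write was in flight). The kernel's seqlock.h adds memory barriers and
 * lockdep support; this model only shows the retry shape, and all names here
 * are illustrative.
 */
#include <stdio.h>

static unsigned int zonelist_seq;       /* even = stable, odd = update in progress */
static int zonelists[4] = { 0, 1, 2, 3 };

static unsigned int sketch_read_begin(void)
{
        return zonelist_seq;
}

static int sketch_read_retry(unsigned int snap)
{
        /* Retry if a writer was active or finished an update meanwhile. */
        return (snap & 1) || zonelist_seq != snap;
}

static void rebuild_zonelists(void)
{
        zonelist_seq++;                 /* odd: update in progress */
        for (int i = 0; i < 4; i++)
                zonelists[i] += 10;
        zonelist_seq++;                 /* even again: update published */
}

int main(void)
{
        int sum;
        unsigned int snap;

        do {
                snap = sketch_read_begin();
                sum = 0;
                for (int i = 0; i < 4; i++)
                        sum += zonelists[i];
                if (snap == 0)          /* simulate one concurrent rebuild */
                        rebuild_zonelists();
        } while (sketch_read_retry(snap));

        printf("consistent zonelist sum: %d\n", sum);
        return 0;
}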
39414585 /* Perform direct synchronous page reclaim */
3942
-static int
4586
+static unsigned long
39434587 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
39444588 const struct alloc_context *ac)
39454589 {
3946
- struct reclaim_state reclaim_state;
3947
- int progress;
39484590 unsigned int noreclaim_flag;
3949
- unsigned long pflags;
4591
+ unsigned long progress;
39504592
39514593 cond_resched();
39524594
39534595 /* We now go into synchronous reclaim */
39544596 cpuset_memory_pressure_bump();
3955
- psi_memstall_enter(&pflags);
39564597 fs_reclaim_acquire(gfp_mask);
39574598 noreclaim_flag = memalloc_noreclaim_save();
3958
- reclaim_state.reclaimed_slab = 0;
3959
- current->reclaim_state = &reclaim_state;
39604599
39614600 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
39624601 ac->nodemask);
39634602
3964
- current->reclaim_state = NULL;
39654603 memalloc_noreclaim_restore(noreclaim_flag);
39664604 fs_reclaim_release(gfp_mask);
3967
- psi_memstall_leave(&pflags);
39684605
39694606 cond_resched();
39704607
....@@ -3978,11 +4615,14 @@
39784615 unsigned long *did_some_progress)
39794616 {
39804617 struct page *page = NULL;
4618
+ unsigned long pflags;
39814619 bool drained = false;
4620
+ bool skip_pcp_drain = false;
39824621
4622
+ psi_memstall_enter(&pflags);
39834623 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
39844624 if (unlikely(!(*did_some_progress)))
3985
- return NULL;
4625
+ goto out;
39864626
39874627 retry:
39884628 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
....@@ -3990,14 +4630,19 @@
39904630 /*
39914631 * If an allocation failed after direct reclaim, it could be because
39924632 * pages are pinned on the per-cpu lists or in high alloc reserves.
3993
- * Shrink them them and try again
4633
+ * Shrink them and try again
39944634 */
39954635 if (!page && !drained) {
39964636 unreserve_highatomic_pageblock(ac, false);
3997
- drain_all_pages(NULL);
4637
+ trace_android_vh_drain_all_pages_bypass(gfp_mask, order,
4638
+ alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain);
4639
+ if (!skip_pcp_drain)
4640
+ drain_all_pages(NULL);
39984641 drained = true;
39994642 goto retry;
40004643 }
4644
+out:
4645
+ psi_memstall_leave(&pflags);
40014646
40024647 return page;
40034648 }
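/*
 * Userspace sketch of the "reclaim, retry, drain once, retry again" flow in
 * __alloc_pages_direct_reclaim() above. The fake free pools and the
 * drain_percpu_stash() helper are illustrative only; the point is that the
 * drain of per-cpu lists happens at most once per reclaim attempt.
 */
#include <stdbool.h>
#include <stdio.h>

static int global_free_pages;   /* pages on the shared free lists */
static int percpu_stash;        /* pages parked on per-cpu lists */

static bool try_alloc(int nr)
{
        if (global_free_pages >= nr) {
                global_free_pages -= nr;
                return true;
        }
        return false;
}

static void drain_percpu_stash(void)
{
        /* Give pages pinned on per-cpu lists back to the shared pool. */
        global_free_pages += percpu_stash;
        percpu_stash = 0;
}

static bool alloc_after_reclaim(int nr, int reclaimed)
{
        bool drained = false;

        global_free_pages += reclaimed;         /* result of direct reclaim */
retry:
        if (try_alloc(nr))
                return true;
        if (!drained) {
                drain_percpu_stash();           /* pages may be hiding per-cpu */
                drained = true;
                goto retry;
        }
        return false;                           /* genuinely out of memory */
}

int main(void)
{
        global_free_pages = 1;
        percpu_stash = 4;

        /* Needs 4 pages; reclaim frees 1, the one-shot drain supplies the rest. */
        printf("allocation %s\n",
               alloc_after_reclaim(4, 1) ? "succeeded after drain" : "failed");
        return 0;
}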
....@@ -4008,12 +4653,12 @@
40084653 struct zoneref *z;
40094654 struct zone *zone;
40104655 pg_data_t *last_pgdat = NULL;
4011
- enum zone_type high_zoneidx = ac->high_zoneidx;
4656
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
40124657
4013
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
4658
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
40144659 ac->nodemask) {
40154660 if (last_pgdat != zone->zone_pgdat)
4016
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
4661
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
40174662 last_pgdat = zone->zone_pgdat;
40184663 }
40194664 }
....@@ -4023,8 +4668,13 @@
40234668 {
40244669 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
40254670
4026
- /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
4671
+ /*
4672
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
4673
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4674
+ * to save two branches.
4675
+ */
40274676 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
4677
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
40284678
40294679 /*
40304680 * The caller may dip into page reserves a bit more if the caller
....@@ -4032,7 +4682,8 @@
40324682 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
40334683 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
40344684 */
4035
- alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
4685
+ alloc_flags |= (__force int)
4686
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
40364687
40374688 if (gfp_mask & __GFP_ATOMIC) {
40384689 /*
....@@ -4049,10 +4700,8 @@
40494700 } else if (unlikely(rt_task(current)) && !in_interrupt())
40504701 alloc_flags |= ALLOC_HARDER;
40514702
4052
-#ifdef CONFIG_CMA
4053
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4054
- alloc_flags |= ALLOC_CMA;
4055
-#endif
4703
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
4704
+
40564705 return alloc_flags;
40574706 }
40584707
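/*
 * Sketch of the branch-saving trick described in the gfp_to_alloc_flags()
 * hunk above: when a gfp bit is defined with the same value as the matching
 * ALLOC_* bit, the translation is a single mask-and-OR, and a compile-time
 * assert guards the assumption. The SK_* bit values below are made up purely
 * for illustration.
 */
#include <assert.h>
#include <stdio.h>

#define SK_GFP_HIGH             0x20u
#define SK_GFP_KSWAPD_RECLAIM   0x400u

#define SK_ALLOC_WMARK_MIN      0x0u
#define SK_ALLOC_HIGH           0x20u   /* deliberately equal to SK_GFP_HIGH */
#define SK_ALLOC_KSWAPD         0x400u  /* deliberately equal to SK_GFP_KSWAPD_RECLAIM */

/* Poor man's BUILD_BUG_ON(): refuses to compile if the bit values diverge. */
static_assert(SK_GFP_HIGH == SK_ALLOC_HIGH, "gfp/alloc HIGH bits must match");
static_assert(SK_GFP_KSWAPD_RECLAIM == SK_ALLOC_KSWAPD,
              "gfp/alloc KSWAPD bits must match");

static unsigned int sk_gfp_to_alloc_flags(unsigned int gfp_mask)
{
        unsigned int alloc_flags = SK_ALLOC_WMARK_MIN;

        /* No if/else needed: the bits carry over verbatim. */
        alloc_flags |= gfp_mask & (SK_GFP_HIGH | SK_GFP_KSWAPD_RECLAIM);
        return alloc_flags;
}

int main(void)
{
        unsigned int flags = sk_gfp_to_alloc_flags(SK_GFP_HIGH);

        printf("ALLOC_HIGH set: %s, ALLOC_KSWAPD set: %s\n",
               (flags & SK_ALLOC_HIGH) ? "yes" : "no",
               (flags & SK_ALLOC_KSWAPD) ? "yes" : "no");
        return 0;
}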
....@@ -4115,6 +4764,7 @@
41154764 {
41164765 struct zone *zone;
41174766 struct zoneref *z;
4767
+ bool ret = false;
41184768
41194769 /*
41204770 * Costly allocations might have made a progress but this doesn't mean
....@@ -4141,8 +4791,8 @@
41414791 * request even if all reclaimable pages are considered then we are
41424792 * screwed and have to go OOM.
41434793 */
4144
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
4145
- ac->nodemask) {
4794
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4795
+ ac->highest_zoneidx, ac->nodemask) {
41464796 unsigned long available;
41474797 unsigned long reclaimable;
41484798 unsigned long min_wmark = min_wmark_pages(zone);
....@@ -4156,7 +4806,7 @@
41564806 * reclaimable pages?
41574807 */
41584808 wmark = __zone_watermark_ok(zone, order, min_wmark,
4159
- ac_classzone_idx(ac), alloc_flags, available);
4809
+ ac->highest_zoneidx, alloc_flags, available);
41604810 trace_reclaim_retry_zone(z, order, reclaimable,
41614811 available, min_wmark, *no_progress_loops, wmark);
41624812 if (wmark) {
....@@ -4178,25 +4828,24 @@
41784828 }
41794829 }
41804830
4181
- /*
4182
- * Memory allocation/reclaim might be called from a WQ
4183
- * context and the current implementation of the WQ
4184
- * concurrency control doesn't recognize that
4185
- * a particular WQ is congested if the worker thread is
4186
- * looping without ever sleeping. Therefore we have to
4187
- * do a short sleep here rather than calling
4188
- * cond_resched().
4189
- */
4190
- if (current->flags & PF_WQ_WORKER)
4191
- schedule_timeout_uninterruptible(1);
4192
- else
4193
- cond_resched();
4194
-
4195
- return true;
4831
+ ret = true;
4832
+ goto out;
41964833 }
41974834 }
41984835
4199
- return false;
4836
+out:
4837
+ /*
4838
+ * Memory allocation/reclaim might be called from a WQ context and the
4839
+ * current implementation of the WQ concurrency control doesn't
4840
+ * recognize that a particular WQ is congested if the worker thread is
4841
+ * looping without ever sleeping. Therefore we have to do a short sleep
4842
+ * here rather than calling cond_resched().
4843
+ */
4844
+ if (current->flags & PF_WQ_WORKER)
4845
+ schedule_timeout_uninterruptible(1);
4846
+ else
4847
+ cond_resched();
4848
+ return ret;
42004849 }
42014850
42024851 static inline bool
....@@ -4246,8 +4895,11 @@
42464895 int compaction_retries;
42474896 int no_progress_loops;
42484897 unsigned int cpuset_mems_cookie;
4898
+ unsigned int zonelist_iter_cookie;
42494899 int reserve_flags;
4900
+ unsigned long vh_record;
42504901
4902
+ trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record);
42514903 /*
42524904 * We also sanity check to catch abuse of atomic reserves being used by
42534905 * callers that are not in atomic context.
....@@ -4256,11 +4908,12 @@
42564908 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
42574909 gfp_mask &= ~__GFP_ATOMIC;
42584910
4259
-retry_cpuset:
4911
+restart:
42604912 compaction_retries = 0;
42614913 no_progress_loops = 0;
42624914 compact_priority = DEF_COMPACT_PRIORITY;
42634915 cpuset_mems_cookie = read_mems_allowed_begin();
4916
+ zonelist_iter_cookie = zonelist_iter_begin();
42644917
42654918 /*
42664919 * The fast path uses conservative alloc_flags to succeed only until
....@@ -4276,11 +4929,11 @@
42764929 * could end up iterating over non-eligible zones endlessly.
42774930 */
42784931 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4279
- ac->high_zoneidx, ac->nodemask);
4932
+ ac->highest_zoneidx, ac->nodemask);
42804933 if (!ac->preferred_zoneref->zone)
42814934 goto nopage;
42824935
4283
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4936
+ if (alloc_flags & ALLOC_KSWAPD)
42844937 wake_all_kswapds(order, gfp_mask, ac);
42854938
42864939 /*
....@@ -4313,18 +4966,28 @@
43134966
43144967 /*
43154968 * Checks for costly allocations with __GFP_NORETRY, which
4316
- * includes THP page fault allocations
4969
+ * includes some THP page fault allocations
43174970 */
43184971 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
43194972 /*
4320
- * If compaction is deferred for high-order allocations,
4321
- * it is because sync compaction recently failed. If
4322
- * this is the case and the caller requested a THP
4323
- * allocation, we do not want to heavily disrupt the
4324
- * system, so we fail the allocation instead of entering
4325
- * direct reclaim.
4973
+ * If allocating entire pageblock(s) and compaction
4974
+ * failed because all zones are below low watermarks
4975
+ * or is prohibited because it recently failed at this
4976
+ * order, fail immediately unless the allocator has
4977
+ * requested compaction and reclaim retry.
4978
+ *
4979
+ * Reclaim is
4980
+ * - potentially very expensive because zones are far
4981
+ * below their low watermarks or this is part of very
4982
+ * bursty high order allocations,
4983
+ * - not guaranteed to help because isolate_freepages()
4984
+ * may not iterate over freed pages as part of its
4985
+ * linear scan, and
4986
+ * - unlikely to make entire pageblocks free on its
4987
+ * own.
43264988 */
4327
- if (compact_result == COMPACT_DEFERRED)
4989
+ if (compact_result == COMPACT_SKIPPED ||
4990
+ compact_result == COMPACT_DEFERRED)
43284991 goto nopage;
43294992
43304993 /*
....@@ -4338,12 +5001,12 @@
43385001
43395002 retry:
43405003 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4341
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
5004
+ if (alloc_flags & ALLOC_KSWAPD)
43425005 wake_all_kswapds(order, gfp_mask, ac);
43435006
43445007 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
43455008 if (reserve_flags)
4346
- alloc_flags = reserve_flags;
5009
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
43475010
43485011 /*
43495012 * Reset the nodemask and zonelist iterators if memory policies can be
....@@ -4353,7 +5016,7 @@
43535016 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
43545017 ac->nodemask = NULL;
43555018 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4356
- ac->high_zoneidx, ac->nodemask);
5019
+ ac->highest_zoneidx, ac->nodemask);
43575020 }
43585021
43595022 /* Attempt with potentially adjusted zonelist and alloc_flags */
....@@ -4368,6 +5031,12 @@
43685031 /* Avoid recursion of direct reclaim */
43695032 if (current->flags & PF_MEMALLOC)
43705033 goto nopage;
5034
+
5035
+ trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order,
5036
+ alloc_flags, ac->migratetype, &page);
5037
+
5038
+ if (page)
5039
+ goto got_pg;
43715040
43725041 /* Try direct reclaim and then allocating */
43735042 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
....@@ -4409,9 +5078,13 @@
44095078 goto retry;
44105079
44115080
4412
- /* Deal with possible cpuset update races before we start OOM killing */
4413
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4414
- goto retry_cpuset;
5081
+ /*
5082
+ * Deal with possible cpuset update races or zonelist updates to avoid
5083
+ * an unnecessary OOM kill.
5084
+ */
5085
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5086
+ check_retry_zonelist(zonelist_iter_cookie))
5087
+ goto restart;
44155088
44165089 /* Reclaim has failed us, start killing things */
44175090 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
....@@ -4420,7 +5093,7 @@
44205093
44215094 /* Avoid allocations with no watermarks from looping endlessly */
44225095 if (tsk_is_oom_victim(current) &&
4423
- (alloc_flags == ALLOC_OOM ||
5096
+ (alloc_flags & ALLOC_OOM ||
44245097 (gfp_mask & __GFP_NOMEMALLOC)))
44255098 goto nopage;
44265099
....@@ -4431,9 +5104,13 @@
44315104 }
44325105
44335106 nopage:
4434
- /* Deal with possible cpuset update races before we fail */
4435
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
4436
- goto retry_cpuset;
5107
+ /*
5108
+ * Deal with possible cpuset update races or zonelist updates to avoid
5109
+ * an unnecessary OOM kill.
5110
+ */
5111
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5112
+ check_retry_zonelist(zonelist_iter_cookie))
5113
+ goto restart;
44375114
44385115 /*
44395116 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
....@@ -4476,9 +5153,15 @@
44765153 goto retry;
44775154 }
44785155 fail:
5156
+ trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order,
5157
+ alloc_flags, ac->migratetype, &page);
5158
+ if (page)
5159
+ goto got_pg;
5160
+
44795161 warn_alloc(gfp_mask, ac->nodemask,
44805162 "page allocation failure: order:%u", order);
44815163 got_pg:
5164
+ trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record);
44825165 return page;
44835166 }
44845167
....@@ -4487,14 +5170,18 @@
44875170 struct alloc_context *ac, gfp_t *alloc_mask,
44885171 unsigned int *alloc_flags)
44895172 {
4490
- ac->high_zoneidx = gfp_zone(gfp_mask);
5173
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
44915174 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
44925175 ac->nodemask = nodemask;
4493
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
5176
+ ac->migratetype = gfp_migratetype(gfp_mask);
44945177
44955178 if (cpusets_enabled()) {
44965179 *alloc_mask |= __GFP_HARDWALL;
4497
- if (!ac->nodemask)
5180
+ /*
5181
+ * When we are in the interrupt context, it is irrelevant
5182
+ * to the current task context. It means that any node ok.
5183
+ */
5184
+ if (!in_interrupt() && !ac->nodemask)
44985185 ac->nodemask = &cpuset_current_mems_allowed;
44995186 else
45005187 *alloc_flags |= ALLOC_CPUSET;
....@@ -4508,15 +5195,8 @@
45085195 if (should_fail_alloc_page(gfp_mask, order))
45095196 return false;
45105197
4511
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4512
- *alloc_flags |= ALLOC_CMA;
5198
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
45135199
4514
- return true;
4515
-}
4516
-
4517
-/* Determine whether to spread dirty pages and what the first usable zone */
4518
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
4519
-{
45205200 /* Dirty zone balancing only done in the fast path */
45215201 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
45225202
....@@ -4526,7 +5206,9 @@
45265206 * may get reset for allocations that ignore memory policies.
45275207 */
45285208 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4529
- ac->high_zoneidx, ac->nodemask);
5209
+ ac->highest_zoneidx, ac->nodemask);
5210
+
5211
+ return true;
45305212 }
45315213
45325214 /*
....@@ -4555,7 +5237,11 @@
45555237 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
45565238 return NULL;
45575239
4558
- finalise_ac(gfp_mask, &ac);
5240
+ /*
5241
+ * Forbid the first pass from falling back to types that fragment
5242
+ * memory until all local zones are considered.
5243
+ */
5244
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
45595245
45605246 /* First allocation attempt */
45615247 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
....@@ -4575,14 +5261,13 @@
45755261 * Restore the original nodemask if it was potentially replaced with
45765262 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
45775263 */
4578
- if (unlikely(ac.nodemask != nodemask))
4579
- ac.nodemask = nodemask;
5264
+ ac.nodemask = nodemask;
45805265
45815266 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
45825267
45835268 out:
45845269 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4585
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
5270
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
45865271 __free_pages(page, order);
45875272 page = NULL;
45885273 }
....@@ -4620,13 +5305,17 @@
46205305 if (order == 0) /* Via pcp? */
46215306 free_unref_page(page);
46225307 else
4623
- __free_pages_ok(page, order);
5308
+ __free_pages_ok(page, order, FPI_NONE);
46245309 }
46255310
46265311 void __free_pages(struct page *page, unsigned int order)
46275312 {
5313
+ trace_android_vh_free_pages(page, order);
46285314 if (put_page_testzero(page))
46295315 free_the_page(page, order);
5316
+ else if (!PageHead(page))
5317
+ while (order-- > 0)
5318
+ free_the_page(page + (1 << order), order);
46305319 }
46315320 EXPORT_SYMBOL(__free_pages);
46325321
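/*
 * Worked sketch of the tail-freeing loop added to __free_pages() above for
 * non-compound, higher-order pages whose head still holds a reference:
 *     while (order-- > 0)
 *             free_the_page(page + (1 << order), order);
 * For an order-3 block this frees the sub-blocks at offsets 4 (order 2),
 * 2 (order 1) and 1 (order 0), i.e. every page except the still-referenced
 * head page 0. The program below just enumerates those offsets.
 */
#include <stdio.h>

static void sketch_free_block(unsigned int offset, unsigned int order)
{
        printf("free sub-block: pages [%u, %u) as order-%u\n",
               offset, offset + (1u << order), order);
}

int main(void)
{
        unsigned int order = 3;         /* an 8-page block whose head is still pinned */

        while (order-- > 0)
                sketch_free_block(1u << order, order);
        /*
         * The output covers pages 1..7; page 0 stays allocated until the last
         * reference to it is dropped.
         */
        return 0;
}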
....@@ -4731,6 +5420,18 @@
47315420 /* reset page count bias and offset to start of new frag */
47325421 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
47335422 offset = size - fragsz;
5423
+ if (unlikely(offset < 0)) {
5424
+ /*
5425
+ * The caller is trying to allocate a fragment
5426
+ * with fragsz > PAGE_SIZE but the cache isn't big
5427
+ * enough to satisfy the request; this may
5428
+ * happen in low memory conditions.
5429
+ * We don't release the cache page because
5430
+ * it could make memory pressure worse
5431
+ * so we simply return NULL here.
5432
+ */
5433
+ return NULL;
5434
+ }
47345435 }
47355436
47365437 nc->pagecnt_bias--;
....@@ -4771,7 +5472,7 @@
47715472 /**
47725473 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
47735474 * @size: the number of bytes to allocate
4774
- * @gfp_mask: GFP flags for the allocation
5475
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
47755476 *
47765477 * This function is similar to alloc_pages(), except that it allocates the
47775478 * minimum number of pages to satisfy the request. alloc_pages() can only
....@@ -4780,11 +5481,16 @@
47805481 * This function is also limited by MAX_ORDER.
47815482 *
47825483 * Memory allocated by this function must be released by free_pages_exact().
5484
+ *
5485
+ * Return: pointer to the allocated area or %NULL in case of error.
47835486 */
47845487 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
47855488 {
47865489 unsigned int order = get_order(size);
47875490 unsigned long addr;
5491
+
5492
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5493
+ gfp_mask &= ~__GFP_COMP;
47885494
47895495 addr = __get_free_pages(gfp_mask, order);
47905496 return make_alloc_exact(addr, order, size);
....@@ -4796,15 +5502,22 @@
47965502 * pages on a node.
47975503 * @nid: the preferred node ID where memory should be allocated
47985504 * @size: the number of bytes to allocate
4799
- * @gfp_mask: GFP flags for the allocation
5505
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
48005506 *
48015507 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
48025508 * back.
5509
+ *
5510
+ * Return: pointer to the allocated area or %NULL in case of error.
48035511 */
48045512 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
48055513 {
48065514 unsigned int order = get_order(size);
4807
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
5515
+ struct page *p;
5516
+
5517
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
5518
+ gfp_mask &= ~__GFP_COMP;
5519
+
5520
+ p = alloc_pages_node(nid, gfp_mask, order);
48085521 if (!p)
48095522 return NULL;
48105523 return make_alloc_exact((unsigned long)page_address(p), order, size);
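/*
 * Sketch of the size -> order -> trim arithmetic behind alloc_pages_exact()
 * above: the request is rounded up to a power-of-two block (get_order()),
 * and the pages past the requested size are handed back individually. The
 * helpers here are simplified stand-ins, and a 4 KiB page size is assumed.
 */
#include <stdio.h>

#define SK_PAGE_SHIFT 12u
#define SK_PAGE_SIZE  (1ul << SK_PAGE_SHIFT)

/* Smallest order such that (1 << order) pages cover 'size' bytes. */
static unsigned int sk_get_order(unsigned long size)
{
        unsigned int order = 0;
        unsigned long pages = (size + SK_PAGE_SIZE - 1) >> SK_PAGE_SHIFT;

        while ((1ul << order) < pages)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 5 * SK_PAGE_SIZE;  /* 20 KiB requested */
        unsigned int order = sk_get_order(size);
        unsigned long block_pages = 1ul << order;
        unsigned long used_pages = (size + SK_PAGE_SIZE - 1) / SK_PAGE_SIZE;

        printf("order %u block = %lu pages, %lu used, %lu freed back\n",
               order, block_pages, used_pages, block_pages - used_pages);
        /*
         * This trimming is also why the hunk above rejects __GFP_COMP:
         * a compound page cannot have its tail pages freed individually.
         */
        return 0;
}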
....@@ -4833,11 +5546,13 @@
48335546 * nr_free_zone_pages - count number of pages beyond high watermark
48345547 * @offset: The zone index of the highest zone
48355548 *
4836
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
5549
+ * nr_free_zone_pages() counts the number of pages which are beyond the
48375550 * high watermark within all zones at or below a given zone index. For each
48385551 * zone, the number of pages is calculated as:
48395552 *
48405553 * nr_free_zone_pages = managed_pages - high_pages
5554
+ *
5555
+ * Return: number of pages beyond high watermark.
48415556 */
48425557 static unsigned long nr_free_zone_pages(int offset)
48435558 {
....@@ -4850,7 +5565,7 @@
48505565 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
48515566
48525567 for_each_zone_zonelist(zone, z, zonelist, offset) {
4853
- unsigned long size = zone->managed_pages;
5568
+ unsigned long size = zone_managed_pages(zone);
48545569 unsigned long high = high_wmark_pages(zone);
48555570 if (size > high)
48565571 sum += size - high;
....@@ -4864,23 +5579,15 @@
48645579 *
48655580 * nr_free_buffer_pages() counts the number of pages which are beyond the high
48665581 * watermark within ZONE_DMA and ZONE_NORMAL.
5582
+ *
5583
+ * Return: number of pages beyond high watermark within ZONE_DMA and
5584
+ * ZONE_NORMAL.
48675585 */
48685586 unsigned long nr_free_buffer_pages(void)
48695587 {
48705588 return nr_free_zone_pages(gfp_zone(GFP_USER));
48715589 }
48725590 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4873
-
4874
-/**
4875
- * nr_free_pagecache_pages - count number of pages beyond high watermark
4876
- *
4877
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
4878
- * high watermark within all zones.
4879
- */
4880
-unsigned long nr_free_pagecache_pages(void)
4881
-{
4882
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4883
-}
48845591
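/*
 * Sketch of the accounting in nr_free_zone_pages() above: for every zone at
 * or below the given index, count managed_pages - high_wmark (clamped at
 * zero) and sum the results. The zone sizes below are invented for
 * illustration only.
 */
#include <stdio.h>

struct sk_zone {
        const char *name;
        unsigned long managed_pages;
        unsigned long high_wmark;
};

static unsigned long sk_nr_free_zone_pages(const struct sk_zone *zones,
                                           int nr_zones, int offset)
{
        unsigned long sum = 0;

        for (int i = 0; i <= offset && i < nr_zones; i++) {
                /* Only the part above the high watermark counts as free room. */
                if (zones[i].managed_pages > zones[i].high_wmark)
                        sum += zones[i].managed_pages - zones[i].high_wmark;
        }
        return sum;
}

int main(void)
{
        struct sk_zone zones[] = {
                { "DMA",     4000,    500 },
                { "Normal",  200000,  4000 },
                { "Movable", 50000,   1000 },
        };

        /* Stop at index 1 ("Normal"), the way nr_free_buffer_pages() stops at
         * gfp_zone(GFP_USER): 3500 + 196000 = 199500 pages. */
        printf("pages beyond high watermark: %lu\n",
               sk_nr_free_zone_pages(zones, 3, 1));
        return 0;
}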
48855592 static inline void show_node(struct zone *zone)
48865593 {
....@@ -4902,7 +5609,7 @@
49025609 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
49035610
49045611 for_each_zone(zone)
4905
- wmark_low += zone->watermark[WMARK_LOW];
5612
+ wmark_low += low_wmark_pages(zone);
49065613
49075614 /*
49085615 * Estimate the amount of memory available for userspace allocations,
....@@ -4924,8 +5631,8 @@
49245631 * items that are in use, and cannot be freed. Cap this estimate at the
49255632 * low watermark.
49265633 */
4927
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
4928
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5634
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5635
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
49295636 available += reclaimable - min(reclaimable / 2, wmark_low);
49305637
49315638 if (available < 0)
....@@ -4936,11 +5643,11 @@
49365643
49375644 void si_meminfo(struct sysinfo *val)
49385645 {
4939
- val->totalram = totalram_pages;
5646
+ val->totalram = totalram_pages();
49405647 val->sharedram = global_node_page_state(NR_SHMEM);
49415648 val->freeram = global_zone_page_state(NR_FREE_PAGES);
49425649 val->bufferram = nr_blockdev_pages();
4943
- val->totalhigh = totalhigh_pages;
5650
+ val->totalhigh = totalhigh_pages();
49445651 val->freehigh = nr_free_highpages();
49455652 val->mem_unit = PAGE_SIZE;
49465653 }
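/*
 * Sketch of the heuristic visible in the si_mem_available() hunks above:
 * start from the free pages, assume roughly half of the page cache and half
 * of the reclaimable kernel memory (each capped by the summed low watermarks)
 * can be reclaimed, and clamp the result at zero. The numbers below are
 * invented, and the real function also subtracts the total reserved pages.
 */
#include <stdio.h>

static long sk_min(long a, long b)
{
        return a < b ? a : b;
}

static long sk_mem_available(long free, long pagecache, long reclaimable,
                             long wmark_low)
{
        long available = free;

        /* Page cache is mostly reclaimable, but keep about wmark_low of it. */
        available += pagecache - sk_min(pagecache / 2, wmark_low);

        /* Same reasoning for reclaimable slab and misc kernel memory. */
        available += reclaimable - sk_min(reclaimable / 2, wmark_low);

        if (available < 0)
                available = 0;
        return available;
}

int main(void)
{
        /* All values in pages, chosen only to exercise the arithmetic. */
        long avail = sk_mem_available(10000, 50000, 8000, 3000);

        /* 10000 + (50000 - 3000) + (8000 - 3000) = 62000 */
        printf("estimated available pages: %ld\n", avail);
        return 0;
}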
....@@ -4957,7 +5664,7 @@
49575664 pg_data_t *pgdat = NODE_DATA(nid);
49585665
49595666 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4960
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
5667
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
49615668 val->totalram = managed_pages;
49625669 val->sharedram = node_page_state(pgdat, NR_SHMEM);
49635670 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
....@@ -4966,7 +5673,7 @@
49665673 struct zone *zone = &pgdat->node_zones[zone_type];
49675674
49685675 if (is_highmem(zone)) {
4969
- managed_highpages += zone->managed_pages;
5676
+ managed_highpages += zone_managed_pages(zone);
49705677 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
49715678 }
49725679 }
....@@ -5055,7 +5762,7 @@
50555762
50565763 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
50575764 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5058
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
5765
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
50595766 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
50605767 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
50615768 " free:%lu free_pcp:%lu free_cma:%lu\n",
....@@ -5068,9 +5775,8 @@
50685775 global_node_page_state(NR_UNEVICTABLE),
50695776 global_node_page_state(NR_FILE_DIRTY),
50705777 global_node_page_state(NR_WRITEBACK),
5071
- global_node_page_state(NR_UNSTABLE_NFS),
5072
- global_node_page_state(NR_SLAB_RECLAIMABLE),
5073
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
5778
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5779
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
50745780 global_node_page_state(NR_FILE_MAPPED),
50755781 global_node_page_state(NR_SHMEM),
50765782 global_zone_page_state(NR_PAGETABLE),
....@@ -5079,6 +5785,7 @@
50795785 free_pcp,
50805786 global_zone_page_state(NR_FREE_CMA_PAGES));
50815787
5788
+ trace_android_vh_show_mapcount_pages(NULL);
50825789 for_each_online_pgdat(pgdat) {
50835790 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
50845791 continue;
....@@ -5101,7 +5808,10 @@
51015808 " anon_thp: %lukB"
51025809 #endif
51035810 " writeback_tmp:%lukB"
5104
- " unstable:%lukB"
5811
+ " kernel_stack:%lukB"
5812
+#ifdef CONFIG_SHADOW_CALL_STACK
5813
+ " shadow_call_stack:%lukB"
5814
+#endif
51055815 " all_unreclaimable? %s"
51065816 "\n",
51075817 pgdat->node_id,
....@@ -5123,7 +5833,10 @@
51235833 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
51245834 #endif
51255835 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5126
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
5836
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
5837
+#ifdef CONFIG_SHADOW_CALL_STACK
5838
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
5839
+#endif
51275840 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
51285841 "yes" : "no");
51295842 }
....@@ -5145,6 +5858,7 @@
51455858 " min:%lukB"
51465859 " low:%lukB"
51475860 " high:%lukB"
5861
+ " reserved_highatomic:%luKB"
51485862 " active_anon:%lukB"
51495863 " inactive_anon:%lukB"
51505864 " active_file:%lukB"
....@@ -5154,10 +5868,6 @@
51545868 " present:%lukB"
51555869 " managed:%lukB"
51565870 " mlocked:%lukB"
5157
- " kernel_stack:%lukB"
5158
-#ifdef CONFIG_SHADOW_CALL_STACK
5159
- " shadow_call_stack:%lukB"
5160
-#endif
51615871 " pagetables:%lukB"
51625872 " bounce:%lukB"
51635873 " free_pcp:%lukB"
....@@ -5169,6 +5879,7 @@
51695879 K(min_wmark_pages(zone)),
51705880 K(low_wmark_pages(zone)),
51715881 K(high_wmark_pages(zone)),
5882
+ K(zone->nr_reserved_highatomic),
51725883 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
51735884 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
51745885 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
....@@ -5176,12 +5887,8 @@
51765887 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
51775888 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
51785889 K(zone->present_pages),
5179
- K(zone->managed_pages),
5890
+ K(zone_managed_pages(zone)),
51805891 K(zone_page_state(zone, NR_MLOCK)),
5181
- zone_page_state(zone, NR_KERNEL_STACK_KB),
5182
-#ifdef CONFIG_SHADOW_CALL_STACK
5183
- zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024,
5184
-#endif
51855892 K(zone_page_state(zone, NR_PAGETABLE)),
51865893 K(zone_page_state(zone, NR_BOUNCE)),
51875894 K(free_pcp),
....@@ -5213,7 +5920,7 @@
52135920
52145921 types[order] = 0;
52155922 for (type = 0; type < MIGRATE_TYPES; type++) {
5216
- if (!list_empty(&area->free_list[type]))
5923
+ if (!free_area_empty(area, type))
52175924 types[order] |= 1 << type;
52185925 }
52195926 }
....@@ -5254,7 +5961,7 @@
52545961 do {
52555962 zone_type--;
52565963 zone = pgdat->node_zones + zone_type;
5257
- if (managed_zone(zone)) {
5964
+ if (populated_zone(zone)) {
52585965 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
52595966 check_highest_zone(zone_type);
52605967 }
....@@ -5280,36 +5987,17 @@
52805987 return 0;
52815988 }
52825989
5283
-static __init int setup_numa_zonelist_order(char *s)
5284
-{
5285
- if (!s)
5286
- return 0;
5287
-
5288
- return __parse_numa_zonelist_order(s);
5289
-}
5290
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
5291
-
52925990 char numa_zonelist_order[] = "Node";
52935991
52945992 /*
52955993 * sysctl handler for numa_zonelist_order
52965994 */
52975995 int numa_zonelist_order_handler(struct ctl_table *table, int write,
5298
- void __user *buffer, size_t *length,
5299
- loff_t *ppos)
5996
+ void *buffer, size_t *length, loff_t *ppos)
53005997 {
5301
- char *str;
5302
- int ret;
5303
-
5304
- if (!write)
5305
- return proc_dostring(table, write, buffer, length, ppos);
5306
- str = memdup_user_nul(buffer, 16);
5307
- if (IS_ERR(str))
5308
- return PTR_ERR(str);
5309
-
5310
- ret = __parse_numa_zonelist_order(str);
5311
- kfree(str);
5312
- return ret;
5998
+ if (write)
5999
+ return __parse_numa_zonelist_order(buffer);
6000
+ return proc_dostring(table, write, buffer, length, ppos);
53136001 }
53146002
53156003
....@@ -5328,14 +6016,14 @@
53286016 * from each node to each node in the system), and should also prefer nodes
53296017 * with no CPUs, since presumably they'll have very little allocation pressure
53306018 * on them otherwise.
5331
- * It returns -1 if no node is found.
6019
+ *
6020
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
53326021 */
53336022 static int find_next_best_node(int node, nodemask_t *used_node_mask)
53346023 {
53356024 int n, val;
53366025 int min_val = INT_MAX;
53376026 int best_node = NUMA_NO_NODE;
5338
- const struct cpumask *tmp = cpumask_of_node(0);
53396027
53406028 /* Use the local node if we haven't already */
53416029 if (!node_isset(node, *used_node_mask)) {
....@@ -5356,8 +6044,7 @@
53566044 val += (n < node);
53576045
53586046 /* Give preference to headless and unused nodes */
5359
- tmp = cpumask_of_node(n);
5360
- if (!cpumask_empty(tmp))
6047
+ if (!cpumask_empty(cpumask_of_node(n)))
53616048 val += PENALTY_FOR_NODE_WITH_CPUS;
53626049
53636050 /* Slight preference for less loaded node */
....@@ -5428,14 +6115,13 @@
54286115 {
54296116 static int node_order[MAX_NUMNODES];
54306117 int node, load, nr_nodes = 0;
5431
- nodemask_t used_mask;
6118
+ nodemask_t used_mask = NODE_MASK_NONE;
54326119 int local_node, prev_node;
54336120
54346121 /* NUMA-aware ordering of nodes */
54356122 local_node = pgdat->node_id;
54366123 load = nr_online_nodes;
54376124 prev_node = local_node;
5438
- nodes_clear(used_mask);
54396125
54406126 memset(node_order, 0, sizeof(node_order));
54416127 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
....@@ -5542,9 +6228,8 @@
55426228 int nid;
55436229 int __maybe_unused cpu;
55446230 pg_data_t *self = data;
5545
- static DEFINE_SPINLOCK(lock);
55466231
5547
- spin_lock(&lock);
6232
+ write_seqlock(&zonelist_update_seq);
55486233
55496234 #ifdef CONFIG_NUMA
55506235 memset(node_load, 0, sizeof(node_load));
....@@ -5577,7 +6262,7 @@
55776262 #endif
55786263 }
55796264
5580
- spin_unlock(&lock);
6265
+ write_sequnlock(&zonelist_update_seq);
55816266 }
55826267
55836268 static noinline void __init
....@@ -5615,13 +6300,16 @@
56156300 */
56166301 void __ref build_all_zonelists(pg_data_t *pgdat)
56176302 {
6303
+ unsigned long vm_total_pages;
6304
+
56186305 if (system_state == SYSTEM_BOOTING) {
56196306 build_all_zonelists_init();
56206307 } else {
56216308 __build_all_zonelists(pgdat);
56226309 /* cpuset refresh routine should be here */
56236310 }
5624
- vm_total_pages = nr_free_pagecache_pages();
6311
+ /* Get the number of free pages beyond high watermark in all zones. */
6312
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
56256313 /*
56266314 * Disable grouping by mobility if the number of pages in the
56276315 * system is too low to allow the mechanism to work. It would be
....@@ -5634,7 +6322,7 @@
56346322 else
56356323 page_group_by_mobility_disabled = 0;
56366324
5637
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
6325
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
56386326 nr_online_nodes,
56396327 page_group_by_mobility_disabled ? "off" : "on",
56406328 vm_total_pages);
....@@ -5643,81 +6331,148 @@
56436331 #endif
56446332 }
56456333
6334
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6335
+static bool __meminit
6336
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6337
+{
6338
+ static struct memblock_region *r;
6339
+
6340
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6341
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6342
+ for_each_mem_region(r) {
6343
+ if (*pfn < memblock_region_memory_end_pfn(r))
6344
+ break;
6345
+ }
6346
+ }
6347
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
6348
+ memblock_is_mirror(r)) {
6349
+ *pfn = memblock_region_memory_end_pfn(r);
6350
+ return true;
6351
+ }
6352
+ }
6353
+ return false;
6354
+}
6355
+
56466356 /*
56476357 * Initially all pages are reserved - free ones are freed
5648
- * up by free_all_bootmem() once the early boot process is
6358
+ * up by memblock_free_all() once the early boot process is
56496359 * done. Non-atomic initialization, single-pass.
6360
+ *
6361
+ * All aligned pageblocks are initialized to the specified migratetype
6362
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6363
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
56506364 */
56516365 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5652
- unsigned long start_pfn, enum meminit_context context,
5653
- struct vmem_altmap *altmap)
6366
+ unsigned long start_pfn, unsigned long zone_end_pfn,
6367
+ enum meminit_context context,
6368
+ struct vmem_altmap *altmap, int migratetype)
56546369 {
5655
- unsigned long end_pfn = start_pfn + size;
5656
- pg_data_t *pgdat = NODE_DATA(nid);
5657
- unsigned long pfn;
5658
- unsigned long nr_initialised = 0;
6370
+ unsigned long pfn, end_pfn = start_pfn + size;
56596371 struct page *page;
5660
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5661
- struct memblock_region *r = NULL, *tmp;
5662
-#endif
56636372
56646373 if (highest_memmap_pfn < end_pfn - 1)
56656374 highest_memmap_pfn = end_pfn - 1;
6375
+
6376
+#ifdef CONFIG_ZONE_DEVICE
6377
+ /*
6378
+ * Honor reservation requested by the driver for this ZONE_DEVICE
6379
+ * memory. We limit the total number of pages to initialize to just
6380
+ * those that might contain the memory mapping. We will defer the
6381
+ * ZONE_DEVICE page initialization until after we have released
6382
+ * the hotplug lock.
6383
+ */
6384
+ if (zone == ZONE_DEVICE) {
6385
+ if (!altmap)
6386
+ return;
6387
+
6388
+ if (start_pfn == altmap->base_pfn)
6389
+ start_pfn += altmap->reserve;
6390
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6391
+ }
6392
+#endif
56666393
56676394 #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT
56686395 /* Zero all page struct in advance */
56696396 memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size);
56706397 #endif
56716398
5672
- /*
5673
- * Honor reservation requested by the driver for this ZONE_DEVICE
5674
- * memory
5675
- */
5676
- if (altmap && start_pfn == altmap->base_pfn)
5677
- start_pfn += altmap->reserve;
5678
-
5679
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6399
+ for (pfn = start_pfn; pfn < end_pfn; ) {
56806400 /*
56816401 * There can be holes in boot-time mem_map[]s handed to this
56826402 * function. They do not exist on hotplugged memory.
56836403 */
5684
- if (context != MEMINIT_EARLY)
5685
- goto not_early;
5686
-
5687
- if (!early_pfn_valid(pfn))
5688
- continue;
5689
- if (!early_pfn_in_nid(pfn, nid))
5690
- continue;
5691
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5692
- break;
5693
-
5694
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5695
- /*
5696
- * Check given memblock attribute by firmware which can affect
5697
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
5698
- * mirrored, it's an overlapped memmap init. skip it.
5699
- */
5700
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5701
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5702
- for_each_memblock(memory, tmp)
5703
- if (pfn < memblock_region_memory_end_pfn(tmp))
5704
- break;
5705
- r = tmp;
5706
- }
5707
- if (pfn >= memblock_region_memory_base_pfn(r) &&
5708
- memblock_is_mirror(r)) {
5709
- /* already initialized as NORMAL */
5710
- pfn = memblock_region_memory_end_pfn(r);
6404
+ if (context == MEMINIT_EARLY) {
6405
+ if (overlap_memmap_init(zone, &pfn))
57116406 continue;
5712
- }
6407
+ if (defer_init(nid, pfn, zone_end_pfn))
6408
+ break;
57136409 }
5714
-#endif
57156410
5716
-not_early:
57176411 page = pfn_to_page(pfn);
57186412 __init_single_page(page, pfn, zone, nid, false);
57196413 if (context == MEMINIT_HOTPLUG)
5720
- SetPageReserved(page);
6414
+ __SetPageReserved(page);
6415
+
6416
+ /*
6417
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6418
+ * such that unmovable allocations won't be scattered all
6419
+ * over the place during system boot.
6420
+ */
6421
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
6422
+ set_pageblock_migratetype(page, migratetype);
6423
+ cond_resched();
6424
+ }
6425
+ pfn++;
6426
+ }
6427
+}
6428
+
6429
+#ifdef CONFIG_ZONE_DEVICE
6430
+void __ref memmap_init_zone_device(struct zone *zone,
6431
+ unsigned long start_pfn,
6432
+ unsigned long nr_pages,
6433
+ struct dev_pagemap *pgmap)
6434
+{
6435
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
6436
+ struct pglist_data *pgdat = zone->zone_pgdat;
6437
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6438
+ unsigned long zone_idx = zone_idx(zone);
6439
+ unsigned long start = jiffies;
6440
+ int nid = pgdat->node_id;
6441
+
6442
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
6443
+ return;
6444
+
6445
+ /*
6446
+ * The call to memmap_init should have already taken care
6447
+ * of the pages reserved for the memmap, so we can just jump to
6448
+ * the end of that region and start processing the device pages.
6449
+ */
6450
+ if (altmap) {
6451
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6452
+ nr_pages = end_pfn - start_pfn;
6453
+ }
6454
+
6455
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
6456
+ struct page *page = pfn_to_page(pfn);
6457
+
6458
+ __init_single_page(page, pfn, zone_idx, nid, true);
6459
+
6460
+ /*
6461
+ * Mark page reserved as it will need to wait for onlining
6462
+ * phase for it to be fully associated with a zone.
6463
+ *
6464
+ * We can use the non-atomic __set_bit operation for setting
6465
+ * the flag as we are still initializing the pages.
6466
+ */
6467
+ __SetPageReserved(page);
6468
+
6469
+ /*
6470
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6471
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6472
+ * ever freed or placed on a driver-private list.
6473
+ */
6474
+ page->pgmap = pgmap;
6475
+ page->zone_device_data = NULL;
57216476
57226477 /*
57236478 * Mark the block movable so that blocks are reserved for
....@@ -5726,21 +6481,20 @@
57266481 * the address space during boot when many long-lived
57276482 * kernel allocations are made.
57286483 *
5729
- * bitmap is created for zone's valid pfn range. but memmap
5730
- * can be created for invalid pages (for alignment)
5731
- * check here not to call set_pageblock_migratetype() against
5732
- * pfn out of zone.
5733
- *
57346484 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
5735
- * because this is done early in sparse_add_one_section
6485
+ * because this is done early in section_activate()
57366486 */
5737
- if (!(pfn & (pageblock_nr_pages - 1))) {
6487
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
57386488 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
57396489 cond_resched();
57406490 }
57416491 }
6492
+
6493
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
6494
+ nr_pages, jiffies_to_msecs(jiffies - start));
57426495 }
57436496
6497
+#endif
57446498 static void __meminit zone_init_free_lists(struct zone *zone)
57456499 {
57466500 unsigned int order, t;
....@@ -5750,11 +6504,118 @@
57506504 }
57516505 }
57526506
5753
-#ifndef __HAVE_ARCH_MEMMAP_INIT
5754
-#define memmap_init(size, nid, zone, start_pfn) \
5755
- memmap_init_zone((size), (nid), (zone), (start_pfn), \
5756
- MEMINIT_EARLY, NULL)
6507
+/*
6508
+ * Only struct pages that correspond to ranges defined by memblock.memory
6509
+ * are zeroed and initialized by going through __init_single_page() during
6510
+ * memmap_init_zone_range().
6511
+ *
6512
+ * But, there could be struct pages that correspond to holes in
6513
+ * memblock.memory. This can happen because of the following reasons:
6514
+ * - physical memory bank size is not necessarily the exact multiple of the
6515
+ * arbitrary section size
6516
+ * - early reserved memory may not be listed in memblock.memory
6517
+ * - memory layouts defined with memmap= kernel parameter may not align
6518
+ * nicely with memmap sections
6519
+ *
6520
+ * Explicitly initialize those struct pages so that:
6521
+ * - PG_Reserved is set
6522
+ * - zone and node links point to zone and node that span the page if the
6523
+ * hole is in the middle of a zone
6524
+ * - zone and node links point to adjacent zone/node if the hole falls on
6525
+ * the zone boundary; the pages in such holes will be prepended to the
6526
+ * zone/node above the hole except for the trailing pages in the last
6527
+ * section that will be appended to the zone/node below.
6528
+ */
6529
+static void __init init_unavailable_range(unsigned long spfn,
6530
+ unsigned long epfn,
6531
+ int zone, int node)
6532
+{
6533
+ unsigned long pfn;
6534
+ u64 pgcnt = 0;
6535
+
6536
+ for (pfn = spfn; pfn < epfn; pfn++) {
6537
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6538
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6539
+ + pageblock_nr_pages - 1;
6540
+ continue;
6541
+ }
6542
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node, true);
6543
+ __SetPageReserved(pfn_to_page(pfn));
6544
+ pgcnt++;
6545
+ }
6546
+
6547
+ if (pgcnt)
6548
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
6549
+ node, zone_names[zone], pgcnt);
6550
+}
6551
+
6552
+static void __init memmap_init_zone_range(struct zone *zone,
6553
+ unsigned long start_pfn,
6554
+ unsigned long end_pfn,
6555
+ unsigned long *hole_pfn)
6556
+{
6557
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
6558
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
6559
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
6560
+
6561
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
6562
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
6563
+
6564
+ if (start_pfn >= end_pfn)
6565
+ return;
6566
+
6567
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
6568
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
6569
+
6570
+ if (*hole_pfn < start_pfn)
6571
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
6572
+
6573
+ *hole_pfn = end_pfn;
6574
+}
6575
+
6576
+void __init __weak memmap_init(void)
6577
+{
6578
+ unsigned long start_pfn, end_pfn;
6579
+ unsigned long hole_pfn = 0;
6580
+ int i, j, zone_id, nid;
6581
+
6582
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6583
+ struct pglist_data *node = NODE_DATA(nid);
6584
+
6585
+ for (j = 0; j < MAX_NR_ZONES; j++) {
6586
+ struct zone *zone = node->node_zones + j;
6587
+
6588
+ if (!populated_zone(zone))
6589
+ continue;
6590
+
6591
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
6592
+ &hole_pfn);
6593
+ zone_id = j;
6594
+ }
6595
+ }
6596
+
6597
+#ifdef CONFIG_SPARSEMEM
6598
+ /*
6599
+ * Initialize the memory map for hole in the range [memory_end,
6600
+ * section_end].
6601
+ * Append the pages in this hole to the highest zone in the last
6602
+ * node.
6603
+ * The call to init_unavailable_range() is outside the ifdef to
6604
+ * silence the compiler warning about zone_id set but not used;
6605
+ * for FLATMEM it is a nop anyway
6606
+ */
6607
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
6608
+ if (hole_pfn < end_pfn)
57576609 #endif
6610
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
6611
+}
6612
+
6613
+/* A stub for backwards compatibility with custom implementation on IA-64 */
6614
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
6615
+ unsigned long zone,
6616
+ unsigned long range_start_pfn)
6617
+{
6618
+}
57586619
57596620 static int zone_batchsize(struct zone *zone)
57606621 {
....@@ -5765,7 +6626,7 @@
57656626 * The per-cpu-pages pools are set to around 1000th of the
57666627 * size of the zone.
57676628 */
5768
- batch = zone->managed_pages / 1024;
6629
+ batch = zone_managed_pages(zone) / 1024;
57696630 /* But no more than a meg. */
57706631 if (batch * PAGE_SIZE > 1024 * 1024)
57716632 batch = (1024 * 1024) / PAGE_SIZE;
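/*
 * Worked sketch of the first steps of zone_batchsize() above: the per-cpu
 * batch starts at about 1/1024 of the zone's managed pages and is capped at
 * one megabyte worth of pages. (The real function goes on to round the value
 * down to a power-of-two friendly size; that part is not modelled here.)
 * A 4 KiB page size is assumed.
 */
#include <stdio.h>

#define SK_PAGE_SIZE 4096ul

static unsigned long sk_zone_batchsize(unsigned long managed_pages)
{
        unsigned long batch = managed_pages / 1024;

        if (batch * SK_PAGE_SIZE > 1024 * 1024)         /* no more than a meg */
                batch = (1024 * 1024) / SK_PAGE_SIZE;
        return batch;
}

int main(void)
{
        /* A 4 GiB zone has 1048576 managed 4 KiB pages. */
        printf("batch for 4 GiB zone: %lu pages\n", sk_zone_batchsize(1048576));
        /* 1048576/1024 = 1024 pages = 4 MiB, so the 1 MiB cap applies: 256. */
        printf("batch for 128 MiB zone: %lu pages\n", sk_zone_batchsize(32768));
        /* 32768/1024 = 32 pages = 128 KiB, under the cap, so 32 stays. */
        return 0;
}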
....@@ -5812,7 +6673,7 @@
58126673 * locking.
58136674 *
58146675 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5815
- * those fields changing asynchronously (acording the the above rule).
6676
+ * those fields changing asynchronously (according to the above rule).
58166677 *
58176678 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
58186679 * outside of boot time (or some other assurance that no concurrent updaters
....@@ -5846,7 +6707,6 @@
58466707 memset(p, 0, sizeof(*p));
58476708
58486709 pcp = &p->pcp;
5849
- pcp->count = 0;
58506710 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
58516711 INIT_LIST_HEAD(&pcp->lists[migratetype]);
58526712 }
....@@ -5876,7 +6736,7 @@
58766736 {
58776737 if (percpu_pagelist_fraction)
58786738 pageset_set_high(pcp,
5879
- (zone->managed_pages /
6739
+ (zone_managed_pages(zone) /
58806740 percpu_pagelist_fraction));
58816741 else
58826742 pageset_set_batch(pcp, zone_batchsize(zone));
....@@ -5906,9 +6766,24 @@
59066766 {
59076767 struct pglist_data *pgdat;
59086768 struct zone *zone;
6769
+ int __maybe_unused cpu;
59096770
59106771 for_each_populated_zone(zone)
59116772 setup_zone_pageset(zone);
6773
+
6774
+#ifdef CONFIG_NUMA
6775
+ /*
6776
+ * Unpopulated zones continue using the boot pagesets.
6777
+ * The numa stats for these pagesets need to be reset.
6778
+ * Otherwise, they will end up skewing the stats of
6779
+ * the nodes these zones are associated with.
6780
+ */
6781
+ for_each_possible_cpu(cpu) {
6782
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
6783
+ memset(pcp->vm_numa_stat_diff, 0,
6784
+ sizeof(pcp->vm_numa_stat_diff));
6785
+ }
6786
+#endif
59126787
59136788 for_each_online_pgdat(pgdat)
59146789 pgdat->per_cpu_nodestats =
....@@ -5952,73 +6827,6 @@
59526827 zone->initialized = 1;
59536828 }
59546829
5955
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5956
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5957
-
5958
-/*
5959
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5960
- */
5961
-int __meminit __early_pfn_to_nid(unsigned long pfn,
5962
- struct mminit_pfnnid_cache *state)
5963
-{
5964
- unsigned long start_pfn, end_pfn;
5965
- int nid;
5966
-
5967
- if (state->last_start <= pfn && pfn < state->last_end)
5968
- return state->last_nid;
5969
-
5970
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5971
- if (nid != -1) {
5972
- state->last_start = start_pfn;
5973
- state->last_end = end_pfn;
5974
- state->last_nid = nid;
5975
- }
5976
-
5977
- return nid;
5978
-}
5979
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5980
-
5981
-/**
5982
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5983
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5984
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5985
- *
5986
- * If an architecture guarantees that all ranges registered contain no holes
5987
- * and may be freed, this this function may be used instead of calling
5988
- * memblock_free_early_nid() manually.
5989
- */
5990
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5991
-{
5992
- unsigned long start_pfn, end_pfn;
5993
- int i, this_nid;
5994
-
5995
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5996
- start_pfn = min(start_pfn, max_low_pfn);
5997
- end_pfn = min(end_pfn, max_low_pfn);
5998
-
5999
- if (start_pfn < end_pfn)
6000
- memblock_free_early_nid(PFN_PHYS(start_pfn),
6001
- (end_pfn - start_pfn) << PAGE_SHIFT,
6002
- this_nid);
6003
- }
6004
-}
6005
-
6006
-/**
6007
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
6008
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
6009
- *
6010
- * If an architecture guarantees that all ranges registered contain no holes and may
6011
- * be freed, this function may be used instead of calling memory_present() manually.
6012
- */
6013
-void __init sparse_memory_present_with_active_regions(int nid)
6014
-{
6015
- unsigned long start_pfn, end_pfn;
6016
- int i, this_nid;
6017
-
6018
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
6019
- memory_present(this_nid, start_pfn, end_pfn);
6020
-}
6021
-
60226830 /**
60236831 * get_pfn_range_for_nid - Return the start and end page frames for a node
60246832 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
....@@ -6030,7 +6838,7 @@
60306838 * with no available memory, a warning is printed and the start and end
60316839 * PFNs will be 0.
60326840 */
6033
-void __meminit get_pfn_range_for_nid(unsigned int nid,
6841
+void __init get_pfn_range_for_nid(unsigned int nid,
60346842 unsigned long *start_pfn, unsigned long *end_pfn)
60356843 {
60366844 unsigned long this_start_pfn, this_end_pfn;
....@@ -6079,7 +6887,7 @@
60796887 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
60806888 * zones within a node are in order of monotonic increases memory addresses
60816889 */
6082
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
6890
+static void __init adjust_zone_range_for_zone_movable(int nid,
60836891 unsigned long zone_type,
60846892 unsigned long node_start_pfn,
60856893 unsigned long node_end_pfn,
....@@ -6110,13 +6918,12 @@
61106918 * Return the number of pages a zone spans in a node, including holes
61116919 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
61126920 */
6113
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6921
+static unsigned long __init zone_spanned_pages_in_node(int nid,
61146922 unsigned long zone_type,
61156923 unsigned long node_start_pfn,
61166924 unsigned long node_end_pfn,
61176925 unsigned long *zone_start_pfn,
6118
- unsigned long *zone_end_pfn,
6119
- unsigned long *ignored)
6926
+ unsigned long *zone_end_pfn)
61206927 {
61216928 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61226929 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6147,7 +6954,7 @@
61476954 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
61486955 * then all holes in the requested range will be accounted for.
61496956 */
6150
-unsigned long __meminit __absent_pages_in_range(int nid,
6957
+unsigned long __init __absent_pages_in_range(int nid,
61516958 unsigned long range_start_pfn,
61526959 unsigned long range_end_pfn)
61536960 {
....@@ -6168,7 +6975,7 @@
61686975 * @start_pfn: The start PFN to start searching for holes
61696976 * @end_pfn: The end PFN to stop searching for holes
61706977 *
6171
- * It returns the number of pages frames in memory holes within a range.
6978
+ * Return: the number of pages frames in memory holes within a range.
61726979 */
61736980 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
61746981 unsigned long end_pfn)
....@@ -6177,11 +6984,10 @@
61776984 }
61786985
61796986 /* Return the number of page frames in holes in a zone on a node */
6180
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
6987
+static unsigned long __init zone_absent_pages_in_node(int nid,
61816988 unsigned long zone_type,
61826989 unsigned long node_start_pfn,
6183
- unsigned long node_end_pfn,
6184
- unsigned long *ignored)
6990
+ unsigned long node_end_pfn)
61856991 {
61866992 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
61876993 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
....@@ -6209,7 +7015,7 @@
62097015 unsigned long start_pfn, end_pfn;
62107016 struct memblock_region *r;
62117017
6212
- for_each_memblock(memory, r) {
7018
+ for_each_mem_region(r) {
62137019 start_pfn = clamp(memblock_region_memory_base_pfn(r),
62147020 zone_start_pfn, zone_end_pfn);
62157021 end_pfn = clamp(memblock_region_memory_end_pfn(r),
....@@ -6228,45 +7034,9 @@
62287034 return nr_absent;
62297035 }
62307036
6231
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6232
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6233
- unsigned long zone_type,
6234
- unsigned long node_start_pfn,
6235
- unsigned long node_end_pfn,
6236
- unsigned long *zone_start_pfn,
6237
- unsigned long *zone_end_pfn,
6238
- unsigned long *zones_size)
6239
-{
6240
- unsigned int zone;
6241
-
6242
- *zone_start_pfn = node_start_pfn;
6243
- for (zone = 0; zone < zone_type; zone++)
6244
- *zone_start_pfn += zones_size[zone];
6245
-
6246
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
6247
-
6248
- return zones_size[zone_type];
6249
-}
6250
-
6251
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6252
- unsigned long zone_type,
7037
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
62537038 unsigned long node_start_pfn,
6254
- unsigned long node_end_pfn,
6255
- unsigned long *zholes_size)
6256
-{
6257
- if (!zholes_size)
6258
- return 0;
6259
-
6260
- return zholes_size[zone_type];
6261
-}
6262
-
6263
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6264
-
6265
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
6266
- unsigned long node_start_pfn,
6267
- unsigned long node_end_pfn,
6268
- unsigned long *zones_size,
6269
- unsigned long *zholes_size)
7039
+ unsigned long node_end_pfn)
62707040 {
62717041 unsigned long realtotalpages = 0, totalpages = 0;
62727042 enum zone_type i;
....@@ -6274,17 +7044,21 @@
62747044 for (i = 0; i < MAX_NR_ZONES; i++) {
62757045 struct zone *zone = pgdat->node_zones + i;
62767046 unsigned long zone_start_pfn, zone_end_pfn;
7047
+ unsigned long spanned, absent;
62777048 unsigned long size, real_size;
62787049
6279
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
6280
- node_start_pfn,
6281
- node_end_pfn,
6282
- &zone_start_pfn,
6283
- &zone_end_pfn,
6284
- zones_size);
6285
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
6286
- node_start_pfn, node_end_pfn,
6287
- zholes_size);
7050
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7051
+ node_start_pfn,
7052
+ node_end_pfn,
7053
+ &zone_start_pfn,
7054
+ &zone_end_pfn);
7055
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
7056
+ node_start_pfn,
7057
+ node_end_pfn);
7058
+
7059
+ size = spanned;
7060
+ real_size = size - absent;
7061
+
62887062 if (size)
62897063 zone->zone_start_pfn = zone_start_pfn;
62907064 else
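The rework above reduces to simple per-zone arithmetic: spanned is the zone's PFN window clamped to the node, absent is the holes inside that window, and present is their difference. A small standalone model of that accounting (the helper below is invented for illustration, not kernel API):

#include <stdio.h>

/* Illustrative stand-in for zone_spanned_pages_in_node(): clamp the zone
 * limits to the node limits and return the width of the intersection. */
static unsigned long spanned_pages(unsigned long node_start, unsigned long node_end,
				   unsigned long zone_low, unsigned long zone_high)
{
	unsigned long start = zone_low > node_start ? zone_low : node_start;
	unsigned long end = zone_high < node_end ? zone_high : node_end;

	return end > start ? end - start : 0;
}

int main(void)
{
	/* Example node spanning PFNs [0x1000, 0x9000) with zone limits [0x0, 0x8000). */
	unsigned long spanned = spanned_pages(0x1000, 0x9000, 0x0, 0x8000);
	unsigned long absent = 0x500;	/* holes reported by memblock; made up here */
	unsigned long present = spanned - absent;

	printf("spanned=%lu absent=%lu present=%lu\n", spanned, absent, present);
	return 0;
}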
....@@ -6330,10 +7104,14 @@
63307104 {
63317105 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
63327106 zone->pageblock_flags = NULL;
6333
- if (usemapsize)
7107
+ if (usemapsize) {
63347108 zone->pageblock_flags =
6335
- memblock_virt_alloc_node_nopanic(usemapsize,
6336
- pgdat->node_id);
7109
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7110
+ pgdat->node_id);
7111
+ if (!zone->pageblock_flags)
7112
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7113
+ usemapsize, zone->name, pgdat->node_id);
7114
+ }
63377115 }
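The hunk above replaces the self-panicking memblock_virt_alloc_node_nopanic() with memblock_alloc_node() plus an explicit panic() on failure. A distilled, kernel-style sketch of that idiom (the wrapper name is invented for illustration and is not part of this patch):

/* Sketch only: allocate boot memory near a node or die loudly. */
static void *early_alloc_or_panic(unsigned long size, int nid, const char *what)
{
	void *p = memblock_alloc_node(size, SMP_CACHE_BYTES, nid);

	if (!p)
		panic("Failed to allocate %lu bytes for %s on node %d\n",
		      size, what, nid);
	return p;
}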
63387116 #else
63397117 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
....@@ -6400,9 +7178,11 @@
64007178 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
64017179 static void pgdat_init_split_queue(struct pglist_data *pgdat)
64027180 {
6403
- spin_lock_init(&pgdat->split_queue_lock);
6404
- INIT_LIST_HEAD(&pgdat->split_queue);
6405
- pgdat->split_queue_len = 0;
7181
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7182
+
7183
+ spin_lock_init(&ds_queue->split_queue_lock);
7184
+ INIT_LIST_HEAD(&ds_queue->split_queue);
7185
+ ds_queue->split_queue_len = 0;
64067186 }
64077187 #else
64087188 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
....@@ -6429,13 +7209,13 @@
64297209
64307210 pgdat_page_ext_init(pgdat);
64317211 spin_lock_init(&pgdat->lru_lock);
6432
- lruvec_init(node_lruvec(pgdat));
7212
+ lruvec_init(&pgdat->__lruvec);
64337213 }
64347214
64357215 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
64367216 unsigned long remaining_pages)
64377217 {
6438
- zone->managed_pages = remaining_pages;
7218
+ atomic_long_set(&zone->managed_pages, remaining_pages);
64397219 zone_set_nid(zone, nid);
64407220 zone->name = zone_names[idx];
64417221 zone->zone_pgdat = NODE_DATA(nid);
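With managed_pages now an atomic_long_t, later hunks in this patch read it through zone_managed_pages() instead of touching the field directly. A standalone model of that accessor pattern (kernel types are mocked here; the real helpers live in the mm headers):

#include <stdio.h>

struct zone_model {
	long managed_pages;		/* stands in for atomic_long_t */
};

static unsigned long zone_managed_pages(struct zone_model *zone)
{
	/* atomic_long_read(&zone->managed_pages) in the kernel */
	return (unsigned long)zone->managed_pages;
}

int main(void)
{
	struct zone_model z = { .managed_pages = 0 };

	z.managed_pages = 262144;	/* atomic_long_set() at zone init */
	z.managed_pages += 512;		/* atomic_long_add() in adjust_managed_page_count() */
	printf("managed pages: %lu\n", zone_managed_pages(&z));
	return 0;
}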
....@@ -6533,7 +7313,7 @@
65337313 set_pageblock_order();
65347314 setup_usemap(pgdat, zone, zone_start_pfn, size);
65357315 init_currently_empty_zone(zone, zone_start_pfn, size);
6536
- memmap_init(size, nid, j, zone_start_pfn);
7316
+ arch_memmap_init(size, nid, j, zone_start_pfn);
65377317 }
65387318 }
65397319
....@@ -6562,7 +7342,11 @@
65627342 end = pgdat_end_pfn(pgdat);
65637343 end = ALIGN(end, MAX_ORDER_NR_PAGES);
65647344 size = (end - start) * sizeof(struct page);
6565
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
7345
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
7346
+ pgdat->node_id);
7347
+ if (!map)
7348
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
7349
+ size, pgdat->node_id);
65667350 pgdat->node_mem_map = map + offset;
65677351 }
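For scale, a worked example of the size computed above, assuming 4 KiB pages and a 64-byte struct page (both are configuration-dependent assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed PAGE_SIZE */
	unsigned long struct_page_size = 64;		/* assumed sizeof(struct page) */
	unsigned long span_bytes = 1UL << 30;		/* 1 GiB node span */
	unsigned long nr_pages = span_bytes / page_size;
	unsigned long memmap_bytes = nr_pages * struct_page_size;

	/* 262144 pages -> about 16 MiB of memmap for 1 GiB of RAM */
	printf("%lu pages -> %lu bytes of memmap\n", nr_pages, memmap_bytes);
	return 0;
}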
65687352 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
....@@ -6574,10 +7358,8 @@
65747358 */
65757359 if (pgdat == NODE_DATA(0)) {
65767360 mem_map = NODE_DATA(0)->node_mem_map;
6577
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
65787361 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
65797362 mem_map -= offset;
6580
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
65817363 }
65827364 #endif
65837365 }
....@@ -6588,42 +7370,31 @@
65887370 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
65897371 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
65907372 {
6591
- /*
6592
- * We start only with one section of pages, more pages are added as
6593
- * needed until the rest of deferred pages are initialized.
6594
- */
6595
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6596
- pgdat->node_spanned_pages);
65977373 pgdat->first_deferred_pfn = ULONG_MAX;
65987374 }
65997375 #else
66007376 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
66017377 #endif
66027378
6603
-void __init free_area_init_node(int nid, unsigned long *zones_size,
6604
- unsigned long node_start_pfn,
6605
- unsigned long *zholes_size)
7379
+static void __init free_area_init_node(int nid)
66067380 {
66077381 pg_data_t *pgdat = NODE_DATA(nid);
66087382 unsigned long start_pfn = 0;
66097383 unsigned long end_pfn = 0;
66107384
66117385 /* pg_data_t should be reset to zero when it's allocated */
6612
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
7386
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7387
+
7388
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
66137389
66147390 pgdat->node_id = nid;
6615
- pgdat->node_start_pfn = node_start_pfn;
7391
+ pgdat->node_start_pfn = start_pfn;
66167392 pgdat->per_cpu_nodestats = NULL;
6617
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6618
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7393
+
66197394 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
66207395 (u64)start_pfn << PAGE_SHIFT,
66217396 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6622
-#else
6623
- start_pfn = node_start_pfn;
6624
-#endif
6625
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6626
- zones_size, zholes_size);
7397
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
66277398
66287399 alloc_node_mem_map(pgdat);
66297400 pgdat_set_deferred_range(pgdat);
....@@ -6631,80 +7402,10 @@
66317402 free_area_init_core(pgdat);
66327403 }
66337404
6634
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
6635
-
6636
-/*
6637
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
6638
- * pages zeroed
6639
- */
6640
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
7405
+void __init free_area_init_memoryless_node(int nid)
66417406 {
6642
- unsigned long pfn;
6643
- u64 pgcnt = 0;
6644
-
6645
- for (pfn = spfn; pfn < epfn; pfn++) {
6646
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
6647
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
6648
- + pageblock_nr_pages - 1;
6649
- continue;
6650
- }
6651
- mm_zero_struct_page(pfn_to_page(pfn));
6652
- pgcnt++;
6653
- }
6654
-
6655
- return pgcnt;
7407
+ free_area_init_node(nid);
66567408 }
6657
-
6658
-/*
6659
- * Only struct pages that are backed by physical memory are zeroed and
6660
- * initialized by going through __init_single_page(). But, there are some
6661
- * struct pages which are reserved in memblock allocator and their fields
6662
- * may be accessed (for example page_to_pfn() on some configuration accesses
6663
- * flags). We must explicitly zero those struct pages.
6664
- *
6665
- * This function also addresses a similar issue where struct pages are left
6666
- * uninitialized because the physical address range is not covered by
6667
- * memblock.memory or memblock.reserved. That could happen when memblock
6668
- * layout is manually configured via memmap=, or when the highest physical
6669
- * address (max_pfn) does not end on a section boundary.
6670
- */
6671
-void __init zero_resv_unavail(void)
6672
-{
6673
- phys_addr_t start, end;
6674
- u64 i, pgcnt;
6675
- phys_addr_t next = 0;
6676
-
6677
- /*
6678
- * Loop through unavailable ranges not covered by memblock.memory.
6679
- */
6680
- pgcnt = 0;
6681
- for_each_mem_range(i, &memblock.memory, NULL,
6682
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
6683
- if (next < start)
6684
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
6685
- next = end;
6686
- }
6687
-
6688
- /*
6689
- * Early sections always have a fully populated memmap for the whole
6690
- * section - see pfn_valid(). If the last section has holes at the
6691
- * end and that section is marked "online", the memmap will be
6692
- * considered initialized. Make sure that memmap has a well defined
6693
- * state.
6694
- */
6695
- pgcnt += zero_pfn_range(PFN_DOWN(next),
6696
- round_up(max_pfn, PAGES_PER_SECTION));
6697
-
6698
- /*
6699
- * Struct pages that do not have backing memory. This could be because
6700
- * firmware is using some of this memory, or for some other reasons.
6701
- */
6702
- if (pgcnt)
6703
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
6704
-}
6705
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
6706
-
6707
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
67087409
67097410 #if MAX_NUMNODES > 1
67107411 /*
....@@ -6735,14 +7436,14 @@
67357436 * model has fine enough granularity to avoid incorrect mapping for the
67367437 * populated node map.
67377438 *
6738
- * Returns the determined alignment in pfn's. 0 if there is no alignment
7439
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
67397440 * requirement (single node).
67407441 */
67417442 unsigned long __init node_map_pfn_alignment(void)
67427443 {
67437444 unsigned long accl_mask = 0, last_end = 0;
67447445 unsigned long start, end, mask;
6745
- int last_nid = -1;
7446
+ int last_nid = NUMA_NO_NODE;
67467447 int i, nid;
67477448
67487449 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
....@@ -6769,33 +7470,15 @@
67697470 return ~accl_mask + 1;
67707471 }
67717472
6772
-/* Find the lowest pfn for a node */
6773
-static unsigned long __init find_min_pfn_for_node(int nid)
6774
-{
6775
- unsigned long min_pfn = ULONG_MAX;
6776
- unsigned long start_pfn;
6777
- int i;
6778
-
6779
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6780
- min_pfn = min(min_pfn, start_pfn);
6781
-
6782
- if (min_pfn == ULONG_MAX) {
6783
- pr_warn("Could not find start_pfn for node %d\n", nid);
6784
- return 0;
6785
- }
6786
-
6787
- return min_pfn;
6788
-}
6789
-
67907473 /**
67917474 * find_min_pfn_with_active_regions - Find the minimum PFN registered
67927475 *
6793
- * It returns the minimum PFN based on information provided via
7476
+ * Return: the minimum PFN based on information provided via
67947477 * memblock_set_node().
67957478 */
67967479 unsigned long __init find_min_pfn_with_active_regions(void)
67977480 {
6798
- return find_min_pfn_for_node(MAX_NUMNODES);
7481
+ return PHYS_PFN(memblock_start_of_DRAM());
67997482 }
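The new body is just a unit conversion: take the first byte of memory registered in memblock and shift it down to a page frame number. A one-line model of that conversion (the base address and PAGE_SHIFT are example assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long long dram_base = 0x80000000ULL;	/* example memblock_start_of_DRAM() */
	unsigned int page_shift = 12;			/* assumed 4 KiB pages */

	/* PHYS_PFN(addr) is simply addr >> PAGE_SHIFT */
	printf("minimum pfn = %llu\n", dram_base >> page_shift);
	return 0;
}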
68007483
68017484 /*
....@@ -6844,11 +7527,11 @@
68447527 * options.
68457528 */
68467529 if (movable_node_is_enabled()) {
6847
- for_each_memblock(memory, r) {
7530
+ for_each_mem_region(r) {
68487531 if (!memblock_is_hotpluggable(r))
68497532 continue;
68507533
6851
- nid = r->nid;
7534
+ nid = memblock_get_region_node(r);
68527535
68537536 usable_startpfn = PFN_DOWN(r->base);
68547537 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
....@@ -6865,11 +7548,11 @@
68657548 if (mirrored_kernelcore) {
68667549 bool mem_below_4gb_not_mirrored = false;
68677550
6868
- for_each_memblock(memory, r) {
7551
+ for_each_mem_region(r) {
68697552 if (memblock_is_mirror(r))
68707553 continue;
68717554
6872
- nid = r->nid;
7555
+ nid = memblock_get_region_node(r);
68737556
68747557 usable_startpfn = memblock_region_memory_base_pfn(r);
68757558
....@@ -6884,7 +7567,7 @@
68847567 }
68857568
68867569 if (mem_below_4gb_not_mirrored)
6887
- pr_warn("This configuration results in unmirrored kernel memory.");
7570
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
68887571
68897572 goto out2;
68907573 }
....@@ -7023,9 +7706,16 @@
70237706
70247707 out2:
70257708 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
7026
- for (nid = 0; nid < MAX_NUMNODES; nid++)
7709
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
7710
+ unsigned long start_pfn, end_pfn;
7711
+
70277712 zone_movable_pfn[nid] =
70287713 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
7714
+
7715
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7716
+ if (zone_movable_pfn[nid] >= end_pfn)
7717
+ zone_movable_pfn[nid] = 0;
7718
+ }
70297719
70307720 out:
70317721 /* restore the node_state */
....@@ -7037,23 +7727,29 @@
70377727 {
70387728 enum zone_type zone_type;
70397729
7040
- if (N_MEMORY == N_NORMAL_MEMORY)
7041
- return;
7042
-
70437730 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
70447731 struct zone *zone = &pgdat->node_zones[zone_type];
70457732 if (populated_zone(zone)) {
7046
- node_set_state(nid, N_HIGH_MEMORY);
7047
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
7048
- zone_type <= ZONE_NORMAL)
7733
+ if (IS_ENABLED(CONFIG_HIGHMEM))
7734
+ node_set_state(nid, N_HIGH_MEMORY);
7735
+ if (zone_type <= ZONE_NORMAL)
70497736 node_set_state(nid, N_NORMAL_MEMORY);
70507737 break;
70517738 }
70527739 }
70537740 }
70547741
7742
+/*
7743
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
7744
+ * such cases we allow max_zone_pfn to be sorted in descending order.
7745
+ */
7746
+bool __weak arch_has_descending_max_zone_pfns(void)
7747
+{
7748
+ return false;
7749
+}
7750
+
70557751 /**
7056
- * free_area_init_nodes - Initialise all pg_data_t and zone data
7752
+ * free_area_init - Initialise all pg_data_t and zone data
70577753 * @max_zone_pfn: an array of max PFNs for each zone
70587754 *
70597755 * This will call free_area_init_node() for each active node in the system.
....@@ -7065,10 +7761,11 @@
70657761 * starts where the previous one ended. For example, ZONE_DMA32 starts
70667762 * at arch_max_dma_pfn.
70677763 */
7068
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
7764
+void __init free_area_init(unsigned long *max_zone_pfn)
70697765 {
70707766 unsigned long start_pfn, end_pfn;
7071
- int i, nid;
7767
+ int i, nid, zone;
7768
+ bool descending;
70727769
70737770 /* Record where the zone boundaries are */
70747771 memset(arch_zone_lowest_possible_pfn, 0,
....@@ -7077,14 +7774,20 @@
70777774 sizeof(arch_zone_highest_possible_pfn));
70787775
70797776 start_pfn = find_min_pfn_with_active_regions();
7777
+ descending = arch_has_descending_max_zone_pfns();
70807778
70817779 for (i = 0; i < MAX_NR_ZONES; i++) {
7082
- if (i == ZONE_MOVABLE)
7780
+ if (descending)
7781
+ zone = MAX_NR_ZONES - i - 1;
7782
+ else
7783
+ zone = i;
7784
+
7785
+ if (zone == ZONE_MOVABLE)
70837786 continue;
70847787
7085
- end_pfn = max(max_zone_pfn[i], start_pfn);
7086
- arch_zone_lowest_possible_pfn[i] = start_pfn;
7087
- arch_zone_highest_possible_pfn[i] = end_pfn;
7788
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
7789
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
7790
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
70887791
70897792 start_pfn = end_pfn;
70907793 }
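The loop above turns the cumulative max_zone_pfn[] array into per-zone [lowest, highest) PFN ranges, walking the zones in reverse when the architecture reports a descending layout. A standalone model of that bookkeeping (ZONE_MOVABLE skipping is omitted and the sizes are made up):

#include <stdio.h>
#include <stdbool.h>

#define NR_ZONES 3

int main(void)
{
	unsigned long max_zone_pfn[NR_ZONES] = { 0x100000, 0x400000, 0x400000 };
	unsigned long lowest[NR_ZONES] = { 0 }, highest[NR_ZONES] = { 0 };
	unsigned long start_pfn = 0x1000;	/* first usable PFN */
	bool descending = false;		/* arch_has_descending_max_zone_pfns() */
	int i;

	for (i = 0; i < NR_ZONES; i++) {
		int zone = descending ? NR_ZONES - i - 1 : i;
		unsigned long end_pfn = max_zone_pfn[zone] > start_pfn ?
					max_zone_pfn[zone] : start_pfn;

		lowest[zone] = start_pfn;
		highest[zone] = end_pfn;
		start_pfn = end_pfn;	/* the next zone starts where this one ended */
	}

	for (i = 0; i < NR_ZONES; i++)
		printf("zone %d: [%#lx, %#lx)\n", i, lowest[i], highest[i]);
	return 0;
}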
....@@ -7118,27 +7821,33 @@
71187821 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
71197822 }
71207823
7121
- /* Print out the early node map */
7824
+ /*
7825
+ * Print out the early node map, and initialize the
7826
+ * subsection-map relative to active online memory ranges to
7827
+ * enable future "sub-section" extensions of the memory map.
7828
+ */
71227829 pr_info("Early memory node ranges\n");
7123
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
7830
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
71247831 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
71257832 (u64)start_pfn << PAGE_SHIFT,
71267833 ((u64)end_pfn << PAGE_SHIFT) - 1);
7834
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
7835
+ }
71277836
71287837 /* Initialise every node */
71297838 mminit_verify_pageflags_layout();
71307839 setup_nr_node_ids();
7131
- zero_resv_unavail();
71327840 for_each_online_node(nid) {
71337841 pg_data_t *pgdat = NODE_DATA(nid);
7134
- free_area_init_node(nid, NULL,
7135
- find_min_pfn_for_node(nid), NULL);
7842
+ free_area_init_node(nid);
71367843
71377844 /* Any memory on that node */
71387845 if (pgdat->node_present_pages)
71397846 node_set_state(nid, N_MEMORY);
71407847 check_for_memory(pgdat, nid);
71417848 }
7849
+
7850
+ memmap_init();
71427851 }
71437852
71447853 static int __init cmdline_parse_core(char *p, unsigned long *core,
....@@ -7197,22 +7906,18 @@
71977906 early_param("kernelcore", cmdline_parse_kernelcore);
71987907 early_param("movablecore", cmdline_parse_movablecore);
71997908
7200
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
7201
-
72027909 void adjust_managed_page_count(struct page *page, long count)
72037910 {
7204
- spin_lock(&managed_page_count_lock);
7205
- page_zone(page)->managed_pages += count;
7206
- totalram_pages += count;
7911
+ atomic_long_add(count, &page_zone(page)->managed_pages);
7912
+ totalram_pages_add(count);
72077913 #ifdef CONFIG_HIGHMEM
72087914 if (PageHighMem(page))
7209
- totalhigh_pages += count;
7915
+ totalhigh_pages_add(count);
72107916 #endif
7211
- spin_unlock(&managed_page_count_lock);
72127917 }
72137918 EXPORT_SYMBOL(adjust_managed_page_count);
72147919
7215
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
7920
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
72167921 {
72177922 void *pos;
72187923 unsigned long pages = 0;
....@@ -7231,6 +7936,11 @@
72317936 * alias for the memset().
72327937 */
72337938 direct_map_addr = page_address(page);
7939
+ /*
7940
+ * Perform a kasan-unchecked memset() since this memory
7941
+ * has not been initialized.
7942
+ */
7943
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
72347944 if ((unsigned int)poison <= 0xFF)
72357945 memset(direct_map_addr, poison, PAGE_SIZE);
72367946
....@@ -7243,15 +7953,14 @@
72437953
72447954 return pages;
72457955 }
7246
-EXPORT_SYMBOL(free_reserved_area);
72477956
72487957 #ifdef CONFIG_HIGHMEM
72497958 void free_highmem_page(struct page *page)
72507959 {
72517960 __free_reserved_page(page);
7252
- totalram_pages++;
7253
- page_zone(page)->managed_pages++;
7254
- totalhigh_pages++;
7961
+ totalram_pages_inc();
7962
+ atomic_long_inc(&page_zone(page)->managed_pages);
7963
+ totalhigh_pages_inc();
72557964 }
72567965 #endif
72577966
....@@ -7278,7 +7987,7 @@
72787987 */
72797988 #define adj_init_size(start, end, size, pos, adj) \
72807989 do { \
7281
- if (start <= pos && pos < end && size > adj) \
7990
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
72827991 size -= adj; \
72837992 } while (0)
72847993
....@@ -7300,10 +8009,10 @@
73008009 physpages << (PAGE_SHIFT - 10),
73018010 codesize >> 10, datasize >> 10, rosize >> 10,
73028011 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7303
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
8012
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
73048013 totalcma_pages << (PAGE_SHIFT - 10),
73058014 #ifdef CONFIG_HIGHMEM
7306
- totalhigh_pages << (PAGE_SHIFT - 10),
8015
+ totalhigh_pages() << (PAGE_SHIFT - 10),
73078016 #endif
73088017 str ? ", " : "", str ? str : "");
73098018 }
....@@ -7322,13 +8031,6 @@
73228031 void __init set_dma_reserve(unsigned long new_dma_reserve)
73238032 {
73248033 dma_reserve = new_dma_reserve;
7325
-}
7326
-
7327
-void __init free_area_init(unsigned long *zones_size)
7328
-{
7329
- zero_resv_unavail();
7330
- free_area_init_node(0, zones_size,
7331
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
73328034 }
73338035
73348036 static int page_alloc_cpu_dead(unsigned int cpu)
....@@ -7356,9 +8058,27 @@
73568058 return 0;
73578059 }
73588060
8061
+#ifdef CONFIG_NUMA
8062
+int hashdist = HASHDIST_DEFAULT;
8063
+
8064
+static int __init set_hashdist(char *str)
8065
+{
8066
+ if (!str)
8067
+ return 0;
8068
+ hashdist = simple_strtoul(str, &str, 0);
8069
+ return 1;
8070
+}
8071
+__setup("hashdist=", set_hashdist);
8072
+#endif
8073
+
73598074 void __init page_alloc_init(void)
73608075 {
73618076 int ret;
8077
+
8078
+#ifdef CONFIG_NUMA
8079
+ if (num_node_state(N_MEMORY) == 1)
8080
+ hashdist = 0;
8081
+#endif
73628082
73638083 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
73648084 "mm/page_alloc:dead", NULL,
....@@ -7383,6 +8103,7 @@
73838103 for (i = 0; i < MAX_NR_ZONES; i++) {
73848104 struct zone *zone = pgdat->node_zones + i;
73858105 long max = 0;
8106
+ unsigned long managed_pages = zone_managed_pages(zone);
73868107
73878108 /* Find valid and maximum lowmem_reserve in the zone */
73888109 for (j = i; j < MAX_NR_ZONES; j++) {
....@@ -7393,8 +8114,8 @@
73938114 /* we treat the high watermark as reserved pages. */
73948115 max += high_wmark_pages(zone);
73958116
7396
- if (max > zone->managed_pages)
7397
- max = zone->managed_pages;
8117
+ if (max > managed_pages)
8118
+ max = managed_pages;
73988119
73998120 pgdat->totalreserve_pages += max;
74008121
....@@ -7413,30 +8134,24 @@
74138134 static void setup_per_zone_lowmem_reserve(void)
74148135 {
74158136 struct pglist_data *pgdat;
7416
- enum zone_type j, idx;
8137
+ enum zone_type i, j;
74178138
74188139 for_each_online_pgdat(pgdat) {
7419
- for (j = 0; j < MAX_NR_ZONES; j++) {
7420
- struct zone *zone = pgdat->node_zones + j;
7421
- unsigned long managed_pages = zone->managed_pages;
8140
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8141
+ struct zone *zone = &pgdat->node_zones[i];
8142
+ int ratio = sysctl_lowmem_reserve_ratio[i];
8143
+ bool clear = !ratio || !zone_managed_pages(zone);
8144
+ unsigned long managed_pages = 0;
74228145
7423
- zone->lowmem_reserve[j] = 0;
8146
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
8147
+ struct zone *upper_zone = &pgdat->node_zones[j];
74248148
7425
- idx = j;
7426
- while (idx) {
7427
- struct zone *lower_zone;
8149
+ managed_pages += zone_managed_pages(upper_zone);
74288150
7429
- idx--;
7430
- lower_zone = pgdat->node_zones + idx;
7431
-
7432
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
7433
- sysctl_lowmem_reserve_ratio[idx] = 0;
7434
- lower_zone->lowmem_reserve[j] = 0;
7435
- } else {
7436
- lower_zone->lowmem_reserve[j] =
7437
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
7438
- }
7439
- managed_pages += lower_zone->managed_pages;
8151
+ if (clear)
8152
+ zone->lowmem_reserve[j] = 0;
8153
+ else
8154
+ zone->lowmem_reserve[j] = managed_pages / ratio;
74408155 }
74418156 }
74428157 }
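The rewritten loop computes, for each zone i, how many pages it keeps in reserve against allocations that could also have been placed in a higher zone: the running sum of managed pages above it divided by sysctl_lowmem_reserve_ratio[i], or zero when the ratio or the zone itself is empty. A standalone model with made-up zone sizes:

#include <stdio.h>

#define NR_ZONES 3	/* e.g. DMA32, NORMAL, MOVABLE -- illustrative only */

int main(void)
{
	unsigned long managed[NR_ZONES] = { 262144, 3932160, 0 };	/* pages */
	unsigned long ratio[NR_ZONES] = { 256, 32, 0 };			/* sysctl ratios */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int i, j;

	for (i = 0; i < NR_ZONES - 1; i++) {
		int clear = !ratio[i] || !managed[i];
		unsigned long upper = 0;

		for (j = i + 1; j < NR_ZONES; j++) {
			upper += managed[j];	/* pages sitting "above" zone i */
			reserve[i][j] = clear ? 0 : upper / ratio[i];
		}
	}

	for (i = 0; i < NR_ZONES - 1; i++)
		for (j = i + 1; j < NR_ZONES; j++)
			printf("zone %d reserves %lu pages against zone %d allocations\n",
			       i, reserve[i][j], j);
	return 0;
}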
....@@ -7456,18 +8171,17 @@
74568171 /* Calculate total number of !ZONE_HIGHMEM pages */
74578172 for_each_zone(zone) {
74588173 if (!is_highmem(zone))
7459
- lowmem_pages += zone->managed_pages;
8174
+ lowmem_pages += zone_managed_pages(zone);
74608175 }
74618176
74628177 for_each_zone(zone) {
7463
- u64 min, low;
8178
+ u64 tmp, low;
74648179
74658180 spin_lock_irqsave(&zone->lock, flags);
7466
- min = (u64)pages_min * zone->managed_pages;
7467
- do_div(min, lowmem_pages);
7468
- low = (u64)pages_low * zone->managed_pages;
7469
- do_div(low, vm_total_pages);
7470
-
8181
+ tmp = (u64)pages_min * zone_managed_pages(zone);
8182
+ do_div(tmp, lowmem_pages);
8183
+ low = (u64)pages_low * zone_managed_pages(zone);
8184
+ do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)));
74718185 if (is_highmem(zone)) {
74728186 /*
74738187 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
....@@ -7475,20 +8189,20 @@
74758189 * value here.
74768190 *
74778191 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
7478
- * deltas control asynch page reclaim, and so should
8192
+ * deltas control async page reclaim, and so should
74798193 * not be capped for highmem.
74808194 */
74818195 unsigned long min_pages;
74828196
7483
- min_pages = zone->managed_pages / 1024;
8197
+ min_pages = zone_managed_pages(zone) / 1024;
74848198 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7485
- zone->watermark[WMARK_MIN] = min_pages;
8199
+ zone->_watermark[WMARK_MIN] = min_pages;
74868200 } else {
74878201 /*
74888202 * If it's a lowmem zone, reserve a number of pages
74898203 * proportionate to the zone's size.
74908204 */
7491
- zone->watermark[WMARK_MIN] = min;
8205
+ zone->_watermark[WMARK_MIN] = tmp;
74928206 }
74938207
74948208 /*
....@@ -7496,14 +8210,13 @@
74968210 * scale factor in proportion to available memory, but
74978211 * ensure a minimum size on small systems.
74988212 */
7499
- min = max_t(u64, min >> 2,
7500
- mult_frac(zone->managed_pages,
8213
+ tmp = max_t(u64, tmp >> 2,
8214
+ mult_frac(zone_managed_pages(zone),
75018215 watermark_scale_factor, 10000));
75028216
7503
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
7504
- low + min;
7505
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
7506
- low + min * 2;
8217
+ zone->watermark_boost = 0;
8218
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp;
8219
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2;
75078220
75088221 spin_unlock_irqrestore(&zone->lock, flags);
75098222 }
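Pulling the arithmetic above out of the locking and highmem special cases: the zone gets its proportional share of pages_min as WMARK_MIN, and the low and high marks sit one and two "gaps" above it, where the gap is the larger of min/4 and managed_pages * watermark_scale_factor / 10000. A standalone model (the pages_low term is proportioned the same way here for simplicity; the kernel divides it by the free pages reachable with GFP_HIGHUSER_MOVABLE):

#include <stdio.h>

int main(void)
{
	unsigned long managed = 1UL << 20;	/* ~4 GiB of 4 KiB pages, example */
	unsigned long lowmem_pages = managed;	/* single-zone example */
	unsigned long pages_min = 16384;	/* from min_free_kbytes */
	unsigned long pages_low = 0;		/* from extra_free_kbytes */
	unsigned long watermark_scale_factor = 10;
	unsigned long tmp, low, wmark_min, wmark_low, wmark_high;

	tmp = pages_min * managed / lowmem_pages;	/* this zone's share */
	low = pages_low * managed / lowmem_pages;
	wmark_min = tmp;

	tmp >>= 2;					/* floor of min/4 ... */
	if (managed * watermark_scale_factor / 10000 > tmp)
		tmp = managed * watermark_scale_factor / 10000;	/* ... or the scale factor */

	wmark_low = wmark_min + low + tmp;
	wmark_high = wmark_min + low + tmp * 2;

	printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
	return 0;
}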
....@@ -7532,7 +8245,7 @@
75328245 * Initialise min_free_kbytes.
75338246 *
75348247 * For small machines we want it small (128k min). For large machines
7535
- * we want it large (64MB max). But it is not linear, because network
8248
+ * we want it large (256MB max). But it is not linear, because network
75368249 * bandwidth does not increase linearly with machine size. We use
75378250 *
75388251 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
....@@ -7564,8 +8277,8 @@
75648277 min_free_kbytes = new_min_free_kbytes;
75658278 if (min_free_kbytes < 128)
75668279 min_free_kbytes = 128;
7567
- if (min_free_kbytes > 65536)
7568
- min_free_kbytes = 65536;
8280
+ if (min_free_kbytes > 262144)
8281
+ min_free_kbytes = 262144;
75698282 } else {
75708283 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
75718284 new_min_free_kbytes, user_min_free_kbytes);
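Worked numbers for the sizing rule above, including the cap this patch raises from 64 MiB to 256 MiB (262144 kB): min_free_kbytes = sqrt(lowmem_kbytes * 16), i.e. 4 * sqrt(lowmem_kbytes), clamped to [128, 262144].

#include <stdio.h>
#include <math.h>

int main(void)
{
	/* lowmem in KiB: 16 MiB, 1 GiB, 64 GiB, 16 TiB */
	unsigned long long lowmem_kbytes[] = { 16384ULL, 1048576ULL,
					       67108864ULL, 17179869184ULL };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long long min_free =
			(unsigned long long)sqrt((double)lowmem_kbytes[i] * 16.0);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 262144)		/* cap raised by this patch */
			min_free = 262144;
		printf("%12llu KiB lowmem -> min_free_kbytes ~= %llu\n",
		       lowmem_kbytes[i], min_free);
	}
	return 0;
}

Only the 16 TiB row actually hits the new 256 MiB cap; anything at or below roughly 4 TiB of lowmem still follows the square-root curve.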
....@@ -7591,7 +8304,7 @@
75918304 * or extra_free_kbytes changes.
75928305 */
75938306 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7594
- void __user *buffer, size_t *length, loff_t *ppos)
8307
+ void *buffer, size_t *length, loff_t *ppos)
75958308 {
75968309 int rc;
75978310
....@@ -7607,7 +8320,7 @@
76078320 }
76088321
76098322 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7610
- void __user *buffer, size_t *length, loff_t *ppos)
8323
+ void *buffer, size_t *length, loff_t *ppos)
76118324 {
76128325 int rc;
76138326
....@@ -7631,13 +8344,13 @@
76318344 pgdat->min_unmapped_pages = 0;
76328345
76338346 for_each_zone(zone)
7634
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7635
- sysctl_min_unmapped_ratio) / 100;
8347
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8348
+ sysctl_min_unmapped_ratio) / 100;
76368349 }
76378350
76388351
76398352 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7640
- void __user *buffer, size_t *length, loff_t *ppos)
8353
+ void *buffer, size_t *length, loff_t *ppos)
76418354 {
76428355 int rc;
76438356
....@@ -7659,12 +8372,12 @@
76598372 pgdat->min_slab_pages = 0;
76608373
76618374 for_each_zone(zone)
7662
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7663
- sysctl_min_slab_ratio) / 100;
8375
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8376
+ sysctl_min_slab_ratio) / 100;
76648377 }
76658378
76668379 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7667
- void __user *buffer, size_t *length, loff_t *ppos)
8380
+ void *buffer, size_t *length, loff_t *ppos)
76688381 {
76698382 int rc;
76708383
....@@ -7688,11 +8401,28 @@
76888401 * if in function of the boot time zone sizes.
76898402 */
76908403 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7691
- void __user *buffer, size_t *length, loff_t *ppos)
8404
+ void *buffer, size_t *length, loff_t *ppos)
76928405 {
8406
+ int i;
8407
+
76938408 proc_dointvec_minmax(table, write, buffer, length, ppos);
8409
+
8410
+ for (i = 0; i < MAX_NR_ZONES; i++) {
8411
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
8412
+ sysctl_lowmem_reserve_ratio[i] = 0;
8413
+ }
8414
+
76948415 setup_per_zone_lowmem_reserve();
76958416 return 0;
8417
+}
8418
+
8419
+static void __zone_pcp_update(struct zone *zone)
8420
+{
8421
+ unsigned int cpu;
8422
+
8423
+ for_each_possible_cpu(cpu)
8424
+ pageset_set_high_and_batch(zone,
8425
+ per_cpu_ptr(zone->pageset, cpu));
76968426 }
76978427
76988428 /*
....@@ -7701,7 +8431,7 @@
77018431 * pagelist can have before it gets flushed back to buddy allocator.
77028432 */
77038433 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7704
- void __user *buffer, size_t *length, loff_t *ppos)
8434
+ void *buffer, size_t *length, loff_t *ppos)
77058435 {
77068436 struct zone *zone;
77078437 int old_percpu_pagelist_fraction;
....@@ -7726,30 +8456,12 @@
77268456 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
77278457 goto out;
77288458
7729
- for_each_populated_zone(zone) {
7730
- unsigned int cpu;
7731
-
7732
- for_each_possible_cpu(cpu)
7733
- pageset_set_high_and_batch(zone,
7734
- per_cpu_ptr(zone->pageset, cpu));
7735
- }
8459
+ for_each_populated_zone(zone)
8460
+ __zone_pcp_update(zone);
77368461 out:
77378462 mutex_unlock(&pcp_batch_high_lock);
77388463 return ret;
77398464 }
7740
-
7741
-#ifdef CONFIG_NUMA
7742
-int hashdist = HASHDIST_DEFAULT;
7743
-
7744
-static int __init set_hashdist(char *str)
7745
-{
7746
- if (!str)
7747
- return 0;
7748
- hashdist = simple_strtoul(str, &str, 0);
7749
- return 1;
7750
-}
7751
-__setup("hashdist=", set_hashdist);
7752
-#endif
77538465
77548466 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
77558467 /*
....@@ -7797,6 +8509,7 @@
77978509 unsigned long log2qty, size;
77988510 void *table = NULL;
77998511 gfp_t gfp_flags;
8512
+ bool virt;
78008513
78018514 /* allow the kernel cmdline to have a say */
78028515 if (!numentries) {
....@@ -7853,32 +8566,34 @@
78538566
78548567 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
78558568 do {
8569
+ virt = false;
78568570 size = bucketsize << log2qty;
78578571 if (flags & HASH_EARLY) {
78588572 if (flags & HASH_ZERO)
7859
- table = memblock_virt_alloc_nopanic(size, 0);
8573
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
78608574 else
7861
- table = memblock_virt_alloc_raw(size, 0);
7862
- } else if (hashdist) {
7863
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
8575
+ table = memblock_alloc_raw(size,
8576
+ SMP_CACHE_BYTES);
8577
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
8578
+ table = __vmalloc(size, gfp_flags);
8579
+ virt = true;
78648580 } else {
78658581 /*
78668582 * If bucketsize is not a power-of-two, we may free
78678583 * some pages at the end of hash table which
78688584 * alloc_pages_exact() automatically does
78698585 */
7870
- if (get_order(size) < MAX_ORDER) {
7871
- table = alloc_pages_exact(size, gfp_flags);
7872
- kmemleak_alloc(table, size, 1, gfp_flags);
7873
- }
8586
+ table = alloc_pages_exact(size, gfp_flags);
8587
+ kmemleak_alloc(table, size, 1, gfp_flags);
78748588 }
78758589 } while (!table && size > PAGE_SIZE && --log2qty);
78768590
78778591 if (!table)
78788592 panic("Failed to allocate %s hash table\n", tablename);
78798593
7880
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7881
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
8594
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
8595
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
8596
+ virt ? "vmalloc" : "linear");
78828597
78838598 if (_hash_shift)
78848599 *_hash_shift = log2qty;
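The sizing loop above computes size = bucketsize << log2qty and then picks between a boot-time memblock allocation, vmalloc() (now also whenever the table would exceed a MAX_ORDER buddy block), and alloc_pages_exact(). A standalone model of that decision (PAGE_SIZE, MAX_ORDER and the bucket size are assumed values):

#include <stdio.h>

static int order_for_size(unsigned long size, unsigned long page_size)
{
	unsigned long pages = (size + page_size - 1) / page_size;
	int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	int max_order = 11;			/* assumed MAX_ORDER */
	unsigned long bucketsize = 8;		/* one pointer per bucket */
	unsigned long log2qty = 20;		/* 1M buckets requested */
	int hashdist = 0;			/* "hashdist=" boot parameter */

	unsigned long size = bucketsize << log2qty;	/* 8 MiB table */
	int order = order_for_size(size, page_size);
	int use_vmalloc = (order >= max_order) || hashdist;

	printf("%lu entries, %lu bytes, order %d -> %s\n",
	       1UL << log2qty, size, order,
	       use_vmalloc ? "vmalloc" : "linear (alloc_pages_exact)");
	return 0;
}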
....@@ -7890,47 +8605,50 @@
78908605
78918606 /*
78928607 * This function checks whether pageblock includes unmovable pages or not.
7893
- * If @count is not zero, it is okay to include less @count unmovable pages
78948608 *
78958609 * PageLRU check without isolation or lru_lock could race so that
78968610 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
78978611 * check without lock_page also may miss some movable non-lru pages at
78988612 * race condition. So you can't expect this function should be exact.
8613
+ *
8614
+ * Returns a page without holding a reference. If the caller wants to
8615
+ * dereference that page (e.g., dumping), it has to make sure that it
8616
+ * cannot get removed (e.g., via memory unplug) concurrently.
8617
+ *
78998618 */
7900
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7901
- int migratetype,
7902
- bool skip_hwpoisoned_pages)
8619
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
8620
+ int migratetype, int flags)
79038621 {
7904
- unsigned long pfn, iter, found;
8622
+ unsigned long iter = 0;
8623
+ unsigned long pfn = page_to_pfn(page);
8624
+ unsigned long offset = pfn % pageblock_nr_pages;
79058625
7906
- /*
7907
- * TODO we could make this much more efficient by not checking every
7908
- * page in the range if we know all of them are in MOVABLE_ZONE and
7909
- * that the movable zone guarantees that pages are migratable but
7910
- * the later is not the case right now unfortunatelly. E.g. movablecore
7911
- * can still lead to having bootmem allocations in zone_movable.
7912
- */
8626
+ if (is_migrate_cma_page(page)) {
8627
+ /*
8628
+ * CMA allocations (alloc_contig_range) really need to mark
8629
+ * isolate CMA pageblocks even when they are not movable in fact
8630
+ * so consider them movable here.
8631
+ */
8632
+ if (is_migrate_cma(migratetype))
8633
+ return NULL;
79138634
7914
- /*
7915
- * CMA allocations (alloc_contig_range) really need to mark isolate
7916
- * CMA pageblocks even when they are not movable in fact so consider
7917
- * them movable here.
7918
- */
7919
- if (is_migrate_cma(migratetype) &&
7920
- is_migrate_cma(get_pageblock_migratetype(page)))
7921
- return false;
8635
+ return page;
8636
+ }
79228637
7923
- pfn = page_to_pfn(page);
7924
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7925
- unsigned long check = pfn + iter;
7926
-
7927
- if (!pfn_valid_within(check))
8638
+ for (; iter < pageblock_nr_pages - offset; iter++) {
8639
+ if (!pfn_valid_within(pfn + iter))
79288640 continue;
79298641
7930
- page = pfn_to_page(check);
8642
+ page = pfn_to_page(pfn + iter);
79318643
8644
+ /*
8645
+ * Both, bootmem allocations and memory holes are marked
8646
+ * PG_reserved and are unmovable. We can even have unmovable
8647
+ * allocations inside ZONE_MOVABLE, for example when
8648
+ * specifying "movablecore".
8649
+ */
79328650 if (PageReserved(page))
7933
- goto unmovable;
8651
+ return page;
79348652
79358653 /*
79368654 * If the zone is movable and we have ruled out all reserved
....@@ -7942,17 +8660,22 @@
79428660
79438661 /*
79448662 * Hugepages are not in LRU lists, but they're movable.
7945
- * We need not scan over tail pages bacause we don't
8663
+ * THPs are on the LRU, but need to be counted as #small pages.
8664
+ * We need not scan over tail pages because we don't
79468665 * handle each tail page individually in migration.
79478666 */
7948
- if (PageHuge(page)) {
8667
+ if (PageHuge(page) || PageTransCompound(page)) {
79498668 struct page *head = compound_head(page);
79508669 unsigned int skip_pages;
79518670
7952
- if (!hugepage_migration_supported(page_hstate(head)))
7953
- goto unmovable;
8671
+ if (PageHuge(page)) {
8672
+ if (!hugepage_migration_supported(page_hstate(head)))
8673
+ return page;
8674
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
8675
+ return page;
8676
+ }
79548677
7955
- skip_pages = (1 << compound_order(head)) - (page - head);
8678
+ skip_pages = compound_nr(head) - (page - head);
79568679 iter += skip_pages - 1;
79578680 continue;
79588681 }
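A quick worked example of the tail-page skip above, assuming the scan lands 100 pages into an order-9 (512-page) THP:

#include <stdio.h>

int main(void)
{
	unsigned long compound_pages = 1UL << 9;	/* assumed order-9 THP */
	unsigned long offset = 100;			/* page - head at the current pfn */
	unsigned long skip_pages = compound_pages - offset;

	/* iter += skip_pages - 1, plus the loop's own iter++, lands just past the THP */
	printf("skip %lu pages; next pfn checked is head + %lu\n",
	       skip_pages, offset + skip_pages);
	return 0;
}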
....@@ -7965,7 +8688,7 @@
79658688 */
79668689 if (!page_ref_count(page)) {
79678690 if (PageBuddy(page))
7968
- iter += (1 << page_order(page)) - 1;
8691
+ iter += (1 << buddy_order(page)) - 1;
79698692 continue;
79708693 }
79718694
....@@ -7973,61 +8696,100 @@
79738696 * The HWPoisoned page may be not in buddy system, and
79748697 * page_count() is not 0.
79758698 */
7976
- if (skip_hwpoisoned_pages && PageHWPoison(page))
8699
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
79778700 continue;
79788701
7979
- if (__PageMovable(page))
8702
+ /*
8703
+ * We treat all PageOffline() pages as movable when offlining
8704
+ * to give drivers a chance to decrement their reference count
8705
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
8706
+ * can be offlined as there are no direct references anymore.
8707
+ * For actually unmovable PageOffline() where the driver does
8708
+ * not support this, we will fail later when trying to actually
8709
+ * move these pages that still have a reference count > 0.
8710
+ * (false negatives in this function only)
8711
+ */
8712
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
79808713 continue;
79818714
7982
- if (!PageLRU(page))
7983
- found++;
8715
+ if (__PageMovable(page) || PageLRU(page))
8716
+ continue;
8717
+
79848718 /*
79858719 * If there are RECLAIMABLE pages, we need to check
79868720 * it. But now, memory offline itself doesn't call
79878721 * shrink_node_slabs() and it still to be fixed.
79888722 */
7989
- /*
7990
- * If the page is not RAM, page_count()should be 0.
7991
- * we don't need more check. This is an _used_ not-movable page.
7992
- *
7993
- * The problematic thing here is PG_reserved pages. PG_reserved
7994
- * is set to both of a memory hole page and a _used_ kernel
7995
- * page at boot.
7996
- */
7997
- if (found > count)
7998
- goto unmovable;
8723
+ return page;
79998724 }
8000
- return false;
8001
-unmovable:
8002
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8003
- return true;
8725
+ return NULL;
80048726 }
80058727
8006
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
8007
-
8728
+#ifdef CONFIG_CONTIG_ALLOC
80088729 static unsigned long pfn_max_align_down(unsigned long pfn)
80098730 {
80108731 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
80118732 pageblock_nr_pages) - 1);
80128733 }
80138734
8014
-static unsigned long pfn_max_align_up(unsigned long pfn)
8735
+unsigned long pfn_max_align_up(unsigned long pfn)
80158736 {
80168737 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
80178738 pageblock_nr_pages));
80188739 }
80198740
8741
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
8742
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
8743
+/* Usage: See admin-guide/dynamic-debug-howto.rst */
8744
+static void alloc_contig_dump_pages(struct list_head *page_list)
8745
+{
8746
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
8747
+
8748
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
8749
+ struct page *page;
8750
+ unsigned long nr_skip = 0;
8751
+ unsigned long nr_pages = 0;
8752
+
8753
+ dump_stack();
8754
+ list_for_each_entry(page, page_list, lru) {
8755
+ nr_pages++;
8756
+ /* The page will be freed by putback_movable_pages soon */
8757
+ if (page_count(page) == 1) {
8758
+ nr_skip++;
8759
+ continue;
8760
+ }
8761
+ dump_page(page, "migration failure");
8762
+ }
8763
+ pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip);
8764
+ }
8765
+}
8766
+#else
8767
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
8768
+{
8769
+}
8770
+#endif
8771
+
80208772 /* [start, end) must belong to a single zone. */
80218773 static int __alloc_contig_migrate_range(struct compact_control *cc,
8022
- unsigned long start, unsigned long end)
8774
+ unsigned long start, unsigned long end,
8775
+ struct acr_info *info)
80238776 {
80248777 /* This function is based on compact_zone() from compaction.c. */
8025
- unsigned long nr_reclaimed;
8778
+ unsigned int nr_reclaimed;
80268779 unsigned long pfn = start;
80278780 unsigned int tries = 0;
8781
+ unsigned int max_tries = 5;
80288782 int ret = 0;
8783
+ struct page *page;
8784
+ struct migration_target_control mtc = {
8785
+ .nid = zone_to_nid(cc->zone),
8786
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
8787
+ };
80298788
8030
- migrate_prep();
8789
+ if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC)
8790
+ max_tries = 1;
8791
+
8792
+ lru_cache_disable();
80318793
80328794 while (pfn < end || !list_empty(&cc->migratepages)) {
80338795 if (fatal_signal_pending(current)) {
....@@ -8043,20 +8805,39 @@
80438805 break;
80448806 }
80458807 tries = 0;
8046
- } else if (++tries == 5) {
8808
+ } else if (++tries == max_tries) {
80478809 ret = ret < 0 ? ret : -EBUSY;
80488810 break;
80498811 }
80508812
80518813 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
80528814 &cc->migratepages);
8815
+ info->nr_reclaimed += nr_reclaimed;
80538816 cc->nr_migratepages -= nr_reclaimed;
80548817
8055
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
8056
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
8818
+ list_for_each_entry(page, &cc->migratepages, lru)
8819
+ info->nr_mapped += page_mapcount(page);
8820
+
8821
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
8822
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
8823
+ if (!ret)
8824
+ info->nr_migrated += cc->nr_migratepages;
80578825 }
8826
+
8827
+ lru_cache_enable();
80588828 if (ret < 0) {
8829
+ if (ret == -EBUSY) {
8830
+ alloc_contig_dump_pages(&cc->migratepages);
8831
+ page_pinner_mark_migration_failed_pages(&cc->migratepages);
8832
+ }
8833
+
8834
+ if (!list_empty(&cc->migratepages)) {
8835
+ page = list_first_entry(&cc->migratepages, struct page , lru);
8836
+ info->failed_pfn = page_to_pfn(page);
8837
+ }
8838
+
80598839 putback_movable_pages(&cc->migratepages);
8840
+ info->err |= ACR_ERR_MIGRATE;
80608841 return ret;
80618842 }
80628843 return 0;
....@@ -8079,25 +8860,28 @@
80798860 * pageblocks in the range. Once isolated, the pageblocks should not
80808861 * be modified by others.
80818862 *
8082
- * Returns zero on success or negative error code. On success all
8863
+ * Return: zero on success or negative error code. On success all
80838864 * pages which PFN is in [start, end) are allocated for the caller and
80848865 * need to be freed with free_contig_range().
80858866 */
80868867 int alloc_contig_range(unsigned long start, unsigned long end,
8087
- unsigned migratetype, gfp_t gfp_mask)
8868
+ unsigned migratetype, gfp_t gfp_mask,
8869
+ struct acr_info *info)
80888870 {
80898871 unsigned long outer_start, outer_end;
80908872 unsigned int order;
80918873 int ret = 0;
8874
+ bool skip_drain_all_pages = false;
80928875
80938876 struct compact_control cc = {
80948877 .nr_migratepages = 0,
80958878 .order = -1,
80968879 .zone = page_zone(pfn_to_page(start)),
8097
- .mode = MIGRATE_SYNC,
8880
+ .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
80988881 .ignore_skip_hint = true,
80998882 .no_set_skip_hint = true,
81008883 .gfp_mask = current_gfp_context(gfp_mask),
8884
+ .alloc_contig = true,
81018885 };
81028886 INIT_LIST_HEAD(&cc.migratepages);
81038887
....@@ -8126,14 +8910,18 @@
81268910 */
81278911
81288912 ret = start_isolate_page_range(pfn_max_align_down(start),
8129
- pfn_max_align_up(end), migratetype,
8130
- false);
8131
- if (ret)
8913
+ pfn_max_align_up(end), migratetype, 0,
8914
+ &info->failed_pfn);
8915
+ if (ret) {
8916
+ info->err |= ACR_ERR_ISOLATE;
81328917 return ret;
8918
+ }
81338919
8134
-#ifdef CONFIG_CMA
8135
- cc.zone->cma_alloc = 1;
8136
-#endif
8920
+ trace_android_vh_cma_drain_all_pages_bypass(migratetype,
8921
+ &skip_drain_all_pages);
8922
+ if (!skip_drain_all_pages)
8923
+ drain_all_pages(cc.zone);
8924
+
81378925 /*
81388926 * In case of -EBUSY, we'd like to know which page causes problem.
81398927 * So, just fall through. test_pages_isolated() has a tracepoint
....@@ -8144,8 +8932,8 @@
81448932 * allocated. So, if we fall through be sure to clear ret so that
81458933 * -EBUSY is not accidentally used or returned to caller.
81468934 */
8147
- ret = __alloc_contig_migrate_range(&cc, start, end);
8148
- if (ret && ret != -EBUSY)
8935
+ ret = __alloc_contig_migrate_range(&cc, start, end, info);
8936
+ if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
81498937 goto done;
81508938 ret =0;
81518939
....@@ -8166,9 +8954,6 @@
81668954 * isolated thus they won't get removed from buddy.
81678955 */
81688956
8169
- lru_add_drain_all();
8170
- drain_all_pages(cc.zone);
8171
-
81728957 order = 0;
81738958 outer_start = start;
81748959 while (!PageBuddy(pfn_to_page(outer_start))) {
....@@ -8180,7 +8965,7 @@
81808965 }
81818966
81828967 if (outer_start != start) {
8183
- order = page_order(pfn_to_page(outer_start));
8968
+ order = buddy_order(pfn_to_page(outer_start));
81848969
81858970 /*
81868971 * outer_start page could be small order buddy page and
....@@ -8193,10 +8978,11 @@
81938978 }
81948979
81958980 /* Make sure the range is really isolated. */
8196
- if (test_pages_isolated(outer_start, end, false)) {
8981
+ if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
81978982 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
81988983 __func__, outer_start, end);
81998984 ret = -EBUSY;
8985
+ info->err |= ACR_ERR_TEST;
82008986 goto done;
82018987 }
82028988
....@@ -8216,13 +9002,114 @@
82169002 done:
82179003 undo_isolate_page_range(pfn_max_align_down(start),
82189004 pfn_max_align_up(end), migratetype);
8219
-#ifdef CONFIG_CMA
8220
- cc.zone->cma_alloc = 0;
8221
-#endif
82229005 return ret;
82239006 }
9007
+EXPORT_SYMBOL(alloc_contig_range);
82249008
8225
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
9009
+static int __alloc_contig_pages(unsigned long start_pfn,
9010
+ unsigned long nr_pages, gfp_t gfp_mask)
9011
+{
9012
+ struct acr_info dummy;
9013
+ unsigned long end_pfn = start_pfn + nr_pages;
9014
+
9015
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
9016
+ gfp_mask, &dummy);
9017
+}
9018
+
9019
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
9020
+ unsigned long nr_pages)
9021
+{
9022
+ unsigned long i, end_pfn = start_pfn + nr_pages;
9023
+ struct page *page;
9024
+
9025
+ for (i = start_pfn; i < end_pfn; i++) {
9026
+ page = pfn_to_online_page(i);
9027
+ if (!page)
9028
+ return false;
9029
+
9030
+ if (page_zone(page) != z)
9031
+ return false;
9032
+
9033
+ if (PageReserved(page))
9034
+ return false;
9035
+
9036
+ if (page_count(page) > 0)
9037
+ return false;
9038
+
9039
+ if (PageHuge(page))
9040
+ return false;
9041
+ }
9042
+ return true;
9043
+}
9044
+
9045
+static bool zone_spans_last_pfn(const struct zone *zone,
9046
+ unsigned long start_pfn, unsigned long nr_pages)
9047
+{
9048
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
9049
+
9050
+ return zone_spans_pfn(zone, last_pfn);
9051
+}
9052
+
9053
+/**
9054
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9055
+ * @nr_pages: Number of contiguous pages to allocate
9056
+ * @gfp_mask: GFP mask to limit search and used during compaction
9057
+ * @nid: Target node
9058
+ * @nodemask: Mask for other possible nodes
9059
+ *
9060
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
9061
+ * on an applicable zonelist to find a contiguous pfn range which can then be
9062
+ * tried for allocation with alloc_contig_range(). This routine is intended
9063
+ * for allocation requests which cannot be fulfilled with the buddy allocator.
9064
+ *
9065
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
9066
+ * power of two then the alignment is guaranteed to be to the given nr_pages
9067
+ * (e.g. 1GB request would be aligned to 1GB).
9068
+ *
9069
+ * Allocated pages can be freed with free_contig_range() or by manually calling
9070
+ * __free_page() on each allocated page.
9071
+ *
9072
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
9073
+ */
9074
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9075
+ int nid, nodemask_t *nodemask)
9076
+{
9077
+ unsigned long ret, pfn, flags;
9078
+ struct zonelist *zonelist;
9079
+ struct zone *zone;
9080
+ struct zoneref *z;
9081
+
9082
+ zonelist = node_zonelist(nid, gfp_mask);
9083
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
9084
+ gfp_zone(gfp_mask), nodemask) {
9085
+ spin_lock_irqsave(&zone->lock, flags);
9086
+
9087
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9088
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9089
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9090
+ /*
9091
+ * We release the zone lock here because
9092
+ * alloc_contig_range() will also lock the zone
9093
+ * at some point. If there's an allocation
9094
+ * spinning on this lock, it may win the race
9095
+ * and cause alloc_contig_range() to fail...
9096
+ */
9097
+ spin_unlock_irqrestore(&zone->lock, flags);
9098
+ ret = __alloc_contig_pages(pfn, nr_pages,
9099
+ gfp_mask);
9100
+ if (!ret)
9101
+ return pfn_to_page(pfn);
9102
+ spin_lock_irqsave(&zone->lock, flags);
9103
+ }
9104
+ pfn += nr_pages;
9105
+ }
9106
+ spin_unlock_irqrestore(&zone->lock, flags);
9107
+ }
9108
+ return NULL;
9109
+}
9110
+#endif /* CONFIG_CONTIG_ALLOC */
9111
+
9112
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
82269113 {
82279114 unsigned int count = 0;
82289115
....@@ -8234,7 +9121,7 @@
82349121 }
82359122 WARN(count != 0, "%d pages are still in use!\n", count);
82369123 }
8237
-#endif
9124
+EXPORT_SYMBOL(free_contig_range);
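With free_contig_range() now exported alongside alloc_contig_range(), here is a minimal kernel-style usage sketch of the alloc_contig_pages() helper added above; the caller context, GFP mask and error handling are illustrative assumptions, not taken from this patch:

/* Sketch: grab 1024 physically contiguous pages, use them, give them back. */
static int example_grab_contig(void)
{
	unsigned long nr = 1024;
	struct page *pages;

	pages = alloc_contig_pages(nr, GFP_KERNEL, NUMA_NO_NODE, NULL);
	if (!pages)
		return -ENOMEM;

	/* ... use the physically contiguous range starting at page_to_pfn(pages) ... */

	free_contig_range(page_to_pfn(pages), nr);
	return 0;
}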
82389125
82399126 /*
82409127 * The zone indicated has a new number of managed_pages; batch sizes and percpu
....@@ -8242,11 +9129,8 @@
82429129 */
82439130 void __meminit zone_pcp_update(struct zone *zone)
82449131 {
8245
- unsigned cpu;
82469132 mutex_lock(&pcp_batch_high_lock);
8247
- for_each_possible_cpu(cpu)
8248
- pageset_set_high_and_batch(zone,
8249
- per_cpu_ptr(zone->pageset, cpu));
9133
+ __zone_pcp_update(zone);
82509134 mutex_unlock(&pcp_batch_high_lock);
82519135 }
82529136
....@@ -8257,7 +9141,7 @@
82579141 struct per_cpu_pageset *pset;
82589142
82599143 /* avoid races with drain_pages() */
8260
- local_irq_save(flags);
9144
+ local_lock_irqsave(&pa_lock.l, flags);
82619145 if (zone->pageset != &boot_pageset) {
82629146 for_each_online_cpu(cpu) {
82639147 pset = per_cpu_ptr(zone->pageset, cpu);
....@@ -8266,37 +9150,26 @@
82669150 free_percpu(zone->pageset);
82679151 zone->pageset = &boot_pageset;
82689152 }
8269
- local_irq_restore(flags);
9153
+ local_unlock_irqrestore(&pa_lock.l, flags);
82709154 }
82719155
82729156 #ifdef CONFIG_MEMORY_HOTREMOVE
82739157 /*
8274
- * All pages in the range must be in a single zone and isolated
8275
- * before calling this.
9158
+ * All pages in the range must be in a single zone, must not contain holes,
9159
+ * must span full sections, and must be isolated before calling this function.
82769160 */
8277
-void
8278
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9161
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
82799162 {
9163
+ unsigned long pfn = start_pfn;
82809164 struct page *page;
82819165 struct zone *zone;
8282
- unsigned int order, i;
8283
- unsigned long pfn;
9166
+ unsigned int order;
82849167 unsigned long flags;
8285
- /* find the first valid pfn */
8286
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
8287
- if (pfn_valid(pfn))
8288
- break;
8289
- if (pfn == end_pfn)
8290
- return;
9168
+
82919169 offline_mem_sections(pfn, end_pfn);
82929170 zone = page_zone(pfn_to_page(pfn));
82939171 spin_lock_irqsave(&zone->lock, flags);
8294
- pfn = start_pfn;
82959172 while (pfn < end_pfn) {
8296
- if (!pfn_valid(pfn)) {
8297
- pfn++;
8298
- continue;
8299
- }
83009173 page = pfn_to_page(pfn);
83019174 /*
83029175 * The HWPoisoned page may be not in buddy system, and
....@@ -8304,22 +9177,23 @@
83049177 */
83059178 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
83069179 pfn++;
8307
- SetPageReserved(page);
9180
+ continue;
9181
+ }
9182
+ /*
9183
+ * At this point all remaining PageOffline() pages have a
9184
+ * reference count of 0 and can simply be skipped.
9185
+ */
9186
+ if (PageOffline(page)) {
9187
+ BUG_ON(page_count(page));
9188
+ BUG_ON(PageBuddy(page));
9189
+ pfn++;
83089190 continue;
83099191 }
83109192
83119193 BUG_ON(page_count(page));
83129194 BUG_ON(!PageBuddy(page));
8313
- order = page_order(page);
8314
-#ifdef CONFIG_DEBUG_VM
8315
- pr_info("remove from free list %lx %d %lx\n",
8316
- pfn, 1 << order, end_pfn);
8317
-#endif
8318
- list_del(&page->lru);
8319
- rmv_page_order(page);
8320
- zone->free_area[order].nr_free--;
8321
- for (i = 0; i < (1 << order); i++)
8322
- SetPageReserved((page+i));
9195
+ order = buddy_order(page);
9196
+ del_page_from_free_list(page, zone, order);
83239197 pfn += (1 << order);
83249198 }
83259199 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8337,7 +9211,7 @@
83379211 for (order = 0; order < MAX_ORDER; order++) {
83389212 struct page *page_head = page - (pfn & ((1 << order) - 1));
83399213
8340
- if (PageBuddy(page_head) && page_order(page_head) >= order)
9214
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
83419215 break;
83429216 }
83439217 spin_unlock_irqrestore(&zone->lock, flags);
....@@ -8347,30 +9221,87 @@
83479221
83489222 #ifdef CONFIG_MEMORY_FAILURE
83499223 /*
8350
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
8351
- * test is performed under the zone lock to prevent a race against page
8352
- * allocation.
9224
+ * Break down a higher-order page in sub-pages, and keep our target out of
9225
+ * buddy allocator.
83539226 */
8354
-bool set_hwpoison_free_buddy_page(struct page *page)
9227
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
9228
+ struct page *target, int low, int high,
9229
+ int migratetype)
9230
+{
9231
+ unsigned long size = 1 << high;
9232
+ struct page *current_buddy, *next_page;
9233
+
9234
+ while (high > low) {
9235
+ high--;
9236
+ size >>= 1;
9237
+
9238
+ if (target >= &page[size]) {
9239
+ next_page = page + size;
9240
+ current_buddy = page;
9241
+ } else {
9242
+ next_page = page;
9243
+ current_buddy = page + size;
9244
+ }
9245
+
9246
+ if (set_page_guard(zone, current_buddy, high, migratetype))
9247
+ continue;
9248
+
9249
+ if (current_buddy != target) {
9250
+ add_to_free_list(current_buddy, zone, high, migratetype);
9251
+ set_buddy_order(current_buddy, high);
9252
+ page = next_page;
9253
+ }
9254
+ }
9255
+}
9256
+
9257
+/*
9258
+ * Take a page that will be marked as poisoned off the buddy allocator.
9259
+ */
9260
+bool take_page_off_buddy(struct page *page)
83559261 {
83569262 struct zone *zone = page_zone(page);
83579263 unsigned long pfn = page_to_pfn(page);
83589264 unsigned long flags;
83599265 unsigned int order;
8360
- bool hwpoisoned = false;
9266
+ bool ret = false;
83619267
83629268 spin_lock_irqsave(&zone->lock, flags);
83639269 for (order = 0; order < MAX_ORDER; order++) {
83649270 struct page *page_head = page - (pfn & ((1 << order) - 1));
9271
+ int page_order = buddy_order(page_head);
83659272
8366
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
8367
- if (!TestSetPageHWPoison(page))
8368
- hwpoisoned = true;
9273
+ if (PageBuddy(page_head) && page_order >= order) {
9274
+ unsigned long pfn_head = page_to_pfn(page_head);
9275
+ int migratetype = get_pfnblock_migratetype(page_head,
9276
+ pfn_head);
9277
+
9278
+ del_page_from_free_list(page_head, zone, page_order);
9279
+ break_down_buddy_pages(zone, page_head, page, 0,
9280
+ page_order, migratetype);
9281
+ if (!is_migrate_isolate(migratetype))
9282
+ __mod_zone_freepage_state(zone, -1, migratetype);
9283
+ ret = true;
83699284 break;
83709285 }
9286
+ if (page_count(page_head) > 0)
9287
+ break;
83719288 }
83729289 spin_unlock_irqrestore(&zone->lock, flags);
8373
-
8374
- return hwpoisoned;
9290
+ return ret;
83759291 }
83769292 #endif
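A worked trace of the split performed above, assuming the poisoned target sits at offset 2 inside an order-2 buddy block; the standalone loop below mirrors the effect of break_down_buddy_pages() (halve the block, free the half that does not contain the target, repeat):

#include <stdio.h>

int main(void)
{
	unsigned long base = 0x1000, target = 0x1002;	/* example pfns */
	int high = 2, low = 0;				/* order-2 block down to order 0 */
	unsigned long size = 1UL << high;
	unsigned long page = base;

	while (high > low) {
		high--;
		size >>= 1;
		if (target >= page + size) {
			/* target is in the upper half: free the lower half */
			printf("free [%#lx, %#lx) at order %d\n", page, page + size, high);
			page += size;
		} else {
			/* target is in the lower half: free the upper half */
			printf("free [%#lx, %#lx) at order %d\n", page + size, page + 2 * size, high);
		}
	}
	printf("page %#lx left isolated for poisoning\n", page);
	return 0;
}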
9293
+
9294
+#ifdef CONFIG_ZONE_DMA
9295
+bool has_managed_dma(void)
9296
+{
9297
+ struct pglist_data *pgdat;
9298
+
9299
+ for_each_online_pgdat(pgdat) {
9300
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9301
+
9302
+ if (managed_zone(zone))
9303
+ return true;
9304
+ }
9305
+ return false;
9306
+}
9307
+#endif /* CONFIG_ZONE_DMA */