.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/mm/page_alloc.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
16 | 17 | |
---|
17 | 18 | #include <linux/stddef.h> |
---|
18 | 19 | #include <linux/mm.h> |
---|
| 20 | +#include <linux/highmem.h> |
---|
19 | 21 | #include <linux/swap.h> |
---|
20 | 22 | #include <linux/interrupt.h> |
---|
21 | 23 | #include <linux/pagemap.h> |
---|
22 | 24 | #include <linux/jiffies.h> |
---|
23 | | -#include <linux/bootmem.h> |
---|
24 | 25 | #include <linux/memblock.h> |
---|
25 | 26 | #include <linux/compiler.h> |
---|
26 | 27 | #include <linux/kernel.h> |
---|
.. | .. |
---|
43 | 44 | #include <linux/mempolicy.h> |
---|
44 | 45 | #include <linux/memremap.h> |
---|
45 | 46 | #include <linux/stop_machine.h> |
---|
| 47 | +#include <linux/random.h> |
---|
46 | 48 | #include <linux/sort.h> |
---|
47 | 49 | #include <linux/pfn.h> |
---|
48 | 50 | #include <linux/backing-dev.h> |
---|
49 | 51 | #include <linux/fault-inject.h> |
---|
50 | 52 | #include <linux/page-isolation.h> |
---|
51 | | -#include <linux/page_ext.h> |
---|
52 | 53 | #include <linux/debugobjects.h> |
---|
53 | 54 | #include <linux/kmemleak.h> |
---|
54 | 55 | #include <linux/compaction.h> |
---|
.. | .. |
---|
61 | 62 | #include <linux/sched/rt.h> |
---|
62 | 63 | #include <linux/sched/mm.h> |
---|
63 | 64 | #include <linux/page_owner.h> |
---|
| 65 | +#include <linux/page_pinner.h> |
---|
64 | 66 | #include <linux/kthread.h> |
---|
65 | 67 | #include <linux/memcontrol.h> |
---|
66 | 68 | #include <linux/ftrace.h> |
---|
67 | 69 | #include <linux/lockdep.h> |
---|
68 | 70 | #include <linux/nmi.h> |
---|
69 | | -#include <linux/khugepaged.h> |
---|
70 | 71 | #include <linux/psi.h> |
---|
| 72 | +#include <linux/padata.h> |
---|
| 73 | +#include <linux/khugepaged.h> |
---|
| 74 | +#include <trace/hooks/mm.h> |
---|
| 75 | +#include <trace/hooks/vmscan.h> |
---|
71 | 76 | |
---|
72 | 77 | #include <asm/sections.h> |
---|
73 | 78 | #include <asm/tlbflush.h> |
---|
74 | 79 | #include <asm/div64.h> |
---|
75 | 80 | #include "internal.h" |
---|
| 81 | +#include "shuffle.h" |
---|
| 82 | +#include "page_reporting.h" |
---|
| 83 | + |
---|
| 84 | +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ |
---|
| 85 | +typedef int __bitwise fpi_t; |
---|
| 86 | + |
---|
| 87 | +/* No special request */ |
---|
| 88 | +#define FPI_NONE ((__force fpi_t)0) |
---|
| 89 | + |
---|
| 90 | +/* |
---|
| 91 | + * Skip free page reporting notification for the (possibly merged) page. |
---|
| 92 | + * This does not hinder free page reporting from grabbing the page, |
---|
| 93 | + * reporting it and marking it "reported" - it only skips notifying |
---|
| 94 | + * the free page reporting infrastructure about a newly freed page. For |
---|
| 95 | + * example, used when temporarily pulling a page from a freelist and |
---|
| 96 | + * putting it back unmodified. |
---|
| 97 | + */ |
---|
| 98 | +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) |
---|
| 99 | + |
---|
| 100 | +/* |
---|
| 101 | + * Place the (possibly merged) page to the tail of the freelist. Will ignore |
---|
| 102 | + * page shuffling (relevant code - e.g., memory onlining - is expected to |
---|
| 103 | + * shuffle the whole zone). |
---|
| 104 | + * |
---|
| 105 | + * Note: No code should rely on this flag for correctness - it's purely |
---|
| 106 | + * to allow for optimizations when handing back either fresh pages |
---|
| 107 | + * (memory onlining) or untouched pages (page isolation, free page |
---|
| 108 | + * reporting). |
---|
| 109 | + */ |
---|
| 110 | +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) |
---|
| 111 | + |
---|
| 112 | +/* |
---|
| 113 | + * Don't poison memory with KASAN (only for the tag-based modes). |
---|
| 114 | + * During boot, all non-reserved memblock memory is exposed to page_alloc. |
---|
| 115 | + * Poisoning all that memory lengthens boot time, especially on systems with |
---|
| 116 | + * large amounts of RAM. This flag is used to skip that poisoning. |
---|
| 117 | + * This is only done for the tag-based KASAN modes, as those are able to |
---|
| 118 | + * detect memory corruptions with the memory tags assigned by default. |
---|
| 119 | + * All memory allocated normally after boot gets poisoned as usual. |
---|
| 120 | + */ |
---|
| 121 | +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) |
---|
76 | 122 | |
---|
77 | 123 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
---|
78 | 124 | static DEFINE_MUTEX(pcp_batch_high_lock); |
---|
.. | .. |
---|
94 | 140 | */ |
---|
95 | 141 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
---|
96 | 142 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
---|
97 | | -int _node_numa_mem_[MAX_NUMNODES]; |
---|
98 | 143 | #endif |
---|
99 | 144 | |
---|
100 | 145 | /* work_structs for global per-cpu drains */ |
---|
101 | | -DEFINE_MUTEX(pcpu_drain_mutex); |
---|
102 | | -DEFINE_PER_CPU(struct work_struct, pcpu_drain); |
---|
| 146 | +struct pcpu_drain { |
---|
| 147 | + struct zone *zone; |
---|
| 148 | + struct work_struct work; |
---|
| 149 | +}; |
---|
| 150 | +static DEFINE_MUTEX(pcpu_drain_mutex); |
---|
| 151 | +static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); |
---|
103 | 152 | |
---|
104 | 153 | #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY |
---|
105 | 154 | volatile unsigned long latent_entropy __latent_entropy; |
---|
.. | .. |
---|
123 | 172 | }; |
---|
124 | 173 | EXPORT_SYMBOL(node_states); |
---|
125 | 174 | |
---|
126 | | -/* Protect totalram_pages and zone->managed_pages */ |
---|
127 | | -static DEFINE_SPINLOCK(managed_page_count_lock); |
---|
128 | | - |
---|
129 | | -unsigned long totalram_pages __read_mostly; |
---|
| 175 | +atomic_long_t _totalram_pages __read_mostly; |
---|
| 176 | +EXPORT_SYMBOL(_totalram_pages); |
---|
130 | 177 | unsigned long totalreserve_pages __read_mostly; |
---|
131 | 178 | unsigned long totalcma_pages __read_mostly; |
---|
132 | 179 | |
---|
133 | 180 | int percpu_pagelist_fraction; |
---|
134 | 181 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
---|
135 | | -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON |
---|
136 | | -DEFINE_STATIC_KEY_TRUE(init_on_alloc); |
---|
137 | | -#else |
---|
138 | 182 | DEFINE_STATIC_KEY_FALSE(init_on_alloc); |
---|
139 | | -#endif |
---|
140 | 183 | EXPORT_SYMBOL(init_on_alloc); |
---|
141 | 184 | |
---|
142 | | -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON |
---|
143 | | -DEFINE_STATIC_KEY_TRUE(init_on_free); |
---|
144 | | -#else |
---|
145 | 185 | DEFINE_STATIC_KEY_FALSE(init_on_free); |
---|
146 | | -#endif |
---|
147 | 186 | EXPORT_SYMBOL(init_on_free); |
---|
148 | 187 | |
---|
| 188 | +static bool _init_on_alloc_enabled_early __read_mostly |
---|
| 189 | + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); |
---|
149 | 190 | static int __init early_init_on_alloc(char *buf) |
---|
150 | 191 | { |
---|
151 | | - int ret; |
---|
152 | | - bool bool_result; |
---|
153 | 192 | |
---|
154 | | - if (!buf) |
---|
155 | | - return -EINVAL; |
---|
156 | | - ret = kstrtobool(buf, &bool_result); |
---|
157 | | - if (bool_result && page_poisoning_enabled()) |
---|
158 | | - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); |
---|
159 | | - if (bool_result) |
---|
160 | | - static_branch_enable(&init_on_alloc); |
---|
161 | | - else |
---|
162 | | - static_branch_disable(&init_on_alloc); |
---|
163 | | - return ret; |
---|
| 193 | + return kstrtobool(buf, &_init_on_alloc_enabled_early); |
---|
164 | 194 | } |
---|
165 | 195 | early_param("init_on_alloc", early_init_on_alloc); |
---|
166 | 196 | |
---|
| 197 | +static bool _init_on_free_enabled_early __read_mostly |
---|
| 198 | + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); |
---|
167 | 199 | static int __init early_init_on_free(char *buf) |
---|
168 | 200 | { |
---|
169 | | - int ret; |
---|
170 | | - bool bool_result; |
---|
171 | | - |
---|
172 | | - if (!buf) |
---|
173 | | - return -EINVAL; |
---|
174 | | - ret = kstrtobool(buf, &bool_result); |
---|
175 | | - if (bool_result && page_poisoning_enabled()) |
---|
176 | | - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); |
---|
177 | | - if (bool_result) |
---|
178 | | - static_branch_enable(&init_on_free); |
---|
179 | | - else |
---|
180 | | - static_branch_disable(&init_on_free); |
---|
181 | | - return ret; |
---|
| 201 | + return kstrtobool(buf, &_init_on_free_enabled_early); |
---|
182 | 202 | } |
---|
183 | 203 | early_param("init_on_free", early_init_on_free); |
---|
184 | 204 | |
---|
.. | .. |
---|
242 | 262 | unsigned int pageblock_order __read_mostly; |
---|
243 | 263 | #endif |
---|
244 | 264 | |
---|
245 | | -static void __free_pages_ok(struct page *page, unsigned int order); |
---|
| 265 | +static void __free_pages_ok(struct page *page, unsigned int order, |
---|
| 266 | + fpi_t fpi_flags); |
---|
246 | 267 | |
---|
247 | 268 | /* |
---|
248 | 269 | * results with 256, 32 in the lowmem_reserve sysctl: |
---|
.. | .. |
---|
269 | 290 | [ZONE_MOVABLE] = 0, |
---|
270 | 291 | }; |
---|
271 | 292 | |
---|
272 | | -EXPORT_SYMBOL(totalram_pages); |
---|
273 | | - |
---|
274 | 293 | static char * const zone_names[MAX_NR_ZONES] = { |
---|
275 | 294 | #ifdef CONFIG_ZONE_DMA |
---|
276 | 295 | "DMA", |
---|
.. | .. |
---|
288 | 307 | #endif |
---|
289 | 308 | }; |
---|
290 | 309 | |
---|
291 | | -char * const migratetype_names[MIGRATE_TYPES] = { |
---|
| 310 | +const char * const migratetype_names[MIGRATE_TYPES] = { |
---|
292 | 311 | "Unmovable", |
---|
293 | 312 | "Movable", |
---|
294 | 313 | "Reclaimable", |
---|
.. | .. |
---|
301 | 320 | #endif |
---|
302 | 321 | }; |
---|
303 | 322 | |
---|
304 | | -compound_page_dtor * const compound_page_dtors[] = { |
---|
305 | | - NULL, |
---|
306 | | - free_compound_page, |
---|
| 323 | +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { |
---|
| 324 | + [NULL_COMPOUND_DTOR] = NULL, |
---|
| 325 | + [COMPOUND_PAGE_DTOR] = free_compound_page, |
---|
307 | 326 | #ifdef CONFIG_HUGETLB_PAGE |
---|
308 | | - free_huge_page, |
---|
| 327 | + [HUGETLB_PAGE_DTOR] = free_huge_page, |
---|
309 | 328 | #endif |
---|
310 | 329 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
311 | | - free_transhuge_page, |
---|
| 330 | + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, |
---|
312 | 331 | #endif |
---|
313 | 332 | }; |
---|
314 | 333 | |
---|
.. | .. |
---|
319 | 338 | */ |
---|
320 | 339 | int min_free_kbytes = 1024; |
---|
321 | 340 | int user_min_free_kbytes = -1; |
---|
| 341 | +#ifdef CONFIG_DISCONTIGMEM |
---|
| 342 | +/* |
---|
| 343 | + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges |
---|
| 344 | + * are not on separate NUMA nodes. Functionally this works but with |
---|
| 345 | + * watermark_boost_factor, it can reclaim prematurely as the ranges can be |
---|
| 346 | + * quite small. By default, do not boost watermarks on discontigmem as in |
---|
| 347 | + * many cases very high-order allocations like THP are likely to be |
---|
| 348 | + * unsupported and the premature reclaim offsets the advantage of long-term |
---|
| 349 | + * fragmentation avoidance. |
---|
| 350 | + */ |
---|
| 351 | +int watermark_boost_factor __read_mostly; |
---|
| 352 | +#else |
---|
| 353 | +int watermark_boost_factor __read_mostly = 15000; |
---|
| 354 | +#endif |
---|
322 | 355 | int watermark_scale_factor = 10; |
---|
323 | 356 | |
---|
324 | 357 | /* |
---|
.. | .. |
---|
328 | 361 | */ |
---|
329 | 362 | int extra_free_kbytes = 0; |
---|
330 | 363 | |
---|
331 | | -static unsigned long nr_kernel_pages __meminitdata; |
---|
332 | | -static unsigned long nr_all_pages __meminitdata; |
---|
333 | | -static unsigned long dma_reserve __meminitdata; |
---|
| 364 | +static unsigned long nr_kernel_pages __initdata; |
---|
| 365 | +static unsigned long nr_all_pages __initdata; |
---|
| 366 | +static unsigned long dma_reserve __initdata; |
---|
334 | 367 | |
---|
335 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
336 | | -static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; |
---|
337 | | -static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; |
---|
| 368 | +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; |
---|
| 369 | +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; |
---|
338 | 370 | static unsigned long required_kernelcore __initdata; |
---|
339 | 371 | static unsigned long required_kernelcore_percent __initdata; |
---|
340 | 372 | static unsigned long required_movablecore __initdata; |
---|
341 | 373 | static unsigned long required_movablecore_percent __initdata; |
---|
342 | | -static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; |
---|
| 374 | +static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; |
---|
343 | 375 | static bool mirrored_kernelcore __meminitdata; |
---|
344 | 376 | |
---|
345 | 377 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
---|
346 | 378 | int movable_zone; |
---|
347 | 379 | EXPORT_SYMBOL(movable_zone); |
---|
348 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
349 | 380 | |
---|
350 | 381 | #if MAX_NUMNODES > 1 |
---|
351 | | -int nr_node_ids __read_mostly = MAX_NUMNODES; |
---|
352 | | -int nr_online_nodes __read_mostly = 1; |
---|
| 382 | +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; |
---|
| 383 | +unsigned int nr_online_nodes __read_mostly = 1; |
---|
353 | 384 | EXPORT_SYMBOL(nr_node_ids); |
---|
354 | 385 | EXPORT_SYMBOL(nr_online_nodes); |
---|
355 | 386 | #endif |
---|
.. | .. |
---|
365 | 396 | static DEFINE_STATIC_KEY_TRUE(deferred_pages); |
---|
366 | 397 | |
---|
367 | 398 | /* |
---|
368 | | - * Calling kasan_free_pages() only after deferred memory initialization |
---|
| 399 | + * Calling kasan_poison_pages() only after deferred memory initialization |
---|
369 | 400 | * has completed. Poisoning pages during deferred memory init will greatly |
---|
370 | 401 | * lengthen the process and cause problems in large memory systems as the |
---|
371 | 402 | * deferred pages initialization is done with interrupts disabled. |
---|
.. | .. |
---|
377 | 408 | * on-demand allocation and then freed again before the deferred pages |
---|
378 | 409 | * initialization is done, but this is not likely to happen. |
---|
379 | 410 | */ |
---|
380 | | -static inline void kasan_free_nondeferred_pages(struct page *page, int order) |
---|
| 411 | +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
---|
381 | 412 | { |
---|
382 | | - if (!static_branch_unlikely(&deferred_pages)) |
---|
383 | | - kasan_free_pages(page, order); |
---|
| 413 | + return static_branch_unlikely(&deferred_pages) || |
---|
| 414 | + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
---|
| 415 | + (fpi_flags & FPI_SKIP_KASAN_POISON)) || |
---|
| 416 | + PageSkipKASanPoison(page); |
---|
384 | 417 | } |
---|
385 | 418 | |
---|
386 | 419 | /* Returns true if the struct page for the pfn is uninitialised */ |
---|
.. | .. |
---|
395 | 428 | } |
---|
396 | 429 | |
---|
397 | 430 | /* |
---|
398 | | - * Returns false when the remaining initialisation should be deferred until |
---|
| 431 | + * Returns true when the remaining initialisation should be deferred until |
---|
399 | 432 | * later in the boot cycle when it can be parallelised. |
---|
400 | 433 | */ |
---|
401 | | -static inline bool update_defer_init(pg_data_t *pgdat, |
---|
402 | | - unsigned long pfn, unsigned long zone_end, |
---|
403 | | - unsigned long *nr_initialised) |
---|
| 434 | +static bool __meminit |
---|
| 435 | +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
---|
404 | 436 | { |
---|
405 | | - /* Always populate low zones for address-constrained allocations */ |
---|
406 | | - if (zone_end < pgdat_end_pfn(pgdat)) |
---|
407 | | - return true; |
---|
408 | | - (*nr_initialised)++; |
---|
409 | | - if ((*nr_initialised > pgdat->static_init_pgcnt) && |
---|
410 | | - (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
---|
411 | | - pgdat->first_deferred_pfn = pfn; |
---|
412 | | - return false; |
---|
| 437 | + static unsigned long prev_end_pfn, nr_initialised; |
---|
| 438 | + |
---|
| 439 | + /* |
---|
| 440 | + * prev_end_pfn is a static that contains the end of the previous zone. |
---|
| 441 | + * No need to protect because called very early in boot before smp_init. |
---|
| 442 | + */ |
---|
| 443 | + if (prev_end_pfn != end_pfn) { |
---|
| 444 | + prev_end_pfn = end_pfn; |
---|
| 445 | + nr_initialised = 0; |
---|
413 | 446 | } |
---|
414 | 447 | |
---|
415 | | - return true; |
---|
| 448 | + /* Always populate low zones for address-constrained allocations */ |
---|
| 449 | + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
---|
| 450 | + return false; |
---|
| 451 | + |
---|
| 452 | + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) |
---|
| 453 | + return true; |
---|
| 454 | + /* |
---|
| 455 | + * We start with only one section of pages; more pages are added as |
---|
| 456 | + * needed until the rest of deferred pages are initialized. |
---|
| 457 | + */ |
---|
| 458 | + nr_initialised++; |
---|
| 459 | + if ((nr_initialised > PAGES_PER_SECTION) && |
---|
| 460 | + (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
---|
| 461 | + NODE_DATA(nid)->first_deferred_pfn = pfn; |
---|
| 462 | + return true; |
---|
| 463 | + } |
---|
| 464 | + return false; |
---|
416 | 465 | } |
---|
417 | 466 | #else |
---|
418 | | -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) |
---|
| 467 | +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
---|
| 468 | +{ |
---|
| 469 | + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
---|
| 470 | + (fpi_flags & FPI_SKIP_KASAN_POISON)) || |
---|
| 471 | + PageSkipKASanPoison(page); |
---|
| 472 | +} |
---|
419 | 473 | |
---|
420 | 474 | static inline bool early_page_uninitialised(unsigned long pfn) |
---|
421 | 475 | { |
---|
422 | 476 | return false; |
---|
423 | 477 | } |
---|
424 | 478 | |
---|
425 | | -static inline bool update_defer_init(pg_data_t *pgdat, |
---|
426 | | - unsigned long pfn, unsigned long zone_end, |
---|
427 | | - unsigned long *nr_initialised) |
---|
| 479 | +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
---|
428 | 480 | { |
---|
429 | | - return true; |
---|
| 481 | + return false; |
---|
430 | 482 | } |
---|
431 | 483 | #endif |
---|
432 | 484 | |
---|
.. | .. |
---|
435 | 487 | unsigned long pfn) |
---|
436 | 488 | { |
---|
437 | 489 | #ifdef CONFIG_SPARSEMEM |
---|
438 | | - return __pfn_to_section(pfn)->pageblock_flags; |
---|
| 490 | + return section_to_usemap(__pfn_to_section(pfn)); |
---|
439 | 491 | #else |
---|
440 | 492 | return page_zone(page)->pageblock_flags; |
---|
441 | 493 | #endif /* CONFIG_SPARSEMEM */ |
---|
.. | .. |
---|
445 | 497 | { |
---|
446 | 498 | #ifdef CONFIG_SPARSEMEM |
---|
447 | 499 | pfn &= (PAGES_PER_SECTION-1); |
---|
448 | | - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
449 | 500 | #else |
---|
450 | 501 | pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); |
---|
451 | | - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
452 | 502 | #endif /* CONFIG_SPARSEMEM */ |
---|
| 503 | + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
453 | 504 | } |
---|
454 | 505 | |
---|
455 | 506 | /** |
---|
456 | 507 | * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages |
---|
457 | 508 | * @page: The page within the block of interest |
---|
458 | 509 | * @pfn: The target page frame number |
---|
459 | | - * @end_bitidx: The last bit of interest to retrieve |
---|
460 | 510 | * @mask: mask of bits that the caller is interested in |
---|
461 | 511 | * |
---|
462 | 512 | * Return: pageblock_bits flags |
---|
463 | 513 | */ |
---|
464 | | -static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, |
---|
| 514 | +static __always_inline |
---|
| 515 | +unsigned long __get_pfnblock_flags_mask(struct page *page, |
---|
465 | 516 | unsigned long pfn, |
---|
466 | | - unsigned long end_bitidx, |
---|
467 | 517 | unsigned long mask) |
---|
468 | 518 | { |
---|
469 | 519 | unsigned long *bitmap; |
---|
.. | .. |
---|
476 | 526 | bitidx &= (BITS_PER_LONG-1); |
---|
477 | 527 | |
---|
478 | 528 | word = bitmap[word_bitidx]; |
---|
479 | | - bitidx += end_bitidx; |
---|
480 | | - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; |
---|
| 529 | + return (word >> bitidx) & mask; |
---|
481 | 530 | } |
---|
482 | 531 | |
---|
483 | 532 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
---|
484 | | - unsigned long end_bitidx, |
---|
485 | 533 | unsigned long mask) |
---|
486 | 534 | { |
---|
487 | | - return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); |
---|
| 535 | + return __get_pfnblock_flags_mask(page, pfn, mask); |
---|
488 | 536 | } |
---|
| 537 | +EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask); |
---|
| 538 | + |
---|
| 539 | +int isolate_anon_lru_page(struct page *page) |
---|
| 540 | +{ |
---|
| 541 | + int ret; |
---|
| 542 | + |
---|
| 543 | + if (!PageLRU(page) || !PageAnon(page)) |
---|
| 544 | + return -EINVAL; |
---|
| 545 | + |
---|
| 546 | + if (!get_page_unless_zero(page)) |
---|
| 547 | + return -EINVAL; |
---|
| 548 | + |
---|
| 549 | + ret = isolate_lru_page(page); |
---|
| 550 | + put_page(page); |
---|
| 551 | + |
---|
| 552 | + return ret; |
---|
| 553 | +} |
---|
| 554 | +EXPORT_SYMBOL_GPL(isolate_anon_lru_page); |
---|
489 | 555 | |
---|
490 | 556 | static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) |
---|
491 | 557 | { |
---|
492 | | - return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); |
---|
| 558 | + return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); |
---|
493 | 559 | } |
---|
494 | 560 | |
---|
495 | 561 | /** |
---|
.. | .. |
---|
497 | 563 | * @page: The page within the block of interest |
---|
498 | 564 | * @flags: The flags to set |
---|
499 | 565 | * @pfn: The target page frame number |
---|
500 | | - * @end_bitidx: The last bit of interest |
---|
501 | 566 | * @mask: mask of bits that the caller is interested in |
---|
502 | 567 | */ |
---|
503 | 568 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
---|
504 | 569 | unsigned long pfn, |
---|
505 | | - unsigned long end_bitidx, |
---|
506 | 570 | unsigned long mask) |
---|
507 | 571 | { |
---|
508 | 572 | unsigned long *bitmap; |
---|
.. | .. |
---|
510 | 574 | unsigned long old_word, word; |
---|
511 | 575 | |
---|
512 | 576 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
---|
| 577 | + BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); |
---|
513 | 578 | |
---|
514 | 579 | bitmap = get_pageblock_bitmap(page, pfn); |
---|
515 | 580 | bitidx = pfn_to_bitidx(page, pfn); |
---|
.. | .. |
---|
518 | 583 | |
---|
519 | 584 | VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); |
---|
520 | 585 | |
---|
521 | | - bitidx += end_bitidx; |
---|
522 | | - mask <<= (BITS_PER_LONG - bitidx - 1); |
---|
523 | | - flags <<= (BITS_PER_LONG - bitidx - 1); |
---|
| 586 | + mask <<= bitidx; |
---|
| 587 | + flags <<= bitidx; |
---|
524 | 588 | |
---|
525 | 589 | word = READ_ONCE(bitmap[word_bitidx]); |
---|
526 | 590 | for (;;) { |
---|
.. | .. |
---|
537 | 601 | migratetype < MIGRATE_PCPTYPES)) |
---|
538 | 602 | migratetype = MIGRATE_UNMOVABLE; |
---|
539 | 603 | |
---|
540 | | - set_pageblock_flags_group(page, (unsigned long)migratetype, |
---|
541 | | - PB_migrate, PB_migrate_end); |
---|
| 604 | + set_pfnblock_flags_mask(page, (unsigned long)migratetype, |
---|
| 605 | + page_to_pfn(page), MIGRATETYPE_MASK); |
---|
542 | 606 | } |
---|
543 | 607 | |
---|
544 | 608 | #ifdef CONFIG_DEBUG_VM |
---|
.. | .. |
---|
593 | 657 | } |
---|
594 | 658 | #endif |
---|
595 | 659 | |
---|
596 | | -static void bad_page(struct page *page, const char *reason, |
---|
597 | | - unsigned long bad_flags) |
---|
| 660 | +static void bad_page(struct page *page, const char *reason) |
---|
598 | 661 | { |
---|
599 | 662 | static unsigned long resume; |
---|
600 | 663 | static unsigned long nr_shown; |
---|
.. | .. |
---|
623 | 686 | pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
---|
624 | 687 | current->comm, page_to_pfn(page)); |
---|
625 | 688 | __dump_page(page, reason); |
---|
626 | | - bad_flags &= page->flags; |
---|
627 | | - if (bad_flags) |
---|
628 | | - pr_alert("bad because of flags: %#lx(%pGp)\n", |
---|
629 | | - bad_flags, &bad_flags); |
---|
630 | 689 | dump_page_owner(page); |
---|
631 | 690 | |
---|
632 | 691 | print_modules(); |
---|
.. | .. |
---|
654 | 713 | |
---|
655 | 714 | void free_compound_page(struct page *page) |
---|
656 | 715 | { |
---|
657 | | - __free_pages_ok(page, compound_order(page)); |
---|
| 716 | + mem_cgroup_uncharge(page); |
---|
| 717 | + __free_pages_ok(page, compound_order(page), FPI_NONE); |
---|
658 | 718 | } |
---|
659 | 719 | |
---|
660 | 720 | void prep_compound_page(struct page *page, unsigned int order) |
---|
.. | .. |
---|
662 | 722 | int i; |
---|
663 | 723 | int nr_pages = 1 << order; |
---|
664 | 724 | |
---|
665 | | - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); |
---|
666 | | - set_compound_order(page, order); |
---|
667 | 725 | __SetPageHead(page); |
---|
668 | 726 | for (i = 1; i < nr_pages; i++) { |
---|
669 | 727 | struct page *p = page + i; |
---|
.. | .. |
---|
671 | 729 | p->mapping = TAIL_MAPPING; |
---|
672 | 730 | set_compound_head(p, page); |
---|
673 | 731 | } |
---|
| 732 | + |
---|
| 733 | + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); |
---|
| 734 | + set_compound_order(page, order); |
---|
674 | 735 | atomic_set(compound_mapcount_ptr(page), -1); |
---|
| 736 | + if (hpage_pincount_available(page)) |
---|
| 737 | + atomic_set(compound_pincount_ptr(page), 0); |
---|
675 | 738 | } |
---|
676 | 739 | |
---|
677 | 740 | #ifdef CONFIG_DEBUG_PAGEALLOC |
---|
678 | 741 | unsigned int _debug_guardpage_minorder; |
---|
679 | | -bool _debug_pagealloc_enabled __read_mostly |
---|
| 742 | + |
---|
| 743 | +bool _debug_pagealloc_enabled_early __read_mostly |
---|
680 | 744 | = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); |
---|
| 745 | +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); |
---|
| 746 | +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); |
---|
681 | 747 | EXPORT_SYMBOL(_debug_pagealloc_enabled); |
---|
682 | | -bool _debug_guardpage_enabled __read_mostly; |
---|
| 748 | + |
---|
| 749 | +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); |
---|
683 | 750 | |
---|
684 | 751 | static int __init early_debug_pagealloc(char *buf) |
---|
685 | 752 | { |
---|
686 | | - if (!buf) |
---|
687 | | - return -EINVAL; |
---|
688 | | - return kstrtobool(buf, &_debug_pagealloc_enabled); |
---|
| 753 | + return kstrtobool(buf, &_debug_pagealloc_enabled_early); |
---|
689 | 754 | } |
---|
690 | 755 | early_param("debug_pagealloc", early_debug_pagealloc); |
---|
691 | | - |
---|
692 | | -static bool need_debug_guardpage(void) |
---|
693 | | -{ |
---|
694 | | - /* If we don't use debug_pagealloc, we don't need guard page */ |
---|
695 | | - if (!debug_pagealloc_enabled()) |
---|
696 | | - return false; |
---|
697 | | - |
---|
698 | | - if (!debug_guardpage_minorder()) |
---|
699 | | - return false; |
---|
700 | | - |
---|
701 | | - return true; |
---|
702 | | -} |
---|
703 | | - |
---|
704 | | -static void init_debug_guardpage(void) |
---|
705 | | -{ |
---|
706 | | - if (!debug_pagealloc_enabled()) |
---|
707 | | - return; |
---|
708 | | - |
---|
709 | | - if (!debug_guardpage_minorder()) |
---|
710 | | - return; |
---|
711 | | - |
---|
712 | | - _debug_guardpage_enabled = true; |
---|
713 | | -} |
---|
714 | | - |
---|
715 | | -struct page_ext_operations debug_guardpage_ops = { |
---|
716 | | - .need = need_debug_guardpage, |
---|
717 | | - .init = init_debug_guardpage, |
---|
718 | | -}; |
---|
719 | 756 | |
---|
720 | 757 | static int __init debug_guardpage_minorder_setup(char *buf) |
---|
721 | 758 | { |
---|
.. | .. |
---|
734 | 771 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
---|
735 | 772 | unsigned int order, int migratetype) |
---|
736 | 773 | { |
---|
737 | | - struct page_ext *page_ext; |
---|
738 | | - |
---|
739 | 774 | if (!debug_guardpage_enabled()) |
---|
740 | 775 | return false; |
---|
741 | 776 | |
---|
742 | 777 | if (order >= debug_guardpage_minorder()) |
---|
743 | 778 | return false; |
---|
744 | 779 | |
---|
745 | | - page_ext = lookup_page_ext(page); |
---|
746 | | - if (unlikely(!page_ext)) |
---|
747 | | - return false; |
---|
748 | | - |
---|
749 | | - __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
---|
750 | | - |
---|
| 780 | + __SetPageGuard(page); |
---|
751 | 781 | INIT_LIST_HEAD(&page->lru); |
---|
752 | 782 | set_page_private(page, order); |
---|
753 | 783 | /* Guard pages are not available for any usage */ |
---|
.. | .. |
---|
759 | 789 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
---|
760 | 790 | unsigned int order, int migratetype) |
---|
761 | 791 | { |
---|
762 | | - struct page_ext *page_ext; |
---|
763 | | - |
---|
764 | 792 | if (!debug_guardpage_enabled()) |
---|
765 | 793 | return; |
---|
766 | 794 | |
---|
767 | | - page_ext = lookup_page_ext(page); |
---|
768 | | - if (unlikely(!page_ext)) |
---|
769 | | - return; |
---|
770 | | - |
---|
771 | | - __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
---|
| 795 | + __ClearPageGuard(page); |
---|
772 | 796 | |
---|
773 | 797 | set_page_private(page, 0); |
---|
774 | 798 | if (!is_migrate_isolate(migratetype)) |
---|
775 | 799 | __mod_zone_freepage_state(zone, (1 << order), migratetype); |
---|
776 | 800 | } |
---|
777 | 801 | #else |
---|
778 | | -struct page_ext_operations debug_guardpage_ops; |
---|
779 | 802 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
---|
780 | 803 | unsigned int order, int migratetype) { return false; } |
---|
781 | 804 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
---|
782 | 805 | unsigned int order, int migratetype) {} |
---|
783 | 806 | #endif |
---|
784 | 807 | |
---|
785 | | -static inline void set_page_order(struct page *page, unsigned int order) |
---|
| 808 | +/* |
---|
| 809 | + * Enable static keys related to various memory debugging and hardening options. |
---|
| 810 | + * Some override others, and depend on early params that are evaluated in the |
---|
| 811 | + * order of appearance. So we need to first gather the full picture of what was |
---|
| 812 | + * enabled, and then make decisions. |
---|
| 813 | + */ |
---|
| 814 | +void init_mem_debugging_and_hardening(void) |
---|
| 815 | +{ |
---|
| 816 | + bool page_poisoning_requested = false; |
---|
| 817 | + |
---|
| 818 | +#ifdef CONFIG_PAGE_POISONING |
---|
| 819 | + /* |
---|
| 820 | + * Page poisoning is debug page alloc for some arches. If |
---|
| 821 | + * either of those options is enabled, enable poisoning. |
---|
| 822 | + */ |
---|
| 823 | + if (page_poisoning_enabled() || |
---|
| 824 | + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && |
---|
| 825 | + debug_pagealloc_enabled())) { |
---|
| 826 | + static_branch_enable(&_page_poisoning_enabled); |
---|
| 827 | + page_poisoning_requested = true; |
---|
| 828 | + } |
---|
| 829 | +#endif |
---|
| 830 | + |
---|
| 831 | + if (_init_on_alloc_enabled_early) { |
---|
| 832 | + if (page_poisoning_requested) |
---|
| 833 | + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
---|
| 834 | + "will take precedence over init_on_alloc\n"); |
---|
| 835 | + else |
---|
| 836 | + static_branch_enable(&init_on_alloc); |
---|
| 837 | + } |
---|
| 838 | + if (_init_on_free_enabled_early) { |
---|
| 839 | + if (page_poisoning_requested) |
---|
| 840 | + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
---|
| 841 | + "will take precedence over init_on_free\n"); |
---|
| 842 | + else |
---|
| 843 | + static_branch_enable(&init_on_free); |
---|
| 844 | + } |
---|
| 845 | + |
---|
| 846 | +#ifdef CONFIG_DEBUG_PAGEALLOC |
---|
| 847 | + if (!debug_pagealloc_enabled()) |
---|
| 848 | + return; |
---|
| 849 | + |
---|
| 850 | + static_branch_enable(&_debug_pagealloc_enabled); |
---|
| 851 | + |
---|
| 852 | + if (!debug_guardpage_minorder()) |
---|
| 853 | + return; |
---|
| 854 | + |
---|
| 855 | + static_branch_enable(&_debug_guardpage_enabled); |
---|
| 856 | +#endif |
---|
| 857 | +} |
---|
| 858 | + |
---|
| 859 | +static inline void set_buddy_order(struct page *page, unsigned int order) |
---|
786 | 860 | { |
---|
787 | 861 | set_page_private(page, order); |
---|
788 | 862 | __SetPageBuddy(page); |
---|
789 | | -} |
---|
790 | | - |
---|
791 | | -static inline void rmv_page_order(struct page *page) |
---|
792 | | -{ |
---|
793 | | - __ClearPageBuddy(page); |
---|
794 | | - set_page_private(page, 0); |
---|
795 | 863 | } |
---|
796 | 864 | |
---|
797 | 865 | /* |
---|
.. | .. |
---|
807 | 875 | * |
---|
808 | 876 | * For recording page's order, we use page_private(page). |
---|
809 | 877 | */ |
---|
810 | | -static inline int page_is_buddy(struct page *page, struct page *buddy, |
---|
| 878 | +static inline bool page_is_buddy(struct page *page, struct page *buddy, |
---|
811 | 879 | unsigned int order) |
---|
812 | 880 | { |
---|
813 | | - if (page_is_guard(buddy) && page_order(buddy) == order) { |
---|
814 | | - if (page_zone_id(page) != page_zone_id(buddy)) |
---|
815 | | - return 0; |
---|
| 881 | + if (!page_is_guard(buddy) && !PageBuddy(buddy)) |
---|
| 882 | + return false; |
---|
816 | 883 | |
---|
817 | | - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
| 884 | + if (buddy_order(buddy) != order) |
---|
| 885 | + return false; |
---|
818 | 886 | |
---|
819 | | - return 1; |
---|
820 | | - } |
---|
| 887 | + /* |
---|
| 888 | + * zone check is done late to avoid uselessly calculating |
---|
| 889 | + * zone/node ids for pages that could never merge. |
---|
| 890 | + */ |
---|
| 891 | + if (page_zone_id(page) != page_zone_id(buddy)) |
---|
| 892 | + return false; |
---|
821 | 893 | |
---|
822 | | - if (PageBuddy(buddy) && page_order(buddy) == order) { |
---|
823 | | - /* |
---|
824 | | - * zone check is done late to avoid uselessly |
---|
825 | | - * calculating zone/node ids for pages that could |
---|
826 | | - * never merge. |
---|
827 | | - */ |
---|
828 | | - if (page_zone_id(page) != page_zone_id(buddy)) |
---|
829 | | - return 0; |
---|
| 894 | + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
830 | 895 | |
---|
831 | | - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
| 896 | + return true; |
---|
| 897 | +} |
---|
832 | 898 | |
---|
833 | | - return 1; |
---|
834 | | - } |
---|
835 | | - return 0; |
---|
| 899 | +#ifdef CONFIG_COMPACTION |
---|
| 900 | +static inline struct capture_control *task_capc(struct zone *zone) |
---|
| 901 | +{ |
---|
| 902 | + struct capture_control *capc = current->capture_control; |
---|
| 903 | + |
---|
| 904 | + return unlikely(capc) && |
---|
| 905 | + !(current->flags & PF_KTHREAD) && |
---|
| 906 | + !capc->page && |
---|
| 907 | + capc->cc->zone == zone ? capc : NULL; |
---|
| 908 | +} |
---|
| 909 | + |
---|
| 910 | +static inline bool |
---|
| 911 | +compaction_capture(struct capture_control *capc, struct page *page, |
---|
| 912 | + int order, int migratetype) |
---|
| 913 | +{ |
---|
| 914 | + if (!capc || order != capc->cc->order) |
---|
| 915 | + return false; |
---|
| 916 | + |
---|
| 917 | + /* Do not accidentally pollute CMA or isolated regions */ |
---|
| 918 | + if (is_migrate_cma(migratetype) || |
---|
| 919 | + is_migrate_isolate(migratetype)) |
---|
| 920 | + return false; |
---|
| 921 | + |
---|
| 922 | + /* |
---|
| 923 | + * Do not let lower order allocations pollute a movable pageblock. |
---|
| 924 | + * This might let an unmovable request use a reclaimable pageblock |
---|
| 925 | + * and vice-versa but no more than normal fallback logic which can |
---|
| 926 | + * have trouble finding a high-order free page. |
---|
| 927 | + */ |
---|
| 928 | + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) |
---|
| 929 | + return false; |
---|
| 930 | + |
---|
| 931 | + capc->page = page; |
---|
| 932 | + return true; |
---|
| 933 | +} |
---|
| 934 | + |
---|
| 935 | +#else |
---|
| 936 | +static inline struct capture_control *task_capc(struct zone *zone) |
---|
| 937 | +{ |
---|
| 938 | + return NULL; |
---|
| 939 | +} |
---|
| 940 | + |
---|
| 941 | +static inline bool |
---|
| 942 | +compaction_capture(struct capture_control *capc, struct page *page, |
---|
| 943 | + int order, int migratetype) |
---|
| 944 | +{ |
---|
| 945 | + return false; |
---|
| 946 | +} |
---|
| 947 | +#endif /* CONFIG_COMPACTION */ |
---|
| 948 | + |
---|
| 949 | +/* Used for pages not on another list */ |
---|
| 950 | +static inline void add_to_free_list(struct page *page, struct zone *zone, |
---|
| 951 | + unsigned int order, int migratetype) |
---|
| 952 | +{ |
---|
| 953 | + struct free_area *area = &zone->free_area[order]; |
---|
| 954 | + |
---|
| 955 | + list_add(&page->lru, &area->free_list[migratetype]); |
---|
| 956 | + area->nr_free++; |
---|
| 957 | +} |
---|
| 958 | + |
---|
| 959 | +/* Used for pages not on another list */ |
---|
| 960 | +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, |
---|
| 961 | + unsigned int order, int migratetype) |
---|
| 962 | +{ |
---|
| 963 | + struct free_area *area = &zone->free_area[order]; |
---|
| 964 | + |
---|
| 965 | + list_add_tail(&page->lru, &area->free_list[migratetype]); |
---|
| 966 | + area->nr_free++; |
---|
| 967 | +} |
---|
| 968 | + |
---|
| 969 | +/* |
---|
| 970 | + * Used for pages which are on another list. Move the pages to the tail |
---|
| 971 | + * of the list - so the moved pages won't immediately be considered for |
---|
| 972 | + * allocation again (e.g., optimization for memory onlining). |
---|
| 973 | + */ |
---|
| 974 | +static inline void move_to_free_list(struct page *page, struct zone *zone, |
---|
| 975 | + unsigned int order, int migratetype) |
---|
| 976 | +{ |
---|
| 977 | + struct free_area *area = &zone->free_area[order]; |
---|
| 978 | + |
---|
| 979 | + list_move_tail(&page->lru, &area->free_list[migratetype]); |
---|
| 980 | +} |
---|
| 981 | + |
---|
| 982 | +static inline void del_page_from_free_list(struct page *page, struct zone *zone, |
---|
| 983 | + unsigned int order) |
---|
| 984 | +{ |
---|
| 985 | + /* clear reported state and update reported page count */ |
---|
| 986 | + if (page_reported(page)) |
---|
| 987 | + __ClearPageReported(page); |
---|
| 988 | + |
---|
| 989 | + list_del(&page->lru); |
---|
| 990 | + __ClearPageBuddy(page); |
---|
| 991 | + set_page_private(page, 0); |
---|
| 992 | + zone->free_area[order].nr_free--; |
---|
| 993 | +} |
---|
| 994 | + |
---|
| 995 | +/* |
---|
| 996 | + * If this is not the largest possible page, check if the buddy |
---|
| 997 | + * of the next-highest order is free. If it is, it's possible |
---|
| 998 | + * that pages are being freed that will coalesce soon. In case |
---|
| 999 | + * that is happening, add the free page to the tail of the list |
---|
| 1000 | + * so it's less likely to be used soon and more likely to be merged |
---|
| 1001 | + * as a higher order page |
---|
| 1002 | + */ |
---|
| 1003 | +static inline bool |
---|
| 1004 | +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, |
---|
| 1005 | + struct page *page, unsigned int order) |
---|
| 1006 | +{ |
---|
| 1007 | + struct page *higher_page, *higher_buddy; |
---|
| 1008 | + unsigned long combined_pfn; |
---|
| 1009 | + |
---|
| 1010 | + if (order >= MAX_ORDER - 2) |
---|
| 1011 | + return false; |
---|
| 1012 | + |
---|
| 1013 | + if (!pfn_valid_within(buddy_pfn)) |
---|
| 1014 | + return false; |
---|
| 1015 | + |
---|
| 1016 | + combined_pfn = buddy_pfn & pfn; |
---|
| 1017 | + higher_page = page + (combined_pfn - pfn); |
---|
| 1018 | + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); |
---|
| 1019 | + higher_buddy = higher_page + (buddy_pfn - combined_pfn); |
---|
| 1020 | + |
---|
| 1021 | + return pfn_valid_within(buddy_pfn) && |
---|
| 1022 | + page_is_buddy(higher_page, higher_buddy, order + 1); |
---|
836 | 1023 | } |
---|
837 | 1024 | |
---|
838 | 1025 | /* |
---|
.. | .. |
---|
862 | 1049 | static inline void __free_one_page(struct page *page, |
---|
863 | 1050 | unsigned long pfn, |
---|
864 | 1051 | struct zone *zone, unsigned int order, |
---|
865 | | - int migratetype) |
---|
| 1052 | + int migratetype, fpi_t fpi_flags) |
---|
866 | 1053 | { |
---|
| 1054 | + struct capture_control *capc = task_capc(zone); |
---|
| 1055 | + unsigned long buddy_pfn; |
---|
867 | 1056 | unsigned long combined_pfn; |
---|
868 | | - unsigned long uninitialized_var(buddy_pfn); |
---|
869 | | - struct page *buddy; |
---|
870 | 1057 | unsigned int max_order; |
---|
| 1058 | + struct page *buddy; |
---|
| 1059 | + bool to_tail; |
---|
871 | 1060 | |
---|
872 | 1061 | max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); |
---|
873 | 1062 | |
---|
.. | .. |
---|
883 | 1072 | |
---|
884 | 1073 | continue_merging: |
---|
885 | 1074 | while (order < max_order) { |
---|
| 1075 | + if (compaction_capture(capc, page, order, migratetype)) { |
---|
| 1076 | + __mod_zone_freepage_state(zone, -(1 << order), |
---|
| 1077 | + migratetype); |
---|
| 1078 | + return; |
---|
| 1079 | + } |
---|
886 | 1080 | buddy_pfn = __find_buddy_pfn(pfn, order); |
---|
887 | 1081 | buddy = page + (buddy_pfn - pfn); |
---|
888 | 1082 | |
---|
.. | .. |
---|
894 | 1088 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
---|
895 | 1089 | * merge with it and move up one order. |
---|
896 | 1090 | */ |
---|
897 | | - if (page_is_guard(buddy)) { |
---|
| 1091 | + if (page_is_guard(buddy)) |
---|
898 | 1092 | clear_page_guard(zone, buddy, order, migratetype); |
---|
899 | | - } else { |
---|
900 | | - list_del(&buddy->lru); |
---|
901 | | - zone->free_area[order].nr_free--; |
---|
902 | | - rmv_page_order(buddy); |
---|
903 | | - } |
---|
| 1093 | + else |
---|
| 1094 | + del_page_from_free_list(buddy, zone, order); |
---|
904 | 1095 | combined_pfn = buddy_pfn & pfn; |
---|
905 | 1096 | page = page + (combined_pfn - pfn); |
---|
906 | 1097 | pfn = combined_pfn; |
---|
.. | .. |
---|
932 | 1123 | } |
---|
933 | 1124 | |
---|
934 | 1125 | done_merging: |
---|
935 | | - set_page_order(page, order); |
---|
| 1126 | + set_buddy_order(page, order); |
---|
936 | 1127 | |
---|
937 | | - /* |
---|
938 | | - * If this is not the largest possible page, check if the buddy |
---|
939 | | - * of the next-highest order is free. If it is, it's possible |
---|
940 | | - * that pages are being freed that will coalesce soon. In case, |
---|
941 | | - * that is happening, add the free page to the tail of the list |
---|
942 | | - * so it's less likely to be used soon and more likely to be merged |
---|
943 | | - * as a higher order page |
---|
944 | | - */ |
---|
945 | | - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { |
---|
946 | | - struct page *higher_page, *higher_buddy; |
---|
947 | | - combined_pfn = buddy_pfn & pfn; |
---|
948 | | - higher_page = page + (combined_pfn - pfn); |
---|
949 | | - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); |
---|
950 | | - higher_buddy = higher_page + (buddy_pfn - combined_pfn); |
---|
951 | | - if (pfn_valid_within(buddy_pfn) && |
---|
952 | | - page_is_buddy(higher_page, higher_buddy, order + 1)) { |
---|
953 | | - list_add_tail(&page->lru, |
---|
954 | | - &zone->free_area[order].free_list[migratetype]); |
---|
955 | | - goto out; |
---|
956 | | - } |
---|
957 | | - } |
---|
| 1128 | + if (fpi_flags & FPI_TO_TAIL) |
---|
| 1129 | + to_tail = true; |
---|
| 1130 | + else if (is_shuffle_order(order)) |
---|
| 1131 | + to_tail = shuffle_pick_tail(); |
---|
| 1132 | + else |
---|
| 1133 | + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); |
---|
958 | 1134 | |
---|
959 | | - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); |
---|
960 | | -out: |
---|
961 | | - zone->free_area[order].nr_free++; |
---|
| 1135 | + if (to_tail) |
---|
| 1136 | + add_to_free_list_tail(page, zone, order, migratetype); |
---|
| 1137 | + else |
---|
| 1138 | + add_to_free_list(page, zone, order, migratetype); |
---|
| 1139 | + |
---|
| 1140 | + /* Notify page reporting subsystem of freed page */ |
---|
| 1141 | + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) |
---|
| 1142 | + page_reporting_notify_free(order); |
---|
962 | 1143 | } |
---|
963 | 1144 | |
---|
964 | 1145 | /* |
---|
.. | .. |
---|
983 | 1164 | return true; |
---|
984 | 1165 | } |
---|
985 | 1166 | |
---|
986 | | -static void free_pages_check_bad(struct page *page) |
---|
| 1167 | +static const char *page_bad_reason(struct page *page, unsigned long flags) |
---|
987 | 1168 | { |
---|
988 | | - const char *bad_reason; |
---|
989 | | - unsigned long bad_flags; |
---|
990 | | - |
---|
991 | | - bad_reason = NULL; |
---|
992 | | - bad_flags = 0; |
---|
| 1169 | + const char *bad_reason = NULL; |
---|
993 | 1170 | |
---|
994 | 1171 | if (unlikely(atomic_read(&page->_mapcount) != -1)) |
---|
995 | 1172 | bad_reason = "nonzero mapcount"; |
---|
.. | .. |
---|
997 | 1174 | bad_reason = "non-NULL mapping"; |
---|
998 | 1175 | if (unlikely(page_ref_count(page) != 0)) |
---|
999 | 1176 | bad_reason = "nonzero _refcount"; |
---|
1000 | | - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { |
---|
1001 | | - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
---|
1002 | | - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; |
---|
| 1177 | + if (unlikely(page->flags & flags)) { |
---|
| 1178 | + if (flags == PAGE_FLAGS_CHECK_AT_PREP) |
---|
| 1179 | + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; |
---|
| 1180 | + else |
---|
| 1181 | + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
---|
1003 | 1182 | } |
---|
1004 | 1183 | #ifdef CONFIG_MEMCG |
---|
1005 | 1184 | if (unlikely(page->mem_cgroup)) |
---|
1006 | 1185 | bad_reason = "page still charged to cgroup"; |
---|
1007 | 1186 | #endif |
---|
1008 | | - bad_page(page, bad_reason, bad_flags); |
---|
| 1187 | + return bad_reason; |
---|
1009 | 1188 | } |
---|
1010 | 1189 | |
---|
1011 | | -static inline int free_pages_check(struct page *page) |
---|
| 1190 | +static void check_free_page_bad(struct page *page) |
---|
| 1191 | +{ |
---|
| 1192 | + bad_page(page, |
---|
| 1193 | + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); |
---|
| 1194 | +} |
---|
| 1195 | + |
---|
| 1196 | +static inline int check_free_page(struct page *page) |
---|
1012 | 1197 | { |
---|
1013 | 1198 | if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) |
---|
1014 | 1199 | return 0; |
---|
1015 | 1200 | |
---|
1016 | 1201 | /* Something has gone sideways, find it */ |
---|
1017 | | - free_pages_check_bad(page); |
---|
| 1202 | + check_free_page_bad(page); |
---|
1018 | 1203 | return 1; |
---|
1019 | 1204 | } |
---|
1020 | 1205 | |
---|
.. | .. |
---|
1036 | 1221 | case 1: |
---|
1037 | 1222 | /* the first tail page: ->mapping may be compound_mapcount() */ |
---|
1038 | 1223 | if (unlikely(compound_mapcount(page))) { |
---|
1039 | | - bad_page(page, "nonzero compound_mapcount", 0); |
---|
| 1224 | + bad_page(page, "nonzero compound_mapcount"); |
---|
1040 | 1225 | goto out; |
---|
1041 | 1226 | } |
---|
1042 | 1227 | break; |
---|
.. | .. |
---|
1048 | 1233 | break; |
---|
1049 | 1234 | default: |
---|
1050 | 1235 | if (page->mapping != TAIL_MAPPING) { |
---|
1051 | | - bad_page(page, "corrupted mapping in tail page", 0); |
---|
| 1236 | + bad_page(page, "corrupted mapping in tail page"); |
---|
1052 | 1237 | goto out; |
---|
1053 | 1238 | } |
---|
1054 | 1239 | break; |
---|
1055 | 1240 | } |
---|
1056 | 1241 | if (unlikely(!PageTail(page))) { |
---|
1057 | | - bad_page(page, "PageTail not set", 0); |
---|
| 1242 | + bad_page(page, "PageTail not set"); |
---|
1058 | 1243 | goto out; |
---|
1059 | 1244 | } |
---|
1060 | 1245 | if (unlikely(compound_head(page) != head_page)) { |
---|
1061 | | - bad_page(page, "compound_head not consistent", 0); |
---|
| 1246 | + bad_page(page, "compound_head not consistent"); |
---|
1062 | 1247 | goto out; |
---|
1063 | 1248 | } |
---|
1064 | 1249 | ret = 0; |
---|
.. | .. |
---|
1068 | 1253 | return ret; |
---|
1069 | 1254 | } |
---|
1070 | 1255 | |
---|
1071 | | -static void kernel_init_free_pages(struct page *page, int numpages) |
---|
| 1256 | +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) |
---|
1072 | 1257 | { |
---|
1073 | 1258 | int i; |
---|
1074 | 1259 | |
---|
| 1260 | + if (zero_tags) { |
---|
| 1261 | + for (i = 0; i < numpages; i++) |
---|
| 1262 | + tag_clear_highpage(page + i); |
---|
| 1263 | + return; |
---|
| 1264 | + } |
---|
| 1265 | + |
---|
1075 | 1266 | /* s390's use of memset() could override KASAN redzones. */ |
---|
1076 | 1267 | kasan_disable_current(); |
---|
1077 | | - for (i = 0; i < numpages; i++) |
---|
| 1268 | + for (i = 0; i < numpages; i++) { |
---|
| 1269 | + u8 tag = page_kasan_tag(page + i); |
---|
| 1270 | + page_kasan_tag_reset(page + i); |
---|
1078 | 1271 | clear_highpage(page + i); |
---|
| 1272 | + page_kasan_tag_set(page + i, tag); |
---|
| 1273 | + } |
---|
1079 | 1274 | kasan_enable_current(); |
---|
1080 | 1275 | } |
---|
1081 | 1276 | |
---|
1082 | 1277 | static __always_inline bool free_pages_prepare(struct page *page, |
---|
1083 | | - unsigned int order, bool check_free) |
---|
| 1278 | + unsigned int order, bool check_free, fpi_t fpi_flags) |
---|
1084 | 1279 | { |
---|
1085 | 1280 | int bad = 0; |
---|
| 1281 | + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); |
---|
1086 | 1282 | |
---|
1087 | 1283 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
1088 | 1284 | |
---|
1089 | 1285 | trace_mm_page_free(page, order); |
---|
| 1286 | + |
---|
| 1287 | + if (unlikely(PageHWPoison(page)) && !order) { |
---|
| 1288 | + /* |
---|
| 1289 | + * Do not let hwpoison pages hit pcplists/buddy |
---|
| 1290 | + * Untie memcg state and reset page's owner |
---|
| 1291 | + */ |
---|
| 1292 | + if (memcg_kmem_enabled() && PageKmemcg(page)) |
---|
| 1293 | + __memcg_kmem_uncharge_page(page, order); |
---|
| 1294 | + reset_page_owner(page, order); |
---|
| 1295 | + free_page_pinner(page, order); |
---|
| 1296 | + return false; |
---|
| 1297 | + } |
---|
1090 | 1298 | |
---|
1091 | 1299 | /* |
---|
1092 | 1300 | * Check tail pages before head page information is cleared to |
---|
.. | .. |
---|
1103 | 1311 | for (i = 1; i < (1 << order); i++) { |
---|
1104 | 1312 | if (compound) |
---|
1105 | 1313 | bad += free_tail_pages_check(page, page + i); |
---|
1106 | | - if (unlikely(free_pages_check(page + i))) { |
---|
| 1314 | + if (unlikely(check_free_page(page + i))) { |
---|
1107 | 1315 | bad++; |
---|
1108 | 1316 | continue; |
---|
1109 | 1317 | } |
---|
.. | .. |
---|
1113 | 1321 | if (PageMappingFlags(page)) |
---|
1114 | 1322 | page->mapping = NULL; |
---|
1115 | 1323 | if (memcg_kmem_enabled() && PageKmemcg(page)) |
---|
1116 | | - memcg_kmem_uncharge(page, order); |
---|
| 1324 | + __memcg_kmem_uncharge_page(page, order); |
---|
1117 | 1325 | if (check_free) |
---|
1118 | | - bad += free_pages_check(page); |
---|
| 1326 | + bad += check_free_page(page); |
---|
1119 | 1327 | if (bad) |
---|
1120 | 1328 | return false; |
---|
1121 | 1329 | |
---|
1122 | 1330 | page_cpupid_reset_last(page); |
---|
1123 | 1331 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
---|
1124 | 1332 | reset_page_owner(page, order); |
---|
| 1333 | + free_page_pinner(page, order); |
---|
1125 | 1334 | |
---|
1126 | 1335 | if (!PageHighMem(page)) { |
---|
1127 | 1336 | debug_check_no_locks_freed(page_address(page), |
---|
.. | .. |
---|
1129 | 1338 | debug_check_no_obj_freed(page_address(page), |
---|
1130 | 1339 | PAGE_SIZE << order); |
---|
1131 | 1340 | } |
---|
1132 | | - arch_free_page(page, order); |
---|
1133 | | - if (want_init_on_free()) |
---|
1134 | | - kernel_init_free_pages(page, 1 << order); |
---|
1135 | 1341 | |
---|
1136 | | - kernel_poison_pages(page, 1 << order, 0); |
---|
1137 | | - kernel_map_pages(page, 1 << order, 0); |
---|
1138 | | - kasan_free_nondeferred_pages(page, order); |
---|
| 1342 | + kernel_poison_pages(page, 1 << order); |
---|
| 1343 | + |
---|
| 1344 | + /* |
---|
| 1345 | + * As memory initialization might be integrated into KASAN, |
---|
| 1346 | + * kasan_free_pages and kernel_init_free_pages must be |
---|
| 1347 | + * kept together to avoid discrepancies in behavior. |
---|
| 1348 | + * |
---|
| 1349 | + * With hardware tag-based KASAN, memory tags must be set before the |
---|
| 1350 | + * page becomes unavailable via debug_pagealloc or arch_free_page. |
---|
| 1351 | + */ |
---|
| 1352 | + if (kasan_has_integrated_init()) { |
---|
| 1353 | + if (!skip_kasan_poison) |
---|
| 1354 | + kasan_free_pages(page, order); |
---|
| 1355 | + } else { |
---|
| 1356 | + bool init = want_init_on_free(); |
---|
| 1357 | + |
---|
| 1358 | + if (init) |
---|
| 1359 | + kernel_init_free_pages(page, 1 << order, false); |
---|
| 1360 | + if (!skip_kasan_poison) |
---|
| 1361 | + kasan_poison_pages(page, order, init); |
---|
| 1362 | + } |
---|
| 1363 | + |
---|
| 1364 | + /* |
---|
| 1365 | + * arch_free_page() can make the page's contents inaccessible. s390 |
---|
| 1366 | + * does this. So nothing which can access the page's contents should |
---|
| 1367 | + * happen after this. |
---|
| 1368 | + */ |
---|
| 1369 | + arch_free_page(page, order); |
---|
| 1370 | + |
---|
| 1371 | + debug_pagealloc_unmap_pages(page, 1 << order); |
---|
1139 | 1372 | |
---|
1140 | 1373 | return true; |
---|
1141 | 1374 | } |
---|
1142 | 1375 | |
---|
1143 | 1376 | #ifdef CONFIG_DEBUG_VM |
---|
1144 | | -static inline bool free_pcp_prepare(struct page *page) |
---|
1145 | | -{ |
---|
1146 | | - return free_pages_prepare(page, 0, true); |
---|
1147 | | -} |
---|
1148 | | - |
---|
1149 | | -static inline bool bulkfree_pcp_prepare(struct page *page) |
---|
1150 | | -{ |
---|
1151 | | - return false; |
---|
1152 | | -} |
---|
1153 | | -#else |
---|
| 1377 | +/* |
---|
| 1378 | + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed |
---|
| 1379 | + * to pcp lists. With debug_pagealloc also enabled, they are rechecked when |
---|
| 1380 | + * moved from pcp lists to free lists. |
---|
| 1381 | + */ |
---|
1154 | 1382 | static bool free_pcp_prepare(struct page *page) |
---|
1155 | 1383 | { |
---|
1156 | | - return free_pages_prepare(page, 0, false); |
---|
| 1384 | + return free_pages_prepare(page, 0, true, FPI_NONE); |
---|
1157 | 1385 | } |
---|
1158 | 1386 | |
---|
1159 | 1387 | static bool bulkfree_pcp_prepare(struct page *page) |
---|
1160 | 1388 | { |
---|
1161 | | - return free_pages_check(page); |
---|
| 1389 | + if (debug_pagealloc_enabled_static()) |
---|
| 1390 | + return check_free_page(page); |
---|
| 1391 | + else |
---|
| 1392 | + return false; |
---|
| 1393 | +} |
---|
| 1394 | +#else |
---|
| 1395 | +/* |
---|
| 1396 | + * With DEBUG_VM disabled, order-0 pages being freed are checked only when |
---|
| 1397 | + * moving from pcp lists to free list in order to reduce overhead. With |
---|
| 1398 | + * debug_pagealloc enabled, they are also checked immediately when being freed |
---|
| 1399 | + * to the pcp lists. |
---|
| 1400 | + */ |
---|
| 1401 | +static bool free_pcp_prepare(struct page *page) |
---|
| 1402 | +{ |
---|
| 1403 | + if (debug_pagealloc_enabled_static()) |
---|
| 1404 | + return free_pages_prepare(page, 0, true, FPI_NONE); |
---|
| 1405 | + else |
---|
| 1406 | + return free_pages_prepare(page, 0, false, FPI_NONE); |
---|
| 1407 | +} |
---|
| 1408 | + |
---|
| 1409 | +static bool bulkfree_pcp_prepare(struct page *page) |
---|
| 1410 | +{ |
---|
| 1411 | + return check_free_page(page); |
---|
1162 | 1412 | } |
---|
1163 | 1413 | #endif /* CONFIG_DEBUG_VM */ |
---|
1164 | 1414 | |
---|
.. | .. |
---|
1258 | 1508 | if (unlikely(isolated_pageblocks)) |
---|
1259 | 1509 | mt = get_pageblock_migratetype(page); |
---|
1260 | 1510 | |
---|
1261 | | - __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
---|
| 1511 | + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); |
---|
1262 | 1512 | trace_mm_page_pcpu_drain(page, 0, mt); |
---|
1263 | 1513 | } |
---|
1264 | 1514 | spin_unlock(&zone->lock); |
---|
.. | .. |
---|
1267 | 1517 | static void free_one_page(struct zone *zone, |
---|
1268 | 1518 | struct page *page, unsigned long pfn, |
---|
1269 | 1519 | unsigned int order, |
---|
1270 | | - int migratetype) |
---|
| 1520 | + int migratetype, fpi_t fpi_flags) |
---|
1271 | 1521 | { |
---|
1272 | 1522 | spin_lock(&zone->lock); |
---|
1273 | 1523 | if (unlikely(has_isolate_pageblock(zone) || |
---|
1274 | 1524 | is_migrate_isolate(migratetype))) { |
---|
1275 | 1525 | migratetype = get_pfnblock_migratetype(page, pfn); |
---|
1276 | 1526 | } |
---|
1277 | | - __free_one_page(page, pfn, zone, order, migratetype); |
---|
| 1527 | + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); |
---|
1278 | 1528 | spin_unlock(&zone->lock); |
---|
1279 | 1529 | } |
---|
1280 | 1530 | |
---|
.. | .. |
---|
1348 | 1598 | /* Avoid false-positive PageTail() */ |
---|
1349 | 1599 | INIT_LIST_HEAD(&page->lru); |
---|
1350 | 1600 | |
---|
1351 | | - SetPageReserved(page); |
---|
| 1601 | + /* |
---|
| 1602 | + * no need for atomic set_bit because the struct |
---|
| 1603 | + * page is not visible yet so nobody should |
---|
| 1604 | + * access it yet. |
---|
| 1605 | + */ |
---|
| 1606 | + __SetPageReserved(page); |
---|
1352 | 1607 | } |
---|
1353 | 1608 | } |
---|
1354 | 1609 | } |
---|
1355 | 1610 | |
---|
1356 | | -static void __free_pages_ok(struct page *page, unsigned int order) |
---|
| 1611 | +static void __free_pages_ok(struct page *page, unsigned int order, |
---|
| 1612 | + fpi_t fpi_flags) |
---|
1357 | 1613 | { |
---|
1358 | 1614 | unsigned long flags; |
---|
1359 | 1615 | int migratetype; |
---|
1360 | 1616 | unsigned long pfn = page_to_pfn(page); |
---|
| 1617 | + bool skip_free_unref_page = false; |
---|
1361 | 1618 | |
---|
1362 | | - if (!free_pages_prepare(page, order, true)) |
---|
| 1619 | + if (!free_pages_prepare(page, order, true, fpi_flags)) |
---|
1363 | 1620 | return; |
---|
1364 | 1621 | |
---|
1365 | 1622 | migratetype = get_pfnblock_migratetype(page, pfn); |
---|
| 1623 | + trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page); |
---|
| 1624 | + if (skip_free_unref_page) |
---|
| 1625 | + return; |
---|
| 1626 | + |
---|
1366 | 1627 | local_irq_save(flags); |
---|
1367 | 1628 | __count_vm_events(PGFREE, 1 << order); |
---|
1368 | | - free_one_page(page_zone(page), page, pfn, order, migratetype); |
---|
| 1629 | + free_one_page(page_zone(page), page, pfn, order, migratetype, |
---|
| 1630 | + fpi_flags); |
---|
1369 | 1631 | local_irq_restore(flags); |
---|
1370 | 1632 | } |
---|
1371 | 1633 | |
---|
1372 | | -static void __init __free_pages_boot_core(struct page *page, unsigned int order) |
---|
| 1634 | +void __free_pages_core(struct page *page, unsigned int order) |
---|
1373 | 1635 | { |
---|
1374 | 1636 | unsigned int nr_pages = 1 << order; |
---|
1375 | 1637 | struct page *p = page; |
---|
1376 | 1638 | unsigned int loop; |
---|
1377 | 1639 | |
---|
| 1640 | + /* |
---|
| 1641 | + * When initializing the memmap, __init_single_page() sets the refcount |
---|
| 1642 | + * of all pages to 1 ("allocated"/"not free"). We have to set the |
---|
| 1643 | + * refcount of all involved pages to 0. |
---|
| 1644 | + */ |
---|
1378 | 1645 | prefetchw(p); |
---|
1379 | 1646 | for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
---|
1380 | 1647 | prefetchw(p + 1); |
---|
.. | .. |
---|
1384 | 1651 | __ClearPageReserved(p); |
---|
1385 | 1652 | set_page_count(p, 0); |
---|
1386 | 1653 | |
---|
1387 | | - page_zone(page)->managed_pages += nr_pages; |
---|
1388 | | - set_page_refcounted(page); |
---|
1389 | | - __free_pages(page, order); |
---|
| 1654 | + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
---|
| 1655 | + |
---|
| 1656 | + /* |
---|
| 1657 | + * Bypass PCP and place fresh pages right to the tail, primarily |
---|
| 1658 | + * relevant for memory onlining. |
---|
| 1659 | + */ |
---|
| 1660 | + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); |
---|
1390 | 1661 | } |
---|
1391 | 1662 | |
---|
1392 | | -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ |
---|
1393 | | - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) |
---|
| 1663 | +#ifdef CONFIG_NEED_MULTIPLE_NODES |
---|
1394 | 1664 | |
---|
1395 | 1665 | static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; |
---|
| 1666 | + |
---|
| 1667 | +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
---|
| 1668 | + |
---|
| 1669 | +/* |
---|
| 1670 | + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
---|
| 1671 | + */ |
---|
| 1672 | +int __meminit __early_pfn_to_nid(unsigned long pfn, |
---|
| 1673 | + struct mminit_pfnnid_cache *state) |
---|
| 1674 | +{ |
---|
| 1675 | + unsigned long start_pfn, end_pfn; |
---|
| 1676 | + int nid; |
---|
| 1677 | + |
---|
| 1678 | + if (state->last_start <= pfn && pfn < state->last_end) |
---|
| 1679 | + return state->last_nid; |
---|
| 1680 | + |
---|
| 1681 | + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
---|
| 1682 | + if (nid != NUMA_NO_NODE) { |
---|
| 1683 | + state->last_start = start_pfn; |
---|
| 1684 | + state->last_end = end_pfn; |
---|
| 1685 | + state->last_nid = nid; |
---|
| 1686 | + } |
---|
| 1687 | + |
---|
| 1688 | + return nid; |
---|
| 1689 | +} |
---|
| 1690 | +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
---|
1396 | 1691 | |
---|
1397 | 1692 | int __meminit early_pfn_to_nid(unsigned long pfn) |
---|
1398 | 1693 | { |
---|
.. | .. |
---|
1407 | 1702 | |
---|
1408 | 1703 | return nid; |
---|
1409 | 1704 | } |
---|
1410 | | -#endif |
---|
| 1705 | +#endif /* CONFIG_NEED_MULTIPLE_NODES */ |
---|
1411 | 1706 | |
---|
1412 | | -#ifdef CONFIG_NODES_SPAN_OTHER_NODES |
---|
1413 | | -static inline bool __meminit __maybe_unused |
---|
1414 | | -meminit_pfn_in_nid(unsigned long pfn, int node, |
---|
1415 | | - struct mminit_pfnnid_cache *state) |
---|
1416 | | -{ |
---|
1417 | | - int nid; |
---|
1418 | | - |
---|
1419 | | - nid = __early_pfn_to_nid(pfn, state); |
---|
1420 | | - if (nid >= 0 && nid != node) |
---|
1421 | | - return false; |
---|
1422 | | - return true; |
---|
1423 | | -} |
---|
1424 | | - |
---|
1425 | | -/* Only safe to use early in boot when initialisation is single-threaded */ |
---|
1426 | | -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
---|
1427 | | -{ |
---|
1428 | | - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); |
---|
1429 | | -} |
---|
1430 | | - |
---|
1431 | | -#else |
---|
1432 | | - |
---|
1433 | | -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
---|
1434 | | -{ |
---|
1435 | | - return true; |
---|
1436 | | -} |
---|
1437 | | -static inline bool __meminit __maybe_unused |
---|
1438 | | -meminit_pfn_in_nid(unsigned long pfn, int node, |
---|
1439 | | - struct mminit_pfnnid_cache *state) |
---|
1440 | | -{ |
---|
1441 | | - return true; |
---|
1442 | | -} |
---|
1443 | | -#endif |
---|
1444 | | - |
---|
1445 | | - |
---|
1446 | | -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, |
---|
| 1707 | +void __init memblock_free_pages(struct page *page, unsigned long pfn, |
---|
1447 | 1708 | unsigned int order) |
---|
1448 | 1709 | { |
---|
1449 | 1710 | if (early_page_uninitialised(pfn)) |
---|
1450 | 1711 | return; |
---|
1451 | | - return __free_pages_boot_core(page, order); |
---|
| 1712 | + __free_pages_core(page, order); |
---|
1452 | 1713 | } |
---|
1453 | 1714 | |
---|
1454 | 1715 | /* |
---|
.. | .. |
---|
1539 | 1800 | if (nr_pages == pageblock_nr_pages && |
---|
1540 | 1801 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
---|
1541 | 1802 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
1542 | | - __free_pages_boot_core(page, pageblock_order); |
---|
| 1803 | + __free_pages_core(page, pageblock_order); |
---|
1543 | 1804 | return; |
---|
1544 | 1805 | } |
---|
1545 | 1806 | |
---|
1546 | 1807 | for (i = 0; i < nr_pages; i++, page++, pfn++) { |
---|
1547 | 1808 | if ((pfn & (pageblock_nr_pages - 1)) == 0) |
---|
1548 | 1809 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
1549 | | - __free_pages_boot_core(page, 0); |
---|
| 1810 | + __free_pages_core(page, 0); |
---|
1550 | 1811 | } |
---|
1551 | 1812 | } |
---|
1552 | 1813 | |
---|
.. | .. |
---|
1569 | 1830 | * |
---|
1570 | 1831 | * Then, we check if a current large page is valid by only checking the validity |
---|
1571 | 1832 | * of the head pfn. |
---|
1572 | | - * |
---|
1573 | | - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave |
---|
1574 | | - * within a node: a pfn is between start and end of a node, but does not belong |
---|
1575 | | - * to this memory node. |
---|
1576 | 1833 | */ |
---|
1577 | | -static inline bool __init |
---|
1578 | | -deferred_pfn_valid(int nid, unsigned long pfn, |
---|
1579 | | - struct mminit_pfnnid_cache *nid_init_state) |
---|
| 1834 | +static inline bool __init deferred_pfn_valid(unsigned long pfn) |
---|
1580 | 1835 | { |
---|
1581 | 1836 | if (!pfn_valid_within(pfn)) |
---|
1582 | 1837 | return false; |
---|
1583 | 1838 | if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) |
---|
1584 | | - return false; |
---|
1585 | | - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) |
---|
1586 | 1839 | return false; |
---|
1587 | 1840 | return true; |
---|
1588 | 1841 | } |
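For scale, a hedged worked example of the check above, assuming 4 KiB pages and the common pageblock_order of 9 so that pageblock_nr_pages = 512: while walking a 4 GiB range (1,048,576 PFNs), the !(pfn & (pageblock_nr_pages - 1)) test lets the comparatively expensive pfn_valid() run at most 1,048,576 / 512 = 2,048 times, once per pageblock, while every other PFN only pays for the cheap pfn_valid_within() check.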
---|
.. | .. |
---|
1591 | 1844 | * Free pages to buddy allocator. Try to free aligned pages in |
---|
1592 | 1845 | * pageblock_nr_pages sizes. |
---|
1593 | 1846 | */ |
---|
1594 | | -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, |
---|
| 1847 | +static void __init deferred_free_pages(unsigned long pfn, |
---|
1595 | 1848 | unsigned long end_pfn) |
---|
1596 | 1849 | { |
---|
1597 | | - struct mminit_pfnnid_cache nid_init_state = { }; |
---|
1598 | 1850 | unsigned long nr_pgmask = pageblock_nr_pages - 1; |
---|
1599 | 1851 | unsigned long nr_free = 0; |
---|
1600 | 1852 | |
---|
1601 | 1853 | for (; pfn < end_pfn; pfn++) { |
---|
1602 | | - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { |
---|
| 1854 | + if (!deferred_pfn_valid(pfn)) { |
---|
1603 | 1855 | deferred_free_range(pfn - nr_free, nr_free); |
---|
1604 | 1856 | nr_free = 0; |
---|
1605 | 1857 | } else if (!(pfn & nr_pgmask)) { |
---|
1606 | 1858 | deferred_free_range(pfn - nr_free, nr_free); |
---|
1607 | 1859 | nr_free = 1; |
---|
1608 | | - touch_nmi_watchdog(); |
---|
1609 | 1860 | } else { |
---|
1610 | 1861 | nr_free++; |
---|
1611 | 1862 | } |
---|
.. | .. |
---|
1619 | 1870 | * by performing it only once every pageblock_nr_pages. |
---|
1620 | 1871 | * Return number of pages initialized. |
---|
1621 | 1872 | */ |
---|
1622 | | -static unsigned long __init deferred_init_pages(int nid, int zid, |
---|
| 1873 | +static unsigned long __init deferred_init_pages(struct zone *zone, |
---|
1623 | 1874 | unsigned long pfn, |
---|
1624 | 1875 | unsigned long end_pfn) |
---|
1625 | 1876 | { |
---|
1626 | | - struct mminit_pfnnid_cache nid_init_state = { }; |
---|
1627 | 1877 | unsigned long nr_pgmask = pageblock_nr_pages - 1; |
---|
| 1878 | + int nid = zone_to_nid(zone); |
---|
1628 | 1879 | unsigned long nr_pages = 0; |
---|
| 1880 | + int zid = zone_idx(zone); |
---|
1629 | 1881 | struct page *page = NULL; |
---|
1630 | 1882 | |
---|
1631 | 1883 | for (; pfn < end_pfn; pfn++) { |
---|
1632 | | - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { |
---|
| 1884 | + if (!deferred_pfn_valid(pfn)) { |
---|
1633 | 1885 | page = NULL; |
---|
1634 | 1886 | continue; |
---|
1635 | 1887 | } else if (!page || !(pfn & nr_pgmask)) { |
---|
1636 | 1888 | page = pfn_to_page(pfn); |
---|
1637 | | - touch_nmi_watchdog(); |
---|
1638 | 1889 | } else { |
---|
1639 | 1890 | page++; |
---|
1640 | 1891 | } |
---|
.. | .. |
---|
1644 | 1895 | return (nr_pages); |
---|
1645 | 1896 | } |
---|
1646 | 1897 | |
---|
| 1898 | +/* |
---|
| 1899 | + * This function is meant to pre-load the iterator for the zone init. |
---|
| 1900 | + * Specifically it walks through the ranges until we are caught up to the |
---|
| 1901 | + * first_init_pfn value and exits there. If we never encounter the value we |
---|
| 1902 | + * return false indicating there are no valid ranges left. |
---|
| 1903 | + */ |
---|
| 1904 | +static bool __init |
---|
| 1905 | +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, |
---|
| 1906 | + unsigned long *spfn, unsigned long *epfn, |
---|
| 1907 | + unsigned long first_init_pfn) |
---|
| 1908 | +{ |
---|
| 1909 | + u64 j; |
---|
| 1910 | + |
---|
| 1911 | + /* |
---|
| 1912 | + * Start out by walking through the ranges in this zone that have |
---|
| 1913 | + * already been initialized. We don't need to do anything with them |
---|
| 1914 | + * so we just need to flush them out of the system. |
---|
| 1915 | + */ |
---|
| 1916 | + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { |
---|
| 1917 | + if (*epfn <= first_init_pfn) |
---|
| 1918 | + continue; |
---|
| 1919 | + if (*spfn < first_init_pfn) |
---|
| 1920 | + *spfn = first_init_pfn; |
---|
| 1921 | + *i = j; |
---|
| 1922 | + return true; |
---|
| 1923 | + } |
---|
| 1924 | + |
---|
| 1925 | + return false; |
---|
| 1926 | +} |
---|
| 1927 | + |
---|
| 1928 | +/* |
---|
| 1929 | + * Initialize and free pages. We do it in two loops: first we initialize |
---|
| 1930 | + * struct page, then free to buddy allocator, because while we are |
---|
| 1931 | + * freeing pages we can access pages that are ahead (computing buddy |
---|
| 1932 | + * page in __free_one_page()). |
---|
| 1933 | + * |
---|
| 1934 | + * In order to try and keep some memory in the cache we have the loop |
---|
| 1935 | + * broken along max page order boundaries. This way we will not cause |
---|
| 1936 | + * any issues with the buddy page computation. |
---|
| 1937 | + */ |
---|
| 1938 | +static unsigned long __init |
---|
| 1939 | +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, |
---|
| 1940 | + unsigned long *end_pfn) |
---|
| 1941 | +{ |
---|
| 1942 | + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); |
---|
| 1943 | + unsigned long spfn = *start_pfn, epfn = *end_pfn; |
---|
| 1944 | + unsigned long nr_pages = 0; |
---|
| 1945 | + u64 j = *i; |
---|
| 1946 | + |
---|
| 1947 | + /* First we loop through and initialize the page values */ |
---|
| 1948 | + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { |
---|
| 1949 | + unsigned long t; |
---|
| 1950 | + |
---|
| 1951 | + if (mo_pfn <= *start_pfn) |
---|
| 1952 | + break; |
---|
| 1953 | + |
---|
| 1954 | + t = min(mo_pfn, *end_pfn); |
---|
| 1955 | + nr_pages += deferred_init_pages(zone, *start_pfn, t); |
---|
| 1956 | + |
---|
| 1957 | + if (mo_pfn < *end_pfn) { |
---|
| 1958 | + *start_pfn = mo_pfn; |
---|
| 1959 | + break; |
---|
| 1960 | + } |
---|
| 1961 | + } |
---|
| 1962 | + |
---|
| 1963 | + /* Reset values and now loop through freeing pages as needed */ |
---|
| 1964 | + swap(j, *i); |
---|
| 1965 | + |
---|
| 1966 | + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { |
---|
| 1967 | + unsigned long t; |
---|
| 1968 | + |
---|
| 1969 | + if (mo_pfn <= spfn) |
---|
| 1970 | + break; |
---|
| 1971 | + |
---|
| 1972 | + t = min(mo_pfn, epfn); |
---|
| 1973 | + deferred_free_pages(spfn, t); |
---|
| 1974 | + |
---|
| 1975 | + if (mo_pfn <= epfn) |
---|
| 1976 | + break; |
---|
| 1977 | + } |
---|
| 1978 | + |
---|
| 1979 | + return nr_pages; |
---|
| 1980 | +} |
---|
| 1981 | + |
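A hedged walk-through of the chunking above, assuming the common MAX_ORDER of 11 so that MAX_ORDER_NR_PAGES = 1024: entering with *start_pfn = 3000 gives mo_pfn = ALIGN(3001, 1024) = 3072, so the first loop initialises struct pages for PFNs 3000..3071 (clamped to the zone's free memblock ranges) and parks *start_pfn at 3072, and the second loop then frees exactly those PFNs to the buddy lists. The "+ 1" inside the ALIGN() guarantees forward progress when *start_pfn is already aligned: a call entering at 3072 computes mo_pfn = ALIGN(3073, 1024) = 4096 and processes a full 1024-page chunk. Ending every chunk on a MAX_ORDER boundary is what keeps __free_one_page() from touching a buddy whose struct page has not been initialised yet.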
---|
| 1982 | +static void __init |
---|
| 1983 | +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, |
---|
| 1984 | + void *arg) |
---|
| 1985 | +{ |
---|
| 1986 | + unsigned long spfn, epfn; |
---|
| 1987 | + struct zone *zone = arg; |
---|
| 1988 | + u64 i; |
---|
| 1989 | + |
---|
| 1990 | + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); |
---|
| 1991 | + |
---|
| 1992 | + /* |
---|
| 1993 | + * Initialize and free pages in MAX_ORDER sized increments so that we |
---|
| 1994 | + * can avoid introducing any issues with the buddy allocator. |
---|
| 1995 | + */ |
---|
| 1996 | + while (spfn < end_pfn) { |
---|
| 1997 | + deferred_init_maxorder(&i, zone, &spfn, &epfn); |
---|
| 1998 | + cond_resched(); |
---|
| 1999 | + } |
---|
| 2000 | +} |
---|
| 2001 | + |
---|
| 2002 | +/* An arch may override for more concurrency. */ |
---|
| 2003 | +__weak int __init |
---|
| 2004 | +deferred_page_init_max_threads(const struct cpumask *node_cpumask) |
---|
| 2005 | +{ |
---|
| 2006 | + return 1; |
---|
| 2007 | +} |
---|
| 2008 | + |
---|
1647 | 2009 | /* Initialise remaining memory on a node */ |
---|
1648 | 2010 | static int __init deferred_init_memmap(void *data) |
---|
1649 | 2011 | { |
---|
1650 | 2012 | pg_data_t *pgdat = data; |
---|
1651 | | - int nid = pgdat->node_id; |
---|
1652 | | - unsigned long start = jiffies; |
---|
1653 | | - unsigned long nr_pages = 0; |
---|
1654 | | - unsigned long spfn, epfn, first_init_pfn, flags; |
---|
1655 | | - phys_addr_t spa, epa; |
---|
1656 | | - int zid; |
---|
1657 | | - struct zone *zone; |
---|
1658 | 2013 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
---|
| 2014 | + unsigned long spfn = 0, epfn = 0; |
---|
| 2015 | + unsigned long first_init_pfn, flags; |
---|
| 2016 | + unsigned long start = jiffies; |
---|
| 2017 | + struct zone *zone; |
---|
| 2018 | + int zid, max_threads; |
---|
1659 | 2019 | u64 i; |
---|
1660 | 2020 | |
---|
1661 | 2021 | /* Bind memory initialisation thread to a local node if possible */ |
---|
.. | .. |
---|
1688 | 2048 | if (first_init_pfn < zone_end_pfn(zone)) |
---|
1689 | 2049 | break; |
---|
1690 | 2050 | } |
---|
1691 | | - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); |
---|
1692 | 2051 | |
---|
1693 | | - /* |
---|
1694 | | - * Initialize and free pages. We do it in two loops: first we initialize |
---|
1695 | | - * struct page, than free to buddy allocator, because while we are |
---|
1696 | | - * freeing pages we can access pages that are ahead (computing buddy |
---|
1697 | | - * page in __free_one_page()). |
---|
1698 | | - */ |
---|
1699 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1700 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1701 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
1702 | | - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); |
---|
1703 | | - } |
---|
1704 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1705 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1706 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
1707 | | - deferred_free_pages(nid, zid, spfn, epfn); |
---|
1708 | | - } |
---|
| 2052 | + /* If the zone is empty somebody else may have cleared out the zone */ |
---|
| 2053 | + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2054 | + first_init_pfn)) |
---|
| 2055 | + goto zone_empty; |
---|
1709 | 2056 | |
---|
| 2057 | + max_threads = deferred_page_init_max_threads(cpumask); |
---|
| 2058 | + |
---|
| 2059 | + while (spfn < epfn) { |
---|
| 2060 | + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); |
---|
| 2061 | + struct padata_mt_job job = { |
---|
| 2062 | + .thread_fn = deferred_init_memmap_chunk, |
---|
| 2063 | + .fn_arg = zone, |
---|
| 2064 | + .start = spfn, |
---|
| 2065 | + .size = epfn_align - spfn, |
---|
| 2066 | + .align = PAGES_PER_SECTION, |
---|
| 2067 | + .min_chunk = PAGES_PER_SECTION, |
---|
| 2068 | + .max_threads = max_threads, |
---|
| 2069 | + }; |
---|
| 2070 | + |
---|
| 2071 | + padata_do_multithreaded(&job); |
---|
| 2072 | + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2073 | + epfn_align); |
---|
| 2074 | + } |
---|
| 2075 | +zone_empty: |
---|
1710 | 2076 | /* Sanity check that the next zone really is unpopulated */ |
---|
1711 | 2077 | WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); |
---|
1712 | 2078 | |
---|
1713 | | - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, |
---|
1714 | | - jiffies_to_msecs(jiffies - start)); |
---|
| 2079 | + pr_info("node %d deferred pages initialised in %ums\n", |
---|
| 2080 | + pgdat->node_id, jiffies_to_msecs(jiffies - start)); |
---|
1715 | 2081 | |
---|
1716 | 2082 | pgdat_init_report_one_done(); |
---|
1717 | 2083 | return 0; |
---|
.. | .. |
---|
1735 | 2101 | static noinline bool __init |
---|
1736 | 2102 | deferred_grow_zone(struct zone *zone, unsigned int order) |
---|
1737 | 2103 | { |
---|
1738 | | - int zid = zone_idx(zone); |
---|
1739 | | - int nid = zone_to_nid(zone); |
---|
1740 | | - pg_data_t *pgdat = NODE_DATA(nid); |
---|
1741 | 2104 | unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); |
---|
1742 | | - unsigned long nr_pages = 0; |
---|
1743 | | - unsigned long first_init_pfn, spfn, epfn, t, flags; |
---|
| 2105 | + pg_data_t *pgdat = zone->zone_pgdat; |
---|
1744 | 2106 | unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; |
---|
1745 | | - phys_addr_t spa, epa; |
---|
| 2107 | + unsigned long spfn, epfn, flags; |
---|
| 2108 | + unsigned long nr_pages = 0; |
---|
1746 | 2109 | u64 i; |
---|
1747 | 2110 | |
---|
1748 | 2111 | /* Only the last zone may have deferred pages */ |
---|
.. | .. |
---|
1760 | 2123 | return true; |
---|
1761 | 2124 | } |
---|
1762 | 2125 | |
---|
1763 | | - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); |
---|
1764 | | - |
---|
1765 | | - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { |
---|
| 2126 | + /* If the zone is empty somebody else may have cleared out the zone */ |
---|
| 2127 | + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2128 | + first_deferred_pfn)) { |
---|
| 2129 | + pgdat->first_deferred_pfn = ULONG_MAX; |
---|
1766 | 2130 | pgdat_resize_unlock(pgdat, &flags); |
---|
1767 | | - return false; |
---|
| 2131 | + /* Retry only once. */ |
---|
| 2132 | + return first_deferred_pfn != ULONG_MAX; |
---|
1768 | 2133 | } |
---|
1769 | 2134 | |
---|
1770 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1771 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1772 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
| 2135 | + /* |
---|
| 2136 | + * Initialize and free pages in MAX_ORDER sized increments so |
---|
| 2137 | + * that we can avoid introducing any issues with the buddy |
---|
| 2138 | + * allocator. |
---|
| 2139 | + */ |
---|
| 2140 | + while (spfn < epfn) { |
---|
| 2141 | + /* update our first deferred PFN for this section */ |
---|
| 2142 | + first_deferred_pfn = spfn; |
---|
1773 | 2143 | |
---|
1774 | | - while (spfn < epfn && nr_pages < nr_pages_needed) { |
---|
1775 | | - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); |
---|
1776 | | - first_deferred_pfn = min(t, epfn); |
---|
1777 | | - nr_pages += deferred_init_pages(nid, zid, spfn, |
---|
1778 | | - first_deferred_pfn); |
---|
1779 | | - spfn = first_deferred_pfn; |
---|
1780 | | - } |
---|
| 2144 | + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); |
---|
| 2145 | + touch_nmi_watchdog(); |
---|
1781 | 2146 | |
---|
| 2147 | + /* We should only stop along section boundaries */ |
---|
| 2148 | + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) |
---|
| 2149 | + continue; |
---|
| 2150 | + |
---|
| 2151 | + /* If our quota has been met we can stop here */ |
---|
1782 | 2152 | if (nr_pages >= nr_pages_needed) |
---|
1783 | 2153 | break; |
---|
1784 | 2154 | } |
---|
1785 | 2155 | |
---|
1786 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1787 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1788 | | - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); |
---|
1789 | | - deferred_free_pages(nid, zid, spfn, epfn); |
---|
1790 | | - |
---|
1791 | | - if (first_deferred_pfn == epfn) |
---|
1792 | | - break; |
---|
1793 | | - } |
---|
1794 | | - pgdat->first_deferred_pfn = first_deferred_pfn; |
---|
| 2156 | + pgdat->first_deferred_pfn = spfn; |
---|
1795 | 2157 | pgdat_resize_unlock(pgdat, &flags); |
---|
1796 | 2158 | |
---|
1797 | 2159 | return nr_pages > 0; |
---|
.. | .. |
---|
1814 | 2176 | void __init page_alloc_init_late(void) |
---|
1815 | 2177 | { |
---|
1816 | 2178 | struct zone *zone; |
---|
| 2179 | + int nid; |
---|
1817 | 2180 | |
---|
1818 | 2181 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
1819 | | - int nid; |
---|
1820 | 2182 | |
---|
1821 | 2183 | /* There will be num_node_state(N_MEMORY) threads */ |
---|
1822 | 2184 | atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); |
---|
.. | .. |
---|
1844 | 2206 | /* Reinit limits that are based on free pages after the kernel is up */ |
---|
1845 | 2207 | files_maxfiles_init(); |
---|
1846 | 2208 | #endif |
---|
1847 | | -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK |
---|
| 2209 | + |
---|
1848 | 2210 | /* Discard memblock private memory */ |
---|
1849 | 2211 | memblock_discard(); |
---|
1850 | | -#endif |
---|
| 2212 | + |
---|
| 2213 | + for_each_node_state(nid, N_MEMORY) |
---|
| 2214 | + shuffle_free_memory(NODE_DATA(nid)); |
---|
1851 | 2215 | |
---|
1852 | 2216 | for_each_populated_zone(zone) |
---|
1853 | 2217 | set_zone_contiguous(zone); |
---|
.. | .. |
---|
1881 | 2245 | } |
---|
1882 | 2246 | |
---|
1883 | 2247 | adjust_managed_page_count(page, pageblock_nr_pages); |
---|
| 2248 | + page_zone(page)->cma_pages += pageblock_nr_pages; |
---|
1884 | 2249 | } |
---|
1885 | 2250 | #endif |
---|
1886 | 2251 | |
---|
.. | .. |
---|
1899 | 2264 | * -- nyc |
---|
1900 | 2265 | */ |
---|
1901 | 2266 | static inline void expand(struct zone *zone, struct page *page, |
---|
1902 | | - int low, int high, struct free_area *area, |
---|
1903 | | - int migratetype) |
---|
| 2267 | + int low, int high, int migratetype) |
---|
1904 | 2268 | { |
---|
1905 | 2269 | unsigned long size = 1 << high; |
---|
1906 | 2270 | |
---|
1907 | 2271 | while (high > low) { |
---|
1908 | | - area--; |
---|
1909 | 2272 | high--; |
---|
1910 | 2273 | size >>= 1; |
---|
1911 | 2274 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
---|
.. | .. |
---|
1919 | 2282 | if (set_page_guard(zone, &page[size], high, migratetype)) |
---|
1920 | 2283 | continue; |
---|
1921 | 2284 | |
---|
1922 | | - list_add(&page[size].lru, &area->free_list[migratetype]); |
---|
1923 | | - area->nr_free++; |
---|
1924 | | - set_page_order(&page[size], high); |
---|
| 2285 | + add_to_free_list(&page[size], zone, high, migratetype); |
---|
| 2286 | + set_buddy_order(&page[size], high); |
---|
1925 | 2287 | } |
---|
1926 | 2288 | } |
---|
1927 | 2289 | |
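To make the split above concrete, here is a small userspace model of expand() satisfying an order-2 request from an order-5 free page; it only mirrors the arithmetic, the real add_to_free_list()/set_buddy_order() calls are kernel-only:

    #include <stdio.h>

    int main(void)
    {
            unsigned int low = 2, high = 5;   /* want order 2, found an order-5 page */
            unsigned long size = 1UL << high; /* 32 pages */

            while (high > low) {
                    high--;
                    size >>= 1;
                    printf("free a buddy of order %u (%lu pages)\n", high, size);
            }
            printf("return the remaining order-%u block (%lu pages)\n", low, size);
            return 0;
    }

When run, it reports buddies of order 4, 3 and 2 (16, 8 and 4 pages) going back on the free lists, leaving the leading 4-page block for the caller, which is exactly the tail-splitting the loop above performs.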
---|
1928 | 2290 | static void check_new_page_bad(struct page *page) |
---|
1929 | 2291 | { |
---|
1930 | | - const char *bad_reason = NULL; |
---|
1931 | | - unsigned long bad_flags = 0; |
---|
1932 | | - |
---|
1933 | | - if (unlikely(atomic_read(&page->_mapcount) != -1)) |
---|
1934 | | - bad_reason = "nonzero mapcount"; |
---|
1935 | | - if (unlikely(page->mapping != NULL)) |
---|
1936 | | - bad_reason = "non-NULL mapping"; |
---|
1937 | | - if (unlikely(page_ref_count(page) != 0)) |
---|
1938 | | - bad_reason = "nonzero _count"; |
---|
1939 | 2292 | if (unlikely(page->flags & __PG_HWPOISON)) { |
---|
1940 | | - bad_reason = "HWPoisoned (hardware-corrupted)"; |
---|
1941 | | - bad_flags = __PG_HWPOISON; |
---|
1942 | 2293 | /* Don't complain about hwpoisoned pages */ |
---|
1943 | 2294 | page_mapcount_reset(page); /* remove PageBuddy */ |
---|
1944 | 2295 | return; |
---|
1945 | 2296 | } |
---|
1946 | | - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { |
---|
1947 | | - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; |
---|
1948 | | - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; |
---|
1949 | | - } |
---|
1950 | | -#ifdef CONFIG_MEMCG |
---|
1951 | | - if (unlikely(page->mem_cgroup)) |
---|
1952 | | - bad_reason = "page still charged to cgroup"; |
---|
1953 | | -#endif |
---|
1954 | | - bad_page(page, bad_reason, bad_flags); |
---|
| 2297 | + |
---|
| 2298 | + bad_page(page, |
---|
| 2299 | + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); |
---|
1955 | 2300 | } |
---|
1956 | 2301 | |
---|
1957 | 2302 | /* |
---|
.. | .. |
---|
1967 | 2312 | return 1; |
---|
1968 | 2313 | } |
---|
1969 | 2314 | |
---|
1970 | | -static inline bool free_pages_prezeroed(void) |
---|
1971 | | -{ |
---|
1972 | | - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && |
---|
1973 | | - page_poisoning_enabled()) || want_init_on_free(); |
---|
1974 | | -} |
---|
1975 | | - |
---|
1976 | 2315 | #ifdef CONFIG_DEBUG_VM |
---|
1977 | | -static bool check_pcp_refill(struct page *page) |
---|
| 2316 | +/* |
---|
| 2317 | + * With DEBUG_VM enabled, order-0 pages are checked for expected state when |
---|
| 2318 | + * being allocated from pcp lists. With debug_pagealloc also enabled, they are |
---|
| 2319 | + * also checked when pcp lists are refilled from the free lists. |
---|
| 2320 | + */ |
---|
| 2321 | +static inline bool check_pcp_refill(struct page *page) |
---|
1978 | 2322 | { |
---|
1979 | | - return false; |
---|
| 2323 | + if (debug_pagealloc_enabled_static()) |
---|
| 2324 | + return check_new_page(page); |
---|
| 2325 | + else |
---|
| 2326 | + return false; |
---|
1980 | 2327 | } |
---|
1981 | 2328 | |
---|
1982 | | -static bool check_new_pcp(struct page *page) |
---|
| 2329 | +static inline bool check_new_pcp(struct page *page) |
---|
1983 | 2330 | { |
---|
1984 | 2331 | return check_new_page(page); |
---|
1985 | 2332 | } |
---|
1986 | 2333 | #else |
---|
1987 | | -static bool check_pcp_refill(struct page *page) |
---|
| 2334 | +/* |
---|
| 2335 | + * With DEBUG_VM disabled, free order-0 pages are checked for expected state |
---|
| 2336 | + * when pcp lists are being refilled from the free lists. With debug_pagealloc |
---|
| 2337 | + * enabled, they are also checked when being allocated from the pcp lists. |
---|
| 2338 | + */ |
---|
| 2339 | +static inline bool check_pcp_refill(struct page *page) |
---|
1988 | 2340 | { |
---|
1989 | 2341 | return check_new_page(page); |
---|
1990 | 2342 | } |
---|
1991 | | -static bool check_new_pcp(struct page *page) |
---|
| 2343 | +static inline bool check_new_pcp(struct page *page) |
---|
1992 | 2344 | { |
---|
1993 | | - return false; |
---|
| 2345 | + if (debug_pagealloc_enabled_static()) |
---|
| 2346 | + return check_new_page(page); |
---|
| 2347 | + else |
---|
| 2348 | + return false; |
---|
1994 | 2349 | } |
---|
1995 | 2350 | #endif /* CONFIG_DEBUG_VM */ |
---|
1996 | 2351 | |
---|
.. | .. |
---|
2014 | 2369 | set_page_refcounted(page); |
---|
2015 | 2370 | |
---|
2016 | 2371 | arch_alloc_page(page, order); |
---|
2017 | | - kernel_map_pages(page, 1 << order, 1); |
---|
2018 | | - kasan_alloc_pages(page, order); |
---|
2019 | | - kernel_poison_pages(page, 1 << order, 1); |
---|
| 2372 | + debug_pagealloc_map_pages(page, 1 << order); |
---|
| 2373 | + |
---|
| 2374 | + /* |
---|
| 2375 | + * Page unpoisoning must happen before memory initialization. |
---|
| 2376 | + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
---|
| 2377 | + * allocations and the page unpoisoning code will complain. |
---|
| 2378 | + */ |
---|
| 2379 | + kernel_unpoison_pages(page, 1 << order); |
---|
| 2380 | + |
---|
| 2381 | + /* |
---|
| 2382 | + * As memory initialization might be integrated into KASAN, |
---|
| 2383 | + * kasan_alloc_pages and kernel_init_free_pages must be |
---|
| 2384 | + * kept together to avoid discrepancies in behavior. |
---|
| 2385 | + */ |
---|
| 2386 | + if (kasan_has_integrated_init()) { |
---|
| 2387 | + kasan_alloc_pages(page, order, gfp_flags); |
---|
| 2388 | + } else { |
---|
| 2389 | + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); |
---|
| 2390 | + |
---|
| 2391 | + kasan_unpoison_pages(page, order, init); |
---|
| 2392 | + if (init) |
---|
| 2393 | + kernel_init_free_pages(page, 1 << order, |
---|
| 2394 | + gfp_flags & __GFP_ZEROTAGS); |
---|
| 2395 | + } |
---|
| 2396 | + |
---|
2020 | 2397 | set_page_owner(page, order, gfp_flags); |
---|
2021 | 2398 | } |
---|
2022 | 2399 | |
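The init predicate above, !want_init_on_free() && want_init_on_alloc(gfp_flags), is what replaces the old free_pages_prezeroed() test: if pages are already cleared when they are freed, clearing them again at allocation time would be redundant work. A tiny sketch of just that predicate, ignoring the __GFP_ZERO part of want_init_on_alloc() and using plain booleans as stand-ins for the kernel's static branches:

    #include <stdbool.h>
    #include <stdio.h>

    static bool init_on_free  = true;  /* pages are wiped on the free path */
    static bool init_on_alloc = true;  /* pages would also be wiped on allocation */

    /* Skip the second wipe when free-time init already left the page cleared. */
    static bool should_init_on_alloc(void)
    {
            return !init_on_free && init_on_alloc;
    }

    int main(void)
    {
            printf("init on the allocation path: %d\n", should_init_on_alloc()); /* prints 0 */
            return 0;
    }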
---|
.. | .. |
---|
2024 | 2401 | unsigned int alloc_flags) |
---|
2025 | 2402 | { |
---|
2026 | 2403 | post_alloc_hook(page, order, gfp_flags); |
---|
2027 | | - |
---|
2028 | | - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) |
---|
2029 | | - kernel_init_free_pages(page, 1 << order); |
---|
2030 | 2404 | |
---|
2031 | 2405 | if (order && (gfp_flags & __GFP_COMP)) |
---|
2032 | 2406 | prep_compound_page(page, order); |
---|
.. | .. |
---|
2041 | 2415 | set_page_pfmemalloc(page); |
---|
2042 | 2416 | else |
---|
2043 | 2417 | clear_page_pfmemalloc(page); |
---|
| 2418 | + trace_android_vh_test_clear_look_around_ref(page); |
---|
2044 | 2419 | } |
---|
2045 | 2420 | |
---|
2046 | 2421 | /* |
---|
.. | .. |
---|
2058 | 2433 | /* Find a page of the appropriate size in the preferred list */ |
---|
2059 | 2434 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
---|
2060 | 2435 | area = &(zone->free_area[current_order]); |
---|
2061 | | - page = list_first_entry_or_null(&area->free_list[migratetype], |
---|
2062 | | - struct page, lru); |
---|
| 2436 | + page = get_page_from_free_area(area, migratetype); |
---|
2063 | 2437 | if (!page) |
---|
2064 | 2438 | continue; |
---|
2065 | | - list_del(&page->lru); |
---|
2066 | | - rmv_page_order(page); |
---|
2067 | | - area->nr_free--; |
---|
2068 | | - expand(zone, page, order, current_order, area, migratetype); |
---|
| 2439 | + del_page_from_free_list(page, zone, current_order); |
---|
| 2440 | + expand(zone, page, order, current_order, migratetype); |
---|
2069 | 2441 | set_pcppage_migratetype(page, migratetype); |
---|
2070 | 2442 | return page; |
---|
2071 | 2443 | } |
---|
.. | .. |
---|
2078 | 2450 | * This array describes the order lists are fallen back to when |
---|
2079 | 2451 | * the free lists for the desirable migrate type are depleted |
---|
2080 | 2452 | */ |
---|
2081 | | -static int fallbacks[MIGRATE_TYPES][4] = { |
---|
| 2453 | +static int fallbacks[MIGRATE_TYPES][3] = { |
---|
2082 | 2454 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2083 | | - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2084 | 2455 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, |
---|
| 2456 | + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2085 | 2457 | #ifdef CONFIG_CMA |
---|
2086 | 2458 | [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ |
---|
2087 | 2459 | #endif |
---|
.. | .. |
---|
2102 | 2474 | #endif |
---|
2103 | 2475 | |
---|
2104 | 2476 | /* |
---|
2105 | | - * Move the free pages in a range to the free lists of the requested type. |
---|
| 2477 | + * Move the free pages in a range to the freelist tail of the requested type. |
---|
2106 | 2478 | * Note that start_page and end_pages are not aligned on a pageblock |
---|
2107 | 2479 | * boundary. If alignment is required, use move_freepages_block() |
---|
2108 | 2480 | */ |
---|
.. | .. |
---|
2114 | 2486 | unsigned int order; |
---|
2115 | 2487 | int pages_moved = 0; |
---|
2116 | 2488 | |
---|
2117 | | -#ifndef CONFIG_HOLES_IN_ZONE |
---|
2118 | | - /* |
---|
2119 | | - * page_zone is not safe to call in this context when |
---|
2120 | | - * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant |
---|
2121 | | - * anyway as we check zone boundaries in move_freepages_block(). |
---|
2122 | | - * Remove at a later date when no bug reports exist related to |
---|
2123 | | - * grouping pages by mobility |
---|
2124 | | - */ |
---|
2125 | | - VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && |
---|
2126 | | - pfn_valid(page_to_pfn(end_page)) && |
---|
2127 | | - page_zone(start_page) != page_zone(end_page)); |
---|
2128 | | -#endif |
---|
2129 | | - |
---|
2130 | | - if (num_movable) |
---|
2131 | | - *num_movable = 0; |
---|
2132 | | - |
---|
2133 | 2489 | for (page = start_page; page <= end_page;) { |
---|
2134 | 2490 | if (!pfn_valid_within(page_to_pfn(page))) { |
---|
2135 | 2491 | page++; |
---|
2136 | 2492 | continue; |
---|
2137 | 2493 | } |
---|
2138 | | - |
---|
2139 | | - /* Make sure we are not inadvertently changing nodes */ |
---|
2140 | | - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
---|
2141 | 2494 | |
---|
2142 | 2495 | if (!PageBuddy(page)) { |
---|
2143 | 2496 | /* |
---|
.. | .. |
---|
2153 | 2506 | continue; |
---|
2154 | 2507 | } |
---|
2155 | 2508 | |
---|
2156 | | - order = page_order(page); |
---|
2157 | | - list_move(&page->lru, |
---|
2158 | | - &zone->free_area[order].free_list[migratetype]); |
---|
| 2509 | + /* Make sure we are not inadvertently changing nodes */ |
---|
| 2510 | + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
---|
| 2511 | + VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
---|
| 2512 | + |
---|
| 2513 | + order = buddy_order(page); |
---|
| 2514 | + move_to_free_list(page, zone, order, migratetype); |
---|
2159 | 2515 | page += 1 << order; |
---|
2160 | 2516 | pages_moved += 1 << order; |
---|
2161 | 2517 | } |
---|
.. | .. |
---|
2168 | 2524 | { |
---|
2169 | 2525 | unsigned long start_pfn, end_pfn; |
---|
2170 | 2526 | struct page *start_page, *end_page; |
---|
| 2527 | + |
---|
| 2528 | + if (num_movable) |
---|
| 2529 | + *num_movable = 0; |
---|
2171 | 2530 | |
---|
2172 | 2531 | start_pfn = page_to_pfn(page); |
---|
2173 | 2532 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
---|
.. | .. |
---|
2229 | 2588 | return false; |
---|
2230 | 2589 | } |
---|
2231 | 2590 | |
---|
| 2591 | +static inline bool boost_watermark(struct zone *zone) |
---|
| 2592 | +{ |
---|
| 2593 | + unsigned long max_boost; |
---|
| 2594 | + |
---|
| 2595 | + if (!watermark_boost_factor) |
---|
| 2596 | + return false; |
---|
| 2597 | + /* |
---|
| 2598 | + * Don't bother in zones that are unlikely to produce results. |
---|
| 2599 | + * On small machines, including kdump capture kernels running |
---|
| 2600 | + * in a small area, boosting the watermark can cause an out of |
---|
| 2601 | + * memory situation immediately. |
---|
| 2602 | + */ |
---|
| 2603 | + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
---|
| 2604 | + return false; |
---|
| 2605 | + |
---|
| 2606 | + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
---|
| 2607 | + watermark_boost_factor, 10000); |
---|
| 2608 | + |
---|
| 2609 | + /* |
---|
| 2610 | + * high watermark may be uninitialised if fragmentation occurs |
---|
| 2611 | + * very early in boot so do not boost. We do not fall |
---|
| 2612 | + * through and boost by pageblock_nr_pages as failing |
---|
| 2613 | + * allocations that early means that reclaim is not going |
---|
| 2614 | + * to help and it may even be impossible to reclaim the |
---|
| 2615 | + * boosted watermark resulting in a hang. |
---|
| 2616 | + */ |
---|
| 2617 | + if (!max_boost) |
---|
| 2618 | + return false; |
---|
| 2619 | + |
---|
| 2620 | + max_boost = max(pageblock_nr_pages, max_boost); |
---|
| 2621 | + |
---|
| 2622 | + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
---|
| 2623 | + max_boost); |
---|
| 2624 | + |
---|
| 2625 | + return true; |
---|
| 2626 | +} |
---|
| 2627 | + |
---|
2232 | 2628 | /* |
---|
2233 | 2629 | * This function implements actual steal behaviour. If order is large enough, |
---|
2234 | 2630 | * we can steal whole pageblock. If not, we first move freepages in this |
---|
.. | .. |
---|
2238 | 2634 | * itself, so pages freed in the future will be put on the correct free list. |
---|
2239 | 2635 | */ |
---|
2240 | 2636 | static void steal_suitable_fallback(struct zone *zone, struct page *page, |
---|
2241 | | - int start_type, bool whole_block) |
---|
| 2637 | + unsigned int alloc_flags, int start_type, bool whole_block) |
---|
2242 | 2638 | { |
---|
2243 | | - unsigned int current_order = page_order(page); |
---|
2244 | | - struct free_area *area; |
---|
| 2639 | + unsigned int current_order = buddy_order(page); |
---|
2245 | 2640 | int free_pages, movable_pages, alike_pages; |
---|
2246 | 2641 | int old_block_type; |
---|
2247 | 2642 | |
---|
.. | .. |
---|
2259 | 2654 | change_pageblock_range(page, current_order, start_type); |
---|
2260 | 2655 | goto single_page; |
---|
2261 | 2656 | } |
---|
| 2657 | + |
---|
| 2658 | + /* |
---|
| 2659 | + * Boost watermarks to increase reclaim pressure to reduce the |
---|
| 2660 | + * likelihood of future fallbacks. Wake kswapd now as the node |
---|
| 2661 | + * may be balanced overall and kswapd will not wake naturally. |
---|
| 2662 | + */ |
---|
| 2663 | + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
---|
| 2664 | + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
---|
2262 | 2665 | |
---|
2263 | 2666 | /* We are not allowed to try stealing from the whole block */ |
---|
2264 | 2667 | if (!whole_block) |
---|
.. | .. |
---|
2303 | 2706 | return; |
---|
2304 | 2707 | |
---|
2305 | 2708 | single_page: |
---|
2306 | | - area = &zone->free_area[current_order]; |
---|
2307 | | - list_move(&page->lru, &area->free_list[start_type]); |
---|
| 2709 | + move_to_free_list(page, zone, current_order, start_type); |
---|
2308 | 2710 | } |
---|
2309 | 2711 | |
---|
2310 | 2712 | /* |
---|
.. | .. |
---|
2328 | 2730 | if (fallback_mt == MIGRATE_TYPES) |
---|
2329 | 2731 | break; |
---|
2330 | 2732 | |
---|
2331 | | - if (list_empty(&area->free_list[fallback_mt])) |
---|
| 2733 | + if (free_area_empty(area, fallback_mt)) |
---|
2332 | 2734 | continue; |
---|
2333 | 2735 | |
---|
2334 | 2736 | if (can_steal_fallback(order, migratetype)) |
---|
.. | .. |
---|
2358 | 2760 | * Limit the number reserved to 1 pageblock or roughly 1% of a zone. |
---|
2359 | 2761 | * Check is race-prone but harmless. |
---|
2360 | 2762 | */ |
---|
2361 | | - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; |
---|
| 2763 | + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; |
---|
2362 | 2764 | if (zone->nr_reserved_highatomic >= max_managed) |
---|
2363 | 2765 | return; |
---|
2364 | 2766 | |
---|
.. | .. |
---|
2400 | 2802 | struct page *page; |
---|
2401 | 2803 | int order; |
---|
2402 | 2804 | bool ret; |
---|
| 2805 | + bool skip_unreserve_highatomic = false; |
---|
2403 | 2806 | |
---|
2404 | | - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
---|
| 2807 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
---|
2405 | 2808 | ac->nodemask) { |
---|
2406 | 2809 | /* |
---|
2407 | 2810 | * Preserve at least one pageblock unless memory pressure |
---|
.. | .. |
---|
2411 | 2814 | pageblock_nr_pages) |
---|
2412 | 2815 | continue; |
---|
2413 | 2816 | |
---|
| 2817 | + trace_android_vh_unreserve_highatomic_bypass(force, zone, |
---|
| 2818 | + &skip_unreserve_highatomic); |
---|
| 2819 | + if (skip_unreserve_highatomic) |
---|
| 2820 | + continue; |
---|
| 2821 | + |
---|
2414 | 2822 | spin_lock_irqsave(&zone->lock, flags); |
---|
2415 | 2823 | for (order = 0; order < MAX_ORDER; order++) { |
---|
2416 | 2824 | struct free_area *area = &(zone->free_area[order]); |
---|
2417 | 2825 | |
---|
2418 | | - page = list_first_entry_or_null( |
---|
2419 | | - &area->free_list[MIGRATE_HIGHATOMIC], |
---|
2420 | | - struct page, lru); |
---|
| 2826 | + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); |
---|
2421 | 2827 | if (!page) |
---|
2422 | 2828 | continue; |
---|
2423 | 2829 | |
---|
.. | .. |
---|
2475 | 2881 | * condition simpler. |
---|
2476 | 2882 | */ |
---|
2477 | 2883 | static __always_inline bool |
---|
2478 | | -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
---|
| 2884 | +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
---|
| 2885 | + unsigned int alloc_flags) |
---|
2479 | 2886 | { |
---|
2480 | 2887 | struct free_area *area; |
---|
2481 | 2888 | int current_order; |
---|
| 2889 | + int min_order = order; |
---|
2482 | 2890 | struct page *page; |
---|
2483 | 2891 | int fallback_mt; |
---|
2484 | 2892 | bool can_steal; |
---|
| 2893 | + |
---|
| 2894 | + /* |
---|
| 2895 | + * Do not steal pages from freelists belonging to other pageblocks |
---|
| 2896 | + * i.e. orders < pageblock_order. If there are no local zones free, |
---|
| 2897 | + * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
---|
| 2898 | + */ |
---|
| 2899 | + if (alloc_flags & ALLOC_NOFRAGMENT) |
---|
| 2900 | + min_order = pageblock_order; |
---|
2485 | 2901 | |
---|
2486 | 2902 | /* |
---|
2487 | 2903 | * Find the largest available free page in the other list. This roughly |
---|
2488 | 2904 | * approximates finding the pageblock with the most free pages, which |
---|
2489 | 2905 | * would be too costly to do exactly. |
---|
2490 | 2906 | */ |
---|
2491 | | - for (current_order = MAX_ORDER - 1; current_order >= order; |
---|
| 2907 | + for (current_order = MAX_ORDER - 1; current_order >= min_order; |
---|
2492 | 2908 | --current_order) { |
---|
2493 | 2909 | area = &(zone->free_area[current_order]); |
---|
2494 | 2910 | fallback_mt = find_suitable_fallback(area, current_order, |
---|
.. | .. |
---|
2530 | 2946 | VM_BUG_ON(current_order == MAX_ORDER); |
---|
2531 | 2947 | |
---|
2532 | 2948 | do_steal: |
---|
2533 | | - page = list_first_entry(&area->free_list[fallback_mt], |
---|
2534 | | - struct page, lru); |
---|
| 2949 | + page = get_page_from_free_area(area, fallback_mt); |
---|
2535 | 2950 | |
---|
2536 | | - steal_suitable_fallback(zone, page, start_migratetype, can_steal); |
---|
| 2951 | + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, |
---|
| 2952 | + can_steal); |
---|
2537 | 2953 | |
---|
2538 | 2954 | trace_mm_page_alloc_extfrag(page, order, current_order, |
---|
2539 | 2955 | start_migratetype, fallback_mt); |
---|
.. | .. |
---|
2547 | 2963 | * Call me with the zone->lock already held. |
---|
2548 | 2964 | */ |
---|
2549 | 2965 | static __always_inline struct page * |
---|
2550 | | -__rmqueue(struct zone *zone, unsigned int order, int migratetype) |
---|
| 2966 | +__rmqueue(struct zone *zone, unsigned int order, int migratetype, |
---|
| 2967 | + unsigned int alloc_flags) |
---|
2551 | 2968 | { |
---|
2552 | 2969 | struct page *page; |
---|
2553 | 2970 | |
---|
2554 | 2971 | retry: |
---|
2555 | 2972 | page = __rmqueue_smallest(zone, order, migratetype); |
---|
2556 | 2973 | |
---|
2557 | | - if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype)) |
---|
| 2974 | + if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype, |
---|
| 2975 | + alloc_flags)) |
---|
2558 | 2976 | goto retry; |
---|
2559 | 2977 | |
---|
2560 | 2978 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
---|
.. | .. |
---|
2562 | 2980 | } |
---|
2563 | 2981 | |
---|
2564 | 2982 | #ifdef CONFIG_CMA |
---|
2565 | | -static struct page *__rmqueue_cma(struct zone *zone, unsigned int order) |
---|
| 2983 | +static struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
---|
| 2984 | + int migratetype, |
---|
| 2985 | + unsigned int alloc_flags) |
---|
2566 | 2986 | { |
---|
2567 | | - struct page *page = 0; |
---|
2568 | | - |
---|
2569 | | - if (IS_ENABLED(CONFIG_CMA)) |
---|
2570 | | - if (!zone->cma_alloc) |
---|
2571 | | - page = __rmqueue_cma_fallback(zone, order); |
---|
| 2987 | + struct page *page = __rmqueue_cma_fallback(zone, order); |
---|
2572 | 2988 | trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); |
---|
2573 | 2989 | return page; |
---|
2574 | 2990 | } |
---|
2575 | 2991 | #else |
---|
2576 | | -static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order) |
---|
| 2992 | +static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
---|
| 2993 | + int migratetype, |
---|
| 2994 | + unsigned int alloc_flags) |
---|
2577 | 2995 | { |
---|
2578 | 2996 | return NULL; |
---|
2579 | 2997 | } |
---|
.. | .. |
---|
2586 | 3004 | */ |
---|
2587 | 3005 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
---|
2588 | 3006 | unsigned long count, struct list_head *list, |
---|
2589 | | - int migratetype) |
---|
| 3007 | + int migratetype, unsigned int alloc_flags) |
---|
2590 | 3008 | { |
---|
2591 | 3009 | int i, alloced = 0; |
---|
2592 | 3010 | |
---|
.. | .. |
---|
2594 | 3012 | for (i = 0; i < count; ++i) { |
---|
2595 | 3013 | struct page *page; |
---|
2596 | 3014 | |
---|
2597 | | - /* |
---|
2598 | | - * If migrate type CMA is being requested only try to |
---|
2599 | | - * satisfy the request with CMA pages to try and increase |
---|
2600 | | - * CMA utlization. |
---|
2601 | | - */ |
---|
2602 | 3015 | if (is_migrate_cma(migratetype)) |
---|
2603 | | - page = __rmqueue_cma(zone, order); |
---|
| 3016 | + page = __rmqueue_cma(zone, order, migratetype, |
---|
| 3017 | + alloc_flags); |
---|
2604 | 3018 | else |
---|
2605 | | - page = __rmqueue(zone, order, migratetype); |
---|
| 3019 | + page = __rmqueue(zone, order, migratetype, alloc_flags); |
---|
2606 | 3020 | |
---|
2607 | 3021 | if (unlikely(page == NULL)) |
---|
2608 | 3022 | break; |
---|
.. | .. |
---|
2645 | 3059 | */ |
---|
2646 | 3060 | static struct list_head *get_populated_pcp_list(struct zone *zone, |
---|
2647 | 3061 | unsigned int order, struct per_cpu_pages *pcp, |
---|
2648 | | - int migratetype) |
---|
| 3062 | + int migratetype, unsigned int alloc_flags) |
---|
2649 | 3063 | { |
---|
2650 | 3064 | struct list_head *list = &pcp->lists[migratetype]; |
---|
2651 | 3065 | |
---|
2652 | 3066 | if (list_empty(list)) { |
---|
| 3067 | + trace_android_vh_rmqueue_bulk_bypass(order, pcp, migratetype, list); |
---|
| 3068 | + if (!list_empty(list)) |
---|
| 3069 | + return list; |
---|
| 3070 | + |
---|
2653 | 3071 | pcp->count += rmqueue_bulk(zone, order, |
---|
2654 | 3072 | pcp->batch, list, |
---|
2655 | | - migratetype); |
---|
| 3073 | + migratetype, alloc_flags); |
---|
2656 | 3074 | |
---|
2657 | 3075 | if (list_empty(list)) |
---|
2658 | 3076 | list = NULL; |
---|
.. | .. |
---|
2739 | 3157 | |
---|
2740 | 3158 | static void drain_local_pages_wq(struct work_struct *work) |
---|
2741 | 3159 | { |
---|
| 3160 | + struct pcpu_drain *drain; |
---|
| 3161 | + |
---|
| 3162 | + drain = container_of(work, struct pcpu_drain, work); |
---|
| 3163 | + |
---|
2742 | 3164 | /* |
---|
2743 | 3165 | * drain_all_pages doesn't use proper cpu hotplug protection so |
---|
2744 | 3166 | * we can race with cpu offline when the WQ can move this from |
---|
.. | .. |
---|
2747 | 3169 | * a different one. |
---|
2748 | 3170 | */ |
---|
2749 | 3171 | preempt_disable(); |
---|
2750 | | - drain_local_pages(NULL); |
---|
| 3172 | + drain_local_pages(drain->zone); |
---|
2751 | 3173 | preempt_enable(); |
---|
2752 | 3174 | } |
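The per-cpu work item now carries its target zone, and drain_local_pages_wq() recovers the enclosing pcpu_drain from the work_struct pointer with container_of(), which is what lets drain_all_pages() target a single zone instead of the old NULL "all zones" argument. A minimal userspace sketch of that embedding pattern; the struct names mirror the patch but everything below is illustrative, not kernel code:

    #include <stddef.h>
    #include <stdio.h>

    struct zone { const char *name; };
    struct work_struct { int pending; };

    struct pcpu_drain {
            struct zone *zone;        /* which zone this CPU should drain */
            struct work_struct work;  /* embedded work item, queued per CPU */
    };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Models drain_local_pages_wq(): recover the wrapper from the embedded member. */
    static void drain_local_pages_wq(struct work_struct *work)
    {
            struct pcpu_drain *drain = container_of(work, struct pcpu_drain, work);

            printf("draining pcp pages of zone %s\n", drain->zone->name);
    }

    int main(void)
    {
            struct zone normal = { .name = "Normal" };
            struct pcpu_drain drain = { .zone = &normal };

            drain_local_pages_wq(&drain.work); /* the workqueue passes this pointer */
            return 0;
    }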
---|
2753 | 3175 | |
---|
.. | .. |
---|
2818 | 3240 | } |
---|
2819 | 3241 | |
---|
2820 | 3242 | for_each_cpu(cpu, &cpus_with_pcps) { |
---|
2821 | | - struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); |
---|
2822 | | - INIT_WORK(work, drain_local_pages_wq); |
---|
2823 | | - queue_work_on(cpu, mm_percpu_wq, work); |
---|
| 3243 | + struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); |
---|
| 3244 | + |
---|
| 3245 | + drain->zone = zone; |
---|
| 3246 | + INIT_WORK(&drain->work, drain_local_pages_wq); |
---|
| 3247 | + queue_work_on(cpu, mm_percpu_wq, &drain->work); |
---|
2824 | 3248 | } |
---|
2825 | 3249 | for_each_cpu(cpu, &cpus_with_pcps) |
---|
2826 | | - flush_work(per_cpu_ptr(&pcpu_drain, cpu)); |
---|
| 3250 | + flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); |
---|
2827 | 3251 | |
---|
2828 | 3252 | mutex_unlock(&pcpu_drain_mutex); |
---|
2829 | 3253 | } |
---|
.. | .. |
---|
2900 | 3324 | struct zone *zone = page_zone(page); |
---|
2901 | 3325 | struct per_cpu_pages *pcp; |
---|
2902 | 3326 | int migratetype; |
---|
| 3327 | + bool pcp_skip_cma_pages = false; |
---|
2903 | 3328 | |
---|
2904 | 3329 | migratetype = get_pcppage_migratetype(page); |
---|
2905 | 3330 | __count_vm_event(PGFREE); |
---|
.. | .. |
---|
2912 | 3337 | * excessively into the page allocator |
---|
2913 | 3338 | */ |
---|
2914 | 3339 | if (migratetype >= MIGRATE_PCPTYPES) { |
---|
2915 | | - if (unlikely(is_migrate_isolate(migratetype))) { |
---|
2916 | | - free_one_page(zone, page, pfn, 0, migratetype); |
---|
| 3340 | + trace_android_vh_pcplist_add_cma_pages_bypass(migratetype, |
---|
| 3341 | + &pcp_skip_cma_pages); |
---|
| 3342 | + if (unlikely(is_migrate_isolate(migratetype)) || |
---|
| 3343 | + pcp_skip_cma_pages) { |
---|
| 3344 | + free_one_page(zone, page, pfn, 0, migratetype, |
---|
| 3345 | + FPI_NONE); |
---|
2917 | 3346 | return; |
---|
2918 | 3347 | } |
---|
2919 | 3348 | migratetype = MIGRATE_MOVABLE; |
---|
.. | .. |
---|
2935 | 3364 | { |
---|
2936 | 3365 | unsigned long flags; |
---|
2937 | 3366 | unsigned long pfn = page_to_pfn(page); |
---|
| 3367 | + int migratetype; |
---|
| 3368 | + bool skip_free_unref_page = false; |
---|
2938 | 3369 | |
---|
2939 | 3370 | if (!free_unref_page_prepare(page, pfn)) |
---|
| 3371 | + return; |
---|
| 3372 | + |
---|
| 3373 | + migratetype = get_pfnblock_migratetype(page, pfn); |
---|
| 3374 | + trace_android_vh_free_unref_page_bypass(page, 0, migratetype, &skip_free_unref_page); |
---|
| 3375 | + if (skip_free_unref_page) |
---|
2940 | 3376 | return; |
---|
2941 | 3377 | |
---|
2942 | 3378 | local_irq_save(flags); |
---|
.. | .. |
---|
2999 | 3435 | |
---|
3000 | 3436 | for (i = 1; i < (1 << order); i++) |
---|
3001 | 3437 | set_page_refcounted(page + i); |
---|
3002 | | - split_page_owner(page, order); |
---|
| 3438 | + split_page_owner(page, 1 << order); |
---|
| 3439 | + split_page_memcg(page, 1 << order); |
---|
3003 | 3440 | } |
---|
3004 | 3441 | EXPORT_SYMBOL_GPL(split_page); |
---|
3005 | 3442 | |
---|
.. | .. |
---|
3021 | 3458 | * watermark, because we already know our high-order page |
---|
3022 | 3459 | * exists. |
---|
3023 | 3460 | */ |
---|
3024 | | - watermark = min_wmark_pages(zone) + (1UL << order); |
---|
| 3461 | + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); |
---|
3025 | 3462 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) |
---|
3026 | 3463 | return 0; |
---|
3027 | 3464 | |
---|
.. | .. |
---|
3029 | 3466 | } |
---|
3030 | 3467 | |
---|
3031 | 3468 | /* Remove page from free list */ |
---|
3032 | | - list_del(&page->lru); |
---|
3033 | | - zone->free_area[order].nr_free--; |
---|
3034 | | - rmv_page_order(page); |
---|
| 3469 | + |
---|
| 3470 | + del_page_from_free_list(page, zone, order); |
---|
3035 | 3471 | |
---|
3036 | 3472 | /* |
---|
3037 | 3473 | * Set the pageblock if the isolated page is at least half of a |
---|
.. | .. |
---|
3050 | 3486 | |
---|
3051 | 3487 | |
---|
3052 | 3488 | return 1UL << order; |
---|
| 3489 | +} |
---|
| 3490 | + |
---|
| 3491 | +/** |
---|
| 3492 | + * __putback_isolated_page - Return a now-isolated page back where we got it |
---|
| 3493 | + * @page: Page that was isolated |
---|
| 3494 | + * @order: Order of the isolated page |
---|
| 3495 | + * @mt: The page's pageblock's migratetype |
---|
| 3496 | + * |
---|
| 3497 | + * This function is meant to return a page pulled from the free lists via |
---|
| 3498 | + * __isolate_free_page back to the free lists they were pulled from. |
---|
| 3499 | + */ |
---|
| 3500 | +void __putback_isolated_page(struct page *page, unsigned int order, int mt) |
---|
| 3501 | +{ |
---|
| 3502 | + struct zone *zone = page_zone(page); |
---|
| 3503 | + |
---|
| 3504 | + /* zone lock should be held when this function is called */ |
---|
| 3505 | + lockdep_assert_held(&zone->lock); |
---|
| 3506 | + |
---|
| 3507 | + /* Return isolated page to tail of freelist. */ |
---|
| 3508 | + __free_one_page(page, page_to_pfn(page), zone, order, mt, |
---|
| 3509 | + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); |
---|
3053 | 3510 | } |
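As the kernel-doc above notes, __putback_isolated_page() must run with zone->lock held (the lockdep assertion enforces it) and hands the page back to the tail of its original freelist without waking free page reporting. A hedged sketch of the calling convention only; the wrapper function below is hypothetical and not part of the patch:

    /* Hypothetical helper: return one previously isolated page (sketch only). */
    static void putback_one_isolated(struct zone *zone, struct page *page,
                                     unsigned int order, int mt)
    {
            unsigned long flags;

            spin_lock_irqsave(&zone->lock, flags);
            __putback_isolated_page(page, order, mt); /* requires zone->lock */
            spin_unlock_irqrestore(&zone->lock, flags);
    }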
---|
3054 | 3511 | |
---|
3055 | 3512 | /* |
---|
.. | .. |
---|
3081 | 3538 | |
---|
3082 | 3539 | /* Remove page from the per-cpu list, caller must protect the list */ |
---|
3083 | 3540 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, |
---|
| 3541 | + unsigned int alloc_flags, |
---|
3084 | 3542 | struct per_cpu_pages *pcp, |
---|
3085 | 3543 | gfp_t gfp_flags) |
---|
3086 | 3544 | { |
---|
.. | .. |
---|
3090 | 3548 | do { |
---|
3091 | 3549 | /* First try to get CMA pages */ |
---|
3092 | 3550 | if (migratetype == MIGRATE_MOVABLE && |
---|
3093 | | - gfp_flags & __GFP_CMA) { |
---|
| 3551 | + alloc_flags & ALLOC_CMA) { |
---|
3094 | 3552 | list = get_populated_pcp_list(zone, 0, pcp, |
---|
3095 | | - get_cma_migrate_type()); |
---|
| 3553 | + get_cma_migrate_type(), alloc_flags); |
---|
3096 | 3554 | } |
---|
3097 | 3555 | |
---|
3098 | 3556 | if (list == NULL) { |
---|
.. | .. |
---|
3101 | 3559 | * free CMA pages. |
---|
3102 | 3560 | */ |
---|
3103 | 3561 | list = get_populated_pcp_list(zone, 0, pcp, |
---|
3104 | | - migratetype); |
---|
| 3562 | + migratetype, alloc_flags); |
---|
3105 | 3563 | if (unlikely(list == NULL) || |
---|
3106 | 3564 | unlikely(list_empty(list))) |
---|
3107 | 3565 | return NULL; |
---|
.. | .. |
---|
3117 | 3575 | |
---|
3118 | 3576 | /* Lock and remove page from the per-cpu list */ |
---|
3119 | 3577 | static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
---|
3120 | | - struct zone *zone, unsigned int order, |
---|
3121 | | - gfp_t gfp_flags, int migratetype) |
---|
| 3578 | + struct zone *zone, gfp_t gfp_flags, |
---|
| 3579 | + int migratetype, unsigned int alloc_flags) |
---|
3122 | 3580 | { |
---|
3123 | 3581 | struct per_cpu_pages *pcp; |
---|
3124 | 3582 | struct page *page; |
---|
.. | .. |
---|
3126 | 3584 | |
---|
3127 | 3585 | local_irq_save(flags); |
---|
3128 | 3586 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
---|
3129 | | - page = __rmqueue_pcplist(zone, migratetype, pcp, |
---|
| 3587 | + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, |
---|
3130 | 3588 | gfp_flags); |
---|
3131 | 3589 | if (page) { |
---|
3132 | | - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
---|
| 3590 | + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); |
---|
3133 | 3591 | zone_statistics(preferred_zone, zone); |
---|
3134 | 3592 | } |
---|
3135 | 3593 | local_irq_restore(flags); |
---|
.. | .. |
---|
3149 | 3607 | struct page *page; |
---|
3150 | 3608 | |
---|
3151 | 3609 | if (likely(order == 0)) { |
---|
3152 | | - page = rmqueue_pcplist(preferred_zone, zone, order, |
---|
3153 | | - gfp_flags, migratetype); |
---|
| 3610 | + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, |
---|
| 3611 | + migratetype, alloc_flags); |
---|
3154 | 3612 | goto out; |
---|
3155 | 3613 | } |
---|
3156 | 3614 | |
---|
.. | .. |
---|
3163 | 3621 | |
---|
3164 | 3622 | do { |
---|
3165 | 3623 | page = NULL; |
---|
3166 | | - |
---|
3167 | | - if (alloc_flags & ALLOC_HARDER) { |
---|
| 3624 | + /* |
---|
| 3625 | + * order-0 request can reach here when the pcplist is skipped |
---|
| 3626 | + * due to non-CMA allocation context. HIGHATOMIC area is |
---|
| 3627 | + * reserved for high-order atomic allocation, so order-0 |
---|
| 3628 | + * request should skip it. |
---|
| 3629 | + */ |
---|
| 3630 | + if (order > 0 && alloc_flags & ALLOC_HARDER) { |
---|
3168 | 3631 | page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); |
---|
3169 | 3632 | if (page) |
---|
3170 | 3633 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
---|
3171 | 3634 | } |
---|
3172 | | - |
---|
3173 | | - if (!page && migratetype == MIGRATE_MOVABLE && |
---|
3174 | | - gfp_flags & __GFP_CMA) |
---|
3175 | | - page = __rmqueue_cma(zone, order); |
---|
3176 | | - |
---|
3177 | | - if (!page) |
---|
3178 | | - page = __rmqueue(zone, order, migratetype); |
---|
| 3635 | + if (!page) { |
---|
| 3636 | + if (migratetype == MIGRATE_MOVABLE && |
---|
| 3637 | + alloc_flags & ALLOC_CMA) |
---|
| 3638 | + page = __rmqueue_cma(zone, order, migratetype, |
---|
| 3639 | + alloc_flags); |
---|
| 3640 | + if (!page) |
---|
| 3641 | + page = __rmqueue(zone, order, migratetype, |
---|
| 3642 | + alloc_flags); |
---|
| 3643 | + } |
---|
3179 | 3644 | } while (page && check_new_pages(page, order)); |
---|
3180 | | - |
---|
3181 | 3645 | spin_unlock(&zone->lock); |
---|
3182 | 3646 | if (!page) |
---|
3183 | 3647 | goto failed; |
---|
.. | .. |
---|
3186 | 3650 | |
---|
3187 | 3651 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
---|
3188 | 3652 | zone_statistics(preferred_zone, zone); |
---|
| 3653 | + trace_android_vh_rmqueue(preferred_zone, zone, order, |
---|
| 3654 | + gfp_flags, alloc_flags, migratetype); |
---|
3189 | 3655 | local_irq_restore(flags); |
---|
3190 | 3656 | |
---|
3191 | 3657 | out: |
---|
| 3658 | + /* Separate test+clear to avoid unnecessary atomics */ |
---|
| 3659 | + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { |
---|
| 3660 | + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
---|
| 3661 | + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); |
---|
| 3662 | + } |
---|
| 3663 | + |
---|
3192 | 3664 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); |
---|
3193 | 3665 | return page; |
---|
3194 | 3666 | |
---|
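The `ZONE_BOOSTED_WATERMARK` block added above deliberately tests the bit with a plain read before doing the atomic clear, so the common case (flag not set) costs no atomic read-modify-write. A user-space sketch of the same test-then-clear pattern using C11 atomics (names and the flag value are illustrative):

```c
#include <stdatomic.h>
#include <stdio.h>

/* Toy model of the "test, then clear" pattern: the plain load is cheap
 * and usually sees the flag clear, so the atomic RMW (and the wakeup)
 * only happens when the flag was actually set. */
static atomic_uint zone_flags;
#define BOOSTED_BIT 1u

static void maybe_wake_kswapd(void)
{
        if (atomic_load_explicit(&zone_flags, memory_order_relaxed) & BOOSTED_BIT) {
                atomic_fetch_and(&zone_flags, ~BOOSTED_BIT);
                printf("kswapd woken to deal with the boosted watermark\n");
        }
}

int main(void)
{
        maybe_wake_kswapd();                      /* flag clear: no atomic RMW */
        atomic_fetch_or(&zone_flags, BOOSTED_BIT);
        maybe_wake_kswapd();                      /* flag set: clear + wake */
        return 0;
}
```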
.. | .. |
---|
3218 | 3690 | } |
---|
3219 | 3691 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
---|
3220 | 3692 | |
---|
3221 | | -static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3693 | +static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
3222 | 3694 | { |
---|
3223 | 3695 | if (order < fail_page_alloc.min_order) |
---|
3224 | 3696 | return false; |
---|
.. | .. |
---|
3242 | 3714 | |
---|
3243 | 3715 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
---|
3244 | 3716 | &fail_page_alloc.attr); |
---|
3245 | | - if (IS_ERR(dir)) |
---|
3246 | | - return PTR_ERR(dir); |
---|
3247 | 3717 | |
---|
3248 | | - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
---|
3249 | | - &fail_page_alloc.ignore_gfp_reclaim)) |
---|
3250 | | - goto fail; |
---|
3251 | | - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
---|
3252 | | - &fail_page_alloc.ignore_gfp_highmem)) |
---|
3253 | | - goto fail; |
---|
3254 | | - if (!debugfs_create_u32("min-order", mode, dir, |
---|
3255 | | - &fail_page_alloc.min_order)) |
---|
3256 | | - goto fail; |
---|
| 3718 | + debugfs_create_bool("ignore-gfp-wait", mode, dir, |
---|
| 3719 | + &fail_page_alloc.ignore_gfp_reclaim); |
---|
| 3720 | + debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
---|
| 3721 | + &fail_page_alloc.ignore_gfp_highmem); |
---|
| 3722 | + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); |
---|
3257 | 3723 | |
---|
3258 | 3724 | return 0; |
---|
3259 | | -fail: |
---|
3260 | | - debugfs_remove_recursive(dir); |
---|
3261 | | - |
---|
3262 | | - return -ENOMEM; |
---|
3263 | 3725 | } |
---|
3264 | 3726 | |
---|
3265 | 3727 | late_initcall(fail_page_alloc_debugfs); |
---|
.. | .. |
---|
3268 | 3730 | |
---|
3269 | 3731 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
---|
3270 | 3732 | |
---|
3271 | | -static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3733 | +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
3272 | 3734 | { |
---|
3273 | 3735 | return false; |
---|
3274 | 3736 | } |
---|
3275 | 3737 | |
---|
3276 | 3738 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
---|
| 3739 | + |
---|
| 3740 | +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3741 | +{ |
---|
| 3742 | + return __should_fail_alloc_page(gfp_mask, order); |
---|
| 3743 | +} |
---|
| 3744 | +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); |
---|
| 3745 | + |
---|
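Splitting the policy into `__should_fail_alloc_page()` plus a `noinline` wrapper gives fault-injection tooling one stable, non-inlined symbol whose return value can be overridden. A rough user-space sketch of that wrapper idea, using a plain function pointer where the kernel uses `ALLOW_ERROR_INJECTION()` (everything below is an illustrative stand-in, not the kernel mechanism):

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy model: the real policy stays in a helper, while the noinline
 * wrapper is the single hook point whose result a test can override. */
static bool __should_fail_alloc(unsigned int order)
{
        (void)order;
        return false;            /* normal policy: never fail */
}

static bool (*fail_override)(unsigned int order);  /* test hook, normally NULL */

__attribute__((noinline))
static bool should_fail_alloc(unsigned int order)
{
        if (fail_override)
                return fail_override(order);
        return __should_fail_alloc(order);
}

static bool always_fail(unsigned int order) { (void)order; return true; }

int main(void)
{
        printf("%d\n", should_fail_alloc(0));   /* 0 */
        fail_override = always_fail;
        printf("%d\n", should_fail_alloc(0));   /* 1: injected failure */
        return 0;
}
```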
| 3746 | +static inline long __zone_watermark_unusable_free(struct zone *z, |
---|
| 3747 | + unsigned int order, unsigned int alloc_flags) |
---|
| 3748 | +{ |
---|
| 3749 | + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); |
---|
| 3750 | + long unusable_free = (1 << order) - 1; |
---|
| 3751 | + |
---|
| 3752 | + /* |
---|
| 3753 | + * If the caller does not have rights to ALLOC_HARDER then subtract |
---|
| 3754 | + * the high-atomic reserves. This will over-estimate the size of the |
---|
| 3755 | + * atomic reserve but it avoids a search. |
---|
| 3756 | + */ |
---|
| 3757 | + if (likely(!alloc_harder)) |
---|
| 3758 | + unusable_free += z->nr_reserved_highatomic; |
---|
| 3759 | + |
---|
| 3760 | +#ifdef CONFIG_CMA |
---|
| 3761 | + /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
| 3762 | + if (!(alloc_flags & ALLOC_CMA)) |
---|
| 3763 | + unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
| 3764 | +#endif |
---|
| 3765 | + |
---|
| 3766 | + return unusable_free; |
---|
| 3767 | +} |
---|
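`__zone_watermark_unusable_free()` gathers everything a request must not count as free: the rounding slack of `(1 << order) - 1`, the high-atomic reserve for callers without `ALLOC_HARDER`/`ALLOC_OOM`, and free CMA pages for callers without `ALLOC_CMA`. A stand-alone sketch of that sum, with made-up flag values:

```c
#include <stdio.h>

#define ALLOC_HARDER  0x10u
#define ALLOC_OOM     0x20u
#define ALLOC_CMA     0x40u          /* flag values are illustrative only */

/* Toy model of the "unusable free pages" calculation: pages that are
 * nominally free but that this particular request may not take. */
static long unusable_free(unsigned int order, unsigned int alloc_flags,
                          long highatomic_reserve, long free_cma)
{
        long unusable = (1L << order) - 1;   /* worst-case rounding loss */

        /* Callers without ALLOC_HARDER must leave the high-atomic pool alone. */
        if (!(alloc_flags & (ALLOC_HARDER | ALLOC_OOM)))
                unusable += highatomic_reserve;

        /* Requests that may not use CMA cannot count free CMA pages. */
        if (!(alloc_flags & ALLOC_CMA))
                unusable += free_cma;

        return unusable;
}

int main(void)
{
        /* order-3 request, no special rights, 512 high-atomic + 2048 CMA free */
        printf("%ld\n", unusable_free(3, 0, 512, 2048));          /* 7 + 512 + 2048 */
        printf("%ld\n", unusable_free(3, ALLOC_CMA, 512, 2048));  /* 7 + 512 */
        return 0;
}
```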
3277 | 3768 | |
---|
3278 | 3769 | /* |
---|
3279 | 3770 | * Return true if free base pages are above 'mark'. For high-order checks it |
---|
.. | .. |
---|
3282 | 3773 | * to check in the allocation paths if no pages are free. |
---|
3283 | 3774 | */ |
---|
3284 | 3775 | bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
---|
3285 | | - int classzone_idx, unsigned int alloc_flags, |
---|
| 3776 | + int highest_zoneidx, unsigned int alloc_flags, |
---|
3286 | 3777 | long free_pages) |
---|
3287 | 3778 | { |
---|
3288 | 3779 | long min = mark; |
---|
.. | .. |
---|
3290 | 3781 | const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); |
---|
3291 | 3782 | |
---|
3292 | 3783 | /* free_pages may go negative - that's OK */ |
---|
3293 | | - free_pages -= (1 << order) - 1; |
---|
| 3784 | + free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); |
---|
3294 | 3785 | |
---|
3295 | 3786 | if (alloc_flags & ALLOC_HIGH) |
---|
3296 | 3787 | min -= min / 2; |
---|
3297 | 3788 | |
---|
3298 | | - /* |
---|
3299 | | - * If the caller does not have rights to ALLOC_HARDER then subtract |
---|
3300 | | - * the high-atomic reserves. This will over-estimate the size of the |
---|
3301 | | - * atomic reserve but it avoids a search. |
---|
3302 | | - */ |
---|
3303 | | - if (likely(!alloc_harder)) { |
---|
3304 | | - free_pages -= z->nr_reserved_highatomic; |
---|
3305 | | - } else { |
---|
| 3789 | + if (unlikely(alloc_harder)) { |
---|
3306 | 3790 | /* |
---|
3307 | 3791 | * OOM victims can try even harder than normal ALLOC_HARDER |
---|
3308 | 3792 | * users on the grounds that it's definitely going to be in |
---|
.. | .. |
---|
3315 | 3799 | min -= min / 4; |
---|
3316 | 3800 | } |
---|
3317 | 3801 | |
---|
3318 | | - |
---|
3319 | | -#ifdef CONFIG_CMA |
---|
3320 | | - /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
3321 | | - if (!(alloc_flags & ALLOC_CMA)) |
---|
3322 | | - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
3323 | | -#endif |
---|
3324 | | - |
---|
3325 | 3802 | /* |
---|
3326 | 3803 | * Check watermarks for an order-0 allocation request. If these |
---|
3327 | 3804 | * are not met, then a high-order request also cannot go ahead |
---|
3328 | 3805 | * even if a suitable page happened to be free. |
---|
3329 | 3806 | */ |
---|
3330 | | - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
---|
| 3807 | + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) |
---|
3331 | 3808 | return false; |
---|
3332 | 3809 | |
---|
3333 | 3810 | /* If this is an order-0 request then the watermark is fine */ |
---|
.. | .. |
---|
3351 | 3828 | if (mt == MIGRATE_CMA) |
---|
3352 | 3829 | continue; |
---|
3353 | 3830 | #endif |
---|
3354 | | - if (!list_empty(&area->free_list[mt])) |
---|
| 3831 | + if (!free_area_empty(area, mt)) |
---|
3355 | 3832 | return true; |
---|
3356 | 3833 | } |
---|
3357 | 3834 | |
---|
3358 | 3835 | #ifdef CONFIG_CMA |
---|
3359 | 3836 | if ((alloc_flags & ALLOC_CMA) && |
---|
3360 | | - !list_empty(&area->free_list[MIGRATE_CMA])) { |
---|
| 3837 | + !free_area_empty(area, MIGRATE_CMA)) { |
---|
3361 | 3838 | return true; |
---|
3362 | 3839 | } |
---|
3363 | 3840 | #endif |
---|
3364 | | - if (alloc_harder && |
---|
3365 | | - !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) |
---|
| 3841 | + if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) |
---|
3366 | 3842 | return true; |
---|
3367 | 3843 | } |
---|
3368 | 3844 | return false; |
---|
3369 | 3845 | } |
---|
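After subtracting the unusable pages, `__zone_watermark_ok()` compares what is left against a mark that is relaxed for `ALLOC_HIGH` and `ALLOC_HARDER`/`ALLOC_OOM` callers, plus the lowmem reserve for the requesting class. A simplified order-0 model of that comparison (the reductions below approximate, rather than reproduce, the kernel's exact steps):

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the order-0 part of the watermark check: after dropping
 * unusable pages, the remaining free pages must exceed the (possibly
 * reduced) mark plus the lowmem reserve. */
static bool order0_ok(long free_pages, long unusable, long mark,
                      long lowmem_reserve, bool alloc_high, bool alloc_harder)
{
        long min = mark;

        free_pages -= unusable;
        if (alloc_high)
                min -= min / 2;     /* __GFP_HIGH callers may dip deeper */
        if (alloc_harder)
                min -= min / 4;     /* atomic/rt callers deeper still */

        return free_pages > min + lowmem_reserve;
}

int main(void)
{
        /* 6000 free, 2000 of them unusable, min mark 4000, reserve 500:
         * an ordinary request fails, a __GFP_HIGH + harder request passes. */
        printf("%d\n", order0_ok(6000, 2000, 4000, 500, false, false)); /* 0 */
        printf("%d\n", order0_ok(6000, 2000, 4000, 500, true, true));   /* 1 */
        return 0;
}
```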
3370 | 3846 | |
---|
3371 | 3847 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
---|
3372 | | - int classzone_idx, unsigned int alloc_flags) |
---|
| 3848 | + int highest_zoneidx, unsigned int alloc_flags) |
---|
3373 | 3849 | { |
---|
3374 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
---|
| 3850 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
---|
3375 | 3851 | zone_page_state(z, NR_FREE_PAGES)); |
---|
3376 | 3852 | } |
---|
| 3853 | +EXPORT_SYMBOL_GPL(zone_watermark_ok); |
---|
3377 | 3854 | |
---|
3378 | 3855 | static inline bool zone_watermark_fast(struct zone *z, unsigned int order, |
---|
3379 | | - unsigned long mark, int classzone_idx, unsigned int alloc_flags) |
---|
| 3856 | + unsigned long mark, int highest_zoneidx, |
---|
| 3857 | + unsigned int alloc_flags, gfp_t gfp_mask) |
---|
3380 | 3858 | { |
---|
3381 | | - long free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3382 | | - long cma_pages = 0; |
---|
| 3859 | + long free_pages; |
---|
3383 | 3860 | |
---|
3384 | | -#ifdef CONFIG_CMA |
---|
3385 | | - /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
3386 | | - if (!(alloc_flags & ALLOC_CMA)) |
---|
3387 | | - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
3388 | | -#endif |
---|
| 3861 | + free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3389 | 3862 | |
---|
3390 | 3863 | /* |
---|
3391 | 3864 | * Fast check for order-0 only. If this fails then the reserves |
---|
3392 | | - * need to be calculated. There is a corner case where the check |
---|
3393 | | - * passes but only the high-order atomic reserve are free. If |
---|
3394 | | - * the caller is !atomic then it'll uselessly search the free |
---|
3395 | | - * list. That corner case is then slower but it is harmless. |
---|
| 3865 | + * need to be calculated. |
---|
3396 | 3866 | */ |
---|
3397 | | - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) |
---|
3398 | | - return true; |
---|
| 3867 | + if (!order) { |
---|
| 3868 | + long usable_free; |
---|
| 3869 | + long reserved; |
---|
3399 | 3870 | |
---|
3400 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
---|
3401 | | - free_pages); |
---|
| 3871 | + usable_free = free_pages; |
---|
| 3872 | + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); |
---|
| 3873 | + |
---|
| 3874 | + /* reserved may over estimate high-atomic reserves. */ |
---|
| 3875 | + usable_free -= min(usable_free, reserved); |
---|
| 3876 | + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) |
---|
| 3877 | + return true; |
---|
| 3878 | + } |
---|
| 3879 | + |
---|
| 3880 | + if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
---|
| 3881 | + free_pages)) |
---|
| 3882 | + return true; |
---|
| 3883 | + /* |
---|
| 3884 | + * Ignore watermark boosting for GFP_ATOMIC order-0 allocations |
---|
| 3885 | + * when checking the min watermark. The min watermark is the |
---|
| 3886 | + * point where boosting is ignored so that kswapd is woken up |
---|
| 3887 | + * when below the low watermark. |
---|
| 3888 | + */ |
---|
| 3889 | + if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost |
---|
| 3890 | + && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { |
---|
| 3891 | + mark = z->_watermark[WMARK_MIN]; |
---|
| 3892 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, |
---|
| 3893 | + alloc_flags, free_pages); |
---|
| 3894 | + } |
---|
| 3895 | + |
---|
| 3896 | + return false; |
---|
3402 | 3897 | } |
---|
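`zone_watermark_fast()` keeps a cheap order-0 early accept, falls back to the full check, and finally lets `__GFP_ATOMIC` order-0 requests retry against the un-boosted minimum watermark. A rough sketch of that three-step structure (a deliberately simplified model, not the kernel logic):

```c
#include <stdbool.h>
#include <stdio.h>

static bool full_check(long free, long mark, long reserve)
{
        return free > mark + reserve;       /* stand-in for __zone_watermark_ok() */
}

/* Toy model: cheap early accept, expensive fallback, and one more try
 * for atomic order-0 requests against the un-boosted minimum mark. */
static bool watermark_fast(long free, long unusable, long low_mark,
                           long min_mark, long reserve, bool atomic_order0)
{
        long usable = free - (unusable < free ? unusable : free);

        if (usable > low_mark + reserve)    /* cheap early accept */
                return true;
        if (full_check(free - unusable, low_mark, reserve))
                return true;
        /* Boosted watermarks are ignored for atomic order-0 requests. */
        if (atomic_order0)
                return full_check(free - unusable, min_mark, reserve);
        return false;
}

int main(void)
{
        /* min mark 2000 but boosted mark 6000: only the atomic request passes. */
        printf("%d\n", watermark_fast(5000, 500, 6000, 2000, 0, false)); /* 0 */
        printf("%d\n", watermark_fast(5000, 500, 6000, 2000, 0, true));  /* 1 */
        return 0;
}
```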
3403 | 3898 | |
---|
3404 | 3899 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
---|
3405 | | - unsigned long mark, int classzone_idx) |
---|
| 3900 | + unsigned long mark, int highest_zoneidx) |
---|
3406 | 3901 | { |
---|
3407 | 3902 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3408 | 3903 | |
---|
3409 | 3904 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
---|
3410 | 3905 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
---|
3411 | 3906 | |
---|
3412 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, |
---|
| 3907 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, |
---|
3413 | 3908 | free_pages); |
---|
3414 | 3909 | } |
---|
3415 | 3910 | EXPORT_SYMBOL_GPL(zone_watermark_ok_safe); |
---|
.. | .. |
---|
3418 | 3913 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
---|
3419 | 3914 | { |
---|
3420 | 3915 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= |
---|
3421 | | - RECLAIM_DISTANCE; |
---|
| 3916 | + node_reclaim_distance; |
---|
3422 | 3917 | } |
---|
3423 | 3918 | #else /* CONFIG_NUMA */ |
---|
3424 | 3919 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
---|
.. | .. |
---|
3428 | 3923 | #endif /* CONFIG_NUMA */ |
---|
3429 | 3924 | |
---|
3430 | 3925 | /* |
---|
| 3926 | + * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid |
---|
| 3927 | + * fragmentation is subtle. If the preferred zone was HIGHMEM then |
---|
| 3928 | + * premature use of a lower zone may cause lowmem pressure problems that |
---|
| 3929 | + * are worse than fragmentation. If the next zone is ZONE_DMA then it is |
---|
| 3930 | + * probably too small. It only makes sense to spread allocations to avoid |
---|
| 3931 | + * fragmentation between the Normal and DMA32 zones. |
---|
| 3932 | + */ |
---|
| 3933 | +static inline unsigned int |
---|
| 3934 | +alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) |
---|
| 3935 | +{ |
---|
| 3936 | + unsigned int alloc_flags; |
---|
| 3937 | + |
---|
| 3938 | + /* |
---|
| 3939 | + * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
---|
| 3940 | + * to save a branch. |
---|
| 3941 | + */ |
---|
| 3942 | + alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); |
---|
| 3943 | + |
---|
| 3944 | +#ifdef CONFIG_ZONE_DMA32 |
---|
| 3945 | + if (!zone) |
---|
| 3946 | + return alloc_flags; |
---|
| 3947 | + |
---|
| 3948 | + if (zone_idx(zone) != ZONE_NORMAL) |
---|
| 3949 | + return alloc_flags; |
---|
| 3950 | + |
---|
| 3951 | + /* |
---|
| 3952 | + * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and |
---|
| 3953 | + * the pointer is within zone->zone_pgdat->node_zones[]. Also assume |
---|
| 3954 | + * on UMA that if Normal is populated then so is DMA32. |
---|
| 3955 | + */ |
---|
| 3956 | + BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); |
---|
| 3957 | + if (nr_online_nodes > 1 && !populated_zone(--zone)) |
---|
| 3958 | + return alloc_flags; |
---|
| 3959 | + |
---|
| 3960 | + alloc_flags |= ALLOC_NOFRAGMENT; |
---|
| 3961 | +#endif /* CONFIG_ZONE_DMA32 */ |
---|
| 3962 | + return alloc_flags; |
---|
| 3963 | +} |
---|
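`alloc_flags_nofragment()` only sets `ALLOC_NOFRAGMENT` when allocating from a Normal zone whose neighbouring DMA32 zone is populated, and the `BUILD_BUG_ON()` documents that the `--zone` trick depends on the two zone indexes being adjacent. A user-space sketch of that decision, with a `_Static_assert` playing the role of the `BUILD_BUG_ON()` (zone names and sizes are illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy model: only when allocating from a Normal zone, and only when the
 * DMA32 zone right below it is populated, is it worth spreading
 * allocations to avoid fragmentation. */
enum zone_type { Z_DMA, Z_DMA32, Z_NORMAL, NR_ZONES };

/* Mirrors the BUILD_BUG_ON(): the "--zone" trick relies on adjacency. */
_Static_assert(Z_NORMAL - Z_DMA32 == 1, "DMA32 must sit right below Normal");

static bool want_nofragment(enum zone_type preferred,
                            const long populated[NR_ZONES])
{
        if (preferred != Z_NORMAL)
                return false;
        return populated[preferred - 1] > 0;   /* is DMA32 populated? */
}

int main(void)
{
        long pop[NR_ZONES] = { 4096, 262144, 1048576 };

        printf("%d\n", want_nofragment(Z_NORMAL, pop));  /* 1 */
        printf("%d\n", want_nofragment(Z_DMA32, pop));   /* 0 */
        return 0;
}
```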
| 3964 | + |
---|
| 3965 | +static inline unsigned int current_alloc_flags(gfp_t gfp_mask, |
---|
| 3966 | + unsigned int alloc_flags) |
---|
| 3967 | +{ |
---|
| 3968 | +#ifdef CONFIG_CMA |
---|
| 3969 | + unsigned int pflags = current->flags; |
---|
| 3970 | + |
---|
| 3971 | + if (!(pflags & PF_MEMALLOC_NOCMA) && |
---|
| 3972 | + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE && |
---|
| 3973 | + gfp_mask & __GFP_CMA) |
---|
| 3974 | + alloc_flags |= ALLOC_CMA; |
---|
| 3975 | + |
---|
| 3976 | +#endif |
---|
| 3977 | + return alloc_flags; |
---|
| 3978 | +} |
---|
| 3979 | + |
---|
| 3980 | +/* |
---|
3431 | 3981 | * get_page_from_freelist goes through the zonelist trying to allocate |
---|
3432 | 3982 | * a page. |
---|
3433 | 3983 | */ |
---|
.. | .. |
---|
3435 | 3985 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
---|
3436 | 3986 | const struct alloc_context *ac) |
---|
3437 | 3987 | { |
---|
3438 | | - struct zoneref *z = ac->preferred_zoneref; |
---|
| 3988 | + struct zoneref *z; |
---|
3439 | 3989 | struct zone *zone; |
---|
3440 | 3990 | struct pglist_data *last_pgdat_dirty_limit = NULL; |
---|
| 3991 | + bool no_fallback; |
---|
3441 | 3992 | |
---|
| 3993 | +retry: |
---|
3442 | 3994 | /* |
---|
3443 | 3995 | * Scan zonelist, looking for a zone with enough free. |
---|
3444 | 3996 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
---|
3445 | 3997 | */ |
---|
3446 | | - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
3447 | | - ac->nodemask) { |
---|
| 3998 | + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; |
---|
| 3999 | + z = ac->preferred_zoneref; |
---|
| 4000 | + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, |
---|
| 4001 | + ac->nodemask) { |
---|
3448 | 4002 | struct page *page; |
---|
3449 | 4003 | unsigned long mark; |
---|
3450 | 4004 | |
---|
.. | .. |
---|
3481 | 4035 | } |
---|
3482 | 4036 | } |
---|
3483 | 4037 | |
---|
3484 | | - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
---|
| 4038 | + if (no_fallback && nr_online_nodes > 1 && |
---|
| 4039 | + zone != ac->preferred_zoneref->zone) { |
---|
| 4040 | + int local_nid; |
---|
| 4041 | + |
---|
| 4042 | + /* |
---|
| 4043 | + * If moving to a remote node, retry but allow |
---|
| 4044 | + * fragmenting fallbacks. Locality is more important |
---|
| 4045 | + * than fragmentation avoidance. |
---|
| 4046 | + */ |
---|
| 4047 | + local_nid = zone_to_nid(ac->preferred_zoneref->zone); |
---|
| 4048 | + if (zone_to_nid(zone) != local_nid) { |
---|
| 4049 | + alloc_flags &= ~ALLOC_NOFRAGMENT; |
---|
| 4050 | + goto retry; |
---|
| 4051 | + } |
---|
| 4052 | + } |
---|
| 4053 | + |
---|
| 4054 | + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); |
---|
3485 | 4055 | if (!zone_watermark_fast(zone, order, mark, |
---|
3486 | | - ac_classzone_idx(ac), alloc_flags)) { |
---|
| 4056 | + ac->highest_zoneidx, alloc_flags, |
---|
| 4057 | + gfp_mask)) { |
---|
3487 | 4058 | int ret; |
---|
3488 | 4059 | |
---|
3489 | 4060 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
.. | .. |
---|
3516 | 4087 | default: |
---|
3517 | 4088 | /* did we reclaim enough */ |
---|
3518 | 4089 | if (zone_watermark_ok(zone, order, mark, |
---|
3519 | | - ac_classzone_idx(ac), alloc_flags)) |
---|
| 4090 | + ac->highest_zoneidx, alloc_flags)) |
---|
3520 | 4091 | goto try_this_zone; |
---|
3521 | 4092 | |
---|
3522 | 4093 | continue; |
---|
.. | .. |
---|
3548 | 4119 | } |
---|
3549 | 4120 | } |
---|
3550 | 4121 | |
---|
| 4122 | + /* |
---|
| 4123 | + * It's possible on a UMA machine to get through all zones that are |
---|
| 4124 | + * fragmented. If avoiding fragmentation, reset and try again. |
---|
| 4125 | + */ |
---|
| 4126 | + if (no_fallback) { |
---|
| 4127 | + alloc_flags &= ~ALLOC_NOFRAGMENT; |
---|
| 4128 | + goto retry; |
---|
| 4129 | + } |
---|
| 4130 | + |
---|
3551 | 4131 | return NULL; |
---|
3552 | | -} |
---|
3553 | | - |
---|
3554 | | -/* |
---|
3555 | | - * Large machines with many possible nodes should not always dump per-node |
---|
3556 | | - * meminfo in irq context. |
---|
3557 | | - */ |
---|
3558 | | -static inline bool should_suppress_show_mem(void) |
---|
3559 | | -{ |
---|
3560 | | - bool ret = false; |
---|
3561 | | - |
---|
3562 | | -#if NODES_SHIFT > 8 |
---|
3563 | | - ret = in_interrupt(); |
---|
3564 | | -#endif |
---|
3565 | | - return ret; |
---|
3566 | 4132 | } |
---|
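With `ALLOC_NOFRAGMENT`, `get_page_from_freelist()` now makes a first pass that refuses fragmenting fallbacks (and drops the restriction when it would have to leave the local node), and if that pass comes up empty it clears the flag and restarts the walk. A compact sketch of that two-pass retry shape (the zone behaviour below is faked purely for illustration):

```c
#include <stdbool.h>
#include <stdio.h>

#define ALLOC_NOFRAGMENT 0x1u   /* illustrative value */

/* Pretend only a fragmenting fallback in zone 1 can satisfy the request. */
static bool try_zone(int zone, bool allow_fragmenting_fallback)
{
        return zone == 1 && allow_fragmenting_fallback;
}

/* Toy model of the two-pass zonelist walk: the first pass refuses
 * fallbacks that would fragment pageblocks; if every zone is exhausted,
 * the flag is dropped and the walk restarts. */
static int get_page(unsigned int alloc_flags, int nr_zones)
{
retry:
        for (int z = 0; z < nr_zones; z++)
                if (try_zone(z, !(alloc_flags & ALLOC_NOFRAGMENT)))
                        return z;
        if (alloc_flags & ALLOC_NOFRAGMENT) {
                alloc_flags &= ~ALLOC_NOFRAGMENT;   /* relax and try again */
                goto retry;
        }
        return -1;
}

int main(void)
{
        printf("allocated from zone %d\n", get_page(ALLOC_NOFRAGMENT, 2));
        return 0;
}
```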
3567 | 4133 | |
---|
3568 | 4134 | static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) |
---|
3569 | 4135 | { |
---|
3570 | 4136 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
---|
3571 | | - static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); |
---|
3572 | | - |
---|
3573 | | - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) |
---|
3574 | | - return; |
---|
3575 | 4137 | |
---|
3576 | 4138 | /* |
---|
3577 | 4139 | * This documents exceptions given to allocations in certain |
---|
.. | .. |
---|
3592 | 4154 | { |
---|
3593 | 4155 | struct va_format vaf; |
---|
3594 | 4156 | va_list args; |
---|
3595 | | - static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, |
---|
3596 | | - DEFAULT_RATELIMIT_BURST); |
---|
| 4157 | + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); |
---|
3597 | 4158 | |
---|
3598 | | - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
---|
| 4159 | + if ((gfp_mask & __GFP_NOWARN) || |
---|
| 4160 | + !__ratelimit(&nopage_rs) || |
---|
| 4161 | + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) |
---|
3599 | 4162 | return; |
---|
3600 | 4163 | |
---|
3601 | 4164 | va_start(args, fmt); |
---|
3602 | 4165 | vaf.fmt = fmt; |
---|
3603 | 4166 | vaf.va = &args; |
---|
3604 | | - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", |
---|
| 4167 | + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", |
---|
3605 | 4168 | current->comm, &vaf, gfp_mask, &gfp_mask, |
---|
3606 | 4169 | nodemask_pr_args(nodemask)); |
---|
3607 | 4170 | va_end(args); |
---|
3608 | 4171 | |
---|
3609 | 4172 | cpuset_print_current_mems_allowed(); |
---|
3610 | | - |
---|
| 4173 | + pr_cont("\n"); |
---|
3611 | 4174 | dump_stack(); |
---|
3612 | 4175 | warn_alloc_show_mem(gfp_mask, nodemask); |
---|
3613 | 4176 | } |
---|
.. | .. |
---|
3681 | 4244 | * success so it is time to admit defeat. We will skip the OOM killer |
---|
3682 | 4245 | * because it is very likely that the caller has a more reasonable |
---|
3683 | 4246 | * fallback than shooting a random task. |
---|
| 4247 | + * |
---|
| 4248 | + * The OOM killer may not free memory on a specific node. |
---|
3684 | 4249 | */ |
---|
3685 | | - if (gfp_mask & __GFP_RETRY_MAYFAIL) |
---|
| 4250 | + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) |
---|
3686 | 4251 | goto out; |
---|
3687 | 4252 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
---|
3688 | | - if (ac->high_zoneidx < ZONE_NORMAL) |
---|
| 4253 | + if (ac->highest_zoneidx < ZONE_NORMAL) |
---|
3689 | 4254 | goto out; |
---|
3690 | 4255 | if (pm_suspended_storage()) |
---|
3691 | 4256 | goto out; |
---|
.. | .. |
---|
3698 | 4263 | * out_of_memory). Once filesystems are ready to handle allocation |
---|
3699 | 4264 | * failures more gracefully we should just bail out here. |
---|
3700 | 4265 | */ |
---|
3701 | | - |
---|
3702 | | - /* The OOM killer may not free memory on a specific node */ |
---|
3703 | | - if (gfp_mask & __GFP_THISNODE) |
---|
3704 | | - goto out; |
---|
3705 | 4266 | |
---|
3706 | 4267 | /* Exhausted what can be done so it's blame time */ |
---|
3707 | 4268 | if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { |
---|
.. | .. |
---|
3733 | 4294 | unsigned int alloc_flags, const struct alloc_context *ac, |
---|
3734 | 4295 | enum compact_priority prio, enum compact_result *compact_result) |
---|
3735 | 4296 | { |
---|
3736 | | - struct page *page; |
---|
| 4297 | + struct page *page = NULL; |
---|
3737 | 4298 | unsigned long pflags; |
---|
3738 | 4299 | unsigned int noreclaim_flag; |
---|
3739 | 4300 | |
---|
.. | .. |
---|
3744 | 4305 | noreclaim_flag = memalloc_noreclaim_save(); |
---|
3745 | 4306 | |
---|
3746 | 4307 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
---|
3747 | | - prio); |
---|
| 4308 | + prio, &page); |
---|
3748 | 4309 | |
---|
3749 | 4310 | memalloc_noreclaim_restore(noreclaim_flag); |
---|
3750 | 4311 | psi_memstall_leave(&pflags); |
---|
3751 | | - |
---|
3752 | | - if (*compact_result <= COMPACT_INACTIVE) |
---|
3753 | | - return NULL; |
---|
3754 | 4312 | |
---|
3755 | 4313 | /* |
---|
3756 | 4314 | * At least in one zone compaction wasn't deferred or skipped, so let's |
---|
.. | .. |
---|
3758 | 4316 | */ |
---|
3759 | 4317 | count_vm_event(COMPACTSTALL); |
---|
3760 | 4318 | |
---|
3761 | | - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
| 4319 | + /* Prep a captured page if available */ |
---|
| 4320 | + if (page) |
---|
| 4321 | + prep_new_page(page, order, gfp_mask, alloc_flags); |
---|
| 4322 | + |
---|
| 4323 | + /* Try get a page from the freelist if available */ |
---|
| 4324 | + if (!page) |
---|
| 4325 | + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
3762 | 4326 | |
---|
3763 | 4327 | if (page) { |
---|
3764 | 4328 | struct zone *zone = page_zone(page); |
---|
.. | .. |
---|
3807 | 4371 | goto check_priority; |
---|
3808 | 4372 | |
---|
3809 | 4373 | /* |
---|
3810 | | - * make sure the compaction wasn't deferred or didn't bail out early |
---|
3811 | | - * due to locks contention before we declare that we should give up. |
---|
3812 | | - * But do not retry if the given zonelist is not suitable for |
---|
3813 | | - * compaction. |
---|
| 4374 | + * compaction was skipped because there are not enough order-0 pages |
---|
| 4375 | + * to work with, so we retry only if it looks like reclaim can help. |
---|
3814 | 4376 | */ |
---|
3815 | | - if (compaction_withdrawn(compact_result)) { |
---|
| 4377 | + if (compaction_needs_reclaim(compact_result)) { |
---|
3816 | 4378 | ret = compaction_zonelist_suitable(ac, order, alloc_flags); |
---|
3817 | 4379 | goto out; |
---|
| 4380 | + } |
---|
| 4381 | + |
---|
| 4382 | + /* |
---|
| 4383 | + * make sure the compaction wasn't deferred or didn't bail out early |
---|
| 4384 | + * due to locks contention before we declare that we should give up. |
---|
| 4385 | + * But the next retry should use a higher priority if allowed, so |
---|
| 4386 | + * we don't just keep bailing out endlessly. |
---|
| 4387 | + */ |
---|
| 4388 | + if (compaction_withdrawn(compact_result)) { |
---|
| 4389 | + goto check_priority; |
---|
3818 | 4390 | } |
---|
3819 | 4391 | |
---|
3820 | 4392 | /* |
---|
.. | .. |
---|
3877 | 4449 | * Let's give them a good hope and keep retrying while the order-0 |
---|
3878 | 4450 | * watermarks are OK. |
---|
3879 | 4451 | */ |
---|
3880 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
3881 | | - ac->nodemask) { |
---|
| 4452 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
---|
| 4453 | + ac->highest_zoneidx, ac->nodemask) { |
---|
3882 | 4454 | if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), |
---|
3883 | | - ac_classzone_idx(ac), alloc_flags)) |
---|
| 4455 | + ac->highest_zoneidx, alloc_flags)) |
---|
3884 | 4456 | return true; |
---|
3885 | 4457 | } |
---|
3886 | 4458 | return false; |
---|
.. | .. |
---|
3938 | 4510 | EXPORT_SYMBOL_GPL(fs_reclaim_release); |
---|
3939 | 4511 | #endif |
---|
3940 | 4512 | |
---|
| 4513 | +/* |
---|
| 4514 | + * Zonelists may change due to hotplug during allocation. Detect when zonelists |
---|
| 4515 | + * have been rebuilt so allocation retries. Reader side does not lock and |
---|
| 4516 | + * retries the allocation if zonelist changes. Writer side is protected by the |
---|
| 4517 | + * embedded spin_lock. |
---|
| 4518 | + */ |
---|
| 4519 | +static DEFINE_SEQLOCK(zonelist_update_seq); |
---|
| 4520 | + |
---|
| 4521 | +static unsigned int zonelist_iter_begin(void) |
---|
| 4522 | +{ |
---|
| 4523 | + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
---|
| 4524 | + return read_seqbegin(&zonelist_update_seq); |
---|
| 4525 | + |
---|
| 4526 | + return 0; |
---|
| 4527 | +} |
---|
| 4528 | + |
---|
| 4529 | +static unsigned int check_retry_zonelist(unsigned int seq) |
---|
| 4530 | +{ |
---|
| 4531 | + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
---|
| 4532 | + return read_seqretry(&zonelist_update_seq, seq); |
---|
| 4533 | + |
---|
| 4534 | + return seq; |
---|
| 4535 | +} |
---|
| 4536 | + |
---|
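`zonelist_iter_begin()`/`check_retry_zonelist()` follow the usual seqlock reader pattern: sample a sequence count before the lockless zonelist walk and redo the allocation if a concurrent rebuild bumped it. A simplified user-space model with a bare atomic counter (the real seqlock also uses odd/even states and read barriers):

```c
#include <stdatomic.h>
#include <stdio.h>

/* Toy model of the seqlock-style retry: readers sample a sequence
 * counter, do their lockless work, then check whether a writer
 * (a zonelist rebuild) changed the counter in the meantime. */
static atomic_uint zonelist_seq;

static unsigned int iter_begin(void)
{
        return atomic_load(&zonelist_seq);
}

static int iter_retry(unsigned int seq)
{
        return atomic_load(&zonelist_seq) != seq;
}

static void zonelist_rebuild(void)        /* writer side, normally locked */
{
        atomic_fetch_add(&zonelist_seq, 1);
}

int main(void)
{
        unsigned int seq = iter_begin();
        /* ... a lockless allocation attempt would happen here ... */
        zonelist_rebuild();               /* hotplug rebuilt the zonelists */
        if (iter_retry(seq))
                printf("zonelist changed, restart the allocation\n");
        return 0;
}
```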
3941 | 4537 | /* Perform direct synchronous page reclaim */ |
---|
3942 | | -static int |
---|
| 4538 | +static unsigned long |
---|
3943 | 4539 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
---|
3944 | 4540 | const struct alloc_context *ac) |
---|
3945 | 4541 | { |
---|
3946 | | - struct reclaim_state reclaim_state; |
---|
3947 | | - int progress; |
---|
3948 | 4542 | unsigned int noreclaim_flag; |
---|
3949 | | - unsigned long pflags; |
---|
| 4543 | + unsigned long progress; |
---|
3950 | 4544 | |
---|
3951 | 4545 | cond_resched(); |
---|
3952 | 4546 | |
---|
3953 | 4547 | /* We now go into synchronous reclaim */ |
---|
3954 | 4548 | cpuset_memory_pressure_bump(); |
---|
3955 | | - psi_memstall_enter(&pflags); |
---|
3956 | 4549 | fs_reclaim_acquire(gfp_mask); |
---|
3957 | 4550 | noreclaim_flag = memalloc_noreclaim_save(); |
---|
3958 | | - reclaim_state.reclaimed_slab = 0; |
---|
3959 | | - current->reclaim_state = &reclaim_state; |
---|
3960 | 4551 | |
---|
3961 | 4552 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
---|
3962 | 4553 | ac->nodemask); |
---|
3963 | 4554 | |
---|
3964 | | - current->reclaim_state = NULL; |
---|
3965 | 4555 | memalloc_noreclaim_restore(noreclaim_flag); |
---|
3966 | 4556 | fs_reclaim_release(gfp_mask); |
---|
3967 | | - psi_memstall_leave(&pflags); |
---|
3968 | 4557 | |
---|
3969 | 4558 | cond_resched(); |
---|
3970 | 4559 | |
---|
.. | .. |
---|
3978 | 4567 | unsigned long *did_some_progress) |
---|
3979 | 4568 | { |
---|
3980 | 4569 | struct page *page = NULL; |
---|
| 4570 | + unsigned long pflags; |
---|
3981 | 4571 | bool drained = false; |
---|
| 4572 | + bool skip_pcp_drain = false; |
---|
3982 | 4573 | |
---|
| 4574 | + psi_memstall_enter(&pflags); |
---|
3983 | 4575 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
---|
3984 | 4576 | if (unlikely(!(*did_some_progress))) |
---|
3985 | | - return NULL; |
---|
| 4577 | + goto out; |
---|
3986 | 4578 | |
---|
3987 | 4579 | retry: |
---|
3988 | 4580 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
.. | .. |
---|
3990 | 4582 | /* |
---|
3991 | 4583 | * If an allocation failed after direct reclaim, it could be because |
---|
3992 | 4584 | * pages are pinned on the per-cpu lists or in high alloc reserves. |
---|
3993 | | - * Shrink them them and try again |
---|
| 4585 | + * Shrink them and try again |
---|
3994 | 4586 | */ |
---|
3995 | 4587 | if (!page && !drained) { |
---|
3996 | 4588 | unreserve_highatomic_pageblock(ac, false); |
---|
3997 | | - drain_all_pages(NULL); |
---|
| 4589 | + trace_android_vh_drain_all_pages_bypass(gfp_mask, order, |
---|
| 4590 | + alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain); |
---|
| 4591 | + if (!skip_pcp_drain) |
---|
| 4592 | + drain_all_pages(NULL); |
---|
3998 | 4593 | drained = true; |
---|
3999 | 4594 | goto retry; |
---|
4000 | 4595 | } |
---|
| 4596 | +out: |
---|
| 4597 | + psi_memstall_leave(&pflags); |
---|
4001 | 4598 | |
---|
4002 | 4599 | return page; |
---|
4003 | 4600 | } |
---|
.. | .. |
---|
4008 | 4605 | struct zoneref *z; |
---|
4009 | 4606 | struct zone *zone; |
---|
4010 | 4607 | pg_data_t *last_pgdat = NULL; |
---|
4011 | | - enum zone_type high_zoneidx = ac->high_zoneidx; |
---|
| 4608 | + enum zone_type highest_zoneidx = ac->highest_zoneidx; |
---|
4012 | 4609 | |
---|
4013 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, |
---|
| 4610 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, |
---|
4014 | 4611 | ac->nodemask) { |
---|
4015 | 4612 | if (last_pgdat != zone->zone_pgdat) |
---|
4016 | | - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); |
---|
| 4613 | + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); |
---|
4017 | 4614 | last_pgdat = zone->zone_pgdat; |
---|
4018 | 4615 | } |
---|
4019 | 4616 | } |
---|
.. | .. |
---|
4023 | 4620 | { |
---|
4024 | 4621 | unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
---|
4025 | 4622 | |
---|
4026 | | - /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
---|
| 4623 | + /* |
---|
| 4624 | + * __GFP_HIGH is assumed to be the same as ALLOC_HIGH |
---|
| 4625 | + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
---|
| 4626 | + * to save two branches. |
---|
| 4627 | + */ |
---|
4027 | 4628 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
---|
| 4629 | + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); |
---|
4028 | 4630 | |
---|
4029 | 4631 | /* |
---|
4030 | 4632 | * The caller may dip into page reserves a bit more if the caller |
---|
.. | .. |
---|
4032 | 4634 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
---|
4033 | 4635 | * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). |
---|
4034 | 4636 | */ |
---|
4035 | | - alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
---|
| 4637 | + alloc_flags |= (__force int) |
---|
| 4638 | + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); |
---|
4036 | 4639 | |
---|
4037 | 4640 | if (gfp_mask & __GFP_ATOMIC) { |
---|
4038 | 4641 | /* |
---|
.. | .. |
---|
4049 | 4652 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
---|
4050 | 4653 | alloc_flags |= ALLOC_HARDER; |
---|
4051 | 4654 | |
---|
4052 | | -#ifdef CONFIG_CMA |
---|
4053 | | - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
---|
4054 | | - alloc_flags |= ALLOC_CMA; |
---|
4055 | | -#endif |
---|
| 4655 | + alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); |
---|
| 4656 | + |
---|
4056 | 4657 | return alloc_flags; |
---|
4057 | 4658 | } |
---|
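`gfp_to_alloc_flags()` leans on `__GFP_HIGH` and `__GFP_KSWAPD_RECLAIM` sharing their bit values with `ALLOC_HIGH` and `ALLOC_KSWAPD`, so the translation is a single mask-and-copy and the `BUILD_BUG_ON()`s guard that assumption. A sketch of the same trick with made-up bit values:

```c
#include <stdio.h>

/* Toy model of the flag-aliasing trick: two flag namespaces deliberately
 * share bit values, so translating between them is a mask-and-copy rather
 * than two branches. Values here are made up, not the kernel's. */
#define GFP_HIGH            0x20u
#define GFP_KSWAPD_RECLAIM  0x400u
#define ALLOC_HIGH          0x20u
#define ALLOC_KSWAPD        0x400u

_Static_assert(GFP_HIGH == ALLOC_HIGH, "bit values must match");
_Static_assert(GFP_KSWAPD_RECLAIM == ALLOC_KSWAPD, "bit values must match");

static unsigned int gfp_to_alloc_flags(unsigned int gfp_mask)
{
        /* Branch-free: both interesting bits are copied in one go. */
        return gfp_mask & (GFP_HIGH | GFP_KSWAPD_RECLAIM);
}

int main(void)
{
        printf("%#x\n", gfp_to_alloc_flags(GFP_HIGH | GFP_KSWAPD_RECLAIM));
        return 0;
}
```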
4058 | 4659 | |
---|
.. | .. |
---|
4115 | 4716 | { |
---|
4116 | 4717 | struct zone *zone; |
---|
4117 | 4718 | struct zoneref *z; |
---|
| 4719 | + bool ret = false; |
---|
4118 | 4720 | |
---|
4119 | 4721 | /* |
---|
4120 | 4722 | * Costly allocations might have made a progress but this doesn't mean |
---|
.. | .. |
---|
4141 | 4743 | * request even if all reclaimable pages are considered then we are |
---|
4142 | 4744 | * screwed and have to go OOM. |
---|
4143 | 4745 | */ |
---|
4144 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
4145 | | - ac->nodemask) { |
---|
| 4746 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
---|
| 4747 | + ac->highest_zoneidx, ac->nodemask) { |
---|
4146 | 4748 | unsigned long available; |
---|
4147 | 4749 | unsigned long reclaimable; |
---|
4148 | 4750 | unsigned long min_wmark = min_wmark_pages(zone); |
---|
.. | .. |
---|
4156 | 4758 | * reclaimable pages? |
---|
4157 | 4759 | */ |
---|
4158 | 4760 | wmark = __zone_watermark_ok(zone, order, min_wmark, |
---|
4159 | | - ac_classzone_idx(ac), alloc_flags, available); |
---|
| 4761 | + ac->highest_zoneidx, alloc_flags, available); |
---|
4160 | 4762 | trace_reclaim_retry_zone(z, order, reclaimable, |
---|
4161 | 4763 | available, min_wmark, *no_progress_loops, wmark); |
---|
4162 | 4764 | if (wmark) { |
---|
.. | .. |
---|
4178 | 4780 | } |
---|
4179 | 4781 | } |
---|
4180 | 4782 | |
---|
4181 | | - /* |
---|
4182 | | - * Memory allocation/reclaim might be called from a WQ |
---|
4183 | | - * context and the current implementation of the WQ |
---|
4184 | | - * concurrency control doesn't recognize that |
---|
4185 | | - * a particular WQ is congested if the worker thread is |
---|
4186 | | - * looping without ever sleeping. Therefore we have to |
---|
4187 | | - * do a short sleep here rather than calling |
---|
4188 | | - * cond_resched(). |
---|
4189 | | - */ |
---|
4190 | | - if (current->flags & PF_WQ_WORKER) |
---|
4191 | | - schedule_timeout_uninterruptible(1); |
---|
4192 | | - else |
---|
4193 | | - cond_resched(); |
---|
4194 | | - |
---|
4195 | | - return true; |
---|
| 4783 | + ret = true; |
---|
| 4784 | + goto out; |
---|
4196 | 4785 | } |
---|
4197 | 4786 | } |
---|
4198 | 4787 | |
---|
4199 | | - return false; |
---|
| 4788 | +out: |
---|
| 4789 | + /* |
---|
| 4790 | + * Memory allocation/reclaim might be called from a WQ context and the |
---|
| 4791 | + * current implementation of the WQ concurrency control doesn't |
---|
| 4792 | + * recognize that a particular WQ is congested if the worker thread is |
---|
| 4793 | + * looping without ever sleeping. Therefore we have to do a short sleep |
---|
| 4794 | + * here rather than calling cond_resched(). |
---|
| 4795 | + */ |
---|
| 4796 | + if (current->flags & PF_WQ_WORKER) |
---|
| 4797 | + schedule_timeout_uninterruptible(1); |
---|
| 4798 | + else |
---|
| 4799 | + cond_resched(); |
---|
| 4800 | + return ret; |
---|
4200 | 4801 | } |
---|
4201 | 4802 | |
---|
4202 | 4803 | static inline bool |
---|
.. | .. |
---|
4246 | 4847 | int compaction_retries; |
---|
4247 | 4848 | int no_progress_loops; |
---|
4248 | 4849 | unsigned int cpuset_mems_cookie; |
---|
| 4850 | + unsigned int zonelist_iter_cookie; |
---|
4249 | 4851 | int reserve_flags; |
---|
| 4852 | + unsigned long vh_record; |
---|
| 4853 | + bool should_alloc_retry = false; |
---|
4250 | 4854 | |
---|
| 4855 | + trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record); |
---|
4251 | 4856 | /* |
---|
4252 | 4857 | * We also sanity check to catch abuse of atomic reserves being used by |
---|
4253 | 4858 | * callers that are not in atomic context. |
---|
.. | .. |
---|
4256 | 4861 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) |
---|
4257 | 4862 | gfp_mask &= ~__GFP_ATOMIC; |
---|
4258 | 4863 | |
---|
4259 | | -retry_cpuset: |
---|
| 4864 | +restart: |
---|
4260 | 4865 | compaction_retries = 0; |
---|
4261 | 4866 | no_progress_loops = 0; |
---|
4262 | 4867 | compact_priority = DEF_COMPACT_PRIORITY; |
---|
4263 | 4868 | cpuset_mems_cookie = read_mems_allowed_begin(); |
---|
| 4869 | + zonelist_iter_cookie = zonelist_iter_begin(); |
---|
4264 | 4870 | |
---|
4265 | 4871 | /* |
---|
4266 | 4872 | * The fast path uses conservative alloc_flags to succeed only until |
---|
.. | .. |
---|
4276 | 4882 | * could end up iterating over non-eligible zones endlessly. |
---|
4277 | 4883 | */ |
---|
4278 | 4884 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4279 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 4885 | + ac->highest_zoneidx, ac->nodemask); |
---|
4280 | 4886 | if (!ac->preferred_zoneref->zone) |
---|
4281 | 4887 | goto nopage; |
---|
4282 | 4888 | |
---|
4283 | | - if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
---|
| 4889 | + if (alloc_flags & ALLOC_KSWAPD) |
---|
4284 | 4890 | wake_all_kswapds(order, gfp_mask, ac); |
---|
4285 | 4891 | |
---|
4286 | 4892 | /* |
---|
.. | .. |
---|
4313 | 4919 | |
---|
4314 | 4920 | /* |
---|
4315 | 4921 | * Checks for costly allocations with __GFP_NORETRY, which |
---|
4316 | | - * includes THP page fault allocations |
---|
| 4922 | + * includes some THP page fault allocations |
---|
4317 | 4923 | */ |
---|
4318 | 4924 | if (costly_order && (gfp_mask & __GFP_NORETRY)) { |
---|
4319 | 4925 | /* |
---|
4320 | | - * If compaction is deferred for high-order allocations, |
---|
4321 | | - * it is because sync compaction recently failed. If |
---|
4322 | | - * this is the case and the caller requested a THP |
---|
4323 | | - * allocation, we do not want to heavily disrupt the |
---|
4324 | | - * system, so we fail the allocation instead of entering |
---|
4325 | | - * direct reclaim. |
---|
| 4926 | + * If allocating entire pageblock(s) and compaction |
---|
| 4927 | + * failed because all zones are below low watermarks |
---|
| 4928 | + * or is prohibited because it recently failed at this |
---|
| 4929 | + * order, fail immediately unless the allocator has |
---|
| 4930 | + * requested compaction and reclaim retry. |
---|
| 4931 | + * |
---|
| 4932 | + * Reclaim is |
---|
| 4933 | + * - potentially very expensive because zones are far |
---|
| 4934 | + * below their low watermarks or this is part of very |
---|
| 4935 | + * bursty high order allocations, |
---|
| 4936 | + * - not guaranteed to help because isolate_freepages() |
---|
| 4937 | + * may not iterate over freed pages as part of its |
---|
| 4938 | + * linear scan, and |
---|
| 4939 | + * - unlikely to make entire pageblocks free on its |
---|
| 4940 | + * own. |
---|
4326 | 4941 | */ |
---|
4327 | | - if (compact_result == COMPACT_DEFERRED) |
---|
| 4942 | + if (compact_result == COMPACT_SKIPPED || |
---|
| 4943 | + compact_result == COMPACT_DEFERRED) |
---|
4328 | 4944 | goto nopage; |
---|
4329 | 4945 | |
---|
4330 | 4946 | /* |
---|
.. | .. |
---|
4338 | 4954 | |
---|
4339 | 4955 | retry: |
---|
4340 | 4956 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ |
---|
4341 | | - if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
---|
| 4957 | + if (alloc_flags & ALLOC_KSWAPD) |
---|
4342 | 4958 | wake_all_kswapds(order, gfp_mask, ac); |
---|
4343 | 4959 | |
---|
4344 | 4960 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); |
---|
4345 | 4961 | if (reserve_flags) |
---|
4346 | | - alloc_flags = reserve_flags; |
---|
| 4962 | + alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); |
---|
4347 | 4963 | |
---|
4348 | 4964 | /* |
---|
4349 | 4965 | * Reset the nodemask and zonelist iterators if memory policies can be |
---|
.. | .. |
---|
4353 | 4969 | if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { |
---|
4354 | 4970 | ac->nodemask = NULL; |
---|
4355 | 4971 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4356 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 4972 | + ac->highest_zoneidx, ac->nodemask); |
---|
4357 | 4973 | } |
---|
4358 | 4974 | |
---|
4359 | 4975 | /* Attempt with potentially adjusted zonelist and alloc_flags */ |
---|
.. | .. |
---|
4368 | 4984 | /* Avoid recursion of direct reclaim */ |
---|
4369 | 4985 | if (current->flags & PF_MEMALLOC) |
---|
4370 | 4986 | goto nopage; |
---|
| 4987 | + |
---|
| 4988 | + trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order, |
---|
| 4989 | + alloc_flags, ac->migratetype, &page); |
---|
| 4990 | + |
---|
| 4991 | + if (page) |
---|
| 4992 | + goto got_pg; |
---|
| 4993 | + |
---|
| 4994 | + trace_android_vh_should_alloc_pages_retry(gfp_mask, order, |
---|
| 4995 | + &alloc_flags, ac->migratetype, ac->preferred_zoneref->zone, |
---|
| 4996 | + &page, &should_alloc_retry); |
---|
| 4997 | + if (should_alloc_retry) |
---|
| 4998 | + goto retry; |
---|
4371 | 4999 | |
---|
4372 | 5000 | /* Try direct reclaim and then allocating */ |
---|
4373 | 5001 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
---|
.. | .. |
---|
4409 | 5037 | goto retry; |
---|
4410 | 5038 | |
---|
4411 | 5039 | |
---|
4412 | | - /* Deal with possible cpuset update races before we start OOM killing */ |
---|
4413 | | - if (check_retry_cpuset(cpuset_mems_cookie, ac)) |
---|
4414 | | - goto retry_cpuset; |
---|
| 5040 | + /* |
---|
| 5041 | + * Deal with possible cpuset update races or zonelist updates to avoid |
---|
| 5042 | + * a unnecessary OOM kill. |
---|
| 5043 | + */ |
---|
| 5044 | + if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
---|
| 5045 | + check_retry_zonelist(zonelist_iter_cookie)) |
---|
| 5046 | + goto restart; |
---|
4415 | 5047 | |
---|
4416 | 5048 | /* Reclaim has failed us, start killing things */ |
---|
4417 | 5049 | page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); |
---|
.. | .. |
---|
4420 | 5052 | |
---|
4421 | 5053 | /* Avoid allocations with no watermarks from looping endlessly */ |
---|
4422 | 5054 | if (tsk_is_oom_victim(current) && |
---|
4423 | | - (alloc_flags == ALLOC_OOM || |
---|
| 5055 | + (alloc_flags & ALLOC_OOM || |
---|
4424 | 5056 | (gfp_mask & __GFP_NOMEMALLOC))) |
---|
4425 | 5057 | goto nopage; |
---|
4426 | 5058 | |
---|
.. | .. |
---|
4431 | 5063 | } |
---|
4432 | 5064 | |
---|
4433 | 5065 | nopage: |
---|
4434 | | - /* Deal with possible cpuset update races before we fail */ |
---|
4435 | | - if (check_retry_cpuset(cpuset_mems_cookie, ac)) |
---|
4436 | | - goto retry_cpuset; |
---|
| 5066 | + /* |
---|
| 5067 | + * Deal with possible cpuset update races or zonelist updates to avoid |
---|
| 5068 | + * an unnecessary OOM kill. |
---|
| 5069 | + */ |
---|
| 5070 | + if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
---|
| 5071 | + check_retry_zonelist(zonelist_iter_cookie)) |
---|
| 5072 | + goto restart; |
---|
4437 | 5073 | |
---|
4438 | 5074 | /* |
---|
4439 | 5075 | * Make sure that __GFP_NOFAIL request doesn't leak out and make sure |
---|
.. | .. |
---|
4476 | 5112 | goto retry; |
---|
4477 | 5113 | } |
---|
4478 | 5114 | fail: |
---|
| 5115 | + trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order, |
---|
| 5116 | + alloc_flags, ac->migratetype, &page); |
---|
| 5117 | + if (page) |
---|
| 5118 | + goto got_pg; |
---|
| 5119 | + |
---|
4479 | 5120 | warn_alloc(gfp_mask, ac->nodemask, |
---|
4480 | 5121 | "page allocation failure: order:%u", order); |
---|
4481 | 5122 | got_pg: |
---|
| 5123 | + trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record); |
---|
4482 | 5124 | return page; |
---|
4483 | 5125 | } |
---|
4484 | 5126 | |
---|
.. | .. |
---|
4487 | 5129 | struct alloc_context *ac, gfp_t *alloc_mask, |
---|
4488 | 5130 | unsigned int *alloc_flags) |
---|
4489 | 5131 | { |
---|
4490 | | - ac->high_zoneidx = gfp_zone(gfp_mask); |
---|
| 5132 | + ac->highest_zoneidx = gfp_zone(gfp_mask); |
---|
4491 | 5133 | ac->zonelist = node_zonelist(preferred_nid, gfp_mask); |
---|
4492 | 5134 | ac->nodemask = nodemask; |
---|
4493 | | - ac->migratetype = gfpflags_to_migratetype(gfp_mask); |
---|
| 5135 | + ac->migratetype = gfp_migratetype(gfp_mask); |
---|
4494 | 5136 | |
---|
4495 | 5137 | if (cpusets_enabled()) { |
---|
4496 | 5138 | *alloc_mask |= __GFP_HARDWALL; |
---|
4497 | | - if (!ac->nodemask) |
---|
| 5139 | + /* |
---|
| 5140 | + * When we are in the interrupt context, it is irrelevant |
---|
| 5141 | + * to the current task context. It means that any node ok. |
---|
| 5142 | + */ |
---|
| 5143 | + if (!in_interrupt() && !ac->nodemask) |
---|
4498 | 5144 | ac->nodemask = &cpuset_current_mems_allowed; |
---|
4499 | 5145 | else |
---|
4500 | 5146 | *alloc_flags |= ALLOC_CPUSET; |
---|
.. | .. |
---|
4508 | 5154 | if (should_fail_alloc_page(gfp_mask, order)) |
---|
4509 | 5155 | return false; |
---|
4510 | 5156 | |
---|
4511 | | - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) |
---|
4512 | | - *alloc_flags |= ALLOC_CMA; |
---|
| 5157 | + *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); |
---|
4513 | 5158 | |
---|
4514 | | - return true; |
---|
4515 | | -} |
---|
4516 | | - |
---|
4517 | | -/* Determine whether to spread dirty pages and what the first usable zone */ |
---|
4518 | | -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) |
---|
4519 | | -{ |
---|
4520 | 5159 | /* Dirty zone balancing only done in the fast path */ |
---|
4521 | 5160 | ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); |
---|
4522 | 5161 | |
---|
.. | .. |
---|
4526 | 5165 | * may get reset for allocations that ignore memory policies. |
---|
4527 | 5166 | */ |
---|
4528 | 5167 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4529 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 5168 | + ac->highest_zoneidx, ac->nodemask); |
---|
| 5169 | + |
---|
| 5170 | + return true; |
---|
4530 | 5171 | } |
---|
4531 | 5172 | |
---|
4532 | 5173 | /* |
---|
.. | .. |
---|
4555 | 5196 | if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) |
---|
4556 | 5197 | return NULL; |
---|
4557 | 5198 | |
---|
4558 | | - finalise_ac(gfp_mask, &ac); |
---|
| 5199 | + /* |
---|
| 5200 | + * Forbid the first pass from falling back to types that fragment |
---|
| 5201 | + * memory until all local zones are considered. |
---|
| 5202 | + */ |
---|
| 5203 | + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); |
---|
4559 | 5204 | |
---|
4560 | 5205 | /* First allocation attempt */ |
---|
4561 | 5206 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
---|
.. | .. |
---|
4575 | 5220 | * Restore the original nodemask if it was potentially replaced with |
---|
4576 | 5221 | * &cpuset_current_mems_allowed to optimize the fast-path attempt. |
---|
4577 | 5222 | */ |
---|
4578 | | - if (unlikely(ac.nodemask != nodemask)) |
---|
4579 | | - ac.nodemask = nodemask; |
---|
| 5223 | + ac.nodemask = nodemask; |
---|
4580 | 5224 | |
---|
4581 | 5225 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
---|
4582 | 5226 | |
---|
4583 | 5227 | out: |
---|
4584 | 5228 | if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && |
---|
4585 | | - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { |
---|
| 5229 | + unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { |
---|
4586 | 5230 | __free_pages(page, order); |
---|
4587 | 5231 | page = NULL; |
---|
4588 | 5232 | } |
---|
.. | .. |
---|
4620 | 5264 | if (order == 0) /* Via pcp? */ |
---|
4621 | 5265 | free_unref_page(page); |
---|
4622 | 5266 | else |
---|
4623 | | - __free_pages_ok(page, order); |
---|
| 5267 | + __free_pages_ok(page, order, FPI_NONE); |
---|
4624 | 5268 | } |
---|
4625 | 5269 | |
---|
4626 | 5270 | void __free_pages(struct page *page, unsigned int order) |
---|
4627 | 5271 | { |
---|
| 5272 | + /* get PageHead before we drop reference */ |
---|
| 5273 | + int head = PageHead(page); |
---|
| 5274 | + |
---|
| 5275 | + trace_android_vh_free_pages(page, order); |
---|
4628 | 5276 | if (put_page_testzero(page)) |
---|
4629 | 5277 | free_the_page(page, order); |
---|
| 5278 | + else if (!head) |
---|
| 5279 | + while (order-- > 0) |
---|
| 5280 | + free_the_page(page + (1 << order), order); |
---|
4630 | 5281 | } |
---|
4631 | 5282 | EXPORT_SYMBOL(__free_pages); |
---|
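The new tail-freeing branch in `__free_pages()` handles a non-compound high-order block whose first page stays pinned by another reference: the caller still owns pages 1..2^order-1 and returns them as progressively smaller blocks. A small sketch of the `page + (1 << order)` arithmetic that the loop performs (pfn values are illustrative):

```c
#include <stdio.h>

/* Toy model of the tail-freeing loop: when only page 0 of a
 * non-compound higher-order block stays pinned, the remaining
 * 2^order - 1 pages are handed back as successively smaller blocks. */
static void free_block(unsigned long pfn, unsigned int order)
{
        printf("free %u page(s) starting at pfn %lu (order %u)\n",
               1u << order, pfn, order);
}

int main(void)
{
        unsigned long pfn = 1000;       /* first page of the original block */
        unsigned int order = 3;         /* 8 pages total */

        /* Mirrors: while (order-- > 0) free_the_page(page + (1 << order), order); */
        while (order-- > 0)
                free_block(pfn + (1u << order), order);
        /* Frees pfn 1004..1007, then 1002..1003, then 1001: everything
         * except the still-pinned pfn 1000. */
        return 0;
}
```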
4632 | 5283 | |
---|
.. | .. |
---|
4731 | 5382 | /* reset page count bias and offset to start of new frag */ |
---|
4732 | 5383 | nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; |
---|
4733 | 5384 | offset = size - fragsz; |
---|
| 5385 | + if (unlikely(offset < 0)) { |
---|
| 5386 | + /* |
---|
| 5387 | + * The caller is trying to allocate a fragment |
---|
| 5388 | + * with fragsz > PAGE_SIZE but the cache isn't big |
---|
| 5389 | + * enough to satisfy the request, this may |
---|
| 5390 | + * happen in low memory conditions. |
---|
| 5391 | + * We don't release the cache page because |
---|
| 5392 | + * it could make memory pressure worse |
---|
| 5393 | + * so we simply return NULL here. |
---|
| 5394 | + */ |
---|
| 5395 | + return NULL; |
---|
| 5396 | + } |
---|
4734 | 5397 | } |
---|
4735 | 5398 | |
---|
4736 | 5399 | nc->pagecnt_bias--; |
---|
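The new `offset < 0` guard covers callers asking for a fragment larger than the cache page itself; rather than handing out an offset that went negative, the allocator now refuses the request. A tiny sketch of that bound check (sizes are illustrative):

```c
#include <stdio.h>

/* Toy model of the new guard in the page-frag allocator: if the
 * requested fragment is larger than the whole cache page, the offset
 * would go negative, so the request is refused instead. */
static long frag_offset(long cache_size, long fragsz)
{
        long offset = cache_size - fragsz;

        if (offset < 0)
                return -1;      /* fragment cannot fit; caller gets NULL */
        return offset;
}

int main(void)
{
        printf("%ld\n", frag_offset(4096, 1536));   /* 2560 */
        printf("%ld\n", frag_offset(4096, 8192));   /* -1: too large */
        return 0;
}
```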
.. | .. |
---|
4771 | 5434 | /** |
---|
4772 | 5435 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. |
---|
4773 | 5436 | * @size: the number of bytes to allocate |
---|
4774 | | - * @gfp_mask: GFP flags for the allocation |
---|
| 5437 | + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
---|
4775 | 5438 | * |
---|
4776 | 5439 | * This function is similar to alloc_pages(), except that it allocates the |
---|
4777 | 5440 | * minimum number of pages to satisfy the request. alloc_pages() can only |
---|
.. | .. |
---|
4780 | 5443 | * This function is also limited by MAX_ORDER. |
---|
4781 | 5444 | * |
---|
4782 | 5445 | * Memory allocated by this function must be released by free_pages_exact(). |
---|
| 5446 | + * |
---|
| 5447 | + * Return: pointer to the allocated area or %NULL in case of error. |
---|
4783 | 5448 | */ |
---|
4784 | 5449 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) |
---|
4785 | 5450 | { |
---|
4786 | 5451 | unsigned int order = get_order(size); |
---|
4787 | 5452 | unsigned long addr; |
---|
| 5453 | + |
---|
| 5454 | + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) |
---|
| 5455 | + gfp_mask &= ~__GFP_COMP; |
---|
4788 | 5456 | |
---|
4789 | 5457 | addr = __get_free_pages(gfp_mask, order); |
---|
4790 | 5458 | return make_alloc_exact(addr, order, size); |
---|
.. | .. |
---|
4796 | 5464 | * pages on a node. |
---|
4797 | 5465 | * @nid: the preferred node ID where memory should be allocated |
---|
4798 | 5466 | * @size: the number of bytes to allocate |
---|
4799 | | - * @gfp_mask: GFP flags for the allocation |
---|
| 5467 | + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
---|
4800 | 5468 | * |
---|
4801 | 5469 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
---|
4802 | 5470 | * back. |
---|
| 5471 | + * |
---|
| 5472 | + * Return: pointer to the allocated area or %NULL in case of error. |
---|
4803 | 5473 | */ |
---|
4804 | 5474 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
---|
4805 | 5475 | { |
---|
4806 | 5476 | unsigned int order = get_order(size); |
---|
4807 | | - struct page *p = alloc_pages_node(nid, gfp_mask, order); |
---|
| 5477 | + struct page *p; |
---|
| 5478 | + |
---|
| 5479 | + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) |
---|
| 5480 | + gfp_mask &= ~__GFP_COMP; |
---|
| 5481 | + |
---|
| 5482 | + p = alloc_pages_node(nid, gfp_mask, order); |
---|
4808 | 5483 | if (!p) |
---|
4809 | 5484 | return NULL; |
---|
4810 | 5485 | return make_alloc_exact((unsigned long)page_address(p), order, size); |
---|
.. | .. |
---|
4833 | 5508 | * nr_free_zone_pages - count number of pages beyond high watermark |
---|
4834 | 5509 | * @offset: The zone index of the highest zone |
---|
4835 | 5510 | * |
---|
4836 | | - * nr_free_zone_pages() counts the number of counts pages which are beyond the |
---|
| 5511 | + * nr_free_zone_pages() counts the number of pages which are beyond the |
---|
4837 | 5512 | * high watermark within all zones at or below a given zone index. For each |
---|
4838 | 5513 | * zone, the number of pages is calculated as: |
---|
4839 | 5514 | * |
---|
4840 | 5515 | * nr_free_zone_pages = managed_pages - high_pages |
---|
| 5516 | + * |
---|
| 5517 | + * Return: number of pages beyond high watermark. |
---|
4841 | 5518 | */ |
---|
4842 | 5519 | static unsigned long nr_free_zone_pages(int offset) |
---|
4843 | 5520 | { |
---|
.. | .. |
---|
4850 | 5527 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
---|
4851 | 5528 | |
---|
4852 | 5529 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
---|
4853 | | - unsigned long size = zone->managed_pages; |
---|
| 5530 | + unsigned long size = zone_managed_pages(zone); |
---|
4854 | 5531 | unsigned long high = high_wmark_pages(zone); |
---|
4855 | 5532 | if (size > high) |
---|
4856 | 5533 | sum += size - high; |
---|
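
To make the managed_pages - high_pages accounting concrete, here is a small userspace sketch with invented zone figures (the zone names and numbers are illustrative only, not taken from any real system):

#include <stdio.h>

/* Illustrative zone figures (in pages); not real kernel data. */
struct zone_example { const char *name; unsigned long managed, high; };

int main(void)
{
        struct zone_example zones[] = {
                { "DMA32",  262144, 2048 },   /* 1 GiB managed, 8 MiB high wmark */
                { "Normal", 786432, 6144 },   /* 3 GiB managed, 24 MiB high wmark */
        };
        unsigned long sum = 0;

        for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
                if (zones[i].managed > zones[i].high)
                        sum += zones[i].managed - zones[i].high;

        printf("pages beyond high watermark: %lu\n", sum); /* 1040384 */
        return 0;
}

Only zones whose managed page count exceeds the high watermark contribute, mirroring the size > high check above.
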
.. | .. |
---|
4864 | 5541 | * |
---|
4865 | 5542 | * nr_free_buffer_pages() counts the number of pages which are beyond the high |
---|
4866 | 5543 | * watermark within ZONE_DMA and ZONE_NORMAL. |
---|
| 5544 | + * |
---|
| 5545 | + * Return: number of pages beyond high watermark within ZONE_DMA and |
---|
| 5546 | + * ZONE_NORMAL. |
---|
4867 | 5547 | */ |
---|
4868 | 5548 | unsigned long nr_free_buffer_pages(void) |
---|
4869 | 5549 | { |
---|
4870 | 5550 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
---|
4871 | 5551 | } |
---|
4872 | 5552 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
---|
4873 | | - |
---|
4874 | | -/** |
---|
4875 | | - * nr_free_pagecache_pages - count number of pages beyond high watermark |
---|
4876 | | - * |
---|
4877 | | - * nr_free_pagecache_pages() counts the number of pages which are beyond the |
---|
4878 | | - * high watermark within all zones. |
---|
4879 | | - */ |
---|
4880 | | -unsigned long nr_free_pagecache_pages(void) |
---|
4881 | | -{ |
---|
4882 | | - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
---|
4883 | | -} |
---|
4884 | 5553 | |
---|
4885 | 5554 | static inline void show_node(struct zone *zone) |
---|
4886 | 5555 | { |
---|
.. | .. |
---|
4902 | 5571 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); |
---|
4903 | 5572 | |
---|
4904 | 5573 | for_each_zone(zone) |
---|
4905 | | - wmark_low += zone->watermark[WMARK_LOW]; |
---|
| 5574 | + wmark_low += low_wmark_pages(zone); |
---|
4906 | 5575 | |
---|
4907 | 5576 | /* |
---|
4908 | 5577 | * Estimate the amount of memory available for userspace allocations, |
---|
.. | .. |
---|
4924 | 5593 | * items that are in use, and cannot be freed. Cap this estimate at the |
---|
4925 | 5594 | * low watermark. |
---|
4926 | 5595 | */ |
---|
4927 | | - reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + |
---|
4928 | | - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
---|
| 5596 | + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + |
---|
| 5597 | + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
---|
4929 | 5598 | available += reclaimable - min(reclaimable / 2, wmark_low); |
---|
4930 | 5599 | |
---|
4931 | 5600 | if (available < 0) |
---|
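
A quick worked example of the reclaimable-slab estimate above, as a standalone userspace sketch; every number is an assumption chosen for illustration:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Illustrative values in pages; not taken from a real system. */
        unsigned long available   = 150000; /* free + pagecache estimate so far */
        unsigned long wmark_low   = 8192;   /* sum of all zones' low watermarks */
        unsigned long reclaimable = 30000;  /* reclaimable slab + misc reclaimable */

        available += reclaimable - min_ul(reclaimable / 2, wmark_low);
        printf("estimated available: %lu pages\n", available); /* 171808 */
        return 0;
}

Capping the subtracted share at the low watermark keeps the estimate from assuming that slab pages below the watermark could ever be handed to userspace.
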
.. | .. |
---|
4936 | 5605 | |
---|
4937 | 5606 | void si_meminfo(struct sysinfo *val) |
---|
4938 | 5607 | { |
---|
4939 | | - val->totalram = totalram_pages; |
---|
| 5608 | + val->totalram = totalram_pages(); |
---|
4940 | 5609 | val->sharedram = global_node_page_state(NR_SHMEM); |
---|
4941 | 5610 | val->freeram = global_zone_page_state(NR_FREE_PAGES); |
---|
4942 | 5611 | val->bufferram = nr_blockdev_pages(); |
---|
4943 | | - val->totalhigh = totalhigh_pages; |
---|
| 5612 | + val->totalhigh = totalhigh_pages(); |
---|
4944 | 5613 | val->freehigh = nr_free_highpages(); |
---|
4945 | 5614 | val->mem_unit = PAGE_SIZE; |
---|
4946 | 5615 | } |
---|
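
For reference, a hypothetical in-kernel caller of si_meminfo() might look like the sketch below; the function name report_memory() and the kB conversion are illustrative, and the snippet assumes mem_unit is a whole number of KiB:

#include <linux/mm.h>
#include <linux/sysinfo.h>
#include <linux/printk.h>

static void report_memory(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        /* totalram/freeram are counted in units of si.mem_unit (PAGE_SIZE here). */
        pr_info("total: %lu kB, free: %lu kB\n",
                si.totalram * (si.mem_unit / 1024),
                si.freeram * (si.mem_unit / 1024));
}
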
.. | .. |
---|
4957 | 5626 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
4958 | 5627 | |
---|
4959 | 5628 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
---|
4960 | | - managed_pages += pgdat->node_zones[zone_type].managed_pages; |
---|
| 5629 | + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); |
---|
4961 | 5630 | val->totalram = managed_pages; |
---|
4962 | 5631 | val->sharedram = node_page_state(pgdat, NR_SHMEM); |
---|
4963 | 5632 | val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); |
---|
.. | .. |
---|
4966 | 5635 | struct zone *zone = &pgdat->node_zones[zone_type]; |
---|
4967 | 5636 | |
---|
4968 | 5637 | if (is_highmem(zone)) { |
---|
4969 | | - managed_highpages += zone->managed_pages; |
---|
| 5638 | + managed_highpages += zone_managed_pages(zone); |
---|
4970 | 5639 | free_highpages += zone_page_state(zone, NR_FREE_PAGES); |
---|
4971 | 5640 | } |
---|
4972 | 5641 | } |
---|
.. | .. |
---|
5055 | 5724 | |
---|
5056 | 5725 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
---|
5057 | 5726 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
---|
5058 | | - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
---|
| 5727 | + " unevictable:%lu dirty:%lu writeback:%lu\n" |
---|
5059 | 5728 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
---|
5060 | 5729 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
---|
5061 | 5730 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
---|
.. | .. |
---|
5068 | 5737 | global_node_page_state(NR_UNEVICTABLE), |
---|
5069 | 5738 | global_node_page_state(NR_FILE_DIRTY), |
---|
5070 | 5739 | global_node_page_state(NR_WRITEBACK), |
---|
5071 | | - global_node_page_state(NR_UNSTABLE_NFS), |
---|
5072 | | - global_node_page_state(NR_SLAB_RECLAIMABLE), |
---|
5073 | | - global_node_page_state(NR_SLAB_UNRECLAIMABLE), |
---|
| 5740 | + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), |
---|
| 5741 | + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), |
---|
5074 | 5742 | global_node_page_state(NR_FILE_MAPPED), |
---|
5075 | 5743 | global_node_page_state(NR_SHMEM), |
---|
5076 | 5744 | global_zone_page_state(NR_PAGETABLE), |
---|
.. | .. |
---|
5079 | 5747 | free_pcp, |
---|
5080 | 5748 | global_zone_page_state(NR_FREE_CMA_PAGES)); |
---|
5081 | 5749 | |
---|
| 5750 | + trace_android_vh_show_mapcount_pages(NULL); |
---|
5082 | 5751 | for_each_online_pgdat(pgdat) { |
---|
5083 | 5752 | if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) |
---|
5084 | 5753 | continue; |
---|
.. | .. |
---|
5101 | 5770 | " anon_thp: %lukB" |
---|
5102 | 5771 | #endif |
---|
5103 | 5772 | " writeback_tmp:%lukB" |
---|
5104 | | - " unstable:%lukB" |
---|
| 5773 | + " kernel_stack:%lukB" |
---|
| 5774 | +#ifdef CONFIG_SHADOW_CALL_STACK |
---|
| 5775 | + " shadow_call_stack:%lukB" |
---|
| 5776 | +#endif |
---|
5105 | 5777 | " all_unreclaimable? %s" |
---|
5106 | 5778 | "\n", |
---|
5107 | 5779 | pgdat->node_id, |
---|
.. | .. |
---|
5123 | 5795 | K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), |
---|
5124 | 5796 | #endif |
---|
5125 | 5797 | K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), |
---|
5126 | | - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), |
---|
| 5798 | + node_page_state(pgdat, NR_KERNEL_STACK_KB), |
---|
| 5799 | +#ifdef CONFIG_SHADOW_CALL_STACK |
---|
| 5800 | + node_page_state(pgdat, NR_KERNEL_SCS_KB), |
---|
| 5801 | +#endif |
---|
5127 | 5802 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? |
---|
5128 | 5803 | "yes" : "no"); |
---|
5129 | 5804 | } |
---|
.. | .. |
---|
5145 | 5820 | " min:%lukB" |
---|
5146 | 5821 | " low:%lukB" |
---|
5147 | 5822 | " high:%lukB" |
---|
| 5823 | + " reserved_highatomic:%luKB" |
---|
5148 | 5824 | " active_anon:%lukB" |
---|
5149 | 5825 | " inactive_anon:%lukB" |
---|
5150 | 5826 | " active_file:%lukB" |
---|
.. | .. |
---|
5154 | 5830 | " present:%lukB" |
---|
5155 | 5831 | " managed:%lukB" |
---|
5156 | 5832 | " mlocked:%lukB" |
---|
5157 | | - " kernel_stack:%lukB" |
---|
5158 | | -#ifdef CONFIG_SHADOW_CALL_STACK |
---|
5159 | | - " shadow_call_stack:%lukB" |
---|
5160 | | -#endif |
---|
5161 | 5833 | " pagetables:%lukB" |
---|
5162 | 5834 | " bounce:%lukB" |
---|
5163 | 5835 | " free_pcp:%lukB" |
---|
.. | .. |
---|
5169 | 5841 | K(min_wmark_pages(zone)), |
---|
5170 | 5842 | K(low_wmark_pages(zone)), |
---|
5171 | 5843 | K(high_wmark_pages(zone)), |
---|
| 5844 | + K(zone->nr_reserved_highatomic), |
---|
5172 | 5845 | K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), |
---|
5173 | 5846 | K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), |
---|
5174 | 5847 | K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), |
---|
.. | .. |
---|
5176 | 5849 | K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), |
---|
5177 | 5850 | K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), |
---|
5178 | 5851 | K(zone->present_pages), |
---|
5179 | | - K(zone->managed_pages), |
---|
| 5852 | + K(zone_managed_pages(zone)), |
---|
5180 | 5853 | K(zone_page_state(zone, NR_MLOCK)), |
---|
5181 | | - zone_page_state(zone, NR_KERNEL_STACK_KB), |
---|
5182 | | -#ifdef CONFIG_SHADOW_CALL_STACK |
---|
5183 | | - zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024, |
---|
5184 | | -#endif |
---|
5185 | 5854 | K(zone_page_state(zone, NR_PAGETABLE)), |
---|
5186 | 5855 | K(zone_page_state(zone, NR_BOUNCE)), |
---|
5187 | 5856 | K(free_pcp), |
---|
.. | .. |
---|
5213 | 5882 | |
---|
5214 | 5883 | types[order] = 0; |
---|
5215 | 5884 | for (type = 0; type < MIGRATE_TYPES; type++) { |
---|
5216 | | - if (!list_empty(&area->free_list[type])) |
---|
| 5885 | + if (!free_area_empty(area, type)) |
---|
5217 | 5886 | types[order] |= 1 << type; |
---|
5218 | 5887 | } |
---|
5219 | 5888 | } |
---|
.. | .. |
---|
5254 | 5923 | do { |
---|
5255 | 5924 | zone_type--; |
---|
5256 | 5925 | zone = pgdat->node_zones + zone_type; |
---|
5257 | | - if (managed_zone(zone)) { |
---|
| 5926 | + if (populated_zone(zone)) { |
---|
5258 | 5927 | zoneref_set_zone(zone, &zonerefs[nr_zones++]); |
---|
5259 | 5928 | check_highest_zone(zone_type); |
---|
5260 | 5929 | } |
---|
.. | .. |
---|
5280 | 5949 | return 0; |
---|
5281 | 5950 | } |
---|
5282 | 5951 | |
---|
5283 | | -static __init int setup_numa_zonelist_order(char *s) |
---|
5284 | | -{ |
---|
5285 | | - if (!s) |
---|
5286 | | - return 0; |
---|
5287 | | - |
---|
5288 | | - return __parse_numa_zonelist_order(s); |
---|
5289 | | -} |
---|
5290 | | -early_param("numa_zonelist_order", setup_numa_zonelist_order); |
---|
5291 | | - |
---|
5292 | 5952 | char numa_zonelist_order[] = "Node"; |
---|
5293 | 5953 | |
---|
5294 | 5954 | /* |
---|
5295 | 5955 | * sysctl handler for numa_zonelist_order |
---|
5296 | 5956 | */ |
---|
5297 | 5957 | int numa_zonelist_order_handler(struct ctl_table *table, int write, |
---|
5298 | | - void __user *buffer, size_t *length, |
---|
5299 | | - loff_t *ppos) |
---|
| 5958 | + void *buffer, size_t *length, loff_t *ppos) |
---|
5300 | 5959 | { |
---|
5301 | | - char *str; |
---|
5302 | | - int ret; |
---|
5303 | | - |
---|
5304 | | - if (!write) |
---|
5305 | | - return proc_dostring(table, write, buffer, length, ppos); |
---|
5306 | | - str = memdup_user_nul(buffer, 16); |
---|
5307 | | - if (IS_ERR(str)) |
---|
5308 | | - return PTR_ERR(str); |
---|
5309 | | - |
---|
5310 | | - ret = __parse_numa_zonelist_order(str); |
---|
5311 | | - kfree(str); |
---|
5312 | | - return ret; |
---|
| 5960 | + if (write) |
---|
| 5961 | + return __parse_numa_zonelist_order(buffer); |
---|
| 5962 | + return proc_dostring(table, write, buffer, length, ppos); |
---|
5313 | 5963 | } |
---|
5314 | 5964 | |
---|
5315 | 5965 | |
---|
.. | .. |
---|
5328 | 5978 | * from each node to each node in the system), and should also prefer nodes |
---|
5329 | 5979 | * with no CPUs, since presumably they'll have very little allocation pressure |
---|
5330 | 5980 | * on them otherwise. |
---|
5331 | | - * It returns -1 if no node is found. |
---|
| 5981 | + * |
---|
| 5982 | + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. |
---|
5332 | 5983 | */ |
---|
5333 | 5984 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
---|
5334 | 5985 | { |
---|
5335 | 5986 | int n, val; |
---|
5336 | 5987 | int min_val = INT_MAX; |
---|
5337 | 5988 | int best_node = NUMA_NO_NODE; |
---|
5338 | | - const struct cpumask *tmp = cpumask_of_node(0); |
---|
5339 | 5989 | |
---|
5340 | 5990 | /* Use the local node if we haven't already */ |
---|
5341 | 5991 | if (!node_isset(node, *used_node_mask)) { |
---|
.. | .. |
---|
5356 | 6006 | val += (n < node); |
---|
5357 | 6007 | |
---|
5358 | 6008 | /* Give preference to headless and unused nodes */ |
---|
5359 | | - tmp = cpumask_of_node(n); |
---|
5360 | | - if (!cpumask_empty(tmp)) |
---|
| 6009 | + if (!cpumask_empty(cpumask_of_node(n))) |
---|
5361 | 6010 | val += PENALTY_FOR_NODE_WITH_CPUS; |
---|
5362 | 6011 | |
---|
5363 | 6012 | /* Slight preference for less loaded node */ |
---|
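
To see how the visible pieces of this heuristic interact, here is a simplified userspace sketch. It keeps only the terms shown in this hunk (node distance, the (n < node) tie-break and the CPU penalty), uses an invented three-node topology, and leaves out the node_load term and the used-node filtering that the real function also applies, so the result is purely illustrative:

#include <stdio.h>
#include <stdbool.h>

#define PENALTY_FOR_NODE_WITH_CPUS 1  /* illustrative penalty value */

/* Fake topology: distance matrix and whether each node has CPUs. */
static const int distance[3][3] = {
        { 10, 20, 30 },
        { 20, 10, 20 },
        { 30, 20, 10 },
};
static const bool has_cpus[3] = { true, true, false };

int main(void)
{
        int node = 0, best = -1, min_val = 1 << 30;

        for (int n = 0; n < 3; n++) {
                if (n == node)
                        continue;
                int val = distance[node][n];
                val += (n < node);              /* slight penalty for lower-numbered nodes */
                if (has_cpus[n])
                        val += PENALTY_FOR_NODE_WITH_CPUS;
                if (val < min_val) {
                        min_val = val;
                        best = n;
                }
        }
        printf("best next node for %d: %d (score %d)\n", node, best, min_val);
        return 0;
}

With these made-up distances the nearer node 1 wins despite its CPU penalty; a headless node only takes precedence when distances are comparable.
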
.. | .. |
---|
5428 | 6077 | { |
---|
5429 | 6078 | static int node_order[MAX_NUMNODES]; |
---|
5430 | 6079 | int node, load, nr_nodes = 0; |
---|
5431 | | - nodemask_t used_mask; |
---|
| 6080 | + nodemask_t used_mask = NODE_MASK_NONE; |
---|
5432 | 6081 | int local_node, prev_node; |
---|
5433 | 6082 | |
---|
5434 | 6083 | /* NUMA-aware ordering of nodes */ |
---|
5435 | 6084 | local_node = pgdat->node_id; |
---|
5436 | 6085 | load = nr_online_nodes; |
---|
5437 | 6086 | prev_node = local_node; |
---|
5438 | | - nodes_clear(used_mask); |
---|
5439 | 6087 | |
---|
5440 | 6088 | memset(node_order, 0, sizeof(node_order)); |
---|
5441 | 6089 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
---|
.. | .. |
---|
5542 | 6190 | int nid; |
---|
5543 | 6191 | int __maybe_unused cpu; |
---|
5544 | 6192 | pg_data_t *self = data; |
---|
5545 | | - static DEFINE_SPINLOCK(lock); |
---|
| 6193 | + unsigned long flags; |
---|
5546 | 6194 | |
---|
5547 | | - spin_lock(&lock); |
---|
| 6195 | + /* |
---|
| 6196 | + * Explicitly disable this CPU's interrupts before taking seqlock |
---|
| 6197 | + * to prevent any IRQ handler from calling into the page allocator |
---|
| 6198 | + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. |
---|
| 6199 | + */ |
---|
| 6200 | + local_irq_save(flags); |
---|
| 6201 | + /* |
---|
| 6202 | + * Explicitly disable this CPU's synchronous printk() before taking |
---|
| 6203 | + * seqlock to prevent any printk() from trying to hold port->lock, for |
---|
| 6204 | + * tty_insert_flip_string_and_push_buffer() on other CPU might be |
---|
| 6205 | + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. |
---|
| 6206 | + */ |
---|
| 6207 | + printk_deferred_enter(); |
---|
| 6208 | + write_seqlock(&zonelist_update_seq); |
---|
5548 | 6209 | |
---|
5549 | 6210 | #ifdef CONFIG_NUMA |
---|
5550 | 6211 | memset(node_load, 0, sizeof(node_load)); |
---|
.. | .. |
---|
5577 | 6238 | #endif |
---|
5578 | 6239 | } |
---|
5579 | 6240 | |
---|
5580 | | - spin_unlock(&lock); |
---|
| 6241 | + write_sequnlock(&zonelist_update_seq); |
---|
| 6242 | + printk_deferred_exit(); |
---|
| 6243 | + local_irq_restore(flags); |
---|
5581 | 6244 | } |
---|
5582 | 6245 | |
---|
5583 | 6246 | static noinline void __init |
---|
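
The comments above describe a seqlock writer that must also keep interrupts and synchronous printk() out of its critical section. A minimal, hypothetical sketch of that general pattern (not the zonelist code itself) could look like this, with example_seq and example_data standing in for the protected state:

#include <linux/seqlock.h>
#include <linux/irqflags.h>
#include <linux/printk.h>

static DEFINE_SEQLOCK(example_seq);
static int example_data;

static void example_update(int new_val)
{
        unsigned long flags;

        local_irq_save(flags);          /* no IRQ-context allocations during the write */
        printk_deferred_enter();        /* no synchronous console output either */
        write_seqlock(&example_seq);

        example_data = new_val;

        write_sequnlock(&example_seq);
        printk_deferred_exit();
        local_irq_restore(flags);
}

static int example_read(void)
{
        unsigned int seq;
        int val;

        do {
                seq = read_seqbegin(&example_seq);
                val = example_data;
        } while (read_seqretry(&example_seq, seq));

        return val;
}

Readers simply retry if a writer was active, which is exactly why the writer must never block on anything a reader (or an interrupt that became a reader) might hold.
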
.. | .. |
---|
5615 | 6278 | */ |
---|
5616 | 6279 | void __ref build_all_zonelists(pg_data_t *pgdat) |
---|
5617 | 6280 | { |
---|
| 6281 | + unsigned long vm_total_pages; |
---|
| 6282 | + |
---|
5618 | 6283 | if (system_state == SYSTEM_BOOTING) { |
---|
5619 | 6284 | build_all_zonelists_init(); |
---|
5620 | 6285 | } else { |
---|
5621 | 6286 | __build_all_zonelists(pgdat); |
---|
5622 | 6287 | /* cpuset refresh routine should be here */ |
---|
5623 | 6288 | } |
---|
5624 | | - vm_total_pages = nr_free_pagecache_pages(); |
---|
| 6289 | + /* Get the number of free pages beyond high watermark in all zones. */ |
---|
| 6290 | + vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
---|
5625 | 6291 | /* |
---|
5626 | 6292 | * Disable grouping by mobility if the number of pages in the |
---|
5627 | 6293 | * system is too low to allow the mechanism to work. It would be |
---|
.. | .. |
---|
5634 | 6300 | else |
---|
5635 | 6301 | page_group_by_mobility_disabled = 0; |
---|
5636 | 6302 | |
---|
5637 | | - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", |
---|
| 6303 | + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", |
---|
5638 | 6304 | nr_online_nodes, |
---|
5639 | 6305 | page_group_by_mobility_disabled ? "off" : "on", |
---|
5640 | 6306 | vm_total_pages); |
---|
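
For a rough sense of scale: assuming the usual threshold of pageblock_nr_pages * MIGRATE_TYPES (which sits in the lines elided above) together with an illustrative pageblock order of 9, 4 KiB pages and six migrate types, grouping by mobility is only disabled on very small systems:

#include <stdio.h>

int main(void)
{
        /* Illustrative values: pageblock_order 9, 4 KiB pages, 6 migrate types. */
        unsigned long pageblock_nr_pages = 1UL << 9;
        unsigned long migrate_types = 6;
        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("grouping disabled below %lu pages (~%lu MiB)\n",
               threshold, threshold * 4096 / (1024 * 1024));   /* 3072 pages, ~12 MiB */
        return 0;
}
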
.. | .. |
---|
5643 | 6309 | #endif |
---|
5644 | 6310 | } |
---|
5645 | 6311 | |
---|
| 6312 | +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ |
---|
| 6313 | +static bool __meminit |
---|
| 6314 | +overlap_memmap_init(unsigned long zone, unsigned long *pfn) |
---|
| 6315 | +{ |
---|
| 6316 | + static struct memblock_region *r; |
---|
| 6317 | + |
---|
| 6318 | + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { |
---|
| 6319 | + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { |
---|
| 6320 | + for_each_mem_region(r) { |
---|
| 6321 | + if (*pfn < memblock_region_memory_end_pfn(r)) |
---|
| 6322 | + break; |
---|
| 6323 | + } |
---|
| 6324 | + } |
---|
| 6325 | + if (*pfn >= memblock_region_memory_base_pfn(r) && |
---|
| 6326 | + memblock_is_mirror(r)) { |
---|
| 6327 | + *pfn = memblock_region_memory_end_pfn(r); |
---|
| 6328 | + return true; |
---|
| 6329 | + } |
---|
| 6330 | + } |
---|
| 6331 | + return false; |
---|
| 6332 | +} |
---|
| 6333 | + |
---|
5646 | 6334 | /* |
---|
5647 | 6335 | * Initially all pages are reserved - free ones are freed |
---|
5648 | | - * up by free_all_bootmem() once the early boot process is |
---|
| 6336 | + * up by memblock_free_all() once the early boot process is |
---|
5649 | 6337 | * done. Non-atomic initialization, single-pass. |
---|
| 6338 | + * |
---|
| 6339 | + * All aligned pageblocks are initialized to the specified migratetype |
---|
| 6340 | + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related |
---|
| 6341 | + * zone stats (e.g., nr_isolate_pageblock) are touched. |
---|
5650 | 6342 | */ |
---|
5651 | 6343 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
---|
5652 | | - unsigned long start_pfn, enum meminit_context context, |
---|
5653 | | - struct vmem_altmap *altmap) |
---|
| 6344 | + unsigned long start_pfn, unsigned long zone_end_pfn, |
---|
| 6345 | + enum meminit_context context, |
---|
| 6346 | + struct vmem_altmap *altmap, int migratetype) |
---|
5654 | 6347 | { |
---|
5655 | | - unsigned long end_pfn = start_pfn + size; |
---|
5656 | | - pg_data_t *pgdat = NODE_DATA(nid); |
---|
5657 | | - unsigned long pfn; |
---|
5658 | | - unsigned long nr_initialised = 0; |
---|
| 6348 | + unsigned long pfn, end_pfn = start_pfn + size; |
---|
5659 | 6349 | struct page *page; |
---|
5660 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
5661 | | - struct memblock_region *r = NULL, *tmp; |
---|
5662 | | -#endif |
---|
5663 | 6350 | |
---|
5664 | 6351 | if (highest_memmap_pfn < end_pfn - 1) |
---|
5665 | 6352 | highest_memmap_pfn = end_pfn - 1; |
---|
| 6353 | + |
---|
| 6354 | +#ifdef CONFIG_ZONE_DEVICE |
---|
| 6355 | + /* |
---|
| 6356 | + * Honor reservation requested by the driver for this ZONE_DEVICE |
---|
| 6357 | + * memory. We limit the total number of pages to initialize to just |
---|
| 6358 | + * those that might contain the memory mapping. We will defer the |
---|
| 6359 | + * ZONE_DEVICE page initialization until after we have released |
---|
| 6360 | + * the hotplug lock. |
---|
| 6361 | + */ |
---|
| 6362 | + if (zone == ZONE_DEVICE) { |
---|
| 6363 | + if (!altmap) |
---|
| 6364 | + return; |
---|
| 6365 | + |
---|
| 6366 | + if (start_pfn == altmap->base_pfn) |
---|
| 6367 | + start_pfn += altmap->reserve; |
---|
| 6368 | + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); |
---|
| 6369 | + } |
---|
| 6370 | +#endif |
---|
5666 | 6371 | |
---|
5667 | 6372 | #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT |
---|
5668 | 6373 | /* Zero all page struct in advance */ |
---|
5669 | 6374 | memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size); |
---|
5670 | 6375 | #endif |
---|
5671 | 6376 | |
---|
5672 | | - /* |
---|
5673 | | - * Honor reservation requested by the driver for this ZONE_DEVICE |
---|
5674 | | - * memory |
---|
5675 | | - */ |
---|
5676 | | - if (altmap && start_pfn == altmap->base_pfn) |
---|
5677 | | - start_pfn += altmap->reserve; |
---|
5678 | | - |
---|
5679 | | - for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
---|
| 6377 | + for (pfn = start_pfn; pfn < end_pfn; ) { |
---|
5680 | 6378 | /* |
---|
5681 | 6379 | * There can be holes in boot-time mem_map[]s handed to this |
---|
5682 | 6380 | * function. They do not exist on hotplugged memory. |
---|
5683 | 6381 | */ |
---|
5684 | | - if (context != MEMINIT_EARLY) |
---|
5685 | | - goto not_early; |
---|
5686 | | - |
---|
5687 | | - if (!early_pfn_valid(pfn)) |
---|
5688 | | - continue; |
---|
5689 | | - if (!early_pfn_in_nid(pfn, nid)) |
---|
5690 | | - continue; |
---|
5691 | | - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) |
---|
5692 | | - break; |
---|
5693 | | - |
---|
5694 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
5695 | | - /* |
---|
5696 | | - * Check given memblock attribute by firmware which can affect |
---|
5697 | | - * kernel memory layout. If zone==ZONE_MOVABLE but memory is |
---|
5698 | | - * mirrored, it's an overlapped memmap init. skip it. |
---|
5699 | | - */ |
---|
5700 | | - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { |
---|
5701 | | - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { |
---|
5702 | | - for_each_memblock(memory, tmp) |
---|
5703 | | - if (pfn < memblock_region_memory_end_pfn(tmp)) |
---|
5704 | | - break; |
---|
5705 | | - r = tmp; |
---|
5706 | | - } |
---|
5707 | | - if (pfn >= memblock_region_memory_base_pfn(r) && |
---|
5708 | | - memblock_is_mirror(r)) { |
---|
5709 | | - /* already initialized as NORMAL */ |
---|
5710 | | - pfn = memblock_region_memory_end_pfn(r); |
---|
| 6382 | + if (context == MEMINIT_EARLY) { |
---|
| 6383 | + if (overlap_memmap_init(zone, &pfn)) |
---|
5711 | 6384 | continue; |
---|
5712 | | - } |
---|
| 6385 | + if (defer_init(nid, pfn, zone_end_pfn)) |
---|
| 6386 | + break; |
---|
5713 | 6387 | } |
---|
5714 | | -#endif |
---|
5715 | 6388 | |
---|
5716 | | -not_early: |
---|
5717 | 6389 | page = pfn_to_page(pfn); |
---|
5718 | 6390 | __init_single_page(page, pfn, zone, nid, false); |
---|
5719 | 6391 | if (context == MEMINIT_HOTPLUG) |
---|
5720 | | - SetPageReserved(page); |
---|
| 6392 | + __SetPageReserved(page); |
---|
| 6393 | + |
---|
| 6394 | + /* |
---|
| 6395 | + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, |
---|
| 6396 | + * such that unmovable allocations won't be scattered all |
---|
| 6397 | + * over the place during system boot. |
---|
| 6398 | + */ |
---|
| 6399 | + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { |
---|
| 6400 | + set_pageblock_migratetype(page, migratetype); |
---|
| 6401 | + cond_resched(); |
---|
| 6402 | + } |
---|
| 6403 | + pfn++; |
---|
| 6404 | + } |
---|
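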
| 6405 | +} |
---|
| 6406 | + |
---|
| 6407 | +#ifdef CONFIG_ZONE_DEVICE |
---|
| 6408 | +void __ref memmap_init_zone_device(struct zone *zone, |
---|
| 6409 | + unsigned long start_pfn, |
---|
| 6410 | + unsigned long nr_pages, |
---|
| 6411 | + struct dev_pagemap *pgmap) |
---|
| 6412 | +{ |
---|
| 6413 | + unsigned long pfn, end_pfn = start_pfn + nr_pages; |
---|
| 6414 | + struct pglist_data *pgdat = zone->zone_pgdat; |
---|
| 6415 | + struct vmem_altmap *altmap = pgmap_altmap(pgmap); |
---|
| 6416 | + unsigned long zone_idx = zone_idx(zone); |
---|
| 6417 | + unsigned long start = jiffies; |
---|
| 6418 | + int nid = pgdat->node_id; |
---|
| 6419 | + |
---|
| 6420 | + if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) |
---|
| 6421 | + return; |
---|
| 6422 | + |
---|
| 6423 | + /* |
---|
| 6424 | + * The call to memmap_init should have already taken care |
---|
| 6425 | + * of the pages reserved for the memmap, so we can just jump to |
---|
| 6426 | + * the end of that region and start processing the device pages. |
---|
| 6427 | + */ |
---|
| 6428 | + if (altmap) { |
---|
| 6429 | + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); |
---|
| 6430 | + nr_pages = end_pfn - start_pfn; |
---|
| 6431 | + } |
---|
| 6432 | + |
---|
| 6433 | + for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
---|
| 6434 | + struct page *page = pfn_to_page(pfn); |
---|
| 6435 | + |
---|
| 6436 | + __init_single_page(page, pfn, zone_idx, nid, true); |
---|
| 6437 | + |
---|
| 6438 | + /* |
---|
| 6439 | + * Mark page reserved as it will need to wait for onlining |
---|
| 6440 | + * phase for it to be fully associated with a zone. |
---|
| 6441 | + * |
---|
| 6442 | + * We can use the non-atomic __set_bit operation for setting |
---|
| 6443 | + * the flag as we are still initializing the pages. |
---|
| 6444 | + */ |
---|
| 6445 | + __SetPageReserved(page); |
---|
| 6446 | + |
---|
| 6447 | + /* |
---|
| 6448 | + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer |
---|
| 6449 | + * and zone_device_data. It is a bug if a ZONE_DEVICE page is |
---|
| 6450 | + * ever freed or placed on a driver-private list. |
---|
| 6451 | + */ |
---|
| 6452 | + page->pgmap = pgmap; |
---|
| 6453 | + page->zone_device_data = NULL; |
---|
5721 | 6454 | |
---|
5722 | 6455 | /* |
---|
5723 | 6456 | * Mark the block movable so that blocks are reserved for |
---|
.. | .. |
---|
5726 | 6459 | * the address space during boot when many long-lived |
---|
5727 | 6460 | * kernel allocations are made. |
---|
5728 | 6461 | * |
---|
5729 | | - * bitmap is created for zone's valid pfn range. but memmap |
---|
5730 | | - * can be created for invalid pages (for alignment) |
---|
5731 | | - * check here not to call set_pageblock_migratetype() against |
---|
5732 | | - * pfn out of zone. |
---|
5733 | | - * |
---|
5734 | 6462 | * Please note that MEMINIT_HOTPLUG path doesn't clear memmap |
---|
5735 | | - * because this is done early in sparse_add_one_section |
---|
| 6463 | + * because this is done early in section_activate() |
---|
5736 | 6464 | */ |
---|
5737 | | - if (!(pfn & (pageblock_nr_pages - 1))) { |
---|
| 6465 | + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { |
---|
5738 | 6466 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
5739 | 6467 | cond_resched(); |
---|
5740 | 6468 | } |
---|
5741 | 6469 | } |
---|
| 6470 | + |
---|
| 6471 | + pr_info("%s initialised %lu pages in %ums\n", __func__, |
---|
| 6472 | + nr_pages, jiffies_to_msecs(jiffies - start)); |
---|
5742 | 6473 | } |
---|
5743 | 6474 | |
---|
| 6475 | +#endif |
---|
5744 | 6476 | static void __meminit zone_init_free_lists(struct zone *zone) |
---|
5745 | 6477 | { |
---|
5746 | 6478 | unsigned int order, t; |
---|
.. | .. |
---|
5750 | 6482 | } |
---|
5751 | 6483 | } |
---|
5752 | 6484 | |
---|
5753 | | -#ifndef __HAVE_ARCH_MEMMAP_INIT |
---|
5754 | | -#define memmap_init(size, nid, zone, start_pfn) \ |
---|
5755 | | - memmap_init_zone((size), (nid), (zone), (start_pfn), \ |
---|
5756 | | - MEMINIT_EARLY, NULL) |
---|
| 6485 | +/* |
---|
| 6486 | + * Only struct pages that correspond to ranges defined by memblock.memory |
---|
| 6487 | + * are zeroed and initialized by going through __init_single_page() during |
---|
| 6488 | + * memmap_init_zone_range(). |
---|
| 6489 | + * |
---|
| 6490 | + * But, there could be struct pages that correspond to holes in |
---|
| 6491 | + * memblock.memory. This can happen because of the following reasons: |
---|
| 6492 | + * - physical memory bank size is not necessarily the exact multiple of the |
---|
| 6493 | + * arbitrary section size |
---|
| 6494 | + * - early reserved memory may not be listed in memblock.memory |
---|
| 6495 | + * - memory layouts defined with memmap= kernel parameter may not align |
---|
| 6496 | + * nicely with memmap sections |
---|
| 6497 | + * |
---|
| 6498 | + * Explicitly initialize those struct pages so that: |
---|
| 6499 | + * - PG_Reserved is set |
---|
| 6500 | + * - zone and node links point to zone and node that span the page if the |
---|
| 6501 | + * hole is in the middle of a zone |
---|
| 6502 | + * - zone and node links point to adjacent zone/node if the hole falls on |
---|
| 6503 | + * the zone boundary; the pages in such holes will be prepended to the |
---|
| 6504 | + * zone/node above the hole except for the trailing pages in the last |
---|
| 6505 | + * section that will be appended to the zone/node below. |
---|
| 6506 | + */ |
---|
| 6507 | +static void __init init_unavailable_range(unsigned long spfn, |
---|
| 6508 | + unsigned long epfn, |
---|
| 6509 | + int zone, int node) |
---|
| 6510 | +{ |
---|
| 6511 | + unsigned long pfn; |
---|
| 6512 | + u64 pgcnt = 0; |
---|
| 6513 | + |
---|
| 6514 | + for (pfn = spfn; pfn < epfn; pfn++) { |
---|
| 6515 | + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { |
---|
| 6516 | + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) |
---|
| 6517 | + + pageblock_nr_pages - 1; |
---|
| 6518 | + continue; |
---|
| 6519 | + } |
---|
| 6520 | + __init_single_page(pfn_to_page(pfn), pfn, zone, node, true); |
---|
| 6521 | + __SetPageReserved(pfn_to_page(pfn)); |
---|
| 6522 | + pgcnt++; |
---|
| 6523 | + } |
---|
| 6524 | + |
---|
| 6525 | + if (pgcnt) |
---|
| 6526 | + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", |
---|
| 6527 | + node, zone_names[zone], pgcnt); |
---|
| 6528 | +} |
---|
| 6529 | + |
---|
| 6530 | +static void __init memmap_init_zone_range(struct zone *zone, |
---|
| 6531 | + unsigned long start_pfn, |
---|
| 6532 | + unsigned long end_pfn, |
---|
| 6533 | + unsigned long *hole_pfn) |
---|
| 6534 | +{ |
---|
| 6535 | + unsigned long zone_start_pfn = zone->zone_start_pfn; |
---|
| 6536 | + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; |
---|
| 6537 | + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); |
---|
| 6538 | + |
---|
| 6539 | + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); |
---|
| 6540 | + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); |
---|
| 6541 | + |
---|
| 6542 | + if (start_pfn >= end_pfn) |
---|
| 6543 | + return; |
---|
| 6544 | + |
---|
| 6545 | + memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn, |
---|
| 6546 | + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); |
---|
| 6547 | + |
---|
| 6548 | + if (*hole_pfn < start_pfn) |
---|
| 6549 | + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); |
---|
| 6550 | + |
---|
| 6551 | + *hole_pfn = end_pfn; |
---|
| 6552 | +} |
---|
| 6553 | + |
---|
| 6554 | +void __init __weak memmap_init(void) |
---|
| 6555 | +{ |
---|
| 6556 | + unsigned long start_pfn, end_pfn; |
---|
| 6557 | + unsigned long hole_pfn = 0; |
---|
| 6558 | + int i, j, zone_id, nid; |
---|
| 6559 | + |
---|
| 6560 | + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
---|
| 6561 | + struct pglist_data *node = NODE_DATA(nid); |
---|
| 6562 | + |
---|
| 6563 | + for (j = 0; j < MAX_NR_ZONES; j++) { |
---|
| 6564 | + struct zone *zone = node->node_zones + j; |
---|
| 6565 | + |
---|
| 6566 | + if (!populated_zone(zone)) |
---|
| 6567 | + continue; |
---|
| 6568 | + |
---|
| 6569 | + memmap_init_zone_range(zone, start_pfn, end_pfn, |
---|
| 6570 | + &hole_pfn); |
---|
| 6571 | + zone_id = j; |
---|
| 6572 | + } |
---|
| 6573 | + } |
---|
| 6574 | + |
---|
| 6575 | +#ifdef CONFIG_SPARSEMEM |
---|
| 6576 | + /* |
---|
| 6577 | + * Initialize the memory map for hole in the range [memory_end, |
---|
| 6578 | + * section_end]. |
---|
| 6579 | + * Append the pages in this hole to the highest zone in the last |
---|
| 6580 | + * node. |
---|
| 6581 | + * The call to init_unavailable_range() is outside the ifdef to |
---|
| 6582 | + * silence the compiler warning about zone_id set but not used;
---|
| 6583 | + * for FLATMEM it is a nop anyway |
---|
| 6584 | + */ |
---|
| 6585 | + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); |
---|
| 6586 | + if (hole_pfn < end_pfn) |
---|
5757 | 6587 | #endif |
---|
| 6588 | + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); |
---|
| 6589 | +} |
---|
| 6590 | + |
---|
| 6591 | +/* A stub for backwards compatibility with custom implementation on IA-64 */
---|
| 6592 | +void __meminit __weak arch_memmap_init(unsigned long size, int nid, |
---|
| 6593 | + unsigned long zone, |
---|
| 6594 | + unsigned long range_start_pfn) |
---|
| 6595 | +{ |
---|
| 6596 | +} |
---|
5758 | 6597 | |
---|
5759 | 6598 | static int zone_batchsize(struct zone *zone) |
---|
5760 | 6599 | { |
---|
.. | .. |
---|
5765 | 6604 | * The per-cpu-pages pools are set to around 1000th of the |
---|
5766 | 6605 | * size of the zone. |
---|
5767 | 6606 | */ |
---|
5768 | | - batch = zone->managed_pages / 1024; |
---|
| 6607 | + batch = zone_managed_pages(zone) / 1024; |
---|
5769 | 6608 | /* But no more than a meg. */ |
---|
5770 | 6609 | if (batch * PAGE_SIZE > 1024 * 1024) |
---|
5771 | 6610 | batch = (1024 * 1024) / PAGE_SIZE; |
---|
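
A worked example of the sizing above, assuming 4 KiB pages and a 4 GiB zone (the real function applies further rounding below this hunk that is not shown here):

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL   /* illustrative 4 KiB pages */

int main(void)
{
        unsigned long managed = (4UL << 30) / EXAMPLE_PAGE_SIZE; /* 4 GiB zone */
        unsigned long batch = managed / 1024;                    /* ~0.1% of the zone */

        if (batch * EXAMPLE_PAGE_SIZE > 1024 * 1024)             /* but no more than 1 MiB */
                batch = (1024 * 1024) / EXAMPLE_PAGE_SIZE;

        printf("batch = %lu pages\n", batch);                    /* 256 */
        return 0;
}
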
.. | .. |
---|
5812 | 6651 | * locking. |
---|
5813 | 6652 | * |
---|
5814 | 6653 | * Any new users of pcp->batch and pcp->high should ensure they can cope with |
---|
5815 | | - * those fields changing asynchronously (acording the the above rule). |
---|
| 6654 | + * those fields changing asynchronously (according to the above rule).
---|
5816 | 6655 | * |
---|
5817 | 6656 | * mutex_is_locked(&pcp_batch_high_lock) required when calling this function |
---|
5818 | 6657 | * outside of boot time (or some other assurance that no concurrent updaters |
---|
.. | .. |
---|
5821 | 6660 | static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, |
---|
5822 | 6661 | unsigned long batch) |
---|
5823 | 6662 | { |
---|
| 6663 | + trace_android_vh_pageset_update(&high, &batch); |
---|
5824 | 6664 | /* start with a fail safe value for batch */ |
---|
5825 | 6665 | pcp->batch = 1; |
---|
5826 | 6666 | smp_wmb(); |
---|
.. | .. |
---|
5846 | 6686 | memset(p, 0, sizeof(*p)); |
---|
5847 | 6687 | |
---|
5848 | 6688 | pcp = &p->pcp; |
---|
5849 | | - pcp->count = 0; |
---|
5850 | 6689 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
---|
5851 | 6690 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
---|
5852 | 6691 | } |
---|
.. | .. |
---|
5876 | 6715 | { |
---|
5877 | 6716 | if (percpu_pagelist_fraction) |
---|
5878 | 6717 | pageset_set_high(pcp, |
---|
5879 | | - (zone->managed_pages / |
---|
| 6718 | + (zone_managed_pages(zone) / |
---|
5880 | 6719 | percpu_pagelist_fraction)); |
---|
5881 | 6720 | else |
---|
5882 | 6721 | pageset_set_batch(pcp, zone_batchsize(zone)); |
---|
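
A small worked example of the fraction-based high mark, with invented numbers (a 4 GiB zone of 4 KiB pages and percpu_pagelist_fraction set to 8):

#include <stdio.h>

int main(void)
{
        /* Illustrative: 4 GiB zone (4 KiB pages), percpu_pagelist_fraction = 8. */
        unsigned long managed = 1048576;
        unsigned long fraction = 8;
        unsigned long high = managed / fraction;

        printf("per-cpu high mark: %lu pages (~%lu MiB)\n",
               high, high * 4096 / (1024 * 1024));   /* 131072 pages, ~512 MiB */
        return 0;
}
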
.. | .. |
---|
5906 | 6745 | { |
---|
5907 | 6746 | struct pglist_data *pgdat; |
---|
5908 | 6747 | struct zone *zone; |
---|
| 6748 | + int __maybe_unused cpu; |
---|
5909 | 6749 | |
---|
5910 | 6750 | for_each_populated_zone(zone) |
---|
5911 | 6751 | setup_zone_pageset(zone); |
---|
| 6752 | + |
---|
| 6753 | +#ifdef CONFIG_NUMA |
---|
| 6754 | + /* |
---|
| 6755 | + * Unpopulated zones continue using the boot pagesets. |
---|
| 6756 | + * The numa stats for these pagesets need to be reset. |
---|
| 6757 | + * Otherwise, they will end up skewing the stats of |
---|
| 6758 | + * the nodes these zones are associated with. |
---|
| 6759 | + */ |
---|
| 6760 | + for_each_possible_cpu(cpu) { |
---|
| 6761 | + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); |
---|
| 6762 | + memset(pcp->vm_numa_stat_diff, 0, |
---|
| 6763 | + sizeof(pcp->vm_numa_stat_diff)); |
---|
| 6764 | + } |
---|
| 6765 | +#endif |
---|
5912 | 6766 | |
---|
5913 | 6767 | for_each_online_pgdat(pgdat) |
---|
5914 | 6768 | pgdat->per_cpu_nodestats = |
---|
.. | .. |
---|
5952 | 6806 | zone->initialized = 1; |
---|
5953 | 6807 | } |
---|
5954 | 6808 | |
---|
5955 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
5956 | | -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
---|
5957 | | - |
---|
5958 | | -/* |
---|
5959 | | - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
---|
5960 | | - */ |
---|
5961 | | -int __meminit __early_pfn_to_nid(unsigned long pfn, |
---|
5962 | | - struct mminit_pfnnid_cache *state) |
---|
5963 | | -{ |
---|
5964 | | - unsigned long start_pfn, end_pfn; |
---|
5965 | | - int nid; |
---|
5966 | | - |
---|
5967 | | - if (state->last_start <= pfn && pfn < state->last_end) |
---|
5968 | | - return state->last_nid; |
---|
5969 | | - |
---|
5970 | | - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
---|
5971 | | - if (nid != -1) { |
---|
5972 | | - state->last_start = start_pfn; |
---|
5973 | | - state->last_end = end_pfn; |
---|
5974 | | - state->last_nid = nid; |
---|
5975 | | - } |
---|
5976 | | - |
---|
5977 | | - return nid; |
---|
5978 | | -} |
---|
5979 | | -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
---|
5980 | | - |
---|
5981 | | -/** |
---|
5982 | | - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
---|
5983 | | - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
---|
5984 | | - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
---|
5985 | | - * |
---|
5986 | | - * If an architecture guarantees that all ranges registered contain no holes |
---|
5987 | | - * and may be freed, this this function may be used instead of calling |
---|
5988 | | - * memblock_free_early_nid() manually. |
---|
5989 | | - */ |
---|
5990 | | -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
---|
5991 | | -{ |
---|
5992 | | - unsigned long start_pfn, end_pfn; |
---|
5993 | | - int i, this_nid; |
---|
5994 | | - |
---|
5995 | | - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { |
---|
5996 | | - start_pfn = min(start_pfn, max_low_pfn); |
---|
5997 | | - end_pfn = min(end_pfn, max_low_pfn); |
---|
5998 | | - |
---|
5999 | | - if (start_pfn < end_pfn) |
---|
6000 | | - memblock_free_early_nid(PFN_PHYS(start_pfn), |
---|
6001 | | - (end_pfn - start_pfn) << PAGE_SHIFT, |
---|
6002 | | - this_nid); |
---|
6003 | | - } |
---|
6004 | | -} |
---|
6005 | | - |
---|
6006 | | -/** |
---|
6007 | | - * sparse_memory_present_with_active_regions - Call memory_present for each active range |
---|
6008 | | - * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
---|
6009 | | - * |
---|
6010 | | - * If an architecture guarantees that all ranges registered contain no holes and may |
---|
6011 | | - * be freed, this function may be used instead of calling memory_present() manually. |
---|
6012 | | - */ |
---|
6013 | | -void __init sparse_memory_present_with_active_regions(int nid) |
---|
6014 | | -{ |
---|
6015 | | - unsigned long start_pfn, end_pfn; |
---|
6016 | | - int i, this_nid; |
---|
6017 | | - |
---|
6018 | | - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) |
---|
6019 | | - memory_present(this_nid, start_pfn, end_pfn); |
---|
6020 | | -} |
---|
6021 | | - |
---|
6022 | 6809 | /** |
---|
6023 | 6810 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
---|
6024 | 6811 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
---|
.. | .. |
---|
6030 | 6817 | * with no available memory, a warning is printed and the start and end |
---|
6031 | 6818 | * PFNs will be 0. |
---|
6032 | 6819 | */ |
---|
6033 | | -void __meminit get_pfn_range_for_nid(unsigned int nid, |
---|
| 6820 | +void __init get_pfn_range_for_nid(unsigned int nid, |
---|
6034 | 6821 | unsigned long *start_pfn, unsigned long *end_pfn) |
---|
6035 | 6822 | { |
---|
6036 | 6823 | unsigned long this_start_pfn, this_end_pfn; |
---|
.. | .. |
---|
6079 | 6866 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
---|
6080 | 6867 | * zones within a node are in order of monotonically increasing memory addresses
---|
6081 | 6868 | */ |
---|
6082 | | -static void __meminit adjust_zone_range_for_zone_movable(int nid, |
---|
| 6869 | +static void __init adjust_zone_range_for_zone_movable(int nid, |
---|
6083 | 6870 | unsigned long zone_type, |
---|
6084 | 6871 | unsigned long node_start_pfn, |
---|
6085 | 6872 | unsigned long node_end_pfn, |
---|
.. | .. |
---|
6110 | 6897 | * Return the number of pages a zone spans in a node, including holes |
---|
6111 | 6898 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
---|
6112 | 6899 | */ |
---|
6113 | | -static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
---|
| 6900 | +static unsigned long __init zone_spanned_pages_in_node(int nid, |
---|
6114 | 6901 | unsigned long zone_type, |
---|
6115 | 6902 | unsigned long node_start_pfn, |
---|
6116 | 6903 | unsigned long node_end_pfn, |
---|
6117 | 6904 | unsigned long *zone_start_pfn, |
---|
6118 | | - unsigned long *zone_end_pfn, |
---|
6119 | | - unsigned long *ignored) |
---|
| 6905 | + unsigned long *zone_end_pfn) |
---|
6120 | 6906 | { |
---|
6121 | 6907 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
---|
6122 | 6908 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
---|
.. | .. |
---|
6147 | 6933 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
---|
6148 | 6934 | * then all holes in the requested range will be accounted for. |
---|
6149 | 6935 | */ |
---|
6150 | | -unsigned long __meminit __absent_pages_in_range(int nid, |
---|
| 6936 | +unsigned long __init __absent_pages_in_range(int nid, |
---|
6151 | 6937 | unsigned long range_start_pfn, |
---|
6152 | 6938 | unsigned long range_end_pfn) |
---|
6153 | 6939 | { |
---|
.. | .. |
---|
6168 | 6954 | * @start_pfn: The start PFN to start searching for holes |
---|
6169 | 6955 | * @end_pfn: The end PFN to stop searching for holes |
---|
6170 | 6956 | * |
---|
6171 | | - * It returns the number of pages frames in memory holes within a range. |
---|
| 6957 | + * Return: the number of page frames in memory holes within a range.
---|
6172 | 6958 | */ |
---|
6173 | 6959 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
---|
6174 | 6960 | unsigned long end_pfn) |
---|
.. | .. |
---|
6177 | 6963 | } |
---|
6178 | 6964 | |
---|
6179 | 6965 | /* Return the number of page frames in holes in a zone on a node */ |
---|
6180 | | -static unsigned long __meminit zone_absent_pages_in_node(int nid, |
---|
| 6966 | +static unsigned long __init zone_absent_pages_in_node(int nid, |
---|
6181 | 6967 | unsigned long zone_type, |
---|
6182 | 6968 | unsigned long node_start_pfn, |
---|
6183 | | - unsigned long node_end_pfn, |
---|
6184 | | - unsigned long *ignored) |
---|
| 6969 | + unsigned long node_end_pfn) |
---|
6185 | 6970 | { |
---|
6186 | 6971 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
---|
6187 | 6972 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
---|
.. | .. |
---|
6209 | 6994 | unsigned long start_pfn, end_pfn; |
---|
6210 | 6995 | struct memblock_region *r; |
---|
6211 | 6996 | |
---|
6212 | | - for_each_memblock(memory, r) { |
---|
| 6997 | + for_each_mem_region(r) { |
---|
6213 | 6998 | start_pfn = clamp(memblock_region_memory_base_pfn(r), |
---|
6214 | 6999 | zone_start_pfn, zone_end_pfn); |
---|
6215 | 7000 | end_pfn = clamp(memblock_region_memory_end_pfn(r), |
---|
.. | .. |
---|
6228 | 7013 | return nr_absent; |
---|
6229 | 7014 | } |
---|
6230 | 7015 | |
---|
6231 | | -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6232 | | -static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
---|
6233 | | - unsigned long zone_type, |
---|
6234 | | - unsigned long node_start_pfn, |
---|
6235 | | - unsigned long node_end_pfn, |
---|
6236 | | - unsigned long *zone_start_pfn, |
---|
6237 | | - unsigned long *zone_end_pfn, |
---|
6238 | | - unsigned long *zones_size) |
---|
6239 | | -{ |
---|
6240 | | - unsigned int zone; |
---|
6241 | | - |
---|
6242 | | - *zone_start_pfn = node_start_pfn; |
---|
6243 | | - for (zone = 0; zone < zone_type; zone++) |
---|
6244 | | - *zone_start_pfn += zones_size[zone]; |
---|
6245 | | - |
---|
6246 | | - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; |
---|
6247 | | - |
---|
6248 | | - return zones_size[zone_type]; |
---|
6249 | | -} |
---|
6250 | | - |
---|
6251 | | -static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
---|
6252 | | - unsigned long zone_type, |
---|
| 7016 | +static void __init calculate_node_totalpages(struct pglist_data *pgdat, |
---|
6253 | 7017 | unsigned long node_start_pfn, |
---|
6254 | | - unsigned long node_end_pfn, |
---|
6255 | | - unsigned long *zholes_size) |
---|
6256 | | -{ |
---|
6257 | | - if (!zholes_size) |
---|
6258 | | - return 0; |
---|
6259 | | - |
---|
6260 | | - return zholes_size[zone_type]; |
---|
6261 | | -} |
---|
6262 | | - |
---|
6263 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6264 | | - |
---|
6265 | | -static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
---|
6266 | | - unsigned long node_start_pfn, |
---|
6267 | | - unsigned long node_end_pfn, |
---|
6268 | | - unsigned long *zones_size, |
---|
6269 | | - unsigned long *zholes_size) |
---|
| 7018 | + unsigned long node_end_pfn) |
---|
6270 | 7019 | { |
---|
6271 | 7020 | unsigned long realtotalpages = 0, totalpages = 0; |
---|
6272 | 7021 | enum zone_type i; |
---|
.. | .. |
---|
6274 | 7023 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
6275 | 7024 | struct zone *zone = pgdat->node_zones + i; |
---|
6276 | 7025 | unsigned long zone_start_pfn, zone_end_pfn; |
---|
| 7026 | + unsigned long spanned, absent; |
---|
6277 | 7027 | unsigned long size, real_size; |
---|
6278 | 7028 | |
---|
6279 | | - size = zone_spanned_pages_in_node(pgdat->node_id, i, |
---|
6280 | | - node_start_pfn, |
---|
6281 | | - node_end_pfn, |
---|
6282 | | - &zone_start_pfn, |
---|
6283 | | - &zone_end_pfn, |
---|
6284 | | - zones_size); |
---|
6285 | | - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
---|
6286 | | - node_start_pfn, node_end_pfn, |
---|
6287 | | - zholes_size); |
---|
| 7029 | + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, |
---|
| 7030 | + node_start_pfn, |
---|
| 7031 | + node_end_pfn, |
---|
| 7032 | + &zone_start_pfn, |
---|
| 7033 | + &zone_end_pfn); |
---|
| 7034 | + absent = zone_absent_pages_in_node(pgdat->node_id, i, |
---|
| 7035 | + node_start_pfn, |
---|
| 7036 | + node_end_pfn); |
---|
| 7037 | + |
---|
| 7038 | + size = spanned; |
---|
| 7039 | + real_size = size - absent; |
---|
| 7040 | + |
---|
6288 | 7041 | if (size) |
---|
6289 | 7042 | zone->zone_start_pfn = zone_start_pfn; |
---|
6290 | 7043 | else |
---|
.. | .. |
---|
6330 | 7083 | { |
---|
6331 | 7084 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
---|
6332 | 7085 | zone->pageblock_flags = NULL; |
---|
6333 | | - if (usemapsize) |
---|
| 7086 | + if (usemapsize) { |
---|
6334 | 7087 | zone->pageblock_flags = |
---|
6335 | | - memblock_virt_alloc_node_nopanic(usemapsize, |
---|
6336 | | - pgdat->node_id); |
---|
| 7088 | + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, |
---|
| 7089 | + pgdat->node_id); |
---|
| 7090 | + if (!zone->pageblock_flags) |
---|
| 7091 | + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", |
---|
| 7092 | + usemapsize, zone->name, pgdat->node_id); |
---|
| 7093 | + } |
---|
6337 | 7094 | } |
---|
6338 | 7095 | #else |
---|
6339 | 7096 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
---|
.. | .. |
---|
6400 | 7157 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
6401 | 7158 | static void pgdat_init_split_queue(struct pglist_data *pgdat) |
---|
6402 | 7159 | { |
---|
6403 | | - spin_lock_init(&pgdat->split_queue_lock); |
---|
6404 | | - INIT_LIST_HEAD(&pgdat->split_queue); |
---|
6405 | | - pgdat->split_queue_len = 0; |
---|
| 7160 | + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; |
---|
| 7161 | + |
---|
| 7162 | + spin_lock_init(&ds_queue->split_queue_lock); |
---|
| 7163 | + INIT_LIST_HEAD(&ds_queue->split_queue); |
---|
| 7164 | + ds_queue->split_queue_len = 0; |
---|
6406 | 7165 | } |
---|
6407 | 7166 | #else |
---|
6408 | 7167 | static void pgdat_init_split_queue(struct pglist_data *pgdat) {} |
---|
.. | .. |
---|
6429 | 7188 | |
---|
6430 | 7189 | pgdat_page_ext_init(pgdat); |
---|
6431 | 7190 | spin_lock_init(&pgdat->lru_lock); |
---|
6432 | | - lruvec_init(node_lruvec(pgdat)); |
---|
| 7191 | + lruvec_init(&pgdat->__lruvec); |
---|
6433 | 7192 | } |
---|
6434 | 7193 | |
---|
6435 | 7194 | static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, |
---|
6436 | 7195 | unsigned long remaining_pages) |
---|
6437 | 7196 | { |
---|
6438 | | - zone->managed_pages = remaining_pages; |
---|
| 7197 | + atomic_long_set(&zone->managed_pages, remaining_pages); |
---|
6439 | 7198 | zone_set_nid(zone, nid); |
---|
6440 | 7199 | zone->name = zone_names[idx]; |
---|
6441 | 7200 | zone->zone_pgdat = NODE_DATA(nid); |
---|
.. | .. |
---|
6533 | 7292 | set_pageblock_order(); |
---|
6534 | 7293 | setup_usemap(pgdat, zone, zone_start_pfn, size); |
---|
6535 | 7294 | init_currently_empty_zone(zone, zone_start_pfn, size); |
---|
6536 | | - memmap_init(size, nid, j, zone_start_pfn); |
---|
| 7295 | + arch_memmap_init(size, nid, j, zone_start_pfn); |
---|
6537 | 7296 | } |
---|
6538 | 7297 | } |
---|
6539 | 7298 | |
---|
.. | .. |
---|
6562 | 7321 | end = pgdat_end_pfn(pgdat); |
---|
6563 | 7322 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
---|
6564 | 7323 | size = (end - start) * sizeof(struct page); |
---|
6565 | | - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
---|
| 7324 | + map = memblock_alloc_node(size, SMP_CACHE_BYTES, |
---|
| 7325 | + pgdat->node_id); |
---|
| 7326 | + if (!map) |
---|
| 7327 | + panic("Failed to allocate %ld bytes for node %d memory map\n", |
---|
| 7328 | + size, pgdat->node_id); |
---|
6566 | 7329 | pgdat->node_mem_map = map + offset; |
---|
6567 | 7330 | } |
---|
6568 | 7331 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", |
---|
.. | .. |
---|
6574 | 7337 | */ |
---|
6575 | 7338 | if (pgdat == NODE_DATA(0)) { |
---|
6576 | 7339 | mem_map = NODE_DATA(0)->node_mem_map; |
---|
6577 | | -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) |
---|
6578 | 7340 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
---|
6579 | 7341 | mem_map -= offset; |
---|
6580 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6581 | 7342 | } |
---|
6582 | 7343 | #endif |
---|
6583 | 7344 | } |
---|
.. | .. |
---|
6588 | 7349 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
6589 | 7350 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) |
---|
6590 | 7351 | { |
---|
6591 | | - /* |
---|
6592 | | - * We start only with one section of pages, more pages are added as |
---|
6593 | | - * needed until the rest of deferred pages are initialized. |
---|
6594 | | - */ |
---|
6595 | | - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, |
---|
6596 | | - pgdat->node_spanned_pages); |
---|
6597 | 7352 | pgdat->first_deferred_pfn = ULONG_MAX; |
---|
6598 | 7353 | } |
---|
6599 | 7354 | #else |
---|
6600 | 7355 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} |
---|
6601 | 7356 | #endif |
---|
6602 | 7357 | |
---|
6603 | | -void __init free_area_init_node(int nid, unsigned long *zones_size, |
---|
6604 | | - unsigned long node_start_pfn, |
---|
6605 | | - unsigned long *zholes_size) |
---|
| 7358 | +static void __init free_area_init_node(int nid) |
---|
6606 | 7359 | { |
---|
6607 | 7360 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
6608 | 7361 | unsigned long start_pfn = 0; |
---|
6609 | 7362 | unsigned long end_pfn = 0; |
---|
6610 | 7363 | |
---|
6611 | 7364 | /* pg_data_t should be reset to zero when it's allocated */ |
---|
6612 | | - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); |
---|
| 7365 | + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); |
---|
| 7366 | + |
---|
| 7367 | + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
6613 | 7368 | |
---|
6614 | 7369 | pgdat->node_id = nid; |
---|
6615 | | - pgdat->node_start_pfn = node_start_pfn; |
---|
| 7370 | + pgdat->node_start_pfn = start_pfn; |
---|
6616 | 7371 | pgdat->per_cpu_nodestats = NULL; |
---|
6617 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
6618 | | - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
| 7372 | + |
---|
6619 | 7373 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
---|
6620 | 7374 | (u64)start_pfn << PAGE_SHIFT, |
---|
6621 | 7375 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); |
---|
6622 | | -#else |
---|
6623 | | - start_pfn = node_start_pfn; |
---|
6624 | | -#endif |
---|
6625 | | - calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
---|
6626 | | - zones_size, zholes_size); |
---|
| 7376 | + calculate_node_totalpages(pgdat, start_pfn, end_pfn); |
---|
6627 | 7377 | |
---|
6628 | 7378 | alloc_node_mem_map(pgdat); |
---|
6629 | 7379 | pgdat_set_deferred_range(pgdat); |
---|
.. | .. |
---|
6631 | 7381 | free_area_init_core(pgdat); |
---|
6632 | 7382 | } |
---|
6633 | 7383 | |
---|
6634 | | -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) |
---|
6635 | | - |
---|
6636 | | -/* |
---|
6637 | | - * Zero all valid struct pages in range [spfn, epfn), return number of struct |
---|
6638 | | - * pages zeroed |
---|
6639 | | - */ |
---|
6640 | | -static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) |
---|
| 7384 | +void __init free_area_init_memoryless_node(int nid) |
---|
6641 | 7385 | { |
---|
6642 | | - unsigned long pfn; |
---|
6643 | | - u64 pgcnt = 0; |
---|
6644 | | - |
---|
6645 | | - for (pfn = spfn; pfn < epfn; pfn++) { |
---|
6646 | | - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { |
---|
6647 | | - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) |
---|
6648 | | - + pageblock_nr_pages - 1; |
---|
6649 | | - continue; |
---|
6650 | | - } |
---|
6651 | | - mm_zero_struct_page(pfn_to_page(pfn)); |
---|
6652 | | - pgcnt++; |
---|
6653 | | - } |
---|
6654 | | - |
---|
6655 | | - return pgcnt; |
---|
| 7386 | + free_area_init_node(nid); |
---|
6656 | 7387 | } |
---|
6657 | | - |
---|
6658 | | -/* |
---|
6659 | | - * Only struct pages that are backed by physical memory are zeroed and |
---|
6660 | | - * initialized by going through __init_single_page(). But, there are some |
---|
6661 | | - * struct pages which are reserved in memblock allocator and their fields |
---|
6662 | | - * may be accessed (for example page_to_pfn() on some configuration accesses |
---|
6663 | | - * flags). We must explicitly zero those struct pages. |
---|
6664 | | - * |
---|
6665 | | - * This function also addresses a similar issue where struct pages are left |
---|
6666 | | - * uninitialized because the physical address range is not covered by |
---|
6667 | | - * memblock.memory or memblock.reserved. That could happen when memblock |
---|
6668 | | - * layout is manually configured via memmap=, or when the highest physical |
---|
6669 | | - * address (max_pfn) does not end on a section boundary. |
---|
6670 | | - */ |
---|
6671 | | -void __init zero_resv_unavail(void) |
---|
6672 | | -{ |
---|
6673 | | - phys_addr_t start, end; |
---|
6674 | | - u64 i, pgcnt; |
---|
6675 | | - phys_addr_t next = 0; |
---|
6676 | | - |
---|
6677 | | - /* |
---|
6678 | | - * Loop through unavailable ranges not covered by memblock.memory. |
---|
6679 | | - */ |
---|
6680 | | - pgcnt = 0; |
---|
6681 | | - for_each_mem_range(i, &memblock.memory, NULL, |
---|
6682 | | - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { |
---|
6683 | | - if (next < start) |
---|
6684 | | - pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); |
---|
6685 | | - next = end; |
---|
6686 | | - } |
---|
6687 | | - |
---|
6688 | | - /* |
---|
6689 | | - * Early sections always have a fully populated memmap for the whole |
---|
6690 | | - * section - see pfn_valid(). If the last section has holes at the |
---|
6691 | | - * end and that section is marked "online", the memmap will be |
---|
6692 | | - * considered initialized. Make sure that memmap has a well defined |
---|
6693 | | - * state. |
---|
6694 | | - */ |
---|
6695 | | - pgcnt += zero_pfn_range(PFN_DOWN(next), |
---|
6696 | | - round_up(max_pfn, PAGES_PER_SECTION)); |
---|
6697 | | - |
---|
6698 | | - /* |
---|
6699 | | - * Struct pages that do not have backing memory. This could be because |
---|
6700 | | - * firmware is using some of this memory, or for some other reasons. |
---|
6701 | | - */ |
---|
6702 | | - if (pgcnt) |
---|
6703 | | - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); |
---|
6704 | | -} |
---|
6705 | | -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ |
---|
6706 | | - |
---|
6707 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
6708 | 7388 | |
---|
6709 | 7389 | #if MAX_NUMNODES > 1 |
---|
6710 | 7390 | /* |
---|
.. | .. |
---|
6735 | 7415 | * model has fine enough granularity to avoid incorrect mapping for the |
---|
6736 | 7416 | * populated node map. |
---|
6737 | 7417 | * |
---|
6738 | | - * Returns the determined alignment in pfn's. 0 if there is no alignment |
---|
| 7418 | + * Return: the determined alignment in pfn's. 0 if there is no alignment |
---|
6739 | 7419 | * requirement (single node). |
---|
6740 | 7420 | */ |
---|
6741 | 7421 | unsigned long __init node_map_pfn_alignment(void) |
---|
6742 | 7422 | { |
---|
6743 | 7423 | unsigned long accl_mask = 0, last_end = 0; |
---|
6744 | 7424 | unsigned long start, end, mask; |
---|
6745 | | - int last_nid = -1; |
---|
| 7425 | + int last_nid = NUMA_NO_NODE; |
---|
6746 | 7426 | int i, nid; |
---|
6747 | 7427 | |
---|
6748 | 7428 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { |
---|
.. | .. |
---|
6769 | 7449 | return ~accl_mask + 1; |
---|
6770 | 7450 | } |
---|
6771 | 7451 | |
---|
6772 | | -/* Find the lowest pfn for a node */ |
---|
6773 | | -static unsigned long __init find_min_pfn_for_node(int nid) |
---|
6774 | | -{ |
---|
6775 | | - unsigned long min_pfn = ULONG_MAX; |
---|
6776 | | - unsigned long start_pfn; |
---|
6777 | | - int i; |
---|
6778 | | - |
---|
6779 | | - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) |
---|
6780 | | - min_pfn = min(min_pfn, start_pfn); |
---|
6781 | | - |
---|
6782 | | - if (min_pfn == ULONG_MAX) { |
---|
6783 | | - pr_warn("Could not find start_pfn for node %d\n", nid); |
---|
6784 | | - return 0; |
---|
6785 | | - } |
---|
6786 | | - |
---|
6787 | | - return min_pfn; |
---|
6788 | | -} |
---|
6789 | | - |
---|
6790 | 7452 | /** |
---|
6791 | 7453 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
---|
6792 | 7454 | * |
---|
6793 | | - * It returns the minimum PFN based on information provided via |
---|
| 7455 | + * Return: the minimum PFN based on information provided via |
---|
6794 | 7456 | * memblock_set_node(). |
---|
6795 | 7457 | */ |
---|
6796 | 7458 | unsigned long __init find_min_pfn_with_active_regions(void) |
---|
6797 | 7459 | { |
---|
6798 | | - return find_min_pfn_for_node(MAX_NUMNODES); |
---|
| 7460 | + return PHYS_PFN(memblock_start_of_DRAM()); |
---|
6799 | 7461 | } |
---|
6800 | 7462 | |
---|
6801 | 7463 | /* |
---|
.. | .. |
---|
6844 | 7506 | * options. |
---|
6845 | 7507 | */ |
---|
6846 | 7508 | if (movable_node_is_enabled()) { |
---|
6847 | | - for_each_memblock(memory, r) { |
---|
| 7509 | + for_each_mem_region(r) { |
---|
6848 | 7510 | if (!memblock_is_hotpluggable(r)) |
---|
6849 | 7511 | continue; |
---|
6850 | 7512 | |
---|
6851 | | - nid = r->nid; |
---|
| 7513 | + nid = memblock_get_region_node(r); |
---|
6852 | 7514 | |
---|
6853 | 7515 | usable_startpfn = PFN_DOWN(r->base); |
---|
6854 | 7516 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? |
---|
.. | .. |
---|
6865 | 7527 | if (mirrored_kernelcore) { |
---|
6866 | 7528 | bool mem_below_4gb_not_mirrored = false; |
---|
6867 | 7529 | |
---|
6868 | | - for_each_memblock(memory, r) { |
---|
| 7530 | + for_each_mem_region(r) { |
---|
6869 | 7531 | if (memblock_is_mirror(r)) |
---|
6870 | 7532 | continue; |
---|
6871 | 7533 | |
---|
6872 | | - nid = r->nid; |
---|
| 7534 | + nid = memblock_get_region_node(r); |
---|
6873 | 7535 | |
---|
6874 | 7536 | usable_startpfn = memblock_region_memory_base_pfn(r); |
---|
6875 | 7537 | |
---|
.. | .. |
---|
6884 | 7546 | } |
---|
6885 | 7547 | |
---|
6886 | 7548 | if (mem_below_4gb_not_mirrored) |
---|
6887 | | - pr_warn("This configuration results in unmirrored kernel memory."); |
---|
| 7549 | + pr_warn("This configuration results in unmirrored kernel memory.\n"); |
---|
6888 | 7550 | |
---|
6889 | 7551 | goto out2; |
---|
6890 | 7552 | } |
---|
.. | .. |
---|
7023 | 7685 | |
---|
7024 | 7686 | out2: |
---|
7025 | 7687 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
---|
7026 | | - for (nid = 0; nid < MAX_NUMNODES; nid++) |
---|
| 7688 | + for (nid = 0; nid < MAX_NUMNODES; nid++) { |
---|
| 7689 | + unsigned long start_pfn, end_pfn; |
---|
| 7690 | + |
---|
7027 | 7691 | zone_movable_pfn[nid] = |
---|
7028 | 7692 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
---|
| 7693 | + |
---|
| 7694 | + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
| 7695 | + if (zone_movable_pfn[nid] >= end_pfn) |
---|
| 7696 | + zone_movable_pfn[nid] = 0; |
---|
| 7697 | + } |
---|
7029 | 7698 | |
---|
7030 | 7699 | out: |
---|
7031 | 7700 | /* restore the node_state */ |
---|
.. | .. |
---|
7037 | 7706 | { |
---|
7038 | 7707 | enum zone_type zone_type; |
---|
7039 | 7708 | |
---|
7040 | | - if (N_MEMORY == N_NORMAL_MEMORY) |
---|
7041 | | - return; |
---|
7042 | | - |
---|
7043 | 7709 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { |
---|
7044 | 7710 | struct zone *zone = &pgdat->node_zones[zone_type]; |
---|
7045 | 7711 | if (populated_zone(zone)) { |
---|
7046 | | - node_set_state(nid, N_HIGH_MEMORY); |
---|
7047 | | - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && |
---|
7048 | | - zone_type <= ZONE_NORMAL) |
---|
| 7712 | + if (IS_ENABLED(CONFIG_HIGHMEM)) |
---|
| 7713 | + node_set_state(nid, N_HIGH_MEMORY); |
---|
| 7714 | + if (zone_type <= ZONE_NORMAL) |
---|
7049 | 7715 | node_set_state(nid, N_NORMAL_MEMORY); |
---|
7050 | 7716 | break; |
---|
7051 | 7717 | } |
---|
7052 | 7718 | } |
---|
7053 | 7719 | } |
---|
7054 | 7720 | |
---|
| 7721 | +/* |
---|
| 7722 | + * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For |
---|
| 7723 | + * such cases we allow max_zone_pfn to be sorted in descending order. |
---|
| 7724 | + */ |
---|
| 7725 | +bool __weak arch_has_descending_max_zone_pfns(void) |
---|
| 7726 | +{ |
---|
| 7727 | + return false; |
---|
| 7728 | +} |
---|
| 7729 | + |
---|
7055 | 7730 | /** |
---|
7056 | | - * free_area_init_nodes - Initialise all pg_data_t and zone data |
---|
| 7731 | + * free_area_init - Initialise all pg_data_t and zone data |
---|
7057 | 7732 | * @max_zone_pfn: an array of max PFNs for each zone |
---|
7058 | 7733 | * |
---|
7059 | 7734 | * This will call free_area_init_node() for each active node in the system. |
---|
.. | .. |
---|
7065 | 7740 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
---|
7066 | 7741 | * at arch_max_dma_pfn. |
---|
7067 | 7742 | */ |
---|
7068 | | -void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
---|
| 7743 | +void __init free_area_init(unsigned long *max_zone_pfn) |
---|
7069 | 7744 | { |
---|
7070 | 7745 | unsigned long start_pfn, end_pfn; |
---|
7071 | | - int i, nid; |
---|
| 7746 | + int i, nid, zone; |
---|
| 7747 | + bool descending; |
---|
7072 | 7748 | |
---|
7073 | 7749 | /* Record where the zone boundaries are */ |
---|
7074 | 7750 | memset(arch_zone_lowest_possible_pfn, 0, |
---|
.. | .. |
---|
7077 | 7753 | sizeof(arch_zone_highest_possible_pfn)); |
---|
7078 | 7754 | |
---|
7079 | 7755 | start_pfn = find_min_pfn_with_active_regions(); |
---|
| 7756 | + descending = arch_has_descending_max_zone_pfns(); |
---|
7080 | 7757 | |
---|
7081 | 7758 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
7082 | | - if (i == ZONE_MOVABLE) |
---|
| 7759 | + if (descending) |
---|
| 7760 | + zone = MAX_NR_ZONES - i - 1; |
---|
| 7761 | + else |
---|
| 7762 | + zone = i; |
---|
| 7763 | + |
---|
| 7764 | + if (zone == ZONE_MOVABLE) |
---|
7083 | 7765 | continue; |
---|
7084 | 7766 | |
---|
7085 | | - end_pfn = max(max_zone_pfn[i], start_pfn); |
---|
7086 | | - arch_zone_lowest_possible_pfn[i] = start_pfn; |
---|
7087 | | - arch_zone_highest_possible_pfn[i] = end_pfn; |
---|
| 7767 | + end_pfn = max(max_zone_pfn[zone], start_pfn); |
---|
| 7768 | + arch_zone_lowest_possible_pfn[zone] = start_pfn; |
---|
| 7769 | + arch_zone_highest_possible_pfn[zone] = end_pfn; |
---|
7088 | 7770 | |
---|
7089 | 7771 | start_pfn = end_pfn; |
---|
7090 | 7772 | } |
---|
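The loop above turns the cumulative max_zone_pfn[] array into per-zone [lowest, highest) PFN spans: each zone starts where the previous one ended, ZONE_MOVABLE is skipped (it is carved out later from kernelcore/movablecore), and the walk direction flips when arch_has_descending_max_zone_pfns() reports descending limits. A minimal user-space sketch of that accumulation; the zone count, zone indexes and PFN values below are made up for illustration and are not kernel data:

```c
#include <stdio.h>
#include <stdbool.h>

#define NR_ZONES     4
#define ZONE_MOVABLE 3              /* placeholder index, skipped as in the kernel */

int main(void)
{
    /* cumulative upper PFN limit per zone, e.g. DMA32 / NORMAL / HIGHMEM / MOVABLE */
    unsigned long max_zone_pfn[NR_ZONES] = { 0x100000, 0x440000, 0x440000, 0 };
    unsigned long lowest[NR_ZONES] = { 0 }, highest[NR_ZONES] = { 0 };
    unsigned long start_pfn = 0, end_pfn;
    bool descending = false;        /* stand-in for arch_has_descending_max_zone_pfns() */
    int i, zone;

    for (i = 0; i < NR_ZONES; i++) {
        zone = descending ? NR_ZONES - i - 1 : i;
        if (zone == ZONE_MOVABLE)
            continue;

        /* each zone begins where the previous one ended */
        end_pfn = max_zone_pfn[zone] > start_pfn ? max_zone_pfn[zone] : start_pfn;
        lowest[zone] = start_pfn;
        highest[zone] = end_pfn;
        start_pfn = end_pfn;
    }

    for (zone = 0; zone < NR_ZONES; zone++)
        printf("zone %d: [%#lx, %#lx)\n", zone, lowest[zone], highest[zone]);
    return 0;
}
```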
.. | .. |
---|
7118 | 7800 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
---|
7119 | 7801 | } |
---|
7120 | 7802 | |
---|
7121 | | - /* Print out the early node map */ |
---|
| 7803 | + /* |
---|
| 7804 | + * Print out the early node map, and initialize the |
---|
| 7805 | + * subsection-map relative to active online memory ranges to |
---|
| 7806 | + * enable future "sub-section" extensions of the memory map. |
---|
| 7807 | + */ |
---|
7122 | 7808 | pr_info("Early memory node ranges\n"); |
---|
7123 | | - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
---|
| 7809 | + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
---|
7124 | 7810 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
---|
7125 | 7811 | (u64)start_pfn << PAGE_SHIFT, |
---|
7126 | 7812 | ((u64)end_pfn << PAGE_SHIFT) - 1); |
---|
| 7813 | + subsection_map_init(start_pfn, end_pfn - start_pfn); |
---|
| 7814 | + } |
---|
7127 | 7815 | |
---|
7128 | 7816 | /* Initialise every node */ |
---|
7129 | 7817 | mminit_verify_pageflags_layout(); |
---|
7130 | 7818 | setup_nr_node_ids(); |
---|
7131 | | - zero_resv_unavail(); |
---|
7132 | 7819 | for_each_online_node(nid) { |
---|
7133 | 7820 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
7134 | | - free_area_init_node(nid, NULL, |
---|
7135 | | - find_min_pfn_for_node(nid), NULL); |
---|
| 7821 | + free_area_init_node(nid); |
---|
7136 | 7822 | |
---|
7137 | 7823 | /* Any memory on that node */ |
---|
7138 | 7824 | if (pgdat->node_present_pages) |
---|
7139 | 7825 | node_set_state(nid, N_MEMORY); |
---|
7140 | 7826 | check_for_memory(pgdat, nid); |
---|
7141 | 7827 | } |
---|
| 7828 | + |
---|
| 7829 | + memmap_init(); |
---|
7142 | 7830 | } |
---|
7143 | 7831 | |
---|
7144 | 7832 | static int __init cmdline_parse_core(char *p, unsigned long *core, |
---|
.. | .. |
---|
7197 | 7885 | early_param("kernelcore", cmdline_parse_kernelcore); |
---|
7198 | 7886 | early_param("movablecore", cmdline_parse_movablecore); |
---|
7199 | 7887 | |
---|
7200 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
7201 | | - |
---|
7202 | 7888 | void adjust_managed_page_count(struct page *page, long count) |
---|
7203 | 7889 | { |
---|
7204 | | - spin_lock(&managed_page_count_lock); |
---|
7205 | | - page_zone(page)->managed_pages += count; |
---|
7206 | | - totalram_pages += count; |
---|
| 7890 | + atomic_long_add(count, &page_zone(page)->managed_pages); |
---|
| 7891 | + totalram_pages_add(count); |
---|
7207 | 7892 | #ifdef CONFIG_HIGHMEM |
---|
7208 | 7893 | if (PageHighMem(page)) |
---|
7209 | | - totalhigh_pages += count; |
---|
| 7894 | + totalhigh_pages_add(count); |
---|
7210 | 7895 | #endif |
---|
7211 | | - spin_unlock(&managed_page_count_lock); |
---|
7212 | 7896 | } |
---|
7213 | 7897 | EXPORT_SYMBOL(adjust_managed_page_count); |
---|
7214 | 7898 | |
---|
7215 | | -unsigned long free_reserved_area(void *start, void *end, int poison, char *s) |
---|
| 7899 | +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) |
---|
7216 | 7900 | { |
---|
7217 | 7901 | void *pos; |
---|
7218 | 7902 | unsigned long pages = 0; |
---|
.. | .. |
---|
7231 | 7915 | * alias for the memset(). |
---|
7232 | 7916 | */ |
---|
7233 | 7917 | direct_map_addr = page_address(page); |
---|
| 7918 | + /* |
---|
| 7919 | + * Perform a kasan-unchecked memset() since this memory |
---|
| 7920 | + * has not been initialized. |
---|
| 7921 | + */ |
---|
| 7922 | + direct_map_addr = kasan_reset_tag(direct_map_addr); |
---|
7234 | 7923 | if ((unsigned int)poison <= 0xFF) |
---|
7235 | 7924 | memset(direct_map_addr, poison, PAGE_SIZE); |
---|
7236 | 7925 | |
---|
.. | .. |
---|
7243 | 7932 | |
---|
7244 | 7933 | return pages; |
---|
7245 | 7934 | } |
---|
7246 | | -EXPORT_SYMBOL(free_reserved_area); |
---|
7247 | 7935 | |
---|
7248 | 7936 | #ifdef CONFIG_HIGHMEM |
---|
7249 | 7937 | void free_highmem_page(struct page *page) |
---|
7250 | 7938 | { |
---|
7251 | 7939 | __free_reserved_page(page); |
---|
7252 | | - totalram_pages++; |
---|
7253 | | - page_zone(page)->managed_pages++; |
---|
7254 | | - totalhigh_pages++; |
---|
| 7940 | + totalram_pages_inc(); |
---|
| 7941 | + atomic_long_inc(&page_zone(page)->managed_pages); |
---|
| 7942 | + totalhigh_pages_inc(); |
---|
7255 | 7943 | } |
---|
7256 | 7944 | #endif |
---|
7257 | 7945 | |
---|
.. | .. |
---|
7278 | 7966 | */ |
---|
7279 | 7967 | #define adj_init_size(start, end, size, pos, adj) \ |
---|
7280 | 7968 | do { \ |
---|
7281 | | - if (start <= pos && pos < end && size > adj) \ |
---|
| 7969 | + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ |
---|
7282 | 7970 | size -= adj; \ |
---|
7283 | 7971 | } while (0) |
---|
7284 | 7972 | |
---|
.. | .. |
---|
7300 | 7988 | physpages << (PAGE_SHIFT - 10), |
---|
7301 | 7989 | codesize >> 10, datasize >> 10, rosize >> 10, |
---|
7302 | 7990 | (init_data_size + init_code_size) >> 10, bss_size >> 10, |
---|
7303 | | - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), |
---|
| 7991 | + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), |
---|
7304 | 7992 | totalcma_pages << (PAGE_SHIFT - 10), |
---|
7305 | 7993 | #ifdef CONFIG_HIGHMEM |
---|
7306 | | - totalhigh_pages << (PAGE_SHIFT - 10), |
---|
| 7994 | + totalhigh_pages() << (PAGE_SHIFT - 10), |
---|
7307 | 7995 | #endif |
---|
7308 | 7996 | str ? ", " : "", str ? str : ""); |
---|
7309 | 7997 | } |
---|
.. | .. |
---|
7322 | 8010 | void __init set_dma_reserve(unsigned long new_dma_reserve) |
---|
7323 | 8011 | { |
---|
7324 | 8012 | dma_reserve = new_dma_reserve; |
---|
7325 | | -} |
---|
7326 | | - |
---|
7327 | | -void __init free_area_init(unsigned long *zones_size) |
---|
7328 | | -{ |
---|
7329 | | - zero_resv_unavail(); |
---|
7330 | | - free_area_init_node(0, zones_size, |
---|
7331 | | - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
---|
7332 | 8013 | } |
---|
7333 | 8014 | |
---|
7334 | 8015 | static int page_alloc_cpu_dead(unsigned int cpu) |
---|
.. | .. |
---|
7356 | 8037 | return 0; |
---|
7357 | 8038 | } |
---|
7358 | 8039 | |
---|
| 8040 | +#ifdef CONFIG_NUMA |
---|
| 8041 | +int hashdist = HASHDIST_DEFAULT; |
---|
| 8042 | + |
---|
| 8043 | +static int __init set_hashdist(char *str) |
---|
| 8044 | +{ |
---|
| 8045 | + if (!str) |
---|
| 8046 | + return 0; |
---|
| 8047 | + hashdist = simple_strtoul(str, &str, 0); |
---|
| 8048 | + return 1; |
---|
| 8049 | +} |
---|
| 8050 | +__setup("hashdist=", set_hashdist); |
---|
| 8051 | +#endif |
---|
| 8052 | + |
---|
7359 | 8053 | void __init page_alloc_init(void) |
---|
7360 | 8054 | { |
---|
7361 | 8055 | int ret; |
---|
| 8056 | + |
---|
| 8057 | +#ifdef CONFIG_NUMA |
---|
| 8058 | + if (num_node_state(N_MEMORY) == 1) |
---|
| 8059 | + hashdist = 0; |
---|
| 8060 | +#endif |
---|
7362 | 8061 | |
---|
7363 | 8062 | ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, |
---|
7364 | 8063 | "mm/page_alloc:dead", NULL, |
---|
.. | .. |
---|
7383 | 8082 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
7384 | 8083 | struct zone *zone = pgdat->node_zones + i; |
---|
7385 | 8084 | long max = 0; |
---|
| 8085 | + unsigned long managed_pages = zone_managed_pages(zone); |
---|
7386 | 8086 | |
---|
7387 | 8087 | /* Find valid and maximum lowmem_reserve in the zone */ |
---|
7388 | 8088 | for (j = i; j < MAX_NR_ZONES; j++) { |
---|
.. | .. |
---|
7393 | 8093 | /* we treat the high watermark as reserved pages. */ |
---|
7394 | 8094 | max += high_wmark_pages(zone); |
---|
7395 | 8095 | |
---|
7396 | | - if (max > zone->managed_pages) |
---|
7397 | | - max = zone->managed_pages; |
---|
| 8096 | + if (max > managed_pages) |
---|
| 8097 | + max = managed_pages; |
---|
7398 | 8098 | |
---|
7399 | 8099 | pgdat->totalreserve_pages += max; |
---|
7400 | 8100 | |
---|
.. | .. |
---|
7413 | 8113 | static void setup_per_zone_lowmem_reserve(void) |
---|
7414 | 8114 | { |
---|
7415 | 8115 | struct pglist_data *pgdat; |
---|
7416 | | - enum zone_type j, idx; |
---|
| 8116 | + enum zone_type i, j; |
---|
7417 | 8117 | |
---|
7418 | 8118 | for_each_online_pgdat(pgdat) { |
---|
7419 | | - for (j = 0; j < MAX_NR_ZONES; j++) { |
---|
7420 | | - struct zone *zone = pgdat->node_zones + j; |
---|
7421 | | - unsigned long managed_pages = zone->managed_pages; |
---|
| 8119 | + for (i = 0; i < MAX_NR_ZONES - 1; i++) { |
---|
| 8120 | + struct zone *zone = &pgdat->node_zones[i]; |
---|
| 8121 | + int ratio = sysctl_lowmem_reserve_ratio[i]; |
---|
| 8122 | + bool clear = !ratio || !zone_managed_pages(zone); |
---|
| 8123 | + unsigned long managed_pages = 0; |
---|
7422 | 8124 | |
---|
7423 | | - zone->lowmem_reserve[j] = 0; |
---|
| 8125 | + for (j = i + 1; j < MAX_NR_ZONES; j++) { |
---|
| 8126 | + struct zone *upper_zone = &pgdat->node_zones[j]; |
---|
7424 | 8127 | |
---|
7425 | | - idx = j; |
---|
7426 | | - while (idx) { |
---|
7427 | | - struct zone *lower_zone; |
---|
| 8128 | + managed_pages += zone_managed_pages(upper_zone); |
---|
7428 | 8129 | |
---|
7429 | | - idx--; |
---|
7430 | | - lower_zone = pgdat->node_zones + idx; |
---|
7431 | | - |
---|
7432 | | - if (sysctl_lowmem_reserve_ratio[idx] < 1) { |
---|
7433 | | - sysctl_lowmem_reserve_ratio[idx] = 0; |
---|
7434 | | - lower_zone->lowmem_reserve[j] = 0; |
---|
7435 | | - } else { |
---|
7436 | | - lower_zone->lowmem_reserve[j] = |
---|
7437 | | - managed_pages / sysctl_lowmem_reserve_ratio[idx]; |
---|
7438 | | - } |
---|
7439 | | - managed_pages += lower_zone->managed_pages; |
---|
| 8130 | + if (clear) |
---|
| 8131 | + zone->lowmem_reserve[j] = 0; |
---|
| 8132 | + else |
---|
| 8133 | + zone->lowmem_reserve[j] = managed_pages / ratio; |
---|
7440 | 8134 | } |
---|
7441 | 8135 | } |
---|
7442 | 8136 | } |
---|
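The rewritten loop computes zone i's lowmem_reserve[j] as the managed pages of zones i+1..j, accumulated and divided by sysctl_lowmem_reserve_ratio[i]; a zero ratio or an empty lower zone clears the entries. A small user-space sketch of just that arithmetic; the zone sizes and ratios are fabricated:

```c
#include <stdio.h>

#define NR_ZONES 3                  /* illustrative, e.g. DMA32 / NORMAL / MOVABLE */

int main(void)
{
    unsigned long managed[NR_ZONES] = { 262144, 1835008, 0 };  /* pages, made up */
    int ratio[NR_ZONES]             = { 256, 32, 0 };
    unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
    int i, j;

    for (i = 0; i < NR_ZONES - 1; i++) {
        int clear = !ratio[i] || !managed[i];
        unsigned long upper_managed = 0;

        for (j = i + 1; j < NR_ZONES; j++) {
            upper_managed += managed[j];        /* cumulative pages above zone i */
            reserve[i][j] = clear ? 0 : upper_managed / ratio[i];
        }
    }

    for (i = 0; i < NR_ZONES - 1; i++)
        for (j = i + 1; j < NR_ZONES; j++)
            printf("zone %d keeps %lu pages off-limits to zone-%d allocations\n",
                   i, reserve[i][j], j);
    return 0;
}
```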
.. | .. |
---|
7456 | 8150 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
---|
7457 | 8151 | for_each_zone(zone) { |
---|
7458 | 8152 | if (!is_highmem(zone)) |
---|
7459 | | - lowmem_pages += zone->managed_pages; |
---|
| 8153 | + lowmem_pages += zone_managed_pages(zone); |
---|
7460 | 8154 | } |
---|
7461 | 8155 | |
---|
7462 | 8156 | for_each_zone(zone) { |
---|
7463 | | - u64 min, low; |
---|
| 8157 | + u64 tmp, low; |
---|
7464 | 8158 | |
---|
7465 | 8159 | spin_lock_irqsave(&zone->lock, flags); |
---|
7466 | | - min = (u64)pages_min * zone->managed_pages; |
---|
7467 | | - do_div(min, lowmem_pages); |
---|
7468 | | - low = (u64)pages_low * zone->managed_pages; |
---|
7469 | | - do_div(low, vm_total_pages); |
---|
7470 | | - |
---|
| 8160 | + tmp = (u64)pages_min * zone_managed_pages(zone); |
---|
| 8161 | + do_div(tmp, lowmem_pages); |
---|
| 8162 | + low = (u64)pages_low * zone_managed_pages(zone); |
---|
| 8163 | + do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE))); |
---|
7471 | 8164 | if (is_highmem(zone)) { |
---|
7472 | 8165 | /* |
---|
7473 | 8166 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
---|
.. | .. |
---|
7475 | 8168 | * value here. |
---|
7476 | 8169 | * |
---|
7477 | 8170 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
---|
7478 | | - * deltas control asynch page reclaim, and so should |
---|
| 8171 | + * deltas control async page reclaim, and so should |
---|
7479 | 8172 | * not be capped for highmem. |
---|
7480 | 8173 | */ |
---|
7481 | 8174 | unsigned long min_pages; |
---|
7482 | 8175 | |
---|
7483 | | - min_pages = zone->managed_pages / 1024; |
---|
| 8176 | + min_pages = zone_managed_pages(zone) / 1024; |
---|
7484 | 8177 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
---|
7485 | | - zone->watermark[WMARK_MIN] = min_pages; |
---|
| 8178 | + zone->_watermark[WMARK_MIN] = min_pages; |
---|
7486 | 8179 | } else { |
---|
7487 | 8180 | /* |
---|
7488 | 8181 | * If it's a lowmem zone, reserve a number of pages |
---|
7489 | 8182 | * proportionate to the zone's size. |
---|
7490 | 8183 | */ |
---|
7491 | | - zone->watermark[WMARK_MIN] = min; |
---|
| 8184 | + zone->_watermark[WMARK_MIN] = tmp; |
---|
7492 | 8185 | } |
---|
7493 | 8186 | |
---|
7494 | 8187 | /* |
---|
.. | .. |
---|
7496 | 8189 | * scale factor in proportion to available memory, but |
---|
7497 | 8190 | * ensure a minimum size on small systems. |
---|
7498 | 8191 | */ |
---|
7499 | | - min = max_t(u64, min >> 2, |
---|
7500 | | - mult_frac(zone->managed_pages, |
---|
| 8192 | + tmp = max_t(u64, tmp >> 2, |
---|
| 8193 | + mult_frac(zone_managed_pages(zone), |
---|
7501 | 8194 | watermark_scale_factor, 10000)); |
---|
7502 | 8195 | |
---|
7503 | | - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + |
---|
7504 | | - low + min; |
---|
7505 | | - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + |
---|
7506 | | - low + min * 2; |
---|
| 8196 | + zone->watermark_boost = 0; |
---|
| 8197 | + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp; |
---|
| 8198 | + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2; |
---|
7507 | 8199 | |
---|
7508 | 8200 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
7509 | 8201 | } |
---|
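For a lowmem zone the hunk above sets WMARK_MIN to min_free_kbytes distributed in proportion to the zone's share of managed pages, then places WMARK_LOW and WMARK_HIGH one and two "gaps" higher, where the gap is max(WMARK_MIN / 4, managed * watermark_scale_factor / 10000). A standalone sketch of that arithmetic with illustrative inputs; the Android-specific extra_free_kbytes ("low") term and the highmem clamp are omitted:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Illustrative inputs; assumes 4 KiB pages (PAGE_SHIFT == 12). */
    unsigned long lowmem_pages = 2097152;        /* sum of all !highmem managed pages */
    unsigned long zone_managed = 1835008;        /* this zone's managed pages */
    unsigned long min_free_kbytes = 67584;
    unsigned long watermark_scale_factor = 10;   /* default: 0.1% of the zone */

    unsigned long pages_min = min_free_kbytes >> (12 - 10);    /* kbytes -> pages */
    uint64_t wmark_min = (uint64_t)pages_min * zone_managed / lowmem_pages;
    uint64_t scaled = (uint64_t)zone_managed * watermark_scale_factor / 10000;
    uint64_t gap = wmark_min / 4 > scaled ? wmark_min / 4 : scaled;

    printf("WMARK_MIN  = %llu pages\n", (unsigned long long)wmark_min);
    printf("WMARK_LOW  = %llu pages\n", (unsigned long long)(wmark_min + gap));
    printf("WMARK_HIGH = %llu pages\n", (unsigned long long)(wmark_min + 2 * gap));
    return 0;
}
```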
.. | .. |
---|
7532 | 8224 | * Initialise min_free_kbytes. |
---|
7533 | 8225 | * |
---|
7534 | 8226 | * For small machines we want it small (128k min). For large machines |
---|
7535 | | - * we want it large (64MB max). But it is not linear, because network |
---|
| 8227 | + * we want it large (256MB max). But it is not linear, because network |
---|
7536 | 8228 | * bandwidth does not increase linearly with machine size. We use |
---|
7537 | 8229 | * |
---|
7538 | 8230 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
---|
.. | .. |
---|
7564 | 8256 | min_free_kbytes = new_min_free_kbytes; |
---|
7565 | 8257 | if (min_free_kbytes < 128) |
---|
7566 | 8258 | min_free_kbytes = 128; |
---|
7567 | | - if (min_free_kbytes > 65536) |
---|
7568 | | - min_free_kbytes = 65536; |
---|
| 8259 | + if (min_free_kbytes > 262144) |
---|
| 8260 | + min_free_kbytes = 262144; |
---|
7569 | 8261 | } else { |
---|
7570 | 8262 | pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", |
---|
7571 | 8263 | new_min_free_kbytes, user_min_free_kbytes); |
---|
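The clamp above now caps the heuristic at 256 MB instead of 64 MB; the value itself is still min_free_kbytes = 4 * sqrt(lowmem_kbytes), computed via an integer square root of lowmem_kbytes * 16. A standalone sketch with a few arbitrary lowmem sizes; int_sqrt() here is a local re-implementation of the usual bitwise integer square root, not the kernel helper:

```c
#include <stdio.h>

static unsigned long long int_sqrt(unsigned long long x)
{
    unsigned long long r = 0, bit = 1ULL << 62;

    while (bit > x)
        bit >>= 2;
    while (bit) {
        if (x >= r + bit) {
            x -= r + bit;
            r = (r >> 1) + bit;
        } else {
            r >>= 1;
        }
        bit >>= 2;
    }
    return r;
}

int main(void)
{
    unsigned long long samples_mb[] = { 128, 1024, 16384, 262144 };
    unsigned int i;

    for (i = 0; i < sizeof(samples_mb) / sizeof(samples_mb[0]); i++) {
        unsigned long long lowmem_kbytes = samples_mb[i] * 1024;
        unsigned long long min_free = int_sqrt(lowmem_kbytes * 16);   /* 4*sqrt(kbytes) */

        if (min_free < 128)
            min_free = 128;
        if (min_free > 262144)                   /* the new 256 MB ceiling */
            min_free = 262144;
        printf("%6llu MB lowmem -> min_free_kbytes = %llu\n",
               samples_mb[i], min_free);
    }
    return 0;
}
```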
.. | .. |
---|
7591 | 8283 | * or extra_free_kbytes changes. |
---|
7592 | 8284 | */ |
---|
7593 | 8285 | int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, |
---|
7594 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8286 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7595 | 8287 | { |
---|
7596 | 8288 | int rc; |
---|
7597 | 8289 | |
---|
.. | .. |
---|
7607 | 8299 | } |
---|
7608 | 8300 | |
---|
7609 | 8301 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, |
---|
7610 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8302 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7611 | 8303 | { |
---|
7612 | 8304 | int rc; |
---|
7613 | 8305 | |
---|
.. | .. |
---|
7631 | 8323 | pgdat->min_unmapped_pages = 0; |
---|
7632 | 8324 | |
---|
7633 | 8325 | for_each_zone(zone) |
---|
7634 | | - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * |
---|
7635 | | - sysctl_min_unmapped_ratio) / 100; |
---|
| 8326 | + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * |
---|
| 8327 | + sysctl_min_unmapped_ratio) / 100; |
---|
7636 | 8328 | } |
---|
7637 | 8329 | |
---|
7638 | 8330 | |
---|
7639 | 8331 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7640 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8332 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7641 | 8333 | { |
---|
7642 | 8334 | int rc; |
---|
7643 | 8335 | |
---|
.. | .. |
---|
7659 | 8351 | pgdat->min_slab_pages = 0; |
---|
7660 | 8352 | |
---|
7661 | 8353 | for_each_zone(zone) |
---|
7662 | | - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * |
---|
7663 | | - sysctl_min_slab_ratio) / 100; |
---|
| 8354 | + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * |
---|
| 8355 | + sysctl_min_slab_ratio) / 100; |
---|
7664 | 8356 | } |
---|
7665 | 8357 | |
---|
7666 | 8358 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7667 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8359 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7668 | 8360 | { |
---|
7669 | 8361 | int rc; |
---|
7670 | 8362 | |
---|
.. | .. |
---|
7688 | 8380 | * if in function of the boot time zone sizes. |
---|
7689 | 8381 | */ |
---|
7690 | 8382 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7691 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8383 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7692 | 8384 | { |
---|
| 8385 | + int i; |
---|
| 8386 | + |
---|
7693 | 8387 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
---|
| 8388 | + |
---|
| 8389 | + for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
| 8390 | + if (sysctl_lowmem_reserve_ratio[i] < 1) |
---|
| 8391 | + sysctl_lowmem_reserve_ratio[i] = 0; |
---|
| 8392 | + } |
---|
| 8393 | + |
---|
7694 | 8394 | setup_per_zone_lowmem_reserve(); |
---|
7695 | 8395 | return 0; |
---|
| 8396 | +} |
---|
| 8397 | + |
---|
| 8398 | +static void __zone_pcp_update(struct zone *zone) |
---|
| 8399 | +{ |
---|
| 8400 | + unsigned int cpu; |
---|
| 8401 | + |
---|
| 8402 | + for_each_possible_cpu(cpu) |
---|
| 8403 | + pageset_set_high_and_batch(zone, |
---|
| 8404 | + per_cpu_ptr(zone->pageset, cpu)); |
---|
7696 | 8405 | } |
---|
7697 | 8406 | |
---|
7698 | 8407 | /* |
---|
.. | .. |
---|
7701 | 8410 | * pagelist can have before it gets flushed back to buddy allocator. |
---|
7702 | 8411 | */ |
---|
7703 | 8412 | int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, |
---|
7704 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8413 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7705 | 8414 | { |
---|
7706 | 8415 | struct zone *zone; |
---|
7707 | 8416 | int old_percpu_pagelist_fraction; |
---|
.. | .. |
---|
7726 | 8435 | if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) |
---|
7727 | 8436 | goto out; |
---|
7728 | 8437 | |
---|
7729 | | - for_each_populated_zone(zone) { |
---|
7730 | | - unsigned int cpu; |
---|
7731 | | - |
---|
7732 | | - for_each_possible_cpu(cpu) |
---|
7733 | | - pageset_set_high_and_batch(zone, |
---|
7734 | | - per_cpu_ptr(zone->pageset, cpu)); |
---|
7735 | | - } |
---|
| 8438 | + for_each_populated_zone(zone) |
---|
| 8439 | + __zone_pcp_update(zone); |
---|
7736 | 8440 | out: |
---|
7737 | 8441 | mutex_unlock(&pcp_batch_high_lock); |
---|
7738 | 8442 | return ret; |
---|
7739 | 8443 | } |
---|
7740 | | - |
---|
7741 | | -#ifdef CONFIG_NUMA |
---|
7742 | | -int hashdist = HASHDIST_DEFAULT; |
---|
7743 | | - |
---|
7744 | | -static int __init set_hashdist(char *str) |
---|
7745 | | -{ |
---|
7746 | | - if (!str) |
---|
7747 | | - return 0; |
---|
7748 | | - hashdist = simple_strtoul(str, &str, 0); |
---|
7749 | | - return 1; |
---|
7750 | | -} |
---|
7751 | | -__setup("hashdist=", set_hashdist); |
---|
7752 | | -#endif |
---|
7753 | 8444 | |
---|
7754 | 8445 | #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES |
---|
7755 | 8446 | /* |
---|
.. | .. |
---|
7797 | 8488 | unsigned long log2qty, size; |
---|
7798 | 8489 | void *table = NULL; |
---|
7799 | 8490 | gfp_t gfp_flags; |
---|
| 8491 | + bool virt; |
---|
7800 | 8492 | |
---|
7801 | 8493 | /* allow the kernel cmdline to have a say */ |
---|
7802 | 8494 | if (!numentries) { |
---|
.. | .. |
---|
7853 | 8545 | |
---|
7854 | 8546 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; |
---|
7855 | 8547 | do { |
---|
| 8548 | + virt = false; |
---|
7856 | 8549 | size = bucketsize << log2qty; |
---|
7857 | 8550 | if (flags & HASH_EARLY) { |
---|
7858 | 8551 | if (flags & HASH_ZERO) |
---|
7859 | | - table = memblock_virt_alloc_nopanic(size, 0); |
---|
| 8552 | + table = memblock_alloc(size, SMP_CACHE_BYTES); |
---|
7860 | 8553 | else |
---|
7861 | | - table = memblock_virt_alloc_raw(size, 0); |
---|
7862 | | - } else if (hashdist) { |
---|
7863 | | - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); |
---|
| 8554 | + table = memblock_alloc_raw(size, |
---|
| 8555 | + SMP_CACHE_BYTES); |
---|
| 8556 | + } else if (get_order(size) >= MAX_ORDER || hashdist) { |
---|
| 8557 | + table = __vmalloc(size, gfp_flags); |
---|
| 8558 | + virt = true; |
---|
7864 | 8559 | } else { |
---|
7865 | 8560 | /* |
---|
7866 | 8561 | * If bucketsize is not a power-of-two, we may free |
---|
7867 | 8562 | * some pages at the end of hash table which |
---|
7868 | 8563 | * alloc_pages_exact() automatically does |
---|
7869 | 8564 | */ |
---|
7870 | | - if (get_order(size) < MAX_ORDER) { |
---|
7871 | | - table = alloc_pages_exact(size, gfp_flags); |
---|
7872 | | - kmemleak_alloc(table, size, 1, gfp_flags); |
---|
7873 | | - } |
---|
| 8565 | + table = alloc_pages_exact(size, gfp_flags); |
---|
| 8566 | + kmemleak_alloc(table, size, 1, gfp_flags); |
---|
7874 | 8567 | } |
---|
7875 | 8568 | } while (!table && size > PAGE_SIZE && --log2qty); |
---|
7876 | 8569 | |
---|
7877 | 8570 | if (!table) |
---|
7878 | 8571 | panic("Failed to allocate %s hash table\n", tablename); |
---|
7879 | 8572 | |
---|
7880 | | - pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", |
---|
7881 | | - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); |
---|
| 8573 | + pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", |
---|
| 8574 | + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, |
---|
| 8575 | + virt ? "vmalloc" : "linear"); |
---|
7882 | 8576 | |
---|
7883 | 8577 | if (_hash_shift) |
---|
7884 | 8578 | *_hash_shift = log2qty; |
---|
.. | .. |
---|
7890 | 8584 | |
---|
7891 | 8585 | /* |
---|
7892 | 8586 | * This function checks whether pageblock includes unmovable pages or not. |
---|
7893 | | - * If @count is not zero, it is okay to include less @count unmovable pages |
---|
7894 | 8587 | * |
---|
7895 | 8588 | * PageLRU check without isolation or lru_lock could race so that |
---|
7896 | 8589 | * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable |
---|
7897 | 8590 | * check without lock_page also may miss some movable non-lru pages at |
---|
7898 | 8591 | * race condition. So you can't expect this function should be exact. |
---|
| 8592 | + * |
---|
| 8593 | + * Returns a page without holding a reference. If the caller wants to |
---|
| 8594 | + * dereference that page (e.g., dumping), it has to make sure that it |
---|
| 8595 | + * cannot get removed (e.g., via memory unplug) concurrently. |
---|
| 8596 | + * |
---|
7899 | 8597 | */ |
---|
7900 | | -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
---|
7901 | | - int migratetype, |
---|
7902 | | - bool skip_hwpoisoned_pages) |
---|
| 8598 | +struct page *has_unmovable_pages(struct zone *zone, struct page *page, |
---|
| 8599 | + int migratetype, int flags) |
---|
7903 | 8600 | { |
---|
7904 | | - unsigned long pfn, iter, found; |
---|
| 8601 | + unsigned long iter = 0; |
---|
| 8602 | + unsigned long pfn = page_to_pfn(page); |
---|
| 8603 | + unsigned long offset = pfn % pageblock_nr_pages; |
---|
7905 | 8604 | |
---|
7906 | | - /* |
---|
7907 | | - * TODO we could make this much more efficient by not checking every |
---|
7908 | | - * page in the range if we know all of them are in MOVABLE_ZONE and |
---|
7909 | | - * that the movable zone guarantees that pages are migratable but |
---|
7910 | | - * the later is not the case right now unfortunatelly. E.g. movablecore |
---|
7911 | | - * can still lead to having bootmem allocations in zone_movable. |
---|
7912 | | - */ |
---|
| 8605 | + if (is_migrate_cma_page(page)) { |
---|
| 8606 | + /* |
---|
| 8607 | + * CMA allocations (alloc_contig_range) really need to mark CMA |
---|
| 8608 | + * pageblocks as isolated even when they are not movable in fact, |
---|
| 8609 | + * so consider them movable here. |
---|
| 8610 | + */ |
---|
| 8611 | + if (is_migrate_cma(migratetype)) |
---|
| 8612 | + return NULL; |
---|
7913 | 8613 | |
---|
7914 | | - /* |
---|
7915 | | - * CMA allocations (alloc_contig_range) really need to mark isolate |
---|
7916 | | - * CMA pageblocks even when they are not movable in fact so consider |
---|
7917 | | - * them movable here. |
---|
7918 | | - */ |
---|
7919 | | - if (is_migrate_cma(migratetype) && |
---|
7920 | | - is_migrate_cma(get_pageblock_migratetype(page))) |
---|
7921 | | - return false; |
---|
| 8614 | + return page; |
---|
| 8615 | + } |
---|
7922 | 8616 | |
---|
7923 | | - pfn = page_to_pfn(page); |
---|
7924 | | - for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
---|
7925 | | - unsigned long check = pfn + iter; |
---|
7926 | | - |
---|
7927 | | - if (!pfn_valid_within(check)) |
---|
| 8617 | + for (; iter < pageblock_nr_pages - offset; iter++) { |
---|
| 8618 | + if (!pfn_valid_within(pfn + iter)) |
---|
7928 | 8619 | continue; |
---|
7929 | 8620 | |
---|
7930 | | - page = pfn_to_page(check); |
---|
| 8621 | + page = pfn_to_page(pfn + iter); |
---|
7931 | 8622 | |
---|
| 8623 | + /* |
---|
| 8624 | + * Both, bootmem allocations and memory holes are marked |
---|
| 8625 | + * PG_reserved and are unmovable. We can even have unmovable |
---|
| 8626 | + * allocations inside ZONE_MOVABLE, for example when |
---|
| 8627 | + * specifying "movablecore". |
---|
| 8628 | + */ |
---|
7932 | 8629 | if (PageReserved(page)) |
---|
7933 | | - goto unmovable; |
---|
| 8630 | + return page; |
---|
7934 | 8631 | |
---|
7935 | 8632 | /* |
---|
7936 | 8633 | * If the zone is movable and we have ruled out all reserved |
---|
.. | .. |
---|
7942 | 8639 | |
---|
7943 | 8640 | /* |
---|
7944 | 8641 | * Hugepages are not in LRU lists, but they're movable. |
---|
7945 | | - * We need not scan over tail pages bacause we don't |
---|
| 8642 | + * THPs are on the LRU, but need to be counted as #small pages. |
---|
| 8643 | + * We need not scan over tail pages because we don't |
---|
7946 | 8644 | * handle each tail page individually in migration. |
---|
7947 | 8645 | */ |
---|
7948 | | - if (PageHuge(page)) { |
---|
| 8646 | + if (PageHuge(page) || PageTransCompound(page)) { |
---|
7949 | 8647 | struct page *head = compound_head(page); |
---|
7950 | 8648 | unsigned int skip_pages; |
---|
7951 | 8649 | |
---|
7952 | | - if (!hugepage_migration_supported(page_hstate(head))) |
---|
7953 | | - goto unmovable; |
---|
| 8650 | + if (PageHuge(page)) { |
---|
| 8651 | + if (!hugepage_migration_supported(page_hstate(head))) |
---|
| 8652 | + return page; |
---|
| 8653 | + } else if (!PageLRU(head) && !__PageMovable(head)) { |
---|
| 8654 | + return page; |
---|
| 8655 | + } |
---|
7954 | 8656 | |
---|
7955 | | - skip_pages = (1 << compound_order(head)) - (page - head); |
---|
| 8657 | + skip_pages = compound_nr(head) - (page - head); |
---|
7956 | 8658 | iter += skip_pages - 1; |
---|
7957 | 8659 | continue; |
---|
7958 | 8660 | } |
---|
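When the scan lands on a hugepage or THP it does not visit each tail page: skip_pages = compound_nr(head) - (page - head) is the number of pages left until the end of the compound page, so the next iteration resumes right after it. A tiny sketch of that arithmetic with made-up PFNs:

```c
#include <stdio.h>

int main(void)
{
    unsigned long head_pfn = 0x40200;       /* first page of a 2 MiB THP (example) */
    unsigned long compound_pages = 512;     /* compound_nr(head) for a 2 MiB THP */
    unsigned long cur_pfn = 0x40380;        /* where the pageblock scan currently is */

    unsigned long skip_pages = compound_pages - (cur_pfn - head_pfn);

    /* the loop does iter += skip_pages - 1 and the for-loop adds the final +1 */
    printf("scan resumes at pfn %#lx\n", cur_pfn + skip_pages);
    return 0;
}
```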
.. | .. |
---|
7965 | 8667 | */ |
---|
7966 | 8668 | if (!page_ref_count(page)) { |
---|
7967 | 8669 | if (PageBuddy(page)) |
---|
7968 | | - iter += (1 << page_order(page)) - 1; |
---|
| 8670 | + iter += (1 << buddy_order(page)) - 1; |
---|
7969 | 8671 | continue; |
---|
7970 | 8672 | } |
---|
7971 | 8673 | |
---|
.. | .. |
---|
7973 | 8675 | * The HWPoisoned page may not be in the buddy system, and |
---|
7974 | 8676 | * page_count() is not 0. |
---|
7975 | 8677 | */ |
---|
7976 | | - if (skip_hwpoisoned_pages && PageHWPoison(page)) |
---|
| 8678 | + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) |
---|
7977 | 8679 | continue; |
---|
7978 | 8680 | |
---|
7979 | | - if (__PageMovable(page)) |
---|
| 8681 | + /* |
---|
| 8682 | + * We treat all PageOffline() pages as movable when offlining |
---|
| 8683 | + * to give drivers a chance to decrement their reference count |
---|
| 8684 | + * in MEM_GOING_OFFLINE in order to indicate that these pages |
---|
| 8685 | + * can be offlined as there are no direct references anymore. |
---|
| 8686 | + * For actually unmovable PageOffline() where the driver does |
---|
| 8687 | + * not support this, we will fail later when trying to actually |
---|
| 8688 | + * move these pages that still have a reference count > 0. |
---|
| 8689 | + * (false negatives in this function only) |
---|
| 8690 | + */ |
---|
| 8691 | + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) |
---|
7980 | 8692 | continue; |
---|
7981 | 8693 | |
---|
7982 | | - if (!PageLRU(page)) |
---|
7983 | | - found++; |
---|
| 8694 | + if (__PageMovable(page) || PageLRU(page)) |
---|
| 8695 | + continue; |
---|
| 8696 | + |
---|
7984 | 8697 | /* |
---|
7985 | 8698 | * If there are RECLAIMABLE pages, we need to check |
---|
7986 | 8699 | * it. But now, memory offline itself doesn't call |
---|
7987 | 8700 | * shrink_node_slabs() and it still to be fixed. |
---|
7988 | 8701 | */ |
---|
7989 | | - /* |
---|
7990 | | - * If the page is not RAM, page_count()should be 0. |
---|
7991 | | - * we don't need more check. This is an _used_ not-movable page. |
---|
7992 | | - * |
---|
7993 | | - * The problematic thing here is PG_reserved pages. PG_reserved |
---|
7994 | | - * is set to both of a memory hole page and a _used_ kernel |
---|
7995 | | - * page at boot. |
---|
7996 | | - */ |
---|
7997 | | - if (found > count) |
---|
7998 | | - goto unmovable; |
---|
| 8702 | + return page; |
---|
7999 | 8703 | } |
---|
8000 | | - return false; |
---|
8001 | | -unmovable: |
---|
8002 | | - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); |
---|
8003 | | - return true; |
---|
| 8704 | + return NULL; |
---|
8004 | 8705 | } |
---|
8005 | 8706 | |
---|
8006 | | -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) |
---|
8007 | | - |
---|
| 8707 | +#ifdef CONFIG_CONTIG_ALLOC |
---|
8008 | 8708 | static unsigned long pfn_max_align_down(unsigned long pfn) |
---|
8009 | 8709 | { |
---|
8010 | 8710 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, |
---|
8011 | 8711 | pageblock_nr_pages) - 1); |
---|
8012 | 8712 | } |
---|
8013 | 8713 | |
---|
8014 | | -static unsigned long pfn_max_align_up(unsigned long pfn) |
---|
| 8714 | +unsigned long pfn_max_align_up(unsigned long pfn) |
---|
8015 | 8715 | { |
---|
8016 | 8716 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, |
---|
8017 | 8717 | pageblock_nr_pages)); |
---|
8018 | 8718 | } |
---|
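Both helpers round to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, so the isolation window used by alloc_contig_range() always covers whole pageblocks and whole max-order buddies. A user-space sketch with typical x86-64 constants (MAX_ORDER == 11, pageblock_order == 9), chosen purely for illustration:

```c
#include <stdio.h>

#define MAX_ORDER_NR_PAGES (1UL << (11 - 1))     /* 1024 pages with MAX_ORDER == 11 */
#define PAGEBLOCK_NR_PAGES (1UL << 9)            /* 512 pages, i.e. 2 MiB pageblocks */

static unsigned long max_align(void)
{
    return MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
           MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;
}

static unsigned long pfn_max_align_down(unsigned long pfn)
{
    return pfn & ~(max_align() - 1);
}

static unsigned long pfn_max_align_up(unsigned long pfn)
{
    unsigned long a = max_align();

    return (pfn + a - 1) & ~(a - 1);             /* ALIGN(pfn, a) */
}

int main(void)
{
    unsigned long start = 0x40123, end = 0x40f01;

    printf("isolate [%#lx, %#lx) for request [%#lx, %#lx)\n",
           pfn_max_align_down(start), pfn_max_align_up(end), start, end);
    return 0;
}
```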
8019 | 8719 | |
---|
| 8720 | +#if defined(CONFIG_DYNAMIC_DEBUG) || \ |
---|
| 8721 | + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) |
---|
| 8722 | +/* Usage: See admin-guide/dynamic-debug-howto.rst */ |
---|
| 8723 | +static void alloc_contig_dump_pages(struct list_head *page_list) |
---|
| 8724 | +{ |
---|
| 8725 | + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); |
---|
| 8726 | + |
---|
| 8727 | + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { |
---|
| 8728 | + struct page *page; |
---|
| 8729 | + unsigned long nr_skip = 0; |
---|
| 8730 | + unsigned long nr_pages = 0; |
---|
| 8731 | + |
---|
| 8732 | + dump_stack(); |
---|
| 8733 | + list_for_each_entry(page, page_list, lru) { |
---|
| 8734 | + nr_pages++; |
---|
| 8735 | + /* The page will be freed by putback_movable_pages soon */ |
---|
| 8736 | + if (page_count(page) == 1) { |
---|
| 8737 | + nr_skip++; |
---|
| 8738 | + continue; |
---|
| 8739 | + } |
---|
| 8740 | + dump_page(page, "migration failure"); |
---|
| 8741 | + } |
---|
| 8742 | + pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip); |
---|
| 8743 | + } |
---|
| 8744 | +} |
---|
| 8745 | +#else |
---|
| 8746 | +static inline void alloc_contig_dump_pages(struct list_head *page_list) |
---|
| 8747 | +{ |
---|
| 8748 | +} |
---|
| 8749 | +#endif |
---|
| 8750 | + |
---|
8020 | 8751 | /* [start, end) must belong to a single zone. */ |
---|
8021 | 8752 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
---|
8022 | | - unsigned long start, unsigned long end) |
---|
| 8753 | + unsigned long start, unsigned long end, |
---|
| 8754 | + struct acr_info *info) |
---|
8023 | 8755 | { |
---|
8024 | 8756 | /* This function is based on compact_zone() from compaction.c. */ |
---|
8025 | | - unsigned long nr_reclaimed; |
---|
| 8757 | + unsigned int nr_reclaimed; |
---|
8026 | 8758 | unsigned long pfn = start; |
---|
8027 | 8759 | unsigned int tries = 0; |
---|
| 8760 | + unsigned int max_tries = 5; |
---|
8028 | 8761 | int ret = 0; |
---|
| 8762 | + struct page *page; |
---|
| 8763 | + struct migration_target_control mtc = { |
---|
| 8764 | + .nid = zone_to_nid(cc->zone), |
---|
| 8765 | + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, |
---|
| 8766 | + }; |
---|
8029 | 8767 | |
---|
8030 | | - migrate_prep(); |
---|
| 8768 | + if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC) |
---|
| 8769 | + max_tries = 1; |
---|
| 8770 | + |
---|
| 8771 | + lru_cache_disable(); |
---|
8031 | 8772 | |
---|
8032 | 8773 | while (pfn < end || !list_empty(&cc->migratepages)) { |
---|
8033 | 8774 | if (fatal_signal_pending(current)) { |
---|
.. | .. |
---|
8043 | 8784 | break; |
---|
8044 | 8785 | } |
---|
8045 | 8786 | tries = 0; |
---|
8046 | | - } else if (++tries == 5) { |
---|
| 8787 | + } else if (++tries == max_tries) { |
---|
8047 | 8788 | ret = ret < 0 ? ret : -EBUSY; |
---|
8048 | 8789 | break; |
---|
8049 | 8790 | } |
---|
8050 | 8791 | |
---|
8051 | 8792 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
---|
8052 | 8793 | &cc->migratepages); |
---|
| 8794 | + info->nr_reclaimed += nr_reclaimed; |
---|
8053 | 8795 | cc->nr_migratepages -= nr_reclaimed; |
---|
8054 | 8796 | |
---|
8055 | | - ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
---|
8056 | | - NULL, 0, cc->mode, MR_CONTIG_RANGE); |
---|
| 8797 | + list_for_each_entry(page, &cc->migratepages, lru) |
---|
| 8798 | + info->nr_mapped += page_mapcount(page); |
---|
| 8799 | + |
---|
| 8800 | + ret = migrate_pages(&cc->migratepages, alloc_migration_target, |
---|
| 8801 | + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); |
---|
| 8802 | + if (!ret) |
---|
| 8803 | + info->nr_migrated += cc->nr_migratepages; |
---|
8057 | 8804 | } |
---|
| 8805 | + |
---|
| 8806 | + lru_cache_enable(); |
---|
8058 | 8807 | if (ret < 0) { |
---|
| 8808 | + if (ret == -EBUSY) { |
---|
| 8809 | + alloc_contig_dump_pages(&cc->migratepages); |
---|
| 8810 | + page_pinner_mark_migration_failed_pages(&cc->migratepages); |
---|
| 8811 | + } |
---|
| 8812 | + |
---|
| 8813 | + if (!list_empty(&cc->migratepages)) { |
---|
| 8814 | + page = list_first_entry(&cc->migratepages, struct page, lru); |
---|
| 8815 | + info->failed_pfn = page_to_pfn(page); |
---|
| 8816 | + } |
---|
| 8817 | + |
---|
8059 | 8818 | putback_movable_pages(&cc->migratepages); |
---|
| 8819 | + info->err |= ACR_ERR_MIGRATE; |
---|
8060 | 8820 | return ret; |
---|
8061 | 8821 | } |
---|
8062 | 8822 | return 0; |
---|
.. | .. |
---|
8079 | 8839 | * pageblocks in the range. Once isolated, the pageblocks should not |
---|
8080 | 8840 | * be modified by others. |
---|
8081 | 8841 | * |
---|
8082 | | - * Returns zero on success or negative error code. On success all |
---|
| 8842 | + * Return: zero on success or negative error code. On success all |
---|
8083 | 8843 | * pages which PFN is in [start, end) are allocated for the caller and |
---|
8084 | 8844 | * need to be freed with free_contig_range(). |
---|
8085 | 8845 | */ |
---|
8086 | 8846 | int alloc_contig_range(unsigned long start, unsigned long end, |
---|
8087 | | - unsigned migratetype, gfp_t gfp_mask) |
---|
| 8847 | + unsigned migratetype, gfp_t gfp_mask, |
---|
| 8848 | + struct acr_info *info) |
---|
8088 | 8849 | { |
---|
8089 | 8850 | unsigned long outer_start, outer_end; |
---|
8090 | 8851 | unsigned int order; |
---|
8091 | 8852 | int ret = 0; |
---|
| 8853 | + bool skip_drain_all_pages = false; |
---|
8092 | 8854 | |
---|
8093 | 8855 | struct compact_control cc = { |
---|
8094 | 8856 | .nr_migratepages = 0, |
---|
8095 | 8857 | .order = -1, |
---|
8096 | 8858 | .zone = page_zone(pfn_to_page(start)), |
---|
8097 | | - .mode = MIGRATE_SYNC, |
---|
| 8859 | + .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC, |
---|
8098 | 8860 | .ignore_skip_hint = true, |
---|
8099 | 8861 | .no_set_skip_hint = true, |
---|
8100 | 8862 | .gfp_mask = current_gfp_context(gfp_mask), |
---|
| 8863 | + .alloc_contig = true, |
---|
8101 | 8864 | }; |
---|
8102 | 8865 | INIT_LIST_HEAD(&cc.migratepages); |
---|
8103 | 8866 | |
---|
.. | .. |
---|
8126 | 8889 | */ |
---|
8127 | 8890 | |
---|
8128 | 8891 | ret = start_isolate_page_range(pfn_max_align_down(start), |
---|
8129 | | - pfn_max_align_up(end), migratetype, |
---|
8130 | | - false); |
---|
8131 | | - if (ret) |
---|
| 8892 | + pfn_max_align_up(end), migratetype, 0, |
---|
| 8893 | + &info->failed_pfn); |
---|
| 8894 | + if (ret) { |
---|
| 8895 | + info->err |= ACR_ERR_ISOLATE; |
---|
8132 | 8896 | return ret; |
---|
| 8897 | + } |
---|
8133 | 8898 | |
---|
8134 | | -#ifdef CONFIG_CMA |
---|
8135 | | - cc.zone->cma_alloc = 1; |
---|
8136 | | -#endif |
---|
| 8899 | + trace_android_vh_cma_drain_all_pages_bypass(migratetype, |
---|
| 8900 | + &skip_drain_all_pages); |
---|
| 8901 | + if (!skip_drain_all_pages) |
---|
| 8902 | + drain_all_pages(cc.zone); |
---|
| 8903 | + |
---|
8137 | 8904 | /* |
---|
8138 | 8905 | * In case of -EBUSY, we'd like to know which page causes problem. |
---|
8139 | 8906 | * So, just fall through. test_pages_isolated() has a tracepoint |
---|
.. | .. |
---|
8144 | 8911 | * allocated. So, if we fall through be sure to clear ret so that |
---|
8145 | 8912 | * -EBUSY is not accidentally used or returned to caller. |
---|
8146 | 8913 | */ |
---|
8147 | | - ret = __alloc_contig_migrate_range(&cc, start, end); |
---|
8148 | | - if (ret && ret != -EBUSY) |
---|
| 8914 | + ret = __alloc_contig_migrate_range(&cc, start, end, info); |
---|
| 8915 | + if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY))) |
---|
8149 | 8916 | goto done; |
---|
8150 | 8917 | ret = 0; |
---|
8151 | 8918 | |
---|
.. | .. |
---|
8166 | 8933 | * isolated thus they won't get removed from buddy. |
---|
8167 | 8934 | */ |
---|
8168 | 8935 | |
---|
8169 | | - lru_add_drain_all(); |
---|
8170 | | - drain_all_pages(cc.zone); |
---|
8171 | | - |
---|
8172 | 8936 | order = 0; |
---|
8173 | 8937 | outer_start = start; |
---|
8174 | 8938 | while (!PageBuddy(pfn_to_page(outer_start))) { |
---|
.. | .. |
---|
8180 | 8944 | } |
---|
8181 | 8945 | |
---|
8182 | 8946 | if (outer_start != start) { |
---|
8183 | | - order = page_order(pfn_to_page(outer_start)); |
---|
| 8947 | + order = buddy_order(pfn_to_page(outer_start)); |
---|
8184 | 8948 | |
---|
8185 | 8949 | /* |
---|
8186 | 8950 | * outer_start page could be small order buddy page and |
---|
.. | .. |
---|
8193 | 8957 | } |
---|
8194 | 8958 | |
---|
8195 | 8959 | /* Make sure the range is really isolated. */ |
---|
8196 | | - if (test_pages_isolated(outer_start, end, false)) { |
---|
| 8960 | + if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) { |
---|
8197 | 8961 | pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", |
---|
8198 | 8962 | __func__, outer_start, end); |
---|
8199 | 8963 | ret = -EBUSY; |
---|
| 8964 | + info->err |= ACR_ERR_TEST; |
---|
8200 | 8965 | goto done; |
---|
8201 | 8966 | } |
---|
8202 | 8967 | |
---|
.. | .. |
---|
8216 | 8981 | done: |
---|
8217 | 8982 | undo_isolate_page_range(pfn_max_align_down(start), |
---|
8218 | 8983 | pfn_max_align_up(end), migratetype); |
---|
8219 | | -#ifdef CONFIG_CMA |
---|
8220 | | - cc.zone->cma_alloc = 0; |
---|
8221 | | -#endif |
---|
8222 | 8984 | return ret; |
---|
8223 | 8985 | } |
---|
| 8986 | +EXPORT_SYMBOL(alloc_contig_range); |
---|
8224 | 8987 | |
---|
8225 | | -void free_contig_range(unsigned long pfn, unsigned nr_pages) |
---|
| 8988 | +static int __alloc_contig_pages(unsigned long start_pfn, |
---|
| 8989 | + unsigned long nr_pages, gfp_t gfp_mask) |
---|
| 8990 | +{ |
---|
| 8991 | + struct acr_info dummy; |
---|
| 8992 | + unsigned long end_pfn = start_pfn + nr_pages; |
---|
| 8993 | + |
---|
| 8994 | + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, |
---|
| 8995 | + gfp_mask, &dummy); |
---|
| 8996 | +} |
---|
| 8997 | + |
---|
| 8998 | +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, |
---|
| 8999 | + unsigned long nr_pages) |
---|
| 9000 | +{ |
---|
| 9001 | + unsigned long i, end_pfn = start_pfn + nr_pages; |
---|
| 9002 | + struct page *page; |
---|
| 9003 | + |
---|
| 9004 | + for (i = start_pfn; i < end_pfn; i++) { |
---|
| 9005 | + page = pfn_to_online_page(i); |
---|
| 9006 | + if (!page) |
---|
| 9007 | + return false; |
---|
| 9008 | + |
---|
| 9009 | + if (page_zone(page) != z) |
---|
| 9010 | + return false; |
---|
| 9011 | + |
---|
| 9012 | + if (PageReserved(page)) |
---|
| 9013 | + return false; |
---|
| 9014 | + |
---|
| 9015 | + if (page_count(page) > 0) |
---|
| 9016 | + return false; |
---|
| 9017 | + |
---|
| 9018 | + if (PageHuge(page)) |
---|
| 9019 | + return false; |
---|
| 9020 | + } |
---|
| 9021 | + return true; |
---|
| 9022 | +} |
---|
| 9023 | + |
---|
| 9024 | +static bool zone_spans_last_pfn(const struct zone *zone, |
---|
| 9025 | + unsigned long start_pfn, unsigned long nr_pages) |
---|
| 9026 | +{ |
---|
| 9027 | + unsigned long last_pfn = start_pfn + nr_pages - 1; |
---|
| 9028 | + |
---|
| 9029 | + return zone_spans_pfn(zone, last_pfn); |
---|
| 9030 | +} |
---|
| 9031 | + |
---|
| 9032 | +/** |
---|
| 9033 | + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages |
---|
| 9034 | + * @nr_pages: Number of contiguous pages to allocate |
---|
| 9035 | + * @gfp_mask: GFP mask to limit search and used during compaction |
---|
| 9036 | + * @nid: Target node |
---|
| 9037 | + * @nodemask: Mask for other possible nodes |
---|
| 9038 | + * |
---|
| 9039 | + * This routine is a wrapper around alloc_contig_range(). It scans over zones |
---|
| 9040 | + * on an applicable zonelist to find a contiguous pfn range which can then be |
---|
| 9041 | + * tried for allocation with alloc_contig_range(). This routine is intended |
---|
| 9042 | + * for allocation requests which can not be fulfilled with the buddy allocator. |
---|
| 9043 | + * |
---|
| 9044 | + * The allocated memory is always aligned to a page boundary. If nr_pages is a |
---|
| 9045 | + * power of two then the alignment is guaranteed to be to the given nr_pages |
---|
| 9046 | + * (e.g. 1GB request would be aligned to 1GB). |
---|
| 9047 | + * |
---|
| 9048 | + * Allocated pages can be freed with free_contig_range() or by manually calling |
---|
| 9049 | + * __free_page() on each allocated page. |
---|
| 9050 | + * |
---|
| 9051 | + * Return: pointer to contiguous pages on success, or NULL if not successful. |
---|
| 9052 | + */ |
---|
| 9053 | +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, |
---|
| 9054 | + int nid, nodemask_t *nodemask) |
---|
| 9055 | +{ |
---|
| 9056 | + unsigned long ret, pfn, flags; |
---|
| 9057 | + struct zonelist *zonelist; |
---|
| 9058 | + struct zone *zone; |
---|
| 9059 | + struct zoneref *z; |
---|
| 9060 | + |
---|
| 9061 | + zonelist = node_zonelist(nid, gfp_mask); |
---|
| 9062 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, |
---|
| 9063 | + gfp_zone(gfp_mask), nodemask) { |
---|
| 9064 | + spin_lock_irqsave(&zone->lock, flags); |
---|
| 9065 | + |
---|
| 9066 | + pfn = ALIGN(zone->zone_start_pfn, nr_pages); |
---|
| 9067 | + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { |
---|
| 9068 | + if (pfn_range_valid_contig(zone, pfn, nr_pages)) { |
---|
| 9069 | + /* |
---|
| 9070 | + * We release the zone lock here because |
---|
| 9071 | + * alloc_contig_range() will also lock the zone |
---|
| 9072 | + * at some point. If there's an allocation |
---|
| 9073 | + * spinning on this lock, it may win the race |
---|
| 9074 | + * and cause alloc_contig_range() to fail... |
---|
| 9075 | + */ |
---|
| 9076 | + spin_unlock_irqrestore(&zone->lock, flags); |
---|
| 9077 | + ret = __alloc_contig_pages(pfn, nr_pages, |
---|
| 9078 | + gfp_mask); |
---|
| 9079 | + if (!ret) |
---|
| 9080 | + return pfn_to_page(pfn); |
---|
| 9081 | + spin_lock_irqsave(&zone->lock, flags); |
---|
| 9082 | + } |
---|
| 9083 | + pfn += nr_pages; |
---|
| 9084 | + } |
---|
| 9085 | + spin_unlock_irqrestore(&zone->lock, flags); |
---|
| 9086 | + } |
---|
| 9087 | + return NULL; |
---|
| 9088 | +} |
---|
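alloc_contig_pages() steps an nr_pages-aligned window across each candidate zone and probes every window with pfn_range_valid_contig() before dropping the zone lock and attempting the real allocation. A sketch of that window walk with a toy validity predicate; the zone bounds and the "busy" range are fabricated:

```c
#include <stdio.h>
#include <stdbool.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* stand-in for pfn_range_valid_contig(): reject windows touching a busy range */
static bool range_valid(unsigned long start, unsigned long nr_pages)
{
    unsigned long busy_start = 0x10000, busy_end = 0x10c00;

    return start + nr_pages <= busy_start || start >= busy_end;
}

int main(void)
{
    unsigned long zone_start = 0x10000, zone_end = 0x20000;
    unsigned long nr_pages = 0x400;              /* must be a power of two here */
    unsigned long pfn = ALIGN_UP(zone_start, nr_pages);

    while (pfn + nr_pages - 1 < zone_end) {      /* i.e. zone_spans_last_pfn() */
        if (range_valid(pfn, nr_pages)) {
            printf("try alloc_contig_range(%#lx, %#lx)\n", pfn, pfn + nr_pages);
            return 0;
        }
        pfn += nr_pages;                         /* advance to the next aligned window */
    }
    printf("no candidate window found\n");
    return 0;
}
```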
| 9089 | +#endif /* CONFIG_CONTIG_ALLOC */ |
---|
| 9090 | + |
---|
| 9091 | +void free_contig_range(unsigned long pfn, unsigned int nr_pages) |
---|
8226 | 9092 | { |
---|
8227 | 9093 | unsigned int count = 0; |
---|
8228 | 9094 | |
---|
.. | .. |
---|
8234 | 9100 | } |
---|
8235 | 9101 | WARN(count != 0, "%d pages are still in use!\n", count); |
---|
8236 | 9102 | } |
---|
8237 | | -#endif |
---|
| 9103 | +EXPORT_SYMBOL(free_contig_range); |
---|
8238 | 9104 | |
---|
8239 | 9105 | /* |
---|
8240 | 9106 | * The zone indicated has a new number of managed_pages; batch sizes and percpu |
---|
.. | .. |
---|
8242 | 9108 | */ |
---|
8243 | 9109 | void __meminit zone_pcp_update(struct zone *zone) |
---|
8244 | 9110 | { |
---|
8245 | | - unsigned cpu; |
---|
8246 | 9111 | mutex_lock(&pcp_batch_high_lock); |
---|
8247 | | - for_each_possible_cpu(cpu) |
---|
8248 | | - pageset_set_high_and_batch(zone, |
---|
8249 | | - per_cpu_ptr(zone->pageset, cpu)); |
---|
| 9112 | + __zone_pcp_update(zone); |
---|
8250 | 9113 | mutex_unlock(&pcp_batch_high_lock); |
---|
8251 | 9114 | } |
---|
8252 | 9115 | |
---|
.. | .. |
---|
8271 | 9134 | |
---|
8272 | 9135 | #ifdef CONFIG_MEMORY_HOTREMOVE |
---|
8273 | 9136 | /* |
---|
8274 | | - * All pages in the range must be in a single zone and isolated |
---|
8275 | | - * before calling this. |
---|
| 9137 | + * All pages in the range must be in a single zone, must not contain holes, |
---|
| 9138 | + * must span full sections, and must be isolated before calling this function. |
---|
8276 | 9139 | */ |
---|
8277 | | -void |
---|
8278 | | -__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
---|
| 9140 | +void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
---|
8279 | 9141 | { |
---|
| 9142 | + unsigned long pfn = start_pfn; |
---|
8280 | 9143 | struct page *page; |
---|
8281 | 9144 | struct zone *zone; |
---|
8282 | | - unsigned int order, i; |
---|
8283 | | - unsigned long pfn; |
---|
| 9145 | + unsigned int order; |
---|
8284 | 9146 | unsigned long flags; |
---|
8285 | | - /* find the first valid pfn */ |
---|
8286 | | - for (pfn = start_pfn; pfn < end_pfn; pfn++) |
---|
8287 | | - if (pfn_valid(pfn)) |
---|
8288 | | - break; |
---|
8289 | | - if (pfn == end_pfn) |
---|
8290 | | - return; |
---|
| 9147 | + |
---|
8291 | 9148 | offline_mem_sections(pfn, end_pfn); |
---|
8292 | 9149 | zone = page_zone(pfn_to_page(pfn)); |
---|
8293 | 9150 | spin_lock_irqsave(&zone->lock, flags); |
---|
8294 | | - pfn = start_pfn; |
---|
8295 | 9151 | while (pfn < end_pfn) { |
---|
8296 | | - if (!pfn_valid(pfn)) { |
---|
8297 | | - pfn++; |
---|
8298 | | - continue; |
---|
8299 | | - } |
---|
8300 | 9152 | page = pfn_to_page(pfn); |
---|
8301 | 9153 | /* |
---|
8302 | 9154 | * The HWPoisoned page may not be in the buddy system, and |
---|
.. | .. |
---|
8304 | 9156 | */ |
---|
8305 | 9157 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { |
---|
8306 | 9158 | pfn++; |
---|
8307 | | - SetPageReserved(page); |
---|
| 9159 | + continue; |
---|
| 9160 | + } |
---|
| 9161 | + /* |
---|
| 9162 | + * At this point all remaining PageOffline() pages have a |
---|
| 9163 | + * reference count of 0 and can simply be skipped. |
---|
| 9164 | + */ |
---|
| 9165 | + if (PageOffline(page)) { |
---|
| 9166 | + BUG_ON(page_count(page)); |
---|
| 9167 | + BUG_ON(PageBuddy(page)); |
---|
| 9168 | + pfn++; |
---|
8308 | 9169 | continue; |
---|
8309 | 9170 | } |
---|
8310 | 9171 | |
---|
8311 | 9172 | BUG_ON(page_count(page)); |
---|
8312 | 9173 | BUG_ON(!PageBuddy(page)); |
---|
8313 | | - order = page_order(page); |
---|
8314 | | -#ifdef CONFIG_DEBUG_VM |
---|
8315 | | - pr_info("remove from free list %lx %d %lx\n", |
---|
8316 | | - pfn, 1 << order, end_pfn); |
---|
8317 | | -#endif |
---|
8318 | | - list_del(&page->lru); |
---|
8319 | | - rmv_page_order(page); |
---|
8320 | | - zone->free_area[order].nr_free--; |
---|
8321 | | - for (i = 0; i < (1 << order); i++) |
---|
8322 | | - SetPageReserved((page+i)); |
---|
| 9174 | + order = buddy_order(page); |
---|
| 9175 | + del_page_from_free_list(page, zone, order); |
---|
8323 | 9176 | pfn += (1 << order); |
---|
8324 | 9177 | } |
---|
8325 | 9178 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
.. | .. |
---|
8337 | 9190 | for (order = 0; order < MAX_ORDER; order++) { |
---|
8338 | 9191 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
---|
8339 | 9192 | |
---|
8340 | | - if (PageBuddy(page_head) && page_order(page_head) >= order) |
---|
| 9193 | + if (PageBuddy(page_head) && buddy_order(page_head) >= order) |
---|
8341 | 9194 | break; |
---|
8342 | 9195 | } |
---|
8343 | 9196 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
.. | .. |
---|
8347 | 9200 | |
---|
8348 | 9201 | #ifdef CONFIG_MEMORY_FAILURE |
---|
8349 | 9202 | /* |
---|
8350 | | - * Set PG_hwpoison flag if a given page is confirmed to be a free page. This |
---|
8351 | | - * test is performed under the zone lock to prevent a race against page |
---|
8352 | | - * allocation. |
---|
| 9203 | + * Break a higher-order page down into sub-pages, and keep our target page
---|
| 9204 | + * out of the buddy allocator.
---|
8353 | 9205 | */ |
---|
8354 | | -bool set_hwpoison_free_buddy_page(struct page *page) |
---|
| 9206 | +static void break_down_buddy_pages(struct zone *zone, struct page *page, |
---|
| 9207 | + struct page *target, int low, int high, |
---|
| 9208 | + int migratetype) |
---|
| 9209 | +{ |
---|
| 9210 | + unsigned long size = 1 << high; |
---|
| 9211 | + struct page *current_buddy, *next_page; |
---|
| 9212 | + |
---|
| 9213 | + while (high > low) { |
---|
| 9214 | + high--; |
---|
| 9215 | + size >>= 1; |
---|
| 9216 | + |
---|
| 9217 | + if (target >= &page[size]) { |
---|
| 9218 | + next_page = page + size; |
---|
| 9219 | + current_buddy = page; |
---|
| 9220 | + } else { |
---|
| 9221 | + next_page = page; |
---|
| 9222 | + current_buddy = page + size; |
---|
| 9223 | + } |
---|
| 9224 | + |
---|
| 9225 | + if (set_page_guard(zone, current_buddy, high, migratetype)) |
---|
| 9226 | + continue; |
---|
| 9227 | + |
---|
| 9228 | + if (current_buddy != target) { |
---|
| 9229 | + add_to_free_list(current_buddy, zone, high, migratetype); |
---|
| 9230 | + set_buddy_order(current_buddy, high); |
---|
| 9231 | + page = next_page; |
---|
| 9232 | + } |
---|
| 9233 | + } |
---|
| 9234 | +} |
---|
| 9235 | + |
---|
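For readers tracing the split order, the following standalone C program (illustrative only, not kernel code; `base`, `target`, `low` and `high` stand in for the kernel function's `page`, `target`, `low` and `high` parameters) mirrors the decision made in each loop iteration of break_down_buddy_pages(): descend into the half that contains the target pfn and hand the other half back at the now-smaller order.

```c
#include <stdio.h>

/* Userspace mirror of the split decision in break_down_buddy_pages(). */
static void split_around_target(unsigned long base, unsigned long target,
				int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= base + size) {
			/* Target is in the upper half: the lower half is freed. */
			printf("free pfn %lu at order %d\n", base, high);
			base += size;
		} else {
			/* Target is in the lower half: the upper half is freed. */
			printf("free pfn %lu at order %d\n", base + size, high);
		}
	}
}

int main(void)
{
	/* Order-3 block starting at pfn 0, target pfn 5. */
	split_around_target(0, 5, 0, 3);
	return 0;
}
```

For an order-3 block at pfn 0 with target pfn 5, this prints an order-2 block at pfn 0, an order-1 block at pfn 6 and an order-0 page at pfn 4 being freed, leaving only pfn 5 detached, which matches the blocks the kernel code adds back to the free lists (debug guard pages aside).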
| 9236 | +/* |
---|
| 9237 | + * Take a page that will be marked as poisoned off the buddy allocator. |
---|
| 9238 | + */ |
---|
| 9239 | +bool take_page_off_buddy(struct page *page) |
---|
8355 | 9240 | { |
---|
8356 | 9241 | struct zone *zone = page_zone(page); |
---|
8357 | 9242 | unsigned long pfn = page_to_pfn(page); |
---|
8358 | 9243 | unsigned long flags; |
---|
8359 | 9244 | unsigned int order; |
---|
8360 | | - bool hwpoisoned = false; |
---|
| 9245 | + bool ret = false; |
---|
8361 | 9246 | |
---|
8362 | 9247 | spin_lock_irqsave(&zone->lock, flags); |
---|
8363 | 9248 | for (order = 0; order < MAX_ORDER; order++) { |
---|
8364 | 9249 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
---|
| 9250 | + int page_order = buddy_order(page_head); |
---|
8365 | 9251 | |
---|
8366 | | - if (PageBuddy(page_head) && page_order(page_head) >= order) { |
---|
8367 | | - if (!TestSetPageHWPoison(page)) |
---|
8368 | | - hwpoisoned = true; |
---|
| 9252 | + if (PageBuddy(page_head) && page_order >= order) { |
---|
| 9253 | + unsigned long pfn_head = page_to_pfn(page_head); |
---|
| 9254 | + int migratetype = get_pfnblock_migratetype(page_head, |
---|
| 9255 | + pfn_head); |
---|
| 9256 | + |
---|
| 9257 | + del_page_from_free_list(page_head, zone, page_order); |
---|
| 9258 | + break_down_buddy_pages(zone, page_head, page, 0, |
---|
| 9259 | + page_order, migratetype); |
---|
| 9260 | + if (!is_migrate_isolate(migratetype)) |
---|
| 9261 | + __mod_zone_freepage_state(zone, -1, migratetype); |
---|
| 9262 | + ret = true; |
---|
8369 | 9263 | break; |
---|
8370 | 9264 | } |
---|
| 9265 | + if (page_count(page_head) > 0) |
---|
| 9266 | + break; |
---|
8371 | 9267 | } |
---|
8372 | 9268 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
8373 | | - |
---|
8374 | | - return hwpoisoned; |
---|
| 9269 | + return ret; |
---|
8375 | 9270 | } |
---|
8376 | 9271 | #endif |
---|
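take_page_off_buddy() exists for the memory-failure path: under zone->lock it detaches a still-free page from the freelists so it can be poisoned without ever being handed out again. A hypothetical caller (a sketch, not the actual mm/memory-failure.c code; the function name is invented, while take_page_off_buddy(), SetPageHWPoison() and page_ref_inc() are real kernel interfaces) could look like this:

```c
/* Hypothetical sketch of a memory-failure style caller. */
static bool isolate_free_page_for_poison(struct page *page)
{
	if (!take_page_off_buddy(page))
		return false;	/* not (or no longer) a free buddy page */

	SetPageHWPoison(page);	/* mark it poisoned ... */
	page_ref_inc(page);	/* ... and hold a reference so it cannot re-enter the allocator */
	return true;
}
```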
| 9272 | + |
---|
| 9273 | +#ifdef CONFIG_ZONE_DMA |
---|
| 9274 | +bool has_managed_dma(void) |
---|
| 9275 | +{ |
---|
| 9276 | + struct pglist_data *pgdat; |
---|
| 9277 | + |
---|
| 9278 | + for_each_online_pgdat(pgdat) { |
---|
| 9279 | + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; |
---|
| 9280 | + |
---|
| 9281 | + if (managed_zone(zone)) |
---|
| 9282 | + return true; |
---|
| 9283 | + } |
---|
| 9284 | + return false; |
---|
| 9285 | +} |
---|
| 9286 | +#endif /* CONFIG_ZONE_DMA */ |
---|
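has_managed_dma() reports whether any online node has pages in ZONE_DMA that are actually managed by the buddy allocator; the zone can be configured yet hold no usable memory on a given machine. A hypothetical user (sketch only; the helper and the reserve variable are invented, while alloc_page(), GFP_DMA and __GFP_NOWARN are real) might use it to skip setting up a ZONE_DMA-backed reserve when no such memory exists:

```c
/* Hypothetical: skip a GFP_DMA reserve when ZONE_DMA has no managed pages. */
static struct page *dma_reserve_page;

static void maybe_create_dma_reserve(void)
{
	if (!has_managed_dma())
		return;		/* ZONE_DMA is empty or not configured */

	dma_reserve_page = alloc_page(GFP_DMA | __GFP_NOWARN);
}
```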