.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/mm/page_alloc.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
16 | 17 | |
---|
17 | 18 | #include <linux/stddef.h> |
---|
18 | 19 | #include <linux/mm.h> |
---|
| 20 | +#include <linux/highmem.h> |
---|
19 | 21 | #include <linux/swap.h> |
---|
20 | 22 | #include <linux/interrupt.h> |
---|
21 | 23 | #include <linux/pagemap.h> |
---|
22 | 24 | #include <linux/jiffies.h> |
---|
23 | | -#include <linux/bootmem.h> |
---|
24 | 25 | #include <linux/memblock.h> |
---|
25 | 26 | #include <linux/compiler.h> |
---|
26 | 27 | #include <linux/kernel.h> |
---|
.. | .. |
---|
43 | 44 | #include <linux/mempolicy.h> |
---|
44 | 45 | #include <linux/memremap.h> |
---|
45 | 46 | #include <linux/stop_machine.h> |
---|
| 47 | +#include <linux/random.h> |
---|
46 | 48 | #include <linux/sort.h> |
---|
47 | 49 | #include <linux/pfn.h> |
---|
48 | 50 | #include <linux/backing-dev.h> |
---|
49 | 51 | #include <linux/fault-inject.h> |
---|
50 | 52 | #include <linux/page-isolation.h> |
---|
51 | | -#include <linux/page_ext.h> |
---|
52 | 53 | #include <linux/debugobjects.h> |
---|
53 | 54 | #include <linux/kmemleak.h> |
---|
54 | 55 | #include <linux/compaction.h> |
---|
.. | .. |
---|
60 | 61 | #include <linux/hugetlb.h> |
---|
61 | 62 | #include <linux/sched/rt.h> |
---|
62 | 63 | #include <linux/sched/mm.h> |
---|
63 | | -#include <linux/locallock.h> |
---|
| 64 | +#include <linux/local_lock.h> |
---|
64 | 65 | #include <linux/page_owner.h> |
---|
| 66 | +#include <linux/page_pinner.h> |
---|
65 | 67 | #include <linux/kthread.h> |
---|
66 | 68 | #include <linux/memcontrol.h> |
---|
67 | 69 | #include <linux/ftrace.h> |
---|
68 | 70 | #include <linux/lockdep.h> |
---|
69 | 71 | #include <linux/nmi.h> |
---|
70 | | -#include <linux/khugepaged.h> |
---|
71 | 72 | #include <linux/psi.h> |
---|
| 73 | +#include <linux/padata.h> |
---|
| 74 | +#include <linux/khugepaged.h> |
---|
| 75 | +#include <trace/hooks/mm.h> |
---|
| 76 | +#include <trace/hooks/vmscan.h> |
---|
72 | 77 | |
---|
73 | 78 | #include <asm/sections.h> |
---|
74 | 79 | #include <asm/tlbflush.h> |
---|
75 | 80 | #include <asm/div64.h> |
---|
76 | 81 | #include "internal.h" |
---|
| 82 | +#include "shuffle.h" |
---|
| 83 | +#include "page_reporting.h" |
---|
| 84 | + |
---|
| 85 | +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ |
---|
| 86 | +typedef int __bitwise fpi_t; |
---|
| 87 | + |
---|
| 88 | +/* No special request */ |
---|
| 89 | +#define FPI_NONE ((__force fpi_t)0) |
---|
| 90 | + |
---|
| 91 | +/* |
---|
| 92 | + * Skip free page reporting notification for the (possibly merged) page. |
---|
| 93 | + * This does not hinder free page reporting from grabbing the page, |
---|
| 94 | + * reporting it and marking it "reported" - it only skips notifying |
---|
| 95 | + * the free page reporting infrastructure about a newly freed page. For |
---|
| 96 | + * example, used when temporarily pulling a page from a freelist and |
---|
| 97 | + * putting it back unmodified. |
---|
| 98 | + */ |
---|
| 99 | +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) |
---|
| 100 | + |
---|
| 101 | +/* |
---|
| 102 | + * Place the (possibly merged) page to the tail of the freelist. Will ignore |
---|
| 103 | + * page shuffling (relevant code - e.g., memory onlining - is expected to |
---|
| 104 | + * shuffle the whole zone). |
---|
| 105 | + * |
---|
| 106 | + * Note: No code should rely on this flag for correctness - it's purely |
---|
| 107 | + * to allow for optimizations when handing back either fresh pages |
---|
| 108 | + * (memory onlining) or untouched pages (page isolation, free page |
---|
| 109 | + * reporting). |
---|
| 110 | + */ |
---|
| 111 | +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) |
---|
| 112 | + |
---|
| 113 | +/* |
---|
| 114 | + * Don't poison memory with KASAN (only for the tag-based modes). |
---|
| 115 | + * During boot, all non-reserved memblock memory is exposed to page_alloc. |
---|
| 116 | + * Poisoning all that memory lengthens boot time, especially on systems with |
---|
| 117 | + * a large amount of RAM. This flag is used to skip that poisoning. |
---|
| 118 | + * This is only done for the tag-based KASAN modes, as those are able to |
---|
| 119 | + * detect memory corruptions with the memory tags assigned by default. |
---|
| 120 | + * All memory allocated normally after boot gets poisoned as usual. |
---|
| 121 | + */ |
---|
| 122 | +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) |
---|
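As an editorial aside (not part of the patch): the fpi_t values above follow the kernel's usual bit-flag idiom, where callers OR flags together and callees test individual bits. A minimal userspace sketch of that idiom, reusing the flag names but dropping the sparse-only __bitwise/__force annotations:

```c
#include <stdio.h>

/* illustrative stand-ins for the kernel's fpi_t flags (no sparse __bitwise checking here) */
typedef unsigned int fpi_t;
#define BIT(n)                 (1U << (n))
#define FPI_NONE               ((fpi_t)0)
#define FPI_SKIP_REPORT_NOTIFY ((fpi_t)BIT(0))
#define FPI_TO_TAIL            ((fpi_t)BIT(1))
#define FPI_SKIP_KASAN_POISON  ((fpi_t)BIT(2))

/* a callee tests individual bits of the combined flags */
static void free_one(fpi_t fpi_flags)
{
	if (fpi_flags & FPI_TO_TAIL)
		puts("place at the tail of the freelist");
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		puts("notify free page reporting");
}

int main(void)
{
	free_one(FPI_NONE);                             /* default behaviour */
	free_one(FPI_TO_TAIL | FPI_SKIP_REPORT_NOTIFY); /* callers OR flags together */
	return 0;
}
```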
77 | 123 | |
---|
78 | 124 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
---|
79 | 125 | static DEFINE_MUTEX(pcp_batch_high_lock); |
---|
.. | .. |
---|
95 | 141 | */ |
---|
96 | 142 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
---|
97 | 143 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
---|
98 | | -int _node_numa_mem_[MAX_NUMNODES]; |
---|
99 | 144 | #endif |
---|
100 | 145 | |
---|
101 | 146 | /* work_structs for global per-cpu drains */ |
---|
102 | | -DEFINE_MUTEX(pcpu_drain_mutex); |
---|
103 | | -DEFINE_PER_CPU(struct work_struct, pcpu_drain); |
---|
| 147 | +struct pcpu_drain { |
---|
| 148 | + struct zone *zone; |
---|
| 149 | + struct work_struct work; |
---|
| 150 | +}; |
---|
| 151 | +static DEFINE_MUTEX(pcpu_drain_mutex); |
---|
| 152 | +static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); |
---|
104 | 153 | |
---|
105 | 154 | #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY |
---|
106 | 155 | volatile unsigned long latent_entropy __latent_entropy; |
---|
.. | .. |
---|
124 | 173 | }; |
---|
125 | 174 | EXPORT_SYMBOL(node_states); |
---|
126 | 175 | |
---|
127 | | -/* Protect totalram_pages and zone->managed_pages */ |
---|
128 | | -static DEFINE_SPINLOCK(managed_page_count_lock); |
---|
129 | | - |
---|
130 | | -unsigned long totalram_pages __read_mostly; |
---|
| 176 | +atomic_long_t _totalram_pages __read_mostly; |
---|
| 177 | +EXPORT_SYMBOL(_totalram_pages); |
---|
131 | 178 | unsigned long totalreserve_pages __read_mostly; |
---|
132 | 179 | unsigned long totalcma_pages __read_mostly; |
---|
133 | 180 | |
---|
134 | 181 | int percpu_pagelist_fraction; |
---|
135 | 182 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
---|
136 | | -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON |
---|
137 | | -DEFINE_STATIC_KEY_TRUE(init_on_alloc); |
---|
138 | | -#else |
---|
139 | 183 | DEFINE_STATIC_KEY_FALSE(init_on_alloc); |
---|
140 | | -#endif |
---|
141 | 184 | EXPORT_SYMBOL(init_on_alloc); |
---|
142 | 185 | |
---|
143 | | -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON |
---|
144 | | -DEFINE_STATIC_KEY_TRUE(init_on_free); |
---|
145 | | -#else |
---|
146 | 186 | DEFINE_STATIC_KEY_FALSE(init_on_free); |
---|
147 | | -#endif |
---|
148 | 187 | EXPORT_SYMBOL(init_on_free); |
---|
149 | 188 | |
---|
| 189 | +static bool _init_on_alloc_enabled_early __read_mostly |
---|
| 190 | + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); |
---|
150 | 191 | static int __init early_init_on_alloc(char *buf) |
---|
151 | 192 | { |
---|
152 | | - int ret; |
---|
153 | | - bool bool_result; |
---|
154 | 193 | |
---|
155 | | - if (!buf) |
---|
156 | | - return -EINVAL; |
---|
157 | | - ret = kstrtobool(buf, &bool_result); |
---|
158 | | - if (bool_result && page_poisoning_enabled()) |
---|
159 | | - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); |
---|
160 | | - if (bool_result) |
---|
161 | | - static_branch_enable(&init_on_alloc); |
---|
162 | | - else |
---|
163 | | - static_branch_disable(&init_on_alloc); |
---|
164 | | - return ret; |
---|
| 194 | + return kstrtobool(buf, &_init_on_alloc_enabled_early); |
---|
165 | 195 | } |
---|
166 | 196 | early_param("init_on_alloc", early_init_on_alloc); |
---|
167 | 197 | |
---|
| 198 | +static bool _init_on_free_enabled_early __read_mostly |
---|
| 199 | + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); |
---|
168 | 200 | static int __init early_init_on_free(char *buf) |
---|
169 | 201 | { |
---|
170 | | - int ret; |
---|
171 | | - bool bool_result; |
---|
172 | | - |
---|
173 | | - if (!buf) |
---|
174 | | - return -EINVAL; |
---|
175 | | - ret = kstrtobool(buf, &bool_result); |
---|
176 | | - if (bool_result && page_poisoning_enabled()) |
---|
177 | | - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); |
---|
178 | | - if (bool_result) |
---|
179 | | - static_branch_enable(&init_on_free); |
---|
180 | | - else |
---|
181 | | - static_branch_disable(&init_on_free); |
---|
182 | | - return ret; |
---|
| 202 | + return kstrtobool(buf, &_init_on_free_enabled_early); |
---|
183 | 203 | } |
---|
184 | 204 | early_param("init_on_free", early_init_on_free); |
---|
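Outside the diff, a hedged userspace sketch of the pattern these two hunks introduce: the early params now only record a boolean request, and a single later step (init_mem_debugging_and_hardening(), added further down) resolves the precedence against page poisoning. parse_bool() is a stand-in for the kernel's kstrtobool():

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* userspace stand-in for kstrtobool(): accepts "0/1/y/n/on/off"-style strings */
static int parse_bool(const char *s, bool *res)
{
	if (!s)
		return -1;
	if (!strcmp(s, "1") || !strcmp(s, "y") || !strcmp(s, "on"))
		*res = true;
	else if (!strcmp(s, "0") || !strcmp(s, "n") || !strcmp(s, "off"))
		*res = false;
	else
		return -1;
	return 0;
}

/* the early param only records the request ... */
static bool init_on_alloc_early = false;
static bool page_poisoning_requested = true;	/* pretend CONFIG_PAGE_POISONING won */

int main(void)
{
	parse_bool("on", &init_on_alloc_early);

	/* ... and a single later init step resolves precedence, as
	 * init_mem_debugging_and_hardening() now does in the kernel */
	if (init_on_alloc_early && page_poisoning_requested)
		puts("page poisoning takes precedence over init_on_alloc");
	else if (init_on_alloc_early)
		puts("enable the init_on_alloc static key");
	return 0;
}
```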
185 | 205 | |
---|
.. | .. |
---|
243 | 263 | unsigned int pageblock_order __read_mostly; |
---|
244 | 264 | #endif |
---|
245 | 265 | |
---|
246 | | -static void __free_pages_ok(struct page *page, unsigned int order); |
---|
| 266 | +static void __free_pages_ok(struct page *page, unsigned int order, |
---|
| 267 | + fpi_t fpi_flags); |
---|
247 | 268 | |
---|
248 | 269 | /* |
---|
249 | 270 | * results with 256, 32 in the lowmem_reserve sysctl: |
---|
.. | .. |
---|
270 | 291 | [ZONE_MOVABLE] = 0, |
---|
271 | 292 | }; |
---|
272 | 293 | |
---|
273 | | -EXPORT_SYMBOL(totalram_pages); |
---|
274 | | - |
---|
275 | 294 | static char * const zone_names[MAX_NR_ZONES] = { |
---|
276 | 295 | #ifdef CONFIG_ZONE_DMA |
---|
277 | 296 | "DMA", |
---|
.. | .. |
---|
289 | 308 | #endif |
---|
290 | 309 | }; |
---|
291 | 310 | |
---|
292 | | -char * const migratetype_names[MIGRATE_TYPES] = { |
---|
| 311 | +const char * const migratetype_names[MIGRATE_TYPES] = { |
---|
293 | 312 | "Unmovable", |
---|
294 | 313 | "Movable", |
---|
295 | 314 | "Reclaimable", |
---|
.. | .. |
---|
302 | 321 | #endif |
---|
303 | 322 | }; |
---|
304 | 323 | |
---|
305 | | -compound_page_dtor * const compound_page_dtors[] = { |
---|
306 | | - NULL, |
---|
307 | | - free_compound_page, |
---|
| 324 | +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { |
---|
| 325 | + [NULL_COMPOUND_DTOR] = NULL, |
---|
| 326 | + [COMPOUND_PAGE_DTOR] = free_compound_page, |
---|
308 | 327 | #ifdef CONFIG_HUGETLB_PAGE |
---|
309 | | - free_huge_page, |
---|
| 328 | + [HUGETLB_PAGE_DTOR] = free_huge_page, |
---|
310 | 329 | #endif |
---|
311 | 330 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
312 | | - free_transhuge_page, |
---|
| 331 | + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, |
---|
313 | 332 | #endif |
---|
314 | 333 | }; |
---|
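A small userspace toy (not kernel code) showing why compound_page_dtors[] was switched to designated initializers: each destructor now lands in its enum slot even when config options remove some entries or the list is reordered:

```c
#include <stdio.h>

/* toy analogue of NULL_COMPOUND_DTOR / COMPOUND_PAGE_DTOR / HUGETLB_PAGE_DTOR */
enum compound_dtor_id {
	NULL_COMPOUND_DTOR,
	COMPOUND_PAGE_DTOR,
	HUGETLB_PAGE_DTOR,
	NR_COMPOUND_DTORS,
};

typedef void (*compound_page_dtor)(void);

static void free_compound(void) { puts("free_compound_page"); }
static void free_hugetlb(void)  { puts("free_huge_page"); }

static compound_page_dtor const dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound,
	[HUGETLB_PAGE_DTOR]  = free_hugetlb,	/* stays in the right slot regardless of order */
};

int main(void)
{
	enum compound_dtor_id id = COMPOUND_PAGE_DTOR;

	if (dtors[id])
		dtors[id]();
	return 0;
}
```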
315 | 334 | |
---|
.. | .. |
---|
320 | 339 | */ |
---|
321 | 340 | int min_free_kbytes = 1024; |
---|
322 | 341 | int user_min_free_kbytes = -1; |
---|
| 342 | +#ifdef CONFIG_DISCONTIGMEM |
---|
| 343 | +/* |
---|
| 344 | + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges |
---|
| 345 | + * are not on separate NUMA nodes. Functionally this works but with |
---|
| 346 | + * watermark_boost_factor, it can reclaim prematurely as the ranges can be |
---|
| 347 | + * quite small. By default, do not boost watermarks on discontigmem as in |
---|
| 348 | + * many cases very high-order allocations like THP are likely to be |
---|
| 349 | + * unsupported and the premature reclaim offsets the advantage of long-term |
---|
| 350 | + * fragmentation avoidance. |
---|
| 351 | + */ |
---|
| 352 | +int watermark_boost_factor __read_mostly; |
---|
| 353 | +#else |
---|
| 354 | +int watermark_boost_factor __read_mostly = 15000; |
---|
| 355 | +#endif |
---|
323 | 356 | int watermark_scale_factor = 10; |
---|
324 | 357 | |
---|
325 | 358 | /* |
---|
.. | .. |
---|
329 | 362 | */ |
---|
330 | 363 | int extra_free_kbytes = 0; |
---|
331 | 364 | |
---|
332 | | -static unsigned long nr_kernel_pages __meminitdata; |
---|
333 | | -static unsigned long nr_all_pages __meminitdata; |
---|
334 | | -static unsigned long dma_reserve __meminitdata; |
---|
| 365 | +static unsigned long nr_kernel_pages __initdata; |
---|
| 366 | +static unsigned long nr_all_pages __initdata; |
---|
| 367 | +static unsigned long dma_reserve __initdata; |
---|
335 | 368 | |
---|
336 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
337 | | -static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; |
---|
338 | | -static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; |
---|
| 369 | +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; |
---|
| 370 | +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; |
---|
339 | 371 | static unsigned long required_kernelcore __initdata; |
---|
340 | 372 | static unsigned long required_kernelcore_percent __initdata; |
---|
341 | 373 | static unsigned long required_movablecore __initdata; |
---|
342 | 374 | static unsigned long required_movablecore_percent __initdata; |
---|
343 | | -static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; |
---|
| 375 | +static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; |
---|
344 | 376 | static bool mirrored_kernelcore __meminitdata; |
---|
345 | 377 | |
---|
346 | 378 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
---|
347 | 379 | int movable_zone; |
---|
348 | 380 | EXPORT_SYMBOL(movable_zone); |
---|
349 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
350 | 381 | |
---|
351 | 382 | #if MAX_NUMNODES > 1 |
---|
352 | | -int nr_node_ids __read_mostly = MAX_NUMNODES; |
---|
353 | | -int nr_online_nodes __read_mostly = 1; |
---|
| 383 | +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; |
---|
| 384 | +unsigned int nr_online_nodes __read_mostly = 1; |
---|
354 | 385 | EXPORT_SYMBOL(nr_node_ids); |
---|
355 | 386 | EXPORT_SYMBOL(nr_online_nodes); |
---|
356 | 387 | #endif |
---|
357 | 388 | |
---|
358 | | -static DEFINE_LOCAL_IRQ_LOCK(pa_lock); |
---|
359 | | - |
---|
360 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
---|
361 | | -# define cpu_lock_irqsave(cpu, flags) \ |
---|
362 | | - local_lock_irqsave_on(pa_lock, flags, cpu) |
---|
363 | | -# define cpu_unlock_irqrestore(cpu, flags) \ |
---|
364 | | - local_unlock_irqrestore_on(pa_lock, flags, cpu) |
---|
365 | | -#else |
---|
366 | | -# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) |
---|
367 | | -# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) |
---|
368 | | -#endif |
---|
| 389 | +struct pa_lock { |
---|
| 390 | + local_lock_t l; |
---|
| 391 | +}; |
---|
| 392 | +static DEFINE_PER_CPU(struct pa_lock, pa_lock) = { |
---|
| 393 | + .l = INIT_LOCAL_LOCK(l), |
---|
| 394 | +}; |
---|
369 | 395 | |
---|
370 | 396 | int page_group_by_mobility_disabled __read_mostly; |
---|
371 | 397 | |
---|
.. | .. |
---|
378 | 404 | static DEFINE_STATIC_KEY_TRUE(deferred_pages); |
---|
379 | 405 | |
---|
380 | 406 | /* |
---|
381 | | - * Calling kasan_free_pages() only after deferred memory initialization |
---|
| 407 | + * Calling kasan_poison_pages() only after deferred memory initialization |
---|
382 | 408 | * has completed. Poisoning pages during deferred memory init will greatly |
---|
383 | 409 | * lengthen the process and cause problems in large memory systems as the |
---|
384 | 410 | * deferred pages initialization is done with interrupt disabled. |
---|
.. | .. |
---|
390 | 416 | * on-demand allocation and then freed again before the deferred pages |
---|
391 | 417 | * initialization is done, but this is not likely to happen. |
---|
392 | 418 | */ |
---|
393 | | -static inline void kasan_free_nondeferred_pages(struct page *page, int order) |
---|
| 419 | +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
---|
394 | 420 | { |
---|
395 | | - if (!static_branch_unlikely(&deferred_pages)) |
---|
396 | | - kasan_free_pages(page, order); |
---|
| 421 | + return static_branch_unlikely(&deferred_pages) || |
---|
| 422 | + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
---|
| 423 | + (fpi_flags & FPI_SKIP_KASAN_POISON)) || |
---|
| 424 | + PageSkipKASanPoison(page); |
---|
397 | 425 | } |
---|
398 | 426 | |
---|
399 | 427 | /* Returns true if the struct page for the pfn is uninitialised */ |
---|
.. | .. |
---|
408 | 436 | } |
---|
409 | 437 | |
---|
410 | 438 | /* |
---|
411 | | - * Returns false when the remaining initialisation should be deferred until |
---|
| 439 | + * Returns true when the remaining initialisation should be deferred until |
---|
412 | 440 | * later in the boot cycle when it can be parallelised. |
---|
413 | 441 | */ |
---|
414 | | -static inline bool update_defer_init(pg_data_t *pgdat, |
---|
415 | | - unsigned long pfn, unsigned long zone_end, |
---|
416 | | - unsigned long *nr_initialised) |
---|
| 442 | +static bool __meminit |
---|
| 443 | +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
---|
417 | 444 | { |
---|
418 | | - /* Always populate low zones for address-constrained allocations */ |
---|
419 | | - if (zone_end < pgdat_end_pfn(pgdat)) |
---|
420 | | - return true; |
---|
421 | | - (*nr_initialised)++; |
---|
422 | | - if ((*nr_initialised > pgdat->static_init_pgcnt) && |
---|
423 | | - (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
---|
424 | | - pgdat->first_deferred_pfn = pfn; |
---|
425 | | - return false; |
---|
| 445 | + static unsigned long prev_end_pfn, nr_initialised; |
---|
| 446 | + |
---|
| 447 | + /* |
---|
| 448 | + * prev_end_pfn is a static that holds the end of the previous zone. |
---|
| 449 | + * No need to protect it: this is called very early in boot, before smp_init. |
---|
| 450 | + */ |
---|
| 451 | + if (prev_end_pfn != end_pfn) { |
---|
| 452 | + prev_end_pfn = end_pfn; |
---|
| 453 | + nr_initialised = 0; |
---|
426 | 454 | } |
---|
427 | 455 | |
---|
428 | | - return true; |
---|
| 456 | + /* Always populate low zones for address-constrained allocations */ |
---|
| 457 | + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
---|
| 458 | + return false; |
---|
| 459 | + |
---|
| 460 | + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) |
---|
| 461 | + return true; |
---|
| 462 | + /* |
---|
| 463 | + * We start only with one section of pages, more pages are added as |
---|
| 464 | + * needed until the rest of deferred pages are initialized. |
---|
| 465 | + */ |
---|
| 466 | + nr_initialised++; |
---|
| 467 | + if ((nr_initialised > PAGES_PER_SECTION) && |
---|
| 468 | + (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
---|
| 469 | + NODE_DATA(nid)->first_deferred_pfn = pfn; |
---|
| 470 | + return true; |
---|
| 471 | + } |
---|
| 472 | + return false; |
---|
429 | 473 | } |
---|
430 | 474 | #else |
---|
431 | | -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) |
---|
| 475 | +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
---|
| 476 | +{ |
---|
| 477 | + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
---|
| 478 | + (fpi_flags & FPI_SKIP_KASAN_POISON)) || |
---|
| 479 | + PageSkipKASanPoison(page); |
---|
| 480 | +} |
---|
432 | 481 | |
---|
433 | 482 | static inline bool early_page_uninitialised(unsigned long pfn) |
---|
434 | 483 | { |
---|
435 | 484 | return false; |
---|
436 | 485 | } |
---|
437 | 486 | |
---|
438 | | -static inline bool update_defer_init(pg_data_t *pgdat, |
---|
439 | | - unsigned long pfn, unsigned long zone_end, |
---|
440 | | - unsigned long *nr_initialised) |
---|
| 487 | +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
---|
441 | 488 | { |
---|
442 | | - return true; |
---|
| 489 | + return false; |
---|
443 | 490 | } |
---|
444 | 491 | #endif |
---|
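For illustration only, a single-node userspace approximation of the cut-off logic in the new defer_init(): low zones are always populated, one section's worth of pages is initialised eagerly, and the first section-aligned pfn past that point becomes first_deferred_pfn. The PAGES_PER_SECTION value below is made up and the pgdat state is collapsed into plain variables:

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_SECTION (1UL << 15)	/* illustrative value only */

/* simplified, single-node defer_init(): true means "defer this pfn" */
static bool defer_init(unsigned long pfn, unsigned long end_pfn,
		       unsigned long low_zones_end, unsigned long *nr_initialised,
		       unsigned long *first_deferred_pfn)
{
	/* always populate low zones for address-constrained allocations */
	if (end_pfn < low_zones_end)
		return false;

	if (*first_deferred_pfn != ~0UL)
		return true;			/* already past the cut-off */

	(*nr_initialised)++;
	if (*nr_initialised > PAGES_PER_SECTION &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		*first_deferred_pfn = pfn;	/* deferred init resumes here later */
		return true;
	}
	return false;
}

int main(void)
{
	unsigned long nr_initialised = 0, first_deferred = ~0UL, deferred = 0;
	unsigned long end = 4 * PAGES_PER_SECTION;

	for (unsigned long pfn = 0; pfn < end; pfn++)
		if (defer_init(pfn, end, 0, &nr_initialised, &first_deferred))
			deferred++;

	printf("eagerly initialised %lu pfns, deferred %lu (cut-off at %#lx)\n",
	       end - deferred, deferred, first_deferred);
	return 0;
}
```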
445 | 492 | |
---|
.. | .. |
---|
448 | 495 | unsigned long pfn) |
---|
449 | 496 | { |
---|
450 | 497 | #ifdef CONFIG_SPARSEMEM |
---|
451 | | - return __pfn_to_section(pfn)->pageblock_flags; |
---|
| 498 | + return section_to_usemap(__pfn_to_section(pfn)); |
---|
452 | 499 | #else |
---|
453 | 500 | return page_zone(page)->pageblock_flags; |
---|
454 | 501 | #endif /* CONFIG_SPARSEMEM */ |
---|
.. | .. |
---|
458 | 505 | { |
---|
459 | 506 | #ifdef CONFIG_SPARSEMEM |
---|
460 | 507 | pfn &= (PAGES_PER_SECTION-1); |
---|
461 | | - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
462 | 508 | #else |
---|
463 | 509 | pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); |
---|
464 | | - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
465 | 510 | #endif /* CONFIG_SPARSEMEM */ |
---|
| 511 | + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
---|
466 | 512 | } |
---|
467 | 513 | |
---|
468 | 514 | /** |
---|
469 | 515 | * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages |
---|
470 | 516 | * @page: The page within the block of interest |
---|
471 | 517 | * @pfn: The target page frame number |
---|
472 | | - * @end_bitidx: The last bit of interest to retrieve |
---|
473 | 518 | * @mask: mask of bits that the caller is interested in |
---|
474 | 519 | * |
---|
475 | 520 | * Return: pageblock_bits flags |
---|
476 | 521 | */ |
---|
477 | | -static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, |
---|
| 522 | +static __always_inline |
---|
| 523 | +unsigned long __get_pfnblock_flags_mask(struct page *page, |
---|
478 | 524 | unsigned long pfn, |
---|
479 | | - unsigned long end_bitidx, |
---|
480 | 525 | unsigned long mask) |
---|
481 | 526 | { |
---|
482 | 527 | unsigned long *bitmap; |
---|
.. | .. |
---|
489 | 534 | bitidx &= (BITS_PER_LONG-1); |
---|
490 | 535 | |
---|
491 | 536 | word = bitmap[word_bitidx]; |
---|
492 | | - bitidx += end_bitidx; |
---|
493 | | - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; |
---|
| 537 | + return (word >> bitidx) & mask; |
---|
494 | 538 | } |
---|
495 | 539 | |
---|
496 | 540 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
---|
497 | | - unsigned long end_bitidx, |
---|
498 | 541 | unsigned long mask) |
---|
499 | 542 | { |
---|
500 | | - return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); |
---|
| 543 | + return __get_pfnblock_flags_mask(page, pfn, mask); |
---|
501 | 544 | } |
---|
| 545 | +EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask); |
---|
| 546 | + |
---|
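As a side note, the reworked helpers read and write pageblock flags with a plain shift-and-mask now that the end_bitidx parameter is gone. A standalone model of that bit math (NR_PAGEBLOCK_BITS is 4 in the kernel: the migratetype bits plus the skip bit); the cmpxchg retry loop the kernel needs under concurrency is omitted here:

```c
#include <stdio.h>

#define NR_PAGEBLOCK_BITS 4
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* mirrors the reworked __get_pfnblock_flags_mask(): shift right, then mask */
static unsigned long get_block_flags(const unsigned long *bitmap,
				     unsigned long block, unsigned long mask)
{
	unsigned long bitidx = (block * NR_PAGEBLOCK_BITS) % BITS_PER_LONG;
	unsigned long word = bitmap[(block * NR_PAGEBLOCK_BITS) / BITS_PER_LONG];

	return (word >> bitidx) & mask;
}

/* mirrors set_pfnblock_flags_mask(), minus the cmpxchg loop */
static void set_block_flags(unsigned long *bitmap, unsigned long block,
			    unsigned long flags, unsigned long mask)
{
	unsigned long bitidx = (block * NR_PAGEBLOCK_BITS) % BITS_PER_LONG;
	unsigned long *word = &bitmap[(block * NR_PAGEBLOCK_BITS) / BITS_PER_LONG];

	*word = (*word & ~(mask << bitidx)) | ((flags & mask) << bitidx);
}

int main(void)
{
	unsigned long bitmap[2] = { 0 };
	unsigned long migratetype_mask = 0x7;	/* analogue of MIGRATETYPE_MASK */

	set_block_flags(bitmap, 5, 2 /* e.g. "reclaimable" */, migratetype_mask);
	printf("block 5 migratetype = %lu\n",
	       get_block_flags(bitmap, 5, migratetype_mask));
	return 0;
}
```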
| 547 | +int isolate_anon_lru_page(struct page *page) |
---|
| 548 | +{ |
---|
| 549 | + int ret; |
---|
| 550 | + |
---|
| 551 | + if (!PageLRU(page) || !PageAnon(page)) |
---|
| 552 | + return -EINVAL; |
---|
| 553 | + |
---|
| 554 | + if (!get_page_unless_zero(page)) |
---|
| 555 | + return -EINVAL; |
---|
| 556 | + |
---|
| 557 | + ret = isolate_lru_page(page); |
---|
| 558 | + put_page(page); |
---|
| 559 | + |
---|
| 560 | + return ret; |
---|
| 561 | +} |
---|
| 562 | +EXPORT_SYMBOL_GPL(isolate_anon_lru_page); |
---|
502 | 563 | |
---|
503 | 564 | static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) |
---|
504 | 565 | { |
---|
505 | | - return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); |
---|
| 566 | + return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); |
---|
506 | 567 | } |
---|
507 | 568 | |
---|
508 | 569 | /** |
---|
.. | .. |
---|
510 | 571 | * @page: The page within the block of interest |
---|
511 | 572 | * @flags: The flags to set |
---|
512 | 573 | * @pfn: The target page frame number |
---|
513 | | - * @end_bitidx: The last bit of interest |
---|
514 | 574 | * @mask: mask of bits that the caller is interested in |
---|
515 | 575 | */ |
---|
516 | 576 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
---|
517 | 577 | unsigned long pfn, |
---|
518 | | - unsigned long end_bitidx, |
---|
519 | 578 | unsigned long mask) |
---|
520 | 579 | { |
---|
521 | 580 | unsigned long *bitmap; |
---|
.. | .. |
---|
523 | 582 | unsigned long old_word, word; |
---|
524 | 583 | |
---|
525 | 584 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
---|
| 585 | + BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); |
---|
526 | 586 | |
---|
527 | 587 | bitmap = get_pageblock_bitmap(page, pfn); |
---|
528 | 588 | bitidx = pfn_to_bitidx(page, pfn); |
---|
.. | .. |
---|
531 | 591 | |
---|
532 | 592 | VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); |
---|
533 | 593 | |
---|
534 | | - bitidx += end_bitidx; |
---|
535 | | - mask <<= (BITS_PER_LONG - bitidx - 1); |
---|
536 | | - flags <<= (BITS_PER_LONG - bitidx - 1); |
---|
| 594 | + mask <<= bitidx; |
---|
| 595 | + flags <<= bitidx; |
---|
537 | 596 | |
---|
538 | 597 | word = READ_ONCE(bitmap[word_bitidx]); |
---|
539 | 598 | for (;;) { |
---|
.. | .. |
---|
550 | 609 | migratetype < MIGRATE_PCPTYPES)) |
---|
551 | 610 | migratetype = MIGRATE_UNMOVABLE; |
---|
552 | 611 | |
---|
553 | | - set_pageblock_flags_group(page, (unsigned long)migratetype, |
---|
554 | | - PB_migrate, PB_migrate_end); |
---|
| 612 | + set_pfnblock_flags_mask(page, (unsigned long)migratetype, |
---|
| 613 | + page_to_pfn(page), MIGRATETYPE_MASK); |
---|
555 | 614 | } |
---|
556 | 615 | |
---|
557 | 616 | #ifdef CONFIG_DEBUG_VM |
---|
.. | .. |
---|
606 | 665 | } |
---|
607 | 666 | #endif |
---|
608 | 667 | |
---|
609 | | -static void bad_page(struct page *page, const char *reason, |
---|
610 | | - unsigned long bad_flags) |
---|
| 668 | +static void bad_page(struct page *page, const char *reason) |
---|
611 | 669 | { |
---|
612 | 670 | static unsigned long resume; |
---|
613 | 671 | static unsigned long nr_shown; |
---|
.. | .. |
---|
636 | 694 | pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
---|
637 | 695 | current->comm, page_to_pfn(page)); |
---|
638 | 696 | __dump_page(page, reason); |
---|
639 | | - bad_flags &= page->flags; |
---|
640 | | - if (bad_flags) |
---|
641 | | - pr_alert("bad because of flags: %#lx(%pGp)\n", |
---|
642 | | - bad_flags, &bad_flags); |
---|
643 | 697 | dump_page_owner(page); |
---|
644 | 698 | |
---|
645 | 699 | print_modules(); |
---|
.. | .. |
---|
667 | 721 | |
---|
668 | 722 | void free_compound_page(struct page *page) |
---|
669 | 723 | { |
---|
670 | | - __free_pages_ok(page, compound_order(page)); |
---|
| 724 | + mem_cgroup_uncharge(page); |
---|
| 725 | + __free_pages_ok(page, compound_order(page), FPI_NONE); |
---|
671 | 726 | } |
---|
672 | 727 | |
---|
673 | 728 | void prep_compound_page(struct page *page, unsigned int order) |
---|
.. | .. |
---|
675 | 730 | int i; |
---|
676 | 731 | int nr_pages = 1 << order; |
---|
677 | 732 | |
---|
678 | | - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); |
---|
679 | | - set_compound_order(page, order); |
---|
680 | 733 | __SetPageHead(page); |
---|
681 | 734 | for (i = 1; i < nr_pages; i++) { |
---|
682 | 735 | struct page *p = page + i; |
---|
.. | .. |
---|
684 | 737 | p->mapping = TAIL_MAPPING; |
---|
685 | 738 | set_compound_head(p, page); |
---|
686 | 739 | } |
---|
| 740 | + |
---|
| 741 | + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); |
---|
| 742 | + set_compound_order(page, order); |
---|
687 | 743 | atomic_set(compound_mapcount_ptr(page), -1); |
---|
| 744 | + if (hpage_pincount_available(page)) |
---|
| 745 | + atomic_set(compound_pincount_ptr(page), 0); |
---|
688 | 746 | } |
---|
689 | 747 | |
---|
690 | 748 | #ifdef CONFIG_DEBUG_PAGEALLOC |
---|
691 | 749 | unsigned int _debug_guardpage_minorder; |
---|
692 | | -bool _debug_pagealloc_enabled __read_mostly |
---|
| 750 | + |
---|
| 751 | +bool _debug_pagealloc_enabled_early __read_mostly |
---|
693 | 752 | = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); |
---|
| 753 | +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); |
---|
| 754 | +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); |
---|
694 | 755 | EXPORT_SYMBOL(_debug_pagealloc_enabled); |
---|
695 | | -bool _debug_guardpage_enabled __read_mostly; |
---|
| 756 | + |
---|
| 757 | +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); |
---|
696 | 758 | |
---|
697 | 759 | static int __init early_debug_pagealloc(char *buf) |
---|
698 | 760 | { |
---|
699 | | - if (!buf) |
---|
700 | | - return -EINVAL; |
---|
701 | | - return kstrtobool(buf, &_debug_pagealloc_enabled); |
---|
| 761 | + return kstrtobool(buf, &_debug_pagealloc_enabled_early); |
---|
702 | 762 | } |
---|
703 | 763 | early_param("debug_pagealloc", early_debug_pagealloc); |
---|
704 | | - |
---|
705 | | -static bool need_debug_guardpage(void) |
---|
706 | | -{ |
---|
707 | | - /* If we don't use debug_pagealloc, we don't need guard page */ |
---|
708 | | - if (!debug_pagealloc_enabled()) |
---|
709 | | - return false; |
---|
710 | | - |
---|
711 | | - if (!debug_guardpage_minorder()) |
---|
712 | | - return false; |
---|
713 | | - |
---|
714 | | - return true; |
---|
715 | | -} |
---|
716 | | - |
---|
717 | | -static void init_debug_guardpage(void) |
---|
718 | | -{ |
---|
719 | | - if (!debug_pagealloc_enabled()) |
---|
720 | | - return; |
---|
721 | | - |
---|
722 | | - if (!debug_guardpage_minorder()) |
---|
723 | | - return; |
---|
724 | | - |
---|
725 | | - _debug_guardpage_enabled = true; |
---|
726 | | -} |
---|
727 | | - |
---|
728 | | -struct page_ext_operations debug_guardpage_ops = { |
---|
729 | | - .need = need_debug_guardpage, |
---|
730 | | - .init = init_debug_guardpage, |
---|
731 | | -}; |
---|
732 | 764 | |
---|
733 | 765 | static int __init debug_guardpage_minorder_setup(char *buf) |
---|
734 | 766 | { |
---|
.. | .. |
---|
747 | 779 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
---|
748 | 780 | unsigned int order, int migratetype) |
---|
749 | 781 | { |
---|
750 | | - struct page_ext *page_ext; |
---|
751 | | - |
---|
752 | 782 | if (!debug_guardpage_enabled()) |
---|
753 | 783 | return false; |
---|
754 | 784 | |
---|
755 | 785 | if (order >= debug_guardpage_minorder()) |
---|
756 | 786 | return false; |
---|
757 | 787 | |
---|
758 | | - page_ext = lookup_page_ext(page); |
---|
759 | | - if (unlikely(!page_ext)) |
---|
760 | | - return false; |
---|
761 | | - |
---|
762 | | - __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
---|
763 | | - |
---|
| 788 | + __SetPageGuard(page); |
---|
764 | 789 | INIT_LIST_HEAD(&page->lru); |
---|
765 | 790 | set_page_private(page, order); |
---|
766 | 791 | /* Guard pages are not available for any usage */ |
---|
.. | .. |
---|
772 | 797 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
---|
773 | 798 | unsigned int order, int migratetype) |
---|
774 | 799 | { |
---|
775 | | - struct page_ext *page_ext; |
---|
776 | | - |
---|
777 | 800 | if (!debug_guardpage_enabled()) |
---|
778 | 801 | return; |
---|
779 | 802 | |
---|
780 | | - page_ext = lookup_page_ext(page); |
---|
781 | | - if (unlikely(!page_ext)) |
---|
782 | | - return; |
---|
783 | | - |
---|
784 | | - __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); |
---|
| 803 | + __ClearPageGuard(page); |
---|
785 | 804 | |
---|
786 | 805 | set_page_private(page, 0); |
---|
787 | 806 | if (!is_migrate_isolate(migratetype)) |
---|
788 | 807 | __mod_zone_freepage_state(zone, (1 << order), migratetype); |
---|
789 | 808 | } |
---|
790 | 809 | #else |
---|
791 | | -struct page_ext_operations debug_guardpage_ops; |
---|
792 | 810 | static inline bool set_page_guard(struct zone *zone, struct page *page, |
---|
793 | 811 | unsigned int order, int migratetype) { return false; } |
---|
794 | 812 | static inline void clear_page_guard(struct zone *zone, struct page *page, |
---|
795 | 813 | unsigned int order, int migratetype) {} |
---|
796 | 814 | #endif |
---|
797 | 815 | |
---|
798 | | -static inline void set_page_order(struct page *page, unsigned int order) |
---|
| 816 | +/* |
---|
| 817 | + * Enable static keys related to various memory debugging and hardening options. |
---|
| 818 | + * Some override others, and depend on early params that are evaluated in the |
---|
| 819 | + * order of appearance. So we need to first gather the full picture of what was |
---|
| 820 | + * enabled, and then make decisions. |
---|
| 821 | + */ |
---|
| 822 | +void init_mem_debugging_and_hardening(void) |
---|
| 823 | +{ |
---|
| 824 | + bool page_poisoning_requested = false; |
---|
| 825 | + |
---|
| 826 | +#ifdef CONFIG_PAGE_POISONING |
---|
| 827 | + /* |
---|
| 828 | + * Page poisoning is debug page alloc for some arches. If |
---|
| 829 | + * either of those options is enabled, enable poisoning. |
---|
| 830 | + */ |
---|
| 831 | + if (page_poisoning_enabled() || |
---|
| 832 | + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && |
---|
| 833 | + debug_pagealloc_enabled())) { |
---|
| 834 | + static_branch_enable(&_page_poisoning_enabled); |
---|
| 835 | + page_poisoning_requested = true; |
---|
| 836 | + } |
---|
| 837 | +#endif |
---|
| 838 | + |
---|
| 839 | + if (_init_on_alloc_enabled_early) { |
---|
| 840 | + if (page_poisoning_requested) |
---|
| 841 | + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
---|
| 842 | + "will take precedence over init_on_alloc\n"); |
---|
| 843 | + else |
---|
| 844 | + static_branch_enable(&init_on_alloc); |
---|
| 845 | + } |
---|
| 846 | + if (_init_on_free_enabled_early) { |
---|
| 847 | + if (page_poisoning_requested) |
---|
| 848 | + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
---|
| 849 | + "will take precedence over init_on_free\n"); |
---|
| 850 | + else |
---|
| 851 | + static_branch_enable(&init_on_free); |
---|
| 852 | + } |
---|
| 853 | + |
---|
| 854 | +#ifdef CONFIG_DEBUG_PAGEALLOC |
---|
| 855 | + if (!debug_pagealloc_enabled()) |
---|
| 856 | + return; |
---|
| 857 | + |
---|
| 858 | + static_branch_enable(&_debug_pagealloc_enabled); |
---|
| 859 | + |
---|
| 860 | + if (!debug_guardpage_minorder()) |
---|
| 861 | + return; |
---|
| 862 | + |
---|
| 863 | + static_branch_enable(&_debug_guardpage_enabled); |
---|
| 864 | +#endif |
---|
| 865 | +} |
---|
| 866 | + |
---|
| 867 | +static inline void set_buddy_order(struct page *page, unsigned int order) |
---|
799 | 868 | { |
---|
800 | 869 | set_page_private(page, order); |
---|
801 | 870 | __SetPageBuddy(page); |
---|
802 | | -} |
---|
803 | | - |
---|
804 | | -static inline void rmv_page_order(struct page *page) |
---|
805 | | -{ |
---|
806 | | - __ClearPageBuddy(page); |
---|
807 | | - set_page_private(page, 0); |
---|
808 | 871 | } |
---|
809 | 872 | |
---|
810 | 873 | /* |
---|
.. | .. |
---|
820 | 883 | * |
---|
821 | 884 | * For recording page's order, we use page_private(page). |
---|
822 | 885 | */ |
---|
823 | | -static inline int page_is_buddy(struct page *page, struct page *buddy, |
---|
| 886 | +static inline bool page_is_buddy(struct page *page, struct page *buddy, |
---|
824 | 887 | unsigned int order) |
---|
825 | 888 | { |
---|
826 | | - if (page_is_guard(buddy) && page_order(buddy) == order) { |
---|
827 | | - if (page_zone_id(page) != page_zone_id(buddy)) |
---|
828 | | - return 0; |
---|
| 889 | + if (!page_is_guard(buddy) && !PageBuddy(buddy)) |
---|
| 890 | + return false; |
---|
829 | 891 | |
---|
830 | | - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
| 892 | + if (buddy_order(buddy) != order) |
---|
| 893 | + return false; |
---|
831 | 894 | |
---|
832 | | - return 1; |
---|
833 | | - } |
---|
| 895 | + /* |
---|
| 896 | + * zone check is done late to avoid uselessly calculating |
---|
| 897 | + * zone/node ids for pages that could never merge. |
---|
| 898 | + */ |
---|
| 899 | + if (page_zone_id(page) != page_zone_id(buddy)) |
---|
| 900 | + return false; |
---|
834 | 901 | |
---|
835 | | - if (PageBuddy(buddy) && page_order(buddy) == order) { |
---|
836 | | - /* |
---|
837 | | - * zone check is done late to avoid uselessly |
---|
838 | | - * calculating zone/node ids for pages that could |
---|
839 | | - * never merge. |
---|
840 | | - */ |
---|
841 | | - if (page_zone_id(page) != page_zone_id(buddy)) |
---|
842 | | - return 0; |
---|
| 902 | + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
843 | 903 | |
---|
844 | | - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
---|
| 904 | + return true; |
---|
| 905 | +} |
---|
845 | 906 | |
---|
846 | | - return 1; |
---|
847 | | - } |
---|
848 | | - return 0; |
---|
| 907 | +#ifdef CONFIG_COMPACTION |
---|
| 908 | +static inline struct capture_control *task_capc(struct zone *zone) |
---|
| 909 | +{ |
---|
| 910 | + struct capture_control *capc = current->capture_control; |
---|
| 911 | + |
---|
| 912 | + return unlikely(capc) && |
---|
| 913 | + !(current->flags & PF_KTHREAD) && |
---|
| 914 | + !capc->page && |
---|
| 915 | + capc->cc->zone == zone ? capc : NULL; |
---|
| 916 | +} |
---|
| 917 | + |
---|
| 918 | +static inline bool |
---|
| 919 | +compaction_capture(struct capture_control *capc, struct page *page, |
---|
| 920 | + int order, int migratetype) |
---|
| 921 | +{ |
---|
| 922 | + if (!capc || order != capc->cc->order) |
---|
| 923 | + return false; |
---|
| 924 | + |
---|
| 925 | + /* Do not accidentally pollute CMA or isolated regions */ |
---|
| 926 | + if (is_migrate_cma(migratetype) || |
---|
| 927 | + is_migrate_isolate(migratetype)) |
---|
| 928 | + return false; |
---|
| 929 | + |
---|
| 930 | + /* |
---|
| 931 | + * Do not let lower order allocations pollute a movable pageblock. |
---|
| 932 | + * This might let an unmovable request use a reclaimable pageblock |
---|
| 933 | + * and vice-versa but no more than normal fallback logic which can |
---|
| 934 | + * have trouble finding a high-order free page. |
---|
| 935 | + */ |
---|
| 936 | + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) |
---|
| 937 | + return false; |
---|
| 938 | + |
---|
| 939 | + capc->page = page; |
---|
| 940 | + return true; |
---|
| 941 | +} |
---|
| 942 | + |
---|
| 943 | +#else |
---|
| 944 | +static inline struct capture_control *task_capc(struct zone *zone) |
---|
| 945 | +{ |
---|
| 946 | + return NULL; |
---|
| 947 | +} |
---|
| 948 | + |
---|
| 949 | +static inline bool |
---|
| 950 | +compaction_capture(struct capture_control *capc, struct page *page, |
---|
| 951 | + int order, int migratetype) |
---|
| 952 | +{ |
---|
| 953 | + return false; |
---|
| 954 | +} |
---|
| 955 | +#endif /* CONFIG_COMPACTION */ |
---|
| 956 | + |
---|
| 957 | +/* Used for pages not on another list */ |
---|
| 958 | +static inline void add_to_free_list(struct page *page, struct zone *zone, |
---|
| 959 | + unsigned int order, int migratetype) |
---|
| 960 | +{ |
---|
| 961 | + struct free_area *area = &zone->free_area[order]; |
---|
| 962 | + |
---|
| 963 | + list_add(&page->lru, &area->free_list[migratetype]); |
---|
| 964 | + area->nr_free++; |
---|
| 965 | +} |
---|
| 966 | + |
---|
| 967 | +/* Used for pages not on another list */ |
---|
| 968 | +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, |
---|
| 969 | + unsigned int order, int migratetype) |
---|
| 970 | +{ |
---|
| 971 | + struct free_area *area = &zone->free_area[order]; |
---|
| 972 | + |
---|
| 973 | + list_add_tail(&page->lru, &area->free_list[migratetype]); |
---|
| 974 | + area->nr_free++; |
---|
| 975 | +} |
---|
| 976 | + |
---|
| 977 | +/* |
---|
| 978 | + * Used for pages which are on another list. Move the pages to the tail |
---|
| 979 | + * of the list - so the moved pages won't immediately be considered for |
---|
| 980 | + * allocation again (e.g., optimization for memory onlining). |
---|
| 981 | + */ |
---|
| 982 | +static inline void move_to_free_list(struct page *page, struct zone *zone, |
---|
| 983 | + unsigned int order, int migratetype) |
---|
| 984 | +{ |
---|
| 985 | + struct free_area *area = &zone->free_area[order]; |
---|
| 986 | + |
---|
| 987 | + list_move_tail(&page->lru, &area->free_list[migratetype]); |
---|
| 988 | +} |
---|
| 989 | + |
---|
| 990 | +static inline void del_page_from_free_list(struct page *page, struct zone *zone, |
---|
| 991 | + unsigned int order) |
---|
| 992 | +{ |
---|
| 993 | + /* clear reported state and update reported page count */ |
---|
| 994 | + if (page_reported(page)) |
---|
| 995 | + __ClearPageReported(page); |
---|
| 996 | + |
---|
| 997 | + list_del(&page->lru); |
---|
| 998 | + __ClearPageBuddy(page); |
---|
| 999 | + set_page_private(page, 0); |
---|
| 1000 | + zone->free_area[order].nr_free--; |
---|
| 1001 | +} |
---|
| 1002 | + |
---|
| 1003 | +/* |
---|
| 1004 | + * If this is not the largest possible page, check if the buddy |
---|
| 1005 | + * of the next-highest order is free. If it is, it's possible |
---|
| 1006 | + * that pages are being freed that will coalesce soon. In case, |
---|
| 1007 | + * that is happening, add the free page to the tail of the list |
---|
| 1008 | + * so it's less likely to be used soon and more likely to be merged |
---|
| 1009 | + * as a higher order page |
---|
| 1010 | + */ |
---|
| 1011 | +static inline bool |
---|
| 1012 | +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, |
---|
| 1013 | + struct page *page, unsigned int order) |
---|
| 1014 | +{ |
---|
| 1015 | + struct page *higher_page, *higher_buddy; |
---|
| 1016 | + unsigned long combined_pfn; |
---|
| 1017 | + |
---|
| 1018 | + if (order >= MAX_ORDER - 2) |
---|
| 1019 | + return false; |
---|
| 1020 | + |
---|
| 1021 | + if (!pfn_valid_within(buddy_pfn)) |
---|
| 1022 | + return false; |
---|
| 1023 | + |
---|
| 1024 | + combined_pfn = buddy_pfn & pfn; |
---|
| 1025 | + higher_page = page + (combined_pfn - pfn); |
---|
| 1026 | + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); |
---|
| 1027 | + higher_buddy = higher_page + (buddy_pfn - combined_pfn); |
---|
| 1028 | + |
---|
| 1029 | + return pfn_valid_within(buddy_pfn) && |
---|
| 1030 | + page_is_buddy(higher_page, higher_buddy, order + 1); |
---|
849 | 1031 | } |
---|
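buddy_merge_likely() and page_is_buddy() lean on __find_buddy_pfn(), which is not shown in this hunk; in the kernel it is an XOR of bit 'order', so the buddy of a 2^order block differs from it only in that bit. A standalone sketch of the arithmetic (the pfn value is made up):

```c
#include <stdio.h>

/* mirrors __find_buddy_pfn(): the buddy of a 2^order block differs only in bit 'order' */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234c0;	/* hypothetical, order-6-aligned pfn */
	unsigned int order = 6;

	unsigned long buddy = find_buddy_pfn(pfn, order);
	unsigned long combined = buddy & pfn;	/* start of the merged 2^(order+1) block */

	printf("pfn=%#lx buddy=%#lx combined=%#lx\n", pfn, buddy, combined);
	return 0;
}
```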
850 | 1032 | |
---|
851 | 1033 | /* |
---|
.. | .. |
---|
875 | 1057 | static inline void __free_one_page(struct page *page, |
---|
876 | 1058 | unsigned long pfn, |
---|
877 | 1059 | struct zone *zone, unsigned int order, |
---|
878 | | - int migratetype) |
---|
| 1060 | + int migratetype, fpi_t fpi_flags) |
---|
879 | 1061 | { |
---|
| 1062 | + struct capture_control *capc = task_capc(zone); |
---|
| 1063 | + unsigned long buddy_pfn; |
---|
880 | 1064 | unsigned long combined_pfn; |
---|
881 | | - unsigned long uninitialized_var(buddy_pfn); |
---|
882 | | - struct page *buddy; |
---|
883 | 1065 | unsigned int max_order; |
---|
| 1066 | + struct page *buddy; |
---|
| 1067 | + bool to_tail; |
---|
884 | 1068 | |
---|
885 | 1069 | max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); |
---|
886 | 1070 | |
---|
.. | .. |
---|
896 | 1080 | |
---|
897 | 1081 | continue_merging: |
---|
898 | 1082 | while (order < max_order) { |
---|
| 1083 | + if (compaction_capture(capc, page, order, migratetype)) { |
---|
| 1084 | + __mod_zone_freepage_state(zone, -(1 << order), |
---|
| 1085 | + migratetype); |
---|
| 1086 | + return; |
---|
| 1087 | + } |
---|
899 | 1088 | buddy_pfn = __find_buddy_pfn(pfn, order); |
---|
900 | 1089 | buddy = page + (buddy_pfn - pfn); |
---|
901 | 1090 | |
---|
.. | .. |
---|
907 | 1096 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
---|
908 | 1097 | * merge with it and move up one order. |
---|
909 | 1098 | */ |
---|
910 | | - if (page_is_guard(buddy)) { |
---|
| 1099 | + if (page_is_guard(buddy)) |
---|
911 | 1100 | clear_page_guard(zone, buddy, order, migratetype); |
---|
912 | | - } else { |
---|
913 | | - list_del(&buddy->lru); |
---|
914 | | - zone->free_area[order].nr_free--; |
---|
915 | | - rmv_page_order(buddy); |
---|
916 | | - } |
---|
| 1101 | + else |
---|
| 1102 | + del_page_from_free_list(buddy, zone, order); |
---|
917 | 1103 | combined_pfn = buddy_pfn & pfn; |
---|
918 | 1104 | page = page + (combined_pfn - pfn); |
---|
919 | 1105 | pfn = combined_pfn; |
---|
.. | .. |
---|
945 | 1131 | } |
---|
946 | 1132 | |
---|
947 | 1133 | done_merging: |
---|
948 | | - set_page_order(page, order); |
---|
| 1134 | + set_buddy_order(page, order); |
---|
949 | 1135 | |
---|
950 | | - /* |
---|
951 | | - * If this is not the largest possible page, check if the buddy |
---|
952 | | - * of the next-highest order is free. If it is, it's possible |
---|
953 | | - * that pages are being freed that will coalesce soon. In case, |
---|
954 | | - * that is happening, add the free page to the tail of the list |
---|
955 | | - * so it's less likely to be used soon and more likely to be merged |
---|
956 | | - * as a higher order page |
---|
957 | | - */ |
---|
958 | | - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { |
---|
959 | | - struct page *higher_page, *higher_buddy; |
---|
960 | | - combined_pfn = buddy_pfn & pfn; |
---|
961 | | - higher_page = page + (combined_pfn - pfn); |
---|
962 | | - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); |
---|
963 | | - higher_buddy = higher_page + (buddy_pfn - combined_pfn); |
---|
964 | | - if (pfn_valid_within(buddy_pfn) && |
---|
965 | | - page_is_buddy(higher_page, higher_buddy, order + 1)) { |
---|
966 | | - list_add_tail(&page->lru, |
---|
967 | | - &zone->free_area[order].free_list[migratetype]); |
---|
968 | | - goto out; |
---|
969 | | - } |
---|
970 | | - } |
---|
| 1136 | + if (fpi_flags & FPI_TO_TAIL) |
---|
| 1137 | + to_tail = true; |
---|
| 1138 | + else if (is_shuffle_order(order)) |
---|
| 1139 | + to_tail = shuffle_pick_tail(); |
---|
| 1140 | + else |
---|
| 1141 | + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); |
---|
971 | 1142 | |
---|
972 | | - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); |
---|
973 | | -out: |
---|
974 | | - zone->free_area[order].nr_free++; |
---|
| 1143 | + if (to_tail) |
---|
| 1144 | + add_to_free_list_tail(page, zone, order, migratetype); |
---|
| 1145 | + else |
---|
| 1146 | + add_to_free_list(page, zone, order, migratetype); |
---|
| 1147 | + |
---|
| 1148 | + /* Notify page reporting subsystem of freed page */ |
---|
| 1149 | + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) |
---|
| 1150 | + page_reporting_notify_free(order); |
---|
975 | 1151 | } |
---|
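The tail of __free_one_page() now picks head vs. tail placement in a fixed order: an explicit FPI_TO_TAIL wins, then shuffle_pick_tail() for shuffle-eligible orders, then the buddy_merge_likely() heuristic. A toy userspace freelist illustrating the effect of that choice (a plain singly linked list, not the kernel's struct free_area):

```c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* toy freelist: head insertion = "reuse soon", tail insertion = "stay cold" */
struct node { unsigned long pfn; struct node *next; };
static struct node *head;

static void add_to_free_list(unsigned long pfn, bool to_tail)
{
	struct node *n = malloc(sizeof(*n));

	n->pfn = pfn;
	n->next = NULL;
	if (!head || !to_tail) {		/* empty list or head insertion */
		n->next = head;
		head = n;
		return;
	}
	struct node *it = head;			/* walk to the tail */
	while (it->next)
		it = it->next;
	it->next = n;
}

int main(void)
{
	/* decision order mirrors the end of __free_one_page():
	 * FPI_TO_TAIL, then shuffling, then the merge heuristic */
	bool fpi_to_tail = false, shuffle_order = false, merge_likely = true;
	bool to_tail = fpi_to_tail ? true :
		       (shuffle_order ? (rand() & 1) : merge_likely);

	add_to_free_list(0x100, false);
	add_to_free_list(0x200, to_tail);	/* lands at the tail: merge expected soon */

	for (struct node *it = head; it; it = it->next)
		printf("%#lx\n", it->pfn);
	return 0;
}
```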
976 | 1152 | |
---|
977 | 1153 | /* |
---|
.. | .. |
---|
996 | 1172 | return true; |
---|
997 | 1173 | } |
---|
998 | 1174 | |
---|
999 | | -static void free_pages_check_bad(struct page *page) |
---|
| 1175 | +static const char *page_bad_reason(struct page *page, unsigned long flags) |
---|
1000 | 1176 | { |
---|
1001 | | - const char *bad_reason; |
---|
1002 | | - unsigned long bad_flags; |
---|
1003 | | - |
---|
1004 | | - bad_reason = NULL; |
---|
1005 | | - bad_flags = 0; |
---|
| 1177 | + const char *bad_reason = NULL; |
---|
1006 | 1178 | |
---|
1007 | 1179 | if (unlikely(atomic_read(&page->_mapcount) != -1)) |
---|
1008 | 1180 | bad_reason = "nonzero mapcount"; |
---|
.. | .. |
---|
1010 | 1182 | bad_reason = "non-NULL mapping"; |
---|
1011 | 1183 | if (unlikely(page_ref_count(page) != 0)) |
---|
1012 | 1184 | bad_reason = "nonzero _refcount"; |
---|
1013 | | - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { |
---|
1014 | | - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
---|
1015 | | - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; |
---|
| 1185 | + if (unlikely(page->flags & flags)) { |
---|
| 1186 | + if (flags == PAGE_FLAGS_CHECK_AT_PREP) |
---|
| 1187 | + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; |
---|
| 1188 | + else |
---|
| 1189 | + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
---|
1016 | 1190 | } |
---|
1017 | 1191 | #ifdef CONFIG_MEMCG |
---|
1018 | 1192 | if (unlikely(page->mem_cgroup)) |
---|
1019 | 1193 | bad_reason = "page still charged to cgroup"; |
---|
1020 | 1194 | #endif |
---|
1021 | | - bad_page(page, bad_reason, bad_flags); |
---|
| 1195 | + return bad_reason; |
---|
1022 | 1196 | } |
---|
1023 | 1197 | |
---|
1024 | | -static inline int free_pages_check(struct page *page) |
---|
| 1198 | +static void check_free_page_bad(struct page *page) |
---|
| 1199 | +{ |
---|
| 1200 | + bad_page(page, |
---|
| 1201 | + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); |
---|
| 1202 | +} |
---|
| 1203 | + |
---|
| 1204 | +static inline int check_free_page(struct page *page) |
---|
1025 | 1205 | { |
---|
1026 | 1206 | if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) |
---|
1027 | 1207 | return 0; |
---|
1028 | 1208 | |
---|
1029 | 1209 | /* Something has gone sideways, find it */ |
---|
1030 | | - free_pages_check_bad(page); |
---|
| 1210 | + check_free_page_bad(page); |
---|
1031 | 1211 | return 1; |
---|
1032 | 1212 | } |
---|
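For context, the refactor above splits "find the reason" from "report it": page_bad_reason() returns a string and the callers decide whether to feed it to bad_page(). A hedged userspace sketch of that shape, with a made-up struct page holding only the fields being checked:

```c
#include <stdio.h>

/* made-up, minimal stand-in for the fields page_bad_reason() looks at */
struct page { int mapcount; int refcount; void *mapping; unsigned long flags; };

/* return a reason string (or NULL) instead of printing inside the check */
static const char *page_bad_reason(const struct page *page, unsigned long check_flags)
{
	if (page->mapcount != -1)
		return "nonzero mapcount";
	if (page->mapping)
		return "non-NULL mapping";
	if (page->refcount != 0)
		return "nonzero _refcount";
	if (page->flags & check_flags)
		return "unexpected flag(s) set";
	return NULL;
}

int main(void)
{
	struct page ok = { .mapcount = -1 };
	struct page bad = { .mapcount = 3 };
	const char *r1 = page_bad_reason(&ok, ~0UL);
	const char *r2 = page_bad_reason(&bad, ~0UL);

	printf("ok  -> %s\n", r1 ? r1 : "clean");
	printf("bad -> %s\n", r2 ? r2 : "clean");
	return 0;
}
```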
1033 | 1213 | |
---|
.. | .. |
---|
1049 | 1229 | case 1: |
---|
1050 | 1230 | /* the first tail page: ->mapping may be compound_mapcount() */ |
---|
1051 | 1231 | if (unlikely(compound_mapcount(page))) { |
---|
1052 | | - bad_page(page, "nonzero compound_mapcount", 0); |
---|
| 1232 | + bad_page(page, "nonzero compound_mapcount"); |
---|
1053 | 1233 | goto out; |
---|
1054 | 1234 | } |
---|
1055 | 1235 | break; |
---|
.. | .. |
---|
1061 | 1241 | break; |
---|
1062 | 1242 | default: |
---|
1063 | 1243 | if (page->mapping != TAIL_MAPPING) { |
---|
1064 | | - bad_page(page, "corrupted mapping in tail page", 0); |
---|
| 1244 | + bad_page(page, "corrupted mapping in tail page"); |
---|
1065 | 1245 | goto out; |
---|
1066 | 1246 | } |
---|
1067 | 1247 | break; |
---|
1068 | 1248 | } |
---|
1069 | 1249 | if (unlikely(!PageTail(page))) { |
---|
1070 | | - bad_page(page, "PageTail not set", 0); |
---|
| 1250 | + bad_page(page, "PageTail not set"); |
---|
1071 | 1251 | goto out; |
---|
1072 | 1252 | } |
---|
1073 | 1253 | if (unlikely(compound_head(page) != head_page)) { |
---|
1074 | | - bad_page(page, "compound_head not consistent", 0); |
---|
| 1254 | + bad_page(page, "compound_head not consistent"); |
---|
1075 | 1255 | goto out; |
---|
1076 | 1256 | } |
---|
1077 | 1257 | ret = 0; |
---|
.. | .. |
---|
1081 | 1261 | return ret; |
---|
1082 | 1262 | } |
---|
1083 | 1263 | |
---|
1084 | | -static void kernel_init_free_pages(struct page *page, int numpages) |
---|
| 1264 | +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) |
---|
1085 | 1265 | { |
---|
1086 | 1266 | int i; |
---|
1087 | 1267 | |
---|
| 1268 | + if (zero_tags) { |
---|
| 1269 | + for (i = 0; i < numpages; i++) |
---|
| 1270 | + tag_clear_highpage(page + i); |
---|
| 1271 | + return; |
---|
| 1272 | + } |
---|
| 1273 | + |
---|
1088 | 1274 | /* s390's use of memset() could override KASAN redzones. */ |
---|
1089 | 1275 | kasan_disable_current(); |
---|
1090 | | - for (i = 0; i < numpages; i++) |
---|
| 1276 | + for (i = 0; i < numpages; i++) { |
---|
| 1277 | + u8 tag = page_kasan_tag(page + i); |
---|
| 1278 | + page_kasan_tag_reset(page + i); |
---|
1091 | 1279 | clear_highpage(page + i); |
---|
| 1280 | + page_kasan_tag_set(page + i, tag); |
---|
| 1281 | + } |
---|
1092 | 1282 | kasan_enable_current(); |
---|
1093 | 1283 | } |
---|
1094 | 1284 | |
---|
1095 | 1285 | static __always_inline bool free_pages_prepare(struct page *page, |
---|
1096 | | - unsigned int order, bool check_free) |
---|
| 1286 | + unsigned int order, bool check_free, fpi_t fpi_flags) |
---|
1097 | 1287 | { |
---|
1098 | 1288 | int bad = 0; |
---|
| 1289 | + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); |
---|
1099 | 1290 | |
---|
1100 | 1291 | VM_BUG_ON_PAGE(PageTail(page), page); |
---|
1101 | 1292 | |
---|
1102 | 1293 | trace_mm_page_free(page, order); |
---|
| 1294 | + |
---|
| 1295 | + if (unlikely(PageHWPoison(page)) && !order) { |
---|
| 1296 | + /* |
---|
| 1297 | + * Do not let hwpoison pages hit pcplists/buddy |
---|
| 1298 | + * Untie memcg state and reset page's owner |
---|
| 1299 | + */ |
---|
| 1300 | + if (memcg_kmem_enabled() && PageKmemcg(page)) |
---|
| 1301 | + __memcg_kmem_uncharge_page(page, order); |
---|
| 1302 | + reset_page_owner(page, order); |
---|
| 1303 | + free_page_pinner(page, order); |
---|
| 1304 | + return false; |
---|
| 1305 | + } |
---|
1103 | 1306 | |
---|
1104 | 1307 | /* |
---|
1105 | 1308 | * Check tail pages before head page information is cleared to |
---|
.. | .. |
---|
1116 | 1319 | for (i = 1; i < (1 << order); i++) { |
---|
1117 | 1320 | if (compound) |
---|
1118 | 1321 | bad += free_tail_pages_check(page, page + i); |
---|
1119 | | - if (unlikely(free_pages_check(page + i))) { |
---|
| 1322 | + if (unlikely(check_free_page(page + i))) { |
---|
1120 | 1323 | bad++; |
---|
1121 | 1324 | continue; |
---|
1122 | 1325 | } |
---|
.. | .. |
---|
1126 | 1329 | if (PageMappingFlags(page)) |
---|
1127 | 1330 | page->mapping = NULL; |
---|
1128 | 1331 | if (memcg_kmem_enabled() && PageKmemcg(page)) |
---|
1129 | | - memcg_kmem_uncharge(page, order); |
---|
| 1332 | + __memcg_kmem_uncharge_page(page, order); |
---|
1130 | 1333 | if (check_free) |
---|
1131 | | - bad += free_pages_check(page); |
---|
| 1334 | + bad += check_free_page(page); |
---|
1132 | 1335 | if (bad) |
---|
1133 | 1336 | return false; |
---|
1134 | 1337 | |
---|
1135 | 1338 | page_cpupid_reset_last(page); |
---|
1136 | 1339 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
---|
1137 | 1340 | reset_page_owner(page, order); |
---|
| 1341 | + free_page_pinner(page, order); |
---|
1138 | 1342 | |
---|
1139 | 1343 | if (!PageHighMem(page)) { |
---|
1140 | 1344 | debug_check_no_locks_freed(page_address(page), |
---|
.. | .. |
---|
1142 | 1346 | debug_check_no_obj_freed(page_address(page), |
---|
1143 | 1347 | PAGE_SIZE << order); |
---|
1144 | 1348 | } |
---|
1145 | | - arch_free_page(page, order); |
---|
1146 | | - if (want_init_on_free()) |
---|
1147 | | - kernel_init_free_pages(page, 1 << order); |
---|
1148 | 1349 | |
---|
1149 | | - kernel_poison_pages(page, 1 << order, 0); |
---|
1150 | | - kernel_map_pages(page, 1 << order, 0); |
---|
1151 | | - kasan_free_nondeferred_pages(page, order); |
---|
| 1350 | + kernel_poison_pages(page, 1 << order); |
---|
| 1351 | + |
---|
| 1352 | + /* |
---|
| 1353 | + * As memory initialization might be integrated into KASAN, |
---|
| 1354 | + * kasan_free_pages and kernel_init_free_pages must be |
---|
| 1355 | + * kept together to avoid discrepancies in behavior. |
---|
| 1356 | + * |
---|
| 1357 | + * With hardware tag-based KASAN, memory tags must be set before the |
---|
| 1358 | + * page becomes unavailable via debug_pagealloc or arch_free_page. |
---|
| 1359 | + */ |
---|
| 1360 | + if (kasan_has_integrated_init()) { |
---|
| 1361 | + if (!skip_kasan_poison) |
---|
| 1362 | + kasan_free_pages(page, order); |
---|
| 1363 | + } else { |
---|
| 1364 | + bool init = want_init_on_free(); |
---|
| 1365 | + |
---|
| 1366 | + if (init) |
---|
| 1367 | + kernel_init_free_pages(page, 1 << order, false); |
---|
| 1368 | + if (!skip_kasan_poison) |
---|
| 1369 | + kasan_poison_pages(page, order, init); |
---|
| 1370 | + } |
---|
| 1371 | + |
---|
| 1372 | + /* |
---|
| 1373 | + * arch_free_page() can make the page's contents inaccessible. s390 |
---|
| 1374 | + * does this. So nothing which can access the page's contents should |
---|
| 1375 | + * happen after this. |
---|
| 1376 | + */ |
---|
| 1377 | + arch_free_page(page, order); |
---|
| 1378 | + |
---|
| 1379 | + debug_pagealloc_unmap_pages(page, 1 << order); |
---|
1152 | 1380 | |
---|
1153 | 1381 | return true; |
---|
1154 | 1382 | } |
---|
1155 | 1383 | |
---|
1156 | 1384 | #ifdef CONFIG_DEBUG_VM |
---|
1157 | | -static inline bool free_pcp_prepare(struct page *page) |
---|
1158 | | -{ |
---|
1159 | | - return free_pages_prepare(page, 0, true); |
---|
1160 | | -} |
---|
1161 | | - |
---|
1162 | | -static inline bool bulkfree_pcp_prepare(struct page *page) |
---|
1163 | | -{ |
---|
1164 | | - return false; |
---|
1165 | | -} |
---|
1166 | | -#else |
---|
| 1385 | +/* |
---|
| 1386 | + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed |
---|
| 1387 | + * to pcp lists. With debug_pagealloc also enabled, they are rechecked when |
---|
| 1388 | + * moved from pcp lists to free lists. |
---|
| 1389 | + */ |
---|
1167 | 1390 | static bool free_pcp_prepare(struct page *page) |
---|
1168 | 1391 | { |
---|
1169 | | - return free_pages_prepare(page, 0, false); |
---|
| 1392 | + return free_pages_prepare(page, 0, true, FPI_NONE); |
---|
1170 | 1393 | } |
---|
1171 | 1394 | |
---|
1172 | 1395 | static bool bulkfree_pcp_prepare(struct page *page) |
---|
1173 | 1396 | { |
---|
1174 | | - return free_pages_check(page); |
---|
| 1397 | + if (debug_pagealloc_enabled_static()) |
---|
| 1398 | + return check_free_page(page); |
---|
| 1399 | + else |
---|
| 1400 | + return false; |
---|
| 1401 | +} |
---|
| 1402 | +#else |
---|
| 1403 | +/* |
---|
| 1404 | + * With DEBUG_VM disabled, order-0 pages being freed are checked only when |
---|
| 1405 | + * moving from pcp lists to free list in order to reduce overhead. With |
---|
| 1406 | + * debug_pagealloc enabled, they are checked also immediately when being freed |
---|
| 1407 | + * to the pcp lists. |
---|
| 1408 | + */ |
---|
| 1409 | +static bool free_pcp_prepare(struct page *page) |
---|
| 1410 | +{ |
---|
| 1411 | + if (debug_pagealloc_enabled_static()) |
---|
| 1412 | + return free_pages_prepare(page, 0, true, FPI_NONE); |
---|
| 1413 | + else |
---|
| 1414 | + return free_pages_prepare(page, 0, false, FPI_NONE); |
---|
| 1415 | +} |
---|
| 1416 | + |
---|
| 1417 | +static bool bulkfree_pcp_prepare(struct page *page) |
---|
| 1418 | +{ |
---|
| 1419 | + return check_free_page(page); |
---|
1175 | 1420 | } |
---|
1176 | 1421 | #endif /* CONFIG_DEBUG_VM */ |
---|
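The two comments above describe where order-0 sanity checks run depending on CONFIG_DEBUG_VM and debug_pagealloc. This small stand-alone program (an editorial sketch, not part of the patch) prints the resulting decision table; free_pcp_prepare() and bulkfree_pcp_prepare() themselves are only modelled by the two booleans.

#include <stdbool.h>
#include <stdio.h>

static void where_checked(bool debug_vm, bool debug_pagealloc)
{
	bool at_pcp_free  = debug_vm || debug_pagealloc;   /* free_pcp_prepare()     */
	bool at_pcp_drain = !debug_vm || debug_pagealloc;  /* bulkfree_pcp_prepare() */

	printf("DEBUG_VM=%d debug_pagealloc=%d -> checked on free-to-pcp: %d, on pcp-to-buddy: %d\n",
	       debug_vm, debug_pagealloc, at_pcp_free, at_pcp_drain);
}

int main(void)
{
	for (int vm = 0; vm <= 1; vm++)
		for (int dp = 0; dp <= 1; dp++)
			where_checked(vm, dp);
	return 0;
}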
1177 | 1422 | |
---|
.. | .. |
---|
1232 | 1477 | mt = get_pageblock_migratetype(page); |
---|
1233 | 1478 | |
---|
1234 | 1479 | list_del(&page->lru); |
---|
1235 | | - __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
---|
| 1480 | + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); |
---|
1236 | 1481 | trace_mm_page_pcpu_drain(page, 0, mt); |
---|
1237 | 1482 | } |
---|
1238 | 1483 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
.. | .. |
---|
1240 | 1485 | |
---|
1241 | 1486 | static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp, |
---|
1242 | 1487 | struct list_head *dst) |
---|
1243 | | - |
---|
1244 | 1488 | { |
---|
1245 | 1489 | int migratetype = 0; |
---|
1246 | 1490 | int batch_free = 0; |
---|
.. | .. |
---|
1302 | 1546 | static void free_one_page(struct zone *zone, |
---|
1303 | 1547 | struct page *page, unsigned long pfn, |
---|
1304 | 1548 | unsigned int order, |
---|
1305 | | - int migratetype) |
---|
| 1549 | + int migratetype, fpi_t fpi_flags) |
---|
1306 | 1550 | { |
---|
1307 | 1551 | spin_lock(&zone->lock); |
---|
1308 | 1552 | if (unlikely(has_isolate_pageblock(zone) || |
---|
1309 | 1553 | is_migrate_isolate(migratetype))) { |
---|
1310 | 1554 | migratetype = get_pfnblock_migratetype(page, pfn); |
---|
1311 | 1555 | } |
---|
1312 | | - __free_one_page(page, pfn, zone, order, migratetype); |
---|
| 1556 | + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); |
---|
1313 | 1557 | spin_unlock(&zone->lock); |
---|
1314 | 1558 | } |
---|
1315 | 1559 | |
---|
.. | .. |
---|
1383 | 1627 | /* Avoid false-positive PageTail() */ |
---|
1384 | 1628 | INIT_LIST_HEAD(&page->lru); |
---|
1385 | 1629 | |
---|
1386 | | - SetPageReserved(page); |
---|
| 1630 | + /* |
---|
| 1631 | + * no need for atomic set_bit because the struct |
---|
| 1632 | + * page is not visible yet so nobody should |
---|
| 1633 | + * access it yet. |
---|
| 1634 | + */ |
---|
| 1635 | + __SetPageReserved(page); |
---|
1387 | 1636 | } |
---|
1388 | 1637 | } |
---|
1389 | 1638 | } |
---|
1390 | 1639 | |
---|
1391 | | -static void __free_pages_ok(struct page *page, unsigned int order) |
---|
| 1640 | +static void __free_pages_ok(struct page *page, unsigned int order, |
---|
| 1641 | + fpi_t fpi_flags) |
---|
1392 | 1642 | { |
---|
1393 | 1643 | unsigned long flags; |
---|
1394 | 1644 | int migratetype; |
---|
1395 | 1645 | unsigned long pfn = page_to_pfn(page); |
---|
1396 | 1646 | |
---|
1397 | | - if (!free_pages_prepare(page, order, true)) |
---|
| 1647 | + if (!free_pages_prepare(page, order, true, fpi_flags)) |
---|
1398 | 1648 | return; |
---|
1399 | 1649 | |
---|
1400 | 1650 | migratetype = get_pfnblock_migratetype(page, pfn); |
---|
1401 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 1651 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
1402 | 1652 | __count_vm_events(PGFREE, 1 << order); |
---|
1403 | | - free_one_page(page_zone(page), page, pfn, order, migratetype); |
---|
1404 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 1653 | + free_one_page(page_zone(page), page, pfn, order, migratetype, |
---|
| 1654 | + fpi_flags); |
---|
| 1655 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
1405 | 1656 | } |
---|
1406 | 1657 | |
---|
1407 | | -static void __init __free_pages_boot_core(struct page *page, unsigned int order) |
---|
| 1658 | +void __free_pages_core(struct page *page, unsigned int order) |
---|
1408 | 1659 | { |
---|
1409 | 1660 | unsigned int nr_pages = 1 << order; |
---|
1410 | 1661 | struct page *p = page; |
---|
1411 | 1662 | unsigned int loop; |
---|
1412 | 1663 | |
---|
| 1664 | + /* |
---|
| 1665 | + * When initializing the memmap, __init_single_page() sets the refcount |
---|
| 1666 | + * of all pages to 1 ("allocated"/"not free"). We have to set the |
---|
| 1667 | + * refcount of all involved pages to 0. |
---|
| 1668 | + */ |
---|
1413 | 1669 | prefetchw(p); |
---|
1414 | 1670 | for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
---|
1415 | 1671 | prefetchw(p + 1); |
---|
.. | .. |
---|
1419 | 1675 | __ClearPageReserved(p); |
---|
1420 | 1676 | set_page_count(p, 0); |
---|
1421 | 1677 | |
---|
1422 | | - page_zone(page)->managed_pages += nr_pages; |
---|
1423 | | - set_page_refcounted(page); |
---|
1424 | | - __free_pages(page, order); |
---|
| 1678 | + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
---|
| 1679 | + |
---|
| 1680 | + /* |
---|
| 1681 | + * Bypass PCP and place fresh pages right to the tail, primarily |
---|
| 1682 | + * relevant for memory onlining. |
---|
| 1683 | + */ |
---|
| 1684 | + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); |
---|
1425 | 1685 | } |
---|
1426 | 1686 | |
---|
1427 | | -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ |
---|
1428 | | - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) |
---|
| 1687 | +#ifdef CONFIG_NEED_MULTIPLE_NODES |
---|
1429 | 1688 | |
---|
1430 | 1689 | static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; |
---|
| 1690 | + |
---|
| 1691 | +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
---|
| 1692 | + |
---|
| 1693 | +/* |
---|
| 1694 | + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
---|
| 1695 | + */ |
---|
| 1696 | +int __meminit __early_pfn_to_nid(unsigned long pfn, |
---|
| 1697 | + struct mminit_pfnnid_cache *state) |
---|
| 1698 | +{ |
---|
| 1699 | + unsigned long start_pfn, end_pfn; |
---|
| 1700 | + int nid; |
---|
| 1701 | + |
---|
| 1702 | + if (state->last_start <= pfn && pfn < state->last_end) |
---|
| 1703 | + return state->last_nid; |
---|
| 1704 | + |
---|
| 1705 | + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
---|
| 1706 | + if (nid != NUMA_NO_NODE) { |
---|
| 1707 | + state->last_start = start_pfn; |
---|
| 1708 | + state->last_end = end_pfn; |
---|
| 1709 | + state->last_nid = nid; |
---|
| 1710 | + } |
---|
| 1711 | + |
---|
| 1712 | + return nid; |
---|
| 1713 | +} |
---|
| 1714 | +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
---|
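__early_pfn_to_nid() above is a one-entry cache in front of a memblock range search. The following user-space sketch shows the caching idea; the static range table is a made-up stand-in for memblock_search_pfn_nid().

#include <stdio.h>

#define NUMA_NO_NODE (-1)

struct range { unsigned long start, end; int nid; };

static const struct range ranges[] = {
	{ 0x00000, 0x40000, 0 },
	{ 0x40000, 0x80000, 1 },
};

static struct { unsigned long last_start, last_end; int last_nid; } cache;

static int early_pfn_to_nid_model(unsigned long pfn)
{
	if (cache.last_start <= pfn && pfn < cache.last_end)
		return cache.last_nid;               /* cache hit: O(1) */

	for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		if (ranges[i].start <= pfn && pfn < ranges[i].end) {
			cache.last_start = ranges[i].start;
			cache.last_end   = ranges[i].end;
			cache.last_nid   = ranges[i].nid;
			return cache.last_nid;
		}
	}
	return NUMA_NO_NODE;
}

int main(void)
{
	printf("pfn 0x100   -> nid %d\n", early_pfn_to_nid_model(0x100));
	printf("pfn 0x200   -> nid %d (served from cache)\n", early_pfn_to_nid_model(0x200));
	printf("pfn 0x50000 -> nid %d\n", early_pfn_to_nid_model(0x50000));
	return 0;
}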
1431 | 1715 | |
---|
1432 | 1716 | int __meminit early_pfn_to_nid(unsigned long pfn) |
---|
1433 | 1717 | { |
---|
.. | .. |
---|
1442 | 1726 | |
---|
1443 | 1727 | return nid; |
---|
1444 | 1728 | } |
---|
1445 | | -#endif |
---|
| 1729 | +#endif /* CONFIG_NEED_MULTIPLE_NODES */ |
---|
1446 | 1730 | |
---|
1447 | | -#ifdef CONFIG_NODES_SPAN_OTHER_NODES |
---|
1448 | | -static inline bool __meminit __maybe_unused |
---|
1449 | | -meminit_pfn_in_nid(unsigned long pfn, int node, |
---|
1450 | | - struct mminit_pfnnid_cache *state) |
---|
1451 | | -{ |
---|
1452 | | - int nid; |
---|
1453 | | - |
---|
1454 | | - nid = __early_pfn_to_nid(pfn, state); |
---|
1455 | | - if (nid >= 0 && nid != node) |
---|
1456 | | - return false; |
---|
1457 | | - return true; |
---|
1458 | | -} |
---|
1459 | | - |
---|
1460 | | -/* Only safe to use early in boot when initialisation is single-threaded */ |
---|
1461 | | -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
---|
1462 | | -{ |
---|
1463 | | - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); |
---|
1464 | | -} |
---|
1465 | | - |
---|
1466 | | -#else |
---|
1467 | | - |
---|
1468 | | -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
---|
1469 | | -{ |
---|
1470 | | - return true; |
---|
1471 | | -} |
---|
1472 | | -static inline bool __meminit __maybe_unused |
---|
1473 | | -meminit_pfn_in_nid(unsigned long pfn, int node, |
---|
1474 | | - struct mminit_pfnnid_cache *state) |
---|
1475 | | -{ |
---|
1476 | | - return true; |
---|
1477 | | -} |
---|
1478 | | -#endif |
---|
1479 | | - |
---|
1480 | | - |
---|
1481 | | -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, |
---|
| 1731 | +void __init memblock_free_pages(struct page *page, unsigned long pfn, |
---|
1482 | 1732 | unsigned int order) |
---|
1483 | 1733 | { |
---|
1484 | 1734 | if (early_page_uninitialised(pfn)) |
---|
1485 | 1735 | return; |
---|
1486 | | - return __free_pages_boot_core(page, order); |
---|
| 1736 | + __free_pages_core(page, order); |
---|
1487 | 1737 | } |
---|
1488 | 1738 | |
---|
1489 | 1739 | /* |
---|
.. | .. |
---|
1574 | 1824 | if (nr_pages == pageblock_nr_pages && |
---|
1575 | 1825 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
---|
1576 | 1826 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
1577 | | - __free_pages_boot_core(page, pageblock_order); |
---|
| 1827 | + __free_pages_core(page, pageblock_order); |
---|
1578 | 1828 | return; |
---|
1579 | 1829 | } |
---|
1580 | 1830 | |
---|
1581 | 1831 | for (i = 0; i < nr_pages; i++, page++, pfn++) { |
---|
1582 | 1832 | if ((pfn & (pageblock_nr_pages - 1)) == 0) |
---|
1583 | 1833 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
1584 | | - __free_pages_boot_core(page, 0); |
---|
| 1834 | + __free_pages_core(page, 0); |
---|
1585 | 1835 | } |
---|
1586 | 1836 | } |
---|
1587 | 1837 | |
---|
.. | .. |
---|
1604 | 1854 | * |
---|
1605 | 1855 | * Then, we check if a current large page is valid by only checking the validity |
---|
1606 | 1856 | * of the head pfn. |
---|
1607 | | - * |
---|
1608 | | - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave |
---|
1609 | | - * within a node: a pfn is between start and end of a node, but does not belong |
---|
1610 | | - * to this memory node. |
---|
1611 | 1857 | */ |
---|
1612 | | -static inline bool __init |
---|
1613 | | -deferred_pfn_valid(int nid, unsigned long pfn, |
---|
1614 | | - struct mminit_pfnnid_cache *nid_init_state) |
---|
| 1858 | +static inline bool __init deferred_pfn_valid(unsigned long pfn) |
---|
1615 | 1859 | { |
---|
1616 | 1860 | if (!pfn_valid_within(pfn)) |
---|
1617 | 1861 | return false; |
---|
1618 | 1862 | if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) |
---|
1619 | | - return false; |
---|
1620 | | - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) |
---|
1621 | 1863 | return false; |
---|
1622 | 1864 | return true; |
---|
1623 | 1865 | } |
---|
.. | .. |
---|
1626 | 1868 | * Free pages to buddy allocator. Try to free aligned pages in |
---|
1627 | 1869 | * pageblock_nr_pages sizes. |
---|
1628 | 1870 | */ |
---|
1629 | | -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, |
---|
| 1871 | +static void __init deferred_free_pages(unsigned long pfn, |
---|
1630 | 1872 | unsigned long end_pfn) |
---|
1631 | 1873 | { |
---|
1632 | | - struct mminit_pfnnid_cache nid_init_state = { }; |
---|
1633 | 1874 | unsigned long nr_pgmask = pageblock_nr_pages - 1; |
---|
1634 | 1875 | unsigned long nr_free = 0; |
---|
1635 | 1876 | |
---|
1636 | 1877 | for (; pfn < end_pfn; pfn++) { |
---|
1637 | | - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { |
---|
| 1878 | + if (!deferred_pfn_valid(pfn)) { |
---|
1638 | 1879 | deferred_free_range(pfn - nr_free, nr_free); |
---|
1639 | 1880 | nr_free = 0; |
---|
1640 | 1881 | } else if (!(pfn & nr_pgmask)) { |
---|
1641 | 1882 | deferred_free_range(pfn - nr_free, nr_free); |
---|
1642 | 1883 | nr_free = 1; |
---|
1643 | | - touch_nmi_watchdog(); |
---|
1644 | 1884 | } else { |
---|
1645 | 1885 | nr_free++; |
---|
1646 | 1886 | } |
---|
.. | .. |
---|
1654 | 1894 | * by performing it only once every pageblock_nr_pages. |
---|
1655 | 1895 | * Return number of pages initialized. |
---|
1656 | 1896 | */ |
---|
1657 | | -static unsigned long __init deferred_init_pages(int nid, int zid, |
---|
| 1897 | +static unsigned long __init deferred_init_pages(struct zone *zone, |
---|
1658 | 1898 | unsigned long pfn, |
---|
1659 | 1899 | unsigned long end_pfn) |
---|
1660 | 1900 | { |
---|
1661 | | - struct mminit_pfnnid_cache nid_init_state = { }; |
---|
1662 | 1901 | unsigned long nr_pgmask = pageblock_nr_pages - 1; |
---|
| 1902 | + int nid = zone_to_nid(zone); |
---|
1663 | 1903 | unsigned long nr_pages = 0; |
---|
| 1904 | + int zid = zone_idx(zone); |
---|
1664 | 1905 | struct page *page = NULL; |
---|
1665 | 1906 | |
---|
1666 | 1907 | for (; pfn < end_pfn; pfn++) { |
---|
1667 | | - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { |
---|
| 1908 | + if (!deferred_pfn_valid(pfn)) { |
---|
1668 | 1909 | page = NULL; |
---|
1669 | 1910 | continue; |
---|
1670 | 1911 | } else if (!page || !(pfn & nr_pgmask)) { |
---|
1671 | 1912 | page = pfn_to_page(pfn); |
---|
1672 | | - touch_nmi_watchdog(); |
---|
1673 | 1913 | } else { |
---|
1674 | 1914 | page++; |
---|
1675 | 1915 | } |
---|
.. | .. |
---|
1679 | 1919 | return (nr_pages); |
---|
1680 | 1920 | } |
---|
1681 | 1921 | |
---|
| 1922 | +/* |
---|
| 1923 | + * This function is meant to pre-load the iterator for the zone init. |
---|
| 1924 | + * Specifically it walks through the ranges until we are caught up to the |
---|
| 1925 | + * first_init_pfn value and exits there. If we never encounter the value we |
---|
| 1926 | + * return false indicating there are no valid ranges left. |
---|
| 1927 | + */ |
---|
| 1928 | +static bool __init |
---|
| 1929 | +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, |
---|
| 1930 | + unsigned long *spfn, unsigned long *epfn, |
---|
| 1931 | + unsigned long first_init_pfn) |
---|
| 1932 | +{ |
---|
| 1933 | + u64 j; |
---|
| 1934 | + |
---|
| 1935 | + /* |
---|
| 1936 | + * Start out by walking through the ranges in this zone that have |
---|
| 1937 | + * already been initialized. We don't need to do anything with them |
---|
| 1938 | + * so we just need to flush them out of the system. |
---|
| 1939 | + */ |
---|
| 1940 | + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { |
---|
| 1941 | + if (*epfn <= first_init_pfn) |
---|
| 1942 | + continue; |
---|
| 1943 | + if (*spfn < first_init_pfn) |
---|
| 1944 | + *spfn = first_init_pfn; |
---|
| 1945 | + *i = j; |
---|
| 1946 | + return true; |
---|
| 1947 | + } |
---|
| 1948 | + |
---|
| 1949 | + return false; |
---|
| 1950 | +} |
---|
| 1951 | + |
---|
| 1952 | +/* |
---|
| 1953 | + * Initialize and free pages. We do it in two loops: first we initialize |
---|
| 1954 | + * struct page, then free to buddy allocator, because while we are |
---|
| 1955 | + * freeing pages we can access pages that are ahead (computing buddy |
---|
| 1956 | + * page in __free_one_page()). |
---|
| 1957 | + * |
---|
| 1958 | + * In order to try and keep some memory in the cache we have the loop |
---|
| 1959 | + * broken along max page order boundaries. This way we will not cause |
---|
| 1960 | + * any issues with the buddy page computation. |
---|
| 1961 | + */ |
---|
| 1962 | +static unsigned long __init |
---|
| 1963 | +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, |
---|
| 1964 | + unsigned long *end_pfn) |
---|
| 1965 | +{ |
---|
| 1966 | + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); |
---|
| 1967 | + unsigned long spfn = *start_pfn, epfn = *end_pfn; |
---|
| 1968 | + unsigned long nr_pages = 0; |
---|
| 1969 | + u64 j = *i; |
---|
| 1970 | + |
---|
| 1971 | + /* First we loop through and initialize the page values */ |
---|
| 1972 | + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { |
---|
| 1973 | + unsigned long t; |
---|
| 1974 | + |
---|
| 1975 | + if (mo_pfn <= *start_pfn) |
---|
| 1976 | + break; |
---|
| 1977 | + |
---|
| 1978 | + t = min(mo_pfn, *end_pfn); |
---|
| 1979 | + nr_pages += deferred_init_pages(zone, *start_pfn, t); |
---|
| 1980 | + |
---|
| 1981 | + if (mo_pfn < *end_pfn) { |
---|
| 1982 | + *start_pfn = mo_pfn; |
---|
| 1983 | + break; |
---|
| 1984 | + } |
---|
| 1985 | + } |
---|
| 1986 | + |
---|
| 1987 | + /* Reset values and now loop through freeing pages as needed */ |
---|
| 1988 | + swap(j, *i); |
---|
| 1989 | + |
---|
| 1990 | + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { |
---|
| 1991 | + unsigned long t; |
---|
| 1992 | + |
---|
| 1993 | + if (mo_pfn <= spfn) |
---|
| 1994 | + break; |
---|
| 1995 | + |
---|
| 1996 | + t = min(mo_pfn, epfn); |
---|
| 1997 | + deferred_free_pages(spfn, t); |
---|
| 1998 | + |
---|
| 1999 | + if (mo_pfn <= epfn) |
---|
| 2000 | + break; |
---|
| 2001 | + } |
---|
| 2002 | + |
---|
| 2003 | + return nr_pages; |
---|
| 2004 | +} |
---|
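deferred_init_maxorder() above caps each init-then-free pass at the next MAX_ORDER boundary, so the buddy calculation in __free_one_page() never reaches past memmap that has been initialized. A minimal stand-alone sketch of that ALIGN() arithmetic, assuming MAX_ORDER_NR_PAGES is 1024 (1 << (MAX_ORDER - 1) on common configs):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL
#define ALIGN_UP(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long spfn = 1000, epfn = 5000;

	while (spfn < epfn) {
		/* "+ 1" guarantees forward progress when spfn is already aligned */
		unsigned long mo_pfn = ALIGN_UP(spfn + 1, MAX_ORDER_NR_PAGES);
		unsigned long t = mo_pfn < epfn ? mo_pfn : epfn;

		printf("init pfns [%lu, %lu)\n", spfn, t);
		printf("free pfns [%lu, %lu)\n", spfn, t);
		spfn = t;
	}
	return 0;
}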
| 2005 | + |
---|
| 2006 | +static void __init |
---|
| 2007 | +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, |
---|
| 2008 | + void *arg) |
---|
| 2009 | +{ |
---|
| 2010 | + unsigned long spfn, epfn; |
---|
| 2011 | + struct zone *zone = arg; |
---|
| 2012 | + u64 i; |
---|
| 2013 | + |
---|
| 2014 | + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); |
---|
| 2015 | + |
---|
| 2016 | + /* |
---|
| 2017 | + * Initialize and free pages in MAX_ORDER sized increments so that we |
---|
| 2018 | + * can avoid introducing any issues with the buddy allocator. |
---|
| 2019 | + */ |
---|
| 2020 | + while (spfn < end_pfn) { |
---|
| 2021 | + deferred_init_maxorder(&i, zone, &spfn, &epfn); |
---|
| 2022 | + cond_resched(); |
---|
| 2023 | + } |
---|
| 2024 | +} |
---|
| 2025 | + |
---|
| 2026 | +/* An arch may override for more concurrency. */ |
---|
| 2027 | +__weak int __init |
---|
| 2028 | +deferred_page_init_max_threads(const struct cpumask *node_cpumask) |
---|
| 2029 | +{ |
---|
| 2030 | + return 1; |
---|
| 2031 | +} |
---|
| 2032 | + |
---|
1682 | 2033 | /* Initialise remaining memory on a node */ |
---|
1683 | 2034 | static int __init deferred_init_memmap(void *data) |
---|
1684 | 2035 | { |
---|
1685 | 2036 | pg_data_t *pgdat = data; |
---|
1686 | | - int nid = pgdat->node_id; |
---|
1687 | | - unsigned long start = jiffies; |
---|
1688 | | - unsigned long nr_pages = 0; |
---|
1689 | | - unsigned long spfn, epfn, first_init_pfn, flags; |
---|
1690 | | - phys_addr_t spa, epa; |
---|
1691 | | - int zid; |
---|
1692 | | - struct zone *zone; |
---|
1693 | 2037 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
---|
| 2038 | + unsigned long spfn = 0, epfn = 0; |
---|
| 2039 | + unsigned long first_init_pfn, flags; |
---|
| 2040 | + unsigned long start = jiffies; |
---|
| 2041 | + struct zone *zone; |
---|
| 2042 | + int zid, max_threads; |
---|
1694 | 2043 | u64 i; |
---|
1695 | 2044 | |
---|
1696 | 2045 | /* Bind memory initialisation thread to a local node if possible */ |
---|
.. | .. |
---|
1723 | 2072 | if (first_init_pfn < zone_end_pfn(zone)) |
---|
1724 | 2073 | break; |
---|
1725 | 2074 | } |
---|
1726 | | - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); |
---|
1727 | 2075 | |
---|
1728 | | - /* |
---|
1729 | | - * Initialize and free pages. We do it in two loops: first we initialize |
---|
1730 | | - * struct page, than free to buddy allocator, because while we are |
---|
1731 | | - * freeing pages we can access pages that are ahead (computing buddy |
---|
1732 | | - * page in __free_one_page()). |
---|
1733 | | - */ |
---|
1734 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1735 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1736 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
1737 | | - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); |
---|
1738 | | - } |
---|
1739 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1740 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1741 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
1742 | | - deferred_free_pages(nid, zid, spfn, epfn); |
---|
1743 | | - } |
---|
| 2076 | + /* If the zone is empty somebody else may have cleared out the zone */ |
---|
| 2077 | + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2078 | + first_init_pfn)) |
---|
| 2079 | + goto zone_empty; |
---|
1744 | 2080 | |
---|
| 2081 | + max_threads = deferred_page_init_max_threads(cpumask); |
---|
| 2082 | + |
---|
| 2083 | + while (spfn < epfn) { |
---|
| 2084 | + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); |
---|
| 2085 | + struct padata_mt_job job = { |
---|
| 2086 | + .thread_fn = deferred_init_memmap_chunk, |
---|
| 2087 | + .fn_arg = zone, |
---|
| 2088 | + .start = spfn, |
---|
| 2089 | + .size = epfn_align - spfn, |
---|
| 2090 | + .align = PAGES_PER_SECTION, |
---|
| 2091 | + .min_chunk = PAGES_PER_SECTION, |
---|
| 2092 | + .max_threads = max_threads, |
---|
| 2093 | + }; |
---|
| 2094 | + |
---|
| 2095 | + padata_do_multithreaded(&job); |
---|
| 2096 | + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2097 | + epfn_align); |
---|
| 2098 | + } |
---|
| 2099 | +zone_empty: |
---|
1745 | 2100 | /* Sanity check that the next zone really is unpopulated */ |
---|
1746 | 2101 | WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); |
---|
1747 | 2102 | |
---|
1748 | | - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, |
---|
1749 | | - jiffies_to_msecs(jiffies - start)); |
---|
| 2103 | + pr_info("node %d deferred pages initialised in %ums\n", |
---|
| 2104 | + pgdat->node_id, jiffies_to_msecs(jiffies - start)); |
---|
1750 | 2105 | |
---|
1751 | 2106 | pgdat_init_report_one_done(); |
---|
1752 | 2107 | return 0; |
---|
.. | .. |
---|
1770 | 2125 | static noinline bool __init |
---|
1771 | 2126 | deferred_grow_zone(struct zone *zone, unsigned int order) |
---|
1772 | 2127 | { |
---|
1773 | | - int zid = zone_idx(zone); |
---|
1774 | | - int nid = zone_to_nid(zone); |
---|
1775 | | - pg_data_t *pgdat = NODE_DATA(nid); |
---|
1776 | 2128 | unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); |
---|
1777 | | - unsigned long nr_pages = 0; |
---|
1778 | | - unsigned long first_init_pfn, spfn, epfn, t, flags; |
---|
| 2129 | + pg_data_t *pgdat = zone->zone_pgdat; |
---|
1779 | 2130 | unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; |
---|
1780 | | - phys_addr_t spa, epa; |
---|
| 2131 | + unsigned long spfn, epfn, flags; |
---|
| 2132 | + unsigned long nr_pages = 0; |
---|
1781 | 2133 | u64 i; |
---|
1782 | 2134 | |
---|
1783 | 2135 | /* Only the last zone may have deferred pages */ |
---|
.. | .. |
---|
1795 | 2147 | return true; |
---|
1796 | 2148 | } |
---|
1797 | 2149 | |
---|
1798 | | - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); |
---|
1799 | | - |
---|
1800 | | - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { |
---|
| 2150 | + /* If the zone is empty somebody else may have cleared out the zone */ |
---|
| 2151 | + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
---|
| 2152 | + first_deferred_pfn)) { |
---|
| 2153 | + pgdat->first_deferred_pfn = ULONG_MAX; |
---|
1801 | 2154 | pgdat_resize_unlock(pgdat, &flags); |
---|
1802 | | - return false; |
---|
| 2155 | + /* Retry only once. */ |
---|
| 2156 | + return first_deferred_pfn != ULONG_MAX; |
---|
1803 | 2157 | } |
---|
1804 | 2158 | |
---|
1805 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1806 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1807 | | - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); |
---|
| 2159 | + /* |
---|
| 2160 | + * Initialize and free pages in MAX_ORDER sized increments so |
---|
| 2161 | + * that we can avoid introducing any issues with the buddy |
---|
| 2162 | + * allocator. |
---|
| 2163 | + */ |
---|
| 2164 | + while (spfn < epfn) { |
---|
| 2165 | + /* update our first deferred PFN for this section */ |
---|
| 2166 | + first_deferred_pfn = spfn; |
---|
1808 | 2167 | |
---|
1809 | | - while (spfn < epfn && nr_pages < nr_pages_needed) { |
---|
1810 | | - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); |
---|
1811 | | - first_deferred_pfn = min(t, epfn); |
---|
1812 | | - nr_pages += deferred_init_pages(nid, zid, spfn, |
---|
1813 | | - first_deferred_pfn); |
---|
1814 | | - spfn = first_deferred_pfn; |
---|
1815 | | - } |
---|
| 2168 | + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); |
---|
| 2169 | + touch_nmi_watchdog(); |
---|
1816 | 2170 | |
---|
| 2171 | + /* We should only stop along section boundaries */ |
---|
| 2172 | + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) |
---|
| 2173 | + continue; |
---|
| 2174 | + |
---|
| 2175 | + /* If our quota has been met we can stop here */ |
---|
1817 | 2176 | if (nr_pages >= nr_pages_needed) |
---|
1818 | 2177 | break; |
---|
1819 | 2178 | } |
---|
1820 | 2179 | |
---|
1821 | | - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { |
---|
1822 | | - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); |
---|
1823 | | - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); |
---|
1824 | | - deferred_free_pages(nid, zid, spfn, epfn); |
---|
1825 | | - |
---|
1826 | | - if (first_deferred_pfn == epfn) |
---|
1827 | | - break; |
---|
1828 | | - } |
---|
1829 | | - pgdat->first_deferred_pfn = first_deferred_pfn; |
---|
| 2180 | + pgdat->first_deferred_pfn = spfn; |
---|
1830 | 2181 | pgdat_resize_unlock(pgdat, &flags); |
---|
1831 | 2182 | |
---|
1832 | 2183 | return nr_pages > 0; |
---|
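deferred_grow_zone() above only considers stopping once it has crossed a section boundary, using "(first_deferred_pfn ^ spfn) < PAGES_PER_SECTION". Because PAGES_PER_SECTION is a power of two, the XOR stays below it exactly when both pfns share a section index. A quick stand-alone check (the constant is an assumed typical value, not taken from the patch):

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL

static int same_section(unsigned long a, unsigned long b)
{
	/* high bits equal -> XOR has only low bits set -> below the section size */
	return (a ^ b) < PAGES_PER_SECTION;
}

int main(void)
{
	printf("%d\n", same_section(100, 200));                              /* 1: same section   */
	printf("%d\n", same_section(100, PAGES_PER_SECTION));                /* 0: crossed bound  */
	printf("%d\n", same_section(PAGES_PER_SECTION, PAGES_PER_SECTION + 1)); /* 1: same section */
	return 0;
}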
.. | .. |
---|
1849 | 2200 | void __init page_alloc_init_late(void) |
---|
1850 | 2201 | { |
---|
1851 | 2202 | struct zone *zone; |
---|
| 2203 | + int nid; |
---|
1852 | 2204 | |
---|
1853 | 2205 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
1854 | | - int nid; |
---|
1855 | 2206 | |
---|
1856 | 2207 | /* There will be num_node_state(N_MEMORY) threads */ |
---|
1857 | 2208 | atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); |
---|
.. | .. |
---|
1879 | 2230 | /* Reinit limits that are based on free pages after the kernel is up */ |
---|
1880 | 2231 | files_maxfiles_init(); |
---|
1881 | 2232 | #endif |
---|
1882 | | -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK |
---|
| 2233 | + |
---|
1883 | 2234 | /* Discard memblock private memory */ |
---|
1884 | 2235 | memblock_discard(); |
---|
1885 | | -#endif |
---|
| 2236 | + |
---|
| 2237 | + for_each_node_state(nid, N_MEMORY) |
---|
| 2238 | + shuffle_free_memory(NODE_DATA(nid)); |
---|
1886 | 2239 | |
---|
1887 | 2240 | for_each_populated_zone(zone) |
---|
1888 | 2241 | set_zone_contiguous(zone); |
---|
.. | .. |
---|
1916 | 2269 | } |
---|
1917 | 2270 | |
---|
1918 | 2271 | adjust_managed_page_count(page, pageblock_nr_pages); |
---|
| 2272 | + page_zone(page)->cma_pages += pageblock_nr_pages; |
---|
1919 | 2273 | } |
---|
1920 | 2274 | #endif |
---|
1921 | 2275 | |
---|
.. | .. |
---|
1934 | 2288 | * -- nyc |
---|
1935 | 2289 | */ |
---|
1936 | 2290 | static inline void expand(struct zone *zone, struct page *page, |
---|
1937 | | - int low, int high, struct free_area *area, |
---|
1938 | | - int migratetype) |
---|
| 2291 | + int low, int high, int migratetype) |
---|
1939 | 2292 | { |
---|
1940 | 2293 | unsigned long size = 1 << high; |
---|
1941 | 2294 | |
---|
1942 | 2295 | while (high > low) { |
---|
1943 | | - area--; |
---|
1944 | 2296 | high--; |
---|
1945 | 2297 | size >>= 1; |
---|
1946 | 2298 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
---|
.. | .. |
---|
1954 | 2306 | if (set_page_guard(zone, &page[size], high, migratetype)) |
---|
1955 | 2307 | continue; |
---|
1956 | 2308 | |
---|
1957 | | - list_add(&page[size].lru, &area->free_list[migratetype]); |
---|
1958 | | - area->nr_free++; |
---|
1959 | | - set_page_order(&page[size], high); |
---|
| 2309 | + add_to_free_list(&page[size], zone, high, migratetype); |
---|
| 2310 | + set_buddy_order(&page[size], high); |
---|
1960 | 2311 | } |
---|
1961 | 2312 | } |
---|
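expand() above carves an order-"low" page out of an order-"high" block, returning the upper half of each split to the free lists via add_to_free_list(). A user-space model of that loop, with the free-list insertion replaced by a printf so the splitting pattern is visible:

#include <stdio.h>

static void expand_model(unsigned long base_pfn, int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half goes back on the order-"high" free list */
		printf("free block: pfn %lu, order %d (%lu pages)\n",
		       base_pfn + size, high, size);
	}
	printf("keep block: pfn %lu, order %d\n", base_pfn, low);
}

int main(void)
{
	/* carve an order-0 page out of an order-3 block starting at pfn 4096 */
	expand_model(4096, 0, 3);
	return 0;
}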
1962 | 2313 | |
---|
1963 | 2314 | static void check_new_page_bad(struct page *page) |
---|
1964 | 2315 | { |
---|
1965 | | - const char *bad_reason = NULL; |
---|
1966 | | - unsigned long bad_flags = 0; |
---|
1967 | | - |
---|
1968 | | - if (unlikely(atomic_read(&page->_mapcount) != -1)) |
---|
1969 | | - bad_reason = "nonzero mapcount"; |
---|
1970 | | - if (unlikely(page->mapping != NULL)) |
---|
1971 | | - bad_reason = "non-NULL mapping"; |
---|
1972 | | - if (unlikely(page_ref_count(page) != 0)) |
---|
1973 | | - bad_reason = "nonzero _count"; |
---|
1974 | 2316 | if (unlikely(page->flags & __PG_HWPOISON)) { |
---|
1975 | | - bad_reason = "HWPoisoned (hardware-corrupted)"; |
---|
1976 | | - bad_flags = __PG_HWPOISON; |
---|
1977 | 2317 | /* Don't complain about hwpoisoned pages */ |
---|
1978 | 2318 | page_mapcount_reset(page); /* remove PageBuddy */ |
---|
1979 | 2319 | return; |
---|
1980 | 2320 | } |
---|
1981 | | - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { |
---|
1982 | | - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; |
---|
1983 | | - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; |
---|
1984 | | - } |
---|
1985 | | -#ifdef CONFIG_MEMCG |
---|
1986 | | - if (unlikely(page->mem_cgroup)) |
---|
1987 | | - bad_reason = "page still charged to cgroup"; |
---|
1988 | | -#endif |
---|
1989 | | - bad_page(page, bad_reason, bad_flags); |
---|
| 2321 | + |
---|
| 2322 | + bad_page(page, |
---|
| 2323 | + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); |
---|
1990 | 2324 | } |
---|
1991 | 2325 | |
---|
1992 | 2326 | /* |
---|
.. | .. |
---|
2002 | 2336 | return 1; |
---|
2003 | 2337 | } |
---|
2004 | 2338 | |
---|
2005 | | -static inline bool free_pages_prezeroed(void) |
---|
2006 | | -{ |
---|
2007 | | - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && |
---|
2008 | | - page_poisoning_enabled()) || want_init_on_free(); |
---|
2009 | | -} |
---|
2010 | | - |
---|
2011 | 2339 | #ifdef CONFIG_DEBUG_VM |
---|
2012 | | -static bool check_pcp_refill(struct page *page) |
---|
| 2340 | +/* |
---|
| 2341 | + * With DEBUG_VM enabled, order-0 pages are checked for expected state when |
---|
| 2342 | + * being allocated from pcp lists. With debug_pagealloc also enabled, they are |
---|
| 2343 | + * also checked when pcp lists are refilled from the free lists. |
---|
| 2344 | + */ |
---|
| 2345 | +static inline bool check_pcp_refill(struct page *page) |
---|
2013 | 2346 | { |
---|
2014 | | - return false; |
---|
| 2347 | + if (debug_pagealloc_enabled_static()) |
---|
| 2348 | + return check_new_page(page); |
---|
| 2349 | + else |
---|
| 2350 | + return false; |
---|
2015 | 2351 | } |
---|
2016 | 2352 | |
---|
2017 | | -static bool check_new_pcp(struct page *page) |
---|
| 2353 | +static inline bool check_new_pcp(struct page *page) |
---|
2018 | 2354 | { |
---|
2019 | 2355 | return check_new_page(page); |
---|
2020 | 2356 | } |
---|
2021 | 2357 | #else |
---|
2022 | | -static bool check_pcp_refill(struct page *page) |
---|
| 2358 | +/* |
---|
| 2359 | + * With DEBUG_VM disabled, free order-0 pages are checked for expected state |
---|
| 2360 | + * when pcp lists are being refilled from the free lists. With debug_pagealloc |
---|
| 2361 | + * enabled, they are also checked when being allocated from the pcp lists. |
---|
| 2362 | + */ |
---|
| 2363 | +static inline bool check_pcp_refill(struct page *page) |
---|
2023 | 2364 | { |
---|
2024 | 2365 | return check_new_page(page); |
---|
2025 | 2366 | } |
---|
2026 | | -static bool check_new_pcp(struct page *page) |
---|
| 2367 | +static inline bool check_new_pcp(struct page *page) |
---|
2027 | 2368 | { |
---|
2028 | | - return false; |
---|
| 2369 | + if (debug_pagealloc_enabled_static()) |
---|
| 2370 | + return check_new_page(page); |
---|
| 2371 | + else |
---|
| 2372 | + return false; |
---|
2029 | 2373 | } |
---|
2030 | 2374 | #endif /* CONFIG_DEBUG_VM */ |
---|
2031 | 2375 | |
---|
.. | .. |
---|
2049 | 2393 | set_page_refcounted(page); |
---|
2050 | 2394 | |
---|
2051 | 2395 | arch_alloc_page(page, order); |
---|
2052 | | - kernel_map_pages(page, 1 << order, 1); |
---|
2053 | | - kasan_alloc_pages(page, order); |
---|
2054 | | - kernel_poison_pages(page, 1 << order, 1); |
---|
| 2396 | + debug_pagealloc_map_pages(page, 1 << order); |
---|
| 2397 | + |
---|
| 2398 | + /* |
---|
| 2399 | + * Page unpoisoning must happen before memory initialization. |
---|
| 2400 | + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
---|
| 2401 | + * allocations and the page unpoisoning code will complain. |
---|
| 2402 | + */ |
---|
| 2403 | + kernel_unpoison_pages(page, 1 << order); |
---|
| 2404 | + |
---|
| 2405 | + /* |
---|
| 2406 | + * As memory initialization might be integrated into KASAN, |
---|
| 2407 | + * kasan_alloc_pages and kernel_init_free_pages must be |
---|
| 2408 | + * kept together to avoid discrepancies in behavior. |
---|
| 2409 | + */ |
---|
| 2410 | + if (kasan_has_integrated_init()) { |
---|
| 2411 | + kasan_alloc_pages(page, order, gfp_flags); |
---|
| 2412 | + } else { |
---|
| 2413 | + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); |
---|
| 2414 | + |
---|
| 2415 | + kasan_unpoison_pages(page, order, init); |
---|
| 2416 | + if (init) |
---|
| 2417 | + kernel_init_free_pages(page, 1 << order, |
---|
| 2418 | + gfp_flags & __GFP_ZEROTAGS); |
---|
| 2419 | + } |
---|
| 2420 | + |
---|
2055 | 2421 | set_page_owner(page, order, gfp_flags); |
---|
2056 | 2422 | } |
---|
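post_alloc_hook() above insists on unpoisoning before memory initialization: zeroing a __GFP_ZERO page first would destroy the poison pattern that the unpoison check verifies, and the check would complain. A toy model of that ordering (editorial sketch only; a byte buffer stands in for the page and the 0xaa pattern is illustrative):

#include <stdio.h>
#include <string.h>

#define POISON 0xaa
#define PAGE   64		/* toy "page" size */

static unsigned char page[PAGE];

static void poison_on_free(void)  { memset(page, POISON, PAGE); }

static int unpoison_on_alloc(void)	/* 0 if the poison pattern is intact */
{
	for (int i = 0; i < PAGE; i++)
		if (page[i] != POISON)
			return -1;	/* the kernel would warn here */
	return 0;
}

int main(void)
{
	/* correct order: unpoison (check) first, then zero for __GFP_ZERO */
	poison_on_free();
	if (unpoison_on_alloc() == 0)
		printf("pattern intact, zeroing now\n");
	memset(page, 0, PAGE);

	/* wrong order: zeroing first makes the check fail */
	poison_on_free();
	memset(page, 0, PAGE);
	printf("check after premature zeroing: %d\n", unpoison_on_alloc());
	return 0;
}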
2057 | 2423 | |
---|
.. | .. |
---|
2059 | 2425 | unsigned int alloc_flags) |
---|
2060 | 2426 | { |
---|
2061 | 2427 | post_alloc_hook(page, order, gfp_flags); |
---|
2062 | | - |
---|
2063 | | - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) |
---|
2064 | | - kernel_init_free_pages(page, 1 << order); |
---|
2065 | 2428 | |
---|
2066 | 2429 | if (order && (gfp_flags & __GFP_COMP)) |
---|
2067 | 2430 | prep_compound_page(page, order); |
---|
.. | .. |
---|
2076 | 2439 | set_page_pfmemalloc(page); |
---|
2077 | 2440 | else |
---|
2078 | 2441 | clear_page_pfmemalloc(page); |
---|
| 2442 | + trace_android_vh_test_clear_look_around_ref(page); |
---|
2079 | 2443 | } |
---|
2080 | 2444 | |
---|
2081 | 2445 | /* |
---|
.. | .. |
---|
2093 | 2457 | /* Find a page of the appropriate size in the preferred list */ |
---|
2094 | 2458 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
---|
2095 | 2459 | area = &(zone->free_area[current_order]); |
---|
2096 | | - page = list_first_entry_or_null(&area->free_list[migratetype], |
---|
2097 | | - struct page, lru); |
---|
| 2460 | + page = get_page_from_free_area(area, migratetype); |
---|
2098 | 2461 | if (!page) |
---|
2099 | 2462 | continue; |
---|
2100 | | - list_del(&page->lru); |
---|
2101 | | - rmv_page_order(page); |
---|
2102 | | - area->nr_free--; |
---|
2103 | | - expand(zone, page, order, current_order, area, migratetype); |
---|
| 2463 | + del_page_from_free_list(page, zone, current_order); |
---|
| 2464 | + expand(zone, page, order, current_order, migratetype); |
---|
2104 | 2465 | set_pcppage_migratetype(page, migratetype); |
---|
2105 | 2466 | return page; |
---|
2106 | 2467 | } |
---|
.. | .. |
---|
2113 | 2474 | * This array describes the order lists are fallen back to when |
---|
2114 | 2475 | * the free lists for the desirable migrate type are depleted |
---|
2115 | 2476 | */ |
---|
2116 | | -static int fallbacks[MIGRATE_TYPES][4] = { |
---|
| 2477 | +static int fallbacks[MIGRATE_TYPES][3] = { |
---|
2117 | 2478 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2118 | | - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2119 | 2479 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, |
---|
| 2480 | + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
---|
2120 | 2481 | #ifdef CONFIG_CMA |
---|
2121 | 2482 | [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ |
---|
2122 | 2483 | #endif |
---|
.. | .. |
---|
2137 | 2498 | #endif |
---|
2138 | 2499 | |
---|
2139 | 2500 | /* |
---|
2140 | | - * Move the free pages in a range to the free lists of the requested type. |
---|
| 2501 | + * Move the free pages in a range to the freelist tail of the requested type. |
---|
2141 | 2502 | * Note that start_page and end_pages are not aligned on a pageblock |
---|
2142 | 2503 | * boundary. If alignment is required, use move_freepages_block() |
---|
2143 | 2504 | */ |
---|
.. | .. |
---|
2149 | 2510 | unsigned int order; |
---|
2150 | 2511 | int pages_moved = 0; |
---|
2151 | 2512 | |
---|
2152 | | -#ifndef CONFIG_HOLES_IN_ZONE |
---|
2153 | | - /* |
---|
2154 | | - * page_zone is not safe to call in this context when |
---|
2155 | | - * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant |
---|
2156 | | - * anyway as we check zone boundaries in move_freepages_block(). |
---|
2157 | | - * Remove at a later date when no bug reports exist related to |
---|
2158 | | - * grouping pages by mobility |
---|
2159 | | - */ |
---|
2160 | | - VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && |
---|
2161 | | - pfn_valid(page_to_pfn(end_page)) && |
---|
2162 | | - page_zone(start_page) != page_zone(end_page)); |
---|
2163 | | -#endif |
---|
2164 | | - |
---|
2165 | | - if (num_movable) |
---|
2166 | | - *num_movable = 0; |
---|
2167 | | - |
---|
2168 | 2513 | for (page = start_page; page <= end_page;) { |
---|
2169 | 2514 | if (!pfn_valid_within(page_to_pfn(page))) { |
---|
2170 | 2515 | page++; |
---|
2171 | 2516 | continue; |
---|
2172 | 2517 | } |
---|
2173 | | - |
---|
2174 | | - /* Make sure we are not inadvertently changing nodes */ |
---|
2175 | | - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
---|
2176 | 2518 | |
---|
2177 | 2519 | if (!PageBuddy(page)) { |
---|
2178 | 2520 | /* |
---|
.. | .. |
---|
2188 | 2530 | continue; |
---|
2189 | 2531 | } |
---|
2190 | 2532 | |
---|
2191 | | - order = page_order(page); |
---|
2192 | | - list_move(&page->lru, |
---|
2193 | | - &zone->free_area[order].free_list[migratetype]); |
---|
| 2533 | + /* Make sure we are not inadvertently changing nodes */ |
---|
| 2534 | + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
---|
| 2535 | + VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
---|
| 2536 | + |
---|
| 2537 | + order = buddy_order(page); |
---|
| 2538 | + move_to_free_list(page, zone, order, migratetype); |
---|
2194 | 2539 | page += 1 << order; |
---|
2195 | 2540 | pages_moved += 1 << order; |
---|
2196 | 2541 | } |
---|
.. | .. |
---|
2203 | 2548 | { |
---|
2204 | 2549 | unsigned long start_pfn, end_pfn; |
---|
2205 | 2550 | struct page *start_page, *end_page; |
---|
| 2551 | + |
---|
| 2552 | + if (num_movable) |
---|
| 2553 | + *num_movable = 0; |
---|
2206 | 2554 | |
---|
2207 | 2555 | start_pfn = page_to_pfn(page); |
---|
2208 | 2556 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
---|
.. | .. |
---|
2264 | 2612 | return false; |
---|
2265 | 2613 | } |
---|
2266 | 2614 | |
---|
| 2615 | +static inline bool boost_watermark(struct zone *zone) |
---|
| 2616 | +{ |
---|
| 2617 | + unsigned long max_boost; |
---|
| 2618 | + |
---|
| 2619 | + if (!watermark_boost_factor) |
---|
| 2620 | + return false; |
---|
| 2621 | + /* |
---|
| 2622 | + * Don't bother in zones that are unlikely to produce results. |
---|
| 2623 | + * On small machines, including kdump capture kernels running |
---|
| 2624 | + * in a small area, boosting the watermark can cause an out of |
---|
| 2625 | + * memory situation immediately. |
---|
| 2626 | + */ |
---|
| 2627 | + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
---|
| 2628 | + return false; |
---|
| 2629 | + |
---|
| 2630 | + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
---|
| 2631 | + watermark_boost_factor, 10000); |
---|
| 2632 | + |
---|
| 2633 | + /* |
---|
| 2634 | + * high watermark may be uninitialised if fragmentation occurs |
---|
| 2635 | + * very early in boot so do not boost. We do not fall |
---|
| 2636 | + * through and boost by pageblock_nr_pages as failing |
---|
| 2637 | + * allocations that early means that reclaim is not going |
---|
| 2638 | + * to help and it may even be impossible to reclaim the |
---|
| 2639 | + * boosted watermark resulting in a hang. |
---|
| 2640 | + */ |
---|
| 2641 | + if (!max_boost) |
---|
| 2642 | + return false; |
---|
| 2643 | + |
---|
| 2644 | + max_boost = max(pageblock_nr_pages, max_boost); |
---|
| 2645 | + |
---|
| 2646 | + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
---|
| 2647 | + max_boost); |
---|
| 2648 | + |
---|
| 2649 | + return true; |
---|
| 2650 | +} |
---|
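boost_watermark() above bumps zone->watermark_boost by one pageblock per fallback, capped at mult_frac(high watermark, watermark_boost_factor, 10000), and skips tiny zones entirely. A stand-alone model of that arithmetic (the zone fields are plain variables here and the numbers are illustrative, not taken from the patch):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

static unsigned long watermark_boost_factor = 15000;	/* assumed default: 150% */

static unsigned long boost_once(unsigned long high_wmark, unsigned long managed,
				unsigned long cur_boost)
{
	unsigned long max_boost;

	if (!watermark_boost_factor)
		return cur_boost;
	if (PAGEBLOCK_NR_PAGES * 4 > managed)		/* tiny zone: don't bother */
		return cur_boost;

	max_boost = high_wmark * watermark_boost_factor / 10000;  /* mult_frac() */
	if (!max_boost)
		return cur_boost;
	if (max_boost < PAGEBLOCK_NR_PAGES)
		max_boost = PAGEBLOCK_NR_PAGES;

	cur_boost += PAGEBLOCK_NR_PAGES;
	return cur_boost < max_boost ? cur_boost : max_boost;
}

int main(void)
{
	unsigned long boost = 0;

	for (int i = 0; i < 5; i++) {
		boost = boost_once(1024, 1 << 20, boost);
		printf("boost after fallback %d: %lu pages\n", i + 1, boost);
	}
	return 0;
}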
| 2651 | + |
---|
2267 | 2652 | /* |
---|
2268 | 2653 | * This function implements actual steal behaviour. If order is large enough, |
---|
2269 | 2654 | * we can steal whole pageblock. If not, we first move freepages in this |
---|
.. | .. |
---|
2273 | 2658 | * itself, so pages freed in the future will be put on the correct free list. |
---|
2274 | 2659 | */ |
---|
2275 | 2660 | static void steal_suitable_fallback(struct zone *zone, struct page *page, |
---|
2276 | | - int start_type, bool whole_block) |
---|
| 2661 | + unsigned int alloc_flags, int start_type, bool whole_block) |
---|
2277 | 2662 | { |
---|
2278 | | - unsigned int current_order = page_order(page); |
---|
2279 | | - struct free_area *area; |
---|
| 2663 | + unsigned int current_order = buddy_order(page); |
---|
2280 | 2664 | int free_pages, movable_pages, alike_pages; |
---|
2281 | 2665 | int old_block_type; |
---|
2282 | 2666 | |
---|
.. | .. |
---|
2294 | 2678 | change_pageblock_range(page, current_order, start_type); |
---|
2295 | 2679 | goto single_page; |
---|
2296 | 2680 | } |
---|
| 2681 | + |
---|
| 2682 | + /* |
---|
| 2683 | + * Boost watermarks to increase reclaim pressure to reduce the |
---|
| 2684 | + * likelihood of future fallbacks. Wake kswapd now as the node |
---|
| 2685 | + * may be balanced overall and kswapd will not wake naturally. |
---|
| 2686 | + */ |
---|
| 2687 | + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
---|
| 2688 | + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
---|
2297 | 2689 | |
---|
2298 | 2690 | /* We are not allowed to try stealing from the whole block */ |
---|
2299 | 2691 | if (!whole_block) |
---|
.. | .. |
---|
2338 | 2730 | return; |
---|
2339 | 2731 | |
---|
2340 | 2732 | single_page: |
---|
2341 | | - area = &zone->free_area[current_order]; |
---|
2342 | | - list_move(&page->lru, &area->free_list[start_type]); |
---|
| 2733 | + move_to_free_list(page, zone, current_order, start_type); |
---|
2343 | 2734 | } |
---|
2344 | 2735 | |
---|
2345 | 2736 | /* |
---|
.. | .. |
---|
2363 | 2754 | if (fallback_mt == MIGRATE_TYPES) |
---|
2364 | 2755 | break; |
---|
2365 | 2756 | |
---|
2366 | | - if (list_empty(&area->free_list[fallback_mt])) |
---|
| 2757 | + if (free_area_empty(area, fallback_mt)) |
---|
2367 | 2758 | continue; |
---|
2368 | 2759 | |
---|
2369 | 2760 | if (can_steal_fallback(order, migratetype)) |
---|
.. | .. |
---|
2393 | 2784 | * Limit the number reserved to 1 pageblock or roughly 1% of a zone. |
---|
2394 | 2785 | * Check is race-prone but harmless. |
---|
2395 | 2786 | */ |
---|
2396 | | - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; |
---|
| 2787 | + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; |
---|
2397 | 2788 | if (zone->nr_reserved_highatomic >= max_managed) |
---|
2398 | 2789 | return; |
---|
2399 | 2790 | |
---|
.. | .. |
---|
2436 | 2827 | int order; |
---|
2437 | 2828 | bool ret; |
---|
2438 | 2829 | |
---|
2439 | | - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
---|
| 2830 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
---|
2440 | 2831 | ac->nodemask) { |
---|
2441 | 2832 | /* |
---|
2442 | 2833 | * Preserve at least one pageblock unless memory pressure |
---|
.. | .. |
---|
2450 | 2841 | for (order = 0; order < MAX_ORDER; order++) { |
---|
2451 | 2842 | struct free_area *area = &(zone->free_area[order]); |
---|
2452 | 2843 | |
---|
2453 | | - page = list_first_entry_or_null( |
---|
2454 | | - &area->free_list[MIGRATE_HIGHATOMIC], |
---|
2455 | | - struct page, lru); |
---|
| 2844 | + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); |
---|
2456 | 2845 | if (!page) |
---|
2457 | 2846 | continue; |
---|
2458 | 2847 | |
---|
.. | .. |
---|
2510 | 2899 | * condition simpler. |
---|
2511 | 2900 | */ |
---|
2512 | 2901 | static __always_inline bool |
---|
2513 | | -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
---|
| 2902 | +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
---|
| 2903 | + unsigned int alloc_flags) |
---|
2514 | 2904 | { |
---|
2515 | 2905 | struct free_area *area; |
---|
2516 | 2906 | int current_order; |
---|
| 2907 | + int min_order = order; |
---|
2517 | 2908 | struct page *page; |
---|
2518 | 2909 | int fallback_mt; |
---|
2519 | 2910 | bool can_steal; |
---|
| 2911 | + |
---|
| 2912 | + /* |
---|
| 2913 | + * Do not steal pages from freelists belonging to other pageblocks |
---|
| 2914 | + * i.e. orders < pageblock_order. If there are no local zones free, |
---|
| 2915 | + * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
---|
| 2916 | + */ |
---|
| 2917 | + if (alloc_flags & ALLOC_NOFRAGMENT) |
---|
| 2918 | + min_order = pageblock_order; |
---|
2520 | 2919 | |
---|
2521 | 2920 | /* |
---|
2522 | 2921 | * Find the largest available free page in the other list. This roughly |
---|
2523 | 2922 | * approximates finding the pageblock with the most free pages, which |
---|
2524 | 2923 | * would be too costly to do exactly. |
---|
2525 | 2924 | */ |
---|
2526 | | - for (current_order = MAX_ORDER - 1; current_order >= order; |
---|
| 2925 | + for (current_order = MAX_ORDER - 1; current_order >= min_order; |
---|
2527 | 2926 | --current_order) { |
---|
2528 | 2927 | area = &(zone->free_area[current_order]); |
---|
2529 | 2928 | fallback_mt = find_suitable_fallback(area, current_order, |
---|
.. | .. |
---|
2565 | 2964 | VM_BUG_ON(current_order == MAX_ORDER); |
---|
2566 | 2965 | |
---|
2567 | 2966 | do_steal: |
---|
2568 | | - page = list_first_entry(&area->free_list[fallback_mt], |
---|
2569 | | - struct page, lru); |
---|
| 2967 | + page = get_page_from_free_area(area, fallback_mt); |
---|
2570 | 2968 | |
---|
2571 | | - steal_suitable_fallback(zone, page, start_migratetype, can_steal); |
---|
| 2969 | + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, |
---|
| 2970 | + can_steal); |
---|
2572 | 2971 | |
---|
2573 | 2972 | trace_mm_page_alloc_extfrag(page, order, current_order, |
---|
2574 | 2973 | start_migratetype, fallback_mt); |
---|
.. | .. |
---|
2582 | 2981 | * Call me with the zone->lock already held. |
---|
2583 | 2982 | */ |
---|
2584 | 2983 | static __always_inline struct page * |
---|
2585 | | -__rmqueue(struct zone *zone, unsigned int order, int migratetype) |
---|
| 2984 | +__rmqueue(struct zone *zone, unsigned int order, int migratetype, |
---|
| 2985 | + unsigned int alloc_flags) |
---|
2586 | 2986 | { |
---|
2587 | 2987 | struct page *page; |
---|
2588 | 2988 | |
---|
2589 | 2989 | retry: |
---|
2590 | 2990 | page = __rmqueue_smallest(zone, order, migratetype); |
---|
2591 | 2991 | |
---|
2592 | | - if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype)) |
---|
| 2992 | + if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype, |
---|
| 2993 | + alloc_flags)) |
---|
2593 | 2994 | goto retry; |
---|
2594 | 2995 | |
---|
2595 | 2996 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
---|
.. | .. |
---|
2597 | 2998 | } |
---|
2598 | 2999 | |
---|
2599 | 3000 | #ifdef CONFIG_CMA |
---|
2600 | | -static struct page *__rmqueue_cma(struct zone *zone, unsigned int order) |
---|
| 3001 | +static struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
---|
| 3002 | + int migratetype, |
---|
| 3003 | + unsigned int alloc_flags) |
---|
2601 | 3004 | { |
---|
2602 | | - struct page *page = 0; |
---|
2603 | | - |
---|
2604 | | - if (IS_ENABLED(CONFIG_CMA)) |
---|
2605 | | - if (!zone->cma_alloc) |
---|
2606 | | - page = __rmqueue_cma_fallback(zone, order); |
---|
| 3005 | + struct page *page = __rmqueue_cma_fallback(zone, order); |
---|
2607 | 3006 | trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); |
---|
2608 | 3007 | return page; |
---|
2609 | 3008 | } |
---|
2610 | 3009 | #else |
---|
2611 | | -static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order) |
---|
| 3010 | +static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
---|
| 3011 | + int migratetype, |
---|
| 3012 | + unsigned int alloc_flags) |
---|
2612 | 3013 | { |
---|
2613 | 3014 | return NULL; |
---|
2614 | 3015 | } |
---|
.. | .. |
---|
2621 | 3022 | */ |
---|
2622 | 3023 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
---|
2623 | 3024 | unsigned long count, struct list_head *list, |
---|
2624 | | - int migratetype) |
---|
| 3025 | + int migratetype, unsigned int alloc_flags) |
---|
2625 | 3026 | { |
---|
2626 | 3027 | int i, alloced = 0; |
---|
2627 | 3028 | |
---|
.. | .. |
---|
2629 | 3030 | for (i = 0; i < count; ++i) { |
---|
2630 | 3031 | struct page *page; |
---|
2631 | 3032 | |
---|
2632 | | - /* |
---|
2633 | | - * If migrate type CMA is being requested only try to |
---|
2634 | | - * satisfy the request with CMA pages to try and increase |
---|
2635 | | - * CMA utlization. |
---|
2636 | | - */ |
---|
2637 | 3033 | if (is_migrate_cma(migratetype)) |
---|
2638 | | - page = __rmqueue_cma(zone, order); |
---|
| 3034 | + page = __rmqueue_cma(zone, order, migratetype, |
---|
| 3035 | + alloc_flags); |
---|
2639 | 3036 | else |
---|
2640 | | - page = __rmqueue(zone, order, migratetype); |
---|
| 3037 | + page = __rmqueue(zone, order, migratetype, alloc_flags); |
---|
2641 | 3038 | |
---|
2642 | 3039 | if (unlikely(page == NULL)) |
---|
2643 | 3040 | break; |
---|
.. | .. |
---|
2680 | 3077 | */ |
---|
2681 | 3078 | static struct list_head *get_populated_pcp_list(struct zone *zone, |
---|
2682 | 3079 | unsigned int order, struct per_cpu_pages *pcp, |
---|
2683 | | - int migratetype) |
---|
| 3080 | + int migratetype, unsigned int alloc_flags) |
---|
2684 | 3081 | { |
---|
2685 | 3082 | struct list_head *list = &pcp->lists[migratetype]; |
---|
2686 | 3083 | |
---|
2687 | 3084 | if (list_empty(list)) { |
---|
2688 | 3085 | pcp->count += rmqueue_bulk(zone, order, |
---|
2689 | 3086 | pcp->batch, list, |
---|
2690 | | - migratetype); |
---|
| 3087 | + migratetype, alloc_flags); |
---|
2691 | 3088 | |
---|
2692 | 3089 | if (list_empty(list)) |
---|
2693 | 3090 | list = NULL; |
---|
.. | .. |
---|
2710 | 3107 | int to_drain, batch; |
---|
2711 | 3108 | LIST_HEAD(dst); |
---|
2712 | 3109 | |
---|
2713 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 3110 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
2714 | 3111 | batch = READ_ONCE(pcp->batch); |
---|
2715 | 3112 | to_drain = min(pcp->count, batch); |
---|
2716 | 3113 | if (to_drain > 0) |
---|
2717 | 3114 | isolate_pcp_pages(to_drain, pcp, &dst); |
---|
2718 | 3115 | |
---|
2719 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3116 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
2720 | 3117 | |
---|
2721 | 3118 | if (to_drain > 0) |
---|
2722 | 3119 | free_pcppages_bulk(zone, &dst, false); |
---|
.. | .. |
---|
2738 | 3135 | LIST_HEAD(dst); |
---|
2739 | 3136 | int count; |
---|
2740 | 3137 | |
---|
2741 | | - cpu_lock_irqsave(cpu, flags); |
---|
| 3138 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
2742 | 3139 | pset = per_cpu_ptr(zone->pageset, cpu); |
---|
2743 | 3140 | |
---|
2744 | 3141 | pcp = &pset->pcp; |
---|
.. | .. |
---|
2746 | 3143 | if (count) |
---|
2747 | 3144 | isolate_pcp_pages(count, pcp, &dst); |
---|
2748 | 3145 | |
---|
2749 | | - cpu_unlock_irqrestore(cpu, flags); |
---|
| 3146 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
2750 | 3147 | |
---|
2751 | 3148 | if (count) |
---|
2752 | 3149 | free_pcppages_bulk(zone, &dst, false); |
---|
.. | .. |
---|
2784 | 3181 | drain_pages(cpu); |
---|
2785 | 3182 | } |
---|
2786 | 3183 | |
---|
2787 | | -#ifndef CONFIG_PREEMPT_RT_BASE |
---|
2788 | 3184 | static void drain_local_pages_wq(struct work_struct *work) |
---|
2789 | 3185 | { |
---|
| 3186 | + struct pcpu_drain *drain; |
---|
| 3187 | + |
---|
| 3188 | + drain = container_of(work, struct pcpu_drain, work); |
---|
| 3189 | + |
---|
2790 | 3190 | /* |
---|
2791 | 3191 | * drain_all_pages doesn't use proper cpu hotplug protection so |
---|
2792 | 3192 | * we can race with cpu offline when the WQ can move this from |
---|
.. | .. |
---|
2794 | 3194 | * cpu which is allright but we also have to make sure to not move to |
---|
2795 | 3195 | * a different one. |
---|
2796 | 3196 | */ |
---|
2797 | | - preempt_disable(); |
---|
2798 | | - drain_local_pages(NULL); |
---|
2799 | | - preempt_enable(); |
---|
| 3197 | + migrate_disable(); |
---|
| 3198 | + drain_local_pages(drain->zone); |
---|
| 3199 | + migrate_enable(); |
---|
2800 | 3200 | } |
---|
2801 | | -#endif |
---|
2802 | 3201 | |
---|
2803 | 3202 | /* |
---|
2804 | 3203 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
---|
.. | .. |
---|
2865 | 3264 | else |
---|
2866 | 3265 | cpumask_clear_cpu(cpu, &cpus_with_pcps); |
---|
2867 | 3266 | } |
---|
2868 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
---|
| 3267 | + |
---|
2869 | 3268 | for_each_cpu(cpu, &cpus_with_pcps) { |
---|
2870 | | - if (zone) |
---|
2871 | | - drain_pages_zone(cpu, zone); |
---|
2872 | | - else |
---|
2873 | | - drain_pages(cpu); |
---|
2874 | | - } |
---|
2875 | | -#else |
---|
2876 | | - for_each_cpu(cpu, &cpus_with_pcps) { |
---|
2877 | | - struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); |
---|
2878 | | - INIT_WORK(work, drain_local_pages_wq); |
---|
2879 | | - queue_work_on(cpu, mm_percpu_wq, work); |
---|
| 3269 | + struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); |
---|
| 3270 | + |
---|
| 3271 | + drain->zone = zone; |
---|
| 3272 | + INIT_WORK(&drain->work, drain_local_pages_wq); |
---|
| 3273 | + queue_work_on(cpu, mm_percpu_wq, &drain->work); |
---|
2880 | 3274 | } |
---|
2881 | 3275 | for_each_cpu(cpu, &cpus_with_pcps) |
---|
2882 | | - flush_work(per_cpu_ptr(&pcpu_drain, cpu)); |
---|
2883 | | -#endif |
---|
| 3276 | + flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); |
---|
2884 | 3277 | |
---|
2885 | 3278 | mutex_unlock(&pcpu_drain_mutex); |
---|
2886 | 3279 | } |
---|
.. | .. |
---|
2958 | 3351 | struct zone *zone = page_zone(page); |
---|
2959 | 3352 | struct per_cpu_pages *pcp; |
---|
2960 | 3353 | int migratetype; |
---|
| 3354 | + bool pcp_skip_cma_pages = false; |
---|
2961 | 3355 | |
---|
2962 | 3356 | migratetype = get_pcppage_migratetype(page); |
---|
2963 | 3357 | __count_vm_event(PGFREE); |
---|
.. | .. |
---|
2970 | 3364 | * excessively into the page allocator |
---|
2971 | 3365 | */ |
---|
2972 | 3366 | if (migratetype >= MIGRATE_PCPTYPES) { |
---|
2973 | | - if (unlikely(is_migrate_isolate(migratetype))) { |
---|
2974 | | - free_one_page(zone, page, pfn, 0, migratetype); |
---|
| 3367 | + trace_android_vh_pcplist_add_cma_pages_bypass(migratetype, |
---|
| 3368 | + &pcp_skip_cma_pages); |
---|
| 3369 | + if (unlikely(is_migrate_isolate(migratetype)) || |
---|
| 3370 | + pcp_skip_cma_pages) { |
---|
| 3371 | + free_one_page(zone, page, pfn, 0, migratetype, |
---|
| 3372 | + FPI_NONE); |
---|
2975 | 3373 | return; |
---|
2976 | 3374 | } |
---|
2977 | 3375 | migratetype = MIGRATE_MOVABLE; |
---|
.. | .. |
---|
3000 | 3398 | if (!free_unref_page_prepare(page, pfn)) |
---|
3001 | 3399 | return; |
---|
3002 | 3400 | |
---|
3003 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 3401 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
3004 | 3402 | free_unref_page_commit(page, pfn, &dst); |
---|
3005 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3403 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3006 | 3404 | if (!list_empty(&dst)) |
---|
3007 | 3405 | free_pcppages_bulk(zone, &dst, false); |
---|
3008 | 3406 | } |
---|
.. | .. |
---|
3029 | 3427 | set_page_private(page, pfn); |
---|
3030 | 3428 | } |
---|
3031 | 3429 | |
---|
3032 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 3430 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
3033 | 3431 | list_for_each_entry_safe(page, next, list, lru) { |
---|
3034 | 3432 | unsigned long pfn = page_private(page); |
---|
3035 | 3433 | enum zone_type type; |
---|
.. | .. |
---|
3044 | 3442 | * a large list of pages to free. |
---|
3045 | 3443 | */ |
---|
3046 | 3444 | if (++batch_count == SWAP_CLUSTER_MAX) { |
---|
3047 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3445 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3048 | 3446 | batch_count = 0; |
---|
3049 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 3447 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
3050 | 3448 | } |
---|
3051 | 3449 | } |
---|
3052 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3450 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3053 | 3451 | |
---|
3054 | 3452 | for (i = 0; i < __MAX_NR_ZONES; ) { |
---|
3055 | 3453 | struct page *page; |
---|
.. | .. |
---|
3084 | 3482 | |
---|
3085 | 3483 | for (i = 1; i < (1 << order); i++) |
---|
3086 | 3484 | set_page_refcounted(page + i); |
---|
3087 | | - split_page_owner(page, order); |
---|
| 3485 | + split_page_owner(page, 1 << order); |
---|
| 3486 | + split_page_memcg(page, 1 << order); |
---|
3088 | 3487 | } |
---|
3089 | 3488 | EXPORT_SYMBOL_GPL(split_page); |
---|
3090 | 3489 | |
---|
.. | .. |
---|
3106 | 3505 | * watermark, because we already know our high-order page |
---|
3107 | 3506 | * exists. |
---|
3108 | 3507 | */ |
---|
3109 | | - watermark = min_wmark_pages(zone) + (1UL << order); |
---|
| 3508 | + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); |
---|
3110 | 3509 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) |
---|
3111 | 3510 | return 0; |
---|
3112 | 3511 | |
---|
.. | .. |
---|
3114 | 3513 | } |
---|
3115 | 3514 | |
---|
3116 | 3515 | /* Remove page from free list */ |
---|
3117 | | - list_del(&page->lru); |
---|
3118 | | - zone->free_area[order].nr_free--; |
---|
3119 | | - rmv_page_order(page); |
---|
| 3516 | + |
---|
| 3517 | + del_page_from_free_list(page, zone, order); |
---|
3120 | 3518 | |
---|
3121 | 3519 | /* |
---|
3122 | 3520 | * Set the pageblock if the isolated page is at least half of a |
---|
.. | .. |
---|
3135 | 3533 | |
---|
3136 | 3534 | |
---|
3137 | 3535 | return 1UL << order; |
---|
| 3536 | +} |
---|
| 3537 | + |
---|
| 3538 | +/** |
---|
| 3539 | + * __putback_isolated_page - Return a now-isolated page back where we got it |
---|
| 3540 | + * @page: Page that was isolated |
---|
| 3541 | + * @order: Order of the isolated page |
---|
| 3542 | + * @mt: The page's pageblock's migratetype |
---|
| 3543 | + * |
---|
| 3544 | + * This function is meant to return a page pulled from the free lists via |
---|
| 3545 | + * __isolate_free_page back to the free list it was pulled from. |
---|
| 3546 | + */ |
---|
| 3547 | +void __putback_isolated_page(struct page *page, unsigned int order, int mt) |
---|
| 3548 | +{ |
---|
| 3549 | + struct zone *zone = page_zone(page); |
---|
| 3550 | + |
---|
| 3551 | + /* zone lock should be held when this function is called */ |
---|
| 3552 | + lockdep_assert_held(&zone->lock); |
---|
| 3553 | + |
---|
| 3554 | + /* Return isolated page to tail of freelist. */ |
---|
| 3555 | + __free_one_page(page, page_to_pfn(page), zone, order, mt, |
---|
| 3556 | + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); |
---|
3138 | 3557 | } |
---|
3139 | 3558 | |
---|
3140 | 3559 | /* |
---|
.. | .. |
---|
3166 | 3585 | |
---|
3167 | 3586 | /* Remove page from the per-cpu list, caller must protect the list */ |
---|
3168 | 3587 | static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, |
---|
| 3588 | + unsigned int alloc_flags, |
---|
3169 | 3589 | struct per_cpu_pages *pcp, |
---|
3170 | 3590 | gfp_t gfp_flags) |
---|
3171 | 3591 | { |
---|
.. | .. |
---|
3175 | 3595 | do { |
---|
3176 | 3596 | /* First try to get CMA pages */ |
---|
3177 | 3597 | if (migratetype == MIGRATE_MOVABLE && |
---|
3178 | | - gfp_flags & __GFP_CMA) { |
---|
| 3598 | + alloc_flags & ALLOC_CMA) { |
---|
3179 | 3599 | list = get_populated_pcp_list(zone, 0, pcp, |
---|
3180 | | - get_cma_migrate_type()); |
---|
| 3600 | + get_cma_migrate_type(), alloc_flags); |
---|
3181 | 3601 | } |
---|
3182 | 3602 | |
---|
3183 | 3603 | if (list == NULL) { |
---|
.. | .. |
---|
3186 | 3606 | * free CMA pages. |
---|
3187 | 3607 | */ |
---|
3188 | 3608 | list = get_populated_pcp_list(zone, 0, pcp, |
---|
3189 | | - migratetype); |
---|
| 3609 | + migratetype, alloc_flags); |
---|
3190 | 3610 | if (unlikely(list == NULL) || |
---|
3191 | 3611 | unlikely(list_empty(list))) |
---|
3192 | 3612 | return NULL; |
---|
.. | .. |
---|
3202 | 3622 | |
---|
3203 | 3623 | /* Lock and remove page from the per-cpu list */ |
---|
3204 | 3624 | static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
---|
3205 | | - struct zone *zone, unsigned int order, |
---|
3206 | | - gfp_t gfp_flags, int migratetype) |
---|
| 3625 | + struct zone *zone, gfp_t gfp_flags, |
---|
| 3626 | + int migratetype, unsigned int alloc_flags) |
---|
3207 | 3627 | { |
---|
3208 | 3628 | struct per_cpu_pages *pcp; |
---|
3209 | 3629 | struct page *page; |
---|
3210 | 3630 | unsigned long flags; |
---|
3211 | 3631 | |
---|
3212 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 3632 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
3213 | 3633 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
---|
3214 | | - page = __rmqueue_pcplist(zone, migratetype, pcp, |
---|
| 3634 | + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, |
---|
3215 | 3635 | gfp_flags); |
---|
3216 | 3636 | if (page) { |
---|
3217 | | - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
---|
| 3637 | + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); |
---|
3218 | 3638 | zone_statistics(preferred_zone, zone); |
---|
3219 | 3639 | } |
---|
3220 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3640 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3221 | 3641 | return page; |
---|
3222 | 3642 | } |
---|
3223 | 3643 | |
---|
.. | .. |
---|
3234 | 3654 | struct page *page; |
---|
3235 | 3655 | |
---|
3236 | 3656 | if (likely(order == 0)) { |
---|
3237 | | - page = rmqueue_pcplist(preferred_zone, zone, order, |
---|
3238 | | - gfp_flags, migratetype); |
---|
| 3657 | + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, |
---|
| 3658 | + migratetype, alloc_flags); |
---|
3239 | 3659 | goto out; |
---|
3240 | 3660 | } |
---|
3241 | 3661 | |
---|
.. | .. |
---|
3244 | 3664 | * allocate greater than order-1 page units with __GFP_NOFAIL. |
---|
3245 | 3665 | */ |
---|
3246 | 3666 | WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); |
---|
3247 | | - local_spin_lock_irqsave(pa_lock, &zone->lock, flags); |
---|
| 3667 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
| 3668 | + spin_lock(&zone->lock); |
---|
3248 | 3669 | |
---|
3249 | 3670 | do { |
---|
3250 | 3671 | page = NULL; |
---|
3251 | | - |
---|
3252 | | - if (alloc_flags & ALLOC_HARDER) { |
---|
| 3672 | + /* |
---|
| 3673 | + * order-0 request can reach here when the pcplist is skipped |
---|
| 3674 | + * due to non-CMA allocation context. HIGHATOMIC area is |
---|
| 3675 | + * reserved for high-order atomic allocation, so order-0 |
---|
| 3676 | + * request should skip it. |
---|
| 3677 | + */ |
---|
| 3678 | + if (order > 0 && alloc_flags & ALLOC_HARDER) { |
---|
3253 | 3679 | page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); |
---|
3254 | 3680 | if (page) |
---|
3255 | 3681 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
---|
3256 | 3682 | } |
---|
3257 | | - |
---|
3258 | | - if (!page && migratetype == MIGRATE_MOVABLE && |
---|
3259 | | - gfp_flags & __GFP_CMA) |
---|
3260 | | - page = __rmqueue_cma(zone, order); |
---|
3261 | | - |
---|
3262 | | - if (!page) |
---|
3263 | | - page = __rmqueue(zone, order, migratetype); |
---|
| 3683 | + if (!page) { |
---|
| 3684 | + if (migratetype == MIGRATE_MOVABLE && |
---|
| 3685 | + alloc_flags & ALLOC_CMA) |
---|
| 3686 | + page = __rmqueue_cma(zone, order, migratetype, |
---|
| 3687 | + alloc_flags); |
---|
| 3688 | + if (!page) |
---|
| 3689 | + page = __rmqueue(zone, order, migratetype, |
---|
| 3690 | + alloc_flags); |
---|
| 3691 | + } |
---|
3264 | 3692 | } while (page && check_new_pages(page, order)); |
---|
3265 | | - |
---|
3266 | 3693 | spin_unlock(&zone->lock); |
---|
3267 | 3694 | if (!page) |
---|
3268 | 3695 | goto failed; |
---|
.. | .. |
---|
3271 | 3698 | |
---|
3272 | 3699 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
---|
3273 | 3700 | zone_statistics(preferred_zone, zone); |
---|
3274 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3701 | + trace_android_vh_rmqueue(preferred_zone, zone, order, |
---|
| 3702 | + gfp_flags, alloc_flags, migratetype); |
---|
| 3703 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3275 | 3704 | |
---|
3276 | 3705 | out: |
---|
| 3706 | + /* Separate test+clear to avoid unnecessary atomics */ |
---|
| 3707 | + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { |
---|
| 3708 | + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
---|
| 3709 | + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); |
---|
| 3710 | + } |
---|
| 3711 | + |
---|
3277 | 3712 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); |
---|
3278 | 3713 | return page; |
---|
3279 | 3714 | |
---|
3280 | 3715 | failed: |
---|
3281 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 3716 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
3282 | 3717 | return NULL; |
---|
3283 | 3718 | } |
---|
3284 | 3719 | |
---|
.. | .. |
---|
3303 | 3738 | } |
---|
3304 | 3739 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
---|
3305 | 3740 | |
---|
3306 | | -static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3741 | +static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
3307 | 3742 | { |
---|
3308 | 3743 | if (order < fail_page_alloc.min_order) |
---|
3309 | 3744 | return false; |
---|
.. | .. |
---|
3327 | 3762 | |
---|
3328 | 3763 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
---|
3329 | 3764 | &fail_page_alloc.attr); |
---|
3330 | | - if (IS_ERR(dir)) |
---|
3331 | | - return PTR_ERR(dir); |
---|
3332 | 3765 | |
---|
3333 | | - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
---|
3334 | | - &fail_page_alloc.ignore_gfp_reclaim)) |
---|
3335 | | - goto fail; |
---|
3336 | | - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
---|
3337 | | - &fail_page_alloc.ignore_gfp_highmem)) |
---|
3338 | | - goto fail; |
---|
3339 | | - if (!debugfs_create_u32("min-order", mode, dir, |
---|
3340 | | - &fail_page_alloc.min_order)) |
---|
3341 | | - goto fail; |
---|
| 3766 | + debugfs_create_bool("ignore-gfp-wait", mode, dir, |
---|
| 3767 | + &fail_page_alloc.ignore_gfp_reclaim); |
---|
| 3768 | + debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
---|
| 3769 | + &fail_page_alloc.ignore_gfp_highmem); |
---|
| 3770 | + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); |
---|
3342 | 3771 | |
---|
3343 | 3772 | return 0; |
---|
3344 | | -fail: |
---|
3345 | | - debugfs_remove_recursive(dir); |
---|
3346 | | - |
---|
3347 | | - return -ENOMEM; |
---|
3348 | 3773 | } |
---|
3349 | 3774 | |
---|
3350 | 3775 | late_initcall(fail_page_alloc_debugfs); |
---|
.. | .. |
---|
3353 | 3778 | |
---|
3354 | 3779 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
---|
3355 | 3780 | |
---|
3356 | | -static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3781 | +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
3357 | 3782 | { |
---|
3358 | 3783 | return false; |
---|
3359 | 3784 | } |
---|
3360 | 3785 | |
---|
3361 | 3786 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
---|
| 3787 | + |
---|
| 3788 | +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
---|
| 3789 | +{ |
---|
| 3790 | + return __should_fail_alloc_page(gfp_mask, order); |
---|
| 3791 | +} |
---|
| 3792 | +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); |
---|
| 3793 | + |
---|
| 3794 | +static inline long __zone_watermark_unusable_free(struct zone *z, |
---|
| 3795 | + unsigned int order, unsigned int alloc_flags) |
---|
| 3796 | +{ |
---|
| 3797 | + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); |
---|
| 3798 | + long unusable_free = (1 << order) - 1; |
---|
| 3799 | + |
---|
| 3800 | + /* |
---|
| 3801 | + * If the caller does not have rights to ALLOC_HARDER then subtract |
---|
| 3802 | + * the high-atomic reserves. This will over-estimate the size of the |
---|
| 3803 | + * atomic reserve but it avoids a search. |
---|
| 3804 | + */ |
---|
| 3805 | + if (likely(!alloc_harder)) |
---|
| 3806 | + unusable_free += z->nr_reserved_highatomic; |
---|
| 3807 | + |
---|
| 3808 | +#ifdef CONFIG_CMA |
---|
| 3809 | + /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
| 3810 | + if (!(alloc_flags & ALLOC_CMA)) |
---|
| 3811 | + unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
| 3812 | +#endif |
---|
| 3813 | + |
---|
| 3814 | + return unusable_free; |
---|
| 3815 | +} |
---|
3362 | 3816 | |
---|
3363 | 3817 | /* |
---|
3364 | 3818 | * Return true if free base pages are above 'mark'. For high-order checks it |
---|
.. | .. |
---|
3367 | 3821 | * to check in the allocation paths if no pages are free. |
---|
3368 | 3822 | */ |
---|
3369 | 3823 | bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
---|
3370 | | - int classzone_idx, unsigned int alloc_flags, |
---|
| 3824 | + int highest_zoneidx, unsigned int alloc_flags, |
---|
3371 | 3825 | long free_pages) |
---|
3372 | 3826 | { |
---|
3373 | 3827 | long min = mark; |
---|
.. | .. |
---|
3375 | 3829 | const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); |
---|
3376 | 3830 | |
---|
3377 | 3831 | /* free_pages may go negative - that's OK */ |
---|
3378 | | - free_pages -= (1 << order) - 1; |
---|
| 3832 | + free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); |
---|
3379 | 3833 | |
---|
3380 | 3834 | if (alloc_flags & ALLOC_HIGH) |
---|
3381 | 3835 | min -= min / 2; |
---|
3382 | 3836 | |
---|
3383 | | - /* |
---|
3384 | | - * If the caller does not have rights to ALLOC_HARDER then subtract |
---|
3385 | | - * the high-atomic reserves. This will over-estimate the size of the |
---|
3386 | | - * atomic reserve but it avoids a search. |
---|
3387 | | - */ |
---|
3388 | | - if (likely(!alloc_harder)) { |
---|
3389 | | - free_pages -= z->nr_reserved_highatomic; |
---|
3390 | | - } else { |
---|
| 3837 | + if (unlikely(alloc_harder)) { |
---|
3391 | 3838 | /* |
---|
3392 | 3839 | * OOM victims can try even harder than normal ALLOC_HARDER |
---|
3393 | 3840 | * users on the grounds that it's definitely going to be in |
---|
.. | .. |
---|
3400 | 3847 | min -= min / 4; |
---|
3401 | 3848 | } |
---|
3402 | 3849 | |
---|
3403 | | - |
---|
3404 | | -#ifdef CONFIG_CMA |
---|
3405 | | - /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
3406 | | - if (!(alloc_flags & ALLOC_CMA)) |
---|
3407 | | - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
3408 | | -#endif |
---|
3409 | | - |
---|
3410 | 3850 | /* |
---|
3411 | 3851 | * Check watermarks for an order-0 allocation request. If these |
---|
3412 | 3852 | * are not met, then a high-order request also cannot go ahead |
---|
3413 | 3853 | * even if a suitable page happened to be free. |
---|
3414 | 3854 | */ |
---|
3415 | | - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
---|
| 3855 | + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) |
---|
3416 | 3856 | return false; |
---|
3417 | 3857 | |
---|
3418 | 3858 | /* If this is an order-0 request then the watermark is fine */ |
---|
.. | .. |
---|
3436 | 3876 | if (mt == MIGRATE_CMA) |
---|
3437 | 3877 | continue; |
---|
3438 | 3878 | #endif |
---|
3439 | | - if (!list_empty(&area->free_list[mt])) |
---|
| 3879 | + if (!free_area_empty(area, mt)) |
---|
3440 | 3880 | return true; |
---|
3441 | 3881 | } |
---|
3442 | 3882 | |
---|
3443 | 3883 | #ifdef CONFIG_CMA |
---|
3444 | 3884 | if ((alloc_flags & ALLOC_CMA) && |
---|
3445 | | - !list_empty(&area->free_list[MIGRATE_CMA])) { |
---|
| 3885 | + !free_area_empty(area, MIGRATE_CMA)) { |
---|
3446 | 3886 | return true; |
---|
3447 | 3887 | } |
---|
3448 | 3888 | #endif |
---|
3449 | | - if (alloc_harder && |
---|
3450 | | - !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) |
---|
| 3889 | + if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) |
---|
3451 | 3890 | return true; |
---|
3452 | 3891 | } |
---|
3453 | 3892 | return false; |
---|
3454 | 3893 | } |
---|
3455 | 3894 | |
---|
3456 | 3895 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
---|
3457 | | - int classzone_idx, unsigned int alloc_flags) |
---|
| 3896 | + int highest_zoneidx, unsigned int alloc_flags) |
---|
3458 | 3897 | { |
---|
3459 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
---|
| 3898 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
---|
3460 | 3899 | zone_page_state(z, NR_FREE_PAGES)); |
---|
3461 | 3900 | } |
---|
| 3901 | +EXPORT_SYMBOL_GPL(zone_watermark_ok); |
---|
3462 | 3902 | |
---|
3463 | 3903 | static inline bool zone_watermark_fast(struct zone *z, unsigned int order, |
---|
3464 | | - unsigned long mark, int classzone_idx, unsigned int alloc_flags) |
---|
| 3904 | + unsigned long mark, int highest_zoneidx, |
---|
| 3905 | + unsigned int alloc_flags, gfp_t gfp_mask) |
---|
3465 | 3906 | { |
---|
3466 | | - long free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3467 | | - long cma_pages = 0; |
---|
| 3907 | + long free_pages; |
---|
3468 | 3908 | |
---|
3469 | | -#ifdef CONFIG_CMA |
---|
3470 | | - /* If allocation can't use CMA areas don't use free CMA pages */ |
---|
3471 | | - if (!(alloc_flags & ALLOC_CMA)) |
---|
3472 | | - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); |
---|
3473 | | -#endif |
---|
| 3909 | + free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3474 | 3910 | |
---|
3475 | 3911 | /* |
---|
3476 | 3912 | * Fast check for order-0 only. If this fails then the reserves |
---|
3477 | | - * need to be calculated. There is a corner case where the check |
---|
3478 | | - * passes but only the high-order atomic reserve are free. If |
---|
3479 | | - * the caller is !atomic then it'll uselessly search the free |
---|
3480 | | - * list. That corner case is then slower but it is harmless. |
---|
| 3913 | + * need to be calculated. |
---|
3481 | 3914 | */ |
---|
3482 | | - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) |
---|
3483 | | - return true; |
---|
| 3915 | + if (!order) { |
---|
| 3916 | + long usable_free; |
---|
| 3917 | + long reserved; |
---|
3484 | 3918 | |
---|
3485 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
---|
3486 | | - free_pages); |
---|
| 3919 | + usable_free = free_pages; |
---|
| 3920 | + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); |
---|
| 3921 | + |
---|
| 3922 | + /* reserved may overestimate high-atomic reserves. */ |
---|
| 3923 | + usable_free -= min(usable_free, reserved); |
---|
| 3924 | + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) |
---|
| 3925 | + return true; |
---|
| 3926 | + } |
---|
| 3927 | + |
---|
| 3928 | + if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
---|
| 3929 | + free_pages)) |
---|
| 3930 | + return true; |
---|
| 3931 | + /* |
---|
| 3932 | + * Ignore watermark boosting for GFP_ATOMIC order-0 allocations |
---|
| 3933 | + * when checking the min watermark. The min watermark is the |
---|
| 3934 | + * point where boosting is ignored so that kswapd is woken up |
---|
| 3935 | + * when below the low watermark. |
---|
| 3936 | + */ |
---|
| 3937 | + if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost |
---|
| 3938 | + && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { |
---|
| 3939 | + mark = z->_watermark[WMARK_MIN]; |
---|
| 3940 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, |
---|
| 3941 | + alloc_flags, free_pages); |
---|
| 3942 | + } |
---|
| 3943 | + |
---|
| 3944 | + return false; |
---|
3487 | 3945 | } |
---|
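The reworked `zone_watermark_fast()` above folds the CMA and high-atomic adjustments into a single `__zone_watermark_unusable_free()` estimate and only then compares the remainder against `mark + lowmem_reserve`. A compact userspace model of the order-0 arithmetic, with made-up numbers and a simplified `struct zone` (field names mirror the kernel's, values are illustrative only):

```c
#include <stdbool.h>
#include <stdio.h>

/* Simplified zone accounting, illustrative values only. */
struct zone {
	long nr_free_pages;		/* NR_FREE_PAGES              */
	long nr_reserved_highatomic;	/* high-order atomic reserve  */
	long nr_free_cma;		/* NR_FREE_CMA_PAGES          */
	long lowmem_reserve;		/* reserve for this classzone */
};

#define ALLOC_HARDER	0x1
#define ALLOC_CMA	0x2

/* Free pages the current request is not allowed to consume. */
static long unusable_free(const struct zone *z, unsigned int order,
			  unsigned int alloc_flags)
{
	long unusable = (1L << order) - 1;	/* worst-case fragmentation */

	if (!(alloc_flags & ALLOC_HARDER))
		unusable += z->nr_reserved_highatomic;
	if (!(alloc_flags & ALLOC_CMA))
		unusable += z->nr_free_cma;

	return unusable;
}

/* Order-0 fast check: does the usable free memory clear the watermark? */
static bool watermark_fast_order0(const struct zone *z, long mark,
				  unsigned int alloc_flags)
{
	long usable = z->nr_free_pages;
	long reserved = unusable_free(z, 0, alloc_flags);

	usable -= (reserved < usable) ? reserved : usable;
	return usable > mark + z->lowmem_reserve;
}

int main(void)
{
	struct zone z = {
		.nr_free_pages = 10000,
		.nr_reserved_highatomic = 512,
		.nr_free_cma = 4096,
		.lowmem_reserve = 1000,
	};

	printf("movable+CMA request passes: %d\n",
	       watermark_fast_order0(&z, 5000, ALLOC_CMA));
	printf("non-CMA request passes:     %d\n",
	       watermark_fast_order0(&z, 5000, 0));
	return 0;
}
```

With these sample numbers the CMA-capable request clears the mark while the non-CMA one does not, which is exactly the asymmetry the ALLOC_CMA handling is meant to capture.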
3488 | 3946 | |
---|
3489 | 3947 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
---|
3490 | | - unsigned long mark, int classzone_idx) |
---|
| 3948 | + unsigned long mark, int highest_zoneidx) |
---|
3491 | 3949 | { |
---|
3492 | 3950 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
---|
3493 | 3951 | |
---|
3494 | 3952 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
---|
3495 | 3953 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
---|
3496 | 3954 | |
---|
3497 | | - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, |
---|
| 3955 | + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, |
---|
3498 | 3956 | free_pages); |
---|
3499 | 3957 | } |
---|
3500 | 3958 | EXPORT_SYMBOL_GPL(zone_watermark_ok_safe); |
---|
.. | .. |
---|
3503 | 3961 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
---|
3504 | 3962 | { |
---|
3505 | 3963 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= |
---|
3506 | | - RECLAIM_DISTANCE; |
---|
| 3964 | + node_reclaim_distance; |
---|
3507 | 3965 | } |
---|
3508 | 3966 | #else /* CONFIG_NUMA */ |
---|
3509 | 3967 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
---|
.. | .. |
---|
3513 | 3971 | #endif /* CONFIG_NUMA */ |
---|
3514 | 3972 | |
---|
3515 | 3973 | /* |
---|
| 3974 | + * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid |
---|
| 3975 | + * fragmentation is subtle. If the preferred zone was HIGHMEM then |
---|
| 3976 | + * premature use of a lower zone may cause lowmem pressure problems that |
---|
| 3977 | + * are worse than fragmentation. If the next zone is ZONE_DMA then it is |
---|
| 3978 | + * probably too small. It only makes sense to spread allocations to avoid |
---|
| 3979 | + * fragmentation between the Normal and DMA32 zones. |
---|
| 3980 | + */ |
---|
| 3981 | +static inline unsigned int |
---|
| 3982 | +alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) |
---|
| 3983 | +{ |
---|
| 3984 | + unsigned int alloc_flags; |
---|
| 3985 | + |
---|
| 3986 | + /* |
---|
| 3987 | + * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
---|
| 3988 | + * to save a branch. |
---|
| 3989 | + */ |
---|
| 3990 | + alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); |
---|
| 3991 | + |
---|
| 3992 | +#ifdef CONFIG_ZONE_DMA32 |
---|
| 3993 | + if (!zone) |
---|
| 3994 | + return alloc_flags; |
---|
| 3995 | + |
---|
| 3996 | + if (zone_idx(zone) != ZONE_NORMAL) |
---|
| 3997 | + return alloc_flags; |
---|
| 3998 | + |
---|
| 3999 | + /* |
---|
| 4000 | + * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and |
---|
| 4001 | + * the pointer is within zone->zone_pgdat->node_zones[]. Also assume |
---|
| 4002 | + * on UMA that if Normal is populated then so is DMA32. |
---|
| 4003 | + */ |
---|
| 4004 | + BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); |
---|
| 4005 | + if (nr_online_nodes > 1 && !populated_zone(--zone)) |
---|
| 4006 | + return alloc_flags; |
---|
| 4007 | + |
---|
| 4008 | + alloc_flags |= ALLOC_NOFRAGMENT; |
---|
| 4009 | +#endif /* CONFIG_ZONE_DMA32 */ |
---|
| 4010 | + return alloc_flags; |
---|
| 4011 | +} |
---|
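`alloc_flags_nofragment()` only grants ALLOC_NOFRAGMENT when the preferred zone is ZONE_NORMAL and, on NUMA, the neighbouring DMA32 zone is actually populated; otherwise spreading would either create lowmem pressure or gain nothing. A small userspace sketch of that decision with a toy zone layout (the enum, the flag values and the `populated` field are stand-ins, not the kernel's definitions):

```c
#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

struct zone {
	enum zone_type idx;
	bool populated;
};

#define ALLOC_KSWAPD		0x1
#define ALLOC_NOFRAGMENT	0x2

/*
 * Decide whether the first allocation pass should refuse fragmenting
 * fallbacks.  Spreading only makes sense between Normal and DMA32: a
 * HIGHMEM/MOVABLE preference must not leak into lower zones early, and
 * ZONE_DMA is usually too small to matter.
 */
static unsigned int alloc_flags_nofragment(const struct zone *zones,
					   const struct zone *preferred,
					   int nr_online_nodes)
{
	unsigned int flags = ALLOC_KSWAPD;	/* kswapd may be woken */

	if (!preferred || preferred->idx != ZONE_NORMAL)
		return flags;

	/* On NUMA, only bother if the node actually has a DMA32 zone. */
	if (nr_online_nodes > 1 && !zones[ZONE_DMA32].populated)
		return flags;

	return flags | ALLOC_NOFRAGMENT;
}

int main(void)
{
	struct zone zones[MAX_NR_ZONES] = {
		[ZONE_DMA32]   = { ZONE_DMA32,   true },
		[ZONE_NORMAL]  = { ZONE_NORMAL,  true },
		[ZONE_MOVABLE] = { ZONE_MOVABLE, false },
	};

	printf("Normal preferred:  %#x\n",
	       alloc_flags_nofragment(zones, &zones[ZONE_NORMAL], 2));
	printf("Movable preferred: %#x\n",
	       alloc_flags_nofragment(zones, &zones[ZONE_MOVABLE], 2));
	return 0;
}
```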
| 4012 | + |
---|
| 4013 | +static inline unsigned int current_alloc_flags(gfp_t gfp_mask, |
---|
| 4014 | + unsigned int alloc_flags) |
---|
| 4015 | +{ |
---|
| 4016 | +#ifdef CONFIG_CMA |
---|
| 4017 | + unsigned int pflags = current->flags; |
---|
| 4018 | + |
---|
| 4019 | + if (!(pflags & PF_MEMALLOC_NOCMA) && |
---|
| 4020 | + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE && |
---|
| 4021 | + gfp_mask & __GFP_CMA) |
---|
| 4022 | + alloc_flags |= ALLOC_CMA; |
---|
| 4023 | + |
---|
| 4024 | +#endif |
---|
| 4025 | + return alloc_flags; |
---|
| 4026 | +} |
---|
| 4027 | + |
---|
| 4028 | +/* |
---|
3516 | 4029 | * get_page_from_freelist goes through the zonelist trying to allocate |
---|
3517 | 4030 | * a page. |
---|
3518 | 4031 | */ |
---|
.. | .. |
---|
3520 | 4033 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
---|
3521 | 4034 | const struct alloc_context *ac) |
---|
3522 | 4035 | { |
---|
3523 | | - struct zoneref *z = ac->preferred_zoneref; |
---|
| 4036 | + struct zoneref *z; |
---|
3524 | 4037 | struct zone *zone; |
---|
3525 | 4038 | struct pglist_data *last_pgdat_dirty_limit = NULL; |
---|
| 4039 | + bool no_fallback; |
---|
3526 | 4040 | |
---|
| 4041 | +retry: |
---|
3527 | 4042 | /* |
---|
3528 | 4043 | * Scan zonelist, looking for a zone with enough free. |
---|
3529 | 4044 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
---|
3530 | 4045 | */ |
---|
3531 | | - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
3532 | | - ac->nodemask) { |
---|
| 4046 | + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; |
---|
| 4047 | + z = ac->preferred_zoneref; |
---|
| 4048 | + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, |
---|
| 4049 | + ac->nodemask) { |
---|
3533 | 4050 | struct page *page; |
---|
3534 | 4051 | unsigned long mark; |
---|
3535 | 4052 | |
---|
.. | .. |
---|
3566 | 4083 | } |
---|
3567 | 4084 | } |
---|
3568 | 4085 | |
---|
3569 | | - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
---|
| 4086 | + if (no_fallback && nr_online_nodes > 1 && |
---|
| 4087 | + zone != ac->preferred_zoneref->zone) { |
---|
| 4088 | + int local_nid; |
---|
| 4089 | + |
---|
| 4090 | + /* |
---|
| 4091 | + * If moving to a remote node, retry but allow |
---|
| 4092 | + * fragmenting fallbacks. Locality is more important |
---|
| 4093 | + * than fragmentation avoidance. |
---|
| 4094 | + */ |
---|
| 4095 | + local_nid = zone_to_nid(ac->preferred_zoneref->zone); |
---|
| 4096 | + if (zone_to_nid(zone) != local_nid) { |
---|
| 4097 | + alloc_flags &= ~ALLOC_NOFRAGMENT; |
---|
| 4098 | + goto retry; |
---|
| 4099 | + } |
---|
| 4100 | + } |
---|
| 4101 | + |
---|
| 4102 | + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); |
---|
3570 | 4103 | if (!zone_watermark_fast(zone, order, mark, |
---|
3571 | | - ac_classzone_idx(ac), alloc_flags)) { |
---|
| 4104 | + ac->highest_zoneidx, alloc_flags, |
---|
| 4105 | + gfp_mask)) { |
---|
3572 | 4106 | int ret; |
---|
3573 | 4107 | |
---|
3574 | 4108 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
.. | .. |
---|
3601 | 4135 | default: |
---|
3602 | 4136 | /* did we reclaim enough */ |
---|
3603 | 4137 | if (zone_watermark_ok(zone, order, mark, |
---|
3604 | | - ac_classzone_idx(ac), alloc_flags)) |
---|
| 4138 | + ac->highest_zoneidx, alloc_flags)) |
---|
3605 | 4139 | goto try_this_zone; |
---|
3606 | 4140 | |
---|
3607 | 4141 | continue; |
---|
.. | .. |
---|
3633 | 4167 | } |
---|
3634 | 4168 | } |
---|
3635 | 4169 | |
---|
| 4170 | + /* |
---|
| 4171 | + * It's possible on a UMA machine to get through all zones that are |
---|
| 4172 | + * fragmented. If avoiding fragmentation, reset and try again. |
---|
| 4173 | + */ |
---|
| 4174 | + if (no_fallback) { |
---|
| 4175 | + alloc_flags &= ~ALLOC_NOFRAGMENT; |
---|
| 4176 | + goto retry; |
---|
| 4177 | + } |
---|
| 4178 | + |
---|
3636 | 4179 | return NULL; |
---|
3637 | | -} |
---|
3638 | | - |
---|
3639 | | -/* |
---|
3640 | | - * Large machines with many possible nodes should not always dump per-node |
---|
3641 | | - * meminfo in irq context. |
---|
3642 | | - */ |
---|
3643 | | -static inline bool should_suppress_show_mem(void) |
---|
3644 | | -{ |
---|
3645 | | - bool ret = false; |
---|
3646 | | - |
---|
3647 | | -#if NODES_SHIFT > 8 |
---|
3648 | | - ret = in_interrupt(); |
---|
3649 | | -#endif |
---|
3650 | | - return ret; |
---|
3651 | 4180 | } |
---|
3652 | 4181 | |
---|
3653 | 4182 | static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) |
---|
3654 | 4183 | { |
---|
3655 | 4184 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
---|
3656 | | - static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); |
---|
3657 | | - |
---|
3658 | | - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) |
---|
3659 | | - return; |
---|
3660 | 4185 | |
---|
3661 | 4186 | /* |
---|
3662 | 4187 | * This documents exceptions given to allocations in certain |
---|
.. | .. |
---|
3677 | 4202 | { |
---|
3678 | 4203 | struct va_format vaf; |
---|
3679 | 4204 | va_list args; |
---|
3680 | | - static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, |
---|
3681 | | - DEFAULT_RATELIMIT_BURST); |
---|
| 4205 | + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); |
---|
3682 | 4206 | |
---|
3683 | | - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
---|
| 4207 | + if ((gfp_mask & __GFP_NOWARN) || |
---|
| 4208 | + !__ratelimit(&nopage_rs) || |
---|
| 4209 | + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) |
---|
3684 | 4210 | return; |
---|
3685 | 4211 | |
---|
3686 | 4212 | va_start(args, fmt); |
---|
3687 | 4213 | vaf.fmt = fmt; |
---|
3688 | 4214 | vaf.va = &args; |
---|
3689 | | - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", |
---|
| 4215 | + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", |
---|
3690 | 4216 | current->comm, &vaf, gfp_mask, &gfp_mask, |
---|
3691 | 4217 | nodemask_pr_args(nodemask)); |
---|
3692 | 4218 | va_end(args); |
---|
3693 | 4219 | |
---|
3694 | 4220 | cpuset_print_current_mems_allowed(); |
---|
3695 | | - |
---|
| 4221 | + pr_cont("\n"); |
---|
3696 | 4222 | dump_stack(); |
---|
3697 | 4223 | warn_alloc_show_mem(gfp_mask, nodemask); |
---|
3698 | 4224 | } |
---|
.. | .. |
---|
3766 | 4292 | * success so it is time to admit defeat. We will skip the OOM killer |
---|
3767 | 4293 | * because it is very likely that the caller has a more reasonable |
---|
3768 | 4294 | * fallback than shooting a random task. |
---|
| 4295 | + * |
---|
| 4296 | + * The OOM killer may not free memory on a specific node. |
---|
3769 | 4297 | */ |
---|
3770 | | - if (gfp_mask & __GFP_RETRY_MAYFAIL) |
---|
| 4298 | + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) |
---|
3771 | 4299 | goto out; |
---|
3772 | 4300 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
---|
3773 | | - if (ac->high_zoneidx < ZONE_NORMAL) |
---|
| 4301 | + if (ac->highest_zoneidx < ZONE_NORMAL) |
---|
3774 | 4302 | goto out; |
---|
3775 | 4303 | if (pm_suspended_storage()) |
---|
3776 | 4304 | goto out; |
---|
.. | .. |
---|
3783 | 4311 | * out_of_memory). Once filesystems are ready to handle allocation |
---|
3784 | 4312 | * failures more gracefully we should just bail out here. |
---|
3785 | 4313 | */ |
---|
3786 | | - |
---|
3787 | | - /* The OOM killer may not free memory on a specific node */ |
---|
3788 | | - if (gfp_mask & __GFP_THISNODE) |
---|
3789 | | - goto out; |
---|
3790 | 4314 | |
---|
3791 | 4315 | /* Exhausted what can be done so it's blame time */ |
---|
3792 | 4316 | if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { |
---|
.. | .. |
---|
3818 | 4342 | unsigned int alloc_flags, const struct alloc_context *ac, |
---|
3819 | 4343 | enum compact_priority prio, enum compact_result *compact_result) |
---|
3820 | 4344 | { |
---|
3821 | | - struct page *page; |
---|
| 4345 | + struct page *page = NULL; |
---|
3822 | 4346 | unsigned long pflags; |
---|
3823 | 4347 | unsigned int noreclaim_flag; |
---|
3824 | 4348 | |
---|
.. | .. |
---|
3829 | 4353 | noreclaim_flag = memalloc_noreclaim_save(); |
---|
3830 | 4354 | |
---|
3831 | 4355 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
---|
3832 | | - prio); |
---|
| 4356 | + prio, &page); |
---|
3833 | 4357 | |
---|
3834 | 4358 | memalloc_noreclaim_restore(noreclaim_flag); |
---|
3835 | 4359 | psi_memstall_leave(&pflags); |
---|
3836 | | - |
---|
3837 | | - if (*compact_result <= COMPACT_INACTIVE) |
---|
3838 | | - return NULL; |
---|
3839 | 4360 | |
---|
3840 | 4361 | /* |
---|
3841 | 4362 | * At least in one zone compaction wasn't deferred or skipped, so let's |
---|
.. | .. |
---|
3843 | 4364 | */ |
---|
3844 | 4365 | count_vm_event(COMPACTSTALL); |
---|
3845 | 4366 | |
---|
3846 | | - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
| 4367 | + /* Prep a captured page if available */ |
---|
| 4368 | + if (page) |
---|
| 4369 | + prep_new_page(page, order, gfp_mask, alloc_flags); |
---|
| 4370 | + |
---|
| 4371 | + /* Try get a page from the freelist if available */ |
---|
| 4372 | + if (!page) |
---|
| 4373 | + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
3847 | 4374 | |
---|
3848 | 4375 | if (page) { |
---|
3849 | 4376 | struct zone *zone = page_zone(page); |
---|
.. | .. |
---|
3892 | 4419 | goto check_priority; |
---|
3893 | 4420 | |
---|
3894 | 4421 | /* |
---|
3895 | | - * make sure the compaction wasn't deferred or didn't bail out early |
---|
3896 | | - * due to locks contention before we declare that we should give up. |
---|
3897 | | - * But do not retry if the given zonelist is not suitable for |
---|
3898 | | - * compaction. |
---|
| 4422 | + * compaction was skipped because there are not enough order-0 pages |
---|
| 4423 | + * to work with, so we retry only if it looks like reclaim can help. |
---|
3899 | 4424 | */ |
---|
3900 | | - if (compaction_withdrawn(compact_result)) { |
---|
| 4425 | + if (compaction_needs_reclaim(compact_result)) { |
---|
3901 | 4426 | ret = compaction_zonelist_suitable(ac, order, alloc_flags); |
---|
3902 | 4427 | goto out; |
---|
| 4428 | + } |
---|
| 4429 | + |
---|
| 4430 | + /* |
---|
| 4431 | + * make sure the compaction wasn't deferred or didn't bail out early |
---|
| 4432 | + * due to lock contention before we declare that we should give up. |
---|
| 4433 | + * But the next retry should use a higher priority if allowed, so |
---|
| 4434 | + * we don't just keep bailing out endlessly. |
---|
| 4435 | + */ |
---|
| 4436 | + if (compaction_withdrawn(compact_result)) { |
---|
| 4437 | + goto check_priority; |
---|
3903 | 4438 | } |
---|
3904 | 4439 | |
---|
3905 | 4440 | /* |
---|
.. | .. |
---|
3962 | 4497 | * Let's give them a good hope and keep retrying while the order-0 |
---|
3963 | 4498 | * watermarks are OK. |
---|
3964 | 4499 | */ |
---|
3965 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
3966 | | - ac->nodemask) { |
---|
| 4500 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
---|
| 4501 | + ac->highest_zoneidx, ac->nodemask) { |
---|
3967 | 4502 | if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), |
---|
3968 | | - ac_classzone_idx(ac), alloc_flags)) |
---|
| 4503 | + ac->highest_zoneidx, alloc_flags)) |
---|
3969 | 4504 | return true; |
---|
3970 | 4505 | } |
---|
3971 | 4506 | return false; |
---|
.. | .. |
---|
4023 | 4558 | EXPORT_SYMBOL_GPL(fs_reclaim_release); |
---|
4024 | 4559 | #endif |
---|
4025 | 4560 | |
---|
| 4561 | +/* |
---|
| 4562 | + * Zonelists may change due to hotplug during allocation. Detect when zonelists |
---|
| 4563 | + * have been rebuilt so allocation retries. Reader side does not lock and |
---|
| 4564 | + * retries the allocation if zonelist changes. Writer side is protected by the |
---|
| 4565 | + * embedded spin_lock. |
---|
| 4566 | + */ |
---|
| 4567 | +static DEFINE_SEQLOCK(zonelist_update_seq); |
---|
| 4568 | + |
---|
| 4569 | +static unsigned int zonelist_iter_begin(void) |
---|
| 4570 | +{ |
---|
| 4571 | + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
---|
| 4572 | + return read_seqbegin(&zonelist_update_seq); |
---|
| 4573 | + |
---|
| 4574 | + return 0; |
---|
| 4575 | +} |
---|
| 4576 | + |
---|
| 4577 | +static unsigned int check_retry_zonelist(unsigned int seq) |
---|
| 4578 | +{ |
---|
| 4579 | + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
---|
| 4580 | + return read_seqretry(&zonelist_update_seq, seq); |
---|
| 4581 | + |
---|
| 4582 | + return seq; |
---|
| 4583 | +} |
---|
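The new `zonelist_update_seq` gives the allocator a lock-free way to notice a concurrent zonelist rebuild: sample the sequence before iterating, and restart the slow path if it changed by the time the allocation failed. Below is a minimal userspace seqcount sketch of that begin/retry protocol, assuming a single writer and simplified memory ordering (the kernel's `seqlock_t` additionally embeds the writer-side spinlock):

```c
#include <stdatomic.h>
#include <stdio.h>

/* A bare sequence counter: an odd value means a writer is in progress. */
static atomic_uint zonelist_update_seq;

static unsigned int zonelist_iter_begin(void)
{
	unsigned int seq;

	/* Wait out any active writer, then remember the even value. */
	do {
		seq = atomic_load_explicit(&zonelist_update_seq,
					   memory_order_acquire);
	} while (seq & 1);
	return seq;
}

static int check_retry_zonelist(unsigned int seq)
{
	/* Retry if any write started or completed since we sampled. */
	return atomic_load_explicit(&zonelist_update_seq,
				    memory_order_acquire) != seq;
}

static void rebuild_zonelists(void)
{
	/* Writer side: bump to odd, rebuild, bump back to even. */
	atomic_fetch_add_explicit(&zonelist_update_seq, 1,
				  memory_order_release);
	/* ... rebuild the zonelists here ... */
	atomic_fetch_add_explicit(&zonelist_update_seq, 1,
				  memory_order_release);
}

int main(void)
{
	unsigned int cookie = zonelist_iter_begin();

	/* ... walk the zonelists, fail to allocate ... */
	rebuild_zonelists();	/* pretend memory hotplug raced with us */

	if (check_retry_zonelist(cookie))
		printf("zonelists changed, restarting the slow path\n");
	return 0;
}
```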
| 4584 | + |
---|
4026 | 4585 | /* Perform direct synchronous page reclaim */ |
---|
4027 | | -static int |
---|
| 4586 | +static unsigned long |
---|
4028 | 4587 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
---|
4029 | 4588 | const struct alloc_context *ac) |
---|
4030 | 4589 | { |
---|
4031 | | - struct reclaim_state reclaim_state; |
---|
4032 | | - int progress; |
---|
4033 | 4590 | unsigned int noreclaim_flag; |
---|
4034 | | - unsigned long pflags; |
---|
| 4591 | + unsigned long progress; |
---|
4035 | 4592 | |
---|
4036 | 4593 | cond_resched(); |
---|
4037 | 4594 | |
---|
4038 | 4595 | /* We now go into synchronous reclaim */ |
---|
4039 | 4596 | cpuset_memory_pressure_bump(); |
---|
4040 | | - psi_memstall_enter(&pflags); |
---|
4041 | 4597 | fs_reclaim_acquire(gfp_mask); |
---|
4042 | 4598 | noreclaim_flag = memalloc_noreclaim_save(); |
---|
4043 | | - reclaim_state.reclaimed_slab = 0; |
---|
4044 | | - current->reclaim_state = &reclaim_state; |
---|
4045 | 4599 | |
---|
4046 | 4600 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
---|
4047 | 4601 | ac->nodemask); |
---|
4048 | 4602 | |
---|
4049 | | - current->reclaim_state = NULL; |
---|
4050 | 4603 | memalloc_noreclaim_restore(noreclaim_flag); |
---|
4051 | 4604 | fs_reclaim_release(gfp_mask); |
---|
4052 | | - psi_memstall_leave(&pflags); |
---|
4053 | 4605 | |
---|
4054 | 4606 | cond_resched(); |
---|
4055 | 4607 | |
---|
.. | .. |
---|
4063 | 4615 | unsigned long *did_some_progress) |
---|
4064 | 4616 | { |
---|
4065 | 4617 | struct page *page = NULL; |
---|
| 4618 | + unsigned long pflags; |
---|
4066 | 4619 | bool drained = false; |
---|
| 4620 | + bool skip_pcp_drain = false; |
---|
4067 | 4621 | |
---|
| 4622 | + psi_memstall_enter(&pflags); |
---|
4068 | 4623 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
---|
4069 | 4624 | if (unlikely(!(*did_some_progress))) |
---|
4070 | | - return NULL; |
---|
| 4625 | + goto out; |
---|
4071 | 4626 | |
---|
4072 | 4627 | retry: |
---|
4073 | 4628 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
---|
.. | .. |
---|
4075 | 4630 | /* |
---|
4076 | 4631 | * If an allocation failed after direct reclaim, it could be because |
---|
4077 | 4632 | * pages are pinned on the per-cpu lists or in high alloc reserves. |
---|
4078 | | - * Shrink them them and try again |
---|
| 4633 | + * Shrink them and try again |
---|
4079 | 4634 | */ |
---|
4080 | 4635 | if (!page && !drained) { |
---|
4081 | 4636 | unreserve_highatomic_pageblock(ac, false); |
---|
4082 | | - drain_all_pages(NULL); |
---|
| 4637 | + trace_android_vh_drain_all_pages_bypass(gfp_mask, order, |
---|
| 4638 | + alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain); |
---|
| 4639 | + if (!skip_pcp_drain) |
---|
| 4640 | + drain_all_pages(NULL); |
---|
4083 | 4641 | drained = true; |
---|
4084 | 4642 | goto retry; |
---|
4085 | 4643 | } |
---|
| 4644 | +out: |
---|
| 4645 | + psi_memstall_leave(&pflags); |
---|
4086 | 4646 | |
---|
4087 | 4647 | return page; |
---|
4088 | 4648 | } |
---|
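`__alloc_pages_direct_reclaim()` retries the freelist exactly once after draining, because pages freed by reclaim may be sitting on per-CPU lists or in the high-atomic reserve rather than on the buddy freelists. The retry-once control flow, reduced to a standalone sketch with made-up helpers (`try_alloc()` and `drain_caches()` are not kernel functions):

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-in state: the allocation succeeds only once caches are drained. */
static bool caches_drained;

static bool try_alloc(void)
{
	return caches_drained;
}

static void drain_caches(void)
{
	caches_drained = true;
}

/*
 * Allocate after reclaim: if the first attempt fails, flush the per-CPU
 * caches once and retry; never loop more than that here.
 */
static bool alloc_after_reclaim(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = try_alloc();
	if (!ok && !drained) {
		drain_caches();
		drained = true;
		goto retry;
	}
	return ok;
}

int main(void)
{
	printf("allocation %s\n",
	       alloc_after_reclaim() ? "succeeded" : "failed");
	return 0;
}
```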
.. | .. |
---|
4093 | 4653 | struct zoneref *z; |
---|
4094 | 4654 | struct zone *zone; |
---|
4095 | 4655 | pg_data_t *last_pgdat = NULL; |
---|
4096 | | - enum zone_type high_zoneidx = ac->high_zoneidx; |
---|
| 4656 | + enum zone_type highest_zoneidx = ac->highest_zoneidx; |
---|
4097 | 4657 | |
---|
4098 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, |
---|
| 4658 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, |
---|
4099 | 4659 | ac->nodemask) { |
---|
4100 | 4660 | if (last_pgdat != zone->zone_pgdat) |
---|
4101 | | - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); |
---|
| 4661 | + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); |
---|
4102 | 4662 | last_pgdat = zone->zone_pgdat; |
---|
4103 | 4663 | } |
---|
4104 | 4664 | } |
---|
.. | .. |
---|
4108 | 4668 | { |
---|
4109 | 4669 | unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
---|
4110 | 4670 | |
---|
4111 | | - /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
---|
| 4671 | + /* |
---|
| 4672 | + * __GFP_HIGH is assumed to be the same as ALLOC_HIGH |
---|
| 4673 | + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
---|
| 4674 | + * to save two branches. |
---|
| 4675 | + */ |
---|
4112 | 4676 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
---|
| 4677 | + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); |
---|
4113 | 4678 | |
---|
4114 | 4679 | /* |
---|
4115 | 4680 | * The caller may dip into page reserves a bit more if the caller |
---|
.. | .. |
---|
4117 | 4682 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
---|
4118 | 4683 | * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). |
---|
4119 | 4684 | */ |
---|
4120 | | - alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
---|
| 4685 | + alloc_flags |= (__force int) |
---|
| 4686 | + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); |
---|
4121 | 4687 | |
---|
4122 | 4688 | if (gfp_mask & __GFP_ATOMIC) { |
---|
4123 | 4689 | /* |
---|
.. | .. |
---|
4134 | 4700 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
---|
4135 | 4701 | alloc_flags |= ALLOC_HARDER; |
---|
4136 | 4702 | |
---|
4137 | | -#ifdef CONFIG_CMA |
---|
4138 | | - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
---|
4139 | | - alloc_flags |= ALLOC_CMA; |
---|
4140 | | -#endif |
---|
| 4703 | + alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); |
---|
| 4704 | + |
---|
4141 | 4705 | return alloc_flags; |
---|
4142 | 4706 | } |
---|
4143 | 4707 | |
---|
.. | .. |
---|
4200 | 4764 | { |
---|
4201 | 4765 | struct zone *zone; |
---|
4202 | 4766 | struct zoneref *z; |
---|
| 4767 | + bool ret = false; |
---|
4203 | 4768 | |
---|
4204 | 4769 | /* |
---|
4205 | 4770 | * Costly allocations might have made a progress but this doesn't mean |
---|
.. | .. |
---|
4226 | 4791 | * request even if all reclaimable pages are considered then we are |
---|
4227 | 4792 | * screwed and have to go OOM. |
---|
4228 | 4793 | */ |
---|
4229 | | - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
---|
4230 | | - ac->nodemask) { |
---|
| 4794 | + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
---|
| 4795 | + ac->highest_zoneidx, ac->nodemask) { |
---|
4231 | 4796 | unsigned long available; |
---|
4232 | 4797 | unsigned long reclaimable; |
---|
4233 | 4798 | unsigned long min_wmark = min_wmark_pages(zone); |
---|
.. | .. |
---|
4241 | 4806 | * reclaimable pages? |
---|
4242 | 4807 | */ |
---|
4243 | 4808 | wmark = __zone_watermark_ok(zone, order, min_wmark, |
---|
4244 | | - ac_classzone_idx(ac), alloc_flags, available); |
---|
| 4809 | + ac->highest_zoneidx, alloc_flags, available); |
---|
4245 | 4810 | trace_reclaim_retry_zone(z, order, reclaimable, |
---|
4246 | 4811 | available, min_wmark, *no_progress_loops, wmark); |
---|
4247 | 4812 | if (wmark) { |
---|
.. | .. |
---|
4263 | 4828 | } |
---|
4264 | 4829 | } |
---|
4265 | 4830 | |
---|
4266 | | - /* |
---|
4267 | | - * Memory allocation/reclaim might be called from a WQ |
---|
4268 | | - * context and the current implementation of the WQ |
---|
4269 | | - * concurrency control doesn't recognize that |
---|
4270 | | - * a particular WQ is congested if the worker thread is |
---|
4271 | | - * looping without ever sleeping. Therefore we have to |
---|
4272 | | - * do a short sleep here rather than calling |
---|
4273 | | - * cond_resched(). |
---|
4274 | | - */ |
---|
4275 | | - if (current->flags & PF_WQ_WORKER) |
---|
4276 | | - schedule_timeout_uninterruptible(1); |
---|
4277 | | - else |
---|
4278 | | - cond_resched(); |
---|
4279 | | - |
---|
4280 | | - return true; |
---|
| 4831 | + ret = true; |
---|
| 4832 | + goto out; |
---|
4281 | 4833 | } |
---|
4282 | 4834 | } |
---|
4283 | 4835 | |
---|
4284 | | - return false; |
---|
| 4836 | +out: |
---|
| 4837 | + /* |
---|
| 4838 | + * Memory allocation/reclaim might be called from a WQ context and the |
---|
| 4839 | + * current implementation of the WQ concurrency control doesn't |
---|
| 4840 | + * recognize that a particular WQ is congested if the worker thread is |
---|
| 4841 | + * looping without ever sleeping. Therefore we have to do a short sleep |
---|
| 4842 | + * here rather than calling cond_resched(). |
---|
| 4843 | + */ |
---|
| 4844 | + if (current->flags & PF_WQ_WORKER) |
---|
| 4845 | + schedule_timeout_uninterruptible(1); |
---|
| 4846 | + else |
---|
| 4847 | + cond_resched(); |
---|
| 4848 | + return ret; |
---|
4285 | 4849 | } |
---|
4286 | 4850 | |
---|
4287 | 4851 | static inline bool |
---|
.. | .. |
---|
4331 | 4895 | int compaction_retries; |
---|
4332 | 4896 | int no_progress_loops; |
---|
4333 | 4897 | unsigned int cpuset_mems_cookie; |
---|
| 4898 | + unsigned int zonelist_iter_cookie; |
---|
4334 | 4899 | int reserve_flags; |
---|
| 4900 | + unsigned long vh_record; |
---|
4335 | 4901 | |
---|
| 4902 | + trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record); |
---|
4336 | 4903 | /* |
---|
4337 | 4904 | * We also sanity check to catch abuse of atomic reserves being used by |
---|
4338 | 4905 | * callers that are not in atomic context. |
---|
.. | .. |
---|
4341 | 4908 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) |
---|
4342 | 4909 | gfp_mask &= ~__GFP_ATOMIC; |
---|
4343 | 4910 | |
---|
4344 | | -retry_cpuset: |
---|
| 4911 | +restart: |
---|
4345 | 4912 | compaction_retries = 0; |
---|
4346 | 4913 | no_progress_loops = 0; |
---|
4347 | 4914 | compact_priority = DEF_COMPACT_PRIORITY; |
---|
4348 | 4915 | cpuset_mems_cookie = read_mems_allowed_begin(); |
---|
| 4916 | + zonelist_iter_cookie = zonelist_iter_begin(); |
---|
4349 | 4917 | |
---|
4350 | 4918 | /* |
---|
4351 | 4919 | * The fast path uses conservative alloc_flags to succeed only until |
---|
.. | .. |
---|
4361 | 4929 | * could end up iterating over non-eligible zones endlessly. |
---|
4362 | 4930 | */ |
---|
4363 | 4931 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4364 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 4932 | + ac->highest_zoneidx, ac->nodemask); |
---|
4365 | 4933 | if (!ac->preferred_zoneref->zone) |
---|
4366 | 4934 | goto nopage; |
---|
4367 | 4935 | |
---|
4368 | | - if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
---|
| 4936 | + if (alloc_flags & ALLOC_KSWAPD) |
---|
4369 | 4937 | wake_all_kswapds(order, gfp_mask, ac); |
---|
4370 | 4938 | |
---|
4371 | 4939 | /* |
---|
.. | .. |
---|
4398 | 4966 | |
---|
4399 | 4967 | /* |
---|
4400 | 4968 | * Checks for costly allocations with __GFP_NORETRY, which |
---|
4401 | | - * includes THP page fault allocations |
---|
| 4969 | + * includes some THP page fault allocations |
---|
4402 | 4970 | */ |
---|
4403 | 4971 | if (costly_order && (gfp_mask & __GFP_NORETRY)) { |
---|
4404 | 4972 | /* |
---|
4405 | | - * If compaction is deferred for high-order allocations, |
---|
4406 | | - * it is because sync compaction recently failed. If |
---|
4407 | | - * this is the case and the caller requested a THP |
---|
4408 | | - * allocation, we do not want to heavily disrupt the |
---|
4409 | | - * system, so we fail the allocation instead of entering |
---|
4410 | | - * direct reclaim. |
---|
| 4973 | + * If allocating entire pageblock(s) and compaction |
---|
| 4974 | + * failed because all zones are below low watermarks |
---|
| 4975 | + * or is prohibited because it recently failed at this |
---|
| 4976 | + * order, fail immediately unless the allocator has |
---|
| 4977 | + * requested compaction and reclaim retry. |
---|
| 4978 | + * |
---|
| 4979 | + * Reclaim is |
---|
| 4980 | + * - potentially very expensive because zones are far |
---|
| 4981 | + * below their low watermarks or this is part of very |
---|
| 4982 | + * bursty high order allocations, |
---|
| 4983 | + * - not guaranteed to help because isolate_freepages() |
---|
| 4984 | + * may not iterate over freed pages as part of its |
---|
| 4985 | + * linear scan, and |
---|
| 4986 | + * - unlikely to make entire pageblocks free on its |
---|
| 4987 | + * own. |
---|
4411 | 4988 | */ |
---|
4412 | | - if (compact_result == COMPACT_DEFERRED) |
---|
| 4989 | + if (compact_result == COMPACT_SKIPPED || |
---|
| 4990 | + compact_result == COMPACT_DEFERRED) |
---|
4413 | 4991 | goto nopage; |
---|
4414 | 4992 | |
---|
4415 | 4993 | /* |
---|
.. | .. |
---|
4423 | 5001 | |
---|
4424 | 5002 | retry: |
---|
4425 | 5003 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ |
---|
4426 | | - if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
---|
| 5004 | + if (alloc_flags & ALLOC_KSWAPD) |
---|
4427 | 5005 | wake_all_kswapds(order, gfp_mask, ac); |
---|
4428 | 5006 | |
---|
4429 | 5007 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); |
---|
4430 | 5008 | if (reserve_flags) |
---|
4431 | | - alloc_flags = reserve_flags; |
---|
| 5009 | + alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); |
---|
4432 | 5010 | |
---|
4433 | 5011 | /* |
---|
4434 | 5012 | * Reset the nodemask and zonelist iterators if memory policies can be |
---|
.. | .. |
---|
4438 | 5016 | if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { |
---|
4439 | 5017 | ac->nodemask = NULL; |
---|
4440 | 5018 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4441 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 5019 | + ac->highest_zoneidx, ac->nodemask); |
---|
4442 | 5020 | } |
---|
4443 | 5021 | |
---|
4444 | 5022 | /* Attempt with potentially adjusted zonelist and alloc_flags */ |
---|
.. | .. |
---|
4453 | 5031 | /* Avoid recursion of direct reclaim */ |
---|
4454 | 5032 | if (current->flags & PF_MEMALLOC) |
---|
4455 | 5033 | goto nopage; |
---|
| 5034 | + |
---|
| 5035 | + trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order, |
---|
| 5036 | + alloc_flags, ac->migratetype, &page); |
---|
| 5037 | + |
---|
| 5038 | + if (page) |
---|
| 5039 | + goto got_pg; |
---|
4456 | 5040 | |
---|
4457 | 5041 | /* Try direct reclaim and then allocating */ |
---|
4458 | 5042 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
---|
.. | .. |
---|
4494 | 5078 | goto retry; |
---|
4495 | 5079 | |
---|
4496 | 5080 | |
---|
4497 | | - /* Deal with possible cpuset update races before we start OOM killing */ |
---|
4498 | | - if (check_retry_cpuset(cpuset_mems_cookie, ac)) |
---|
4499 | | - goto retry_cpuset; |
---|
| 5081 | + /* |
---|
| 5082 | + * Deal with possible cpuset update races or zonelist updates to avoid |
---|
| 5083 | + * an unnecessary OOM kill. |
---|
| 5084 | + */ |
---|
| 5085 | + if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
---|
| 5086 | + check_retry_zonelist(zonelist_iter_cookie)) |
---|
| 5087 | + goto restart; |
---|
4500 | 5088 | |
---|
4501 | 5089 | /* Reclaim has failed us, start killing things */ |
---|
4502 | 5090 | page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); |
---|
.. | .. |
---|
4505 | 5093 | |
---|
4506 | 5094 | /* Avoid allocations with no watermarks from looping endlessly */ |
---|
4507 | 5095 | if (tsk_is_oom_victim(current) && |
---|
4508 | | - (alloc_flags == ALLOC_OOM || |
---|
| 5096 | + (alloc_flags & ALLOC_OOM || |
---|
4509 | 5097 | (gfp_mask & __GFP_NOMEMALLOC))) |
---|
4510 | 5098 | goto nopage; |
---|
4511 | 5099 | |
---|
.. | .. |
---|
4516 | 5104 | } |
---|
4517 | 5105 | |
---|
4518 | 5106 | nopage: |
---|
4519 | | - /* Deal with possible cpuset update races before we fail */ |
---|
4520 | | - if (check_retry_cpuset(cpuset_mems_cookie, ac)) |
---|
4521 | | - goto retry_cpuset; |
---|
| 5107 | + /* |
---|
| 5108 | + * Deal with possible cpuset update races or zonelist updates to avoid |
---|
| 5109 | + * an unnecessary OOM kill. |
---|
| 5110 | + */ |
---|
| 5111 | + if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
---|
| 5112 | + check_retry_zonelist(zonelist_iter_cookie)) |
---|
| 5113 | + goto restart; |
---|
4522 | 5114 | |
---|
4523 | 5115 | /* |
---|
4524 | 5116 | * Make sure that __GFP_NOFAIL request doesn't leak out and make sure |
---|
.. | .. |
---|
4561 | 5153 | goto retry; |
---|
4562 | 5154 | } |
---|
4563 | 5155 | fail: |
---|
| 5156 | + trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order, |
---|
| 5157 | + alloc_flags, ac->migratetype, &page); |
---|
| 5158 | + if (page) |
---|
| 5159 | + goto got_pg; |
---|
| 5160 | + |
---|
4564 | 5161 | warn_alloc(gfp_mask, ac->nodemask, |
---|
4565 | 5162 | "page allocation failure: order:%u", order); |
---|
4566 | 5163 | got_pg: |
---|
| 5164 | + trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record); |
---|
4567 | 5165 | return page; |
---|
4568 | 5166 | } |
---|
4569 | 5167 | |
---|
.. | .. |
---|
4572 | 5170 | struct alloc_context *ac, gfp_t *alloc_mask, |
---|
4573 | 5171 | unsigned int *alloc_flags) |
---|
4574 | 5172 | { |
---|
4575 | | - ac->high_zoneidx = gfp_zone(gfp_mask); |
---|
| 5173 | + ac->highest_zoneidx = gfp_zone(gfp_mask); |
---|
4576 | 5174 | ac->zonelist = node_zonelist(preferred_nid, gfp_mask); |
---|
4577 | 5175 | ac->nodemask = nodemask; |
---|
4578 | | - ac->migratetype = gfpflags_to_migratetype(gfp_mask); |
---|
| 5176 | + ac->migratetype = gfp_migratetype(gfp_mask); |
---|
4579 | 5177 | |
---|
4580 | 5178 | if (cpusets_enabled()) { |
---|
4581 | 5179 | *alloc_mask |= __GFP_HARDWALL; |
---|
4582 | | - if (!ac->nodemask) |
---|
| 5180 | + /* |
---|
| 5181 | + * When we are in the interrupt context, it is irrelevant |
---|
| 5182 | + * to the current task context. It means that any node ok. |
---|
| 5183 | + */ |
---|
| 5184 | + if (!in_interrupt() && !ac->nodemask) |
---|
4583 | 5185 | ac->nodemask = &cpuset_current_mems_allowed; |
---|
4584 | 5186 | else |
---|
4585 | 5187 | *alloc_flags |= ALLOC_CPUSET; |
---|
.. | .. |
---|
4593 | 5195 | if (should_fail_alloc_page(gfp_mask, order)) |
---|
4594 | 5196 | return false; |
---|
4595 | 5197 | |
---|
4596 | | - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) |
---|
4597 | | - *alloc_flags |= ALLOC_CMA; |
---|
| 5198 | + *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); |
---|
4598 | 5199 | |
---|
4599 | | - return true; |
---|
4600 | | -} |
---|
4601 | | - |
---|
4602 | | -/* Determine whether to spread dirty pages and what the first usable zone */ |
---|
4603 | | -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) |
---|
4604 | | -{ |
---|
4605 | 5200 | /* Dirty zone balancing only done in the fast path */ |
---|
4606 | 5201 | ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); |
---|
4607 | 5202 | |
---|
.. | .. |
---|
4611 | 5206 | * may get reset for allocations that ignore memory policies. |
---|
4612 | 5207 | */ |
---|
4613 | 5208 | ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, |
---|
4614 | | - ac->high_zoneidx, ac->nodemask); |
---|
| 5209 | + ac->highest_zoneidx, ac->nodemask); |
---|
| 5210 | + |
---|
| 5211 | + return true; |
---|
4615 | 5212 | } |
---|
4616 | 5213 | |
---|
4617 | 5214 | /* |
---|
.. | .. |
---|
4640 | 5237 | if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) |
---|
4641 | 5238 | return NULL; |
---|
4642 | 5239 | |
---|
4643 | | - finalise_ac(gfp_mask, &ac); |
---|
| 5240 | + /* |
---|
| 5241 | + * Forbid the first pass from falling back to types that fragment |
---|
| 5242 | + * memory until all local zones are considered. |
---|
| 5243 | + */ |
---|
| 5244 | + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); |
---|
4644 | 5245 | |
---|
4645 | 5246 | /* First allocation attempt */ |
---|
4646 | 5247 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
---|
.. | .. |
---|
4660 | 5261 | * Restore the original nodemask if it was potentially replaced with |
---|
4661 | 5262 | * &cpuset_current_mems_allowed to optimize the fast-path attempt. |
---|
4662 | 5263 | */ |
---|
4663 | | - if (unlikely(ac.nodemask != nodemask)) |
---|
4664 | | - ac.nodemask = nodemask; |
---|
| 5264 | + ac.nodemask = nodemask; |
---|
4665 | 5265 | |
---|
4666 | 5266 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
---|
4667 | 5267 | |
---|
4668 | 5268 | out: |
---|
4669 | 5269 | if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && |
---|
4670 | | - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { |
---|
| 5270 | + unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { |
---|
4671 | 5271 | __free_pages(page, order); |
---|
4672 | 5272 | page = NULL; |
---|
4673 | 5273 | } |
---|
.. | .. |
---|
4705 | 5305 | if (order == 0) /* Via pcp? */ |
---|
4706 | 5306 | free_unref_page(page); |
---|
4707 | 5307 | else |
---|
4708 | | - __free_pages_ok(page, order); |
---|
| 5308 | + __free_pages_ok(page, order, FPI_NONE); |
---|
4709 | 5309 | } |
---|
4710 | 5310 | |
---|
4711 | 5311 | void __free_pages(struct page *page, unsigned int order) |
---|
4712 | 5312 | { |
---|
| 5313 | + trace_android_vh_free_pages(page, order); |
---|
4713 | 5314 | if (put_page_testzero(page)) |
---|
4714 | 5315 | free_the_page(page, order); |
---|
| 5316 | + else if (!PageHead(page)) |
---|
| 5317 | + while (order-- > 0) |
---|
| 5318 | + free_the_page(page + (1 << order), order); |
---|
4715 | 5319 | } |
---|
4716 | 5320 | EXPORT_SYMBOL(__free_pages); |
---|
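The new `else` branch in `__free_pages()` covers a failed `put_page_testzero()` on a non-compound high-order page: whoever holds the remaining reference only pins (and will later free) the head page, so the rest of the block has to be returned now, as progressively smaller sub-blocks. The sketch below just prints which (pfn range, order) pieces that `while (order-- > 0)` loop hands back, using hypothetical numbers:

```c
#include <stdio.h>

/*
 * Mimic: while (order-- > 0) free_the_page(page + (1 << order), order);
 * For order N this frees the upper half, then the upper quarter of the
 * lower half, and so on, leaving only page 0 (still referenced) alone.
 */
static void free_tail_pages(unsigned long pfn, unsigned int order)
{
	while (order-- > 0)
		printf("free pfn %lu..%lu (order %u)\n",
		       pfn + (1UL << order),
		       pfn + (2UL << order) - 1,
		       order);
	printf("pfn %lu kept by the remaining reference\n", pfn);
}

int main(void)
{
	/* An order-3 block at pfn 1000: frees 1004-1007, 1002-1003, 1001. */
	free_tail_pages(1000, 3);
	return 0;
}
```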
4717 | 5321 | |
---|
.. | .. |
---|
4816 | 5420 | /* reset page count bias and offset to start of new frag */ |
---|
4817 | 5421 | nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; |
---|
4818 | 5422 | offset = size - fragsz; |
---|
| 5423 | + if (unlikely(offset < 0)) { |
---|
| 5424 | + /* |
---|
| 5425 | + * The caller is trying to allocate a fragment |
---|
| 5426 | + * with fragsz > PAGE_SIZE but the cache isn't big |
---|
| 5427 | + * enough to satisfy the request; this may
---|
| 5428 | + * happen in low memory conditions.
---|
| 5429 | + * We don't release the cache page because
---|
| 5430 | + * it could make memory pressure worse,
---|
| 5431 | + * so we simply return NULL here.
---|
| 5432 | + */ |
---|
| 5433 | + return NULL; |
---|
| 5434 | + } |
---|
4819 | 5435 | } |
---|
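The offset < 0 guard added above triggers when the requested fragment is larger than the entire backing page of the refilled cache, which can happen under memory pressure when only an order-0 page could be allocated. A trivial sketch of the arithmetic (the 4 KiB cache size is an assumption; the real cache may be PAGE_FRAG_CACHE_MAX_SIZE):

```c
#include <stdio.h>

int main(void)
{
	long size = 4096;	/* cache refilled with a single 4 KiB page */
	long fragsz = 5000;	/* caller wants more than the page can hold */
	long offset = size - fragsz;

	/* Matches the new check: a negative offset can never be satisfied,
	 * so the allocator returns NULL rather than retrying forever. */
	if (offset < 0)
		printf("fragsz %ld > cache size %ld -> return NULL\n",
		       fragsz, size);
	return 0;
}
```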
4820 | 5436 | |
---|
4821 | 5437 | nc->pagecnt_bias--; |
---|
.. | .. |
---|
4856 | 5472 | /** |
---|
4857 | 5473 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. |
---|
4858 | 5474 | * @size: the number of bytes to allocate |
---|
4859 | | - * @gfp_mask: GFP flags for the allocation |
---|
| 5475 | + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
---|
4860 | 5476 | * |
---|
4861 | 5477 | * This function is similar to alloc_pages(), except that it allocates the |
---|
4862 | 5478 | * minimum number of pages to satisfy the request. alloc_pages() can only |
---|
.. | .. |
---|
4865 | 5481 | * This function is also limited by MAX_ORDER. |
---|
4866 | 5482 | * |
---|
4867 | 5483 | * Memory allocated by this function must be released by free_pages_exact(). |
---|
| 5484 | + * |
---|
| 5485 | + * Return: pointer to the allocated area or %NULL in case of error. |
---|
4868 | 5486 | */ |
---|
4869 | 5487 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) |
---|
4870 | 5488 | { |
---|
4871 | 5489 | unsigned int order = get_order(size); |
---|
4872 | 5490 | unsigned long addr; |
---|
| 5491 | + |
---|
| 5492 | + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) |
---|
| 5493 | + gfp_mask &= ~__GFP_COMP; |
---|
4873 | 5494 | |
---|
4874 | 5495 | addr = __get_free_pages(gfp_mask, order); |
---|
4875 | 5496 | return make_alloc_exact(addr, order, size); |
---|
.. | .. |
---|
4881 | 5502 | * pages on a node. |
---|
4882 | 5503 | * @nid: the preferred node ID where memory should be allocated |
---|
4883 | 5504 | * @size: the number of bytes to allocate |
---|
4884 | | - * @gfp_mask: GFP flags for the allocation |
---|
| 5505 | + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
---|
4885 | 5506 | * |
---|
4886 | 5507 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
---|
4887 | 5508 | * back. |
---|
| 5509 | + * |
---|
| 5510 | + * Return: pointer to the allocated area or %NULL in case of error. |
---|
4888 | 5511 | */ |
---|
4889 | 5512 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
---|
4890 | 5513 | { |
---|
4891 | 5514 | unsigned int order = get_order(size); |
---|
4892 | | - struct page *p = alloc_pages_node(nid, gfp_mask, order); |
---|
| 5515 | + struct page *p; |
---|
| 5516 | + |
---|
| 5517 | + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) |
---|
| 5518 | + gfp_mask &= ~__GFP_COMP; |
---|
| 5519 | + |
---|
| 5520 | + p = alloc_pages_node(nid, gfp_mask, order); |
---|
4893 | 5521 | if (!p) |
---|
4894 | 5522 | return NULL; |
---|
4895 | 5523 | return make_alloc_exact((unsigned long)page_address(p), order, size); |
---|
.. | .. |
---|
4918 | 5546 | * nr_free_zone_pages - count number of pages beyond high watermark |
---|
4919 | 5547 | * @offset: The zone index of the highest zone |
---|
4920 | 5548 | * |
---|
4921 | | - * nr_free_zone_pages() counts the number of counts pages which are beyond the |
---|
| 5549 | + * nr_free_zone_pages() counts the number of pages which are beyond the |
---|
4922 | 5550 | * high watermark within all zones at or below a given zone index. For each |
---|
4923 | 5551 | * zone, the number of pages is calculated as: |
---|
4924 | 5552 | * |
---|
4925 | 5553 | * nr_free_zone_pages = managed_pages - high_pages |
---|
| 5554 | + * |
---|
| 5555 | + * Return: number of pages beyond high watermark. |
---|
4926 | 5556 | */ |
---|
4927 | 5557 | static unsigned long nr_free_zone_pages(int offset) |
---|
4928 | 5558 | { |
---|
.. | .. |
---|
4935 | 5565 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
---|
4936 | 5566 | |
---|
4937 | 5567 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
---|
4938 | | - unsigned long size = zone->managed_pages; |
---|
| 5568 | + unsigned long size = zone_managed_pages(zone); |
---|
4939 | 5569 | unsigned long high = high_wmark_pages(zone); |
---|
4940 | 5570 | if (size > high) |
---|
4941 | 5571 | sum += size - high; |
---|
.. | .. |
---|
4949 | 5579 | * |
---|
4950 | 5580 | * nr_free_buffer_pages() counts the number of pages which are beyond the high |
---|
4951 | 5581 | * watermark within ZONE_DMA and ZONE_NORMAL. |
---|
| 5582 | + * |
---|
| 5583 | + * Return: number of pages beyond high watermark within ZONE_DMA and |
---|
| 5584 | + * ZONE_NORMAL. |
---|
4952 | 5585 | */ |
---|
4953 | 5586 | unsigned long nr_free_buffer_pages(void) |
---|
4954 | 5587 | { |
---|
4955 | 5588 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
---|
4956 | 5589 | } |
---|
4957 | 5590 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
---|
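The kernel-doc above reduces each zone to managed_pages - high_pages; nr_free_zone_pages() simply sums that difference (clamped at zero) over the zonelist up to the given zone index. A standalone sketch with made-up zone figures (struct zone_snap and the numbers are hypothetical, not kernel API):

```c
#include <stdio.h>

/* Hypothetical per-zone snapshot, not a kernel structure. */
struct zone_snap {
	const char *name;
	unsigned long managed_pages;	/* pages managed by the buddy allocator */
	unsigned long high_wmark;	/* high watermark, in pages */
};

int main(void)
{
	struct zone_snap zones[] = {
		{ "DMA32",  262144, 2048 },	/* ~1 GiB of 4 KiB pages */
		{ "Normal", 786432, 6144 },	/* ~3 GiB */
	};
	unsigned long sum = 0;

	/* nr_free_zone_pages = sum over zones of (managed - high), if positive */
	for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		if (zones[i].managed_pages > zones[i].high_wmark)
			sum += zones[i].managed_pages - zones[i].high_wmark;

	printf("pages beyond the high watermark: %lu\n", sum);	/* 1040384 */
	return 0;
}
```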
4958 | | - |
---|
4959 | | -/** |
---|
4960 | | - * nr_free_pagecache_pages - count number of pages beyond high watermark |
---|
4961 | | - * |
---|
4962 | | - * nr_free_pagecache_pages() counts the number of pages which are beyond the |
---|
4963 | | - * high watermark within all zones. |
---|
4964 | | - */ |
---|
4965 | | -unsigned long nr_free_pagecache_pages(void) |
---|
4966 | | -{ |
---|
4967 | | - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
---|
4968 | | -} |
---|
4969 | 5591 | |
---|
4970 | 5592 | static inline void show_node(struct zone *zone) |
---|
4971 | 5593 | { |
---|
.. | .. |
---|
4987 | 5609 | pages[lru] = global_node_page_state(NR_LRU_BASE + lru); |
---|
4988 | 5610 | |
---|
4989 | 5611 | for_each_zone(zone) |
---|
4990 | | - wmark_low += zone->watermark[WMARK_LOW]; |
---|
| 5612 | + wmark_low += low_wmark_pages(zone); |
---|
4991 | 5613 | |
---|
4992 | 5614 | /* |
---|
4993 | 5615 | * Estimate the amount of memory available for userspace allocations, |
---|
.. | .. |
---|
5009 | 5631 | * items that are in use, and cannot be freed. Cap this estimate at the |
---|
5010 | 5632 | * low watermark. |
---|
5011 | 5633 | */ |
---|
5012 | | - reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + |
---|
5013 | | - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
---|
| 5634 | + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + |
---|
| 5635 | + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); |
---|
5014 | 5636 | available += reclaimable - min(reclaimable / 2, wmark_low); |
---|
5015 | 5637 | |
---|
5016 | 5638 | if (available < 0) |
---|
.. | .. |
---|
5021 | 5643 | |
---|
5022 | 5644 | void si_meminfo(struct sysinfo *val) |
---|
5023 | 5645 | { |
---|
5024 | | - val->totalram = totalram_pages; |
---|
| 5646 | + val->totalram = totalram_pages(); |
---|
5025 | 5647 | val->sharedram = global_node_page_state(NR_SHMEM); |
---|
5026 | 5648 | val->freeram = global_zone_page_state(NR_FREE_PAGES); |
---|
5027 | 5649 | val->bufferram = nr_blockdev_pages(); |
---|
5028 | | - val->totalhigh = totalhigh_pages; |
---|
| 5650 | + val->totalhigh = totalhigh_pages(); |
---|
5029 | 5651 | val->freehigh = nr_free_highpages(); |
---|
5030 | 5652 | val->mem_unit = PAGE_SIZE; |
---|
5031 | 5653 | } |
---|
.. | .. |
---|
5042 | 5664 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
5043 | 5665 | |
---|
5044 | 5666 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
---|
5045 | | - managed_pages += pgdat->node_zones[zone_type].managed_pages; |
---|
| 5667 | + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); |
---|
5046 | 5668 | val->totalram = managed_pages; |
---|
5047 | 5669 | val->sharedram = node_page_state(pgdat, NR_SHMEM); |
---|
5048 | 5670 | val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); |
---|
.. | .. |
---|
5051 | 5673 | struct zone *zone = &pgdat->node_zones[zone_type]; |
---|
5052 | 5674 | |
---|
5053 | 5675 | if (is_highmem(zone)) { |
---|
5054 | | - managed_highpages += zone->managed_pages; |
---|
| 5676 | + managed_highpages += zone_managed_pages(zone); |
---|
5055 | 5677 | free_highpages += zone_page_state(zone, NR_FREE_PAGES); |
---|
5056 | 5678 | } |
---|
5057 | 5679 | } |
---|
.. | .. |
---|
5140 | 5762 | |
---|
5141 | 5763 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
---|
5142 | 5764 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
---|
5143 | | - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
---|
| 5765 | + " unevictable:%lu dirty:%lu writeback:%lu\n" |
---|
5144 | 5766 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
---|
5145 | 5767 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
---|
5146 | 5768 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
---|
.. | .. |
---|
5153 | 5775 | global_node_page_state(NR_UNEVICTABLE), |
---|
5154 | 5776 | global_node_page_state(NR_FILE_DIRTY), |
---|
5155 | 5777 | global_node_page_state(NR_WRITEBACK), |
---|
5156 | | - global_node_page_state(NR_UNSTABLE_NFS), |
---|
5157 | | - global_node_page_state(NR_SLAB_RECLAIMABLE), |
---|
5158 | | - global_node_page_state(NR_SLAB_UNRECLAIMABLE), |
---|
| 5778 | + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), |
---|
| 5779 | + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), |
---|
5159 | 5780 | global_node_page_state(NR_FILE_MAPPED), |
---|
5160 | 5781 | global_node_page_state(NR_SHMEM), |
---|
5161 | 5782 | global_zone_page_state(NR_PAGETABLE), |
---|
.. | .. |
---|
5164 | 5785 | free_pcp, |
---|
5165 | 5786 | global_zone_page_state(NR_FREE_CMA_PAGES)); |
---|
5166 | 5787 | |
---|
| 5788 | + trace_android_vh_show_mapcount_pages(NULL); |
---|
5167 | 5789 | for_each_online_pgdat(pgdat) { |
---|
5168 | 5790 | if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) |
---|
5169 | 5791 | continue; |
---|
.. | .. |
---|
5186 | 5808 | " anon_thp: %lukB" |
---|
5187 | 5809 | #endif |
---|
5188 | 5810 | " writeback_tmp:%lukB" |
---|
5189 | | - " unstable:%lukB" |
---|
| 5811 | + " kernel_stack:%lukB" |
---|
| 5812 | +#ifdef CONFIG_SHADOW_CALL_STACK |
---|
| 5813 | + " shadow_call_stack:%lukB" |
---|
| 5814 | +#endif |
---|
5190 | 5815 | " all_unreclaimable? %s" |
---|
5191 | 5816 | "\n", |
---|
5192 | 5817 | pgdat->node_id, |
---|
.. | .. |
---|
5208 | 5833 | K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), |
---|
5209 | 5834 | #endif |
---|
5210 | 5835 | K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), |
---|
5211 | | - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), |
---|
| 5836 | + node_page_state(pgdat, NR_KERNEL_STACK_KB), |
---|
| 5837 | +#ifdef CONFIG_SHADOW_CALL_STACK |
---|
| 5838 | + node_page_state(pgdat, NR_KERNEL_SCS_KB), |
---|
| 5839 | +#endif |
---|
5212 | 5840 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? |
---|
5213 | 5841 | "yes" : "no"); |
---|
5214 | 5842 | } |
---|
.. | .. |
---|
5230 | 5858 | " min:%lukB" |
---|
5231 | 5859 | " low:%lukB" |
---|
5232 | 5860 | " high:%lukB" |
---|
| 5861 | + " reserved_highatomic:%luKB" |
---|
5233 | 5862 | " active_anon:%lukB" |
---|
5234 | 5863 | " inactive_anon:%lukB" |
---|
5235 | 5864 | " active_file:%lukB" |
---|
.. | .. |
---|
5239 | 5868 | " present:%lukB" |
---|
5240 | 5869 | " managed:%lukB" |
---|
5241 | 5870 | " mlocked:%lukB" |
---|
5242 | | - " kernel_stack:%lukB" |
---|
5243 | | -#ifdef CONFIG_SHADOW_CALL_STACK |
---|
5244 | | - " shadow_call_stack:%lukB" |
---|
5245 | | -#endif |
---|
5246 | 5871 | " pagetables:%lukB" |
---|
5247 | 5872 | " bounce:%lukB" |
---|
5248 | 5873 | " free_pcp:%lukB" |
---|
.. | .. |
---|
5254 | 5879 | K(min_wmark_pages(zone)), |
---|
5255 | 5880 | K(low_wmark_pages(zone)), |
---|
5256 | 5881 | K(high_wmark_pages(zone)), |
---|
| 5882 | + K(zone->nr_reserved_highatomic), |
---|
5257 | 5883 | K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), |
---|
5258 | 5884 | K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), |
---|
5259 | 5885 | K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), |
---|
.. | .. |
---|
5261 | 5887 | K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), |
---|
5262 | 5888 | K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), |
---|
5263 | 5889 | K(zone->present_pages), |
---|
5264 | | - K(zone->managed_pages), |
---|
| 5890 | + K(zone_managed_pages(zone)), |
---|
5265 | 5891 | K(zone_page_state(zone, NR_MLOCK)), |
---|
5266 | | - zone_page_state(zone, NR_KERNEL_STACK_KB), |
---|
5267 | | -#ifdef CONFIG_SHADOW_CALL_STACK |
---|
5268 | | - zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024, |
---|
5269 | | -#endif |
---|
5270 | 5892 | K(zone_page_state(zone, NR_PAGETABLE)), |
---|
5271 | 5893 | K(zone_page_state(zone, NR_BOUNCE)), |
---|
5272 | 5894 | K(free_pcp), |
---|
.. | .. |
---|
5298 | 5920 | |
---|
5299 | 5921 | types[order] = 0; |
---|
5300 | 5922 | for (type = 0; type < MIGRATE_TYPES; type++) { |
---|
5301 | | - if (!list_empty(&area->free_list[type])) |
---|
| 5923 | + if (!free_area_empty(area, type)) |
---|
5302 | 5924 | types[order] |= 1 << type; |
---|
5303 | 5925 | } |
---|
5304 | 5926 | } |
---|
.. | .. |
---|
5339 | 5961 | do { |
---|
5340 | 5962 | zone_type--; |
---|
5341 | 5963 | zone = pgdat->node_zones + zone_type; |
---|
5342 | | - if (managed_zone(zone)) { |
---|
| 5964 | + if (populated_zone(zone)) { |
---|
5343 | 5965 | zoneref_set_zone(zone, &zonerefs[nr_zones++]); |
---|
5344 | 5966 | check_highest_zone(zone_type); |
---|
5345 | 5967 | } |
---|
.. | .. |
---|
5365 | 5987 | return 0; |
---|
5366 | 5988 | } |
---|
5367 | 5989 | |
---|
5368 | | -static __init int setup_numa_zonelist_order(char *s) |
---|
5369 | | -{ |
---|
5370 | | - if (!s) |
---|
5371 | | - return 0; |
---|
5372 | | - |
---|
5373 | | - return __parse_numa_zonelist_order(s); |
---|
5374 | | -} |
---|
5375 | | -early_param("numa_zonelist_order", setup_numa_zonelist_order); |
---|
5376 | | - |
---|
5377 | 5990 | char numa_zonelist_order[] = "Node"; |
---|
5378 | 5991 | |
---|
5379 | 5992 | /* |
---|
5380 | 5993 | * sysctl handler for numa_zonelist_order |
---|
5381 | 5994 | */ |
---|
5382 | 5995 | int numa_zonelist_order_handler(struct ctl_table *table, int write, |
---|
5383 | | - void __user *buffer, size_t *length, |
---|
5384 | | - loff_t *ppos) |
---|
| 5996 | + void *buffer, size_t *length, loff_t *ppos) |
---|
5385 | 5997 | { |
---|
5386 | | - char *str; |
---|
5387 | | - int ret; |
---|
5388 | | - |
---|
5389 | | - if (!write) |
---|
5390 | | - return proc_dostring(table, write, buffer, length, ppos); |
---|
5391 | | - str = memdup_user_nul(buffer, 16); |
---|
5392 | | - if (IS_ERR(str)) |
---|
5393 | | - return PTR_ERR(str); |
---|
5394 | | - |
---|
5395 | | - ret = __parse_numa_zonelist_order(str); |
---|
5396 | | - kfree(str); |
---|
5397 | | - return ret; |
---|
| 5998 | + if (write) |
---|
| 5999 | + return __parse_numa_zonelist_order(buffer); |
---|
| 6000 | + return proc_dostring(table, write, buffer, length, ppos); |
---|
5398 | 6001 | } |
---|
5399 | 6002 | |
---|
5400 | 6003 | |
---|
.. | .. |
---|
5413 | 6016 | * from each node to each node in the system), and should also prefer nodes |
---|
5414 | 6017 | * with no CPUs, since presumably they'll have very little allocation pressure |
---|
5415 | 6018 | * on them otherwise. |
---|
5416 | | - * It returns -1 if no node is found. |
---|
| 6019 | + * |
---|
| 6020 | + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. |
---|
5417 | 6021 | */ |
---|
5418 | 6022 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
---|
5419 | 6023 | { |
---|
5420 | 6024 | int n, val; |
---|
5421 | 6025 | int min_val = INT_MAX; |
---|
5422 | 6026 | int best_node = NUMA_NO_NODE; |
---|
5423 | | - const struct cpumask *tmp = cpumask_of_node(0); |
---|
5424 | 6027 | |
---|
5425 | 6028 | /* Use the local node if we haven't already */ |
---|
5426 | 6029 | if (!node_isset(node, *used_node_mask)) { |
---|
.. | .. |
---|
5441 | 6044 | val += (n < node); |
---|
5442 | 6045 | |
---|
5443 | 6046 | /* Give preference to headless and unused nodes */ |
---|
5444 | | - tmp = cpumask_of_node(n); |
---|
5445 | | - if (!cpumask_empty(tmp)) |
---|
| 6047 | + if (!cpumask_empty(cpumask_of_node(n))) |
---|
5446 | 6048 | val += PENALTY_FOR_NODE_WITH_CPUS; |
---|
5447 | 6049 | |
---|
5448 | 6050 | /* Slight preference for less loaded node */ |
---|
.. | .. |
---|
5513 | 6115 | { |
---|
5514 | 6116 | static int node_order[MAX_NUMNODES]; |
---|
5515 | 6117 | int node, load, nr_nodes = 0; |
---|
5516 | | - nodemask_t used_mask; |
---|
| 6118 | + nodemask_t used_mask = NODE_MASK_NONE; |
---|
5517 | 6119 | int local_node, prev_node; |
---|
5518 | 6120 | |
---|
5519 | 6121 | /* NUMA-aware ordering of nodes */ |
---|
5520 | 6122 | local_node = pgdat->node_id; |
---|
5521 | 6123 | load = nr_online_nodes; |
---|
5522 | 6124 | prev_node = local_node; |
---|
5523 | | - nodes_clear(used_mask); |
---|
5524 | 6125 | |
---|
5525 | 6126 | memset(node_order, 0, sizeof(node_order)); |
---|
5526 | 6127 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
---|
.. | .. |
---|
5627 | 6228 | int nid; |
---|
5628 | 6229 | int __maybe_unused cpu; |
---|
5629 | 6230 | pg_data_t *self = data; |
---|
5630 | | - static DEFINE_SPINLOCK(lock); |
---|
5631 | 6231 | |
---|
5632 | | - spin_lock(&lock); |
---|
| 6232 | + write_seqlock(&zonelist_update_seq); |
---|
5633 | 6233 | |
---|
5634 | 6234 | #ifdef CONFIG_NUMA |
---|
5635 | 6235 | memset(node_load, 0, sizeof(node_load)); |
---|
.. | .. |
---|
5662 | 6262 | #endif |
---|
5663 | 6263 | } |
---|
5664 | 6264 | |
---|
5665 | | - spin_unlock(&lock); |
---|
| 6265 | + write_sequnlock(&zonelist_update_seq); |
---|
5666 | 6266 | } |
---|
5667 | 6267 | |
---|
5668 | 6268 | static noinline void __init |
---|
.. | .. |
---|
5700 | 6300 | */ |
---|
5701 | 6301 | void __ref build_all_zonelists(pg_data_t *pgdat) |
---|
5702 | 6302 | { |
---|
| 6303 | + unsigned long vm_total_pages; |
---|
| 6304 | + |
---|
5703 | 6305 | if (system_state == SYSTEM_BOOTING) { |
---|
5704 | 6306 | build_all_zonelists_init(); |
---|
5705 | 6307 | } else { |
---|
5706 | 6308 | __build_all_zonelists(pgdat); |
---|
5707 | 6309 | /* cpuset refresh routine should be here */ |
---|
5708 | 6310 | } |
---|
5709 | | - vm_total_pages = nr_free_pagecache_pages(); |
---|
| 6311 | + /* Get the number of free pages beyond high watermark in all zones. */ |
---|
| 6312 | + vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
---|
5710 | 6313 | /* |
---|
5711 | 6314 | * Disable grouping by mobility if the number of pages in the |
---|
5712 | 6315 | * system is too low to allow the mechanism to work. It would be |
---|
.. | .. |
---|
5719 | 6322 | else |
---|
5720 | 6323 | page_group_by_mobility_disabled = 0; |
---|
5721 | 6324 | |
---|
5722 | | - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", |
---|
| 6325 | + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", |
---|
5723 | 6326 | nr_online_nodes, |
---|
5724 | 6327 | page_group_by_mobility_disabled ? "off" : "on", |
---|
5725 | 6328 | vm_total_pages); |
---|
.. | .. |
---|
5728 | 6331 | #endif |
---|
5729 | 6332 | } |
---|
5730 | 6333 | |
---|
| 6334 | +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ |
---|
| 6335 | +static bool __meminit |
---|
| 6336 | +overlap_memmap_init(unsigned long zone, unsigned long *pfn) |
---|
| 6337 | +{ |
---|
| 6338 | + static struct memblock_region *r; |
---|
| 6339 | + |
---|
| 6340 | + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { |
---|
| 6341 | + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { |
---|
| 6342 | + for_each_mem_region(r) { |
---|
| 6343 | + if (*pfn < memblock_region_memory_end_pfn(r)) |
---|
| 6344 | + break; |
---|
| 6345 | + } |
---|
| 6346 | + } |
---|
| 6347 | + if (*pfn >= memblock_region_memory_base_pfn(r) && |
---|
| 6348 | + memblock_is_mirror(r)) { |
---|
| 6349 | + *pfn = memblock_region_memory_end_pfn(r); |
---|
| 6350 | + return true; |
---|
| 6351 | + } |
---|
| 6352 | + } |
---|
| 6353 | + return false; |
---|
| 6354 | +} |
---|
| 6355 | + |
---|
5731 | 6356 | /* |
---|
5732 | 6357 | * Initially all pages are reserved - free ones are freed |
---|
5733 | | - * up by free_all_bootmem() once the early boot process is |
---|
| 6358 | + * up by memblock_free_all() once the early boot process is |
---|
5734 | 6359 | * done. Non-atomic initialization, single-pass. |
---|
| 6360 | + * |
---|
| 6361 | + * All aligned pageblocks are initialized to the specified migratetype |
---|
| 6362 | + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related |
---|
| 6363 | + * zone stats (e.g., nr_isolate_pageblock) are touched. |
---|
5735 | 6364 | */ |
---|
5736 | 6365 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
---|
5737 | | - unsigned long start_pfn, enum meminit_context context, |
---|
5738 | | - struct vmem_altmap *altmap) |
---|
| 6366 | + unsigned long start_pfn, unsigned long zone_end_pfn, |
---|
| 6367 | + enum meminit_context context, |
---|
| 6368 | + struct vmem_altmap *altmap, int migratetype) |
---|
5739 | 6369 | { |
---|
5740 | | - unsigned long end_pfn = start_pfn + size; |
---|
5741 | | - pg_data_t *pgdat = NODE_DATA(nid); |
---|
5742 | | - unsigned long pfn; |
---|
5743 | | - unsigned long nr_initialised = 0; |
---|
| 6370 | + unsigned long pfn, end_pfn = start_pfn + size; |
---|
5744 | 6371 | struct page *page; |
---|
5745 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
5746 | | - struct memblock_region *r = NULL, *tmp; |
---|
5747 | | -#endif |
---|
5748 | 6372 | |
---|
5749 | 6373 | if (highest_memmap_pfn < end_pfn - 1) |
---|
5750 | 6374 | highest_memmap_pfn = end_pfn - 1; |
---|
| 6375 | + |
---|
| 6376 | +#ifdef CONFIG_ZONE_DEVICE |
---|
| 6377 | + /* |
---|
| 6378 | + * Honor reservation requested by the driver for this ZONE_DEVICE |
---|
| 6379 | + * memory. We limit the total number of pages to initialize to just |
---|
| 6380 | + * those that might contain the memory mapping. We will defer the |
---|
| 6381 | + * ZONE_DEVICE page initialization until after we have released |
---|
| 6382 | + * the hotplug lock. |
---|
| 6383 | + */ |
---|
| 6384 | + if (zone == ZONE_DEVICE) { |
---|
| 6385 | + if (!altmap) |
---|
| 6386 | + return; |
---|
| 6387 | + |
---|
| 6388 | + if (start_pfn == altmap->base_pfn) |
---|
| 6389 | + start_pfn += altmap->reserve; |
---|
| 6390 | + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); |
---|
| 6391 | + } |
---|
| 6392 | +#endif |
---|
5751 | 6393 | |
---|
5752 | 6394 | #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT |
---|
5753 | 6395 | /* Zero all page struct in advance */ |
---|
5754 | 6396 | memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size); |
---|
5755 | 6397 | #endif |
---|
5756 | 6398 | |
---|
5757 | | - /* |
---|
5758 | | - * Honor reservation requested by the driver for this ZONE_DEVICE |
---|
5759 | | - * memory |
---|
5760 | | - */ |
---|
5761 | | - if (altmap && start_pfn == altmap->base_pfn) |
---|
5762 | | - start_pfn += altmap->reserve; |
---|
5763 | | - |
---|
5764 | | - for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
---|
| 6399 | + for (pfn = start_pfn; pfn < end_pfn; ) { |
---|
5765 | 6400 | /* |
---|
5766 | 6401 | * There can be holes in boot-time mem_map[]s handed to this |
---|
5767 | 6402 | * function. They do not exist on hotplugged memory. |
---|
5768 | 6403 | */ |
---|
5769 | | - if (context != MEMINIT_EARLY) |
---|
5770 | | - goto not_early; |
---|
5771 | | - |
---|
5772 | | - if (!early_pfn_valid(pfn)) |
---|
5773 | | - continue; |
---|
5774 | | - if (!early_pfn_in_nid(pfn, nid)) |
---|
5775 | | - continue; |
---|
5776 | | - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) |
---|
5777 | | - break; |
---|
5778 | | - |
---|
5779 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
5780 | | - /* |
---|
5781 | | - * Check given memblock attribute by firmware which can affect |
---|
5782 | | - * kernel memory layout. If zone==ZONE_MOVABLE but memory is |
---|
5783 | | - * mirrored, it's an overlapped memmap init. skip it. |
---|
5784 | | - */ |
---|
5785 | | - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { |
---|
5786 | | - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { |
---|
5787 | | - for_each_memblock(memory, tmp) |
---|
5788 | | - if (pfn < memblock_region_memory_end_pfn(tmp)) |
---|
5789 | | - break; |
---|
5790 | | - r = tmp; |
---|
5791 | | - } |
---|
5792 | | - if (pfn >= memblock_region_memory_base_pfn(r) && |
---|
5793 | | - memblock_is_mirror(r)) { |
---|
5794 | | - /* already initialized as NORMAL */ |
---|
5795 | | - pfn = memblock_region_memory_end_pfn(r); |
---|
| 6404 | + if (context == MEMINIT_EARLY) { |
---|
| 6405 | + if (overlap_memmap_init(zone, &pfn)) |
---|
5796 | 6406 | continue; |
---|
5797 | | - } |
---|
| 6407 | + if (defer_init(nid, pfn, zone_end_pfn)) |
---|
| 6408 | + break; |
---|
5798 | 6409 | } |
---|
5799 | | -#endif |
---|
5800 | 6410 | |
---|
5801 | | -not_early: |
---|
5802 | 6411 | page = pfn_to_page(pfn); |
---|
5803 | 6412 | __init_single_page(page, pfn, zone, nid, false); |
---|
5804 | 6413 | if (context == MEMINIT_HOTPLUG) |
---|
5805 | | - SetPageReserved(page); |
---|
| 6414 | + __SetPageReserved(page); |
---|
| 6415 | + |
---|
| 6416 | + /* |
---|
| 6417 | + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, |
---|
| 6418 | + * such that unmovable allocations won't be scattered all |
---|
| 6419 | + * over the place during system boot. |
---|
| 6420 | + */ |
---|
| 6421 | + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { |
---|
| 6422 | + set_pageblock_migratetype(page, migratetype); |
---|
| 6423 | + cond_resched(); |
---|
| 6424 | + } |
---|
| 6425 | + pfn++; |
---|
| 6426 | + } |
---|
| 6427 | +} |
---|
| 6428 | + |
---|
| 6429 | +#ifdef CONFIG_ZONE_DEVICE |
---|
| 6430 | +void __ref memmap_init_zone_device(struct zone *zone, |
---|
| 6431 | + unsigned long start_pfn, |
---|
| 6432 | + unsigned long nr_pages, |
---|
| 6433 | + struct dev_pagemap *pgmap) |
---|
| 6434 | +{ |
---|
| 6435 | + unsigned long pfn, end_pfn = start_pfn + nr_pages; |
---|
| 6436 | + struct pglist_data *pgdat = zone->zone_pgdat; |
---|
| 6437 | + struct vmem_altmap *altmap = pgmap_altmap(pgmap); |
---|
| 6438 | + unsigned long zone_idx = zone_idx(zone); |
---|
| 6439 | + unsigned long start = jiffies; |
---|
| 6440 | + int nid = pgdat->node_id; |
---|
| 6441 | + |
---|
| 6442 | + if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) |
---|
| 6443 | + return; |
---|
| 6444 | + |
---|
| 6445 | + /* |
---|
| 6446 | + * The call to memmap_init should have already taken care |
---|
| 6447 | + * of the pages reserved for the memmap, so we can just jump to |
---|
| 6448 | + * the end of that region and start processing the device pages. |
---|
| 6449 | + */ |
---|
| 6450 | + if (altmap) { |
---|
| 6451 | + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); |
---|
| 6452 | + nr_pages = end_pfn - start_pfn; |
---|
| 6453 | + } |
---|
| 6454 | + |
---|
| 6455 | + for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
---|
| 6456 | + struct page *page = pfn_to_page(pfn); |
---|
| 6457 | + |
---|
| 6458 | + __init_single_page(page, pfn, zone_idx, nid, true); |
---|
| 6459 | + |
---|
| 6460 | + /* |
---|
| 6461 | + * Mark page reserved as it will need to wait for onlining |
---|
| 6462 | + * phase for it to be fully associated with a zone. |
---|
| 6463 | + * |
---|
| 6464 | + * We can use the non-atomic __set_bit operation for setting |
---|
| 6465 | + * the flag as we are still initializing the pages. |
---|
| 6466 | + */ |
---|
| 6467 | + __SetPageReserved(page); |
---|
| 6468 | + |
---|
| 6469 | + /* |
---|
| 6470 | + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer |
---|
| 6471 | + * and zone_device_data. It is a bug if a ZONE_DEVICE page is |
---|
| 6472 | + * ever freed or placed on a driver-private list. |
---|
| 6473 | + */ |
---|
| 6474 | + page->pgmap = pgmap; |
---|
| 6475 | + page->zone_device_data = NULL; |
---|
5806 | 6476 | |
---|
5807 | 6477 | /* |
---|
5808 | 6478 | * Mark the block movable so that blocks are reserved for |
---|
.. | .. |
---|
5811 | 6481 | * the address space during boot when many long-lived |
---|
5812 | 6482 | * kernel allocations are made. |
---|
5813 | 6483 | * |
---|
5814 | | - * bitmap is created for zone's valid pfn range. but memmap |
---|
5815 | | - * can be created for invalid pages (for alignment) |
---|
5816 | | - * check here not to call set_pageblock_migratetype() against |
---|
5817 | | - * pfn out of zone. |
---|
5818 | | - * |
---|
5819 | 6484 | * Please note that MEMINIT_HOTPLUG path doesn't clear memmap |
---|
5820 | | - * because this is done early in sparse_add_one_section |
---|
| 6485 | + * because this is done early in section_activate() |
---|
5821 | 6486 | */ |
---|
5822 | | - if (!(pfn & (pageblock_nr_pages - 1))) { |
---|
| 6487 | + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { |
---|
5823 | 6488 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
---|
5824 | 6489 | cond_resched(); |
---|
5825 | 6490 | } |
---|
5826 | 6491 | } |
---|
| 6492 | + |
---|
| 6493 | + pr_info("%s initialised %lu pages in %ums\n", __func__, |
---|
| 6494 | + nr_pages, jiffies_to_msecs(jiffies - start)); |
---|
5827 | 6495 | } |
---|
5828 | 6496 | |
---|
| 6497 | +#endif |
---|
5829 | 6498 | static void __meminit zone_init_free_lists(struct zone *zone) |
---|
5830 | 6499 | { |
---|
5831 | 6500 | unsigned int order, t; |
---|
.. | .. |
---|
5835 | 6504 | } |
---|
5836 | 6505 | } |
---|
5837 | 6506 | |
---|
5838 | | -#ifndef __HAVE_ARCH_MEMMAP_INIT |
---|
5839 | | -#define memmap_init(size, nid, zone, start_pfn) \ |
---|
5840 | | - memmap_init_zone((size), (nid), (zone), (start_pfn), \ |
---|
5841 | | - MEMINIT_EARLY, NULL) |
---|
| 6507 | +/* |
---|
| 6508 | + * Only struct pages that correspond to ranges defined by memblock.memory |
---|
| 6509 | + * are zeroed and initialized by going through __init_single_page() during |
---|
| 6510 | + * memmap_init_zone_range(). |
---|
| 6511 | + * |
---|
| 6512 | + * But, there could be struct pages that correspond to holes in |
---|
| 6513 | + * memblock.memory. This can happen because of the following reasons: |
---|
| 6514 | + * - physical memory bank size is not necessarily the exact multiple of the |
---|
| 6515 | + * arbitrary section size |
---|
| 6516 | + * - early reserved memory may not be listed in memblock.memory |
---|
| 6517 | + * - memory layouts defined with memmap= kernel parameter may not align |
---|
| 6518 | + * nicely with memmap sections |
---|
| 6519 | + * |
---|
| 6520 | + * Explicitly initialize those struct pages so that: |
---|
| 6521 | + * - PG_Reserved is set |
---|
| 6522 | + * - zone and node links point to zone and node that span the page if the |
---|
| 6523 | + * hole is in the middle of a zone |
---|
| 6524 | + * - zone and node links point to adjacent zone/node if the hole falls on |
---|
| 6525 | + * the zone boundary; the pages in such holes will be prepended to the |
---|
| 6526 | + * zone/node above the hole except for the trailing pages in the last |
---|
| 6527 | + * section that will be appended to the zone/node below. |
---|
| 6528 | + */ |
---|
| 6529 | +static void __init init_unavailable_range(unsigned long spfn, |
---|
| 6530 | + unsigned long epfn, |
---|
| 6531 | + int zone, int node) |
---|
| 6532 | +{ |
---|
| 6533 | + unsigned long pfn; |
---|
| 6534 | + u64 pgcnt = 0; |
---|
| 6535 | + |
---|
| 6536 | + for (pfn = spfn; pfn < epfn; pfn++) { |
---|
| 6537 | + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { |
---|
| 6538 | + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) |
---|
| 6539 | + + pageblock_nr_pages - 1; |
---|
| 6540 | + continue; |
---|
| 6541 | + } |
---|
| 6542 | + __init_single_page(pfn_to_page(pfn), pfn, zone, node, true); |
---|
| 6543 | + __SetPageReserved(pfn_to_page(pfn)); |
---|
| 6544 | + pgcnt++; |
---|
| 6545 | + } |
---|
| 6546 | + |
---|
| 6547 | + if (pgcnt) |
---|
| 6548 | + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", |
---|
| 6549 | + node, zone_names[zone], pgcnt); |
---|
| 6550 | +} |
---|
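One detail of init_unavailable_range() worth calling out: when the pageblock-aligned base of the current pfn has no valid memmap, the loop skips straight to the end of that pageblock instead of probing every pfn. A userspace sketch of that skip, with a hypothetical pfn_valid() and pageblock size:

```c
#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* hypothetical: 2 MiB pageblocks of 4 KiB pages */
#define ALIGN_DOWN(x, a)   ((x) & ~((a) - 1))

/* Hypothetical validity check: pretend only pfns >= 1024 have a memmap. */
static bool pfn_valid(unsigned long pfn)
{
	return pfn >= 1024;
}

int main(void)
{
	unsigned long pfn, inited = 0;

	for (pfn = 100; pfn < 2048; pfn++) {
		if (!pfn_valid(ALIGN_DOWN(pfn, PAGEBLOCK_NR_PAGES))) {
			/* Jump over the rest of this pageblock in one step. */
			pfn = ALIGN_DOWN(pfn, PAGEBLOCK_NR_PAGES)
			      + PAGEBLOCK_NR_PAGES - 1;
			continue;
		}
		inited++;	/* stands in for __init_single_page() + __SetPageReserved() */
	}
	printf("initialised %lu hole pages\n", inited);	/* 1024 */
	return 0;
}
```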
| 6551 | + |
---|
| 6552 | +static void __init memmap_init_zone_range(struct zone *zone, |
---|
| 6553 | + unsigned long start_pfn, |
---|
| 6554 | + unsigned long end_pfn, |
---|
| 6555 | + unsigned long *hole_pfn) |
---|
| 6556 | +{ |
---|
| 6557 | + unsigned long zone_start_pfn = zone->zone_start_pfn; |
---|
| 6558 | + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; |
---|
| 6559 | + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); |
---|
| 6560 | + |
---|
| 6561 | + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); |
---|
| 6562 | + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); |
---|
| 6563 | + |
---|
| 6564 | + if (start_pfn >= end_pfn) |
---|
| 6565 | + return; |
---|
| 6566 | + |
---|
| 6567 | + memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn, |
---|
| 6568 | + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); |
---|
| 6569 | + |
---|
| 6570 | + if (*hole_pfn < start_pfn) |
---|
| 6571 | + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); |
---|
| 6572 | + |
---|
| 6573 | + *hole_pfn = end_pfn; |
---|
| 6574 | +} |
---|
| 6575 | + |
---|
| 6576 | +void __init __weak memmap_init(void) |
---|
| 6577 | +{ |
---|
| 6578 | + unsigned long start_pfn, end_pfn; |
---|
| 6579 | + unsigned long hole_pfn = 0; |
---|
| 6580 | + int i, j, zone_id, nid; |
---|
| 6581 | + |
---|
| 6582 | + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
---|
| 6583 | + struct pglist_data *node = NODE_DATA(nid); |
---|
| 6584 | + |
---|
| 6585 | + for (j = 0; j < MAX_NR_ZONES; j++) { |
---|
| 6586 | + struct zone *zone = node->node_zones + j; |
---|
| 6587 | + |
---|
| 6588 | + if (!populated_zone(zone)) |
---|
| 6589 | + continue; |
---|
| 6590 | + |
---|
| 6591 | + memmap_init_zone_range(zone, start_pfn, end_pfn, |
---|
| 6592 | + &hole_pfn); |
---|
| 6593 | + zone_id = j; |
---|
| 6594 | + } |
---|
| 6595 | + } |
---|
| 6596 | + |
---|
| 6597 | +#ifdef CONFIG_SPARSEMEM |
---|
| 6598 | + /* |
---|
| 6599 | + * Initialize the memory map for the hole in the range [memory_end,
---|
| 6600 | + * section_end]. |
---|
| 6601 | + * Append the pages in this hole to the highest zone in the last |
---|
| 6602 | + * node. |
---|
| 6603 | + * The call to init_unavailable_range() is outside the ifdef to |
---|
| 6604 | + * silence the compiler warning about zone_id set but not used;
---|
| 6605 | + * for FLATMEM it is a nop anyway |
---|
| 6606 | + */ |
---|
| 6607 | + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); |
---|
| 6608 | + if (hole_pfn < end_pfn) |
---|
5842 | 6609 | #endif |
---|
| 6610 | + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); |
---|
| 6611 | +} |
---|
| 6612 | + |
---|
| 6613 | +/* A stub for backwards compatibility with custom implementation on IA-64 */
---|
| 6614 | +void __meminit __weak arch_memmap_init(unsigned long size, int nid, |
---|
| 6615 | + unsigned long zone, |
---|
| 6616 | + unsigned long range_start_pfn) |
---|
| 6617 | +{ |
---|
| 6618 | +} |
---|
5843 | 6619 | |
---|
5844 | 6620 | static int zone_batchsize(struct zone *zone) |
---|
5845 | 6621 | { |
---|
.. | .. |
---|
5850 | 6626 | * The per-cpu-pages pools are set to around 1000th of the |
---|
5851 | 6627 | * size of the zone. |
---|
5852 | 6628 | */ |
---|
5853 | | - batch = zone->managed_pages / 1024; |
---|
| 6629 | + batch = zone_managed_pages(zone) / 1024; |
---|
5854 | 6630 | /* But no more than a meg. */ |
---|
5855 | 6631 | if (batch * PAGE_SIZE > 1024 * 1024) |
---|
5856 | 6632 | batch = (1024 * 1024) / PAGE_SIZE; |
---|
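A rough worked example of the sizing shown above (the later scaling steps of zone_batchsize(), which are elided in this hunk, are not reproduced here): with 4 KiB pages, any zone of 1 GiB or more hits the 1 MiB cap before further scaling.

```c
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* 4 KiB pages */
	unsigned long managed = 4UL * 1024 * 1024;	/* hypothetical 16 GiB zone, in pages */
	unsigned long batch = managed / 1024;		/* ~0.1% of the zone: 4096 pages */

	if (batch * page_size > 1024 * 1024)		/* "but no more than a meg" */
		batch = (1024 * 1024) / page_size;

	printf("pcp batch before further scaling: %lu pages\n", batch);	/* 256 */
	return 0;
}
```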
.. | .. |
---|
5897 | 6673 | * locking. |
---|
5898 | 6674 | * |
---|
5899 | 6675 | * Any new users of pcp->batch and pcp->high should ensure they can cope with |
---|
5900 | | - * those fields changing asynchronously (acording the the above rule). |
---|
| 6676 | + * those fields changing asynchronously (according to the above rule).
---|
5901 | 6677 | * |
---|
5902 | 6678 | * mutex_is_locked(&pcp_batch_high_lock) required when calling this function |
---|
5903 | 6679 | * outside of boot time (or some other assurance that no concurrent updaters |
---|
.. | .. |
---|
5931 | 6707 | memset(p, 0, sizeof(*p)); |
---|
5932 | 6708 | |
---|
5933 | 6709 | pcp = &p->pcp; |
---|
5934 | | - pcp->count = 0; |
---|
5935 | 6710 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
---|
5936 | 6711 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
---|
5937 | 6712 | } |
---|
.. | .. |
---|
5961 | 6736 | { |
---|
5962 | 6737 | if (percpu_pagelist_fraction) |
---|
5963 | 6738 | pageset_set_high(pcp, |
---|
5964 | | - (zone->managed_pages / |
---|
| 6739 | + (zone_managed_pages(zone) / |
---|
5965 | 6740 | percpu_pagelist_fraction)); |
---|
5966 | 6741 | else |
---|
5967 | 6742 | pageset_set_batch(pcp, zone_batchsize(zone)); |
---|
.. | .. |
---|
5991 | 6766 | { |
---|
5992 | 6767 | struct pglist_data *pgdat; |
---|
5993 | 6768 | struct zone *zone; |
---|
| 6769 | + int __maybe_unused cpu; |
---|
5994 | 6770 | |
---|
5995 | 6771 | for_each_populated_zone(zone) |
---|
5996 | 6772 | setup_zone_pageset(zone); |
---|
| 6773 | + |
---|
| 6774 | +#ifdef CONFIG_NUMA |
---|
| 6775 | + /* |
---|
| 6776 | + * Unpopulated zones continue using the boot pagesets. |
---|
| 6777 | + * The numa stats for these pagesets need to be reset. |
---|
| 6778 | + * Otherwise, they will end up skewing the stats of |
---|
| 6779 | + * the nodes these zones are associated with. |
---|
| 6780 | + */ |
---|
| 6781 | + for_each_possible_cpu(cpu) { |
---|
| 6782 | + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); |
---|
| 6783 | + memset(pcp->vm_numa_stat_diff, 0, |
---|
| 6784 | + sizeof(pcp->vm_numa_stat_diff)); |
---|
| 6785 | + } |
---|
| 6786 | +#endif |
---|
5997 | 6787 | |
---|
5998 | 6788 | for_each_online_pgdat(pgdat) |
---|
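The rewritten handler above now receives a kernel buffer directly, so the old memdup_user_nul() round trip is gone. From userspace the knob is exercised the same way as before; a minimal sketch (the procfs file exists only on CONFIG_NUMA kernels, and since only node ordering is still implemented the write is effectively just validated):

```c
#include <stdio.h>

int main(void)
{
	/* Writing goes through numa_zonelist_order_handler() ->
	 * __parse_numa_zonelist_order(); reading goes through proc_dostring(). */
	FILE *f = fopen("/proc/sys/vm/numa_zonelist_order", "w");

	if (!f) {
		perror("vm.numa_zonelist_order");	/* non-NUMA kernel or no permission */
		return 1;
	}
	fputs("Node\n", f);
	fclose(f);
	return 0;
}
```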
5999 | 6789 | pgdat->per_cpu_nodestats = |
---|
.. | .. |
---|
6037 | 6827 | zone->initialized = 1; |
---|
6038 | 6828 | } |
---|
6039 | 6829 | |
---|
6040 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
6041 | | -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
---|
6042 | | - |
---|
6043 | | -/* |
---|
6044 | | - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
---|
6045 | | - */ |
---|
6046 | | -int __meminit __early_pfn_to_nid(unsigned long pfn, |
---|
6047 | | - struct mminit_pfnnid_cache *state) |
---|
6048 | | -{ |
---|
6049 | | - unsigned long start_pfn, end_pfn; |
---|
6050 | | - int nid; |
---|
6051 | | - |
---|
6052 | | - if (state->last_start <= pfn && pfn < state->last_end) |
---|
6053 | | - return state->last_nid; |
---|
6054 | | - |
---|
6055 | | - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
---|
6056 | | - if (nid != -1) { |
---|
6057 | | - state->last_start = start_pfn; |
---|
6058 | | - state->last_end = end_pfn; |
---|
6059 | | - state->last_nid = nid; |
---|
6060 | | - } |
---|
6061 | | - |
---|
6062 | | - return nid; |
---|
6063 | | -} |
---|
6064 | | -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
---|
6065 | | - |
---|
6066 | | -/** |
---|
6067 | | - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
---|
6068 | | - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
---|
6069 | | - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
---|
6070 | | - * |
---|
6071 | | - * If an architecture guarantees that all ranges registered contain no holes |
---|
6072 | | - * and may be freed, this this function may be used instead of calling |
---|
6073 | | - * memblock_free_early_nid() manually. |
---|
6074 | | - */ |
---|
6075 | | -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
---|
6076 | | -{ |
---|
6077 | | - unsigned long start_pfn, end_pfn; |
---|
6078 | | - int i, this_nid; |
---|
6079 | | - |
---|
6080 | | - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { |
---|
6081 | | - start_pfn = min(start_pfn, max_low_pfn); |
---|
6082 | | - end_pfn = min(end_pfn, max_low_pfn); |
---|
6083 | | - |
---|
6084 | | - if (start_pfn < end_pfn) |
---|
6085 | | - memblock_free_early_nid(PFN_PHYS(start_pfn), |
---|
6086 | | - (end_pfn - start_pfn) << PAGE_SHIFT, |
---|
6087 | | - this_nid); |
---|
6088 | | - } |
---|
6089 | | -} |
---|
6090 | | - |
---|
6091 | | -/** |
---|
6092 | | - * sparse_memory_present_with_active_regions - Call memory_present for each active range |
---|
6093 | | - * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
---|
6094 | | - * |
---|
6095 | | - * If an architecture guarantees that all ranges registered contain no holes and may |
---|
6096 | | - * be freed, this function may be used instead of calling memory_present() manually. |
---|
6097 | | - */ |
---|
6098 | | -void __init sparse_memory_present_with_active_regions(int nid) |
---|
6099 | | -{ |
---|
6100 | | - unsigned long start_pfn, end_pfn; |
---|
6101 | | - int i, this_nid; |
---|
6102 | | - |
---|
6103 | | - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) |
---|
6104 | | - memory_present(this_nid, start_pfn, end_pfn); |
---|
6105 | | -} |
---|
6106 | | - |
---|
6107 | 6830 | /** |
---|
6108 | 6831 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
---|
6109 | 6832 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
---|
.. | .. |
---|
6115 | 6838 | * with no available memory, a warning is printed and the start and end |
---|
6116 | 6839 | * PFNs will be 0. |
---|
6117 | 6840 | */ |
---|
6118 | | -void __meminit get_pfn_range_for_nid(unsigned int nid, |
---|
| 6841 | +void __init get_pfn_range_for_nid(unsigned int nid, |
---|
6119 | 6842 | unsigned long *start_pfn, unsigned long *end_pfn) |
---|
6120 | 6843 | { |
---|
6121 | 6844 | unsigned long this_start_pfn, this_end_pfn; |
---|
.. | .. |
---|
6164 | 6887 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
---|
6165 | 6888 | * zones within a node are in order of monotonic increases memory addresses |
---|
6166 | 6889 | */ |
---|
6167 | | -static void __meminit adjust_zone_range_for_zone_movable(int nid, |
---|
| 6890 | +static void __init adjust_zone_range_for_zone_movable(int nid, |
---|
6168 | 6891 | unsigned long zone_type, |
---|
6169 | 6892 | unsigned long node_start_pfn, |
---|
6170 | 6893 | unsigned long node_end_pfn, |
---|
.. | .. |
---|
6195 | 6918 | * Return the number of pages a zone spans in a node, including holes |
---|
6196 | 6919 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
---|
6197 | 6920 | */ |
---|
6198 | | -static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
---|
| 6921 | +static unsigned long __init zone_spanned_pages_in_node(int nid, |
---|
6199 | 6922 | unsigned long zone_type, |
---|
6200 | 6923 | unsigned long node_start_pfn, |
---|
6201 | 6924 | unsigned long node_end_pfn, |
---|
6202 | 6925 | unsigned long *zone_start_pfn, |
---|
6203 | | - unsigned long *zone_end_pfn, |
---|
6204 | | - unsigned long *ignored) |
---|
| 6926 | + unsigned long *zone_end_pfn) |
---|
6205 | 6927 | { |
---|
6206 | 6928 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
---|
6207 | 6929 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
---|
.. | .. |
---|
6232 | 6954 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
---|
6233 | 6955 | * then all holes in the requested range will be accounted for. |
---|
6234 | 6956 | */ |
---|
6235 | | -unsigned long __meminit __absent_pages_in_range(int nid, |
---|
| 6957 | +unsigned long __init __absent_pages_in_range(int nid, |
---|
6236 | 6958 | unsigned long range_start_pfn, |
---|
6237 | 6959 | unsigned long range_end_pfn) |
---|
6238 | 6960 | { |
---|
.. | .. |
---|
6253 | 6975 | * @start_pfn: The start PFN to start searching for holes |
---|
6254 | 6976 | * @end_pfn: The end PFN to stop searching for holes |
---|
6255 | 6977 | * |
---|
6256 | | - * It returns the number of pages frames in memory holes within a range. |
---|
| 6978 | + * Return: the number of page frames in memory holes within a range.
---|
6257 | 6979 | */ |
---|
6258 | 6980 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
---|
6259 | 6981 | unsigned long end_pfn) |
---|
.. | .. |
---|
6262 | 6984 | } |
---|
6263 | 6985 | |
---|
6264 | 6986 | /* Return the number of page frames in holes in a zone on a node */ |
---|
6265 | | -static unsigned long __meminit zone_absent_pages_in_node(int nid, |
---|
| 6987 | +static unsigned long __init zone_absent_pages_in_node(int nid, |
---|
6266 | 6988 | unsigned long zone_type, |
---|
6267 | 6989 | unsigned long node_start_pfn, |
---|
6268 | | - unsigned long node_end_pfn, |
---|
6269 | | - unsigned long *ignored) |
---|
| 6990 | + unsigned long node_end_pfn) |
---|
6270 | 6991 | { |
---|
6271 | 6992 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
---|
6272 | 6993 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
---|
.. | .. |
---|
6294 | 7015 | unsigned long start_pfn, end_pfn; |
---|
6295 | 7016 | struct memblock_region *r; |
---|
6296 | 7017 | |
---|
6297 | | - for_each_memblock(memory, r) { |
---|
| 7018 | + for_each_mem_region(r) { |
---|
6298 | 7019 | start_pfn = clamp(memblock_region_memory_base_pfn(r), |
---|
6299 | 7020 | zone_start_pfn, zone_end_pfn); |
---|
6300 | 7021 | end_pfn = clamp(memblock_region_memory_end_pfn(r), |
---|
.. | .. |
---|
6313 | 7034 | return nr_absent; |
---|
6314 | 7035 | } |
---|
6315 | 7036 | |
---|
6316 | | -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6317 | | -static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
---|
6318 | | - unsigned long zone_type, |
---|
6319 | | - unsigned long node_start_pfn, |
---|
6320 | | - unsigned long node_end_pfn, |
---|
6321 | | - unsigned long *zone_start_pfn, |
---|
6322 | | - unsigned long *zone_end_pfn, |
---|
6323 | | - unsigned long *zones_size) |
---|
6324 | | -{ |
---|
6325 | | - unsigned int zone; |
---|
6326 | | - |
---|
6327 | | - *zone_start_pfn = node_start_pfn; |
---|
6328 | | - for (zone = 0; zone < zone_type; zone++) |
---|
6329 | | - *zone_start_pfn += zones_size[zone]; |
---|
6330 | | - |
---|
6331 | | - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; |
---|
6332 | | - |
---|
6333 | | - return zones_size[zone_type]; |
---|
6334 | | -} |
---|
6335 | | - |
---|
6336 | | -static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
---|
6337 | | - unsigned long zone_type, |
---|
| 7037 | +static void __init calculate_node_totalpages(struct pglist_data *pgdat, |
---|
6338 | 7038 | unsigned long node_start_pfn, |
---|
6339 | | - unsigned long node_end_pfn, |
---|
6340 | | - unsigned long *zholes_size) |
---|
6341 | | -{ |
---|
6342 | | - if (!zholes_size) |
---|
6343 | | - return 0; |
---|
6344 | | - |
---|
6345 | | - return zholes_size[zone_type]; |
---|
6346 | | -} |
---|
6347 | | - |
---|
6348 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6349 | | - |
---|
6350 | | -static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
---|
6351 | | - unsigned long node_start_pfn, |
---|
6352 | | - unsigned long node_end_pfn, |
---|
6353 | | - unsigned long *zones_size, |
---|
6354 | | - unsigned long *zholes_size) |
---|
| 7039 | + unsigned long node_end_pfn) |
---|
6355 | 7040 | { |
---|
6356 | 7041 | unsigned long realtotalpages = 0, totalpages = 0; |
---|
6357 | 7042 | enum zone_type i; |
---|
.. | .. |
---|
6359 | 7044 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
6360 | 7045 | struct zone *zone = pgdat->node_zones + i; |
---|
6361 | 7046 | unsigned long zone_start_pfn, zone_end_pfn; |
---|
| 7047 | + unsigned long spanned, absent; |
---|
6362 | 7048 | unsigned long size, real_size; |
---|
6363 | 7049 | |
---|
6364 | | - size = zone_spanned_pages_in_node(pgdat->node_id, i, |
---|
6365 | | - node_start_pfn, |
---|
6366 | | - node_end_pfn, |
---|
6367 | | - &zone_start_pfn, |
---|
6368 | | - &zone_end_pfn, |
---|
6369 | | - zones_size); |
---|
6370 | | - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
---|
6371 | | - node_start_pfn, node_end_pfn, |
---|
6372 | | - zholes_size); |
---|
| 7050 | + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, |
---|
| 7051 | + node_start_pfn, |
---|
| 7052 | + node_end_pfn, |
---|
| 7053 | + &zone_start_pfn, |
---|
| 7054 | + &zone_end_pfn); |
---|
| 7055 | + absent = zone_absent_pages_in_node(pgdat->node_id, i, |
---|
| 7056 | + node_start_pfn, |
---|
| 7057 | + node_end_pfn); |
---|
| 7058 | + |
---|
| 7059 | + size = spanned; |
---|
| 7060 | + real_size = size - absent; |
---|
| 7061 | + |
---|
6373 | 7062 | if (size) |
---|
6374 | 7063 | zone->zone_start_pfn = zone_start_pfn; |
---|
6375 | 7064 | else |
---|
.. | .. |
---|
6415 | 7104 | { |
---|
6416 | 7105 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
---|
6417 | 7106 | zone->pageblock_flags = NULL; |
---|
6418 | | - if (usemapsize) |
---|
| 7107 | + if (usemapsize) { |
---|
6419 | 7108 | zone->pageblock_flags = |
---|
6420 | | - memblock_virt_alloc_node_nopanic(usemapsize, |
---|
6421 | | - pgdat->node_id); |
---|
| 7109 | + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, |
---|
| 7110 | + pgdat->node_id); |
---|
| 7111 | + if (!zone->pageblock_flags) |
---|
| 7112 | + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", |
---|
| 7113 | + usemapsize, zone->name, pgdat->node_id); |
---|
| 7114 | + } |
---|
6422 | 7115 | } |
---|
6423 | 7116 | #else |
---|
6424 | 7117 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
---|
.. | .. |
---|
6485 | 7178 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
---|
6486 | 7179 | static void pgdat_init_split_queue(struct pglist_data *pgdat) |
---|
6487 | 7180 | { |
---|
6488 | | - spin_lock_init(&pgdat->split_queue_lock); |
---|
6489 | | - INIT_LIST_HEAD(&pgdat->split_queue); |
---|
6490 | | - pgdat->split_queue_len = 0; |
---|
| 7181 | + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; |
---|
| 7182 | + |
---|
| 7183 | + spin_lock_init(&ds_queue->split_queue_lock); |
---|
| 7184 | + INIT_LIST_HEAD(&ds_queue->split_queue); |
---|
| 7185 | + ds_queue->split_queue_len = 0; |
---|
6491 | 7186 | } |
---|
6492 | 7187 | #else |
---|
6493 | 7188 | static void pgdat_init_split_queue(struct pglist_data *pgdat) {} |
---|
.. | .. |
---|
6514 | 7209 | |
---|
6515 | 7210 | pgdat_page_ext_init(pgdat); |
---|
6516 | 7211 | spin_lock_init(&pgdat->lru_lock); |
---|
6517 | | - lruvec_init(node_lruvec(pgdat)); |
---|
| 7212 | + lruvec_init(&pgdat->__lruvec); |
---|
6518 | 7213 | } |
---|
6519 | 7214 | |
---|
6520 | 7215 | static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, |
---|
6521 | 7216 | unsigned long remaining_pages) |
---|
6522 | 7217 | { |
---|
6523 | | - zone->managed_pages = remaining_pages; |
---|
| 7218 | + atomic_long_set(&zone->managed_pages, remaining_pages); |
---|
6524 | 7219 | zone_set_nid(zone, nid); |
---|
6525 | 7220 | zone->name = zone_names[idx]; |
---|
6526 | 7221 | zone->zone_pgdat = NODE_DATA(nid); |
---|
.. | .. |
---|
6618 | 7313 | set_pageblock_order(); |
---|
6619 | 7314 | setup_usemap(pgdat, zone, zone_start_pfn, size); |
---|
6620 | 7315 | init_currently_empty_zone(zone, zone_start_pfn, size); |
---|
6621 | | - memmap_init(size, nid, j, zone_start_pfn); |
---|
| 7316 | + arch_memmap_init(size, nid, j, zone_start_pfn); |
---|
6622 | 7317 | } |
---|
6623 | 7318 | } |
---|
6624 | 7319 | |
---|
.. | .. |
---|
6647 | 7342 | end = pgdat_end_pfn(pgdat); |
---|
6648 | 7343 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
---|
6649 | 7344 | size = (end - start) * sizeof(struct page); |
---|
6650 | | - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
---|
| 7345 | + map = memblock_alloc_node(size, SMP_CACHE_BYTES, |
---|
| 7346 | + pgdat->node_id); |
---|
| 7347 | + if (!map) |
---|
| 7348 | + panic("Failed to allocate %ld bytes for node %d memory map\n", |
---|
| 7349 | + size, pgdat->node_id); |
---|
6651 | 7350 | pgdat->node_mem_map = map + offset; |
---|
6652 | 7351 | } |
---|
6653 | 7352 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", |
---|
.. | .. |
---|
6659 | 7358 | */ |
---|
6660 | 7359 | if (pgdat == NODE_DATA(0)) { |
---|
6661 | 7360 | mem_map = NODE_DATA(0)->node_mem_map; |
---|
6662 | | -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) |
---|
6663 | 7361 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
---|
6664 | 7362 | mem_map -= offset; |
---|
6665 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
6666 | 7363 | } |
---|
6667 | 7364 | #endif |
---|
6668 | 7365 | } |
---|
.. | .. |
---|
6673 | 7370 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
---|
6674 | 7371 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) |
---|
6675 | 7372 | { |
---|
6676 | | - /* |
---|
6677 | | - * We start only with one section of pages, more pages are added as |
---|
6678 | | - * needed until the rest of deferred pages are initialized. |
---|
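The reclaimable-memory heuristic above counts reclaimable kernel memory toward MemAvailable but assumes part of it is in active use; that part is estimated as half of the reclaimable total, capped at the low watermark. A worked example with made-up page counts:

```c
#include <stdio.h>

static long min_l(long a, long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical figures, all in pages. */
	long available   = 100000;	/* free pages + reclaimable pagecache so far */
	long reclaimable = 30000;	/* reclaimable slab + misc reclaimable */
	long wmark_low   = 10000;	/* sum of the zones' low watermarks */

	/* available += reclaimable - min(reclaimable / 2, wmark_low) */
	available += reclaimable - min_l(reclaimable / 2, wmark_low);

	printf("estimated MemAvailable: %ld pages\n", available);	/* 120000 */
	return 0;
}
```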
6679 | | - */ |
---|
6680 | | - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, |
---|
6681 | | - pgdat->node_spanned_pages); |
---|
6682 | 7373 | pgdat->first_deferred_pfn = ULONG_MAX; |
---|
6683 | 7374 | } |
---|
6684 | 7375 | #else |
---|
6685 | 7376 | static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} |
---|
6686 | 7377 | #endif |
---|
6687 | 7378 | |
---|
6688 | | -void __init free_area_init_node(int nid, unsigned long *zones_size, |
---|
6689 | | - unsigned long node_start_pfn, |
---|
6690 | | - unsigned long *zholes_size) |
---|
| 7379 | +static void __init free_area_init_node(int nid) |
---|
6691 | 7380 | { |
---|
6692 | 7381 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
6693 | 7382 | unsigned long start_pfn = 0; |
---|
6694 | 7383 | unsigned long end_pfn = 0; |
---|
6695 | 7384 | |
---|
6696 | 7385 | /* pg_data_t should be reset to zero when it's allocated */ |
---|
6697 | | - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); |
---|
| 7386 | + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); |
---|
| 7387 | + |
---|
| 7388 | + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
6698 | 7389 | |
---|
6699 | 7390 | pgdat->node_id = nid; |
---|
6700 | | - pgdat->node_start_pfn = node_start_pfn; |
---|
| 7391 | + pgdat->node_start_pfn = start_pfn; |
---|
6701 | 7392 | pgdat->per_cpu_nodestats = NULL; |
---|
6702 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
6703 | | - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
| 7393 | + |
---|
6704 | 7394 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
---|
6705 | 7395 | (u64)start_pfn << PAGE_SHIFT, |
---|
6706 | 7396 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); |
---|
6707 | | -#else |
---|
6708 | | - start_pfn = node_start_pfn; |
---|
6709 | | -#endif |
---|
6710 | | - calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
---|
6711 | | - zones_size, zholes_size); |
---|
| 7397 | + calculate_node_totalpages(pgdat, start_pfn, end_pfn); |
---|
6712 | 7398 | |
---|
6713 | 7399 | alloc_node_mem_map(pgdat); |
---|
6714 | 7400 | pgdat_set_deferred_range(pgdat); |
---|
.. | .. |
---|
6716 | 7402 | free_area_init_core(pgdat); |
---|
6717 | 7403 | } |
---|
6718 | 7404 | |
---|
6719 | | -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) |
---|
6720 | | - |
---|
6721 | | -/* |
---|
6722 | | - * Zero all valid struct pages in range [spfn, epfn), return number of struct |
---|
6723 | | - * pages zeroed |
---|
6724 | | - */ |
---|
6725 | | -static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) |
---|
| 7405 | +void __init free_area_init_memoryless_node(int nid) |
---|
6726 | 7406 | { |
---|
6727 | | - unsigned long pfn; |
---|
6728 | | - u64 pgcnt = 0; |
---|
6729 | | - |
---|
6730 | | - for (pfn = spfn; pfn < epfn; pfn++) { |
---|
6731 | | - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { |
---|
6732 | | - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) |
---|
6733 | | - + pageblock_nr_pages - 1; |
---|
6734 | | - continue; |
---|
6735 | | - } |
---|
6736 | | - mm_zero_struct_page(pfn_to_page(pfn)); |
---|
6737 | | - pgcnt++; |
---|
6738 | | - } |
---|
6739 | | - |
---|
6740 | | - return pgcnt; |
---|
| 7407 | + free_area_init_node(nid); |
---|
6741 | 7408 | } |
---|
6742 | | - |
---|
6743 | | -/* |
---|
6744 | | - * Only struct pages that are backed by physical memory are zeroed and |
---|
6745 | | - * initialized by going through __init_single_page(). But, there are some |
---|
6746 | | - * struct pages which are reserved in memblock allocator and their fields |
---|
6747 | | - * may be accessed (for example page_to_pfn() on some configuration accesses |
---|
6748 | | - * flags). We must explicitly zero those struct pages. |
---|
6749 | | - * |
---|
6750 | | - * This function also addresses a similar issue where struct pages are left |
---|
6751 | | - * uninitialized because the physical address range is not covered by |
---|
6752 | | - * memblock.memory or memblock.reserved. That could happen when memblock |
---|
6753 | | - * layout is manually configured via memmap=, or when the highest physical |
---|
6754 | | - * address (max_pfn) does not end on a section boundary. |
---|
6755 | | - */ |
---|
6756 | | -void __init zero_resv_unavail(void) |
---|
6757 | | -{ |
---|
6758 | | - phys_addr_t start, end; |
---|
6759 | | - u64 i, pgcnt; |
---|
6760 | | - phys_addr_t next = 0; |
---|
6761 | | - |
---|
6762 | | - /* |
---|
6763 | | - * Loop through unavailable ranges not covered by memblock.memory. |
---|
6764 | | - */ |
---|
6765 | | - pgcnt = 0; |
---|
6766 | | - for_each_mem_range(i, &memblock.memory, NULL, |
---|
6767 | | - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { |
---|
6768 | | - if (next < start) |
---|
6769 | | - pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); |
---|
6770 | | - next = end; |
---|
6771 | | - } |
---|
6772 | | - |
---|
6773 | | - /* |
---|
6774 | | - * Early sections always have a fully populated memmap for the whole |
---|
6775 | | - * section - see pfn_valid(). If the last section has holes at the |
---|
6776 | | - * end and that section is marked "online", the memmap will be |
---|
6777 | | - * considered initialized. Make sure that memmap has a well defined |
---|
6778 | | - * state. |
---|
6779 | | - */ |
---|
6780 | | - pgcnt += zero_pfn_range(PFN_DOWN(next), |
---|
6781 | | - round_up(max_pfn, PAGES_PER_SECTION)); |
---|
6782 | | - |
---|
6783 | | - /* |
---|
6784 | | - * Struct pages that do not have backing memory. This could be because |
---|
6785 | | - * firmware is using some of this memory, or for some other reasons. |
---|
6786 | | - */ |
---|
6787 | | - if (pgcnt) |
---|
6788 | | - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); |
---|
6789 | | -} |
---|
6790 | | -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ |
---|
6791 | | - |
---|
6792 | | -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
---|
6793 | 7409 | |
---|
6794 | 7410 | #if MAX_NUMNODES > 1 |
---|
6795 | 7411 | /* |
---|
.. | .. |
---|
6820 | 7436 | * model has fine enough granularity to avoid incorrect mapping for the |
---|
6821 | 7437 | * populated node map. |
---|
6822 | 7438 | * |
---|
6823 | | - * Returns the determined alignment in pfn's. 0 if there is no alignment |
---|
| 7439 | + * Return: the determined alignment in pfn's. 0 if there is no alignment |
---|
6824 | 7440 | * requirement (single node). |
---|
6825 | 7441 | */ |
---|
6826 | 7442 | unsigned long __init node_map_pfn_alignment(void) |
---|
6827 | 7443 | { |
---|
6828 | 7444 | unsigned long accl_mask = 0, last_end = 0; |
---|
6829 | 7445 | unsigned long start, end, mask; |
---|
6830 | | - int last_nid = -1; |
---|
| 7446 | + int last_nid = NUMA_NO_NODE; |
---|
6831 | 7447 | int i, nid; |
---|
6832 | 7448 | |
---|
6833 | 7449 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { |
---|
.. | .. |
---|
6854 | 7470 | return ~accl_mask + 1; |
---|
6855 | 7471 | } |
---|
6856 | 7472 | |
---|
6857 | | -/* Find the lowest pfn for a node */ |
---|
6858 | | -static unsigned long __init find_min_pfn_for_node(int nid) |
---|
6859 | | -{ |
---|
6860 | | - unsigned long min_pfn = ULONG_MAX; |
---|
6861 | | - unsigned long start_pfn; |
---|
6862 | | - int i; |
---|
6863 | | - |
---|
6864 | | - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) |
---|
6865 | | - min_pfn = min(min_pfn, start_pfn); |
---|
6866 | | - |
---|
6867 | | - if (min_pfn == ULONG_MAX) { |
---|
6868 | | - pr_warn("Could not find start_pfn for node %d\n", nid); |
---|
6869 | | - return 0; |
---|
6870 | | - } |
---|
6871 | | - |
---|
6872 | | - return min_pfn; |
---|
6873 | | -} |
---|
6874 | | - |
---|
6875 | 7473 | /** |
---|
6876 | 7474 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
---|
6877 | 7475 | * |
---|
6878 | | - * It returns the minimum PFN based on information provided via |
---|
| 7476 | + * Return: the minimum PFN based on information provided via |
---|
6879 | 7477 | * memblock_set_node(). |
---|
6880 | 7478 | */ |
---|
6881 | 7479 | unsigned long __init find_min_pfn_with_active_regions(void) |
---|
6882 | 7480 | { |
---|
6883 | | - return find_min_pfn_for_node(MAX_NUMNODES); |
---|
| 7481 | + return PHYS_PFN(memblock_start_of_DRAM()); |
---|
6884 | 7482 | } |
---|
6885 | 7483 | |
---|
6886 | 7484 | /* |
---|
.. | .. |
---|
6929 | 7527 | * options. |
---|
6930 | 7528 | */ |
---|
6931 | 7529 | if (movable_node_is_enabled()) { |
---|
6932 | | - for_each_memblock(memory, r) { |
---|
| 7530 | + for_each_mem_region(r) { |
---|
6933 | 7531 | if (!memblock_is_hotpluggable(r)) |
---|
6934 | 7532 | continue; |
---|
6935 | 7533 | |
---|
6936 | | - nid = r->nid; |
---|
| 7534 | + nid = memblock_get_region_node(r); |
---|
6937 | 7535 | |
---|
6938 | 7536 | usable_startpfn = PFN_DOWN(r->base); |
---|
6939 | 7537 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? |
---|
.. | .. |
---|
6950 | 7548 | if (mirrored_kernelcore) { |
---|
6951 | 7549 | bool mem_below_4gb_not_mirrored = false; |
---|
6952 | 7550 | |
---|
6953 | | - for_each_memblock(memory, r) { |
---|
| 7551 | + for_each_mem_region(r) { |
---|
6954 | 7552 | if (memblock_is_mirror(r)) |
---|
6955 | 7553 | continue; |
---|
6956 | 7554 | |
---|
6957 | | - nid = r->nid; |
---|
| 7555 | + nid = memblock_get_region_node(r); |
---|
6958 | 7556 | |
---|
6959 | 7557 | usable_startpfn = memblock_region_memory_base_pfn(r); |
---|
6960 | 7558 | |
---|
.. | .. |
---|
6969 | 7567 | } |
---|
6970 | 7568 | |
---|
6971 | 7569 | if (mem_below_4gb_not_mirrored) |
---|
6972 | | - pr_warn("This configuration results in unmirrored kernel memory."); |
---|
| 7570 | + pr_warn("This configuration results in unmirrored kernel memory.\n"); |
---|
6973 | 7571 | |
---|
6974 | 7572 | goto out2; |
---|
6975 | 7573 | } |
---|
.. | .. |
---|
7108 | 7706 | |
---|
7109 | 7707 | out2: |
---|
7110 | 7708 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
---|
7111 | | - for (nid = 0; nid < MAX_NUMNODES; nid++) |
---|
| 7709 | + for (nid = 0; nid < MAX_NUMNODES; nid++) { |
---|
| 7710 | + unsigned long start_pfn, end_pfn; |
---|
| 7711 | + |
---|
7112 | 7712 | zone_movable_pfn[nid] = |
---|
7113 | 7713 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
---|
| 7714 | + |
---|
| 7715 | + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
---|
| 7716 | + if (zone_movable_pfn[nid] >= end_pfn) |
---|
| 7717 | + zone_movable_pfn[nid] = 0; |
---|
| 7718 | + } |
---|
7114 | 7719 | |
---|
7115 | 7720 | out: |
---|
7116 | 7721 | /* restore the node_state */ |
---|
.. | .. |
---|
7122 | 7727 | { |
---|
7123 | 7728 | enum zone_type zone_type; |
---|
7124 | 7729 | |
---|
7125 | | - if (N_MEMORY == N_NORMAL_MEMORY) |
---|
7126 | | - return; |
---|
7127 | | - |
---|
7128 | 7730 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { |
---|
7129 | 7731 | struct zone *zone = &pgdat->node_zones[zone_type]; |
---|
7130 | 7732 | if (populated_zone(zone)) { |
---|
7131 | | - node_set_state(nid, N_HIGH_MEMORY); |
---|
7132 | | - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && |
---|
7133 | | - zone_type <= ZONE_NORMAL) |
---|
| 7733 | + if (IS_ENABLED(CONFIG_HIGHMEM)) |
---|
| 7734 | + node_set_state(nid, N_HIGH_MEMORY); |
---|
| 7735 | + if (zone_type <= ZONE_NORMAL) |
---|
7134 | 7736 | node_set_state(nid, N_NORMAL_MEMORY); |
---|
7135 | 7737 | break; |
---|
7136 | 7738 | } |
---|
7137 | 7739 | } |
---|
7138 | 7740 | } |
---|
7139 | 7741 | |
---|
| 7742 | +/* |
---|
| 7743 | + * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
---|
| 7744 | + * such cases we allow max_zone_pfn to be sorted in descending order
---|
| 7745 | + */ |
---|
| 7746 | +bool __weak arch_has_descending_max_zone_pfns(void) |
---|
| 7747 | +{ |
---|
| 7748 | + return false; |
---|
| 7749 | +} |
---|
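
A minimal sketch of the strong override an architecture with such a zone layout might provide; the CONFIG_HIGHMEM test below is purely an illustrative assumption, not the real ARC implementation:

bool arch_has_descending_max_zone_pfns(void)
{
        /* Hypothetical: this arch lays its zones out highest-zone-first. */
        return IS_ENABLED(CONFIG_HIGHMEM);
}
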
| 7750 | + |
---|
7140 | 7751 | /** |
---|
7141 | | - * free_area_init_nodes - Initialise all pg_data_t and zone data |
---|
| 7752 | + * free_area_init - Initialise all pg_data_t and zone data |
---|
7142 | 7753 | * @max_zone_pfn: an array of max PFNs for each zone |
---|
7143 | 7754 | * |
---|
7144 | 7755 | * This will call free_area_init_node() for each active node in the system. |
---|
.. | .. |
---|
7150 | 7761 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
---|
7151 | 7762 | * at arch_max_dma_pfn. |
---|
7152 | 7763 | */ |
---|
7153 | | -void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
---|
| 7764 | +void __init free_area_init(unsigned long *max_zone_pfn) |
---|
7154 | 7765 | { |
---|
7155 | 7766 | unsigned long start_pfn, end_pfn; |
---|
7156 | | - int i, nid; |
---|
| 7767 | + int i, nid, zone; |
---|
| 7768 | + bool descending; |
---|
7157 | 7769 | |
---|
7158 | 7770 | /* Record where the zone boundaries are */ |
---|
7159 | 7771 | memset(arch_zone_lowest_possible_pfn, 0, |
---|
.. | .. |
---|
7162 | 7774 | sizeof(arch_zone_highest_possible_pfn)); |
---|
7163 | 7775 | |
---|
7164 | 7776 | start_pfn = find_min_pfn_with_active_regions(); |
---|
| 7777 | + descending = arch_has_descending_max_zone_pfns(); |
---|
7165 | 7778 | |
---|
7166 | 7779 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
7167 | | - if (i == ZONE_MOVABLE) |
---|
| 7780 | + if (descending) |
---|
| 7781 | + zone = MAX_NR_ZONES - i - 1; |
---|
| 7782 | + else |
---|
| 7783 | + zone = i; |
---|
| 7784 | + |
---|
| 7785 | + if (zone == ZONE_MOVABLE) |
---|
7168 | 7786 | continue; |
---|
7169 | 7787 | |
---|
7170 | | - end_pfn = max(max_zone_pfn[i], start_pfn); |
---|
7171 | | - arch_zone_lowest_possible_pfn[i] = start_pfn; |
---|
7172 | | - arch_zone_highest_possible_pfn[i] = end_pfn; |
---|
| 7788 | + end_pfn = max(max_zone_pfn[zone], start_pfn); |
---|
| 7789 | + arch_zone_lowest_possible_pfn[zone] = start_pfn; |
---|
| 7790 | + arch_zone_highest_possible_pfn[zone] = end_pfn; |
---|
7173 | 7791 | |
---|
7174 | 7792 | start_pfn = end_pfn; |
---|
7175 | 7793 | } |
---|
.. | .. |
---|
7203 | 7821 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
---|
7204 | 7822 | } |
---|
7205 | 7823 | |
---|
7206 | | - /* Print out the early node map */ |
---|
| 7824 | + /* |
---|
| 7825 | + * Print out the early node map, and initialize the |
---|
| 7826 | + * subsection-map relative to active online memory ranges to |
---|
| 7827 | + * enable future "sub-section" extensions of the memory map. |
---|
| 7828 | + */ |
---|
7207 | 7829 | pr_info("Early memory node ranges\n"); |
---|
7208 | | - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
---|
| 7830 | + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
---|
7209 | 7831 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
---|
7210 | 7832 | (u64)start_pfn << PAGE_SHIFT, |
---|
7211 | 7833 | ((u64)end_pfn << PAGE_SHIFT) - 1); |
---|
| 7834 | + subsection_map_init(start_pfn, end_pfn - start_pfn); |
---|
| 7835 | + } |
---|
7212 | 7836 | |
---|
7213 | 7837 | /* Initialise every node */ |
---|
7214 | 7838 | mminit_verify_pageflags_layout(); |
---|
7215 | 7839 | setup_nr_node_ids(); |
---|
7216 | | - zero_resv_unavail(); |
---|
7217 | 7840 | for_each_online_node(nid) { |
---|
7218 | 7841 | pg_data_t *pgdat = NODE_DATA(nid); |
---|
7219 | | - free_area_init_node(nid, NULL, |
---|
7220 | | - find_min_pfn_for_node(nid), NULL); |
---|
| 7842 | + free_area_init_node(nid); |
---|
7221 | 7843 | |
---|
7222 | 7844 | /* Any memory on that node */ |
---|
7223 | 7845 | if (pgdat->node_present_pages) |
---|
7224 | 7846 | node_set_state(nid, N_MEMORY); |
---|
7225 | 7847 | check_for_memory(pgdat, nid); |
---|
7226 | 7848 | } |
---|
| 7849 | + |
---|
| 7850 | + memmap_init(); |
---|
7227 | 7851 | } |
---|
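
For context, a hedged sketch of the architecture-side caller this interface expects: the arch fills max_zone_pfns[] with the top PFN of each zone and hands the array over. The function name zone_sizes_init(), MAX_DMA32_PFN and the use of max_pfn are illustrative assumptions, not any particular architecture's code:

static void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

        /* Record the highest PFN each zone may extend to, lowest zone first. */
#ifdef CONFIG_ZONE_DMA32
        max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_pfn);
#endif
        max_zone_pfns[ZONE_NORMAL] = max_pfn;

        free_area_init(max_zone_pfns);
}
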
7228 | 7852 | |
---|
7229 | 7853 | static int __init cmdline_parse_core(char *p, unsigned long *core, |
---|
.. | .. |
---|
7282 | 7906 | early_param("kernelcore", cmdline_parse_kernelcore); |
---|
7283 | 7907 | early_param("movablecore", cmdline_parse_movablecore); |
---|
7284 | 7908 | |
---|
7285 | | -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
---|
7286 | | - |
---|
7287 | 7909 | void adjust_managed_page_count(struct page *page, long count) |
---|
7288 | 7910 | { |
---|
7289 | | - spin_lock(&managed_page_count_lock); |
---|
7290 | | - page_zone(page)->managed_pages += count; |
---|
7291 | | - totalram_pages += count; |
---|
| 7911 | + atomic_long_add(count, &page_zone(page)->managed_pages); |
---|
| 7912 | + totalram_pages_add(count); |
---|
7292 | 7913 | #ifdef CONFIG_HIGHMEM |
---|
7293 | 7914 | if (PageHighMem(page)) |
---|
7294 | | - totalhigh_pages += count; |
---|
| 7915 | + totalhigh_pages_add(count); |
---|
7295 | 7916 | #endif |
---|
7296 | | - spin_unlock(&managed_page_count_lock); |
---|
7297 | 7917 | } |
---|
7298 | 7918 | EXPORT_SYMBOL(adjust_managed_page_count); |
---|
7299 | 7919 | |
---|
7300 | | -unsigned long free_reserved_area(void *start, void *end, int poison, char *s) |
---|
| 7920 | +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) |
---|
7301 | 7921 | { |
---|
7302 | 7922 | void *pos; |
---|
7303 | 7923 | unsigned long pages = 0; |
---|
.. | .. |
---|
7316 | 7936 | * alias for the memset(). |
---|
7317 | 7937 | */ |
---|
7318 | 7938 | direct_map_addr = page_address(page); |
---|
| 7939 | + /* |
---|
| 7940 | + * Perform a kasan-unchecked memset() since this memory |
---|
| 7941 | + * has not been initialized. |
---|
| 7942 | + */ |
---|
| 7943 | + direct_map_addr = kasan_reset_tag(direct_map_addr); |
---|
7319 | 7944 | if ((unsigned int)poison <= 0xFF) |
---|
7320 | 7945 | memset(direct_map_addr, poison, PAGE_SIZE); |
---|
7321 | 7946 | |
---|
.. | .. |
---|
7328 | 7953 | |
---|
7329 | 7954 | return pages; |
---|
7330 | 7955 | } |
---|
7331 | | -EXPORT_SYMBOL(free_reserved_area); |
---|
7332 | 7956 | |
---|
7333 | 7957 | #ifdef CONFIG_HIGHMEM |
---|
7334 | 7958 | void free_highmem_page(struct page *page) |
---|
7335 | 7959 | { |
---|
7336 | 7960 | __free_reserved_page(page); |
---|
7337 | | - totalram_pages++; |
---|
7338 | | - page_zone(page)->managed_pages++; |
---|
7339 | | - totalhigh_pages++; |
---|
| 7961 | + totalram_pages_inc(); |
---|
| 7962 | + atomic_long_inc(&page_zone(page)->managed_pages); |
---|
| 7963 | + totalhigh_pages_inc(); |
---|
7340 | 7964 | } |
---|
7341 | 7965 | #endif |
---|
7342 | 7966 | |
---|
.. | .. |
---|
7363 | 7987 | */ |
---|
7364 | 7988 | #define adj_init_size(start, end, size, pos, adj) \ |
---|
7365 | 7989 | do { \ |
---|
7366 | | - if (start <= pos && pos < end && size > adj) \ |
---|
| 7990 | + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ |
---|
7367 | 7991 | size -= adj; \ |
---|
7368 | 7992 | } while (0) |
---|
7369 | 7993 | |
---|
.. | .. |
---|
7385 | 8009 | physpages << (PAGE_SHIFT - 10), |
---|
7386 | 8010 | codesize >> 10, datasize >> 10, rosize >> 10, |
---|
7387 | 8011 | (init_data_size + init_code_size) >> 10, bss_size >> 10, |
---|
7388 | | - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), |
---|
| 8012 | + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), |
---|
7389 | 8013 | totalcma_pages << (PAGE_SHIFT - 10), |
---|
7390 | 8014 | #ifdef CONFIG_HIGHMEM |
---|
7391 | | - totalhigh_pages << (PAGE_SHIFT - 10), |
---|
| 8015 | + totalhigh_pages() << (PAGE_SHIFT - 10), |
---|
7392 | 8016 | #endif |
---|
7393 | 8017 | str ? ", " : "", str ? str : ""); |
---|
7394 | 8018 | } |
---|
.. | .. |
---|
7409 | 8033 | dma_reserve = new_dma_reserve; |
---|
7410 | 8034 | } |
---|
7411 | 8035 | |
---|
7412 | | -void __init free_area_init(unsigned long *zones_size) |
---|
7413 | | -{ |
---|
7414 | | - zero_resv_unavail(); |
---|
7415 | | - free_area_init_node(0, zones_size, |
---|
7416 | | - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
---|
7417 | | -} |
---|
7418 | | - |
---|
7419 | 8036 | static int page_alloc_cpu_dead(unsigned int cpu) |
---|
7420 | 8037 | { |
---|
7421 | | - local_lock_irq_on(swapvec_lock, cpu); |
---|
| 8038 | + |
---|
7422 | 8039 | lru_add_drain_cpu(cpu); |
---|
7423 | | - local_unlock_irq_on(swapvec_lock, cpu); |
---|
7424 | 8040 | drain_pages(cpu); |
---|
7425 | 8041 | |
---|
7426 | 8042 | /* |
---|
.. | .. |
---|
7442 | 8058 | return 0; |
---|
7443 | 8059 | } |
---|
7444 | 8060 | |
---|
| 8061 | +#ifdef CONFIG_NUMA |
---|
| 8062 | +int hashdist = HASHDIST_DEFAULT; |
---|
| 8063 | + |
---|
| 8064 | +static int __init set_hashdist(char *str) |
---|
| 8065 | +{ |
---|
| 8066 | + if (!str) |
---|
| 8067 | + return 0; |
---|
| 8068 | + hashdist = simple_strtoul(str, &str, 0); |
---|
| 8069 | + return 1; |
---|
| 8070 | +} |
---|
| 8071 | +__setup("hashdist=", set_hashdist); |
---|
| 8072 | +#endif |
---|
| 8073 | + |
---|
7445 | 8074 | void __init page_alloc_init(void) |
---|
7446 | 8075 | { |
---|
7447 | 8076 | int ret; |
---|
| 8077 | + |
---|
| 8078 | +#ifdef CONFIG_NUMA |
---|
| 8079 | + if (num_node_state(N_MEMORY) == 1) |
---|
| 8080 | + hashdist = 0; |
---|
| 8081 | +#endif |
---|
7448 | 8082 | |
---|
7449 | 8083 | ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, |
---|
7450 | 8084 | "mm/page_alloc:dead", NULL, |
---|
.. | .. |
---|
7469 | 8103 | for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
7470 | 8104 | struct zone *zone = pgdat->node_zones + i; |
---|
7471 | 8105 | long max = 0; |
---|
| 8106 | + unsigned long managed_pages = zone_managed_pages(zone); |
---|
7472 | 8107 | |
---|
7473 | 8108 | /* Find valid and maximum lowmem_reserve in the zone */ |
---|
7474 | 8109 | for (j = i; j < MAX_NR_ZONES; j++) { |
---|
.. | .. |
---|
7479 | 8114 | /* we treat the high watermark as reserved pages. */ |
---|
7480 | 8115 | max += high_wmark_pages(zone); |
---|
7481 | 8116 | |
---|
7482 | | - if (max > zone->managed_pages) |
---|
7483 | | - max = zone->managed_pages; |
---|
| 8117 | + if (max > managed_pages) |
---|
| 8118 | + max = managed_pages; |
---|
7484 | 8119 | |
---|
7485 | 8120 | pgdat->totalreserve_pages += max; |
---|
7486 | 8121 | |
---|
.. | .. |
---|
7499 | 8134 | static void setup_per_zone_lowmem_reserve(void) |
---|
7500 | 8135 | { |
---|
7501 | 8136 | struct pglist_data *pgdat; |
---|
7502 | | - enum zone_type j, idx; |
---|
| 8137 | + enum zone_type i, j; |
---|
7503 | 8138 | |
---|
7504 | 8139 | for_each_online_pgdat(pgdat) { |
---|
7505 | | - for (j = 0; j < MAX_NR_ZONES; j++) { |
---|
7506 | | - struct zone *zone = pgdat->node_zones + j; |
---|
7507 | | - unsigned long managed_pages = zone->managed_pages; |
---|
| 8140 | + for (i = 0; i < MAX_NR_ZONES - 1; i++) { |
---|
| 8141 | + struct zone *zone = &pgdat->node_zones[i]; |
---|
| 8142 | + int ratio = sysctl_lowmem_reserve_ratio[i]; |
---|
| 8143 | + bool clear = !ratio || !zone_managed_pages(zone); |
---|
| 8144 | + unsigned long managed_pages = 0; |
---|
7508 | 8145 | |
---|
7509 | | - zone->lowmem_reserve[j] = 0; |
---|
| 8146 | + for (j = i + 1; j < MAX_NR_ZONES; j++) { |
---|
| 8147 | + struct zone *upper_zone = &pgdat->node_zones[j]; |
---|
7510 | 8148 | |
---|
7511 | | - idx = j; |
---|
7512 | | - while (idx) { |
---|
7513 | | - struct zone *lower_zone; |
---|
| 8149 | + managed_pages += zone_managed_pages(upper_zone); |
---|
7514 | 8150 | |
---|
7515 | | - idx--; |
---|
7516 | | - lower_zone = pgdat->node_zones + idx; |
---|
7517 | | - |
---|
7518 | | - if (sysctl_lowmem_reserve_ratio[idx] < 1) { |
---|
7519 | | - sysctl_lowmem_reserve_ratio[idx] = 0; |
---|
7520 | | - lower_zone->lowmem_reserve[j] = 0; |
---|
7521 | | - } else { |
---|
7522 | | - lower_zone->lowmem_reserve[j] = |
---|
7523 | | - managed_pages / sysctl_lowmem_reserve_ratio[idx]; |
---|
7524 | | - } |
---|
7525 | | - managed_pages += lower_zone->managed_pages; |
---|
| 8151 | + if (clear) |
---|
| 8152 | + zone->lowmem_reserve[j] = 0; |
---|
| 8153 | + else |
---|
| 8154 | + zone->lowmem_reserve[j] = managed_pages / ratio; |
---|
7526 | 8155 | } |
---|
7527 | 8156 | } |
---|
7528 | 8157 | } |
---|
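
A worked illustration of the rewritten accumulation (zone sizes assumed, not real): for a zone whose sysctl_lowmem_reserve_ratio is 32, a single higher zone with 1,024,000 managed pages yields lowmem_reserve = 1024000 / 32 = 32,000 pages; a second, still higher zone with 512,000 managed pages then gets the cumulative value (1024000 + 512000) / 32 = 48,000 pages. A ratio of 0, or a lower zone with no managed pages, clears the corresponding reserve instead.
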
.. | .. |
---|
7542 | 8171 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
---|
7543 | 8172 | for_each_zone(zone) { |
---|
7544 | 8173 | if (!is_highmem(zone)) |
---|
7545 | | - lowmem_pages += zone->managed_pages; |
---|
| 8174 | + lowmem_pages += zone_managed_pages(zone); |
---|
7546 | 8175 | } |
---|
7547 | 8176 | |
---|
7548 | 8177 | for_each_zone(zone) { |
---|
7549 | | - u64 min, low; |
---|
| 8178 | + u64 tmp, low; |
---|
7550 | 8179 | |
---|
7551 | 8180 | spin_lock_irqsave(&zone->lock, flags); |
---|
7552 | | - min = (u64)pages_min * zone->managed_pages; |
---|
7553 | | - do_div(min, lowmem_pages); |
---|
7554 | | - low = (u64)pages_low * zone->managed_pages; |
---|
7555 | | - do_div(low, vm_total_pages); |
---|
7556 | | - |
---|
| 8181 | + tmp = (u64)pages_min * zone_managed_pages(zone); |
---|
| 8182 | + do_div(tmp, lowmem_pages); |
---|
| 8183 | + low = (u64)pages_low * zone_managed_pages(zone); |
---|
| 8184 | + do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE))); |
---|
7557 | 8185 | if (is_highmem(zone)) { |
---|
7558 | 8186 | /* |
---|
7559 | 8187 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
---|
.. | .. |
---|
7561 | 8189 | * value here. |
---|
7562 | 8190 | * |
---|
7563 | 8191 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
---|
7564 | | - * deltas control asynch page reclaim, and so should |
---|
| 8192 | + * deltas control async page reclaim, and so should |
---|
7565 | 8193 | * not be capped for highmem. |
---|
7566 | 8194 | */ |
---|
7567 | 8195 | unsigned long min_pages; |
---|
7568 | 8196 | |
---|
7569 | | - min_pages = zone->managed_pages / 1024; |
---|
| 8197 | + min_pages = zone_managed_pages(zone) / 1024; |
---|
7570 | 8198 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
---|
7571 | | - zone->watermark[WMARK_MIN] = min_pages; |
---|
| 8199 | + zone->_watermark[WMARK_MIN] = min_pages; |
---|
7572 | 8200 | } else { |
---|
7573 | 8201 | /* |
---|
7574 | 8202 | * If it's a lowmem zone, reserve a number of pages |
---|
7575 | 8203 | * proportionate to the zone's size. |
---|
7576 | 8204 | */ |
---|
7577 | | - zone->watermark[WMARK_MIN] = min; |
---|
| 8205 | + zone->_watermark[WMARK_MIN] = tmp; |
---|
7578 | 8206 | } |
---|
7579 | 8207 | |
---|
7580 | 8208 | /* |
---|
.. | .. |
---|
7582 | 8210 | * scale factor in proportion to available memory, but |
---|
7583 | 8211 | * ensure a minimum size on small systems. |
---|
7584 | 8212 | */ |
---|
7585 | | - min = max_t(u64, min >> 2, |
---|
7586 | | - mult_frac(zone->managed_pages, |
---|
| 8213 | + tmp = max_t(u64, tmp >> 2, |
---|
| 8214 | + mult_frac(zone_managed_pages(zone), |
---|
7587 | 8215 | watermark_scale_factor, 10000)); |
---|
7588 | 8216 | |
---|
7589 | | - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + |
---|
7590 | | - low + min; |
---|
7591 | | - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + |
---|
7592 | | - low + min * 2; |
---|
| 8217 | + zone->watermark_boost = 0; |
---|
| 8218 | + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp; |
---|
| 8219 | + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2; |
---|
7593 | 8220 | |
---|
7594 | 8221 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
7595 | 8222 | } |
---|
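
A rough worked example of the lowmem branch above (all numbers assumed, and the extra_free_kbytes-derived "low" term taken as 0): for a zone with 1,000,000 managed pages out of 4,000,000 total lowmem pages and pages_min = 16,384, tmp = 16384 * 1000000 / 4000000 = 4096, so WMARK_MIN = 4096 pages. The scaling step then takes max(4096 >> 2, 1000000 * 10 / 10000) = max(1024, 1000) = 1024 with watermark_scale_factor = 10, giving WMARK_LOW = 4096 + 1024 = 5120 and WMARK_HIGH = 4096 + 2048 = 6144 pages.
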
.. | .. |
---|
7618 | 8245 | * Initialise min_free_kbytes. |
---|
7619 | 8246 | * |
---|
7620 | 8247 | * For small machines we want it small (128k min). For large machines |
---|
7621 | | - * we want it large (64MB max). But it is not linear, because network |
---|
| 8248 | + * we want it large (256MB max). But it is not linear, because network |
---|
7622 | 8249 | * bandwidth does not increase linearly with machine size. We use |
---|
7623 | 8250 | * |
---|
7624 | 8251 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
---|
.. | .. |
---|
7650 | 8277 | min_free_kbytes = new_min_free_kbytes; |
---|
7651 | 8278 | if (min_free_kbytes < 128) |
---|
7652 | 8279 | min_free_kbytes = 128; |
---|
7653 | | - if (min_free_kbytes > 65536) |
---|
7654 | | - min_free_kbytes = 65536; |
---|
| 8280 | + if (min_free_kbytes > 262144) |
---|
| 8281 | + min_free_kbytes = 262144; |
---|
7655 | 8282 | } else { |
---|
7656 | 8283 | pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", |
---|
7657 | 8284 | new_min_free_kbytes, user_min_free_kbytes); |
---|
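
As a worked instance of the sqrt formula described above (machine size assumed): 16 GB of lowmem means lowmem_kbytes = 16 * 1024 * 1024 = 16,777,216, so min_free_kbytes = 4 * sqrt(16777216) = 4 * 4096 = 16,384 kB, i.e. 16 MB, comfortably inside the [128 kB, 256 MB] clamp applied here.
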
.. | .. |
---|
7677 | 8304 | * or extra_free_kbytes changes. |
---|
7678 | 8305 | */ |
---|
7679 | 8306 | int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, |
---|
7680 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8307 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7681 | 8308 | { |
---|
7682 | 8309 | int rc; |
---|
7683 | 8310 | |
---|
.. | .. |
---|
7693 | 8320 | } |
---|
7694 | 8321 | |
---|
7695 | 8322 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, |
---|
7696 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8323 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7697 | 8324 | { |
---|
7698 | 8325 | int rc; |
---|
7699 | 8326 | |
---|
.. | .. |
---|
7717 | 8344 | pgdat->min_unmapped_pages = 0; |
---|
7718 | 8345 | |
---|
7719 | 8346 | for_each_zone(zone) |
---|
7720 | | - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * |
---|
7721 | | - sysctl_min_unmapped_ratio) / 100; |
---|
| 8347 | + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * |
---|
| 8348 | + sysctl_min_unmapped_ratio) / 100; |
---|
7722 | 8349 | } |
---|
7723 | 8350 | |
---|
7724 | 8351 | |
---|
7725 | 8352 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7726 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8353 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7727 | 8354 | { |
---|
7728 | 8355 | int rc; |
---|
7729 | 8356 | |
---|
.. | .. |
---|
7745 | 8372 | pgdat->min_slab_pages = 0; |
---|
7746 | 8373 | |
---|
7747 | 8374 | for_each_zone(zone) |
---|
7748 | | - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * |
---|
7749 | | - sysctl_min_slab_ratio) / 100; |
---|
| 8375 | + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * |
---|
| 8376 | + sysctl_min_slab_ratio) / 100; |
---|
7750 | 8377 | } |
---|
7751 | 8378 | |
---|
7752 | 8379 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7753 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8380 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7754 | 8381 | { |
---|
7755 | 8382 | int rc; |
---|
7756 | 8383 | |
---|
.. | .. |
---|
7774 | 8401 | * if in function of the boot time zone sizes. |
---|
7775 | 8402 | */ |
---|
7776 | 8403 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, |
---|
7777 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8404 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7778 | 8405 | { |
---|
| 8406 | + int i; |
---|
| 8407 | + |
---|
7779 | 8408 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
---|
| 8409 | + |
---|
| 8410 | + for (i = 0; i < MAX_NR_ZONES; i++) { |
---|
| 8411 | + if (sysctl_lowmem_reserve_ratio[i] < 1) |
---|
| 8412 | + sysctl_lowmem_reserve_ratio[i] = 0; |
---|
| 8413 | + } |
---|
| 8414 | + |
---|
7780 | 8415 | setup_per_zone_lowmem_reserve(); |
---|
7781 | 8416 | return 0; |
---|
| 8417 | +} |
---|
| 8418 | + |
---|
| 8419 | +static void __zone_pcp_update(struct zone *zone) |
---|
| 8420 | +{ |
---|
| 8421 | + unsigned int cpu; |
---|
| 8422 | + |
---|
| 8423 | + for_each_possible_cpu(cpu) |
---|
| 8424 | + pageset_set_high_and_batch(zone, |
---|
| 8425 | + per_cpu_ptr(zone->pageset, cpu)); |
---|
7782 | 8426 | } |
---|
7783 | 8427 | |
---|
7784 | 8428 | /* |
---|
.. | .. |
---|
7787 | 8431 | * pagelist can have before it gets flushed back to buddy allocator. |
---|
7788 | 8432 | */ |
---|
7789 | 8433 | int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, |
---|
7790 | | - void __user *buffer, size_t *length, loff_t *ppos) |
---|
| 8434 | + void *buffer, size_t *length, loff_t *ppos) |
---|
7791 | 8435 | { |
---|
7792 | 8436 | struct zone *zone; |
---|
7793 | 8437 | int old_percpu_pagelist_fraction; |
---|
.. | .. |
---|
7812 | 8456 | if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) |
---|
7813 | 8457 | goto out; |
---|
7814 | 8458 | |
---|
7815 | | - for_each_populated_zone(zone) { |
---|
7816 | | - unsigned int cpu; |
---|
7817 | | - |
---|
7818 | | - for_each_possible_cpu(cpu) |
---|
7819 | | - pageset_set_high_and_batch(zone, |
---|
7820 | | - per_cpu_ptr(zone->pageset, cpu)); |
---|
7821 | | - } |
---|
| 8459 | + for_each_populated_zone(zone) |
---|
| 8460 | + __zone_pcp_update(zone); |
---|
7822 | 8461 | out: |
---|
7823 | 8462 | mutex_unlock(&pcp_batch_high_lock); |
---|
7824 | 8463 | return ret; |
---|
7825 | 8464 | } |
---|
7826 | | - |
---|
7827 | | -#ifdef CONFIG_NUMA |
---|
7828 | | -int hashdist = HASHDIST_DEFAULT; |
---|
7829 | | - |
---|
7830 | | -static int __init set_hashdist(char *str) |
---|
7831 | | -{ |
---|
7832 | | - if (!str) |
---|
7833 | | - return 0; |
---|
7834 | | - hashdist = simple_strtoul(str, &str, 0); |
---|
7835 | | - return 1; |
---|
7836 | | -} |
---|
7837 | | -__setup("hashdist=", set_hashdist); |
---|
7838 | | -#endif |
---|
7839 | 8465 | |
---|
7840 | 8466 | #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES |
---|
7841 | 8467 | /* |
---|
.. | .. |
---|
7883 | 8509 | unsigned long log2qty, size; |
---|
7884 | 8510 | void *table = NULL; |
---|
7885 | 8511 | gfp_t gfp_flags; |
---|
| 8512 | + bool virt; |
---|
7886 | 8513 | |
---|
7887 | 8514 | /* allow the kernel cmdline to have a say */ |
---|
7888 | 8515 | if (!numentries) { |
---|
.. | .. |
---|
7939 | 8566 | |
---|
7940 | 8567 | gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; |
---|
7941 | 8568 | do { |
---|
| 8569 | + virt = false; |
---|
7942 | 8570 | size = bucketsize << log2qty; |
---|
7943 | 8571 | if (flags & HASH_EARLY) { |
---|
7944 | 8572 | if (flags & HASH_ZERO) |
---|
7945 | | - table = memblock_virt_alloc_nopanic(size, 0); |
---|
| 8573 | + table = memblock_alloc(size, SMP_CACHE_BYTES); |
---|
7946 | 8574 | else |
---|
7947 | | - table = memblock_virt_alloc_raw(size, 0); |
---|
7948 | | - } else if (hashdist) { |
---|
7949 | | - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); |
---|
| 8575 | + table = memblock_alloc_raw(size, |
---|
| 8576 | + SMP_CACHE_BYTES); |
---|
| 8577 | + } else if (get_order(size) >= MAX_ORDER || hashdist) { |
---|
| 8578 | + table = __vmalloc(size, gfp_flags); |
---|
| 8579 | + virt = true; |
---|
7950 | 8580 | } else { |
---|
7951 | 8581 | /* |
---|
7952 | 8582 | * If bucketsize is not a power-of-two, we may free |
---|
7953 | 8583 | * some pages at the end of hash table which |
---|
7954 | 8584 | * alloc_pages_exact() automatically does |
---|
7955 | 8585 | */ |
---|
7956 | | - if (get_order(size) < MAX_ORDER) { |
---|
7957 | | - table = alloc_pages_exact(size, gfp_flags); |
---|
7958 | | - kmemleak_alloc(table, size, 1, gfp_flags); |
---|
7959 | | - } |
---|
| 8586 | + table = alloc_pages_exact(size, gfp_flags); |
---|
| 8587 | + kmemleak_alloc(table, size, 1, gfp_flags); |
---|
7960 | 8588 | } |
---|
7961 | 8589 | } while (!table && size > PAGE_SIZE && --log2qty); |
---|
7962 | 8590 | |
---|
7963 | 8591 | if (!table) |
---|
7964 | 8592 | panic("Failed to allocate %s hash table\n", tablename); |
---|
7965 | 8593 | |
---|
7966 | | - pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", |
---|
7967 | | - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); |
---|
| 8594 | + pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", |
---|
| 8595 | + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, |
---|
| 8596 | + virt ? "vmalloc" : "linear"); |
---|
7968 | 8597 | |
---|
7969 | 8598 | if (_hash_shift) |
---|
7970 | 8599 | *_hash_shift = log2qty; |
---|
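
A hedged usage sketch of this helper, in the style of the subsystem hash tables it serves; the table name, bucket type and the scale/flag values below are illustrative assumptions rather than a real caller:

static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;

static void __init example_hash_init(void)
{
        /* Auto-sized from memory (numentries = 0), zeroed at allocation. */
        example_hashtable = alloc_large_system_hash("Example cache",
                                                    sizeof(struct hlist_head),
                                                    0,
                                                    13,
                                                    HASH_ZERO,
                                                    &example_hash_shift,
                                                    NULL,
                                                    0,
                                                    0);
}
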
.. | .. |
---|
7976 | 8605 | |
---|
7977 | 8606 | /* |
---|
7978 | 8607 | * This function checks whether pageblock includes unmovable pages or not. |
---|
7979 | | - * If @count is not zero, it is okay to include less @count unmovable pages |
---|
7980 | 8608 | * |
---|
7981 | 8609 | * PageLRU check without isolation or lru_lock could race so that |
---|
7982 | 8610 | * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable |
---|
7983 | 8611 | * check without lock_page may also miss some movable non-lru pages under
---|
7984 | 8612 | * race conditions. So you can't expect this function to be exact.
---|
| 8613 | + * |
---|
| 8614 | + * Returns a page without holding a reference. If the caller wants to |
---|
| 8615 | + * dereference that page (e.g., dumping), it has to make sure that it |
---|
| 8616 | + * cannot get removed (e.g., via memory unplug) concurrently. |
---|
| 8617 | + * |
---|
7985 | 8618 | */ |
---|
7986 | | -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
---|
7987 | | - int migratetype, |
---|
7988 | | - bool skip_hwpoisoned_pages) |
---|
| 8619 | +struct page *has_unmovable_pages(struct zone *zone, struct page *page, |
---|
| 8620 | + int migratetype, int flags) |
---|
7989 | 8621 | { |
---|
7990 | | - unsigned long pfn, iter, found; |
---|
| 8622 | + unsigned long iter = 0; |
---|
| 8623 | + unsigned long pfn = page_to_pfn(page); |
---|
| 8624 | + unsigned long offset = pfn % pageblock_nr_pages; |
---|
7991 | 8625 | |
---|
7992 | | - /* |
---|
7993 | | - * TODO we could make this much more efficient by not checking every |
---|
7994 | | - * page in the range if we know all of them are in MOVABLE_ZONE and |
---|
7995 | | - * that the movable zone guarantees that pages are migratable but |
---|
7996 | | - * the later is not the case right now unfortunatelly. E.g. movablecore |
---|
7997 | | - * can still lead to having bootmem allocations in zone_movable. |
---|
7998 | | - */ |
---|
| 8626 | + if (is_migrate_cma_page(page)) { |
---|
| 8627 | + /* |
---|
| 8628 | + * CMA allocations (alloc_contig_range) really need to mark
---|
| 8629 | + * CMA pageblocks as isolated even when they are not in fact movable,
---|
| 8630 | + * so consider them movable here.
---|
| 8631 | + */ |
---|
| 8632 | + if (is_migrate_cma(migratetype)) |
---|
| 8633 | + return NULL; |
---|
7999 | 8634 | |
---|
8000 | | - /* |
---|
8001 | | - * CMA allocations (alloc_contig_range) really need to mark isolate |
---|
8002 | | - * CMA pageblocks even when they are not movable in fact so consider |
---|
8003 | | - * them movable here. |
---|
8004 | | - */ |
---|
8005 | | - if (is_migrate_cma(migratetype) && |
---|
8006 | | - is_migrate_cma(get_pageblock_migratetype(page))) |
---|
8007 | | - return false; |
---|
| 8635 | + return page; |
---|
| 8636 | + } |
---|
8008 | 8637 | |
---|
8009 | | - pfn = page_to_pfn(page); |
---|
8010 | | - for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
---|
8011 | | - unsigned long check = pfn + iter; |
---|
8012 | | - |
---|
8013 | | - if (!pfn_valid_within(check)) |
---|
| 8638 | + for (; iter < pageblock_nr_pages - offset; iter++) { |
---|
| 8639 | + if (!pfn_valid_within(pfn + iter)) |
---|
8014 | 8640 | continue; |
---|
8015 | 8641 | |
---|
8016 | | - page = pfn_to_page(check); |
---|
| 8642 | + page = pfn_to_page(pfn + iter); |
---|
8017 | 8643 | |
---|
| 8644 | + /* |
---|
| 8645 | + * Both bootmem allocations and memory holes are marked
---|
| 8646 | + * PG_reserved and are unmovable. We can even have unmovable |
---|
| 8647 | + * allocations inside ZONE_MOVABLE, for example when |
---|
| 8648 | + * specifying "movablecore". |
---|
| 8649 | + */ |
---|
8018 | 8650 | if (PageReserved(page)) |
---|
8019 | | - goto unmovable; |
---|
| 8651 | + return page; |
---|
8020 | 8652 | |
---|
8021 | 8653 | /* |
---|
8022 | 8654 | * If the zone is movable and we have ruled out all reserved |
---|
.. | .. |
---|
8028 | 8660 | |
---|
8029 | 8661 | /* |
---|
8030 | 8662 | * Hugepages are not in LRU lists, but they're movable. |
---|
8031 | | - * We need not scan over tail pages bacause we don't |
---|
| 8663 | + * THPs are on the LRU, but need to be counted as #small pages. |
---|
| 8664 | + * We need not scan over tail pages because we don't |
---|
8032 | 8665 | * handle each tail page individually in migration. |
---|
8033 | 8666 | */ |
---|
8034 | | - if (PageHuge(page)) { |
---|
| 8667 | + if (PageHuge(page) || PageTransCompound(page)) { |
---|
8035 | 8668 | struct page *head = compound_head(page); |
---|
8036 | 8669 | unsigned int skip_pages; |
---|
8037 | 8670 | |
---|
8038 | | - if (!hugepage_migration_supported(page_hstate(head))) |
---|
8039 | | - goto unmovable; |
---|
| 8671 | + if (PageHuge(page)) { |
---|
| 8672 | + if (!hugepage_migration_supported(page_hstate(head))) |
---|
| 8673 | + return page; |
---|
| 8674 | + } else if (!PageLRU(head) && !__PageMovable(head)) { |
---|
| 8675 | + return page; |
---|
| 8676 | + } |
---|
8040 | 8677 | |
---|
8041 | | - skip_pages = (1 << compound_order(head)) - (page - head); |
---|
| 8678 | + skip_pages = compound_nr(head) - (page - head); |
---|
8042 | 8679 | iter += skip_pages - 1; |
---|
8043 | 8680 | continue; |
---|
8044 | 8681 | } |
---|
.. | .. |
---|
8051 | 8688 | */ |
---|
8052 | 8689 | if (!page_ref_count(page)) { |
---|
8053 | 8690 | if (PageBuddy(page)) |
---|
8054 | | - iter += (1 << page_order(page)) - 1; |
---|
| 8691 | + iter += (1 << buddy_order(page)) - 1; |
---|
8055 | 8692 | continue; |
---|
8056 | 8693 | } |
---|
8057 | 8694 | |
---|
.. | .. |
---|
8059 | 8696 | * The HWPoisoned page may not be in the buddy system, and
---|
8060 | 8697 | * page_count() is not 0. |
---|
8061 | 8698 | */ |
---|
8062 | | - if (skip_hwpoisoned_pages && PageHWPoison(page)) |
---|
| 8699 | + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) |
---|
8063 | 8700 | continue; |
---|
8064 | 8701 | |
---|
8065 | | - if (__PageMovable(page)) |
---|
| 8702 | + /* |
---|
| 8703 | + * We treat all PageOffline() pages as movable when offlining |
---|
| 8704 | + * to give drivers a chance to decrement their reference count |
---|
| 8705 | + * in MEM_GOING_OFFLINE in order to indicate that these pages |
---|
| 8706 | + * can be offlined as there are no direct references anymore. |
---|
| 8707 | + * For actually unmovable PageOffline() where the driver does |
---|
| 8708 | + * not support this, we will fail later when trying to actually |
---|
| 8709 | + * move these pages that still have a reference count > 0. |
---|
| 8710 | + * (false negatives in this function only) |
---|
| 8711 | + */ |
---|
| 8712 | + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) |
---|
8066 | 8713 | continue; |
---|
8067 | 8714 | |
---|
8068 | | - if (!PageLRU(page)) |
---|
8069 | | - found++; |
---|
| 8715 | + if (__PageMovable(page) || PageLRU(page)) |
---|
| 8716 | + continue; |
---|
| 8717 | + |
---|
8070 | 8718 | /* |
---|
8071 | 8719 | * If there are RECLAIMABLE pages, we need to check |
---|
8072 | 8720 | * it. But now, memory offline itself doesn't call |
---|
8073 | 8721 | * shrink_node_slabs() and this still needs to be fixed.
---|
8074 | 8722 | */ |
---|
8075 | | - /* |
---|
8076 | | - * If the page is not RAM, page_count()should be 0. |
---|
8077 | | - * we don't need more check. This is an _used_ not-movable page. |
---|
8078 | | - * |
---|
8079 | | - * The problematic thing here is PG_reserved pages. PG_reserved |
---|
8080 | | - * is set to both of a memory hole page and a _used_ kernel |
---|
8081 | | - * page at boot. |
---|
8082 | | - */ |
---|
8083 | | - if (found > count) |
---|
8084 | | - goto unmovable; |
---|
| 8723 | + return page; |
---|
8085 | 8724 | } |
---|
8086 | | - return false; |
---|
8087 | | -unmovable: |
---|
8088 | | - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); |
---|
8089 | | - return true; |
---|
| 8725 | + return NULL; |
---|
8090 | 8726 | } |
---|
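
A hedged sketch of how a caller can consume the new return convention (a pointer to the offending page instead of a bool); the wrapper name below is made up for illustration:

static bool pageblock_is_isolatable(struct zone *zone, struct page *page)
{
        struct page *unmovable;

        unmovable = has_unmovable_pages(zone, page, MIGRATE_MOVABLE, 0);

        /* The returned page is unreferenced; don't dereference it here. */
        return unmovable == NULL;
}
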
8091 | 8727 | |
---|
8092 | | -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) |
---|
8093 | | - |
---|
| 8728 | +#ifdef CONFIG_CONTIG_ALLOC |
---|
8094 | 8729 | static unsigned long pfn_max_align_down(unsigned long pfn) |
---|
8095 | 8730 | { |
---|
8096 | 8731 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, |
---|
8097 | 8732 | pageblock_nr_pages) - 1); |
---|
8098 | 8733 | } |
---|
8099 | 8734 | |
---|
8100 | | -static unsigned long pfn_max_align_up(unsigned long pfn) |
---|
| 8735 | +unsigned long pfn_max_align_up(unsigned long pfn) |
---|
8101 | 8736 | { |
---|
8102 | 8737 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, |
---|
8103 | 8738 | pageblock_nr_pages)); |
---|
8104 | 8739 | } |
---|
8105 | 8740 | |
---|
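
A quick worked instance of the two alignment helpers, assuming MAX_ORDER_NR_PAGES = 1024 (which then dominates a 512-page pageblock): pfn_max_align_down(0x12345) = 0x12345 & ~0x3ff = 0x12000 and pfn_max_align_up(0x12345) = ALIGN(0x12345, 0x400) = 0x12400, i.e. the range to isolate is widened outward to whole MAX_ORDER-sized chunks.
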
| 8741 | +#if defined(CONFIG_DYNAMIC_DEBUG) || \ |
---|
| 8742 | + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) |
---|
| 8743 | +/* Usage: See admin-guide/dynamic-debug-howto.rst */ |
---|
| 8744 | +static void alloc_contig_dump_pages(struct list_head *page_list) |
---|
| 8745 | +{ |
---|
| 8746 | + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); |
---|
| 8747 | + |
---|
| 8748 | + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { |
---|
| 8749 | + struct page *page; |
---|
| 8750 | + unsigned long nr_skip = 0; |
---|
| 8751 | + unsigned long nr_pages = 0; |
---|
| 8752 | + |
---|
| 8753 | + dump_stack(); |
---|
| 8754 | + list_for_each_entry(page, page_list, lru) { |
---|
| 8755 | + nr_pages++; |
---|
| 8756 | + /* The page will be freed by putback_movable_pages soon */ |
---|
| 8757 | + if (page_count(page) == 1) { |
---|
| 8758 | + nr_skip++; |
---|
| 8759 | + continue; |
---|
| 8760 | + } |
---|
| 8761 | + dump_page(page, "migration failure"); |
---|
| 8762 | + } |
---|
| 8763 | + pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip); |
---|
| 8764 | + } |
---|
| 8765 | +} |
---|
| 8766 | +#else |
---|
| 8767 | +static inline void alloc_contig_dump_pages(struct list_head *page_list) |
---|
| 8768 | +{ |
---|
| 8769 | +} |
---|
| 8770 | +#endif |
---|
| 8771 | + |
---|
8106 | 8772 | /* [start, end) must belong to a single zone. */ |
---|
8107 | 8773 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
---|
8108 | | - unsigned long start, unsigned long end) |
---|
| 8774 | + unsigned long start, unsigned long end, |
---|
| 8775 | + struct acr_info *info) |
---|
8109 | 8776 | { |
---|
8110 | 8777 | /* This function is based on compact_zone() from compaction.c. */ |
---|
8111 | | - unsigned long nr_reclaimed; |
---|
| 8778 | + unsigned int nr_reclaimed; |
---|
8112 | 8779 | unsigned long pfn = start; |
---|
8113 | 8780 | unsigned int tries = 0; |
---|
| 8781 | + unsigned int max_tries = 5; |
---|
8114 | 8782 | int ret = 0; |
---|
| 8783 | + struct page *page; |
---|
| 8784 | + struct migration_target_control mtc = { |
---|
| 8785 | + .nid = zone_to_nid(cc->zone), |
---|
| 8786 | + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, |
---|
| 8787 | + }; |
---|
8115 | 8788 | |
---|
8116 | | - migrate_prep(); |
---|
| 8789 | + if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC) |
---|
| 8790 | + max_tries = 1; |
---|
| 8791 | + |
---|
| 8792 | + lru_cache_disable(); |
---|
8117 | 8793 | |
---|
8118 | 8794 | while (pfn < end || !list_empty(&cc->migratepages)) { |
---|
8119 | 8795 | if (fatal_signal_pending(current)) { |
---|
.. | .. |
---|
8129 | 8805 | break; |
---|
8130 | 8806 | } |
---|
8131 | 8807 | tries = 0; |
---|
8132 | | - } else if (++tries == 5) { |
---|
| 8808 | + } else if (++tries == max_tries) { |
---|
8133 | 8809 | ret = ret < 0 ? ret : -EBUSY; |
---|
8134 | 8810 | break; |
---|
8135 | 8811 | } |
---|
8136 | 8812 | |
---|
8137 | 8813 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
---|
8138 | 8814 | &cc->migratepages); |
---|
| 8815 | + info->nr_reclaimed += nr_reclaimed; |
---|
8139 | 8816 | cc->nr_migratepages -= nr_reclaimed; |
---|
8140 | 8817 | |
---|
8141 | | - ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
---|
8142 | | - NULL, 0, cc->mode, MR_CONTIG_RANGE); |
---|
| 8818 | + list_for_each_entry(page, &cc->migratepages, lru) |
---|
| 8819 | + info->nr_mapped += page_mapcount(page); |
---|
| 8820 | + |
---|
| 8821 | + ret = migrate_pages(&cc->migratepages, alloc_migration_target, |
---|
| 8822 | + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); |
---|
| 8823 | + if (!ret) |
---|
| 8824 | + info->nr_migrated += cc->nr_migratepages; |
---|
8143 | 8825 | } |
---|
| 8826 | + |
---|
| 8827 | + lru_cache_enable(); |
---|
8144 | 8828 | if (ret < 0) { |
---|
| 8829 | + if (ret == -EBUSY) { |
---|
| 8830 | + alloc_contig_dump_pages(&cc->migratepages); |
---|
| 8831 | + page_pinner_mark_migration_failed_pages(&cc->migratepages); |
---|
| 8832 | + } |
---|
| 8833 | + |
---|
| 8834 | + if (!list_empty(&cc->migratepages)) { |
---|
| 8835 | + page = list_first_entry(&cc->migratepages, struct page, lru);
---|
| 8836 | + info->failed_pfn = page_to_pfn(page); |
---|
| 8837 | + } |
---|
| 8838 | + |
---|
8145 | 8839 | putback_movable_pages(&cc->migratepages); |
---|
| 8840 | + info->err |= ACR_ERR_MIGRATE; |
---|
8146 | 8841 | return ret; |
---|
8147 | 8842 | } |
---|
8148 | 8843 | return 0; |
---|
.. | .. |
---|
8165 | 8860 | * pageblocks in the range. Once isolated, the pageblocks should not |
---|
8166 | 8861 | * be modified by others. |
---|
8167 | 8862 | * |
---|
8168 | | - * Returns zero on success or negative error code. On success all |
---|
| 8863 | + * Return: zero on success or negative error code. On success all |
---|
8169 | 8864 | * pages which PFN is in [start, end) are allocated for the caller and |
---|
8170 | 8865 | * need to be freed with free_contig_range(). |
---|
8171 | 8866 | */ |
---|
8172 | 8867 | int alloc_contig_range(unsigned long start, unsigned long end, |
---|
8173 | | - unsigned migratetype, gfp_t gfp_mask) |
---|
| 8868 | + unsigned migratetype, gfp_t gfp_mask, |
---|
| 8869 | + struct acr_info *info) |
---|
8174 | 8870 | { |
---|
8175 | 8871 | unsigned long outer_start, outer_end; |
---|
8176 | 8872 | unsigned int order; |
---|
8177 | 8873 | int ret = 0; |
---|
| 8874 | + bool skip_drain_all_pages = false; |
---|
8178 | 8875 | |
---|
8179 | 8876 | struct compact_control cc = { |
---|
8180 | 8877 | .nr_migratepages = 0, |
---|
8181 | 8878 | .order = -1, |
---|
8182 | 8879 | .zone = page_zone(pfn_to_page(start)), |
---|
8183 | | - .mode = MIGRATE_SYNC, |
---|
| 8880 | + .mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC, |
---|
8184 | 8881 | .ignore_skip_hint = true, |
---|
8185 | 8882 | .no_set_skip_hint = true, |
---|
8186 | 8883 | .gfp_mask = current_gfp_context(gfp_mask), |
---|
| 8884 | + .alloc_contig = true, |
---|
8187 | 8885 | }; |
---|
8188 | 8886 | INIT_LIST_HEAD(&cc.migratepages); |
---|
8189 | 8887 | |
---|
.. | .. |
---|
8212 | 8910 | */ |
---|
8213 | 8911 | |
---|
8214 | 8912 | ret = start_isolate_page_range(pfn_max_align_down(start), |
---|
8215 | | - pfn_max_align_up(end), migratetype, |
---|
8216 | | - false); |
---|
8217 | | - if (ret) |
---|
| 8913 | + pfn_max_align_up(end), migratetype, 0, |
---|
| 8914 | + &info->failed_pfn); |
---|
| 8915 | + if (ret) { |
---|
| 8916 | + info->err |= ACR_ERR_ISOLATE; |
---|
8218 | 8917 | return ret; |
---|
| 8918 | + } |
---|
8219 | 8919 | |
---|
8220 | | -#ifdef CONFIG_CMA |
---|
8221 | | - cc.zone->cma_alloc = 1; |
---|
8222 | | -#endif |
---|
| 8920 | + trace_android_vh_cma_drain_all_pages_bypass(migratetype, |
---|
| 8921 | + &skip_drain_all_pages); |
---|
| 8922 | + if (!skip_drain_all_pages) |
---|
| 8923 | + drain_all_pages(cc.zone); |
---|
| 8924 | + |
---|
8223 | 8925 | /* |
---|
8224 | 8926 | * In case of -EBUSY, we'd like to know which page causes problem. |
---|
8225 | 8927 | * So, just fall through. test_pages_isolated() has a tracepoint |
---|
.. | .. |
---|
8230 | 8932 | * allocated. So, if we fall through be sure to clear ret so that |
---|
8231 | 8933 | * -EBUSY is not accidentally used or returned to caller. |
---|
8232 | 8934 | */ |
---|
8233 | | - ret = __alloc_contig_migrate_range(&cc, start, end); |
---|
8234 | | - if (ret && ret != -EBUSY) |
---|
| 8935 | + ret = __alloc_contig_migrate_range(&cc, start, end, info); |
---|
| 8936 | + if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY))) |
---|
8235 | 8937 | goto done; |
---|
8236 | 8938 | ret = 0;
---|
8237 | 8939 | |
---|
.. | .. |
---|
8252 | 8954 | * isolated thus they won't get removed from buddy. |
---|
8253 | 8955 | */ |
---|
8254 | 8956 | |
---|
8255 | | - lru_add_drain_all(); |
---|
8256 | | - drain_all_pages(cc.zone); |
---|
8257 | | - |
---|
8258 | 8957 | order = 0; |
---|
8259 | 8958 | outer_start = start; |
---|
8260 | 8959 | while (!PageBuddy(pfn_to_page(outer_start))) { |
---|
.. | .. |
---|
8266 | 8965 | } |
---|
8267 | 8966 | |
---|
8268 | 8967 | if (outer_start != start) { |
---|
8269 | | - order = page_order(pfn_to_page(outer_start)); |
---|
| 8968 | + order = buddy_order(pfn_to_page(outer_start)); |
---|
8270 | 8969 | |
---|
8271 | 8970 | /* |
---|
8272 | 8971 | * outer_start page could be small order buddy page and |
---|
.. | .. |
---|
8279 | 8978 | } |
---|
8280 | 8979 | |
---|
8281 | 8980 | /* Make sure the range is really isolated. */ |
---|
8282 | | - if (test_pages_isolated(outer_start, end, false)) { |
---|
| 8981 | + if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) { |
---|
8283 | 8982 | pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", |
---|
8284 | 8983 | __func__, outer_start, end); |
---|
8285 | 8984 | ret = -EBUSY; |
---|
| 8985 | + info->err |= ACR_ERR_TEST; |
---|
8286 | 8986 | goto done; |
---|
8287 | 8987 | } |
---|
8288 | 8988 | |
---|
.. | .. |
---|
8302 | 9002 | done: |
---|
8303 | 9003 | undo_isolate_page_range(pfn_max_align_down(start), |
---|
8304 | 9004 | pfn_max_align_up(end), migratetype); |
---|
8305 | | -#ifdef CONFIG_CMA |
---|
8306 | | - cc.zone->cma_alloc = 0; |
---|
8307 | | -#endif |
---|
8308 | 9005 | return ret; |
---|
8309 | 9006 | } |
---|
| 9007 | +EXPORT_SYMBOL(alloc_contig_range); |
---|
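
A hedged caller sketch for the extended signature: the acr_info argument added in this tree lets the caller report why a range could not be claimed. The wrapper name, migratetype/GFP choice and printk format below are illustrative assumptions:

static int example_claim_range(unsigned long start_pfn, unsigned long end_pfn)
{
        struct acr_info info = {};
        int ret;

        ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
                                 GFP_KERNEL, &info);
        if (ret)
                pr_debug("range [%lx, %lx) failed (%d), first busy pfn %lx\n",
                         start_pfn, end_pfn, ret, info.failed_pfn);
        return ret;
}
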
8310 | 9008 | |
---|
8311 | | -void free_contig_range(unsigned long pfn, unsigned nr_pages) |
---|
| 9009 | +static int __alloc_contig_pages(unsigned long start_pfn, |
---|
| 9010 | + unsigned long nr_pages, gfp_t gfp_mask) |
---|
| 9011 | +{ |
---|
| 9012 | + struct acr_info dummy; |
---|
| 9013 | + unsigned long end_pfn = start_pfn + nr_pages; |
---|
| 9014 | + |
---|
| 9015 | + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, |
---|
| 9016 | + gfp_mask, &dummy); |
---|
| 9017 | +} |
---|
| 9018 | + |
---|
| 9019 | +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, |
---|
| 9020 | + unsigned long nr_pages) |
---|
| 9021 | +{ |
---|
| 9022 | + unsigned long i, end_pfn = start_pfn + nr_pages; |
---|
| 9023 | + struct page *page; |
---|
| 9024 | + |
---|
| 9025 | + for (i = start_pfn; i < end_pfn; i++) { |
---|
| 9026 | + page = pfn_to_online_page(i); |
---|
| 9027 | + if (!page) |
---|
| 9028 | + return false; |
---|
| 9029 | + |
---|
| 9030 | + if (page_zone(page) != z) |
---|
| 9031 | + return false; |
---|
| 9032 | + |
---|
| 9033 | + if (PageReserved(page)) |
---|
| 9034 | + return false; |
---|
| 9035 | + |
---|
| 9036 | + if (page_count(page) > 0) |
---|
| 9037 | + return false; |
---|
| 9038 | + |
---|
| 9039 | + if (PageHuge(page)) |
---|
| 9040 | + return false; |
---|
| 9041 | + } |
---|
| 9042 | + return true; |
---|
| 9043 | +} |
---|
| 9044 | + |
---|
| 9045 | +static bool zone_spans_last_pfn(const struct zone *zone, |
---|
| 9046 | + unsigned long start_pfn, unsigned long nr_pages) |
---|
| 9047 | +{ |
---|
| 9048 | + unsigned long last_pfn = start_pfn + nr_pages - 1; |
---|
| 9049 | + |
---|
| 9050 | + return zone_spans_pfn(zone, last_pfn); |
---|
| 9051 | +} |
---|
| 9052 | + |
---|
| 9053 | +/** |
---|
| 9054 | + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages |
---|
| 9055 | + * @nr_pages: Number of contiguous pages to allocate |
---|
| 9056 | + * @gfp_mask: GFP mask to limit search and used during compaction |
---|
| 9057 | + * @nid: Target node |
---|
| 9058 | + * @nodemask: Mask for other possible nodes |
---|
| 9059 | + * |
---|
| 9060 | + * This routine is a wrapper around alloc_contig_range(). It scans over zones |
---|
| 9061 | + * on an applicable zonelist to find a contiguous pfn range which can then be |
---|
| 9062 | + * tried for allocation with alloc_contig_range(). This routine is intended |
---|
| 9063 | + * for allocation requests which can not be fulfilled with the buddy allocator. |
---|
| 9064 | + * |
---|
| 9065 | + * The allocated memory is always aligned to a page boundary. If nr_pages is a |
---|
| 9066 | + * power of two then the alignment is guaranteed to be to the given nr_pages |
---|
| 9067 | + * (e.g. 1GB request would be aligned to 1GB). |
---|
| 9068 | + * |
---|
| 9069 | + * Allocated pages can be freed with free_contig_range() or by manually calling |
---|
| 9070 | + * __free_page() on each allocated page. |
---|
| 9071 | + * |
---|
| 9072 | + * Return: pointer to contiguous pages on success, or NULL if not successful. |
---|
| 9073 | + */ |
---|
| 9074 | +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, |
---|
| 9075 | + int nid, nodemask_t *nodemask) |
---|
| 9076 | +{ |
---|
| 9077 | + unsigned long ret, pfn, flags; |
---|
| 9078 | + struct zonelist *zonelist; |
---|
| 9079 | + struct zone *zone; |
---|
| 9080 | + struct zoneref *z; |
---|
| 9081 | + |
---|
| 9082 | + zonelist = node_zonelist(nid, gfp_mask); |
---|
| 9083 | + for_each_zone_zonelist_nodemask(zone, z, zonelist, |
---|
| 9084 | + gfp_zone(gfp_mask), nodemask) { |
---|
| 9085 | + spin_lock_irqsave(&zone->lock, flags); |
---|
| 9086 | + |
---|
| 9087 | + pfn = ALIGN(zone->zone_start_pfn, nr_pages); |
---|
| 9088 | + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { |
---|
| 9089 | + if (pfn_range_valid_contig(zone, pfn, nr_pages)) { |
---|
| 9090 | + /* |
---|
| 9091 | + * We release the zone lock here because |
---|
| 9092 | + * alloc_contig_range() will also lock the zone |
---|
| 9093 | + * at some point. If there's an allocation |
---|
| 9094 | + * spinning on this lock, it may win the race |
---|
| 9095 | + * and cause alloc_contig_range() to fail... |
---|
| 9096 | + */ |
---|
| 9097 | + spin_unlock_irqrestore(&zone->lock, flags); |
---|
| 9098 | + ret = __alloc_contig_pages(pfn, nr_pages, |
---|
| 9099 | + gfp_mask); |
---|
| 9100 | + if (!ret) |
---|
| 9101 | + return pfn_to_page(pfn); |
---|
| 9102 | + spin_lock_irqsave(&zone->lock, flags); |
---|
| 9103 | + } |
---|
| 9104 | + pfn += nr_pages; |
---|
| 9105 | + } |
---|
| 9106 | + spin_unlock_irqrestore(&zone->lock, flags); |
---|
| 9107 | + } |
---|
| 9108 | + return NULL; |
---|
| 9109 | +} |
---|
| 9110 | +#endif /* CONFIG_CONTIG_ALLOC */ |
---|
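
Following the kernel-doc above, a hedged usage sketch; the 2 MB request size and local-node placement are only examples:

static struct page *example_alloc_2mb(void)
{
        const unsigned long nr_pages = SZ_2M >> PAGE_SHIFT;

        /* Order-0 pages, naturally aligned since nr_pages is a power of two. */
        return alloc_contig_pages(nr_pages, GFP_KERNEL, numa_node_id(), NULL);
}

static void example_free_2mb(struct page *page)
{
        free_contig_range(page_to_pfn(page), SZ_2M >> PAGE_SHIFT);
}
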
| 9111 | + |
---|
| 9112 | +void free_contig_range(unsigned long pfn, unsigned int nr_pages) |
---|
8312 | 9113 | { |
---|
8313 | 9114 | unsigned int count = 0; |
---|
8314 | 9115 | |
---|
.. | .. |
---|
8320 | 9121 | } |
---|
8321 | 9122 | WARN(count != 0, "%d pages are still in use!\n", count); |
---|
8322 | 9123 | } |
---|
8323 | | -#endif |
---|
| 9124 | +EXPORT_SYMBOL(free_contig_range); |
---|
8324 | 9125 | |
---|
8325 | 9126 | /* |
---|
8326 | 9127 | * The zone indicated has a new number of managed_pages; batch sizes and percpu |
---|
.. | .. |
---|
8328 | 9129 | */ |
---|
8329 | 9130 | void __meminit zone_pcp_update(struct zone *zone) |
---|
8330 | 9131 | { |
---|
8331 | | - unsigned cpu; |
---|
8332 | 9132 | mutex_lock(&pcp_batch_high_lock); |
---|
8333 | | - for_each_possible_cpu(cpu) |
---|
8334 | | - pageset_set_high_and_batch(zone, |
---|
8335 | | - per_cpu_ptr(zone->pageset, cpu)); |
---|
| 9133 | + __zone_pcp_update(zone); |
---|
8336 | 9134 | mutex_unlock(&pcp_batch_high_lock); |
---|
8337 | 9135 | } |
---|
8338 | 9136 | |
---|
.. | .. |
---|
8343 | 9141 | struct per_cpu_pageset *pset; |
---|
8344 | 9142 | |
---|
8345 | 9143 | /* avoid races with drain_pages() */ |
---|
8346 | | - local_lock_irqsave(pa_lock, flags); |
---|
| 9144 | + local_lock_irqsave(&pa_lock.l, flags); |
---|
8347 | 9145 | if (zone->pageset != &boot_pageset) { |
---|
8348 | 9146 | for_each_online_cpu(cpu) { |
---|
8349 | 9147 | pset = per_cpu_ptr(zone->pageset, cpu); |
---|
.. | .. |
---|
8352 | 9150 | free_percpu(zone->pageset); |
---|
8353 | 9151 | zone->pageset = &boot_pageset; |
---|
8354 | 9152 | } |
---|
8355 | | - local_unlock_irqrestore(pa_lock, flags); |
---|
| 9153 | + local_unlock_irqrestore(&pa_lock.l, flags); |
---|
8356 | 9154 | } |
---|
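
The pa_lock changes above switch to the &lt;linux/local_lock.h&gt; API, where lock and unlock take the address of a per-CPU local_lock_t. A minimal sketch of that pattern, using a hypothetical per-CPU counter (the names below are illustrative, not from this file):

```c
#include <linux/local_lock.h>
#include <linux/percpu.h>

struct demo_pcp {
	local_lock_t lock;
	unsigned long count;
};

static DEFINE_PER_CPU(struct demo_pcp, demo_pcp) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void demo_pcp_inc(void)
{
	unsigned long flags;

	/* Disables interrupts on !PREEMPT_RT; becomes a per-CPU lock on RT. */
	local_lock_irqsave(&demo_pcp.lock, flags);
	this_cpu_inc(demo_pcp.count);
	local_unlock_irqrestore(&demo_pcp.lock, flags);
}
```

In this file pa_lock evidently wraps its local_lock_t in a member named l, hence the &pa_lock.l form used above.
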
8357 | 9155 | |
---|
8358 | 9156 | #ifdef CONFIG_MEMORY_HOTREMOVE |
---|
8359 | 9157 | /* |
---|
8360 | | - * All pages in the range must be in a single zone and isolated |
---|
8361 | | - * before calling this. |
---|
| 9158 | + * The range must lie within a single zone, must not contain holes, must span |
---|
| 9159 | + * full sections, and all pages must be isolated before calling this function. |
---|
8362 | 9160 | */ |
---|
8363 | | -void |
---|
8364 | | -__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
---|
| 9161 | +void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
---|
8365 | 9162 | { |
---|
| 9163 | + unsigned long pfn = start_pfn; |
---|
8366 | 9164 | struct page *page; |
---|
8367 | 9165 | struct zone *zone; |
---|
8368 | | - unsigned int order, i; |
---|
8369 | | - unsigned long pfn; |
---|
| 9166 | + unsigned int order; |
---|
8370 | 9167 | unsigned long flags; |
---|
8371 | | - /* find the first valid pfn */ |
---|
8372 | | - for (pfn = start_pfn; pfn < end_pfn; pfn++) |
---|
8373 | | - if (pfn_valid(pfn)) |
---|
8374 | | - break; |
---|
8375 | | - if (pfn == end_pfn) |
---|
8376 | | - return; |
---|
| 9168 | + |
---|
8377 | 9169 | offline_mem_sections(pfn, end_pfn); |
---|
8378 | 9170 | zone = page_zone(pfn_to_page(pfn)); |
---|
8379 | 9171 | spin_lock_irqsave(&zone->lock, flags); |
---|
8380 | | - pfn = start_pfn; |
---|
8381 | 9172 | while (pfn < end_pfn) { |
---|
8382 | | - if (!pfn_valid(pfn)) { |
---|
8383 | | - pfn++; |
---|
8384 | | - continue; |
---|
8385 | | - } |
---|
8386 | 9173 | page = pfn_to_page(pfn); |
---|
8387 | 9174 | /* |
---|
8388 | 9175 | * The HWPoisoned page may be not in buddy system, and |
---|
.. | .. |
---|
8390 | 9177 | */ |
---|
8391 | 9178 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { |
---|
8392 | 9179 | pfn++; |
---|
8393 | | - SetPageReserved(page); |
---|
| 9180 | + continue; |
---|
| 9181 | + } |
---|
| 9182 | + /* |
---|
| 9183 | + * At this point all remaining PageOffline() pages have a |
---|
| 9184 | + * reference count of 0 and can simply be skipped. |
---|
| 9185 | + */ |
---|
| 9186 | + if (PageOffline(page)) { |
---|
| 9187 | + BUG_ON(page_count(page)); |
---|
| 9188 | + BUG_ON(PageBuddy(page)); |
---|
| 9189 | + pfn++; |
---|
8394 | 9190 | continue; |
---|
8395 | 9191 | } |
---|
8396 | 9192 | |
---|
8397 | 9193 | BUG_ON(page_count(page)); |
---|
8398 | 9194 | BUG_ON(!PageBuddy(page)); |
---|
8399 | | - order = page_order(page); |
---|
8400 | | -#ifdef CONFIG_DEBUG_VM |
---|
8401 | | - pr_info("remove from free list %lx %d %lx\n", |
---|
8402 | | - pfn, 1 << order, end_pfn); |
---|
8403 | | -#endif |
---|
8404 | | - list_del(&page->lru); |
---|
8405 | | - rmv_page_order(page); |
---|
8406 | | - zone->free_area[order].nr_free--; |
---|
8407 | | - for (i = 0; i < (1 << order); i++) |
---|
8408 | | - SetPageReserved((page+i)); |
---|
| 9195 | + order = buddy_order(page); |
---|
| 9196 | + del_page_from_free_list(page, zone, order); |
---|
8409 | 9197 | pfn += (1 << order); |
---|
8410 | 9198 | } |
---|
8411 | 9199 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
.. | .. |
---|
8423 | 9211 | for (order = 0; order < MAX_ORDER; order++) { |
---|
8424 | 9212 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
---|
8425 | 9213 | |
---|
8426 | | - if (PageBuddy(page_head) && page_order(page_head) >= order) |
---|
| 9214 | + if (PageBuddy(page_head) && buddy_order(page_head) >= order) |
---|
8427 | 9215 | break; |
---|
8428 | 9216 | } |
---|
8429 | 9217 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
.. | .. |
---|
8433 | 9221 | |
---|
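
The lookup above (and take_page_off_buddy() below) finds the candidate buddy head with page - (pfn & ((1 << order) - 1)), i.e. by rounding the pfn down to the natural alignment of an order-sized block. A small worked sketch of that arithmetic; the helper name is hypothetical, not from this file:

```c
/*
 * Start pfn of the naturally aligned, order-sized block containing @pfn.
 * Example: pfn = 0x12345, order = 3 -> 0x12345 & ~0x7UL = 0x12340, so the
 * candidate buddy head covers pfns 0x12340..0x12347.
 */
static inline unsigned long buddy_block_start_pfn(unsigned long pfn,
						  unsigned int order)
{
	return pfn & ~((1UL << order) - 1);
}
```
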
8434 | 9222 | #ifdef CONFIG_MEMORY_FAILURE |
---|
8435 | 9223 | /* |
---|
8436 | | - * Set PG_hwpoison flag if a given page is confirmed to be a free page. This |
---|
8437 | | - * test is performed under the zone lock to prevent a race against page |
---|
8438 | | - * allocation. |
---|
| 9224 | + * Break a higher-order page down into sub-pages, and keep the target page |
---|
| 9225 | + * out of the buddy allocator. |
---|
8439 | 9226 | */ |
---|
8440 | | -bool set_hwpoison_free_buddy_page(struct page *page) |
---|
| 9227 | +static void break_down_buddy_pages(struct zone *zone, struct page *page, |
---|
| 9228 | + struct page *target, int low, int high, |
---|
| 9229 | + int migratetype) |
---|
| 9230 | +{ |
---|
| 9231 | + unsigned long size = 1 << high; |
---|
| 9232 | + struct page *current_buddy, *next_page; |
---|
| 9233 | + |
---|
| 9234 | + while (high > low) { |
---|
| 9235 | + high--; |
---|
| 9236 | + size >>= 1; |
---|
| 9237 | + |
---|
| 9238 | + if (target >= &page[size]) { |
---|
| 9239 | + next_page = page + size; |
---|
| 9240 | + current_buddy = page; |
---|
| 9241 | + } else { |
---|
| 9242 | + next_page = page; |
---|
| 9243 | + current_buddy = page + size; |
---|
| 9244 | + } |
---|
| 9245 | + |
---|
| 9246 | + if (set_page_guard(zone, current_buddy, high, migratetype)) |
---|
| 9247 | + continue; |
---|
| 9248 | + |
---|
| 9249 | + if (current_buddy != target) { |
---|
| 9250 | + add_to_free_list(current_buddy, zone, high, migratetype); |
---|
| 9251 | + set_buddy_order(current_buddy, high); |
---|
| 9252 | + page = next_page; |
---|
| 9253 | + } |
---|
| 9254 | + } |
---|
| 9255 | +} |
---|
| 9256 | + |
---|
| 9257 | +/* |
---|
| 9258 | + * Take a page that will be marked as poisoned off the buddy allocator. |
---|
| 9259 | + */ |
---|
| 9260 | +bool take_page_off_buddy(struct page *page) |
---|
8441 | 9261 | { |
---|
8442 | 9262 | struct zone *zone = page_zone(page); |
---|
8443 | 9263 | unsigned long pfn = page_to_pfn(page); |
---|
8444 | 9264 | unsigned long flags; |
---|
8445 | 9265 | unsigned int order; |
---|
8446 | | - bool hwpoisoned = false; |
---|
| 9266 | + bool ret = false; |
---|
8447 | 9267 | |
---|
8448 | 9268 | spin_lock_irqsave(&zone->lock, flags); |
---|
8449 | 9269 | for (order = 0; order < MAX_ORDER; order++) { |
---|
8450 | 9270 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
---|
| 9271 | + int page_order = buddy_order(page_head); |
---|
8451 | 9272 | |
---|
8452 | | - if (PageBuddy(page_head) && page_order(page_head) >= order) { |
---|
8453 | | - if (!TestSetPageHWPoison(page)) |
---|
8454 | | - hwpoisoned = true; |
---|
| 9273 | + if (PageBuddy(page_head) && page_order >= order) { |
---|
| 9274 | + unsigned long pfn_head = page_to_pfn(page_head); |
---|
| 9275 | + int migratetype = get_pfnblock_migratetype(page_head, |
---|
| 9276 | + pfn_head); |
---|
| 9277 | + |
---|
| 9278 | + del_page_from_free_list(page_head, zone, page_order); |
---|
| 9279 | + break_down_buddy_pages(zone, page_head, page, 0, |
---|
| 9280 | + page_order, migratetype); |
---|
| 9281 | + if (!is_migrate_isolate(migratetype)) |
---|
| 9282 | + __mod_zone_freepage_state(zone, -1, migratetype); |
---|
| 9283 | + ret = true; |
---|
8455 | 9284 | break; |
---|
8456 | 9285 | } |
---|
| 9286 | + if (page_count(page_head) > 0) |
---|
| 9287 | + break; |
---|
8457 | 9288 | } |
---|
8458 | 9289 | spin_unlock_irqrestore(&zone->lock, flags); |
---|
8459 | | - |
---|
8460 | | - return hwpoisoned; |
---|
| 9290 | + return ret; |
---|
8461 | 9291 | } |
---|
8462 | 9292 | #endif |
---|
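
To see how break_down_buddy_pages() carves a poisoned page out of a high-order block, here is a stand-alone user-space sketch (not kernel code) that mirrors the loop with page indices instead of struct page pointers and omits the debug_pagealloc guard-page branch; all names are illustrative:

```c
#include <stdio.h>

/* Simulate splitting an order-@high block starting at index @base so that
 * the page at index @target never lands on a free list. */
static void simulate_break_down(unsigned long base, unsigned long target,
				int low, int high)
{
	unsigned long size = 1UL << high;
	unsigned long page = base, current_buddy, next_page;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= page + size) {	/* target in upper half */
			next_page = page + size;
			current_buddy = page;
		} else {			/* target in lower half */
			next_page = page;
			current_buddy = page + size;
		}

		if (current_buddy != target) {
			printf("free order-%d block: %lu..%lu\n",
			       high, current_buddy, current_buddy + size - 1);
			page = next_page;
		}
	}
}

int main(void)
{
	/* Order-3 block at index 0; keep index 5 off the free lists. */
	simulate_break_down(0, 5, 0, 3);
	return 0;
}
```

This prints an order-2 block 0..3, an order-1 block 6..7 and an order-0 block 4..4: every page except the target goes back to the free lists, which is what take_page_off_buddy() relies on for the page that will be marked as poisoned.
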
| 9293 | + |
---|
| 9294 | +#ifdef CONFIG_ZONE_DMA |
---|
| 9295 | +bool has_managed_dma(void) |
---|
| 9296 | +{ |
---|
| 9297 | + struct pglist_data *pgdat; |
---|
| 9298 | + |
---|
| 9299 | + for_each_online_pgdat(pgdat) { |
---|
| 9300 | + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; |
---|
| 9301 | + |
---|
| 9302 | + if (managed_zone(zone)) |
---|
| 9303 | + return true; |
---|
| 9304 | + } |
---|
| 9305 | + return false; |
---|
| 9306 | +} |
---|
| 9307 | +#endif /* CONFIG_ZONE_DMA */ |
---|
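
has_managed_dma() lets callers check whether a GFP_DMA allocation can ever be satisfied before requesting one. A hedged usage sketch, assuming the usual static-inline stub that returns false when CONFIG_ZONE_DMA is disabled; the function below is hypothetical, not part of this file:

```c
/* Only ask for ZONE_DMA memory when at least one DMA zone actually has
 * managed pages; otherwise fall back to a normal allocation. */
static gfp_t demo_pick_gfp(void)
{
	return has_managed_dma() ? GFP_KERNEL | GFP_DMA : GFP_KERNEL;
}
```
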