From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/mm/page_alloc.c | 3616 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 2263 insertions(+), 1353 deletions(-)

diff --git a/kernel/mm/page_alloc.c b/kernel/mm/page_alloc.c
index f5d8cf7..3bcee27 100644
--- a/kernel/mm/page_alloc.c
+++ b/kernel/mm/page_alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/page_alloc.c
  *
@@ -16,11 +17,11 @@
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
-#include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -43,12 +44,12 @@
 #include <linux/mempolicy.h>
 #include <linux/memremap.h>
 #include <linux/stop_machine.h>
+#include <linux/random.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/page_ext.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -61,18 +62,63 @@
 #include <linux/sched/rt.h>
 #include <linux/sched/mm.h>
 #include <linux/page_owner.h>
+#include <linux/page_pinner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
-#include <linux/khugepaged.h>
 #include <linux/psi.h>
+#include <linux/padata.h>
+#include <linux/khugepaged.h>
+#include <trace/hooks/mm.h>
+#include <trace/hooks/vmscan.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
+#include "shuffle.h"
+#include "page_reporting.h"
+
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
+/*
+ * Don't poison memory with KASAN (only for the tag-based modes).
+ * During boot, all non-reserved memblock memory is exposed to page_alloc.
+ * Poisoning all that memory lengthens boot time, especially on systems with
+ * large amount of RAM. This flag is used to skip that poisoning.
+ * This is only done for the tag-based KASAN modes, as those are able to
+ * detect memory corruptions with the memory tags assigned by default.
+ * All memory allocated normally after boot gets poisoned as usual.
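As a stand-alone illustration of how the fpi_t bit flags defined above are combined by callers and tested by callees (a plain C sketch, not part of the patch: BIT() is stubbed locally and the sparse-only __bitwise/__force annotations are dropped):

#include <stdio.h>

#define BIT(n)			(1UL << (n))

typedef int fpi_t;		/* __bitwise only matters to sparse */

#define FPI_NONE		((fpi_t)0)
#define FPI_SKIP_REPORT_NOTIFY	((fpi_t)BIT(0))
#define FPI_TO_TAIL		((fpi_t)BIT(1))
#define FPI_SKIP_KASAN_POISON	((fpi_t)BIT(2))

static void free_one(fpi_t fpi_flags)
{
	/* Callers OR flags together; callees test individual bits. */
	if (fpi_flags & FPI_TO_TAIL)
		printf("place page at the freelist tail\n");
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		printf("notify free page reporting\n");
}

int main(void)
{
	free_one(FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
	return 0;
}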
+ */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -94,12 +140,15 @@ */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); -int _node_numa_mem_[MAX_NUMNODES]; #endif /* work_structs for global per-cpu drains */ -DEFINE_MUTEX(pcpu_drain_mutex); -DEFINE_PER_CPU(struct work_struct, pcpu_drain); +struct pcpu_drain { + struct zone *zone; + struct work_struct work; +}; +static DEFINE_MUTEX(pcpu_drain_mutex); +static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -123,62 +172,33 @@ }; EXPORT_SYMBOL(node_states); -/* Protect totalram_pages and zone->managed_pages */ -static DEFINE_SPINLOCK(managed_page_count_lock); - -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_alloc); -#else DEFINE_STATIC_KEY_FALSE(init_on_alloc); -#endif EXPORT_SYMBOL(init_on_alloc); -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_free); -#else DEFINE_STATIC_KEY_FALSE(init_on_free); -#endif EXPORT_SYMBOL(init_on_free); +static bool _init_on_alloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) { - int ret; - bool bool_result; - if (!buf) - return -EINVAL; - ret = kstrtobool(buf, &bool_result); - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); - if (bool_result) - static_branch_enable(&init_on_alloc); - else - static_branch_disable(&init_on_alloc); - return ret; + return kstrtobool(buf, &_init_on_alloc_enabled_early); } early_param("init_on_alloc", early_init_on_alloc); +static bool _init_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); static int __init early_init_on_free(char *buf) { - int ret; - bool bool_result; - - if (!buf) - return -EINVAL; - ret = kstrtobool(buf, &bool_result); - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); - if (bool_result) - static_branch_enable(&init_on_free); - else - static_branch_disable(&init_on_free); - return ret; + return kstrtobool(buf, &_init_on_free_enabled_early); } early_param("init_on_free", early_init_on_free); @@ -242,7 +262,8 @@ unsigned int pageblock_order __read_mostly; #endif -static void __free_pages_ok(struct page *page, unsigned int order); +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -269,8 +290,6 @@ [ZONE_MOVABLE] = 0, }; -EXPORT_SYMBOL(totalram_pages); - static char * const zone_names[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA "DMA", @@ -288,7 +307,7 @@ #endif }; -char * const migratetype_names[MIGRATE_TYPES] = { +const char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Movable", "Reclaimable", @@ -301,14 +320,14 @@ #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const 
compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; @@ -319,6 +338,20 @@ */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_DISCONTIGMEM +/* + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges + * are not on separate NUMA nodes. Functionally this works but with + * watermark_boost_factor, it can reclaim prematurely as the ranges can be + * quite small. By default, do not boost watermarks on discontigmem as in + * many cases very high-order allocations like THP are likely to be + * unsupported and the premature reclaim offsets the advantage of long-term + * fragmentation avoidance. + */ +int watermark_boost_factor __read_mostly; +#else +int watermark_boost_factor __read_mostly = 15000; +#endif int watermark_scale_factor = 10; /* @@ -328,28 +361,26 @@ */ int extra_free_kbytes = 0; -static unsigned long nr_kernel_pages __meminitdata; -static unsigned long nr_all_pages __meminitdata; -static unsigned long dma_reserve __meminitdata; +static unsigned long nr_kernel_pages __initdata; +static unsigned long nr_all_pages __initdata; +static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; -static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; static unsigned long required_kernelcore_percent __initdata; static unsigned long required_movablecore __initdata; static unsigned long required_movablecore_percent __initdata; -static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly = MAX_NUMNODES; -int nr_online_nodes __read_mostly = 1; +unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; +unsigned int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif @@ -365,7 +396,7 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); /* - * Calling kasan_free_pages() only after deferred memory initialization + * Calling kasan_poison_pages() only after deferred memory initialization * has completed. Poisoning pages during deferred memory init will greatly * lengthen the process and cause problem in large memory systems as the * deferred pages initialization is done with interrupt disabled. @@ -377,10 +408,12 @@ * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. 
*/ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + return static_branch_unlikely(&deferred_pages) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -395,38 +428,57 @@ } /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. */ -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - /* Always populate low zones for address-constrained allocations */ - if (zone_end < pgdat_end_pfn(pgdat)) - return true; - (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_pgcnt) && - (pfn & (PAGES_PER_SECTION - 1)) == 0) { - pgdat->first_deferred_pfn = pfn; - return false; + static unsigned long prev_end_pfn, nr_initialised; + + /* + * prev_end_pfn static that contains the end of previous zone + * No need to protect because called very early in boot before smp_init. + */ + if (prev_end_pfn != end_pfn) { + prev_end_pfn = end_pfn; + nr_initialised = 0; } - return true; + /* Always populate low zones for address-constrained allocations */ + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) + return false; + + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. 
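A minimal user-space sketch of the deferral rule implemented by defer_init() above. The PAGES_PER_SECTION value, the flattened node state and the pfn numbers in main() are assumptions made only for this example:

#include <assert.h>
#include <limits.h>
#include <stdbool.h>

#define PAGES_PER_SECTION	32768UL		/* assumed value */

static unsigned long first_deferred_pfn = ULONG_MAX;	/* stands in for NODE_DATA() state */

static bool defer_init(unsigned long pfn, unsigned long zone_end_pfn,
		       unsigned long node_end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	if (prev_end_pfn != zone_end_pfn) {	/* entered a new zone */
		prev_end_pfn = zone_end_pfn;
		nr_initialised = 0;
	}

	/* Low (address-constrained) zones are always initialised eagerly. */
	if (zone_end_pfn < node_end_pfn)
		return false;

	/* Once a deferral point is recorded, everything after it is deferred. */
	if (first_deferred_pfn != ULONG_MAX)
		return true;

	/* Populate one section eagerly, then defer at a section boundary. */
	nr_initialised++;
	if (nr_initialised > PAGES_PER_SECTION &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		first_deferred_pfn = pfn;
		return true;
	}
	return false;
}

int main(void)
{
	/* A low zone ending before the node end is never deferred. */
	assert(!defer_init(0x1000, 0x100000, 0x800000));
	return 0;
}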
+ */ + nr_initialised++; + if ((nr_initialised > PAGES_PER_SECTION) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + NODE_DATA(nid)->first_deferred_pfn = pfn; + return true; + } + return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} static inline bool early_page_uninitialised(unsigned long pfn) { return false; } -static inline bool update_defer_init(pg_data_t *pgdat, - unsigned long pfn, unsigned long zone_end, - unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { - return true; + return false; } #endif @@ -435,7 +487,7 @@ unsigned long pfn) { #ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; + return section_to_usemap(__pfn_to_section(pfn)); #else return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ @@ -445,25 +497,23 @@ { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } /** * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve * @mask: mask of bits that the caller is interested in * * Return: pageblock_bits flags */ -static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, +static __always_inline +unsigned long __get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, unsigned long mask) { unsigned long *bitmap; @@ -476,20 +526,36 @@ bitidx &= (BITS_PER_LONG-1); word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; + return (word >> bitidx) & mask; } unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, unsigned long mask) { - return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); + return __get_pfnblock_flags_mask(page, pfn, mask); } +EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask); + +int isolate_anon_lru_page(struct page *page) +{ + int ret; + + if (!PageLRU(page) || !PageAnon(page)) + return -EINVAL; + + if (!get_page_unless_zero(page)) + return -EINVAL; + + ret = isolate_lru_page(page); + put_page(page); + + return ret; +} +EXPORT_SYMBOL_GPL(isolate_anon_lru_page); static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) { - return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); + return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } /** @@ -497,12 +563,10 @@ * @page: The page within the block of interest * @flags: The flags to set * @pfn: The target page frame number - * @end_bitidx: The last bit of interest * @mask: mask of bits that the caller is interested in */ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long pfn, - unsigned long end_bitidx, unsigned long mask) { unsigned long *bitmap; @@ -510,6 +574,7 @@ unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + BUILD_BUG_ON(MIGRATE_TYPES > (1 << 
PB_migratetype_bits)); bitmap = get_pageblock_bitmap(page, pfn); bitidx = pfn_to_bitidx(page, pfn); @@ -518,9 +583,8 @@ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); + mask <<= bitidx; + flags <<= bitidx; word = READ_ONCE(bitmap[word_bitidx]); for (;;) { @@ -537,8 +601,8 @@ migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; - set_pageblock_flags_group(page, (unsigned long)migratetype, - PB_migrate, PB_migrate_end); + set_pfnblock_flags_mask(page, (unsigned long)migratetype, + page_to_pfn(page), MIGRATETYPE_MASK); } #ifdef CONFIG_DEBUG_VM @@ -593,8 +657,7 @@ } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -623,10 +686,6 @@ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -654,7 +713,8 @@ void free_compound_page(struct page *page) { - __free_pages_ok(page, compound_order(page)); + mem_cgroup_uncharge(page); + __free_pages_ok(page, compound_order(page), FPI_NONE); } void prep_compound_page(struct page *page, unsigned int order) @@ -662,8 +722,6 @@ int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; @@ -671,51 +729,30 @@ p->mapping = TAIL_MAPPING; set_compound_head(p, page); } + + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); } #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly + +bool _debug_pagealloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); EXPORT_SYMBOL(_debug_pagealloc_enabled); -bool _debug_guardpage_enabled __read_mostly; + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static int __init early_debug_pagealloc(char *buf) { - if (!buf) - return -EINVAL; - return kstrtobool(buf, &_debug_pagealloc_enabled); + return kstrtobool(buf, &_debug_pagealloc_enabled_early); } early_param("debug_pagealloc", early_debug_pagealloc); - -static bool need_debug_guardpage(void) -{ - /* If we don't use debug_pagealloc, we don't need guard page */ - if (!debug_pagealloc_enabled()) - return false; - - if (!debug_guardpage_minorder()) - return false; - - return true; -} - -static void init_debug_guardpage(void) -{ - if (!debug_pagealloc_enabled()) - return; - - if (!debug_guardpage_minorder()) - return; - - _debug_guardpage_enabled = true; -} - -struct page_ext_operations debug_guardpage_ops = { - .need = need_debug_guardpage, - .init = init_debug_guardpage, -}; static int __init debug_guardpage_minorder_setup(char *buf) { @@ -734,20 +771,13 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return false; if (order >= 
debug_guardpage_minorder()) return false; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); - + __SetPageGuard(page); INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ @@ -759,39 +789,77 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - struct page_ext *page_ext; - if (!debug_guardpage_enabled()) return; - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + __ClearPageGuard(page); set_page_private(page, 0); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif -static inline void set_page_order(struct page *page, unsigned int order) +/* + * Enable static keys related to various memory debugging and hardening options. + * Some override others, and depend on early params that are evaluated in the + * order of appearance. So we need to first gather the full picture of what was + * enabled, and then make decisions. + */ +void init_mem_debugging_and_hardening(void) +{ + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + + if (_init_on_alloc_enabled_early) { + if (page_poisoning_requested) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { + if (page_poisoning_requested) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; + + static_branch_enable(&_debug_pagealloc_enabled); + + if (!debug_guardpage_minorder()) + return; + + static_branch_enable(&_debug_guardpage_enabled); +#endif +} + +static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); -} - -static inline void rmv_page_order(struct page *page) -{ - __ClearPageBuddy(page); - set_page_private(page, 0); } /* @@ -807,32 +875,151 @@ * * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, struct page *buddy, +static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (page_is_guard(buddy) && page_order(buddy) == order) { - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; + if (!page_is_guard(buddy) && !PageBuddy(buddy)) + return false; - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + if (buddy_order(buddy) != order) + return false; - return 1; - } + /* + * zone check is done late to avoid uselessly calculating + * zone/node ids for pages that could never merge. 
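The reworked page_is_buddy() above leans on the usual buddy-pfn arithmetic: the buddy of a 2^order block is the pfn with bit 'order' flipped, and ANDing a pfn with its buddy's pfn gives the start of the merged 2^(order+1) block. A worked sketch (find_buddy_pfn() here is a local reimplementation under that assumption, not the kernel's __find_buddy_pfn()):

#include <assert.h>
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip the bit that selects this half */
}

int main(void)
{
	unsigned long pfn = 0x1240;			/* start of an order-4 block */
	unsigned long buddy_pfn = find_buddy_pfn(pfn, 4);
	unsigned long combined_pfn = buddy_pfn & pfn;	/* start of the merged block */

	assert(buddy_pfn == 0x1250);
	assert(combined_pfn == 0x1240);
	printf("order-4 blocks at %#lx and %#lx merge into an order-5 block at %#lx\n",
	       pfn, buddy_pfn, combined_pfn);
	return 0;
}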
+ */ + if (page_zone_id(page) != page_zone_id(buddy)) + return false; - if (PageBuddy(buddy) && page_order(buddy) == order) { - /* - * zone check is done late to avoid uselessly - * calculating zone/node ids for pages that could - * never merge. - */ - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return true; +} - return 1; - } - return 0; +#ifdef CONFIG_COMPACTION +static inline struct capture_control *task_capc(struct zone *zone) +{ + struct capture_control *capc = current->capture_control; + + return unlikely(capc) && + !(current->flags & PF_KTHREAD) && + !capc->page && + capc->cc->zone == zone ? capc : NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + if (!capc || order != capc->cc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions*/ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + /* + * Do not let lower order allocations polluate a movable pageblock. + * This might let an unmovable request use a reclaimable pageblock + * and vice-versa but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + return false; + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + +/* Used for pages not on another list */ +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* + * Used for pages which are on another list. Move the pages to the tail + * of the list - so the moved pages won't immediately be considered for + * allocation again (e.g., optimization for memory onlining). + */ +static inline void move_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_move_tail(&page->lru, &area->free_list[migratetype]); +} + +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order) +{ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); + + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + zone->free_area[order].nr_free--; +} + +/* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. 
In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ +static inline bool +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, + struct page *page, unsigned int order) +{ + struct page *higher_page, *higher_buddy; + unsigned long combined_pfn; + + if (order >= MAX_ORDER - 2) + return false; + + if (!pfn_valid_within(buddy_pfn)) + return false; + + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + + return pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1); } /* @@ -862,12 +1049,14 @@ static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype) + int migratetype, fpi_t fpi_flags) { + struct capture_control *capc = task_capc(zone); + unsigned long buddy_pfn; unsigned long combined_pfn; - unsigned long uninitialized_var(buddy_pfn); - struct page *buddy; unsigned int max_order; + struct page *buddy; + bool to_tail; max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); @@ -883,6 +1072,11 @@ continue_merging: while (order < max_order) { + if (compaction_capture(capc, page, order, migratetype)) { + __mod_zone_freepage_state(zone, -(1 << order), + migratetype); + return; + } buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -894,13 +1088,10 @@ * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. */ - if (page_is_guard(buddy)) { + if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); - } else { - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); - } + else + del_page_from_free_list(buddy, zone, order); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -932,33 +1123,23 @@ } done_merging: - set_page_order(page, order); + set_buddy_order(page, order); - /* - * If this is not the largest possible page, check if the buddy - * of the next-highest order is free. If it is, it's possible - * that pages are being freed that will coalesce soon. 
In case, - * that is happening, add the free page to the tail of the list - * so it's less likely to be used soon and more likely to be merged - * as a higher order page - */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { - struct page *higher_page, *higher_buddy; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1)) { - list_add_tail(&page->lru, - &zone->free_area[order].free_list[migratetype]); - goto out; - } - } + if (fpi_flags & FPI_TO_TAIL) + to_tail = true; + else if (is_shuffle_order(order)) + to_tail = shuffle_pick_tail(); + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); -out: - zone->free_area[order].nr_free++; + if (to_tail) + add_to_free_list_tail(page, zone, order, migratetype); + else + add_to_free_list(page, zone, order, migratetype); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) + page_reporting_notify_free(order); } /* @@ -983,13 +1164,9 @@ return true; } -static void free_pages_check_bad(struct page *page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -997,24 +1174,32 @@ bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + return bad_reason; } -static inline int free_pages_check(struct page *page) +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); +} + +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } @@ -1036,7 +1221,7 @@ case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1048,17 +1233,17 @@ break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -1068,25 +1253,48 @@ return ret; } -static void kernel_init_free_pages(struct page *page, int 
numpages) +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; + if (zero_tags) { + for (i = 0; i < numpages; i++) + tag_clear_highpage(page + i); + return; + } + /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) + for (i = 0; i < numpages; i++) { + u8 tag = page_kasan_tag(page + i); + page_kasan_tag_reset(page + i); clear_highpage(page + i); + page_kasan_tag_set(page + i, tag); + } kasan_enable_current(); } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + + if (unlikely(PageHWPoison(page)) && !order) { + /* + * Do not let hwpoison pages hit pcplists/buddy + * Untie memcg state and reset page's owner + */ + if (memcg_kmem_enabled() && PageKmemcg(page)) + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + free_page_pinner(page, order); + return false; + } /* * Check tail pages before head page information is cleared to @@ -1103,7 +1311,7 @@ for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1113,15 +1321,16 @@ if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + free_page_pinner(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1129,36 +1338,77 @@ debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - arch_free_page(page, order); - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); - kasan_free_nondeferred_pages(page, order); + kernel_poison_pages(page, 1 << order); + + /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * + * With hardware tag-based KASAN, memory tags must be set before the + * page becomes unavailable via debug_pagealloc or arch_free_page. + */ + if (kasan_has_integrated_init()) { + if (!skip_kasan_poison) + kasan_free_pages(page, order); + } else { + bool init = want_init_on_free(); + + if (init) + kernel_init_free_pages(page, 1 << order, false); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); + } + + /* + * arch_free_page() can make the page's contents inaccessible. s390 + * does this. So nothing which can access the page's contents should + * happen after this. + */ + arch_free_page(page, order); + + debug_pagealloc_unmap_pages(page, 1 << order); return true; } #ifdef CONFIG_DEBUG_VM -static inline bool free_pcp_prepare(struct page *page) -{ - return free_pages_prepare(page, 0, true); -} - -static inline bool bulkfree_pcp_prepare(struct page *page) -{ - return false; -} -#else +/* + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed + * to pcp lists. 
With debug_pagealloc also enabled, they are also rechecked when + * moved from pcp lists to free lists. + */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, 0, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + if (debug_pagealloc_enabled_static()) + return check_free_page(page); + else + return false; +} +#else +/* + * With DEBUG_VM disabled, order-0 pages being freed are checked only when + * moving from pcp lists to free list in order to reduce overhead. With + * debug_pagealloc enabled, they are checked also immediately when being freed + * to the pcp lists. + */ +static bool free_pcp_prepare(struct page *page) +{ + if (debug_pagealloc_enabled_static()) + return free_pages_prepare(page, 0, true, FPI_NONE); + else + return free_pages_prepare(page, 0, false, FPI_NONE); +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1258,7 +1508,7 @@ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1267,14 +1517,14 @@ static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, - int migratetype) + int migratetype, fpi_t fpi_flags) { spin_lock(&zone->lock); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock(&zone->lock); } @@ -1348,33 +1598,50 @@ /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); - SetPageReserved(page); + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. + */ + __SetPageReserved(page); } } } -static void __free_pages_ok(struct page *page, unsigned int order) +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); + bool skip_free_unref_page = false; - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); + trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page); + if (skip_free_unref_page) + return; + local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; unsigned int loop; + /* + * When initializing the memmap, __init_single_page() sets the refcount + * of all pages to 1 ("allocated"/"not free"). We have to set the + * refcount of all involved pages to 0. 
+ */ prefetchw(p); for (loop = 0; loop < (nr_pages - 1); loop++, p++) { prefetchw(p + 1); @@ -1384,15 +1651,43 @@ __ClearPageReserved(p); set_page_count(p, 0); - page_zone(page)->managed_pages += nr_pages; - set_page_refcounted(page); - __free_pages(page, order); + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + + /* + * Bypass PCP and place fresh pages right to the tail, primarily + * relevant for memory onlining. + */ + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) +{ + unsigned long start_pfn, end_pfn; + int nid; + + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } + + return nid; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { @@ -1407,48 +1702,14 @@ return nid; } -#endif +#endif /* CONFIG_NEED_MULTIPLE_NODES */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - int nid; - - nid = __early_pfn_to_nid(pfn, state); - if (nid >= 0 && nid != node) - return false; - return true; -} - -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - -#else - -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; -} -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - return true; -} -#endif - - -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + __free_pages_core(page, order); } /* @@ -1539,14 +1800,14 @@ if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); + __free_pages_core(page, 0); } } @@ -1569,20 +1830,12 @@ * * Then, we check if a current large page is valid by only checking the validity * of the head pfn. - * - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not belong - * to this memory node. 
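The __early_pfn_to_nid() helper added above is a one-entry range cache in front of a memblock search. A self-contained sketch of the same caching pattern, with the memblock lookup replaced by a hard-coded two-node layout (all values here are assumptions for the example):

#include <stdio.h>

#define NUMA_NO_NODE	(-1)
#define FAKE_NODE_SPAN	0x40000UL	/* pfns per fake node */

struct pfnnid_cache {
	unsigned long last_start;
	unsigned long last_end;
	int last_nid;
};

/* Stand-in for memblock_search_pfn_nid(): two fake nodes back to back. */
static int search_pfn_nid(unsigned long pfn, unsigned long *start,
			  unsigned long *end)
{
	if (pfn >= 2 * FAKE_NODE_SPAN)
		return NUMA_NO_NODE;
	*start = pfn < FAKE_NODE_SPAN ? 0 : FAKE_NODE_SPAN;
	*end = *start + FAKE_NODE_SPAN;
	return pfn < FAKE_NODE_SPAN ? 0 : 1;
}

static int cached_pfn_to_nid(unsigned long pfn, struct pfnnid_cache *state)
{
	unsigned long start, end;
	int nid;

	/* Fast path: the previously found range covers this pfn too. */
	if (state->last_start <= pfn && pfn < state->last_end)
		return state->last_nid;

	nid = search_pfn_nid(pfn, &start, &end);
	if (nid != NUMA_NO_NODE) {	/* remember the whole matching range */
		state->last_start = start;
		state->last_end = end;
		state->last_nid = nid;
	}
	return nid;
}

int main(void)
{
	struct pfnnid_cache cache = { .last_nid = NUMA_NO_NODE };

	printf("pfn 0x100 -> node %d\n", cached_pfn_to_nid(0x100, &cache));
	printf("pfn 0x200 -> node %d (served from the cache)\n",
	       cached_pfn_to_nid(0x200, &cache));
	return 0;
}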
*/ -static inline bool __init -deferred_pfn_valid(int nid, unsigned long pfn, - struct mminit_pfnnid_cache *nid_init_state) +static inline bool __init deferred_pfn_valid(unsigned long pfn) { if (!pfn_valid_within(pfn)) return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) - return false; - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) return false; return true; } @@ -1591,21 +1844,19 @@ * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. */ -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, +static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1619,22 +1870,22 @@ * by performing it only once every pageblock_nr_pages. * Return number of pages initialized. */ -static unsigned long __init deferred_init_pages(int nid, int zid, +static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; + int zid = zone_idx(zone); struct page *page = NULL; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { page = NULL; continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1644,18 +1895,127 @@ return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, then free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. 
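The "broken along max page order boundaries" scheme described above amounts to cutting the pfn range at the next MAX_ORDER_NR_PAGES boundary on every iteration. A worked sketch with an assumed block size:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES	1024UL				/* assumed: order-10 blocks */
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))	/* a must be a power of two */

int main(void)
{
	unsigned long spfn = 1000, epfn = 5000;

	while (spfn < epfn) {
		/* Next max-order boundary strictly above spfn. */
		unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
		unsigned long chunk_end = mo_pfn < epfn ? mo_pfn : epfn;

		printf("init + free pfns [%lu, %lu)\n", spfn, chunk_end);
		spfn = chunk_end;
	}
	return 0;
}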
+ */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + nr_pages += deferred_init_pages(zone, *start_pfn, t); + + if (mo_pfn < *end_pfn) { + *start_pfn = mo_pfn; + break; + } + } + + /* Reset values and now loop through freeing pages as needed */ + swap(j, *i); + + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + deferred_free_pages(spfn, t); + + if (mo_pfn <= epfn) + break; + } + + return nr_pages; +} + +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn; + struct zone *zone = arg; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator. + */ + while (spfn < end_pfn) { + deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } +} + +/* An arch may override for more concurrency. */ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; - unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; - int zid; - struct zone *zone; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0; + unsigned long first_init_pfn, flags; + unsigned long start = jiffies; + struct zone *zone; + int zid, max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1688,30 +2048,36 @@ if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); - /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). 
- */ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); - } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - } + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) + goto zone_empty; + max_threads = deferred_page_init_max_threads(cpumask); + + while (spfn < epfn) { + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_chunk, + .fn_arg = zone, + .start = spfn, + .size = epfn_align - spfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + }; + + padata_do_multithreaded(&job); + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + epfn_align); + } +zone_empty: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d deferred pages initialised in %ums\n", + pgdat->node_id, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1735,14 +2101,11 @@ static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int zid = zone_idx(zone); - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; + pg_data_t *pgdat = zone->zone_pgdat; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1760,38 +2123,37 @@ return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { + pgdat->first_deferred_pfn = ULONG_MAX; pgdat_resize_unlock(pgdat, &flags); - return false; + /* Retry only once. */ + return first_deferred_pfn != ULONG_MAX; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. 
+ */ + while (spfn < epfn) { + /* update our first deferred PFN for this section */ + first_deferred_pfn = spfn; - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(nid, zid, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); + /* We should only stop along section boundaries */ + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) + continue; + + /* If our quota has been met we can stop here */ if (nr_pages >= nr_pages_needed) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } - pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; @@ -1814,9 +2176,9 @@ void __init page_alloc_init_late(void) { struct zone *zone; + int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - int nid; /* There will be num_node_state(N_MEMORY) threads */ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); @@ -1844,10 +2206,12 @@ /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif + + for_each_node_state(nid, N_MEMORY) + shuffle_free_memory(NODE_DATA(nid)); for_each_populated_zone(zone) set_zone_contiguous(zone); @@ -1881,6 +2245,7 @@ } adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; } #endif @@ -1899,13 +2264,11 @@ * -- nyc */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area, - int migratetype) + int low, int high, int migratetype) { unsigned long size = 1 << high; while (high > low) { - area--; high--; size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); @@ -1919,39 +2282,21 @@ if (set_page_guard(zone, &page[size], high, migratetype)) continue; - list_add(&page[size].lru, &area->free_list[migratetype]); - area->nr_free++; - set_page_order(&page[size], high); + add_to_free_list(&page[size], zone, high, migratetype); + set_buddy_order(&page[size], high); } } static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; - - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason, bad_flags); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* @@ -1967,30 +2312,40 @@ return 1; } -static inline bool 
free_pages_prezeroed(void) -{ - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled()) || want_init_on_free(); -} - #ifdef CONFIG_DEBUG_VM -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM enabled, order-0 pages are checked for expected state when + * being allocated from pcp lists. With debug_pagealloc also enabled, they are + * also checked when pcp lists are refilled from the free lists. + */ +static inline bool check_pcp_refill(struct page *page) { - return false; + if (debug_pagealloc_enabled_static()) + return check_new_page(page); + else + return false; } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { return check_new_page(page); } #else -static bool check_pcp_refill(struct page *page) +/* + * With DEBUG_VM disabled, free order-0 pages are checked for expected state + * when pcp lists are being refilled from the free lists. With debug_pagealloc + * enabled, they are also checked when being allocated from the pcp lists. + */ +static inline bool check_pcp_refill(struct page *page) { return check_new_page(page); } -static bool check_new_pcp(struct page *page) +static inline bool check_new_pcp(struct page *page) { - return false; + if (debug_pagealloc_enabled_static()) + return check_new_page(page); + else + return false; } #endif /* CONFIG_DEBUG_VM */ @@ -2014,9 +2369,31 @@ set_page_refcounted(page); arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - kasan_alloc_pages(page, order); - kernel_poison_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. + */ + kernel_unpoison_pages(page, 1 << order); + + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. 
+ */ + if (kasan_has_integrated_init()) { + kasan_alloc_pages(page, order, gfp_flags); + } else { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + + kasan_unpoison_pages(page, order, init); + if (init) + kernel_init_free_pages(page, 1 << order, + gfp_flags & __GFP_ZEROTAGS); + } + set_page_owner(page, order, gfp_flags); } @@ -2024,9 +2401,6 @@ unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); - - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2041,6 +2415,7 @@ set_page_pfmemalloc(page); else clear_page_pfmemalloc(page); + trace_android_vh_test_clear_look_around_ref(page); } /* @@ -2058,14 +2433,11 @@ /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - page = list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + page = get_page_from_free_area(area, migratetype); if (!page) continue; - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; - expand(zone, page, order, current_order, area, migratetype); + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); return page; } @@ -2078,10 +2450,10 @@ * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][4] = { +static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif @@ -2102,7 +2474,7 @@ #endif /* - * Move the free pages in a range to the free lists of the requested type. + * Move the free pages in a range to the freelist tail of the requested type. * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ @@ -2114,30 +2486,11 @@ unsigned int order; int pages_moved = 0; -#ifndef CONFIG_HOLES_IN_ZONE - /* - * page_zone is not safe to call in this context when - * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant - * anyway as we check zone boundaries in move_freepages_block(). 
- * Remove at a later date when no bug reports exist related to - * grouping pages by mobility - */ - VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && - pfn_valid(page_to_pfn(end_page)) && - page_zone(start_page) != page_zone(end_page)); -#endif - - if (num_movable) - *num_movable = 0; - for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } - - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { /* @@ -2153,9 +2506,12 @@ continue; } - order = page_order(page); - list_move(&page->lru, - &zone->free_area[order].free_list[migratetype]); + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + order = buddy_order(page); + move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2168,6 +2524,9 @@ { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; + + if (num_movable) + *num_movable = 0; start_pfn = page_to_pfn(page); start_pfn = start_pfn & ~(pageblock_nr_pages-1); @@ -2229,6 +2588,43 @@ return false; } +static inline bool boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return false; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return false; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return false; + + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); + + return true; +} + /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this @@ -2238,10 +2634,9 @@ * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type, bool whole_block) + unsigned int alloc_flags, int start_type, bool whole_block) { - unsigned int current_order = page_order(page); - struct free_area *area; + unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; int old_block_type; @@ -2259,6 +2654,14 @@ change_pageblock_range(page, current_order, start_type); goto single_page; } + + /* + * Boost watermarks to increase reclaim pressure to reduce the + * likelihood of future fallbacks. Wake kswapd now as the node + * may be balanced overall and kswapd will not wake naturally. 
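+	 *
+	 * Worked example (numbers illustrative only): with
+	 * watermark_boost_factor = 15000 and a high watermark of 10000 pages,
+	 * boost_watermark() above caps zone->watermark_boost at
+	 * mult_frac(10000, 15000, 10000) = 15000 pages and raises it by one
+	 * pageblock per fallback until that cap is hit.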
+ */ + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ if (!whole_block) @@ -2303,8 +2706,7 @@ return; single_page: - area = &zone->free_area[current_order]; - list_move(&page->lru, &area->free_list[start_type]); + move_to_free_list(page, zone, current_order, start_type); } /* @@ -2328,7 +2730,7 @@ if (fallback_mt == MIGRATE_TYPES) break; - if (list_empty(&area->free_list[fallback_mt])) + if (free_area_empty(area, fallback_mt)) continue; if (can_steal_fallback(order, migratetype)) @@ -2358,7 +2760,7 @@ * Limit the number reserved to 1 pageblock or roughly 1% of a zone. * Check is race-prone but harmless. */ - max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) return; @@ -2400,8 +2802,9 @@ struct page *page; int order; bool ret; + bool skip_unreserve_highatomic = false; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -2411,13 +2814,16 @@ pageblock_nr_pages) continue; + trace_android_vh_unreserve_highatomic_bypass(force, zone, + &skip_unreserve_highatomic); + if (skip_unreserve_highatomic) + continue; + spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - page = list_first_entry_or_null( - &area->free_list[MIGRATE_HIGHATOMIC], - struct page, lru); + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; @@ -2475,20 +2881,30 @@ * condition simpler. */ static __always_inline bool -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) { struct free_area *area; int current_order; + int min_order = order; struct page *page; int fallback_mt; bool can_steal; + + /* + * Do not steal pages from freelists belonging to other pageblocks + * i.e. orders < pageblock_order. If there are no local zones free, + * the zonelists will be reiterated without ALLOC_NOFRAGMENT. + */ + if (alloc_flags & ALLOC_NOFRAGMENT) + min_order = pageblock_order; /* * Find the largest available free page in the other list. This roughly * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. */ - for (current_order = MAX_ORDER - 1; current_order >= order; + for (current_order = MAX_ORDER - 1; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2530,10 +2946,10 @@ VM_BUG_ON(current_order == MAX_ORDER); do_steal: - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + page = get_page_from_free_area(area, fallback_mt); - steal_suitable_fallback(zone, page, start_migratetype, can_steal); + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -2547,14 +2963,16 @@ * Call me with the zone->lock already held. 
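 * In short (summarising the code below): __rmqueue_smallest() is tried
 * first for the requested migratetype; only if that fails and
 * __rmqueue_fallback() manages to steal pages from another migratetype
 * is the smallest-page lookup retried.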
*/ static __always_inline struct page * -__rmqueue(struct zone *zone, unsigned int order, int migratetype) +__rmqueue(struct zone *zone, unsigned int order, int migratetype, + unsigned int alloc_flags) { struct page *page; retry: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype)) + if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype, + alloc_flags)) goto retry; trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -2562,18 +2980,18 @@ } #ifdef CONFIG_CMA -static struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +static struct page *__rmqueue_cma(struct zone *zone, unsigned int order, + int migratetype, + unsigned int alloc_flags) { - struct page *page = 0; - - if (IS_ENABLED(CONFIG_CMA)) - if (!zone->cma_alloc) - page = __rmqueue_cma_fallback(zone, order); + struct page *page = __rmqueue_cma_fallback(zone, order); trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); return page; } #else -static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order, + int migratetype, + unsigned int alloc_flags) { return NULL; } @@ -2586,7 +3004,7 @@ */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, unsigned int alloc_flags) { int i, alloced = 0; @@ -2594,15 +3012,11 @@ for (i = 0; i < count; ++i) { struct page *page; - /* - * If migrate type CMA is being requested only try to - * satisfy the request with CMA pages to try and increase - * CMA utlization. - */ if (is_migrate_cma(migratetype)) - page = __rmqueue_cma(zone, order); + page = __rmqueue_cma(zone, order, migratetype, + alloc_flags); else - page = __rmqueue(zone, order, migratetype); + page = __rmqueue(zone, order, migratetype, alloc_flags); if (unlikely(page == NULL)) break; @@ -2645,14 +3059,18 @@ */ static struct list_head *get_populated_pcp_list(struct zone *zone, unsigned int order, struct per_cpu_pages *pcp, - int migratetype) + int migratetype, unsigned int alloc_flags) { struct list_head *list = &pcp->lists[migratetype]; if (list_empty(list)) { + trace_android_vh_rmqueue_bulk_bypass(order, pcp, migratetype, list); + if (!list_empty(list)) + return list; + pcp->count += rmqueue_bulk(zone, order, pcp->batch, list, - migratetype); + migratetype, alloc_flags); if (list_empty(list)) list = NULL; @@ -2739,6 +3157,10 @@ static void drain_local_pages_wq(struct work_struct *work) { + struct pcpu_drain *drain; + + drain = container_of(work, struct pcpu_drain, work); + /* * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from @@ -2747,7 +3169,7 @@ * a different one. 
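 *
 * Handoff sketch (mirrors the queueing code in the caller, shown further
 * below): the target zone travels inside the per-cpu work item and is
 * recovered here via container_of():
 *
 *	drain = per_cpu_ptr(&pcpu_drain, cpu);
 *	drain->zone = zone;	(NULL drains every zone on that CPU)
 *	INIT_WORK(&drain->work, drain_local_pages_wq);
 *	queue_work_on(cpu, mm_percpu_wq, &drain->work);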
*/ preempt_disable(); - drain_local_pages(NULL); + drain_local_pages(drain->zone); preempt_enable(); } @@ -2818,12 +3240,14 @@ } for_each_cpu(cpu, &cpus_with_pcps) { - struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); - INIT_WORK(work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, work); + struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); + + drain->zone = zone; + INIT_WORK(&drain->work, drain_local_pages_wq); + queue_work_on(cpu, mm_percpu_wq, &drain->work); } for_each_cpu(cpu, &cpus_with_pcps) - flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -2900,6 +3324,7 @@ struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int migratetype; + bool pcp_skip_cma_pages = false; migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); @@ -2912,8 +3337,12 @@ * excessively into the page allocator */ if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype); + trace_android_vh_pcplist_add_cma_pages_bypass(migratetype, + &pcp_skip_cma_pages); + if (unlikely(is_migrate_isolate(migratetype)) || + pcp_skip_cma_pages) { + free_one_page(zone, page, pfn, 0, migratetype, + FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; @@ -2935,8 +3364,15 @@ { unsigned long flags; unsigned long pfn = page_to_pfn(page); + int migratetype; + bool skip_free_unref_page = false; if (!free_unref_page_prepare(page, pfn)) + return; + + migratetype = get_pfnblock_migratetype(page, pfn); + trace_android_vh_free_unref_page_bypass(page, 0, migratetype, &skip_free_unref_page); + if (skip_free_unref_page) return; local_irq_save(flags); @@ -2999,7 +3435,8 @@ for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -3021,7 +3458,7 @@ * watermark, because we already know our high-order page * exists. */ - watermark = min_wmark_pages(zone) + (1UL << order); + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; @@ -3029,9 +3466,8 @@ } /* Remove page from free list */ - list_del(&page->lru); - zone->free_area[order].nr_free--; - rmv_page_order(page); + + del_page_from_free_list(page, zone, order); /* * Set the pageblock if the isolated page is at least half of a @@ -3050,6 +3486,27 @@ return 1UL << order; +} + +/** + * __putback_isolated_page - Return a now-isolated page back where we got it + * @page: Page that was isolated + * @order: Order of the isolated page + * @mt: The page's pageblock's migratetype + * + * This function is meant to return a page pulled from the free lists via + * __isolate_free_page back to the free lists they were pulled from. + */ +void __putback_isolated_page(struct page *page, unsigned int order, int mt) +{ + struct zone *zone = page_zone(page); + + /* zone lock should be held when this function is called */ + lockdep_assert_held(&zone->lock); + + /* Return isolated page to tail of freelist. 
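+	 *
+	 * Illustrative pairing, not from the original comment: a caller such
+	 * as the free page reporting code pulls the page off the freelist
+	 * with __isolate_free_page() under zone->lock and later hands it
+	 * back here together with the migratetype it recorded beforehand.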
*/ + __free_one_page(page, page_to_pfn(page), zone, order, mt, + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); } /* @@ -3081,6 +3538,7 @@ /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + unsigned int alloc_flags, struct per_cpu_pages *pcp, gfp_t gfp_flags) { @@ -3090,9 +3548,9 @@ do { /* First try to get CMA pages */ if (migratetype == MIGRATE_MOVABLE && - gfp_flags & __GFP_CMA) { + alloc_flags & ALLOC_CMA) { list = get_populated_pcp_list(zone, 0, pcp, - get_cma_migrate_type()); + get_cma_migrate_type(), alloc_flags); } if (list == NULL) { @@ -3101,7 +3559,7 @@ * free CMA pages. */ list = get_populated_pcp_list(zone, 0, pcp, - migratetype); + migratetype, alloc_flags); if (unlikely(list == NULL) || unlikely(list_empty(list))) return NULL; @@ -3117,8 +3575,8 @@ /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + struct zone *zone, gfp_t gfp_flags, + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct page *page; @@ -3126,10 +3584,10 @@ local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; - page = __rmqueue_pcplist(zone, migratetype, pcp, + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, gfp_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } local_irq_restore(flags); @@ -3149,8 +3607,8 @@ struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype); + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + migratetype, alloc_flags); goto out; } @@ -3163,21 +3621,27 @@ do { page = NULL; - - if (alloc_flags & ALLOC_HARDER) { + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. 
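+	 *
+	 * Hence the "order > 0" guard on the MIGRATE_HIGHATOMIC attempt
+	 * below: an order-0 request that reaches the buddy path skips the
+	 * high-atomic reserve and falls through to the __rmqueue_cma() /
+	 * __rmqueue() calls for its own migratetype.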
+ */ + if (order > 0 && alloc_flags & ALLOC_HARDER) { page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); } - - if (!page && migratetype == MIGRATE_MOVABLE && - gfp_flags & __GFP_CMA) - page = __rmqueue_cma(zone, order); - - if (!page) - page = __rmqueue(zone, order, migratetype); + if (!page) { + if (migratetype == MIGRATE_MOVABLE && + alloc_flags & ALLOC_CMA) + page = __rmqueue_cma(zone, order, migratetype, + alloc_flags); + if (!page) + page = __rmqueue(zone, order, migratetype, + alloc_flags); + } } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); if (!page) goto failed; @@ -3186,9 +3650,17 @@ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); + trace_android_vh_rmqueue(preferred_zone, zone, order, + gfp_flags, alloc_flags, migratetype); local_irq_restore(flags); out: + /* Separate test+clear to avoid unnecessary atomics */ + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + } + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; @@ -3218,7 +3690,7 @@ } __setup("fail_page_alloc=", setup_fail_page_alloc); -static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { if (order < fail_page_alloc.min_order) return false; @@ -3242,24 +3714,14 @@ dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim)) - goto fail; - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem)) - goto fail; - if (!debugfs_create_u32("min-order", mode, dir, - &fail_page_alloc.min_order)) - goto fail; + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; -fail: - debugfs_remove_recursive(dir); - - return -ENOMEM; } late_initcall(fail_page_alloc_debugfs); @@ -3268,12 +3730,41 @@ #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ + +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return __should_fail_alloc_page(gfp_mask, order); +} +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); + +static inline long __zone_watermark_unusable_free(struct zone *z, + unsigned int order, unsigned int alloc_flags) +{ + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); + long unusable_free = (1 << order) - 1; + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. 
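+	 *
+	 * Worked example (illustrative): for an order-3 request without
+	 * ALLOC_HARDER or ALLOC_CMA, unusable_free is (1 << 3) - 1 = 7 pages
+	 * plus the high-atomic reserve plus, on CONFIG_CMA kernels, all free
+	 * CMA pages; the watermark check then runs against free_pages minus
+	 * that sum.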
+ */ + if (likely(!alloc_harder)) + unusable_free += z->nr_reserved_highatomic; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + return unusable_free; +} /* * Return true if free base pages are above 'mark'. For high-order checks it @@ -3282,7 +3773,7 @@ * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3290,19 +3781,12 @@ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ - free_pages -= (1 << order) - 1; + free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); if (alloc_flags & ALLOC_HIGH) min -= min / 2; - /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. - */ - if (likely(!alloc_harder)) { - free_pages -= z->nr_reserved_highatomic; - } else { + if (unlikely(alloc_harder)) { /* * OOM victims can try even harder than normal ALLOC_HARDER * users on the grounds that it's definitely going to be in @@ -3315,19 +3799,12 @@ min -= min / 4; } - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. */ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3351,65 +3828,83 @@ if (mt == MIGRATE_CMA) continue; #endif - if (!list_empty(&area->free_list[mt])) + if (!free_area_empty(area, mt)) return true; } #ifdef CONFIG_CMA if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + !free_area_empty(area, MIGRATE_CMA)) { return true; } #endif - if (alloc_harder && - !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) return true; } return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } +EXPORT_SYMBOL_GPL(zone_watermark_ok); static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags, gfp_t gfp_mask) { - long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; + long free_pages; -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif + free_pages = zone_page_state(z, NR_FREE_PAGES); /* * Fast check for order-0 only. If this fails then the reserves - * need to be calculated. 
There is a corner case where the check - * passes but only the high-order atomic reserve are free. If - * the caller is !atomic then it'll uselessly search the free - * list. That corner case is then slower but it is harmless. + * need to be calculated. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) - return true; + if (!order) { + long usable_free; + long reserved; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, - free_pages); + usable_free = free_pages; + reserved = __zone_watermark_unusable_free(z, 0, alloc_flags); + + /* reserved may over estimate high-atomic reserves. */ + usable_free -= min(usable_free, reserved); + if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) + return true; + } + + if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, + free_pages)) + return true; + /* + * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * when checking the min watermark. The min watermark is the + * point where boosting is ignored so that kswapd is woken up + * when below the low watermark. + */ + if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { + mark = z->_watermark[WMARK_MIN]; + return __zone_watermark_ok(z, order, mark, highest_zoneidx, + alloc_flags, free_pages); + } + + return false; } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } EXPORT_SYMBOL_GPL(zone_watermark_ok_safe); @@ -3418,7 +3913,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= - RECLAIM_DISTANCE; + node_reclaim_distance; } #else /* CONFIG_NUMA */ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) @@ -3428,6 +3923,61 @@ #endif /* CONFIG_NUMA */ /* + * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid + * fragmentation is subtle. If the preferred zone was HIGHMEM then + * premature use of a lower zone may cause lowmem pressure problems that + * are worse than fragmentation. If the next zone is ZONE_DMA then it is + * probably too small. It only makes sense to spread allocations to avoid + * fragmentation between the Normal and DMA32 zones. + */ +static inline unsigned int +alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) +{ + unsigned int alloc_flags; + + /* + * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save a branch. + */ + alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); + +#ifdef CONFIG_ZONE_DMA32 + if (!zone) + return alloc_flags; + + if (zone_idx(zone) != ZONE_NORMAL) + return alloc_flags; + + /* + * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and + * the pointer is within zone->zone_pgdat->node_zones[]. Also assume + * on UMA that if Normal is populated then so is DMA32. 
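+	 *
+	 * Put differently: because ZONE_DMA32 sits immediately before
+	 * ZONE_NORMAL (enforced by the BUILD_BUG_ON below), "--zone" steps
+	 * from this node's Normal zone to its DMA32 zone. On NUMA the flag
+	 * is only set when that DMA32 zone is populated; on UMA the
+	 * assumption above stands in for the check.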
+ */ + BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); + if (nr_online_nodes > 1 && !populated_zone(--zone)) + return alloc_flags; + + alloc_flags |= ALLOC_NOFRAGMENT; +#endif /* CONFIG_ZONE_DMA32 */ + return alloc_flags; +} + +static inline unsigned int current_alloc_flags(gfp_t gfp_mask, + unsigned int alloc_flags) +{ +#ifdef CONFIG_CMA + unsigned int pflags = current->flags; + + if (!(pflags & PF_MEMALLOC_NOCMA) && + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE && + gfp_mask & __GFP_CMA) + alloc_flags |= ALLOC_CMA; + +#endif + return alloc_flags; +} + +/* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ @@ -3435,16 +3985,20 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z = ac->preferred_zoneref; + struct zoneref *z; struct zone *zone; struct pglist_data *last_pgdat_dirty_limit = NULL; + bool no_fallback; +retry: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; + z = ac->preferred_zoneref; + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, + ac->nodemask) { struct page *page; unsigned long mark; @@ -3481,9 +4035,26 @@ } } - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (no_fallback && nr_online_nodes > 1 && + zone != ac->preferred_zoneref->zone) { + int local_nid; + + /* + * If moving to a remote node, retry but allow + * fragmenting fallbacks. Locality is more important + * than fragmentation avoidance. + */ + local_nid = zone_to_nid(ac->preferred_zoneref->zone); + if (zone_to_nid(zone) != local_nid) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags, + gfp_mask)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3516,7 +4087,7 @@ default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3548,30 +4119,21 @@ } } + /* + * It's possible on a UMA machine to get through all zones that are + * fragmented. If avoiding fragmentation, reset and try again. + */ + if (no_fallback) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + return NULL; -} - -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. 
- */ -static inline bool should_suppress_show_mem(void) -{ - bool ret = false; - -#if NODES_SHIFT > 8 - ret = in_interrupt(); -#endif - return ret; } static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - - if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) - return; /* * This documents exceptions given to allocations in certain @@ -3592,22 +4154,23 @@ { struct va_format vaf; va_list args; - static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", current->comm, &vaf, gfp_mask, &gfp_mask, nodemask_pr_args(nodemask)); va_end(args); cpuset_print_current_mems_allowed(); - + pr_cont("\n"); dump_stack(); warn_alloc_show_mem(gfp_mask, nodemask); } @@ -3681,11 +4244,13 @@ * success so it is time to admit defeat. We will skip the OOM killer * because it is very likely that the caller has a more reasonable * fallback than shooting a random task. + * + * The OOM killer may not free memory on a specific node. */ - if (gfp_mask & __GFP_RETRY_MAYFAIL) + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -3698,10 +4263,6 @@ * out_of_memory). Once filesystems are ready to handle allocation * failures more gracefully we should just bail out here. */ - - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { @@ -3733,7 +4294,7 @@ unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, enum compact_result *compact_result) { - struct page *page; + struct page *page = NULL; unsigned long pflags; unsigned int noreclaim_flag; @@ -3744,13 +4305,10 @@ noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - prio); + prio, &page); memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - - if (*compact_result <= COMPACT_INACTIVE) - return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -3758,7 +4316,13 @@ */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + /* Prep a captured page if available */ + if (page) + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* Try get a page from the freelist if available */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3807,14 +4371,22 @@ goto check_priority; /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. 
+ * compaction was skipped because there are not enough order-0 pages + * to work with, so we retry only if it looks like reclaim can help. */ - if (compaction_withdrawn(compact_result)) { + if (compaction_needs_reclaim(compact_result)) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But the next retry should use a higher priority if allowed, so + * we don't just keep bailing out endlessly. + */ + if (compaction_withdrawn(compact_result)) { + goto check_priority; } /* @@ -3877,10 +4449,10 @@ * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -3938,33 +4510,50 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. + */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ -static int +static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { - struct reclaim_state reclaim_state; - int progress; unsigned int noreclaim_flag; - unsigned long pflags; + unsigned long progress; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); - reclaim_state.reclaimed_slab = 0; - current->reclaim_state = &reclaim_state; progress = try_to_free_pages(ac->zonelist, order, gfp_mask, ac->nodemask); - current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); - psi_memstall_leave(&pflags); cond_resched(); @@ -3978,11 +4567,14 @@ unsigned long *did_some_progress) { struct page *page = NULL; + unsigned long pflags; bool drained = false; + bool skip_pcp_drain = false; + psi_memstall_enter(&pflags); *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) - return NULL; + goto out; retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); @@ -3990,14 +4582,19 @@ /* * If an allocation failed after direct reclaim, it could be because * pages are pinned on the per-cpu lists or in high alloc reserves. 
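 * (unreserve_highatomic_pageblock() and drain_all_pages() below release
 * exactly those two, which is why the freelist lookup is retried once,
 * and only once, after draining.)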
- * Shrink them them and try again + * Shrink them and try again */ if (!page && !drained) { unreserve_highatomic_pageblock(ac, false); - drain_all_pages(NULL); + trace_android_vh_drain_all_pages_bypass(gfp_mask, order, + alloc_flags, ac->migratetype, *did_some_progress, &skip_pcp_drain); + if (!skip_pcp_drain) + drain_all_pages(NULL); drained = true; goto retry; } +out: + psi_memstall_leave(&pflags); return page; } @@ -4008,12 +4605,12 @@ struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4023,8 +4620,13 @@ { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + /* + * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save two branches. + */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller @@ -4032,7 +4634,8 @@ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); if (gfp_mask & __GFP_ATOMIC) { /* @@ -4049,10 +4652,8 @@ } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif + alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + return alloc_flags; } @@ -4115,6 +4716,7 @@ { struct zone *zone; struct zoneref *z; + bool ret = false; /* * Costly allocations might have made a progress but this doesn't mean @@ -4141,8 +4743,8 @@ * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4156,7 +4758,7 @@ * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4178,25 +4780,24 @@ } } - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that - * a particular WQ is congested if the worker thread is - * looping without ever sleeping. Therefore we have to - * do a short sleep here rather than calling - * cond_resched(). 
- */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - - return true; + ret = true; + goto out; } } - return false; +out: + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return ret; } static inline bool @@ -4246,8 +4847,12 @@ int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; + unsigned long vh_record; + bool should_alloc_retry = false; + trace_android_vh_alloc_pages_slowpath_begin(gfp_mask, order, &vh_record); /* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. @@ -4256,11 +4861,12 @@ (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -4276,11 +4882,11 @@ * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); /* @@ -4313,18 +4919,28 @@ /* * Checks for costly allocations with __GFP_NORETRY, which - * includes THP page fault allocations + * includes some THP page fault allocations */ if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* - * If compaction is deferred for high-order allocations, - * it is because sync compaction recently failed. If - * this is the case and the caller requested a THP - * allocation, we do not want to heavily disrupt the - * system, so we fail the allocation instead of entering - * direct reclaim. + * If allocating entire pageblock(s) and compaction + * failed because all zones are below low watermarks + * or is prohibited because it recently failed at this + * order, fail immediately unless the allocator has + * requested compaction and reclaim retry. + * + * Reclaim is + * - potentially very expensive because zones are far + * below their low watermarks or this is part of very + * bursty high order allocations, + * - not guaranteed to help because isolate_freepages() + * may not iterate over freed pages as part of its + * linear scan, and + * - unlikely to make entire pageblocks free on its + * own. 
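+	 *
+	 * In short (summarising the check below): such requests fail
+	 * outright when compaction was skipped or deferred instead of
+	 * falling through to direct reclaim that is unlikely to help.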
*/ - if (compact_result == COMPACT_DEFERRED) + if (compact_result == COMPACT_SKIPPED || + compact_result == COMPACT_DEFERRED) goto nopage; /* @@ -4338,12 +4954,12 @@ retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = reserve_flags; + alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4353,7 +4969,7 @@ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4368,6 +4984,18 @@ /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; + + trace_android_vh_alloc_pages_reclaim_bypass(gfp_mask, order, + alloc_flags, ac->migratetype, &page); + + if (page) + goto got_pg; + + trace_android_vh_should_alloc_pages_retry(gfp_mask, order, + &alloc_flags, ac->migratetype, ac->preferred_zoneref->zone, + &page, &should_alloc_retry); + if (should_alloc_retry) + goto retry; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -4409,9 +5037,13 @@ goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -4420,7 +5052,7 @@ /* Avoid allocations with no watermarks from looping endlessly */ if (tsk_is_oom_victim(current) && - (alloc_flags == ALLOC_OOM || + (alloc_flags & ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; @@ -4431,9 +5063,13 @@ } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -4476,9 +5112,15 @@ goto retry; } fail: + trace_android_vh_alloc_pages_failure_bypass(gfp_mask, order, + alloc_flags, ac->migratetype, &page); + if (page) + goto got_pg; + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: + trace_android_vh_alloc_pages_slowpath_end(gfp_mask, order, vh_record); return page; } @@ -4487,14 +5129,18 @@ struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; - if (!ac->nodemask) + /* + * When we are in the interrupt context, it is irrelevant + * to the current task context. 
It means that any node ok. + */ + if (!in_interrupt() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; @@ -4508,15 +5154,8 @@ if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; + *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); - return true; -} - -/* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) -{ /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4526,7 +5165,9 @@ * may get reset for allocations that ignore memory policies. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); + + return true; } /* @@ -4555,7 +5196,11 @@ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, &ac); + /* + * Forbid the first pass from falling back to types that fragment + * memory until all local zones are considered. + */ + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -4575,14 +5220,13 @@ * Restore the original nodemask if it was potentially replaced with * &cpuset_current_mems_allowed to optimize the fast-path attempt. */ - if (unlikely(ac.nodemask != nodemask)) - ac.nodemask = nodemask; + ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } @@ -4620,13 +5264,20 @@ if (order == 0) /* Via pcp? */ free_unref_page(page); else - __free_pages_ok(page, order); + __free_pages_ok(page, order, FPI_NONE); } void __free_pages(struct page *page, unsigned int order) { + /* get PageHead before we drop reference */ + int head = PageHead(page); + + trace_android_vh_free_pages(page, order); if (put_page_testzero(page)) free_the_page(page, order); + else if (!head) + while (order-- > 0) + free_the_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -4731,6 +5382,18 @@ /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } } nc->pagecnt_bias--; @@ -4771,7 +5434,7 @@ /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only @@ -4780,11 +5443,16 @@ * This function is also limited by MAX_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). 
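+ *
+ * A minimal usage sketch (illustrative only, not from this patch):
+ *
+ *	void *buf = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL);
+ *
+ *	if (buf) {
+ *		memset(buf, 0, 3 * PAGE_SIZE);
+ *		free_pages_exact(buf, 3 * PAGE_SIZE);
+ *	}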
+ * + * Return: pointer to the allocated area or %NULL in case of error. */ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); unsigned long addr; + + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); @@ -4796,15 +5464,22 @@ * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. + * + * Return: pointer to the allocated area or %NULL in case of error. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); - struct page *p = alloc_pages_node(nid, gfp_mask, order); + struct page *p; + + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + + p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -4833,11 +5508,13 @@ * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * - * nr_free_zone_pages() counts the number of counts pages which are beyond the + * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * * nr_free_zone_pages = managed_pages - high_pages + * + * Return: number of pages beyond high watermark. */ static unsigned long nr_free_zone_pages(int offset) { @@ -4850,7 +5527,7 @@ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->managed_pages; + unsigned long size = zone_managed_pages(zone); unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -4864,23 +5541,15 @@ * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. + * + * Return: number of pages beyond high watermark within ZONE_DMA and + * ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); - -/** - * nr_free_pagecache_pages - count number of pages beyond high watermark - * - * nr_free_pagecache_pages() counts the number of pages which are beyond the - * high watermark within all zones. - */ -unsigned long nr_free_pagecache_pages(void) -{ - return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); -} static inline void show_node(struct zone *zone) { @@ -4902,7 +5571,7 @@ pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) - wmark_low += zone->watermark[WMARK_LOW]; + wmark_low += low_wmark_pages(zone); /* * Estimate the amount of memory available for userspace allocations, @@ -4924,8 +5593,8 @@ * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. 
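 *
 * For example (figures purely illustrative): with roughly 800 MiB of
 * reclaimable slab/misc pages and a summed low watermark of 100 MiB,
 * about 700 MiB is added here, i.e. reclaimable - min(reclaimable / 2,
 * wmark_low).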
*/ - reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); available += reclaimable - min(reclaimable / 2, wmark_low); if (available < 0) @@ -4936,11 +5605,11 @@ void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -4957,7 +5626,7 @@ pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += pgdat->node_zones[zone_type].managed_pages; + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); @@ -4966,7 +5635,7 @@ struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { - managed_highpages += zone->managed_pages; + managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } @@ -5055,7 +5724,7 @@ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5068,9 +5737,8 @@ global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), - global_node_page_state(NR_SLAB_RECLAIMABLE), - global_node_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_zone_page_state(NR_PAGETABLE), @@ -5079,6 +5747,7 @@ free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); + trace_android_vh_show_mapcount_pages(NULL); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; @@ -5101,7 +5770,10 @@ " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5123,7 +5795,10 @@ K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
"yes" : "no"); } @@ -5145,6 +5820,7 @@ " min:%lukB" " low:%lukB" " high:%lukB" + " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -5154,10 +5830,6 @@ " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " kernel_stack:%lukB" -#ifdef CONFIG_SHADOW_CALL_STACK - " shadow_call_stack:%lukB" -#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5169,6 +5841,7 @@ K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -5176,12 +5849,8 @@ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), - K(zone->managed_pages), + K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - zone_page_state(zone, NR_KERNEL_STACK_KB), -#ifdef CONFIG_SHADOW_CALL_STACK - zone_page_state(zone, NR_KERNEL_SCS_BYTES) / 1024, -#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), @@ -5213,7 +5882,7 @@ types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { - if (!list_empty(&area->free_list[type])) + if (!free_area_empty(area, type)) types[order] |= 1 << type; } } @@ -5254,7 +5923,7 @@ do { zone_type--; zone = pgdat->node_zones + zone_type; - if (managed_zone(zone)) { + if (populated_zone(zone)) { zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } @@ -5280,36 +5949,17 @@ return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -5328,14 +5978,14 @@ * from each node to each node in the system), and should also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. - * It returns -1 if no node is found. + * + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 
*/ static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; int best_node = NUMA_NO_NODE; - const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -5356,8 +6006,7 @@ val += (n < node); /* Give preference to headless and unused nodes */ - tmp = cpumask_of_node(n); - if (!cpumask_empty(tmp)) + if (!cpumask_empty(cpumask_of_node(n))) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ @@ -5428,14 +6077,13 @@ { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { @@ -5542,9 +6190,22 @@ int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); + unsigned long flags; - spin_lock(&lock); + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5577,7 +6238,9 @@ #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init @@ -5615,13 +6278,16 @@ */ void __ref build_all_zonelists(pg_data_t *pgdat) { + unsigned long vm_total_pages; + if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { __build_all_zonelists(pgdat); /* cpuset refresh routine should be here */ } - vm_total_pages = nr_free_pagecache_pages(); + /* Get the number of free pages beyond high watermark in all zones. */ + vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); /* * Disable grouping by mobility if the number of pages in the * system is too low to allow the mechanism to work. It would be @@ -5634,7 +6300,7 @@ else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, page_group_by_mobility_disabled ? 
"off" : "on", vm_total_pages); @@ -5643,81 +6309,148 @@ #endif } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_mem_region(r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } + return false; +} + /* * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is + * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum meminit_context context, - struct vmem_altmap *altmap) + unsigned long start_pfn, unsigned long zone_end_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) { - unsigned long end_pfn = start_pfn + size; - pg_data_t *pgdat = NODE_DATA(nid); - unsigned long pfn; - unsigned long nr_initialised = 0; + unsigned long pfn, end_pfn = start_pfn + size; struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - struct memblock_region *r = NULL, *tmp; -#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; + +#ifdef CONFIG_ZONE_DEVICE + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory. We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. + */ + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif #ifdef CONFIG_ROCKCHIP_THUNDER_BOOT /* Zero all page struct in advance */ memset(pfn_to_page(start_pfn), 0, sizeof(struct page) * size); #endif - /* - * Honor reservation requested by the driver for this ZONE_DEVICE - * memory - */ - if (altmap && start_pfn == altmap->base_pfn) - start_pfn += altmap->reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; ) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMINIT_EARLY) - goto not_early; - - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) - break; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Check given memblock attribute by firmware which can affect - * kernel memory layout. If zone==ZONE_MOVABLE but memory is - * mirrored, it's an overlapped memmap init. skip it. 
- */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); + if (context == MEMINIT_EARLY) { + if (overlap_memmap_init(zone, &pfn)) continue; - } + if (defer_init(nid, pfn, zone_end_pfn)) + break; } -#endif -not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid, false); if (context == MEMINIT_HOTPLUG) - SetPageReserved(page); + __SetPageReserved(page); + + /* + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, migratetype); + cond_resched(); + } + pfn++; + } +} + +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + nr_pages; + struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + return; + + /* + * The call to memmap_init should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. + */ + if (altmap) { + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + nr_pages = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone_idx, nid, true); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; /* * Mark the block movable so that blocks are reserved for @@ -5726,21 +6459,20 @@ * the address space during boot when many long-lived * kernel allocations are made. * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. 
- * * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section + * because this is done early in section_activate() */ - if (!(pfn & (pageblock_nr_pages - 1))) { + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } } + + pr_info("%s initialised %lu pages in %ums\n", __func__, + nr_pages, jiffies_to_msecs(jiffies - start)); } +#endif static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; @@ -5750,11 +6482,118 @@ } } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), \ - MEMINIT_EARLY, NULL) +/* + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init_zone_range(). + * + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - physical memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone and node links point to zone and node that span the page if the + * hole is in the middle of a zone + * - zone and node links point to adjacent zone/node if the hole falls on + * the zone boundary; the pages in such holes will be prepended to the + * zone/node above the hole except for the trailing pages in the last + * section that will be appended to the zone/node below. 
+ */ +static void __init init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + __init_single_page(pfn_to_page(pfn), pfn, zone, node, true); + __SetPageReserved(pfn_to_page(pfn)); + pgcnt++; + } + + if (pgcnt) + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + node, zone_names[zone], pgcnt); +} + +static void __init memmap_init_zone_range(struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *hole_pfn) +{ + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + + if (start_pfn >= end_pfn) + return; + + memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn, + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (*hole_pfn < start_pfn) + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + + *hole_pfn = end_pfn; +} + +void __init __weak memmap_init(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long hole_pfn = 0; + int i, j, zone_id, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + struct pglist_data *node = NODE_DATA(nid); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = node->node_zones + j; + + if (!populated_zone(zone)) + continue; + + memmap_init_zone_range(zone, start_pfn, end_pfn, + &hole_pfn); + zone_id = j; + } + } + +#ifdef CONFIG_SPARSEMEM + /* + * Initialize the memory map for hole in the range [memory_end, + * section_end]. + * Append the pages in this hole to the highest zone in the last + * node. + * The call to init_unavailable_range() is outside the ifdef to + * silence the compiler warining about zone_id set but not used; + * for FLATMEM it is a nop anyway + */ + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); + if (hole_pfn < end_pfn) #endif + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); +} + +/* A stub for backwards compatibility with custom implementatin on IA-64 */ +void __meminit __weak arch_memmap_init(unsigned long size, int nid, + unsigned long zone, + unsigned long range_start_pfn) +{ +} static int zone_batchsize(struct zone *zone) { @@ -5765,7 +6604,7 @@ * The per-cpu-pages pools are set to around 1000th of the * size of the zone. */ - batch = zone->managed_pages / 1024; + batch = zone_managed_pages(zone) / 1024; /* But no more than a meg. */ if (batch * PAGE_SIZE > 1024 * 1024) batch = (1024 * 1024) / PAGE_SIZE; @@ -5812,7 +6651,7 @@ * locking. * * Any new users of pcp->batch and pcp->high should ensure they can cope with - * those fields changing asynchronously (acording the the above rule). + * those fields changing asynchronously (acording to the above rule). 
* * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters @@ -5821,6 +6660,7 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, unsigned long batch) { + trace_android_vh_pageset_update(&high, &batch); /* start with a fail safe value for batch */ pcp->batch = 1; smp_wmb(); @@ -5846,7 +6686,6 @@ memset(p, 0, sizeof(*p)); pcp = &p->pcp; - pcp->count = 0; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -5876,7 +6715,7 @@ { if (percpu_pagelist_fraction) pageset_set_high(pcp, - (zone->managed_pages / + (zone_managed_pages(zone) / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); @@ -5906,9 +6745,24 @@ { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); + +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = @@ -5952,73 +6806,6 @@ zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != -1) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - -/** - * sparse_memory_present_with_active_regions - Call memory_present for each active range - * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. - * - * If an architecture guarantees that all ranges registered contain no holes and may - * be freed, this function may be used instead of calling memory_present() manually. 
- */ -void __init sparse_memory_present_with_active_regions(int nid) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) - memory_present(this_nid, start_pfn, end_pfn); -} - /** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. @@ -6030,7 +6817,7 @@ * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ -void __meminit get_pfn_range_for_nid(unsigned int nid, +void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long this_start_pfn, this_end_pfn; @@ -6079,7 +6866,7 @@ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that * zones within a node are in order of monotonic increases memory addresses */ -static void __meminit adjust_zone_range_for_zone_movable(int nid, +static void __init adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -6110,13 +6897,12 @@ * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ -static unsigned long __meminit zone_spanned_pages_in_node(int nid, +static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6147,7 +6933,7 @@ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -6168,7 +6954,7 @@ * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * - * It returns the number of pages frames in memory holes within a range. + * Return: the number of pages frames in memory holes within a range. 
*/ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) @@ -6177,11 +6963,10 @@ } /* Return the number of page frames in holes in a zone on a node */ -static unsigned long __meminit zone_absent_pages_in_node(int nid, +static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6209,7 +6994,7 @@ unsigned long start_pfn, end_pfn; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { start_pfn = clamp(memblock_region_memory_base_pfn(r), zone_start_pfn, zone_end_pfn); end_pfn = clamp(memblock_region_memory_end_pfn(r), @@ -6228,45 +7013,9 @@ return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __meminit zone_absent_pages_in_node(int nid, - unsigned long zone_type, +static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6274,17 +7023,21 @@ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6330,10 +7083,14 @@ { unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; - if (usemapsize) + if (usemapsize) { zone->pageblock_flags = - memblock_virt_alloc_node_nopanic(usemapsize, - pgdat->node_id); + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, + pgdat->node_id); + if (!zone->pageblock_flags) + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", + usemapsize, zone->name, pgdat->node_id); + } } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -6400,9 +7157,11 @@ #ifdef 
CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { - spin_lock_init(&pgdat->split_queue_lock); - INIT_LIST_HEAD(&pgdat->split_queue); - pgdat->split_queue_len = 0; + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; + + spin_lock_init(&ds_queue->split_queue_lock); + INIT_LIST_HEAD(&ds_queue->split_queue); + ds_queue->split_queue_len = 0; } #else static void pgdat_init_split_queue(struct pglist_data *pgdat) {} @@ -6429,13 +7188,13 @@ pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); - lruvec_init(node_lruvec(pgdat)); + lruvec_init(&pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, unsigned long remaining_pages) { - zone->managed_pages = remaining_pages; + atomic_long_set(&zone->managed_pages, remaining_pages); zone_set_nid(zone, nid); zone->name = zone_names[idx]; zone->zone_pgdat = NODE_DATA(nid); @@ -6533,7 +7292,7 @@ set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init(size, nid, j, zone_start_pfn); + arch_memmap_init(size, nid, j, zone_start_pfn); } } @@ -6562,7 +7321,11 @@ end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node(size, SMP_CACHE_BYTES, + pgdat->node_id); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6574,10 +7337,8 @@ */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6588,42 +7349,31 @@ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) { - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. - */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6631,80 +7381,10 @@ free_area_init_core(pgdat); } -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) - -/* - * Zero all valid struct pages in range [spfn, epfn), return number of struct - * pages zeroed - */ -static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +void __init free_area_init_memoryless_node(int nid) { - unsigned long pfn; - u64 pgcnt = 0; - - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - mm_zero_struct_page(pfn_to_page(pfn)); - pgcnt++; - } - - return pgcnt; + free_area_init_node(nid); } - -/* - * Only struct pages that are backed by physical memory are zeroed and - * initialized by going through __init_single_page(). But, there are some - * struct pages which are reserved in memblock allocator and their fields - * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly zero those struct pages. - * - * This function also addresses a similar issue where struct pages are left - * uninitialized because the physical address range is not covered by - * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=, or when the highest physical - * address (max_pfn) does not end on a section boundary. - */ -void __init zero_resv_unavail(void) -{ - phys_addr_t start, end; - u64 i, pgcnt; - phys_addr_t next = 0; - - /* - * Loop through unavailable ranges not covered by memblock.memory. - */ - pgcnt = 0; - for_each_mem_range(i, &memblock.memory, NULL, - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { - if (next < start) - pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); - next = end; - } - - /* - * Early sections always have a fully populated memmap for the whole - * section - see pfn_valid(). If the last section has holes at the - * end and that section is marked "online", the memmap will be - * considered initialized. Make sure that memmap has a well defined - * state. - */ - pgcnt += zero_pfn_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); - - /* - * Struct pages that do not have backing memory. This could be because - * firmware is using some of this memory, or for some other reasons. - */ - if (pgcnt) - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); -} -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 /* @@ -6735,14 +7415,14 @@ * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * - * Returns the determined alignment in pfn's. 0 if there is no alignment + * Return: the determined alignment in pfn's. 0 if there is no alignment * requirement (single node). 
*/ unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; unsigned long start, end, mask; - int last_nid = -1; + int last_nid = NUMA_NO_NODE; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { @@ -6769,33 +7449,15 @@ return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * - * It returns the minimum PFN based on information provided via + * Return: the minimum PFN based on information provided via * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* @@ -6844,11 +7506,11 @@ * options. */ if (movable_node_is_enabled()) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -6865,11 +7527,11 @@ if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); @@ -6884,7 +7546,7 @@ } if (mem_below_4gb_not_mirrored) - pr_warn("This configuration results in unmirrored kernel memory."); + pr_warn("This configuration results in unmirrored kernel memory.\n"); goto out2; } @@ -7023,9 +7685,16 @@ out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) + for (nid = 0; nid < MAX_NUMNODES; nid++) { + unsigned long start_pfn, end_pfn; + zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (zone_movable_pfn[nid] >= end_pfn) + zone_movable_pfn[nid] = 0; + } out: /* restore the node_state */ @@ -7037,23 +7706,29 @@ { enum zone_type zone_type; - if (N_MEMORY == N_NORMAL_MEMORY) - return; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { - node_set_state(nid, N_HIGH_MEMORY); - if (N_NORMAL_MEMORY != N_HIGH_MEMORY && - zone_type <= ZONE_NORMAL) + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } } } +/* + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7065,10 +7740,11 @@ * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
*/ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7077,14 +7753,20 @@ sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } @@ -7118,27 +7800,33 @@ (u64)zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early node map */ + /* + * Print out the early node map, and initialize the + * subsection-map relative to active online memory ranges to + * enable future "sub-section" extensions of the memory map. + */ pr_info("Early memory node ranges\n"); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + subsection_map_init(start_pfn, end_pfn - start_pfn); + } /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - zero_resv_unavail(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + + memmap_init(); } static int __init cmdline_parse_core(char *p, unsigned long *core, @@ -7197,22 +7885,18 @@ early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { - spin_lock(&managed_page_count_lock); - page_zone(page)->managed_pages += count; - totalram_pages += count; + atomic_long_add(count, &page_zone(page)->managed_pages); + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif - spin_unlock(&managed_page_count_lock); } EXPORT_SYMBOL(adjust_managed_page_count); -unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) { void *pos; unsigned long pages = 0; @@ -7231,6 +7915,11 @@ * alias for the memset(). */ direct_map_addr = page_address(page); + /* + * Perform a kasan-unchecked memset() since this memory + * has not been initialized. 
+ */ + direct_map_addr = kasan_reset_tag(direct_map_addr); if ((unsigned int)poison <= 0xFF) memset(direct_map_addr, poison, PAGE_SIZE); @@ -7243,15 +7932,14 @@ return pages; } -EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; - page_zone(page)->managed_pages++; - totalhigh_pages++; + totalram_pages_inc(); + atomic_long_inc(&page_zone(page)->managed_pages); + totalhigh_pages_inc(); } #endif @@ -7278,7 +7966,7 @@ */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) @@ -7300,10 +7988,10 @@ physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? str : ""); } @@ -7322,13 +8010,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) { dma_reserve = new_dma_reserve; -} - -void __init free_area_init(unsigned long *zones_size) -{ - zero_resv_unavail(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } static int page_alloc_cpu_dead(unsigned int cpu) @@ -7356,9 +8037,27 @@ return 0; } +#ifdef CONFIG_NUMA +int hashdist = HASHDIST_DEFAULT; + +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + void __init page_alloc_init(void) { int ret; + +#ifdef CONFIG_NUMA + if (num_node_state(N_MEMORY) == 1) + hashdist = 0; +#endif ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, "mm/page_alloc:dead", NULL, @@ -7383,6 +8082,7 @@ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; + unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -7393,8 +8093,8 @@ /* we treat the high watermark as reserved pages. 
*/ max += high_wmark_pages(zone); - if (max > zone->managed_pages) - max = zone->managed_pages; + if (max > managed_pages) + max = managed_pages; pgdat->totalreserve_pages += max; @@ -7413,30 +8113,24 @@ static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; - enum zone_type j, idx; + enum zone_type i, j; for_each_online_pgdat(pgdat) { - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone->managed_pages; + for (i = 0; i < MAX_NR_ZONES - 1; i++) { + struct zone *zone = &pgdat->node_zones[i]; + int ratio = sysctl_lowmem_reserve_ratio[i]; + bool clear = !ratio || !zone_managed_pages(zone); + unsigned long managed_pages = 0; - zone->lowmem_reserve[j] = 0; + for (j = i + 1; j < MAX_NR_ZONES; j++) { + struct zone *upper_zone = &pgdat->node_zones[j]; - idx = j; - while (idx) { - struct zone *lower_zone; + managed_pages += zone_managed_pages(upper_zone); - idx--; - lower_zone = pgdat->node_zones + idx; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; - lower_zone->lowmem_reserve[j] = 0; - } else { - lower_zone->lowmem_reserve[j] = - managed_pages / sysctl_lowmem_reserve_ratio[idx]; - } - managed_pages += lower_zone->managed_pages; + if (clear) + zone->lowmem_reserve[j] = 0; + else + zone->lowmem_reserve[j] = managed_pages / ratio; } } } @@ -7456,18 +8150,17 @@ /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->managed_pages; + lowmem_pages += zone_managed_pages(zone); } for_each_zone(zone) { - u64 min, low; + u64 tmp, low; spin_lock_irqsave(&zone->lock, flags); - min = (u64)pages_min * zone->managed_pages; - do_div(min, lowmem_pages); - low = (u64)pages_low * zone->managed_pages; - do_div(low, vm_total_pages); - + tmp = (u64)pages_min * zone_managed_pages(zone); + do_div(tmp, lowmem_pages); + low = (u64)pages_low * zone_managed_pages(zone); + do_div(low, nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE))); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -7475,20 +8168,20 @@ * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas control asynch page reclaim, and so should + * deltas control async page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; - min_pages = zone->managed_pages / 1024; + min_pages = zone_managed_pages(zone) / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); - zone->watermark[WMARK_MIN] = min_pages; + zone->_watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = min; + zone->_watermark[WMARK_MIN] = tmp; } /* @@ -7496,14 +8189,13 @@ * scale factor in proportion to available memory, but * ensure a minimum size on small systems. */ - min = max_t(u64, min >> 2, - mult_frac(zone->managed_pages, + tmp = max_t(u64, tmp >> 2, + mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + - low + min; - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + - low + min * 2; + zone->watermark_boost = 0; + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + tmp; + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + tmp * 2; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7532,7 +8224,7 @@ * Initialise min_free_kbytes. * * For small machines we want it small (128k min). 
For large machines - * we want it large (64MB max). But it is not linear, because network + * we want it large (256MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: @@ -7564,8 +8256,8 @@ min_free_kbytes = new_min_free_kbytes; if (min_free_kbytes < 128) min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + if (min_free_kbytes > 262144) + min_free_kbytes = 262144; } else { pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", new_min_free_kbytes, user_min_free_kbytes); @@ -7591,7 +8283,7 @@ * or extra_free_kbytes changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7607,7 +8299,7 @@ } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7631,13 +8323,13 @@ pgdat->min_unmapped_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; } int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7659,12 +8351,12 @@ pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7688,11 +8380,28 @@ * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; +} + +static void __zone_pcp_update(struct zone *zone) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } /* @@ -7701,7 +8410,7 @@ * pagelist can have before it gets flushed back to buddy allocator. 
*/ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; @@ -7726,30 +8435,12 @@ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) goto out; - for_each_populated_zone(zone) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); - } + for_each_populated_zone(zone) + __zone_pcp_update(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; } - -#ifdef CONFIG_NUMA -int hashdist = HASHDIST_DEFAULT; - -static int __init set_hashdist(char *str) -{ - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; -} -__setup("hashdist=", set_hashdist); -#endif #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* @@ -7797,6 +8488,7 @@ unsigned long log2qty, size; void *table = NULL; gfp_t gfp_flags; + bool virt; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7853,32 +8545,34 @@ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { + virt = false; size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_virt_alloc_nopanic(size, 0); + table = memblock_alloc(size, SMP_CACHE_BYTES); else - table = memblock_virt_alloc_raw(size, 0); - } else if (hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = memblock_alloc_raw(size, + SMP_CACHE_BYTES); + } else if (get_order(size) >= MAX_ORDER || hashdist) { + table = __vmalloc(size, gfp_flags); + virt = true; } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, gfp_flags); - kmemleak_alloc(table, size, 1, gfp_flags); - } + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + virt ? "vmalloc" : "linear"); if (_hash_shift) *_hash_shift = log2qty; @@ -7890,47 +8584,50 @@ /* * This function checks whether pageblock includes unmovable pages or not. - * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable * check without lock_page also may miss some movable non-lru pages at * race condition. So you can't expect this function should be exact. + * + * Returns a page without holding a reference. If the caller wants to + * dereference that page (e.g., dumping), it has to make sure that it + * cannot get removed (e.g., via memory unplug) concurrently. 
+ * */ -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - int migratetype, - bool skip_hwpoisoned_pages) +struct page *has_unmovable_pages(struct zone *zone, struct page *page, + int migratetype, int flags) { - unsigned long pfn, iter, found; + unsigned long iter = 0; + unsigned long pfn = page_to_pfn(page); + unsigned long offset = pfn % pageblock_nr_pages; - /* - * TODO we could make this much more efficient by not checking every - * page in the range if we know all of them are in MOVABLE_ZONE and - * that the movable zone guarantees that pages are migratable but - * the later is not the case right now unfortunatelly. E.g. movablecore - * can still lead to having bootmem allocations in zone_movable. - */ + if (is_migrate_cma_page(page)) { + /* + * CMA allocations (alloc_contig_range) really need to mark + * isolate CMA pageblocks even when they are not movable in fact + * so consider them movable here. + */ + if (is_migrate_cma(migratetype)) + return NULL; - /* - * CMA allocations (alloc_contig_range) really need to mark isolate - * CMA pageblocks even when they are not movable in fact so consider - * them movable here. - */ - if (is_migrate_cma(migratetype) && - is_migrate_cma(get_pageblock_migratetype(page))) - return false; + return page; + } - pfn = page_to_pfn(page); - for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { - unsigned long check = pfn + iter; - - if (!pfn_valid_within(check)) + for (; iter < pageblock_nr_pages - offset; iter++) { + if (!pfn_valid_within(pfn + iter)) continue; - page = pfn_to_page(check); + page = pfn_to_page(pfn + iter); + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". + */ if (PageReserved(page)) - goto unmovable; + return page; /* * If the zone is movable and we have ruled out all reserved @@ -7942,17 +8639,22 @@ /* * Hugepages are not in LRU lists, but they're movable. - * We need not scan over tail pages bacause we don't + * THPs are on the LRU, but need to be counted as #small pages. + * We need not scan over tail pages because we don't * handle each tail page individually in migration. */ - if (PageHuge(page)) { + if (PageHuge(page) || PageTransCompound(page)) { struct page *head = compound_head(page); unsigned int skip_pages; - if (!hugepage_migration_supported(page_hstate(head))) - goto unmovable; + if (PageHuge(page)) { + if (!hugepage_migration_supported(page_hstate(head))) + return page; + } else if (!PageLRU(head) && !__PageMovable(head)) { + return page; + } - skip_pages = (1 << compound_order(head)) - (page - head); + skip_pages = compound_nr(head) - (page - head); iter += skip_pages - 1; continue; } @@ -7965,7 +8667,7 @@ */ if (!page_ref_count(page)) { if (PageBuddy(page)) - iter += (1 << page_order(page)) - 1; + iter += (1 << buddy_order(page)) - 1; continue; } @@ -7973,61 +8675,100 @@ * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. */ - if (skip_hwpoisoned_pages && PageHWPoison(page)) + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; - if (__PageMovable(page)) + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. 
+ * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) continue; - if (!PageLRU(page)) - found++; + if (__PageMovable(page) || PageLRU(page)) + continue; + /* * If there are RECLAIMABLE pages, we need to check * it. But now, memory offline itself doesn't call * shrink_node_slabs() and it still to be fixed. */ - /* - * If the page is not RAM, page_count()should be 0. - * we don't need more check. This is an _used_ not-movable page. - * - * The problematic thing here is PG_reserved pages. PG_reserved - * is set to both of a memory hole page and a _used_ kernel - * page at boot. - */ - if (found > count) - goto unmovable; + return page; } - return false; -unmovable: - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - return true; + return NULL; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, pageblock_nr_pages) - 1); } -static unsigned long pfn_max_align_up(unsigned long pfn) +unsigned long pfn_max_align_up(unsigned long pfn) { return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, pageblock_nr_pages)); } +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +/* Usage: See admin-guide/dynamic-debug-howto.rst */ +static void alloc_contig_dump_pages(struct list_head *page_list) +{ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); + + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { + struct page *page; + unsigned long nr_skip = 0; + unsigned long nr_pages = 0; + + dump_stack(); + list_for_each_entry(page, page_list, lru) { + nr_pages++; + /* The page will be freed by putback_movable_pages soon */ + if (page_count(page) == 1) { + nr_skip++; + continue; + } + dump_page(page, "migration failure"); + } + pr_warn("total dump_pages %lu skipping %lu\n", nr_pages, nr_skip); + } +} +#else +static inline void alloc_contig_dump_pages(struct list_head *page_list) +{ +} +#endif + /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + struct acr_info *info) { /* This function is based on compact_zone() from compaction.c. */ - unsigned long nr_reclaimed; + unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; + unsigned int max_tries = 5; int ret = 0; + struct page *page; + struct migration_target_control mtc = { + .nid = zone_to_nid(cc->zone), + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; - migrate_prep(); + if (cc->alloc_contig && cc->mode == MIGRATE_ASYNC) + max_tries = 1; + + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8043,20 +8784,39 @@ break; } tries = 0; - } else if (++tries == 5) { + } else if (++tries == max_tries) { ret = ret < 0 ? 
 			ret : -EBUSY;
 			break;
 		}
 
 		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
 							&cc->migratepages);
+		info->nr_reclaimed += nr_reclaimed;
 		cc->nr_migratepages -= nr_reclaimed;
-		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
-				    NULL, 0, cc->mode, MR_CONTIG_RANGE);
+		list_for_each_entry(page, &cc->migratepages, lru)
+			info->nr_mapped += page_mapcount(page);
+
+		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+				NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+		if (!ret)
+			info->nr_migrated += cc->nr_migratepages;
 	}
+
+	lru_cache_enable();
 	if (ret < 0) {
+		if (ret == -EBUSY) {
+			alloc_contig_dump_pages(&cc->migratepages);
+			page_pinner_mark_migration_failed_pages(&cc->migratepages);
+		}
+
+		if (!list_empty(&cc->migratepages)) {
+			page = list_first_entry(&cc->migratepages, struct page, lru);
+			info->failed_pfn = page_to_pfn(page);
+		}
+
 		putback_movable_pages(&cc->migratepages);
+		info->err |= ACR_ERR_MIGRATE;
 		return ret;
 	}
 	return 0;
@@ -8079,25 +8839,28 @@
  * pageblocks in the range.  Once isolated, the pageblocks should not
  * be modified by others.
  *
- * Returns zero on success or negative error code.  On success all
+ * Return: zero on success or negative error code.  On success all
  * pages which PFN is in [start, end) are allocated for the caller and
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range(unsigned long start, unsigned long end,
-		       unsigned migratetype, gfp_t gfp_mask)
+		       unsigned migratetype, gfp_t gfp_mask,
+		       struct acr_info *info)
 {
 	unsigned long outer_start, outer_end;
 	unsigned int order;
 	int ret = 0;
+	bool skip_drain_all_pages = false;
 
 	struct compact_control cc = {
 		.nr_migratepages = 0,
 		.order = -1,
 		.zone = page_zone(pfn_to_page(start)),
-		.mode = MIGRATE_SYNC,
+		.mode = gfp_mask & __GFP_NORETRY ? MIGRATE_ASYNC : MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 		.no_set_skip_hint = true,
 		.gfp_mask = current_gfp_context(gfp_mask),
+		.alloc_contig = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -8126,14 +8889,18 @@
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype,
-				       false);
-	if (ret)
+				       pfn_max_align_up(end), migratetype, 0,
+				       &info->failed_pfn);
+	if (ret) {
+		info->err |= ACR_ERR_ISOLATE;
 		return ret;
+	}
 
-#ifdef CONFIG_CMA
-	cc.zone->cma_alloc = 1;
-#endif
+	trace_android_vh_cma_drain_all_pages_bypass(migratetype,
+						&skip_drain_all_pages);
+	if (!skip_drain_all_pages)
+		drain_all_pages(cc.zone);
+
 	/*
 	 * In case of -EBUSY, we'd like to know which page causes problem.
 	 * So, just fall through. test_pages_isolated() has a tracepoint
@@ -8144,8 +8911,8 @@
 	 * allocated.  So, if we fall through be sure to clear ret so that
 	 * -EBUSY is not accidentally used or returned to caller.
 	 */
-	ret = __alloc_contig_migrate_range(&cc, start, end);
-	if (ret && ret != -EBUSY)
+	ret = __alloc_contig_migrate_range(&cc, start, end, info);
+	if (ret && (ret != -EBUSY || (gfp_mask & __GFP_NORETRY)))
 		goto done;
 	ret = 0;
 
@@ -8166,9 +8933,6 @@
 	 * isolated thus they won't get removed from buddy.
 	 */
 
-	lru_add_drain_all();
-	drain_all_pages(cc.zone);
-
 	order = 0;
 	outer_start = start;
 	while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -8180,7 +8944,7 @@
 	}
 
 	if (outer_start != start) {
-		order = page_order(pfn_to_page(outer_start));
+		order = buddy_order(pfn_to_page(outer_start));
 
 		/*
 		 * outer_start page could be small order buddy page and
@@ -8193,10 +8957,11 @@
 	}
 
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end, false)) {
+	if (test_pages_isolated(outer_start, end, 0, &info->failed_pfn)) {
 		pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
 			__func__, outer_start, end);
 		ret = -EBUSY;
+		info->err |= ACR_ERR_TEST;
 		goto done;
 	}
 
@@ -8216,13 +8981,114 @@
 done:
 	undo_isolate_page_range(pfn_max_align_down(start),
 				pfn_max_align_up(end), migratetype);
-#ifdef CONFIG_CMA
-	cc.zone->cma_alloc = 0;
-#endif
 	return ret;
 }
+EXPORT_SYMBOL(alloc_contig_range);
 
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+static int __alloc_contig_pages(unsigned long start_pfn,
+				unsigned long nr_pages, gfp_t gfp_mask)
+{
+	struct acr_info dummy;
+	unsigned long end_pfn = start_pfn + nr_pages;
+
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  gfp_mask, &dummy);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+				   unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		page = pfn_to_online_page(i);
+		if (!page)
+			return false;
+
+		if (page_zone(page) != z)
+			return false;
+
+		if (PageReserved(page))
+			return false;
+
+		if (page_count(page) > 0)
+			return false;
+
+		if (PageHuge(page))
+			return false;
+	}
+	return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+				unsigned long start_pfn, unsigned long nr_pages)
+{
+	unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+	return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages:	Number of contiguous pages to allocate
+ * @gfp_mask:	GFP mask to limit search and used during compaction
+ * @nid:	Target node
+ * @nodemask:	Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two then the alignment is guaranteed to be to the given nr_pages
+ * (e.g. 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				int nid, nodemask_t *nodemask)
+{
+	unsigned long ret, pfn, flags;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+
+	zonelist = node_zonelist(nid, gfp_mask);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(gfp_mask), nodemask) {
+		spin_lock_irqsave(&zone->lock, flags);
+
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+				/*
+				 * We release the zone lock here because
+				 * alloc_contig_range() will also lock the zone
+				 * at some point. If there's an allocation
+				 * spinning on this lock, it may win the race
+				 * and cause alloc_contig_range() to fail...
+				 */
+				spin_unlock_irqrestore(&zone->lock, flags);
+				ret = __alloc_contig_pages(pfn, nr_pages,
+							   gfp_mask);
+				if (!ret)
+					return pfn_to_page(pfn);
+				spin_lock_irqsave(&zone->lock, flags);
+			}
+			pfn += nr_pages;
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
+
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
 {
 	unsigned int count = 0;
@@ -8234,7 +9100,7 @@
 	}
 	WARN(count != 0, "%d pages are still in use!\n", count);
 }
-#endif
+EXPORT_SYMBOL(free_contig_range);
 
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
@@ -8242,11 +9108,8 @@
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	unsigned cpu;
 	mutex_lock(&pcp_batch_high_lock);
-	for_each_possible_cpu(cpu)
-		pageset_set_high_and_batch(zone,
-				per_cpu_ptr(zone->pageset, cpu));
+	__zone_pcp_update(zone);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
@@ -8271,32 +9134,21 @@
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
  */
-void
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
+	unsigned long pfn = start_pfn;
 	struct page *page;
 	struct zone *zone;
-	unsigned int order, i;
-	unsigned long pfn;
+	unsigned int order;
 	unsigned long flags;
 
-	/* find the first valid pfn */
-	for (pfn = start_pfn; pfn < end_pfn; pfn++)
-		if (pfn_valid(pfn))
-			break;
-	if (pfn == end_pfn)
-		return;
+	offline_mem_sections(pfn, end_pfn);
 	zone = page_zone(pfn_to_page(pfn));
 	spin_lock_irqsave(&zone->lock, flags);
-	pfn = start_pfn;
 	while (pfn < end_pfn) {
-		if (!pfn_valid(pfn)) {
-			pfn++;
-			continue;
-		}
 		page = pfn_to_page(pfn);
 		/*
 		 * The HWPoisoned page may be not in buddy system, and
@@ -8304,22 +9156,23 @@
 		 */
 		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
 			pfn++;
-			SetPageReserved(page);
+			continue;
+		}
+		/*
+		 * At this point all remaining PageOffline() pages have a
+		 * reference count of 0 and can simply be skipped.
+		 */
+		if (PageOffline(page)) {
+			BUG_ON(page_count(page));
+			BUG_ON(PageBuddy(page));
+			pfn++;
 			continue;
 		}
 
 		BUG_ON(page_count(page));
 		BUG_ON(!PageBuddy(page));
-		order = page_order(page);
-#ifdef CONFIG_DEBUG_VM
-		pr_info("remove from free list %lx %d %lx\n",
-			pfn, 1 << order, end_pfn);
-#endif
-		list_del(&page->lru);
-		rmv_page_order(page);
-		zone->free_area[order].nr_free--;
-		for (i = 0; i < (1 << order); i++)
-			SetPageReserved((page+i));
+		order = buddy_order(page);
+		del_page_from_free_list(page, zone, order);
 		pfn += (1 << order);
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -8337,7 +9190,7 @@
 	for (order = 0; order < MAX_ORDER; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 
-		if (PageBuddy(page_head) && page_order(page_head) >= order)
+		if (PageBuddy(page_head) && buddy_order(page_head) >= order)
 			break;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -8347,30 +9200,87 @@
 
 #ifdef CONFIG_MEMORY_FAILURE
 /*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
  */
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+				   struct page *target, int low, int high,
+				   int migratetype)
+{
+	unsigned long size = 1 << high;
+	struct page *current_buddy, *next_page;
+
+	while (high > low) {
+		high--;
+		size >>= 1;
+
+		if (target >= &page[size]) {
+			next_page = page + size;
+			current_buddy = page;
+		} else {
+			next_page = page;
+			current_buddy = page + size;
+		}
+
+		if (set_page_guard(zone, current_buddy, high, migratetype))
+			continue;
+
+		if (current_buddy != target) {
+			add_to_free_list(current_buddy, zone, high, migratetype);
+			set_buddy_order(current_buddy, high);
+			page = next_page;
+		}
+	}
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
 {
 	struct zone *zone = page_zone(page);
 	unsigned long pfn = page_to_pfn(page);
 	unsigned long flags;
 	unsigned int order;
-	bool hwpoisoned = false;
+	bool ret = false;
 
 	spin_lock_irqsave(&zone->lock, flags);
 	for (order = 0; order < MAX_ORDER; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
+		int page_order = buddy_order(page_head);
 
-		if (PageBuddy(page_head) && page_order(page_head) >= order) {
-			if (!TestSetPageHWPoison(page))
-				hwpoisoned = true;
+		if (PageBuddy(page_head) && page_order >= order) {
+			unsigned long pfn_head = page_to_pfn(page_head);
+			int migratetype = get_pfnblock_migratetype(page_head,
+								   pfn_head);
+
+			del_page_from_free_list(page_head, zone, page_order);
+			break_down_buddy_pages(zone, page_head, page, 0,
+					       page_order, migratetype);
+			if (!is_migrate_isolate(migratetype))
+				__mod_zone_freepage_state(zone, -1, migratetype);
+			ret = true;
 			break;
 		}
+		if (page_count(page_head) > 0)
+			break;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
-
-	return hwpoisoned;
+	return ret;
 }
 #endif
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void)
+{
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat) {
+		struct zone *zone = &pgdat->node_zones[ZONE_DMA];
+
+		if (managed_zone(zone))
+			return true;
+	}
+	return false;
+}
+#endif /* CONFIG_ZONE_DMA */
-- 
Gitblit v1.6.2