From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/memory_hotplug.c | 1601 ++++++++++++++++++++++++++++++----------------------------- 1 files changed, 817 insertions(+), 784 deletions(-) diff --git a/kernel/mm/memory_hotplug.c b/kernel/mm/memory_hotplug.c index 20f079c..386d85f 100644 --- a/kernel/mm/memory_hotplug.c +++ b/kernel/mm/memory_hotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory_hotplug.c * @@ -33,13 +34,13 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <linux/compaction.h> #include <linux/rmap.h> #include <asm/tlbflush.h> #include "internal.h" +#include "shuffle.h" /* * online_page_callback contains pointer to current page onlining function. @@ -47,8 +48,6 @@ * changed by calling set_online_page_callback() for callback registration * and restore_online_page_callback() for generic callback restore. */ - -static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -68,18 +67,17 @@ bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif -EXPORT_SYMBOL_GPL(memhp_auto_online); static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_auto_online = true; - else if (!strcmp(str, "offline")) - memhp_auto_online = false; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } @@ -97,27 +95,38 @@ cpus_read_unlock(); } -/* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) -{ - struct resource *res, *conflict; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - if (!res) - return ERR_PTR(-ENOMEM); +u64 max_mem_size = U64_MAX; - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - conflict = request_resource_conflict(&iomem_resource, res); - if (conflict) { - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_debug("Device unaddressable memory block " - "memory hotplug at %#010llx !\n", - (unsigned long long)start); - } - pr_debug("System RAM resource %pR cannot be added\n", res); - kfree(res); +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) +{ + struct resource *res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; + + /* + * Make sure value parsed from 'mem=' only restricts memory adding + * while booting, so that memory hotplug won't be impacted. Please + * refer to document of 'mem=' in kernel-parameters.txt for more + * details. + */ + if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) + return ERR_PTR(-E2BIG); + + /* + * Request ownership of the new memory range. This might be + * a child of an existing resource that was present but + * not marked as busy. 
+ */ + res = __request_region(&iomem_resource, start, size, + resource_name, flags); + + if (!res) { + pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", + start, start + size); return ERR_PTR(-EEXIST); } return res; @@ -129,7 +138,6 @@ return; release_resource(res); kfree(res); - return; } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE @@ -163,9 +171,10 @@ #ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -185,10 +194,10 @@ for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); @@ -197,9 +206,10 @@ #else /* CONFIG_SPARSEMEM_VMEMMAP */ static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -208,10 +218,10 @@ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); @@ -247,16 +257,47 @@ } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - struct vmem_altmap *altmap, bool want_memblock) +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, + const char *reason) { - int ret; + /* + * Disallow all operations smaller than a sub-section and only + * allow operations smaller than a section for + * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() + * enforces a larger memory_block_size_bytes() granularity for + * memory that will be marked online, so this check should only + * fire for direct arch_{add,remove}_memory() users outside of + * add_memory_resource(). + */ + unsigned long min_align; - if (pfn_valid(phys_start_pfn)) - return -EEXIST; + if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + min_align = PAGES_PER_SUBSECTION; + else + min_align = PAGES_PER_SECTION; + if (!IS_ALIGNED(pfn, min_align) + || !IS_ALIGNED(nr_pages, min_align)) { + WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n", + reason, pfn, pfn + nr_pages - 1); + return -EINVAL; + } + return 0; +} - ret = sparse_add_one_section(nid, phys_start_pfn, altmap); - return ret < 0 ? 
ret : 0; +static int check_hotplug_memory_addressable(unsigned long pfn, + unsigned long nr_pages) +{ + const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; + + if (max_addr >> MAX_PHYSMEM_BITS) { + const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; + WARN(1, + "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", + (u64)PFN_PHYS(pfn), max_addr, max_allowed); + return -E2BIG; + } + + return 0; } /* @@ -265,47 +306,47 @@ * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) +int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, + struct mhp_params *params) { - unsigned long i; - int err = 0; - int start_sec, end_sec; + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; + int err; + struct vmem_altmap *altmap = params->altmap; - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + if (WARN_ON_ONCE(!params->pgprot.pgprot)) + return -EINVAL; + + err = check_hotplug_memory_addressable(pfn, nr_pages); + if (err) + return err; if (altmap) { /* * Validate altmap is within bounds of the total request */ - if (altmap->base_pfn != phys_start_pfn + if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - err = -EINVAL; - goto out; + return -EINVAL; } altmap->alloc = 0; } - for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + err = check_pfn_span(pfn, nr_pages, "add"); + if (err) + return err; - /* - * EEXIST is finally dealt with by ioresource collision - * check. see add_memory() => register_memory_resource() - * Warning will be printed if there is collision. - */ - if (err && (err != -EEXIST)) + for (; pfn < end_pfn; pfn += cur_nr_pages) { + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + err = sparse_add_section(nid, pfn, cur_nr_pages, altmap); + if (err) break; - err = 0; cond_resched(); } vmemmap_populate_print_last(); -out: return err; } @@ -314,14 +355,14 @@ unsigned long start_pfn, unsigned long end_pfn) { - for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) continue; - if (zone && zone != page_zone(pfn_to_page(start_pfn))) + if (zone != page_zone(pfn_to_page(start_pfn))) continue; return start_pfn; @@ -339,14 +380,14 @@ /* pfn is the end pfn of a memory section. 
*/ pfn = end_pfn - 1; - for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { + for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) continue; - if (zone && zone != page_zone(pfn_to_page(pfn))) + if (zone != page_zone(pfn_to_page(pfn))) continue; return pfn; @@ -358,14 +399,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ - unsigned long zone_end_pfn = z; unsigned long pfn; int nid = zone_to_nid(zone); zone_span_writelock(zone); - if (zone_start_pfn == start_pfn) { + if (zone->zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need * shrink zone->zone_start_pfn and zone->zone_spanned_pages. @@ -373,50 +411,30 @@ * for shrinking zone. */ pfn = find_smallest_section_pfn(nid, zone, end_pfn, - zone_end_pfn); + zone_end_pfn(zone)); if (pfn) { + zone->spanned_pages = zone_end_pfn(zone) - pfn; zone->zone_start_pfn = pfn; - zone->spanned_pages = zone_end_pfn - pfn; + } else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; } - } else if (zone_end_pfn == end_pfn) { + } else if (zone_end_pfn(zone) == end_pfn) { /* * If the section is biggest section in the zone, it need * shrink zone->spanned_pages. * In this case, we find second biggest valid mem_section for * shrinking zone. */ - pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, start_pfn); if (pfn) - zone->spanned_pages = pfn - zone_start_pfn + 1; + zone->spanned_pages = pfn - zone->zone_start_pfn + 1; + else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } } - - /* - * The section is not biggest or smallest mem_section in the zone, it - * only creates a hole in the zone. So in this case, we need not - * change the zone. But perhaps, the zone has only hole data. Thus - * it check the zone has only hole or not. - */ - pfn = zone_start_pfn; - for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { - if (unlikely(!pfn_to_online_page(pfn))) - continue; - - if (page_zone(pfn_to_page(pfn)) != zone) - continue; - - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) - continue; - - /* If we find valid section, we have nothing to do */ - zone_span_writeunlock(zone); - return; - } - - /* The zone has no valid section */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; zone_span_writeunlock(zone); } @@ -453,8 +471,20 @@ unsigned long start_pfn, unsigned long nr_pages) { + const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long flags; + unsigned long pfn, cur_nr_pages, flags; + + /* Poison struct pages because they are now uninitialized again. 
*/ + for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = + min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); + page_init_poison(pfn_to_page(pfn), + sizeof(struct page) * cur_nr_pages); + } #ifdef CONFIG_ZONE_DEVICE /* @@ -476,24 +506,21 @@ set_zone_contiguous(zone); } -static void __remove_section(struct mem_section *ms, unsigned long map_offset, +static void __remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, struct vmem_altmap *altmap) { - unsigned long start_pfn; - int scn_nr; + struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - - sparse_remove_one_section(ms, map_offset, altmap); + sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** * __remove_pages() - remove sections of pages - * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * @@ -502,28 +529,24 @@ * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -void __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages, +void __remove_pages(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long i; + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; unsigned long map_offset = 0; - int sections_to_remove; - if (altmap) - map_offset = vmem_altmap_offset(altmap); + map_offset = vmem_altmap_offset(altmap); - /* - * We can only remove entire sections - */ - BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); - BUG_ON(nr_pages % PAGES_PER_SECTION); + if (check_pfn_span(pfn, nr_pages, "remove")) + return; - sections_to_remove = nr_pages / PAGES_PER_SECTION; - for (i = 0; i < sections_to_remove; i++) { - unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - + for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); - __remove_section(__pfn_to_section(pfn), map_offset, altmap); + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + __remove_section(pfn, cur_nr_pages, map_offset, altmap); map_offset = 0; } } @@ -566,48 +589,39 @@ } EXPORT_SYMBOL_GPL(restore_online_page_callback); -void __online_page_set_limits(struct page *page) +void generic_online_page(struct page *page, unsigned int order) { + /* + * Freeing the page with debug_pagealloc enabled will try to unmap it, + * so we should map it first. This is better than introducing a special + * case in page freeing fast path. 
+ */ + debug_pagealloc_map_pages(page, 1 << order); + __free_pages_core(page, order); + totalram_pages_add(1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages_add(1UL << order); +#endif } -EXPORT_SYMBOL_GPL(__online_page_set_limits); +EXPORT_SYMBOL_GPL(generic_online_page); -void __online_page_increment_counters(struct page *page) +static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) { - adjust_managed_page_count(page, 1); -} -EXPORT_SYMBOL_GPL(__online_page_increment_counters); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; -void __online_page_free(struct page *page) -{ - __free_reserved_page(page); -} -EXPORT_SYMBOL_GPL(__online_page_free); + /* + * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * decide to not expose all pages to the buddy (e.g., expose them + * later). We account all pages as being online and belonging to this + * zone ("present"). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) + (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); -static void generic_online_page(struct page *page) -{ - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); -} - -static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg) -{ - unsigned long i; - unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; - - if (PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } - - online_mem_sections(start_pfn, start_pfn + nr_pages); - - *(unsigned long *)arg = onlined_pages; - return 0; + /* mark all involved sections as online */ + online_mem_sections(start_pfn, end_pfn); } /* check which state of node_states will be changed when online memory */ @@ -615,62 +629,19 @@ struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); - enum zone_type zone_last = ZONE_NORMAL; - /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. - */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; - /* - * if the memory to be online is in a zone of 0...zone_last, and - * the zones of 0...zone_last don't have memory before online, we will - * need to set the node to node_states[N_NORMAL_MEMORY] after - * the memory is online. - */ - if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) - arg->status_change_nid_normal = nid; - else - arg->status_change_nid_normal = -1; - -#ifdef CONFIG_HIGHMEM - /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. 
- */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) - arg->status_change_nid_high = nid; - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; -#endif - - /* - * if the node don't have memory befor online, we will need to - * set the node to node_states[N_MEMORY] after the memory - * is online. - */ if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; - else - arg->status_change_nid = -1; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; +#ifdef CONFIG_HIGHMEM + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid_high = nid; +#endif } static void node_states_set_node(int node, struct memory_notify *arg) @@ -681,7 +652,8 @@ if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); - node_set_state(node, N_MEMORY); + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, @@ -704,23 +676,32 @@ pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; -} +} +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. + */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; unsigned long flags; - - if (zone_is_empty(zone)) - init_currently_empty_zone(zone, start_pfn, nr_pages); clear_zone_contiguous(zone); /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ pgdat_resize_lock(pgdat, &flags); zone_span_writelock(zone); + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); resize_zone_range(zone, start_pfn, nr_pages); zone_span_writeunlock(zone); resize_pgdat_range(pgdat, start_pfn, nr_pages); @@ -732,8 +713,8 @@ * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMINIT_HOTPLUG, altmap); + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, + MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); } @@ -795,43 +776,25 @@ return default_zone_for_pfn(nid, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. 
- */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) -{ - struct zone *zone; - - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); - move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); - return zone; -} - -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid) { unsigned long flags; - unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; - int nid; int ret; struct memory_notify arg; - struct memory_block *mem; + + /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + return -EINVAL; mem_hotplug_begin(); - /* - * We can't use pfn_to_nid() because nid might be stored in struct page - * which is not yet initialized. Instead, we find nid from memory block. - */ - mem = find_memory_block(__pfn_to_section(pfn)); - nid = mem->nid; - put_device(&mem->dev); - /* associate pfn range with the zone */ - zone = move_pfn_range(online_type, nid, pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -843,6 +806,14 @@ goto failed_addition; /* + * Fixup the number of isolated pageblocks before marking the sections + * onlining, such that undo_isolate_page_range() works correctly. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. @@ -852,41 +823,37 @@ setup_zone_pageset(zone); } - ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, - online_pages_range); - if (ret) { - if (need_zonelists_rebuild) - zone_pcp_reset(zone); - goto failed_addition; - } - - zone->present_pages += onlined_pages; + online_pages_range(pfn, nr_pages); + zone->present_pages += nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += onlined_pages; + zone->zone_pgdat->node_present_pages += nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); - if (onlined_pages) { - node_states_set_node(nid, &arg); - if (need_zonelists_rebuild) - build_all_zonelists(NULL); - else - zone_pcp_update(zone); - } + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + zone_pcp_update(zone); + + /* Basic onlining is complete, allow allocation of onlined pages. */ + undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + + /* + * Freshly onlined pages aren't shuffled (e.g., all pages are placed to + * the tail of the freelist when undoing isolation). Shuffle the whole + * zone to make sure the just onlined pages are properly distributed + * across the whole freelist - to create an initial shuffle. 
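+	 *
+	 * (Note: shuffle_zone() only does work when the shuffle page
+	 * allocator is compiled in and enabled; otherwise it is a no-op,
+	 * so this call is harmless on configurations without it.)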
+ */ + shuffle_zone(zone); init_per_zone_wmark_min(); - if (onlined_pages) { - kswapd_run(nid); - kcompactd_run(nid); - } - - vm_total_pages = nr_free_pagecache_pages(); + kswapd_run(nid); + kcompactd_run(nid); writeback_set_ratelimit(); - if (onlined_pages) - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &arg); mem_hotplug_done(); return 0; @@ -912,10 +879,9 @@ } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_new_pgdat(int nid) { struct pglist_data *pgdat; - unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -923,26 +889,33 @@ if (!pgdat) return NULL; + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); arch_refresh_nodedata(nid, pgdat); } else { + int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } } /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); - pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -968,14 +941,12 @@ arch_refresh_nodedata(nid, NULL); free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); - return; } /** * try_online_node - online a node if offlined * @nid: the node ID - * @start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. 
* @@ -984,7 +955,7 @@ * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ -static int __try_online_node(int nid, u64 start, bool set_node_online) +static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; @@ -992,7 +963,7 @@ if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_new_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1016,23 +987,18 @@ int ret; mem_hotplug_begin(); - ret = __try_online_node(nid, 0, true); + ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } static int check_hotplug_memory_range(u64 start, u64 size) { - unsigned long block_sz = memory_block_size_bytes(); - u64 block_nr_pages = block_sz >> PAGE_SHIFT; - u64 nr_pages = size >> PAGE_SHIFT; - u64 start_pfn = PFN_DOWN(start); - /* memory range must be block size aligned */ - if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || - !IS_ALIGNED(nr_pages, block_nr_pages)) { + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes())) { pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", - block_sz, start, size); + memory_block_size_bytes(), start, size); return -EINVAL; } @@ -1041,6 +1007,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); } @@ -1050,8 +1017,9 @@ * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, bool online) +int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; u64 start, size; bool new_node = false; int ret; @@ -1063,23 +1031,23 @@ if (ret) return ret; + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + mem_hotplug_begin(); - /* - * Add new range to memblock so that when hotadd_new_pgdat() is called - * to allocate new pgdat, get_pfn_range_for_nid() will be able to find - * this new range and calculate total pages correctly. The range will - * be removed at hot-remove time. - */ - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); - ret = __try_online_node(nid, start, false); + ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; @@ -1102,143 +1070,170 @@ } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); - BUG_ON(ret); + link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); + /* + * In case we're allowed to merge the resource, flag it and trigger + * merging now that adding succeeded. 
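+	 *
+	 * Illustrative only - the caller shown here is an assumption based on
+	 * the mainline virtio-mem driver and is not part of this patch. A
+	 * driver that hot-adds many contiguous ranges can pass the flag so
+	 * that adjacent "System RAM" resources collapse into a single
+	 * /proc/iomem entry:
+	 *
+	 *   add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
+	 *                             "System RAM (virtio_mem)",
+	 *                             MEMHP_MERGE_RESOURCE);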
+ */ + if (mhp_flags & MEMHP_MERGE_RESOURCE) + merge_system_ram_resource(res); + /* online pages if requested */ - if (online) - walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), - NULL, online_memory_block); + if (memhp_default_online_type != MMOP_OFFLINE) + walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); - memblock_remove(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); mem_hotplug_done(); return ret; } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size) +int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; - res = register_memory_resource(start, size); + res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res, memhp_auto_online); + ret = add_memory_resource(nid, res, mhp_flags); if (ret < 0) release_memory_resource(res); return ret; } -int add_memory(int nid, u64 start, u64 size) +int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { int rc; lock_device_hotplug(); - rc = __add_memory(nid, start, size); + rc = __add_memory(nid, start, size, mhp_flags); unlock_device_hotplug(); return rc; } EXPORT_SYMBOL_GPL(add_memory); +int add_memory_subsection(int nid, u64 start, u64 size) +{ + struct mhp_params params = { .pgprot = PAGE_KERNEL }; + struct resource *res; + int ret; + + if (!IS_ALIGNED(start, SUBSECTION_SIZE) || + !IS_ALIGNED(size, SUBSECTION_SIZE)) { + pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n", + __func__, start, size); + return -EINVAL; + } + + res = register_memory_resource(start, size, "System RAM"); + if (IS_ERR(res)) + return PTR_ERR(res); + + mem_hotplug_begin(); + + nid = memory_add_physaddr_to_nid(start); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); + + ret = arch_add_memory(nid, start, size, ¶ms); + if (ret) { + pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n", + __func__, start, size); + goto err_add_memory; + } + mem_hotplug_done(); + + return ret; + +err_add_memory: + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); + + mem_hotplug_done(); + + release_memory_resource(res); + return ret; +} +EXPORT_SYMBOL_GPL(add_memory_subsection); + +/* + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". 
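+ *
+ * A minimal usage sketch (illustrative; the caller, range names and the
+ * MHP_NONE flag are assumptions based on the mainline dax/kmem driver and
+ * are not added by this patch):
+ *
+ *   rc = add_memory_driver_managed(numa_node, range_start, range_len,
+ *                                  "System RAM (kmem)", MHP_NONE);
+ *
+ * Unlike add_memory_resource(), this helper takes the device hotplug lock
+ * itself, so callers must not already hold it.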
+ */ +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name, mhp_t mhp_flags) +{ + struct resource *res; + int rc; + + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; + + lock_device_hotplug(); + + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; + } + + rc = add_memory_resource(nid, res, mhp_flags); + if (rc < 0) + release_memory_resource(res); + +out_unlock: + unlock_device_hotplug(); + return rc; +} +EXPORT_SYMBOL_GPL(add_memory_driver_managed); + #ifdef CONFIG_MEMORY_HOTREMOVE /* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). Using this, - * the function determines if the pageblock contains only free pages. - * Due to buddy contraints, a free page at least the size of a pageblock will - * be located at the start of the pageblock + * Confirm all pages in a range [start, end) belong to the same zone (skipping + * memory holes). When true, return the zone. */ -static inline int pageblock_free(struct page *page) -{ - return PageBuddy(page) && page_order(page) >= pageblock_order; -} - -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); - - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); - } - - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } - - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; -} - -/* - * Confirm all pages in a range [start, end) belong to the same zone. - * When true, return its valid [start, end). 
- */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end) +struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn) { unsigned long pfn, sec_end_pfn; - unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; @@ -1259,33 +1254,30 @@ continue; /* Check if we got outside of the zone */ if (zone && !zone_spans_pfn(zone, pfn + i)) - return 0; + return NULL; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) - return 0; - if (!zone) - start = pfn + i; + return NULL; zone = page_zone(page); - end = pfn + MAX_ORDER_NR_PAGES; } } - if (zone) { - *valid_start = start; - *valid_end = min(end, end_pfn); - return 1; - } else { - return 0; - } + return zone; } /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). We scan pfn because it's much - * easier than scanning over linked list. This function returns the pfn - * of the first found movable page if it's found, otherwise 0. + * non-lru movable pages and hugepages). Will skip over most unmovable + * pages (esp., pages that can be skipped when offlining), but bail out on + * definitely unmovable pages. + * + * Returns: + * 0 in case a movable page is found and movable_pfn was updated. + * -ENOENT in case no movable page was found. + * -EBUSY in case a definitely unmovable page was found. */ -static unsigned long scan_movable_pages(unsigned long start, unsigned long end) +static int scan_movable_pages(unsigned long start, unsigned long end, + unsigned long *movable_pfn) { unsigned long pfn; @@ -1297,68 +1289,55 @@ continue; page = pfn_to_page(pfn); if (PageLRU(page)) - return pfn; + goto found; if (__PageMovable(page)) - return pfn; + goto found; + + /* + * PageOffline() pages that are not marked __PageMovable() and + * have a reference count > 0 (after MEM_GOING_OFFLINE) are + * definitely unmovable. If their reference count would be 0, + * they could at least be skipped when offlining memory. + */ + if (PageOffline(page) && page_count(page)) + return -EBUSY; if (!PageHuge(page)) continue; head = compound_head(page); - if (hugepage_migration_supported(page_hstate(head)) && - page_huge_active(head)) - return pfn; - skip = (1 << compound_order(head)) - (page - head); + if (page_huge_active(head)) + goto found; + skip = compound_nr(head) - (page - head); pfn += skip - 1; } + return -ENOENT; +found: + *movable_pfn = pfn; return 0; } -static struct page *new_node_page(struct page *page, unsigned long private) -{ - int nid = page_to_nid(page); - nodemask_t nmask = node_states[N_MEMORY]; - - /* - * try to allocate from a different node but reuse this node if there - * are no other online nodes to be used (e.g. 
we are offlining a part - * of the only existing node) - */ - node_clear(nid, nmask); - if (nodes_empty(nmask)) - node_set(nid, nmask); - - return new_page_nodemask(page, nid, &nmask); -} - -#define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; - struct page *page; - int move_pages = NR_OFFLINE_AT_ONCE_PAGES; - int not_managed = 0; + struct page *page, *head; int ret = 0; LIST_HEAD(source); + static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); - for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); + head = compound_head(page); if (PageHuge(page)) { - struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; - if (compound_order(head) > PFN_SECTION_SHIFT) { - ret = -EBUSY; - break; - } - if (isolate_huge_page(page, &source)) - move_pages -= 1 << compound_order(head); + pfn = page_to_pfn(head) + compound_nr(head) - 1; + isolate_hugetlb(head, &source); continue; } else if (PageTransHuge(page)) - pfn = page_to_pfn(compound_head(page)) - + hpage_nr_pages(page) - 1; + pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; /* * HWPoison pages have elevated reference counts so the migration would @@ -1371,7 +1350,7 @@ if (WARN_ON(PageLRU(page))) isolate_lru_page(page); if (page_mapped(page)) - try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_IGNORE_MLOCK); continue; } @@ -1386,98 +1365,60 @@ else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ - put_page(page); list_add_tail(&page->lru, &source); - move_pages--; if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); } else { -#ifdef CONFIG_DEBUG_VM - pr_alert("failed to isolate pfn %lx\n", pfn); - dump_page(page, "isolation failed"); -#endif - put_page(page); - /* Because we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) { - not_managed++; - ret = -EBUSY; - break; + if (__ratelimit(&migrate_rs)) { + pr_warn("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); } } + put_page(page); } if (!list_empty(&source)) { - if (not_managed) { + nodemask_t nmask = node_states[N_MEMORY]; + struct migration_target_control mtc = { + .nmask = &nmask, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; + + /* + * We have checked that migration range is on a single zone so + * we can use the nid of the first page to all the others. + */ + mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + + /* + * try to allocate from a different node but reuse this node + * if there are no other online nodes to be used (e.g. 
we are + * offlining a part of the only existing node) + */ + node_clear(mtc.nid, nmask); + if (nodes_empty(nmask)) + node_set(mtc.nid, nmask); + ret = migrate_pages(&source, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + if (ret) { + list_for_each_entry(page, &source, lru) { + if (__ratelimit(&migrate_rs)) { + pr_warn("migrating pfn %lx failed ret:%d\n", + page_to_pfn(page), ret); + dump_page(page, "migration failure"); + } + } putback_movable_pages(&source); - goto out; } - - /* Allocate a new page from the nearest neighbor node */ - ret = migrate_pages(&source, new_node_page, NULL, 0, - MIGRATE_SYNC, MR_MEMORY_HOTPLUG); - if (ret) - putback_movable_pages(&source); } -out: + return ret; -} - -/* - * remove from free_area[] and mark all as Reserved. - */ -static int -offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, - void *data) -{ - __offline_isolated_pages(start, start + nr_pages); - return 0; -} - -static void -offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) -{ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, - offline_isolated_pages_cb); -} - -/* - * Check all pages in range, recoreded as memory resource, are isolated. - */ -static int -check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, - void *data) -{ - int ret; - long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); - offlined = nr_pages; - if (!ret) - *(long *)data += offlined; - return ret; -} - -static long -check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) -{ - long offlined = 0; - int ret; - - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, - check_pages_isolated_cb); - if (ret < 0) - offlined = (long)ret; - return offlined; } static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); @@ -1488,75 +1429,53 @@ { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; - enum zone_type zt, zone_last = ZONE_NORMAL; + enum zone_type zt; + + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + arg->status_change_nid_high = NUMA_NO_NODE; /* - * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_NORMAL, - * set zone_last to ZONE_NORMAL. - * - * If we don't have HIGHMEM nor movable node, - * node_states[N_NORMAL_MEMORY] contains nodes which have zones of - * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. */ - if (N_MEMORY == N_NORMAL_MEMORY) - zone_last = ZONE_MOVABLE; - - /* - * check whether node_states[N_NORMAL_MEMORY] will be changed. - * If the memory to be offline is in a zone of 0...zone_last, - * and it is the last present memory, 0...zone_last will - * become empty after offline , thus we can determind we will - * need to clear the node from node_states[N_NORMAL_MEMORY]. 
- */ - for (zt = 0; zt <= zone_last; zt++) + for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); - else - arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* - * If we have movable node, node_states[N_HIGH_MEMORY] - * contains nodes which have zones of 0...ZONE_HIGHMEM, - * set zone_last to ZONE_HIGHMEM. - * - * If we don't have movable node, node_states[N_NORMAL_MEMORY] - * contains nodes which have zones of 0...ZONE_MOVABLE, - * set zone_last to ZONE_MOVABLE. + * node_states[N_HIGH_MEMORY] contains nodes which + * have normal memory or high memory. + * Here we add the present_pages belonging to ZONE_HIGHMEM. + * If the zone is within the range of [0..ZONE_HIGHMEM), and + * we determine that the zones in that range become empty, + * we need to clear the node for N_HIGH_MEMORY. */ - zone_last = ZONE_HIGHMEM; - if (N_MEMORY == N_HIGH_MEMORY) - zone_last = ZONE_MOVABLE; - - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages; + if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); - else - arg->status_change_nid_high = -1; -#else - arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* - * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + * We have accounted the pages from [0..ZONE_NORMAL), and + * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM + * as well. + * Here we count the possible pages from ZONE_MOVABLE. + * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. */ - zone_last = ZONE_MOVABLE; + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; - /* - * check whether node_states[N_HIGH_MEMORY] will be changed - * If we try to offline the last present @nr_pages from the node, - * we can determind we will need to clear the node from - * node_states[N_HIGH_MEMORY]. 
- */ - for (; zt <= zone_last; zt++) - present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); - else - arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) @@ -1564,53 +1483,76 @@ if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); - if ((N_MEMORY != N_NORMAL_MEMORY) && - (arg->status_change_nid_high >= 0)) + if (arg->status_change_nid_high >= 0) node_clear_state(node, N_HIGH_MEMORY); - if ((N_MEMORY != N_HIGH_MEMORY) && - (arg->status_change_nid >= 0)) + if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } -static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn) +static int count_system_ram_pages_cb(unsigned long start_pfn, + unsigned long nr_pages, void *data) { - unsigned long pfn, nr_pages; - long offlined_pages; - int ret, node; + unsigned long *nr_system_ram_pages = data; + + *nr_system_ram_pages += nr_pages; + return 0; +} + +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn, system_ram_pages = 0; unsigned long flags; - unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; + int ret, node; + char *reason; - /* at least, alignment against pageblock is necessary */ - if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) - return -EINVAL; - if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) + /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); + /* + * Don't allow to offline memory blocks that contain holes. + * Consequently, memory blocks with holes can never get onlined + * via the hotplug path - online_pages() - as hotplugged memory has + * no holes. This way, we e.g., don't have to worry about marking + * memory holes PG_reserved, don't need pfn_valid() checks, and can + * avoid using walk_system_ram_range() later. + */ + walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, + count_system_ram_pages_cb); + if (system_ram_pages != nr_pages) { + ret = -EINVAL; + reason = "memory holes"; + goto failed_removal; + } + /* This makes hotplug much easier...and readable. we assume this for now. 
.*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, - &valid_end)) { - mem_hotplug_done(); - return -EINVAL; + zone = test_pages_in_a_zone(start_pfn, end_pfn); + if (!zone) { + ret = -EINVAL; + reason = "multizone range"; + goto failed_removal; } - - zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); - nr_pages = end_pfn - start_pfn; + lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, - MIGRATE_MOVABLE, true); + MIGRATE_MOVABLE, + MEMORY_OFFLINE | REPORT_FAILURE, NULL); if (ret) { - mem_hotplug_done(); - return ret; + reason = "failure to isolate range"; + goto failed_removal_lru_cache_disabled; } + + drain_all_pages(zone); arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1618,49 +1560,84 @@ ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); - if (ret) - goto failed_removal; - - pfn = start_pfn; -repeat: - /* start memory hot removal */ - ret = -EINTR; - if (signal_pending(current)) - goto failed_removal; - - cond_resched(); - lru_add_drain_all(); - drain_all_pages(zone); - - pfn = scan_movable_pages(start_pfn, end_pfn); - if (pfn) { /* We have movable pages */ - ret = do_migrate_range(pfn, end_pfn); - goto repeat; + if (ret) { + reason = "notifier failure"; + goto failed_removal_isolated; } + do { + pfn = start_pfn; + do { + if (signal_pending(current)) { + ret = -EINTR; + reason = "signal backoff"; + goto failed_removal_isolated; + } + + cond_resched(); + + ret = scan_movable_pages(pfn, end_pfn, &pfn); + if (!ret) { + /* + * TODO: fatal migration failures should bail + * out + */ + do_migrate_range(pfn, end_pfn); + } + } while (!ret); + + if (ret != -ENOENT) { + reason = "unmovable page"; + goto failed_removal_isolated; + } + + /* + * Dissolve free hugepages in the memory block before doing + * offlining actually in order to make hugetlbfs's object + * counting consistent. + */ + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) { + reason = "failure to dissolve huge pages"; + goto failed_removal_isolated; + } + + /* + * per-cpu pages are drained after start_isolate_page_range, but + * if there are still pages that are not free, make sure that we + * drain again, because when we isolated range we might have + * raced with another thread that was adding pages to pcp list. + * + * Forward progress should be still guaranteed because + * pages on the pcp list can only belong to MOVABLE_ZONE + * because has_unmovable_pages explicitly checks for + * PageBuddy on freed pages on other zones. + */ + ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL); + if (ret) + drain_all_pages(zone); + } while (ret); + + /* Mark all sections offline and remove free pages from the buddy. */ + __offline_isolated_pages(start_pfn, end_pfn); + pr_info("Offlined Pages %ld\n", nr_pages); + /* - * dissolve free hugepages in the memory block before doing offlining - * actually in order to make hugetlbfs's object counting consistent. + * The memory sections are marked offline, and the pageblock flags + * effectively stale; nobody should be touching them. Fixup the number + * of isolated pageblocks, memory onlining will properly revert this. */ - ret = dissolve_free_huge_pages(start_pfn, end_pfn); - if (ret) - goto failed_removal; - /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - if (offlined_pages < 0) - goto repeat; - pr_info("Offlined Pages %ld\n", offlined_pages); - /* Ok, all of our target is isolated. 
- We cannot do rollback at this point. */ - offline_isolated_pages(start_pfn, end_pfn); - /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + + lru_cache_enable(); /* removal success */ - adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); - zone->present_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + zone->present_pages -= nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= offlined_pages; + zone->zone_pgdat->node_present_pages -= nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); @@ -1677,7 +1654,6 @@ kcompactd_stop(node); } - vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); @@ -1685,73 +1661,21 @@ mem_hotplug_done(); return 0; -failed_removal: - pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_OFFLINE, &arg); - /* pushback to free area */ +failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + memory_notify(MEM_CANCEL_OFFLINE, &arg); +failed_removal_lru_cache_disabled: + lru_cache_enable(); +failed_removal: + pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, + reason); + /* pushback to free area */ mem_hotplug_done(); return ret; } -int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -{ - return __offline_pages(start_pfn, start_pfn + nr_pages); -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ - -/** - * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) - * @start_pfn: start pfn of the memory range - * @end_pfn: end pfn of the memory range - * @arg: argument passed to func - * @func: callback for each memory section walked - * - * This function walks through all present mem sections in range - * [start_pfn, end_pfn) and call func on each mem section. - * - * Returns the return value of func. - */ -int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, - void *arg, int (*func)(struct memory_block *, void *)) -{ - struct memory_block *mem = NULL; - struct mem_section *section; - unsigned long pfn, section_nr; - int ret; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - section_nr = pfn_to_section_nr(pfn); - if (!present_section_nr(section_nr)) - continue; - - section = __nr_to_section(section_nr); - /* same memblock? 
*/ - if (mem) - if ((section_nr >= mem->start_section_nr) && - (section_nr <= mem->end_section_nr)) - continue; - - mem = find_memory_block_hinted(section, mem); - if (!mem) - continue; - - ret = func(mem, arg); - if (ret) { - kobject_put(&mem->dev.kobj); - return ret; - } - } - - if (mem) - kobject_put(&mem->dev.kobj); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTREMOVE static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1760,12 +1684,13 @@ phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); - endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + endpa = beginpa + memory_block_size_bytes() - 1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); - } - return ret; + return -EBUSY; + } + return 0; } static int check_cpu_on_node(pg_data_t *pgdat) @@ -1781,34 +1706,6 @@ return -EBUSY; } - return 0; -} - -static void unmap_cpu_on_node(pg_data_t *pgdat) -{ -#ifdef CONFIG_ACPI_NUMA - int cpu; - - for_each_possible_cpu(cpu) - if (cpu_to_node(cpu) == pgdat->node_id) - numa_clear_node(cpu); -#endif -} - -static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) -{ - int ret; - - ret = check_cpu_on_node(pgdat); - if (ret) - return ret; - - /* - * the node will be offlined when we come here, so we can clear - * the cpu_to_node() now. - */ - - unmap_cpu_on_node(pgdat); return 0; } @@ -1855,7 +1752,7 @@ if (rc) return; - if (check_and_unmap_cpu_on_node(pgdat)) + if (check_cpu_on_node(pgdat)) return; /* @@ -1867,24 +1764,45 @@ } EXPORT_SYMBOL(try_offline_node); -static void __release_memory_resource(resource_size_t start, - resource_size_t size) +static int __ref try_remove_memory(int nid, u64 start, u64 size) { - int ret; + int rc = 0; + + BUG_ON(check_hotplug_memory_range(start, size)); /* - * When removing memory in the same granularity as it was added, - * this function never fails. It might only fail if resources - * have to be adjusted or split. We'll ignore the error, as - * removing of memory cannot fail. + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and return error + * if this is not the case. */ - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; + rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); + if (rc) + return rc; - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); + /* remove memmap entry */ + firmware_map_remove(start, start + size, "System RAM"); + + /* + * Memory block device removal under the device_hotplug_lock is + * a barrier against racing online attempts. + */ + remove_memory_block_devices(start, size); + + mem_hotplug_begin(); + + arch_remove_memory(nid, start, size, NULL); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_free(start, size); + memblock_remove(start, size); } + + release_mem_region_adjustable(start, size); + + try_offline_node(nid); + + mem_hotplug_done(); + return 0; } /** @@ -1897,48 +1815,163 @@ * and online/offline operations before this call, as required by * try_offline_node(). */ -void __ref __remove_memory(int nid, u64 start, u64 size) +void __remove_memory(int nid, u64 start, u64 size) { - int ret; - - BUG_ON(check_hotplug_memory_range(start, size)); /* - * All memory blocks must be offlined before removing memory. 
Check - * whether all memory blocks in question are offline and trigger a BUG() - * if this is not the case. + * trigger BUG() if some memory is not offlined prior to calling this + * function */ - ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); - if (ret) + if (try_remove_memory(nid, start, size)) BUG(); - - /* remove memmap entry */ - firmware_map_remove(start, start + size, "System RAM"); - memblock_free(start, size); - memblock_remove(start, size); - - /* - * Memory block device removal under the device_hotplug_lock is - * a barrier against racing online attempts. - */ - remove_memory_block_devices(start, size); - - mem_hotplug_begin(); - - arch_remove_memory(nid, start, size, NULL); - __release_memory_resource(start, size); - - try_offline_node(nid); - - mem_hotplug_done(); } -void remove_memory(int nid, u64 start, u64 size) +/* + * Remove memory if every memory block is offline, otherwise return -EBUSY is + * some memory is not offline + */ +int remove_memory(int nid, u64 start, u64 size) { + int rc; + lock_device_hotplug(); - __remove_memory(nid, start, size); + rc = try_remove_memory(nid, start, size); unlock_device_hotplug(); + + return rc; } EXPORT_SYMBOL_GPL(remove_memory); + +int remove_memory_subsection(int nid, u64 start, u64 size) +{ + if (!IS_ALIGNED(start, SUBSECTION_SIZE) || + !IS_ALIGNED(size, SUBSECTION_SIZE)) { + pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n", + __func__, start, size); + return -EINVAL; + } + + mem_hotplug_begin(); + arch_remove_memory(nid, start, size, NULL); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); + + release_mem_region_adjustable(start, size); + + mem_hotplug_done(); + + return 0; +} +EXPORT_SYMBOL_GPL(remove_memory_subsection); + +static int try_offline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t online_type = MMOP_ONLINE_KERNEL; + uint8_t **online_types = arg; + struct page *page; + int rc; + + /* + * Sense the online_type via the zone of the memory block. Offlining + * with multiple zones within one memory block will be rejected + * by offlining code ... so we don't care about that. + */ + page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); + if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + online_type = MMOP_ONLINE_MOVABLE; + + rc = device_offline(&mem->dev); + /* + * Default is MMOP_OFFLINE - change it only if offlining succeeded, + * so try_reonline_memory_block() can do the right thing. + */ + if (!rc) + **online_types = online_type; + + (*online_types)++; + /* Ignore if already offline. */ + return rc < 0 ? rc : 0; +} + +static int try_reonline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t **online_types = arg; + int rc; + + if (**online_types != MMOP_OFFLINE) { + mem->online_type = **online_types; + rc = device_online(&mem->dev); + if (rc < 0) + pr_warn("%s: Failed to re-online memory: %d", + __func__, rc); + } + + /* Continue processing all remaining memory blocks. */ + (*online_types)++; + return 0; +} + +/* + * Try to offline and remove memory. Might take a long time to finish in case + * memory is still in use. Primarily useful for memory devices that logically + * unplugged all memory (so it's no longer in use) and want to offline + remove + * that memory. 
+ */ +int offline_and_remove_memory(int nid, u64 start, u64 size) +{ + const unsigned long mb_count = size / memory_block_size_bytes(); + uint8_t *online_types, *tmp; + int rc; + + if (!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()) || !size) + return -EINVAL; + + /* + * We'll remember the old online type of each memory block, so we can + * try to revert whatever we did when offlining one memory block fails + * after offlining some others succeeded. + */ + online_types = kmalloc_array(mb_count, sizeof(*online_types), + GFP_KERNEL); + if (!online_types) + return -ENOMEM; + /* + * Initialize all states to MMOP_OFFLINE, so when we abort processing in + * try_offline_memory_block(), we'll skip all unprocessed blocks in + * try_reonline_memory_block(). + */ + memset(online_types, MMOP_OFFLINE, mb_count); + + lock_device_hotplug(); + + tmp = online_types; + rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); + + /* + * In case we succeeded to offline all memory, remove it. + * This cannot fail as it cannot get onlined in the meantime. + */ + if (!rc) { + rc = try_remove_memory(nid, start, size); + if (rc) + pr_err("%s: Failed to remove memory: %d", __func__, rc); + } + + /* + * Rollback what we did. While memory onlining might theoretically fail + * (nacked by a notifier), it barely ever happens. + */ + if (rc) { + tmp = online_types; + walk_memory_blocks(start, size, &tmp, + try_reonline_memory_block); + } + unlock_device_hotplug(); + + kfree(online_types); + return rc; +} +EXPORT_SYMBOL_GPL(offline_and_remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ -- Gitblit v1.6.2
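
Illustrative usage note (not part of the patch itself): below is a minimal sketch of how a hotplug-capable driver might consume the offline_and_remove_memory() interface added above. The module wrapper, the "removal_base" parameter, and the choice of NUMA node 0 are hypothetical; only offline_and_remove_memory() (with the three-argument signature used in this patch), memory_block_size_bytes(), and the whole-memory-block alignment requirement come from the code above.

/*
 * Hypothetical demo module: offline and remove one memory block at a
 * caller-supplied physical address using offline_and_remove_memory().
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/memory.h>          /* memory_block_size_bytes() */
#include <linux/memory_hotplug.h>  /* offline_and_remove_memory() */

static u64 removal_base;           /* hypothetical: physical start address */
module_param(removal_base, ullong, 0444);
MODULE_PARM_DESC(removal_base, "Physical base address of the block to remove");

static int __init removal_demo_init(void)
{
	const u64 size = memory_block_size_bytes();
	int rc;

	/* The interface only accepts ranges aligned to whole memory blocks. */
	if (!IS_ALIGNED(removal_base, size))
		return -EINVAL;

	/*
	 * Try to offline every block in the range and, on success, remove it.
	 * On failure the helper re-onlines whatever it already offlined, so
	 * the system is left in its previous state. NUMA node 0 is assumed
	 * here purely for illustration.
	 */
	rc = offline_and_remove_memory(0, removal_base, size);
	if (rc)
		pr_err("removal_demo: offline+remove failed: %d\n", rc);
	else
		pr_info("removal_demo: removed [%#llx-%#llx]\n",
			removal_base, removal_base + size - 1);

	return rc;
}
module_init(removal_demo_init);

MODULE_LICENSE("GPL");

Because offline_and_remove_memory() may block for a long time while pages are migrated, a real caller (for example a paravirtualized memory driver) would typically invoke it from a workqueue rather than from module init as shown in this sketch.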