2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memory_hotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/mm/memory_hotplug.c
  *
@@ -33,13 +34,13 @@
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>
 #include <linux/compaction.h>
 #include <linux/rmap.h>

 #include <asm/tlbflush.h>

 #include "internal.h"
+#include "shuffle.h"

 /*
  * online_page_callback contains pointer to current page onlining function.
@@ -47,8 +48,6 @@
  * changed by calling set_online_page_callback() for callback registration
  * and restore_online_page_callback() for generic callback restore.
  */
-
-static void generic_online_page(struct page *page);

 static online_page_callback_t online_page_callback = generic_online_page;
 static DEFINE_MUTEX(online_page_callback_lock);
@@ -68,18 +67,17 @@
 bool movable_node_enabled = false;

 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
 #else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
 #endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);

 static int __init setup_memhp_default_state(char *str)
 {
-    if (!strcmp(str, "online"))
-        memhp_auto_online = true;
-    else if (!strcmp(str, "offline"))
-        memhp_auto_online = false;
+    const int online_type = memhp_online_type_from_str(str);
+
+    if (online_type >= 0)
+        memhp_default_online_type = online_type;

     return 1;
 }
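
For illustration only (not part of the patch): a minimal sketch of the string-to-online-type mapping that setup_memhp_default_state() now delegates to. The real helper, memhp_online_type_from_str(), is defined outside this file (drivers/base/memory.c upstream); the function name below is a hypothetical stand-in describing its expected behaviour.

/* Hypothetical stand-in mirroring what memhp_online_type_from_str() is
 * expected to return for the "memhp_default_state=" boot parameter. */
static int example_online_type_from_str(const char *str)
{
    if (sysfs_streq(str, "offline"))
        return MMOP_OFFLINE;
    if (sysfs_streq(str, "online"))
        return MMOP_ONLINE;
    if (sysfs_streq(str, "online_kernel"))
        return MMOP_ONLINE_KERNEL;
    if (sysfs_streq(str, "online_movable"))
        return MMOP_ONLINE_MOVABLE;
    return -EINVAL; /* unrecognized: setup_memhp_default_state() keeps the default */
}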
@@ -97,27 +95,38 @@
     cpus_read_unlock();
 }

-/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
-{
-    struct resource *res, *conflict;
-    res = kzalloc(sizeof(struct resource), GFP_KERNEL);
-    if (!res)
-        return ERR_PTR(-ENOMEM);
+u64 max_mem_size = U64_MAX;

-    res->name = "System RAM";
-    res->start = start;
-    res->end = start + size - 1;
-    res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-    conflict = request_resource_conflict(&iomem_resource, res);
-    if (conflict) {
-        if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
-            pr_debug("Device unaddressable memory block "
-                 "memory hotplug at %#010llx !\n",
-                 (unsigned long long)start);
-        }
-        pr_debug("System RAM resource %pR cannot be added\n", res);
-        kfree(res);
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size,
+                         const char *resource_name)
+{
+    struct resource *res;
+    unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+    if (strcmp(resource_name, "System RAM"))
+        flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+
+    /*
+     * Make sure value parsed from 'mem=' only restricts memory adding
+     * while booting, so that memory hotplug won't be impacted. Please
+     * refer to document of 'mem=' in kernel-parameters.txt for more
+     * details.
+     */
+    if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
+        return ERR_PTR(-E2BIG);
+
+    /*
+     * Request ownership of the new memory range. This might be
+     * a child of an existing resource that was present but
+     * not marked as busy.
+     */
+    res = __request_region(&iomem_resource, start, size,
+                   resource_name, flags);
+
+    if (!res) {
+        pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+             start, start + size);
         return ERR_PTR(-EEXIST);
     }
     return res;
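
A rough in-file usage sketch (the driver name is invented): the resource_name argument is what decides whether the new IORESOURCE_SYSRAM_DRIVER_MANAGED flag is set.

/* Anything other than exactly "System RAM" is treated as driver-managed. */
static struct resource *example_reserve(u64 start, u64 size, bool driver_managed)
{
    return register_memory_resource(start, size,
            driver_managed ? "System RAM (example_driver)" /* invented name */
                   : "System RAM");
}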
@@ -129,7 +138,6 @@
         return;
     release_resource(res);
     kfree(res);
-    return;
 }

 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -163,9 +171,10 @@
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-    unsigned long *usemap, mapsize, section_nr, i;
+    unsigned long mapsize, section_nr, i;
     struct mem_section *ms;
     struct page *page, *memmap;
+    struct mem_section_usage *usage;

     section_nr = pfn_to_section_nr(start_pfn);
     ms = __nr_to_section(section_nr);
@@ -185,10 +194,10 @@
     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, SECTION_INFO);

-    usemap = ms->pageblock_flags;
-    page = virt_to_page(usemap);
+    usage = ms->usage;
+    page = virt_to_page(usage);

-    mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+    mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -197,9 +206,10 @@
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-    unsigned long *usemap, mapsize, section_nr, i;
+    unsigned long mapsize, section_nr, i;
     struct mem_section *ms;
     struct page *page, *memmap;
+    struct mem_section_usage *usage;

     section_nr = pfn_to_section_nr(start_pfn);
     ms = __nr_to_section(section_nr);
@@ -208,10 +218,10 @@

     register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

-    usemap = ms->pageblock_flags;
-    page = virt_to_page(usemap);
+    usage = ms->usage;
+    page = virt_to_page(usage);

-    mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+    mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -247,16 +257,47 @@
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-        struct vmem_altmap *altmap, bool want_memblock)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+        const char *reason)
 {
-    int ret;
+    /*
+     * Disallow all operations smaller than a sub-section and only
+     * allow operations smaller than a section for
+     * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+     * enforces a larger memory_block_size_bytes() granularity for
+     * memory that will be marked online, so this check should only
+     * fire for direct arch_{add,remove}_memory() users outside of
+     * add_memory_resource().
+     */
+    unsigned long min_align;

-    if (pfn_valid(phys_start_pfn))
-        return -EEXIST;
+    if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+        min_align = PAGES_PER_SUBSECTION;
+    else
+        min_align = PAGES_PER_SECTION;
+    if (!IS_ALIGNED(pfn, min_align)
+            || !IS_ALIGNED(nr_pages, min_align)) {
+        WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
+             reason, pfn, pfn + nr_pages - 1);
+        return -EINVAL;
+    }
+    return 0;
+}

-    ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
-    return ret < 0 ? ret : 0;
+static int check_hotplug_memory_addressable(unsigned long pfn,
+        unsigned long nr_pages)
+{
+    const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+    if (max_addr >> MAX_PHYSMEM_BITS) {
+        const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+        WARN(1,
+             "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+             (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+        return -E2BIG;
+    }
+
+    return 0;
 }

 /*
@@ -265,47 +306,47 @@
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-        unsigned long nr_pages, struct vmem_altmap *altmap,
-        bool want_memblock)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+        struct mhp_params *params)
 {
-    unsigned long i;
-    int err = 0;
-    int start_sec, end_sec;
+    const unsigned long end_pfn = pfn + nr_pages;
+    unsigned long cur_nr_pages;
+    int err;
+    struct vmem_altmap *altmap = params->altmap;

-    /* during initialize mem_map, align hot-added range to section */
-    start_sec = pfn_to_section_nr(phys_start_pfn);
-    end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+    if (WARN_ON_ONCE(!params->pgprot.pgprot))
+        return -EINVAL;
+
+    err = check_hotplug_memory_addressable(pfn, nr_pages);
+    if (err)
+        return err;

     if (altmap) {
         /*
          * Validate altmap is within bounds of the total request
          */
-        if (altmap->base_pfn != phys_start_pfn
+        if (altmap->base_pfn != pfn
                 || vmem_altmap_offset(altmap) > nr_pages) {
             pr_warn_once("memory add fail, invalid altmap\n");
-            err = -EINVAL;
-            goto out;
+            return -EINVAL;
         }
         altmap->alloc = 0;
     }

-    for (i = start_sec; i <= end_sec; i++) {
-        err = __add_section(nid, section_nr_to_pfn(i), altmap,
-                    want_memblock);
+    err = check_pfn_span(pfn, nr_pages, "add");
+    if (err)
+        return err;

-        /*
-         * EEXIST is finally dealt with by ioresource collision
-         * check. see add_memory() => register_memory_resource()
-         * Warning will be printed if there is collision.
-         */
-        if (err && (err != -EEXIST))
+    for (; pfn < end_pfn; pfn += cur_nr_pages) {
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages = min(end_pfn - pfn,
+                   SECTION_ALIGN_UP(pfn + 1) - pfn);
+        err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
+        if (err)
             break;
-        err = 0;
         cond_resched();
     }
     vmemmap_populate_print_last();
-out:
     return err;
 }

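
The new __add_pages() loop above walks the range in section-bounded chunks instead of whole sections. A self-contained sketch of that stepping logic (plain C; the kernel constants are replaced by local ones, assuming x86-64 defaults of 4 KiB pages and 128 MiB sections):

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL   /* 128 MiB of 4 KiB pages (x86-64 default) */
#define SECTION_ALIGN_UP(pfn) \
    (((pfn) + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1))

/* Mirrors the stepping in __add_pages()/__remove_pages(): each chunk stops at
 * the next section boundary, so partially populated first/last sections are
 * handled without touching neighbouring sections. */
static void walk_chunks(unsigned long pfn, unsigned long nr_pages)
{
    const unsigned long end_pfn = pfn + nr_pages;
    unsigned long cur_nr_pages;

    for (; pfn < end_pfn; pfn += cur_nr_pages) {
        unsigned long bound = SECTION_ALIGN_UP(pfn + 1);

        cur_nr_pages = (end_pfn - pfn < bound - pfn) ?
                   end_pfn - pfn : bound - pfn;
        printf("chunk: pfn %lu, %lu pages\n", pfn, cur_nr_pages);
    }
}

int main(void)
{
    /* Example: a range starting mid-section and crossing two boundaries. */
    walk_chunks(40000, 70000);
    return 0;
}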
@@ -314,14 +355,14 @@
                     unsigned long start_pfn,
                     unsigned long end_pfn)
 {
-    for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+    for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
         if (unlikely(!pfn_to_online_page(start_pfn)))
             continue;

         if (unlikely(pfn_to_nid(start_pfn) != nid))
             continue;

-        if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+        if (zone != page_zone(pfn_to_page(start_pfn)))
             continue;

         return start_pfn;
@@ -339,14 +380,14 @@

     /* pfn is the end pfn of a memory section. */
     pfn = end_pfn - 1;
-    for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+    for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
         if (unlikely(!pfn_to_online_page(pfn)))
             continue;

         if (unlikely(pfn_to_nid(pfn) != nid))
             continue;

-        if (zone && zone != page_zone(pfn_to_page(pfn)))
+        if (zone != page_zone(pfn_to_page(pfn)))
             continue;

         return pfn;
@@ -358,14 +399,11 @@
 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
                  unsigned long end_pfn)
 {
-    unsigned long zone_start_pfn = zone->zone_start_pfn;
-    unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
-    unsigned long zone_end_pfn = z;
     unsigned long pfn;
     int nid = zone_to_nid(zone);

     zone_span_writelock(zone);
-    if (zone_start_pfn == start_pfn) {
+    if (zone->zone_start_pfn == start_pfn) {
         /*
          * If the section is smallest section in the zone, it need
          * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -373,50 +411,30 @@
          * for shrinking zone.
          */
         pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-                        zone_end_pfn);
+                        zone_end_pfn(zone));
         if (pfn) {
+            zone->spanned_pages = zone_end_pfn(zone) - pfn;
             zone->zone_start_pfn = pfn;
-            zone->spanned_pages = zone_end_pfn - pfn;
+        } else {
+            zone->zone_start_pfn = 0;
+            zone->spanned_pages = 0;
         }
-    } else if (zone_end_pfn == end_pfn) {
+    } else if (zone_end_pfn(zone) == end_pfn) {
         /*
          * If the section is biggest section in the zone, it need
          * shrink zone->spanned_pages.
          * In this case, we find second biggest valid mem_section for
          * shrinking zone.
          */
-        pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+        pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
                            start_pfn);
         if (pfn)
-            zone->spanned_pages = pfn - zone_start_pfn + 1;
+            zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+        else {
+            zone->zone_start_pfn = 0;
+            zone->spanned_pages = 0;
+        }
     }
-
-    /*
-     * The section is not biggest or smallest mem_section in the zone, it
-     * only creates a hole in the zone. So in this case, we need not
-     * change the zone. But perhaps, the zone has only hole data. Thus
-     * it check the zone has only hole or not.
-     */
-    pfn = zone_start_pfn;
-    for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
-        if (unlikely(!pfn_to_online_page(pfn)))
-            continue;
-
-        if (page_zone(pfn_to_page(pfn)) != zone)
-            continue;
-
-        /* If the section is current section, it continues the loop */
-        if (start_pfn == pfn)
-            continue;
-
-        /* If we find valid section, we have nothing to do */
-        zone_span_writeunlock(zone);
-        return;
-    }
-
-    /* The zone has no valid section */
-    zone->zone_start_pfn = 0;
-    zone->spanned_pages = 0;
     zone_span_writeunlock(zone);
 }

@@ -453,8 +471,20 @@
                   unsigned long start_pfn,
                   unsigned long nr_pages)
 {
+    const unsigned long end_pfn = start_pfn + nr_pages;
     struct pglist_data *pgdat = zone->zone_pgdat;
-    unsigned long flags;
+    unsigned long pfn, cur_nr_pages, flags;
+
+    /* Poison struct pages because they are now uninitialized again. */
+    for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+        cond_resched();
+
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages =
+            min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+        page_init_poison(pfn_to_page(pfn),
+                 sizeof(struct page) * cur_nr_pages);
+    }

 #ifdef CONFIG_ZONE_DEVICE
     /*
@@ -476,24 +506,21 @@
     set_zone_contiguous(zone);
 }

-static void __remove_section(struct mem_section *ms, unsigned long map_offset,
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+                 unsigned long map_offset,
                  struct vmem_altmap *altmap)
 {
-    unsigned long start_pfn;
-    int scn_nr;
+    struct mem_section *ms = __pfn_to_section(pfn);

     if (WARN_ON_ONCE(!valid_section(ms)))
         return;

-    scn_nr = __section_nr(ms);
-    start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
-
-    sparse_remove_one_section(ms, map_offset, altmap);
+    sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
 }

 /**
  * __remove_pages() - remove sections of pages
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
  *
@@ -502,28 +529,24 @@
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-void __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages,
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
             struct vmem_altmap *altmap)
 {
-    unsigned long i;
+    const unsigned long end_pfn = pfn + nr_pages;
+    unsigned long cur_nr_pages;
     unsigned long map_offset = 0;
-    int sections_to_remove;

-    if (altmap)
-        map_offset = vmem_altmap_offset(altmap);
+    map_offset = vmem_altmap_offset(altmap);

-    /*
-     * We can only remove entire sections
-     */
-    BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
-    BUG_ON(nr_pages % PAGES_PER_SECTION);
+    if (check_pfn_span(pfn, nr_pages, "remove"))
+        return;

-    sections_to_remove = nr_pages / PAGES_PER_SECTION;
-    for (i = 0; i < sections_to_remove; i++) {
-        unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-
+    for (; pfn < end_pfn; pfn += cur_nr_pages) {
         cond_resched();
-        __remove_section(__pfn_to_section(pfn), map_offset, altmap);
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages = min(end_pfn - pfn,
+                   SECTION_ALIGN_UP(pfn + 1) - pfn);
+        __remove_section(pfn, cur_nr_pages, map_offset, altmap);
         map_offset = 0;
     }
 }
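
__remove_pages() now defers its granularity check to check_pfn_span() instead of the old BUG_ON()s: subsection granularity is accepted only with SPARSEMEM_VMEMMAP, otherwise whole sections. For a sense of scale, a small worked-arithmetic block assuming x86-64 defaults (4 KiB base pages, 2 MiB subsections, 128 MiB sections); the real values come from SUBSECTION_SHIFT and SECTION_SIZE_BITS of the architecture being built:

/* Illustrative x86-64 numbers only. */
#define EXAMPLE_PAGE_SIZE        (4UL << 10)
#define EXAMPLE_SUBSECTION_SIZE  (2UL << 20)
#define EXAMPLE_SECTION_SIZE     (128UL << 20)

_Static_assert(EXAMPLE_SUBSECTION_SIZE / EXAMPLE_PAGE_SIZE == 512,
           "2 MiB subsection = 512 base pages (min alignment with VMEMMAP)");
_Static_assert(EXAMPLE_SECTION_SIZE / EXAMPLE_PAGE_SIZE == 32768,
           "128 MiB section = 32768 base pages (min alignment otherwise)");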
@@ -566,48 +589,39 @@
 }
 EXPORT_SYMBOL_GPL(restore_online_page_callback);

-void __online_page_set_limits(struct page *page)
+void generic_online_page(struct page *page, unsigned int order)
 {
+    /*
+     * Freeing the page with debug_pagealloc enabled will try to unmap it,
+     * so we should map it first. This is better than introducing a special
+     * case in page freeing fast path.
+     */
+    debug_pagealloc_map_pages(page, 1 << order);
+    __free_pages_core(page, order);
+    totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+    if (PageHighMem(page))
+        totalhigh_pages_add(1UL << order);
+#endif
 }
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
+EXPORT_SYMBOL_GPL(generic_online_page);

-void __online_page_increment_counters(struct page *page)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 {
-    adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
+    const unsigned long end_pfn = start_pfn + nr_pages;
+    unsigned long pfn;

-void __online_page_free(struct page *page)
-{
-    __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
+    /*
+     * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+     * decide to not expose all pages to the buddy (e.g., expose them
+     * later). We account all pages as being online and belonging to this
+     * zone ("present").
+     */
+    for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+        (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);

-static void generic_online_page(struct page *page)
-{
-    __online_page_set_limits(page);
-    __online_page_increment_counters(page);
-    __online_page_free(page);
-}
-
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
-        void *arg)
-{
-    unsigned long i;
-    unsigned long onlined_pages = *(unsigned long *)arg;
-    struct page *page;
-
-    if (PageReserved(pfn_to_page(start_pfn)))
-        for (i = 0; i < nr_pages; i++) {
-            page = pfn_to_page(start_pfn + i);
-            (*online_page_callback)(page);
-            onlined_pages++;
-        }
-
-    online_mem_sections(start_pfn, start_pfn + nr_pages);
-
-    *(unsigned long *)arg = onlined_pages;
-    return 0;
+    /* mark all involved sections as online */
+    online_mem_sections(start_pfn, end_pfn);
 }

 /* check which state of node_states will be changed when online memory */
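
With the hunk above, generic_online_page() takes an order and is exported, so drivers that intercept onlining through the callback machinery at the top of this file must match the new signature. A hedged sketch of such a hook; the driver name, the deferral policy, and both helper functions are invented for illustration:

#include <linux/memory_hotplug.h>

/* Invented policy helpers, stubs only. */
static bool example_should_defer(struct page *page, unsigned int order)
{
    return false;   /* placeholder policy */
}

static void example_queue_for_later(struct page *page, unsigned int order)
{
    /* placeholder: a real driver would track the page and release it
     * later via generic_online_page(page, order) */
}

/* Hypothetical driver callback matching online_page_callback_t. */
static void example_online_page(struct page *page, unsigned int order)
{
    if (example_should_defer(page, order))
        example_queue_for_later(page, order);
    else
        generic_online_page(page, order);
}

static int example_init(void)
{
    /* Fails if some other driver already installed a callback. */
    return set_online_page_callback(&example_online_page);
}

static void example_exit(void)
{
    restore_online_page_callback(&example_online_page);
}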
@@ -615,62 +629,19 @@
                    struct zone *zone, struct memory_notify *arg)
 {
     int nid = zone_to_nid(zone);
-    enum zone_type zone_last = ZONE_NORMAL;

-    /*
-     * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
-     * contains nodes which have zones of 0...ZONE_NORMAL,
-     * set zone_last to ZONE_NORMAL.
-     *
-     * If we don't have HIGHMEM nor movable node,
-     * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
-     * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
-     */
-    if (N_MEMORY == N_NORMAL_MEMORY)
-        zone_last = ZONE_MOVABLE;
+    arg->status_change_nid = NUMA_NO_NODE;
+    arg->status_change_nid_normal = NUMA_NO_NODE;
+    arg->status_change_nid_high = NUMA_NO_NODE;

-    /*
-     * if the memory to be online is in a zone of 0...zone_last, and
-     * the zones of 0...zone_last don't have memory before online, we will
-     * need to set the node to node_states[N_NORMAL_MEMORY] after
-     * the memory is online.
-     */
-    if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
-        arg->status_change_nid_normal = nid;
-    else
-        arg->status_change_nid_normal = -1;
-
-#ifdef CONFIG_HIGHMEM
-    /*
-     * If we have movable node, node_states[N_HIGH_MEMORY]
-     * contains nodes which have zones of 0...ZONE_HIGHMEM,
-     * set zone_last to ZONE_HIGHMEM.
-     *
-     * If we don't have movable node, node_states[N_NORMAL_MEMORY]
-     * contains nodes which have zones of 0...ZONE_MOVABLE,
-     * set zone_last to ZONE_MOVABLE.
-     */
-    zone_last = ZONE_HIGHMEM;
-    if (N_MEMORY == N_HIGH_MEMORY)
-        zone_last = ZONE_MOVABLE;
-
-    if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
-        arg->status_change_nid_high = nid;
-    else
-        arg->status_change_nid_high = -1;
-#else
-    arg->status_change_nid_high = arg->status_change_nid_normal;
-#endif
-
-    /*
-     * if the node don't have memory befor online, we will need to
-     * set the node to node_states[N_MEMORY] after the memory
-     * is online.
-     */
     if (!node_state(nid, N_MEMORY))
         arg->status_change_nid = nid;
-    else
-        arg->status_change_nid = -1;
+    if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
+        arg->status_change_nid_normal = nid;
+#ifdef CONFIG_HIGHMEM
+    if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
+        arg->status_change_nid_high = nid;
+#endif
 }

 static void node_states_set_node(int node, struct memory_notify *arg)
@@ -681,7 +652,8 @@
     if (arg->status_change_nid_high >= 0)
         node_set_state(node, N_HIGH_MEMORY);

-    node_set_state(node, N_MEMORY);
+    if (arg->status_change_nid >= 0)
+        node_set_state(node, N_MEMORY);
 }

 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -704,23 +676,32 @@
     pgdat->node_start_pfn = start_pfn;

     pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}

+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
-                  unsigned long nr_pages, struct vmem_altmap *altmap)
+                  unsigned long nr_pages,
+                  struct vmem_altmap *altmap, int migratetype)
 {
     struct pglist_data *pgdat = zone->zone_pgdat;
     int nid = pgdat->node_id;
     unsigned long flags;
-
-    if (zone_is_empty(zone))
-        init_currently_empty_zone(zone, start_pfn, nr_pages);

     clear_zone_contiguous(zone);

     /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
     pgdat_resize_lock(pgdat, &flags);
     zone_span_writelock(zone);
+    if (zone_is_empty(zone))
+        init_currently_empty_zone(zone, start_pfn, nr_pages);
     resize_zone_range(zone, start_pfn, nr_pages);
     zone_span_writeunlock(zone);
     resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -732,8 +713,8 @@
      * expects the zone spans the pfn range. All the pages in the range
      * are reserved so nobody should be touching them so we should be safe
      */
-    memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
-             MEMINIT_HOTPLUG, altmap);
+    memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+             MEMINIT_HOTPLUG, altmap, migratetype);

     set_zone_contiguous(zone);
 }
@@ -795,43 +776,25 @@
     return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }

-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
-        unsigned long start_pfn, unsigned long nr_pages)
-{
-    struct zone *zone;
-
-    zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
-    move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
-    return zone;
-}
-
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+               int online_type, int nid)
 {
     unsigned long flags;
-    unsigned long onlined_pages = 0;
     struct zone *zone;
     int need_zonelists_rebuild = 0;
-    int nid;
     int ret;
     struct memory_notify arg;
-    struct memory_block *mem;
+
+    /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+    if (WARN_ON_ONCE(!nr_pages ||
+             !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+        return -EINVAL;

     mem_hotplug_begin();

-    /*
-     * We can't use pfn_to_nid() because nid might be stored in struct page
-     * which is not yet initialized. Instead, we find nid from memory block.
-     */
-    mem = find_memory_block(__pfn_to_section(pfn));
-    nid = mem->nid;
-    put_device(&mem->dev);
-
     /* associate pfn range with the zone */
-    zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+    zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+    move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);

     arg.start_pfn = pfn;
     arg.nr_pages = nr_pages;
@@ -843,6 +806,14 @@
         goto failed_addition;

     /*
+     * Fixup the number of isolated pageblocks before marking the sections
+     * onlining, such that undo_isolate_page_range() works correctly.
+     */
+    spin_lock_irqsave(&zone->lock, flags);
+    zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+    spin_unlock_irqrestore(&zone->lock, flags);
+
+    /*
      * If this zone is not populated, then it is not in zonelist.
      * This means the page allocator ignores this zone.
      * So, zonelist must be updated after online.
@@ -852,41 +823,37 @@
         setup_zone_pageset(zone);
     }

-    ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
-        online_pages_range);
-    if (ret) {
-        if (need_zonelists_rebuild)
-            zone_pcp_reset(zone);
-        goto failed_addition;
-    }
-
-    zone->present_pages += onlined_pages;
+    online_pages_range(pfn, nr_pages);
+    zone->present_pages += nr_pages;

     pgdat_resize_lock(zone->zone_pgdat, &flags);
-    zone->zone_pgdat->node_present_pages += onlined_pages;
+    zone->zone_pgdat->node_present_pages += nr_pages;
     pgdat_resize_unlock(zone->zone_pgdat, &flags);

-    if (onlined_pages) {
-        node_states_set_node(nid, &arg);
-        if (need_zonelists_rebuild)
-            build_all_zonelists(NULL);
-        else
-            zone_pcp_update(zone);
-    }
+    node_states_set_node(nid, &arg);
+    if (need_zonelists_rebuild)
+        build_all_zonelists(NULL);
+    zone_pcp_update(zone);
+
+    /* Basic onlining is complete, allow allocation of onlined pages. */
+    undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+    /*
+     * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+     * the tail of the freelist when undoing isolation). Shuffle the whole
+     * zone to make sure the just onlined pages are properly distributed
+     * across the whole freelist - to create an initial shuffle.
+     */
+    shuffle_zone(zone);

     init_per_zone_wmark_min();

-    if (onlined_pages) {
-        kswapd_run(nid);
-        kcompactd_run(nid);
-    }
-
-    vm_total_pages = nr_free_pagecache_pages();
+    kswapd_run(nid);
+    kcompactd_run(nid);

     writeback_set_ratelimit();

-    if (onlined_pages)
-        memory_notify(MEM_ONLINE, &arg);
+    memory_notify(MEM_ONLINE, &arg);
     mem_hotplug_done();
     return 0;

@@ -912,10 +879,9 @@
 }

 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
 {
     struct pglist_data *pgdat;
-    unsigned long start_pfn = PFN_DOWN(start);

     pgdat = NODE_DATA(nid);
     if (!pgdat) {
@@ -923,26 +889,33 @@
         if (!pgdat)
             return NULL;

+        pgdat->per_cpu_nodestats =
+            alloc_percpu(struct per_cpu_nodestat);
         arch_refresh_nodedata(nid, pgdat);
     } else {
+        int cpu;
         /*
-         * Reset the nr_zones, order and classzone_idx before reuse.
-         * Note that kswapd will init kswapd_classzone_idx properly
+         * Reset the nr_zones, order and highest_zoneidx before reuse.
+         * Note that kswapd will init kswapd_highest_zoneidx properly
          * when it starts in the near future.
         */
         pgdat->nr_zones = 0;
         pgdat->kswapd_order = 0;
-        pgdat->kswapd_classzone_idx = 0;
+        pgdat->kswapd_highest_zoneidx = 0;
+        for_each_online_cpu(cpu) {
+            struct per_cpu_nodestat *p;
+
+            p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+            memset(p, 0, sizeof(*p));
+        }
     }

     /* we can use NODE_DATA(nid) from here */
-
     pgdat->node_id = nid;
-    pgdat->node_start_pfn = start_pfn;
+    pgdat->node_start_pfn = 0;

     /* init node's zones as empty zones, we don't have any present pages.*/
     free_area_init_core_hotplug(nid);
-    pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

     /*
      * The node we allocated has no zone fallback lists. For avoiding
@@ -968,14 +941,12 @@
     arch_refresh_nodedata(nid, NULL);
     free_percpu(pgdat->per_cpu_nodestats);
     arch_free_nodedata(pgdat);
-    return;
 }


 /**
  * try_online_node - online a node if offlined
  * @nid: the node ID
- * @start: start addr of the node
  * @set_node_online: Whether we want to online the node
  * called by cpu_up() to online a node without onlined memory.
  *
@@ -984,7 +955,7 @@
  * 0 -> the node is already online
  * -ENOMEM -> the node could not be allocated
  */
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
 {
     pg_data_t *pgdat;
     int ret = 1;
@@ -992,7 +963,7 @@
     if (node_online(nid))
         return 0;

-    pgdat = hotadd_new_pgdat(nid, start);
+    pgdat = hotadd_new_pgdat(nid);
     if (!pgdat) {
         pr_err("Cannot online node %d due to NULL pgdat\n", nid);
         ret = -ENOMEM;
@@ -1016,23 +987,18 @@
     int ret;

     mem_hotplug_begin();
-    ret = __try_online_node(nid, 0, true);
+    ret = __try_online_node(nid, true);
     mem_hotplug_done();
     return ret;
 }

 static int check_hotplug_memory_range(u64 start, u64 size)
 {
-    unsigned long block_sz = memory_block_size_bytes();
-    u64 block_nr_pages = block_sz >> PAGE_SHIFT;
-    u64 nr_pages = size >> PAGE_SHIFT;
-    u64 start_pfn = PFN_DOWN(start);
-
     /* memory range must be block size aligned */
-    if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
-        !IS_ALIGNED(nr_pages, block_nr_pages)) {
+    if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+        !IS_ALIGNED(size, memory_block_size_bytes())) {
         pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
-               block_sz, start, size);
+               memory_block_size_bytes(), start, size);
         return -EINVAL;
     }

@@ -1041,6 +1007,7 @@

 static int online_memory_block(struct memory_block *mem, void *arg)
 {
+    mem->online_type = memhp_default_online_type;
     return device_online(&mem->dev);
 }

@@ -1050,8 +1017,9 @@
  *
  * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
  */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
+    struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
     u64 start, size;
     bool new_node = false;
     int ret;
@@ -1063,23 +1031,23 @@
     if (ret)
         return ret;

+    if (!node_possible(nid)) {
+        WARN(1, "node %d was absent from the node_possible_map\n", nid);
+        return -EINVAL;
+    }
+
     mem_hotplug_begin();

-    /*
-     * Add new range to memblock so that when hotadd_new_pgdat() is called
-     * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
-     * this new range and calculate total pages correctly. The range will
-     * be removed at hot-remove time.
-     */
-    memblock_add_node(start, size, nid);
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_add_node(start, size, nid);

-    ret = __try_online_node(nid, start, false);
+    ret = __try_online_node(nid, false);
     if (ret < 0)
         goto error;
     new_node = ret;

     /* call arch's memory hotadd */
-    ret = arch_add_memory(nid, start, size, NULL, true);
+    ret = arch_add_memory(nid, start, size, &params);
     if (ret < 0)
         goto error;

@@ -1102,143 +1070,170 @@
     }

     /* link memory sections under this node.*/
-    ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
-                MEMINIT_HOTPLUG);
-    BUG_ON(ret);
+    link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+              MEMINIT_HOTPLUG);

     /* create new memmap entry */
-    firmware_map_add_hotplug(start, start + size, "System RAM");
+    if (!strcmp(res->name, "System RAM"))
+        firmware_map_add_hotplug(start, start + size, "System RAM");

     /* device_online() will take the lock when calling online_pages() */
     mem_hotplug_done();

+    /*
+     * In case we're allowed to merge the resource, flag it and trigger
+     * merging now that adding succeeded.
+     */
+    if (mhp_flags & MEMHP_MERGE_RESOURCE)
+        merge_system_ram_resource(res);
+
     /* online pages if requested */
-    if (online)
-        walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
-                  NULL, online_memory_block);
+    if (memhp_default_online_type != MMOP_OFFLINE)
+        walk_memory_blocks(start, size, NULL, online_memory_block);

     return ret;
 error:
     /* rollback pgdat allocation and others */
     if (new_node)
         rollback_node_hotadd(nid);
-    memblock_remove(start, size);
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_remove(start, size);
     mem_hotplug_done();
     return ret;
 }

 /* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 {
     struct resource *res;
     int ret;

-    res = register_memory_resource(start, size);
+    res = register_memory_resource(start, size, "System RAM");
     if (IS_ERR(res))
         return PTR_ERR(res);

-    ret = add_memory_resource(nid, res, memhp_auto_online);
+    ret = add_memory_resource(nid, res, mhp_flags);
     if (ret < 0)
         release_memory_resource(res);
     return ret;
 }

-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 {
     int rc;

     lock_device_hotplug();
-    rc = __add_memory(nid, start, size);
+    rc = __add_memory(nid, start, size, mhp_flags);
     unlock_device_hotplug();

     return rc;
 }
 EXPORT_SYMBOL_GPL(add_memory);

+int add_memory_subsection(int nid, u64 start, u64 size)
+{
+    struct mhp_params params = { .pgprot = PAGE_KERNEL };
+    struct resource *res;
+    int ret;
+
+    if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
+        !IS_ALIGNED(size, SUBSECTION_SIZE)) {
+        pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
+               __func__, start, size);
+        return -EINVAL;
+    }
+
+    res = register_memory_resource(start, size, "System RAM");
+    if (IS_ERR(res))
+        return PTR_ERR(res);
+
+    mem_hotplug_begin();
+
+    nid = memory_add_physaddr_to_nid(start);
+
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_add_node(start, size, nid);
+
+    ret = arch_add_memory(nid, start, size, &params);
+    if (ret) {
+        pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n",
+               __func__, start, size);
+        goto err_add_memory;
+    }
+    mem_hotplug_done();
+
+    return ret;
+
+err_add_memory:
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_remove(start, size);
+
+    mem_hotplug_done();
+
+    release_memory_resource(res);
+    return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory_subsection);
+
+/*
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ *   used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ *   can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
+ */
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+                  const char *resource_name, mhp_t mhp_flags)
+{
+    struct resource *res;
+    int rc;
+
+    if (!resource_name ||
+        strstr(resource_name, "System RAM (") != resource_name ||
+        resource_name[strlen(resource_name) - 1] != ')')
+        return -EINVAL;
+
+    lock_device_hotplug();
+
+    res = register_memory_resource(start, size, resource_name);
+    if (IS_ERR(res)) {
+        rc = PTR_ERR(res);
+        goto out_unlock;
+    }
+
+    rc = add_memory_resource(nid, res, mhp_flags);
+    if (rc < 0)
+        release_memory_resource(res);
+
+out_unlock:
+    unlock_device_hotplug();
+    return rc;
+}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
  */
-static inline int pageblock_free(struct page *page)
-{
-    return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
-
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
-    struct page *page = pfn_to_page(pfn);
-
-    /* Ensure the starting page is pageblock-aligned */
-    BUG_ON(pfn & (pageblock_nr_pages - 1));
-
-    /* If the entire pageblock is free, move to the end of free page */
-    if (pageblock_free(page)) {
-        int order;
-        /* be careful. we don't have locks, page_order can be changed.*/
-        order = page_order(page);
-        if ((order < MAX_ORDER) && (order >= pageblock_order))
-            return pfn + (1 << order);
-    }
-
-    return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
-    struct page *page = pfn_to_page(pfn);
-    struct zone *zone;
-
-    /*
-     * We have to be careful here because we are iterating over memory
-     * sections which are not zone aware so we might end up outside of
-     * the zone but still within the section.
-     * We have to take care about the node as well. If the node is offline
-     * its NODE_DATA will be NULL - see page_zone.
-     */
-    if (!node_online(page_to_nid(page)))
-        return false;
-
-    zone = page_zone(page);
-    pfn = page_to_pfn(page);
-    if (!zone_spans_pfn(zone, pfn))
-        return false;
-
-    return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
-    unsigned long end_pfn, pfn;
-
-    end_pfn = min(start_pfn + nr_pages,
-              zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
-    /* Check the starting page of each pageblock within the range */
-    for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
-        if (!is_pageblock_removable_nolock(pfn))
-            return false;
-        cond_resched();
-    }
-
-    /* All pageblocks in the memory block are likely to be hot-removable */
-    return true;
-}
-
-/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
- */
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
-             unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+                  unsigned long end_pfn)
 {
     unsigned long pfn, sec_end_pfn;
-    unsigned long start, end;
     struct zone *zone = NULL;
     struct page *page;
     int i;
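
The hunk above also adds add_memory_driver_managed(), which enforces the "System RAM ($DRIVER)" naming rule from its comment. A hedged usage sketch; the node id, range and driver name are invented, and MHP_NONE is assumed to be the "no special flags" value from the same mhp_t series (upstream callers of this interface include, e.g., dax/kmem and virtio-mem):

/* Sketch: hand a driver-detected range to the buddy as driver-managed RAM. */
static int example_add_range(int nid, u64 start, u64 size)
{
    /*
     * The name must be exactly "System RAM (" + driver + ")" or the call is
     * rejected with -EINVAL; the range is also kept out of
     * /sys/firmware/memmap so kexec will not treat it as boot memory.
     */
    return add_memory_driver_managed(nid, start, size,
                     "System RAM (example_driver)",
                     MHP_NONE);
}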
@@ -1259,33 +1254,30 @@
                 continue;
             /* Check if we got outside of the zone */
             if (zone && !zone_spans_pfn(zone, pfn + i))
-                return 0;
+                return NULL;
             page = pfn_to_page(pfn + i);
             if (zone && page_zone(page) != zone)
-                return 0;
-            if (!zone)
-                start = pfn + i;
+                return NULL;
             zone = page_zone(page);
-            end = pfn + MAX_ORDER_NR_PAGES;
         }
     }

-    if (zone) {
-        *valid_start = start;
-        *valid_end = min(end, end_pfn);
-        return 1;
-    } else {
-        return 0;
-    }
+    return zone;
 }

 /*
  * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). We scan pfn because it's much
- * easier than scanning over linked list. This function returns the pfn
- * of the first found movable page if it's found, otherwise 0.
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ *  0 in case a movable page is found and movable_pfn was updated.
+ *  -ENOENT in case no movable page was found.
+ *  -EBUSY in case a definitely unmovable page was found.
  */
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static int scan_movable_pages(unsigned long start, unsigned long end,
+                  unsigned long *movable_pfn)
 {
     unsigned long pfn;

@@ -1297,68 +1289,55 @@
             continue;
         page = pfn_to_page(pfn);
         if (PageLRU(page))
-            return pfn;
+            goto found;
         if (__PageMovable(page))
-            return pfn;
+            goto found;
+
+        /*
+         * PageOffline() pages that are not marked __PageMovable() and
+         * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+         * definitely unmovable. If their reference count would be 0,
+         * they could at least be skipped when offlining memory.
+         */
+        if (PageOffline(page) && page_count(page))
+            return -EBUSY;

         if (!PageHuge(page))
             continue;
         head = compound_head(page);
-        if (hugepage_migration_supported(page_hstate(head)) &&
-            page_huge_active(head))
-            return pfn;
-        skip = (1 << compound_order(head)) - (page - head);
+        if (page_huge_active(head))
+            goto found;
+        skip = compound_nr(head) - (page - head);
         pfn += skip - 1;
     }
+    return -ENOENT;
+found:
+    *movable_pfn = pfn;
     return 0;
 }

-static struct page *new_node_page(struct page *page, unsigned long private)
-{
-    int nid = page_to_nid(page);
-    nodemask_t nmask = node_states[N_MEMORY];
-
-    /*
-     * try to allocate from a different node but reuse this node if there
-     * are no other online nodes to be used (e.g. we are offlining a part
-     * of the only existing node)
-     */
-    node_clear(nid, nmask);
-    if (nodes_empty(nmask))
-        node_set(nid, nmask);
-
-    return new_page_nodemask(page, nid, &nmask);
-}
-
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
     unsigned long pfn;
-    struct page *page;
-    int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
-    int not_managed = 0;
+    struct page *page, *head;
     int ret = 0;
     LIST_HEAD(source);
+    static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+                      DEFAULT_RATELIMIT_BURST);

-    for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
         if (!pfn_valid(pfn))
             continue;
         page = pfn_to_page(pfn);
+        head = compound_head(page);

         if (PageHuge(page)) {
-            struct page *head = compound_head(page);
-            pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
-            if (compound_order(head) > PFN_SECTION_SHIFT) {
-                ret = -EBUSY;
-                break;
-            }
-            if (isolate_huge_page(page, &source))
-                move_pages -= 1 << compound_order(head);
+            pfn = page_to_pfn(head) + compound_nr(head) - 1;
+            isolate_hugetlb(head, &source);
             continue;
         } else if (PageTransHuge(page))
-            pfn = page_to_pfn(compound_head(page))
-                + hpage_nr_pages(page) - 1;
+            pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;

         /*
          * HWPoison pages have elevated reference counts so the migration would
@@ -1371,7 +1350,7 @@
             if (WARN_ON(PageLRU(page)))
                 isolate_lru_page(page);
             if (page_mapped(page))
-                try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+                try_to_unmap(page, TTU_IGNORE_MLOCK);
             continue;
         }

@@ -1386,98 +1365,60 @@
         else
             ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
         if (!ret) { /* Success */
-            put_page(page);
             list_add_tail(&page->lru, &source);
-            move_pages--;
             if (!__PageMovable(page))
                 inc_node_page_state(page, NR_ISOLATED_ANON +
-                            page_is_file_cache(page));
+                            page_is_file_lru(page));

         } else {
-#ifdef CONFIG_DEBUG_VM
-            pr_alert("failed to isolate pfn %lx\n", pfn);
-            dump_page(page, "isolation failed");
-#endif
-            put_page(page);
-            /* Because we don't have big zone->lock. we should
-               check this again here. */
-            if (page_count(page)) {
-                not_managed++;
-                ret = -EBUSY;
-                break;
+            if (__ratelimit(&migrate_rs)) {
+                pr_warn("failed to isolate pfn %lx\n", pfn);
+                dump_page(page, "isolation failed");
             }
         }
+        put_page(page);
     }
     if (!list_empty(&source)) {
-        if (not_managed) {
+        nodemask_t nmask = node_states[N_MEMORY];
+        struct migration_target_control mtc = {
+            .nmask = &nmask,
+            .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+        };
+
+        /*
+         * We have checked that migration range is on a single zone so
+         * we can use the nid of the first page to all the others.
+         */
+        mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+        /*
+         * try to allocate from a different node but reuse this node
+         * if there are no other online nodes to be used (e.g. we are
+         * offlining a part of the only existing node)
+         */
+        node_clear(mtc.nid, nmask);
+        if (nodes_empty(nmask))
+            node_set(mtc.nid, nmask);
+        ret = migrate_pages(&source, alloc_migration_target, NULL,
+                (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+        if (ret) {
+            list_for_each_entry(page, &source, lru) {
+                if (__ratelimit(&migrate_rs)) {
+                    pr_warn("migrating pfn %lx failed ret:%d\n",
+                        page_to_pfn(page), ret);
+                    dump_page(page, "migration failure");
+                }
+            }
             putback_movable_pages(&source);
-            goto out;
         }
-
-        /* Allocate a new page from the nearest neighbor node */
-        ret = migrate_pages(&source, new_node_page, NULL, 0,
-                    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
-        if (ret)
-            putback_movable_pages(&source);
     }
-out:
+
     return ret;
-}
-
-/*
- * remove from free_area[] and mark all as Reserved.
- */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
-            void *data)
-{
-    __offline_isolated_pages(start, start + nr_pages);
-    return 0;
-}
-
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
-    walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
-                offline_isolated_pages_cb);
-}
-
-/*
- * Check all pages in range, recoreded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
-            void *data)
-{
-    int ret;
-    long offlined = *(long *)data;
-    ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
-    offlined = nr_pages;
-    if (!ret)
-        *(long *)data += offlined;
-    return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
-    long offlined = 0;
-    int ret;
-
-    ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
-                check_pages_isolated_cb);
-    if (ret < 0)
-        offlined = (long)ret;
-    return offlined;
 }

 static int __init cmdline_parse_movable_node(char *p)
 {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
     movable_node_enabled = true;
-#else
-    pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
     return 0;
 }
 early_param("movable_node", cmdline_parse_movable_node);
....@@ -1488,75 +1429,53 @@
14881429 {
14891430 struct pglist_data *pgdat = zone->zone_pgdat;
14901431 unsigned long present_pages = 0;
1491
- enum zone_type zt, zone_last = ZONE_NORMAL;
1432
+ enum zone_type zt;
1433
+
1434
+ arg->status_change_nid = NUMA_NO_NODE;
1435
+ arg->status_change_nid_normal = NUMA_NO_NODE;
1436
+ arg->status_change_nid_high = NUMA_NO_NODE;
14921437
14931438 /*
1494
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1495
- * contains nodes which have zones of 0...ZONE_NORMAL,
1496
- * set zone_last to ZONE_NORMAL.
1497
- *
1498
- * If we don't have HIGHMEM nor movable node,
1499
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1500
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1439
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
1440
+ * If the memory to be offline is within the range
1441
+ * [0..ZONE_NORMAL], and it is the last present memory there,
1442
+ * the zones in that range will become empty after the offlining,
1443
+ * thus we can determine that we need to clear the node from
1444
+ * node_states[N_NORMAL_MEMORY].
15011445 */
1502
- if (N_MEMORY == N_NORMAL_MEMORY)
1503
- zone_last = ZONE_MOVABLE;
1504
-
1505
- /*
1506
- * check whether node_states[N_NORMAL_MEMORY] will be changed.
1507
- * If the memory to be offline is in a zone of 0...zone_last,
1508
- * and it is the last present memory, 0...zone_last will
1509
- * become empty after offline , thus we can determind we will
1510
- * need to clear the node from node_states[N_NORMAL_MEMORY].
1511
- */
1512
- for (zt = 0; zt <= zone_last; zt++)
1446
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
15131447 present_pages += pgdat->node_zones[zt].present_pages;
1514
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1448
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
15151449 arg->status_change_nid_normal = zone_to_nid(zone);
1516
- else
1517
- arg->status_change_nid_normal = -1;
15181450
15191451 #ifdef CONFIG_HIGHMEM
15201452 /*
1521
- * If we have movable node, node_states[N_HIGH_MEMORY]
1522
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
1523
- * set zone_last to ZONE_HIGHMEM.
1524
- *
1525
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1526
- * contains nodes which have zones of 0...ZONE_MOVABLE,
1527
- * set zone_last to ZONE_MOVABLE.
1453
+ * node_states[N_HIGH_MEMORY] contains nodes which
1454
+ * have normal memory or high memory.
1455
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
1456
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and
1457
+ * we determine that the zones in that range become empty,
1458
+ * we need to clear the node for N_HIGH_MEMORY.
15281459 */
1529
- zone_last = ZONE_HIGHMEM;
1530
- if (N_MEMORY == N_HIGH_MEMORY)
1531
- zone_last = ZONE_MOVABLE;
1532
-
1533
- for (; zt <= zone_last; zt++)
1534
- present_pages += pgdat->node_zones[zt].present_pages;
1535
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1460
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1461
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
15361462 arg->status_change_nid_high = zone_to_nid(zone);
1537
- else
1538
- arg->status_change_nid_high = -1;
1539
-#else
1540
- arg->status_change_nid_high = arg->status_change_nid_normal;
15411463 #endif
15421464
15431465 /*
1544
- * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1466
+ * We have accounted the pages from [0..ZONE_NORMAL), and
1467
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
1468
+ * as well.
1469
+ * Here we count the possible pages from ZONE_MOVABLE.
1470
+ * If after having accounted all the pages, we see that the nr_pages
1471
+ * to be offlined is over or equal to the accounted pages,
1472
+ * we know that the node will become empty, and so, we can clear
1473
+ * it for N_MEMORY as well.
15451474 */
1546
- zone_last = ZONE_MOVABLE;
1475
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
15471476
1548
- /*
1549
- * check whether node_states[N_HIGH_MEMORY] will be changed
1550
- * If we try to offline the last present @nr_pages from the node,
1551
- * we can determind we will need to clear the node from
1552
- * node_states[N_HIGH_MEMORY].
1553
- */
1554
- for (; zt <= zone_last; zt++)
1555
- present_pages += pgdat->node_zones[zt].present_pages;
15561477 if (nr_pages >= present_pages)
15571478 arg->status_change_nid = zone_to_nid(zone);
1558
- else
1559
- arg->status_change_nid = -1;
15601479 }
15611480
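The rewritten accounting above boils down to one cumulative check per node state: sum present_pages over a zone range of the node and see whether the pages being offlined are the last ones there. A minimal sketch of that check, using a hypothetical helper name that is not part of this file (the real code additionally requires the offlined zone itself to fall inside the range):

static bool node_range_becomes_empty(pg_data_t *pgdat, enum zone_type max_zone,
				     unsigned long nr_pages)
{
	unsigned long present_pages = 0;
	enum zone_type zt;

	/* Pages still present in zones 0..max_zone of this node. */
	for (zt = 0; zt <= max_zone; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/* Offlining nr_pages would leave the whole range empty. */
	return nr_pages >= present_pages;
}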
15621481 static void node_states_clear_node(int node, struct memory_notify *arg)
....@@ -1564,53 +1483,76 @@
15641483 if (arg->status_change_nid_normal >= 0)
15651484 node_clear_state(node, N_NORMAL_MEMORY);
15661485
1567
- if ((N_MEMORY != N_NORMAL_MEMORY) &&
1568
- (arg->status_change_nid_high >= 0))
1486
+ if (arg->status_change_nid_high >= 0)
15691487 node_clear_state(node, N_HIGH_MEMORY);
15701488
1571
- if ((N_MEMORY != N_HIGH_MEMORY) &&
1572
- (arg->status_change_nid >= 0))
1489
+ if (arg->status_change_nid >= 0)
15731490 node_clear_state(node, N_MEMORY);
15741491 }
15751492
1576
-static int __ref __offline_pages(unsigned long start_pfn,
1577
- unsigned long end_pfn)
1493
+static int count_system_ram_pages_cb(unsigned long start_pfn,
1494
+ unsigned long nr_pages, void *data)
15781495 {
1579
- unsigned long pfn, nr_pages;
1580
- long offlined_pages;
1581
- int ret, node;
1496
+ unsigned long *nr_system_ram_pages = data;
1497
+
1498
+ *nr_system_ram_pages += nr_pages;
1499
+ return 0;
1500
+}
1501
+
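count_system_ram_pages_cb() simply accumulates the lengths of the walked System RAM ranges; offline_pages() below compares that sum against nr_pages to reject ranges containing holes. A hypothetical wrapper (not in the file) showing the same pattern:

/* Sketch only: true iff every pfn in the range is backed by System RAM. */
static bool range_is_hole_free(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long present = 0;

	walk_system_ram_range(start_pfn, nr_pages, &present,
			      count_system_ram_pages_cb);
	return present == nr_pages;
}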
1502
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1503
+{
1504
+ const unsigned long end_pfn = start_pfn + nr_pages;
1505
+ unsigned long pfn, system_ram_pages = 0;
15821506 unsigned long flags;
1583
- unsigned long valid_start, valid_end;
15841507 struct zone *zone;
15851508 struct memory_notify arg;
1509
+ int ret, node;
1510
+ char *reason;
15861511
1587
- /* at least, alignment against pageblock is necessary */
1588
- if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1589
- return -EINVAL;
1590
- if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1512
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
1513
+ if (WARN_ON_ONCE(!nr_pages ||
1514
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
15911515 return -EINVAL;
15921516
15931517 mem_hotplug_begin();
15941518
1519
+ /*
1520
+ * Don't allow to offline memory blocks that contain holes.
1521
+ * Consequently, memory blocks with holes can never get onlined
1522
+ * via the hotplug path - online_pages() - as hotplugged memory has
1523
+ * no holes. This way, we e.g., don't have to worry about marking
1524
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
1525
+ * avoid using walk_system_ram_range() later.
1526
+ */
1527
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
1528
+ count_system_ram_pages_cb);
1529
+ if (system_ram_pages != nr_pages) {
1530
+ ret = -EINVAL;
1531
+ reason = "memory holes";
1532
+ goto failed_removal;
1533
+ }
1534
+
15951535 /* This makes hotplug much easier...and readable.
15961536 we assume this for now. .*/
1597
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
1598
- &valid_end)) {
1599
- mem_hotplug_done();
1600
- return -EINVAL;
1537
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
1538
+ if (!zone) {
1539
+ ret = -EINVAL;
1540
+ reason = "multizone range";
1541
+ goto failed_removal;
16011542 }
1602
-
1603
- zone = page_zone(pfn_to_page(valid_start));
16041543 node = zone_to_nid(zone);
1605
- nr_pages = end_pfn - start_pfn;
16061544
1545
+ lru_cache_disable();
16071546 /* set above range as isolated */
16081547 ret = start_isolate_page_range(start_pfn, end_pfn,
1609
- MIGRATE_MOVABLE, true);
1548
+ MIGRATE_MOVABLE,
1549
+ MEMORY_OFFLINE | REPORT_FAILURE, NULL);
16101550 if (ret) {
1611
- mem_hotplug_done();
1612
- return ret;
1551
+ reason = "failure to isolate range";
1552
+ goto failed_removal_lru_cache_disabled;
16131553 }
1554
+
1555
+ drain_all_pages(zone);
16141556
16151557 arg.start_pfn = start_pfn;
16161558 arg.nr_pages = nr_pages;
....@@ -1618,49 +1560,84 @@
16181560
16191561 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
16201562 ret = notifier_to_errno(ret);
1621
- if (ret)
1622
- goto failed_removal;
1623
-
1624
- pfn = start_pfn;
1625
-repeat:
1626
- /* start memory hot removal */
1627
- ret = -EINTR;
1628
- if (signal_pending(current))
1629
- goto failed_removal;
1630
-
1631
- cond_resched();
1632
- lru_add_drain_all();
1633
- drain_all_pages(zone);
1634
-
1635
- pfn = scan_movable_pages(start_pfn, end_pfn);
1636
- if (pfn) { /* We have movable pages */
1637
- ret = do_migrate_range(pfn, end_pfn);
1638
- goto repeat;
1563
+ if (ret) {
1564
+ reason = "notifier failure";
1565
+ goto failed_removal_isolated;
16391566 }
16401567
1568
+ do {
1569
+ pfn = start_pfn;
1570
+ do {
1571
+ if (signal_pending(current)) {
1572
+ ret = -EINTR;
1573
+ reason = "signal backoff";
1574
+ goto failed_removal_isolated;
1575
+ }
1576
+
1577
+ cond_resched();
1578
+
1579
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
1580
+ if (!ret) {
1581
+ /*
1582
+ * TODO: fatal migration failures should bail
1583
+ * out
1584
+ */
1585
+ do_migrate_range(pfn, end_pfn);
1586
+ }
1587
+ } while (!ret);
1588
+
1589
+ if (ret != -ENOENT) {
1590
+ reason = "unmovable page";
1591
+ goto failed_removal_isolated;
1592
+ }
1593
+
1594
+ /*
1595
+ * Dissolve free hugepages in the memory block before actually
1596
+ * offlining, in order to keep hugetlbfs's object
1597
+ * counting consistent.
1598
+ */
1599
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1600
+ if (ret) {
1601
+ reason = "failure to dissolve huge pages";
1602
+ goto failed_removal_isolated;
1603
+ }
1604
+
1605
+ /*
1606
+ * per-cpu pages are drained after start_isolate_page_range, but
1607
+ * if there are still pages that are not free, make sure that we
1608
+ * drain again, because when we isolated range we might have
1609
+ * raced with another thread that was adding pages to pcp list.
1610
+ *
1611
+ * Forward progress should be still guaranteed because
1612
+ * pages on the pcp list can only belong to MOVABLE_ZONE
1613
+ * because has_unmovable_pages explicitly checks for
1614
+ * PageBuddy on freed pages on other zones.
1615
+ */
1616
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL);
1617
+ if (ret)
1618
+ drain_all_pages(zone);
1619
+ } while (ret);
1620
+
1621
+ /* Mark all sections offline and remove free pages from the buddy. */
1622
+ __offline_isolated_pages(start_pfn, end_pfn);
1623
+ pr_info("Offlined Pages %ld\n", nr_pages);
1624
+
16411625 /*
1642
- * dissolve free hugepages in the memory block before doing offlining
1643
- * actually in order to make hugetlbfs's object counting consistent.
1626
+ * The memory sections are marked offline, and the pageblock flags
1627
+ * effectively stale; nobody should be touching them. Fixup the number
1628
+ * of isolated pageblocks, memory onlining will properly revert this.
16441629 */
1645
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1646
- if (ret)
1647
- goto failed_removal;
1648
- /* check again */
1649
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1650
- if (offlined_pages < 0)
1651
- goto repeat;
1652
- pr_info("Offlined Pages %ld\n", offlined_pages);
1653
- /* Ok, all of our target is isolated.
1654
- We cannot do rollback at this point. */
1655
- offline_isolated_pages(start_pfn, end_pfn);
1656
- /* reset pagetype flags and makes migrate type to be MOVABLE */
1657
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1630
+ spin_lock_irqsave(&zone->lock, flags);
1631
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
1632
+ spin_unlock_irqrestore(&zone->lock, flags);
1633
+
1634
+ lru_cache_enable();
16581635 /* removal success */
1659
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1660
- zone->present_pages -= offlined_pages;
1636
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
1637
+ zone->present_pages -= nr_pages;
16611638
16621639 pgdat_resize_lock(zone->zone_pgdat, &flags);
1663
- zone->zone_pgdat->node_present_pages -= offlined_pages;
1640
+ zone->zone_pgdat->node_present_pages -= nr_pages;
16641641 pgdat_resize_unlock(zone->zone_pgdat, &flags);
16651642
16661643 init_per_zone_wmark_min();
....@@ -1677,7 +1654,6 @@
16771654 kcompactd_stop(node);
16781655 }
16791656
1680
- vm_total_pages = nr_free_pagecache_pages();
16811657 writeback_set_ratelimit();
16821658
16831659 memory_notify(MEM_OFFLINE, &arg);
....@@ -1685,73 +1661,21 @@
16851661 mem_hotplug_done();
16861662 return 0;
16871663
1688
-failed_removal:
1689
- pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1690
- (unsigned long long) start_pfn << PAGE_SHIFT,
1691
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1692
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
1693
- /* pushback to free area */
1664
+failed_removal_isolated:
16941665 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1666
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
1667
+failed_removal_lru_cache_disabled:
1668
+ lru_cache_enable();
1669
+failed_removal:
1670
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
1671
+ (unsigned long long) start_pfn << PAGE_SHIFT,
1672
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
1673
+ reason);
1674
+ /* pushback to free area */
16951675 mem_hotplug_done();
16961676 return ret;
16971677 }
16981678
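With the old __offline_pages() wrapper removed, offline_pages() takes (start_pfn, nr_pages) directly and only accepts non-empty, fully section-aligned ranges. An illustrative caller with a hypothetical name; in-tree the real caller is the memory block device layer, which already guarantees this alignment:

/* Sketch only: offline one section worth of memory. */
static int example_offline_section(unsigned long start_pfn)
{
	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
		return -EINVAL;

	/* 0 on success, negative errno (e.g. -EINTR, -EINVAL) on failure. */
	return offline_pages(start_pfn, PAGES_PER_SECTION);
}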
1699
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1700
-{
1701
- return __offline_pages(start_pfn, start_pfn + nr_pages);
1702
-}
1703
-#endif /* CONFIG_MEMORY_HOTREMOVE */
1704
-
1705
-/**
1706
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1707
- * @start_pfn: start pfn of the memory range
1708
- * @end_pfn: end pfn of the memory range
1709
- * @arg: argument passed to func
1710
- * @func: callback for each memory section walked
1711
- *
1712
- * This function walks through all present mem sections in range
1713
- * [start_pfn, end_pfn) and call func on each mem section.
1714
- *
1715
- * Returns the return value of func.
1716
- */
1717
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1718
- void *arg, int (*func)(struct memory_block *, void *))
1719
-{
1720
- struct memory_block *mem = NULL;
1721
- struct mem_section *section;
1722
- unsigned long pfn, section_nr;
1723
- int ret;
1724
-
1725
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1726
- section_nr = pfn_to_section_nr(pfn);
1727
- if (!present_section_nr(section_nr))
1728
- continue;
1729
-
1730
- section = __nr_to_section(section_nr);
1731
- /* same memblock? */
1732
- if (mem)
1733
- if ((section_nr >= mem->start_section_nr) &&
1734
- (section_nr <= mem->end_section_nr))
1735
- continue;
1736
-
1737
- mem = find_memory_block_hinted(section, mem);
1738
- if (!mem)
1739
- continue;
1740
-
1741
- ret = func(mem, arg);
1742
- if (ret) {
1743
- kobject_put(&mem->dev.kobj);
1744
- return ret;
1745
- }
1746
- }
1747
-
1748
- if (mem)
1749
- kobject_put(&mem->dev.kobj);
1750
-
1751
- return 0;
1752
-}
1753
-
1754
-#ifdef CONFIG_MEMORY_HOTREMOVE
17551679 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
17561680 {
17571681 int ret = !is_memblock_offlined(mem);
....@@ -1760,12 +1684,13 @@
17601684 phys_addr_t beginpa, endpa;
17611685
17621686 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1763
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1687
+ endpa = beginpa + memory_block_size_bytes() - 1;
17641688 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
17651689 &beginpa, &endpa);
1766
- }
17671690
1768
- return ret;
1691
+ return -EBUSY;
1692
+ }
1693
+ return 0;
17691694 }
17701695
17711696 static int check_cpu_on_node(pg_data_t *pgdat)
....@@ -1781,34 +1706,6 @@
17811706 return -EBUSY;
17821707 }
17831708
1784
- return 0;
1785
-}
1786
-
1787
-static void unmap_cpu_on_node(pg_data_t *pgdat)
1788
-{
1789
-#ifdef CONFIG_ACPI_NUMA
1790
- int cpu;
1791
-
1792
- for_each_possible_cpu(cpu)
1793
- if (cpu_to_node(cpu) == pgdat->node_id)
1794
- numa_clear_node(cpu);
1795
-#endif
1796
-}
1797
-
1798
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1799
-{
1800
- int ret;
1801
-
1802
- ret = check_cpu_on_node(pgdat);
1803
- if (ret)
1804
- return ret;
1805
-
1806
- /*
1807
- * the node will be offlined when we come here, so we can clear
1808
- * the cpu_to_node() now.
1809
- */
1810
-
1811
- unmap_cpu_on_node(pgdat);
18121709 return 0;
18131710 }
18141711
....@@ -1855,7 +1752,7 @@
18551752 if (rc)
18561753 return;
18571754
1858
- if (check_and_unmap_cpu_on_node(pgdat))
1755
+ if (check_cpu_on_node(pgdat))
18591756 return;
18601757
18611758 /*
....@@ -1867,24 +1764,45 @@
18671764 }
18681765 EXPORT_SYMBOL(try_offline_node);
18691766
1870
-static void __release_memory_resource(resource_size_t start,
1871
- resource_size_t size)
1767
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
18721768 {
1873
- int ret;
1769
+ int rc = 0;
1770
+
1771
+ BUG_ON(check_hotplug_memory_range(start, size));
18741772
18751773 /*
1876
- * When removing memory in the same granularity as it was added,
1877
- * this function never fails. It might only fail if resources
1878
- * have to be adjusted or split. We'll ignore the error, as
1879
- * removing of memory cannot fail.
1774
+ * All memory blocks must be offlined before removing memory. Check
1775
+ * whether all memory blocks in question are offline and return error
1776
+ * if this is not the case.
18801777 */
1881
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
1882
- if (ret) {
1883
- resource_size_t endres = start + size - 1;
1778
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
1779
+ if (rc)
1780
+ return rc;
18841781
1885
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
1886
- &start, &endres, ret);
1782
+ /* remove memmap entry */
1783
+ firmware_map_remove(start, start + size, "System RAM");
1784
+
1785
+ /*
1786
+ * Memory block device removal under the device_hotplug_lock is
1787
+ * a barrier against racing online attempts.
1788
+ */
1789
+ remove_memory_block_devices(start, size);
1790
+
1791
+ mem_hotplug_begin();
1792
+
1793
+ arch_remove_memory(nid, start, size, NULL);
1794
+
1795
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
1796
+ memblock_free(start, size);
1797
+ memblock_remove(start, size);
18871798 }
1799
+
1800
+ release_mem_region_adjustable(start, size);
1801
+
1802
+ try_offline_node(nid);
1803
+
1804
+ mem_hotplug_done();
1805
+ return 0;
18881806 }
18891807
18901808 /**
....@@ -1897,48 +1815,163 @@
18971815 * and online/offline operations before this call, as required by
18981816 * try_offline_node().
18991817 */
1900
-void __ref __remove_memory(int nid, u64 start, u64 size)
1818
+void __remove_memory(int nid, u64 start, u64 size)
19011819 {
1902
- int ret;
1903
-
1904
- BUG_ON(check_hotplug_memory_range(start, size));
19051820
19061821 /*
1907
- * All memory blocks must be offlined before removing memory. Check
1908
- * whether all memory blocks in question are offline and trigger a BUG()
1909
- * if this is not the case.
1822
+ * trigger BUG() if some memory is not offlined prior to calling this
1823
+ * function
19101824 */
1911
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1912
- check_memblock_offlined_cb);
1913
- if (ret)
1825
+ if (try_remove_memory(nid, start, size))
19141826 BUG();
1915
-
1916
- /* remove memmap entry */
1917
- firmware_map_remove(start, start + size, "System RAM");
1918
- memblock_free(start, size);
1919
- memblock_remove(start, size);
1920
-
1921
- /*
1922
- * Memory block device removal under the device_hotplug_lock is
1923
- * a barrier against racing online attempts.
1924
- */
1925
- remove_memory_block_devices(start, size);
1926
-
1927
- mem_hotplug_begin();
1928
-
1929
- arch_remove_memory(nid, start, size, NULL);
1930
- __release_memory_resource(start, size);
1931
-
1932
- try_offline_node(nid);
1933
-
1934
- mem_hotplug_done();
19351827 }
19361828
1937
-void remove_memory(int nid, u64 start, u64 size)
1829
+/*
1830
+ * Remove memory if every memory block is offline, otherwise return -EBUSY is
1831
+ * some memory is not offline
1832
+ */
1833
+int remove_memory(int nid, u64 start, u64 size)
19381834 {
1835
+ int rc;
1836
+
19391837 lock_device_hotplug();
1940
- __remove_memory(nid, start, size);
1838
+ rc = try_remove_memory(nid, start, size);
19411839 unlock_device_hotplug();
1840
+
1841
+ return rc;
19421842 }
19431843 EXPORT_SYMBOL_GPL(remove_memory);
1844
+
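remove_memory() is the self-locking variant: it takes device_hotplug_lock around try_remove_memory() and returns -EBUSY if any memory block is still online, whereas __remove_memory() BUG()s on failure and, per its kerneldoc above, expects the caller to have serialized hotplug operations already. A sketch of a driver-side caller, assuming a hypothetical function name:

/* Sketch only: tear down a previously hot-added, fully offlined range. */
static int example_remove(int nid, u64 start, u64 size)
{
	int rc;

	rc = remove_memory(nid, start, size);
	if (rc)
		pr_warn("removing memory %#llx-%#llx failed: %d\n",
			start, start + size - 1, rc);
	return rc;
}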
1845
+int remove_memory_subsection(int nid, u64 start, u64 size)
1846
+{
1847
+ if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
1848
+ !IS_ALIGNED(size, SUBSECTION_SIZE)) {
1849
+ pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
1850
+ __func__, start, size);
1851
+ return -EINVAL;
1852
+ }
1853
+
1854
+ mem_hotplug_begin();
1855
+ arch_remove_memory(nid, start, size, NULL);
1856
+
1857
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
1858
+ memblock_remove(start, size);
1859
+
1860
+ release_mem_region_adjustable(start, size);
1861
+
1862
+ mem_hotplug_done();
1863
+
1864
+ return 0;
1865
+}
1866
+EXPORT_SYMBOL_GPL(remove_memory_subsection);
1867
+
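remove_memory_subsection() relaxes the removal granularity from whole memory blocks to sub-sections, but the alignment contract stays explicit. An illustrative caller with a hypothetical name, assuming the range was added at the same sub-section granularity:

/* Sketch only: remove a sub-section-aligned range added earlier. */
static int example_remove_subsection(int nid, u64 start, u64 size)
{
	if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
	    !IS_ALIGNED(size, SUBSECTION_SIZE))
		return -EINVAL;

	return remove_memory_subsection(nid, start, size);
}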
1868
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
1869
+{
1870
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
1871
+ uint8_t **online_types = arg;
1872
+ struct page *page;
1873
+ int rc;
1874
+
1875
+ /*
1876
+ * Sense the online_type via the zone of the memory block. Offlining
1877
+ * with multiple zones within one memory block will be rejected
1878
+ * by offlining code ... so we don't care about that.
1879
+ */
1880
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
1881
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
1882
+ online_type = MMOP_ONLINE_MOVABLE;
1883
+
1884
+ rc = device_offline(&mem->dev);
1885
+ /*
1886
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
1887
+ * so try_reonline_memory_block() can do the right thing.
1888
+ */
1889
+ if (!rc)
1890
+ **online_types = online_type;
1891
+
1892
+ (*online_types)++;
1893
+ /* Ignore if already offline. */
1894
+ return rc < 0 ? rc : 0;
1895
+}
1896
+
1897
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
1898
+{
1899
+ uint8_t **online_types = arg;
1900
+ int rc;
1901
+
1902
+ if (**online_types != MMOP_OFFLINE) {
1903
+ mem->online_type = **online_types;
1904
+ rc = device_online(&mem->dev);
1905
+ if (rc < 0)
1906
+ pr_warn("%s: Failed to re-online memory: %d",
1907
+ __func__, rc);
1908
+ }
1909
+
1910
+ /* Continue processing all remaining memory blocks. */
1911
+ (*online_types)++;
1912
+ return 0;
1913
+}
1914
+
1915
+/*
1916
+ * Try to offline and remove memory. Might take a long time to finish in case
1917
+ * memory is still in use. Primarily useful for memory devices that logically
1918
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
1919
+ * that memory.
1920
+ */
1921
+int offline_and_remove_memory(int nid, u64 start, u64 size)
1922
+{
1923
+ const unsigned long mb_count = size / memory_block_size_bytes();
1924
+ uint8_t *online_types, *tmp;
1925
+ int rc;
1926
+
1927
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
1928
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
1929
+ return -EINVAL;
1930
+
1931
+ /*
1932
+ * We'll remember the old online type of each memory block, so we can
1933
+ * try to revert whatever we did when offlining one memory block fails
1934
+ * after offlining some others succeeded.
1935
+ */
1936
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
1937
+ GFP_KERNEL);
1938
+ if (!online_types)
1939
+ return -ENOMEM;
1940
+ /*
1941
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
1942
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
1943
+ * try_reonline_memory_block().
1944
+ */
1945
+ memset(online_types, MMOP_OFFLINE, mb_count);
1946
+
1947
+ lock_device_hotplug();
1948
+
1949
+ tmp = online_types;
1950
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
1951
+
1952
+ /*
1953
+ * In case we succeeded to offline all memory, remove it.
1954
+ * This cannot fail as it cannot get onlined in the meantime.
1955
+ */
1956
+ if (!rc) {
1957
+ rc = try_remove_memory(nid, start, size);
1958
+ if (rc)
1959
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
1960
+ }
1961
+
1962
+ /*
1963
+ * Rollback what we did. While memory onlining might theoretically fail
1964
+ * (nacked by a notifier), it barely ever happens.
1965
+ */
1966
+ if (rc) {
1967
+ tmp = online_types;
1968
+ walk_memory_blocks(start, size, &tmp,
1969
+ try_reonline_memory_block);
1970
+ }
1971
+ unlock_device_hotplug();
1972
+
1973
+ kfree(online_types);
1974
+ return rc;
1975
+}
1976
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
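offline_and_remove_memory() bundles offlining and removal for drivers that have already logically unplugged a block-aligned range; if any block fails to offline, the blocks that did go down are re-onlined with the types remembered in online_types. A hypothetical driver-side wrapper for a single memory block:

/* Sketch only: hand one fully unused memory block back to the system. */
static int example_unplug_one_block(int nid, u64 addr)
{
	const u64 block_size = memory_block_size_bytes();

	if (!IS_ALIGNED(addr, block_size))
		return -EINVAL;

	return offline_and_remove_memory(nid, addr, block_size);
}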
19441977 #endif /* CONFIG_MEMORY_HOTREMOVE */