2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memory_hotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  linux/mm/memory_hotplug.c
  *
@@ -33,13 +34,13 @@
 #include <linux/stop_machine.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>
 #include <linux/compaction.h>
 #include <linux/rmap.h>

 #include <asm/tlbflush.h>

 #include "internal.h"
+#include "shuffle.h"

 /*
  * online_page_callback contains pointer to current page onlining function.
@@ -47,8 +48,6 @@
  * changed by calling set_online_page_callback() for callback registration
  * and restore_online_page_callback() for generic callback restore.
  */
-
-static void generic_online_page(struct page *page);

 static online_page_callback_t online_page_callback = generic_online_page;
 static DEFINE_MUTEX(online_page_callback_lock);
@@ -68,18 +67,17 @@
 bool movable_node_enabled = false;

 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
 #else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
 #endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);

 static int __init setup_memhp_default_state(char *str)
 {
-    if (!strcmp(str, "online"))
-        memhp_auto_online = true;
-    else if (!strcmp(str, "offline"))
-        memhp_auto_online = false;
+    const int online_type = memhp_online_type_from_str(str);
+
+    if (online_type >= 0)
+        memhp_default_online_type = online_type;

     return 1;
 }
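
For illustration only (not part of the patch): a minimal sketch of the string-to-online-type mapping that setup_memhp_default_state() now delegates to. The real helper, memhp_online_type_from_str(), is defined outside this file (drivers/base/memory.c upstream); the function name below is a hypothetical stand-in describing its expected behaviour.

/* Hypothetical stand-in mirroring what memhp_online_type_from_str() is
 * expected to return for the "memhp_default_state=" boot parameter. */
static int example_online_type_from_str(const char *str)
{
    if (sysfs_streq(str, "offline"))
        return MMOP_OFFLINE;
    if (sysfs_streq(str, "online"))
        return MMOP_ONLINE;
    if (sysfs_streq(str, "online_kernel"))
        return MMOP_ONLINE_KERNEL;
    if (sysfs_streq(str, "online_movable"))
        return MMOP_ONLINE_MOVABLE;
    return -EINVAL; /* unrecognized: setup_memhp_default_state() keeps the default */
}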
@@ -97,27 +95,38 @@
     cpus_read_unlock();
 }

-/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
-{
-    struct resource *res, *conflict;
-    res = kzalloc(sizeof(struct resource), GFP_KERNEL);
-    if (!res)
-        return ERR_PTR(-ENOMEM);
+u64 max_mem_size = U64_MAX;

-    res->name = "System RAM";
-    res->start = start;
-    res->end = start + size - 1;
-    res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-    conflict = request_resource_conflict(&iomem_resource, res);
-    if (conflict) {
-        if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
-            pr_debug("Device unaddressable memory block "
-                 "memory hotplug at %#010llx !\n",
-                 (unsigned long long)start);
-        }
-        pr_debug("System RAM resource %pR cannot be added\n", res);
-        kfree(res);
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size,
+                         const char *resource_name)
+{
+    struct resource *res;
+    unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+    if (strcmp(resource_name, "System RAM"))
+        flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+
+    /*
+     * Make sure value parsed from 'mem=' only restricts memory adding
+     * while booting, so that memory hotplug won't be impacted. Please
+     * refer to document of 'mem=' in kernel-parameters.txt for more
+     * details.
+     */
+    if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
+        return ERR_PTR(-E2BIG);
+
+    /*
+     * Request ownership of the new memory range. This might be
+     * a child of an existing resource that was present but
+     * not marked as busy.
+     */
+    res = __request_region(&iomem_resource, start, size,
+                   resource_name, flags);
+
+    if (!res) {
+        pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+             start, start + size);
         return ERR_PTR(-EEXIST);
     }
     return res;
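
A rough in-file usage sketch (the driver name is invented): the resource_name argument is what decides whether the new IORESOURCE_SYSRAM_DRIVER_MANAGED flag is set.

/* Anything other than exactly "System RAM" is treated as driver-managed. */
static struct resource *example_reserve(u64 start, u64 size, bool driver_managed)
{
    return register_memory_resource(start, size,
            driver_managed ? "System RAM (example_driver)" /* invented name */
                   : "System RAM");
}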
@@ -129,7 +138,6 @@
         return;
     release_resource(res);
     kfree(res);
-    return;
 }

 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -163,9 +171,10 @@
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-    unsigned long *usemap, mapsize, section_nr, i;
+    unsigned long mapsize, section_nr, i;
     struct mem_section *ms;
     struct page *page, *memmap;
+    struct mem_section_usage *usage;

     section_nr = pfn_to_section_nr(start_pfn);
     ms = __nr_to_section(section_nr);
@@ -185,10 +194,10 @@
     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, SECTION_INFO);

-    usemap = ms->pageblock_flags;
-    page = virt_to_page(usemap);
+    usage = ms->usage;
+    page = virt_to_page(usage);

-    mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+    mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -197,9 +206,10 @@
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
-    unsigned long *usemap, mapsize, section_nr, i;
+    unsigned long mapsize, section_nr, i;
     struct mem_section *ms;
     struct page *page, *memmap;
+    struct mem_section_usage *usage;

     section_nr = pfn_to_section_nr(start_pfn);
     ms = __nr_to_section(section_nr);
@@ -208,10 +218,10 @@

     register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

-    usemap = ms->pageblock_flags;
-    page = virt_to_page(usemap);
+    usage = ms->usage;
+    page = virt_to_page(usage);

-    mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+    mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

     for (i = 0; i < mapsize; i++, page++)
         get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -247,16 +257,47 @@
 }
 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
-        struct vmem_altmap *altmap, bool want_memblock)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+        const char *reason)
 {
-    int ret;
+    /*
+     * Disallow all operations smaller than a sub-section and only
+     * allow operations smaller than a section for
+     * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+     * enforces a larger memory_block_size_bytes() granularity for
+     * memory that will be marked online, so this check should only
+     * fire for direct arch_{add,remove}_memory() users outside of
+     * add_memory_resource().
+     */
+    unsigned long min_align;

-    if (pfn_valid(phys_start_pfn))
-        return -EEXIST;
+    if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+        min_align = PAGES_PER_SUBSECTION;
+    else
+        min_align = PAGES_PER_SECTION;
+    if (!IS_ALIGNED(pfn, min_align)
+            || !IS_ALIGNED(nr_pages, min_align)) {
+        WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
+             reason, pfn, pfn + nr_pages - 1);
+        return -EINVAL;
+    }
+    return 0;
+}

-    ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
-    return ret < 0 ? ret : 0;
+static int check_hotplug_memory_addressable(unsigned long pfn,
+        unsigned long nr_pages)
+{
+    const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+    if (max_addr >> MAX_PHYSMEM_BITS) {
+        const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+        WARN(1,
+             "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+             (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+        return -E2BIG;
+    }
+
+    return 0;
 }

 /*
@@ -265,47 +306,47 @@
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-        unsigned long nr_pages, struct vmem_altmap *altmap,
-        bool want_memblock)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+        struct mhp_params *params)
 {
-    unsigned long i;
-    int err = 0;
-    int start_sec, end_sec;
+    const unsigned long end_pfn = pfn + nr_pages;
+    unsigned long cur_nr_pages;
+    int err;
+    struct vmem_altmap *altmap = params->altmap;

-    /* during initialize mem_map, align hot-added range to section */
-    start_sec = pfn_to_section_nr(phys_start_pfn);
-    end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+    if (WARN_ON_ONCE(!params->pgprot.pgprot))
+        return -EINVAL;
+
+    err = check_hotplug_memory_addressable(pfn, nr_pages);
+    if (err)
+        return err;

     if (altmap) {
         /*
          * Validate altmap is within bounds of the total request
          */
-        if (altmap->base_pfn != phys_start_pfn
+        if (altmap->base_pfn != pfn
                 || vmem_altmap_offset(altmap) > nr_pages) {
             pr_warn_once("memory add fail, invalid altmap\n");
-            err = -EINVAL;
-            goto out;
+            return -EINVAL;
         }
         altmap->alloc = 0;
     }

-    for (i = start_sec; i <= end_sec; i++) {
-        err = __add_section(nid, section_nr_to_pfn(i), altmap,
-                    want_memblock);
+    err = check_pfn_span(pfn, nr_pages, "add");
+    if (err)
+        return err;

-        /*
-         * EEXIST is finally dealt with by ioresource collision
-         * check. see add_memory() => register_memory_resource()
-         * Warning will be printed if there is collision.
-         */
-        if (err && (err != -EEXIST))
+    for (; pfn < end_pfn; pfn += cur_nr_pages) {
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages = min(end_pfn - pfn,
+                   SECTION_ALIGN_UP(pfn + 1) - pfn);
+        err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
+        if (err)
             break;
-        err = 0;
         cond_resched();
     }
     vmemmap_populate_print_last();
-out:
     return err;
 }

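
The new __add_pages() loop above walks the range in section-bounded chunks instead of whole sections. A self-contained sketch of that stepping logic (plain C; the kernel constants are replaced by local ones, assuming x86-64 defaults of 4 KiB pages and 128 MiB sections):

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL   /* 128 MiB of 4 KiB pages (x86-64 default) */
#define SECTION_ALIGN_UP(pfn) \
    (((pfn) + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1))

/* Mirrors the stepping in __add_pages()/__remove_pages(): each chunk stops at
 * the next section boundary, so partially populated first/last sections are
 * handled without touching neighbouring sections. */
static void walk_chunks(unsigned long pfn, unsigned long nr_pages)
{
    const unsigned long end_pfn = pfn + nr_pages;
    unsigned long cur_nr_pages;

    for (; pfn < end_pfn; pfn += cur_nr_pages) {
        unsigned long bound = SECTION_ALIGN_UP(pfn + 1);

        cur_nr_pages = (end_pfn - pfn < bound - pfn) ?
                   end_pfn - pfn : bound - pfn;
        printf("chunk: pfn %lu, %lu pages\n", pfn, cur_nr_pages);
    }
}

int main(void)
{
    /* Example: a range starting mid-section and crossing two boundaries. */
    walk_chunks(40000, 70000);
    return 0;
}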
@@ -314,14 +355,14 @@
                     unsigned long start_pfn,
                     unsigned long end_pfn)
 {
-    for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+    for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
         if (unlikely(!pfn_to_online_page(start_pfn)))
             continue;

         if (unlikely(pfn_to_nid(start_pfn) != nid))
             continue;

-        if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+        if (zone != page_zone(pfn_to_page(start_pfn)))
             continue;

         return start_pfn;
@@ -339,14 +380,14 @@

     /* pfn is the end pfn of a memory section. */
     pfn = end_pfn - 1;
-    for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+    for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
         if (unlikely(!pfn_to_online_page(pfn)))
             continue;

         if (unlikely(pfn_to_nid(pfn) != nid))
             continue;

-        if (zone && zone != page_zone(pfn_to_page(pfn)))
+        if (zone != page_zone(pfn_to_page(pfn)))
             continue;

         return pfn;
@@ -358,14 +399,11 @@
 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
                  unsigned long end_pfn)
 {
-    unsigned long zone_start_pfn = zone->zone_start_pfn;
-    unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
-    unsigned long zone_end_pfn = z;
     unsigned long pfn;
     int nid = zone_to_nid(zone);

     zone_span_writelock(zone);
-    if (zone_start_pfn == start_pfn) {
+    if (zone->zone_start_pfn == start_pfn) {
         /*
          * If the section is smallest section in the zone, it need
          * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -373,50 +411,30 @@
          * for shrinking zone.
          */
         pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-                        zone_end_pfn);
+                        zone_end_pfn(zone));
         if (pfn) {
+            zone->spanned_pages = zone_end_pfn(zone) - pfn;
             zone->zone_start_pfn = pfn;
-            zone->spanned_pages = zone_end_pfn - pfn;
+        } else {
+            zone->zone_start_pfn = 0;
+            zone->spanned_pages = 0;
         }
-    } else if (zone_end_pfn == end_pfn) {
+    } else if (zone_end_pfn(zone) == end_pfn) {
         /*
          * If the section is biggest section in the zone, it need
          * shrink zone->spanned_pages.
          * In this case, we find second biggest valid mem_section for
          * shrinking zone.
          */
-        pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+        pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
                            start_pfn);
         if (pfn)
-            zone->spanned_pages = pfn - zone_start_pfn + 1;
+            zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+        else {
+            zone->zone_start_pfn = 0;
+            zone->spanned_pages = 0;
+        }
     }
-
-    /*
-     * The section is not biggest or smallest mem_section in the zone, it
-     * only creates a hole in the zone. So in this case, we need not
-     * change the zone. But perhaps, the zone has only hole data. Thus
-     * it check the zone has only hole or not.
-     */
-    pfn = zone_start_pfn;
-    for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
-        if (unlikely(!pfn_to_online_page(pfn)))
-            continue;
-
-        if (page_zone(pfn_to_page(pfn)) != zone)
-            continue;
-
-        /* If the section is current section, it continues the loop */
-        if (start_pfn == pfn)
-            continue;
-
-        /* If we find valid section, we have nothing to do */
-        zone_span_writeunlock(zone);
-        return;
-    }
-
-    /* The zone has no valid section */
-    zone->zone_start_pfn = 0;
-    zone->spanned_pages = 0;
     zone_span_writeunlock(zone);
 }

@@ -453,8 +471,20 @@
                   unsigned long start_pfn,
                   unsigned long nr_pages)
 {
+    const unsigned long end_pfn = start_pfn + nr_pages;
     struct pglist_data *pgdat = zone->zone_pgdat;
-    unsigned long flags;
+    unsigned long pfn, cur_nr_pages, flags;
+
+    /* Poison struct pages because they are now uninitialized again. */
+    for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+        cond_resched();
+
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages =
+            min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+        page_init_poison(pfn_to_page(pfn),
+                 sizeof(struct page) * cur_nr_pages);
+    }

 #ifdef CONFIG_ZONE_DEVICE
     /*
@@ -476,24 +506,21 @@
     set_zone_contiguous(zone);
 }

-static void __remove_section(struct mem_section *ms, unsigned long map_offset,
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+                 unsigned long map_offset,
                  struct vmem_altmap *altmap)
 {
-    unsigned long start_pfn;
-    int scn_nr;
+    struct mem_section *ms = __pfn_to_section(pfn);

     if (WARN_ON_ONCE(!valid_section(ms)))
         return;

-    scn_nr = __section_nr(ms);
-    start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
-
-    sparse_remove_one_section(ms, map_offset, altmap);
+    sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
 }

 /**
  * __remove_pages() - remove sections of pages
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
  *
@@ -502,28 +529,24 @@
  * sure that pages are marked reserved and zones are adjust properly by
  * calling offline_pages().
  */
-void __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages,
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
             struct vmem_altmap *altmap)
 {
-    unsigned long i;
+    const unsigned long end_pfn = pfn + nr_pages;
+    unsigned long cur_nr_pages;
     unsigned long map_offset = 0;
-    int sections_to_remove;

-    if (altmap)
-        map_offset = vmem_altmap_offset(altmap);
+    map_offset = vmem_altmap_offset(altmap);

-    /*
-     * We can only remove entire sections
-     */
-    BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
-    BUG_ON(nr_pages % PAGES_PER_SECTION);
+    if (check_pfn_span(pfn, nr_pages, "remove"))
+        return;

-    sections_to_remove = nr_pages / PAGES_PER_SECTION;
-    for (i = 0; i < sections_to_remove; i++) {
-        unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
-
+    for (; pfn < end_pfn; pfn += cur_nr_pages) {
         cond_resched();
-        __remove_section(__pfn_to_section(pfn), map_offset, altmap);
+        /* Select all remaining pages up to the next section boundary */
+        cur_nr_pages = min(end_pfn - pfn,
+                   SECTION_ALIGN_UP(pfn + 1) - pfn);
+        __remove_section(pfn, cur_nr_pages, map_offset, altmap);
         map_offset = 0;
     }
 }
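
__remove_pages() now defers its granularity check to check_pfn_span() instead of the old BUG_ON()s: subsection granularity is accepted only with SPARSEMEM_VMEMMAP, otherwise whole sections. For a sense of scale, a small worked-arithmetic block assuming x86-64 defaults (4 KiB base pages, 2 MiB subsections, 128 MiB sections); the real values come from SUBSECTION_SHIFT and SECTION_SIZE_BITS of the architecture being built:

/* Illustrative x86-64 numbers only. */
#define EXAMPLE_PAGE_SIZE        (4UL << 10)
#define EXAMPLE_SUBSECTION_SIZE  (2UL << 20)
#define EXAMPLE_SECTION_SIZE     (128UL << 20)

_Static_assert(EXAMPLE_SUBSECTION_SIZE / EXAMPLE_PAGE_SIZE == 512,
           "2 MiB subsection = 512 base pages (min alignment with VMEMMAP)");
_Static_assert(EXAMPLE_SECTION_SIZE / EXAMPLE_PAGE_SIZE == 32768,
           "128 MiB section = 32768 base pages (min alignment otherwise)");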
@@ -566,48 +589,39 @@
 }
 EXPORT_SYMBOL_GPL(restore_online_page_callback);

-void __online_page_set_limits(struct page *page)
+void generic_online_page(struct page *page, unsigned int order)
 {
+    /*
+     * Freeing the page with debug_pagealloc enabled will try to unmap it,
+     * so we should map it first. This is better than introducing a special
+     * case in page freeing fast path.
+     */
+    debug_pagealloc_map_pages(page, 1 << order);
+    __free_pages_core(page, order);
+    totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+    if (PageHighMem(page))
+        totalhigh_pages_add(1UL << order);
+#endif
 }
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
+EXPORT_SYMBOL_GPL(generic_online_page);

-void __online_page_increment_counters(struct page *page)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 {
-    adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
+    const unsigned long end_pfn = start_pfn + nr_pages;
+    unsigned long pfn;

-void __online_page_free(struct page *page)
-{
-    __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
+    /*
+     * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+     * decide to not expose all pages to the buddy (e.g., expose them
+     * later). We account all pages as being online and belonging to this
+     * zone ("present").
+     */
+    for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+        (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);

-static void generic_online_page(struct page *page)
-{
-    __online_page_set_limits(page);
-    __online_page_increment_counters(page);
-    __online_page_free(page);
-}
-
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
-        void *arg)
-{
-    unsigned long i;
-    unsigned long onlined_pages = *(unsigned long *)arg;
-    struct page *page;
-
-    if (PageReserved(pfn_to_page(start_pfn)))
-        for (i = 0; i < nr_pages; i++) {
-            page = pfn_to_page(start_pfn + i);
-            (*online_page_callback)(page);
-            onlined_pages++;
-        }
-
-    online_mem_sections(start_pfn, start_pfn + nr_pages);
-
-    *(unsigned long *)arg = onlined_pages;
-    return 0;
+    /* mark all involved sections as online */
+    online_mem_sections(start_pfn, end_pfn);
 }

 /* check which state of node_states will be changed when online memory */
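
With the hunk above, generic_online_page() takes an order and is exported, so drivers that intercept onlining through the callback machinery at the top of this file must match the new signature. A hedged sketch of such a hook; the driver name, the deferral policy, and both helper functions are invented for illustration:

#include <linux/memory_hotplug.h>

/* Invented policy helpers, stubs only. */
static bool example_should_defer(struct page *page, unsigned int order)
{
    return false;   /* placeholder policy */
}

static void example_queue_for_later(struct page *page, unsigned int order)
{
    /* placeholder: a real driver would track the page and release it
     * later via generic_online_page(page, order) */
}

/* Hypothetical driver callback matching online_page_callback_t. */
static void example_online_page(struct page *page, unsigned int order)
{
    if (example_should_defer(page, order))
        example_queue_for_later(page, order);
    else
        generic_online_page(page, order);
}

static int example_init(void)
{
    /* Fails if some other driver already installed a callback. */
    return set_online_page_callback(&example_online_page);
}

static void example_exit(void)
{
    restore_online_page_callback(&example_online_page);
}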
@@ -615,62 +629,19 @@
                    struct zone *zone, struct memory_notify *arg)
 {
     int nid = zone_to_nid(zone);
-    enum zone_type zone_last = ZONE_NORMAL;

-    /*
-     * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
-     * contains nodes which have zones of 0...ZONE_NORMAL,
-     * set zone_last to ZONE_NORMAL.
-     *
-     * If we don't have HIGHMEM nor movable node,
-     * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
-     * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
-     */
-    if (N_MEMORY == N_NORMAL_MEMORY)
-        zone_last = ZONE_MOVABLE;
+    arg->status_change_nid = NUMA_NO_NODE;
+    arg->status_change_nid_normal = NUMA_NO_NODE;
+    arg->status_change_nid_high = NUMA_NO_NODE;

-    /*
-     * if the memory to be online is in a zone of 0...zone_last, and
-     * the zones of 0...zone_last don't have memory before online, we will
-     * need to set the node to node_states[N_NORMAL_MEMORY] after
-     * the memory is online.
-     */
-    if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
-        arg->status_change_nid_normal = nid;
-    else
-        arg->status_change_nid_normal = -1;
-
-#ifdef CONFIG_HIGHMEM
-    /*
-     * If we have movable node, node_states[N_HIGH_MEMORY]
-     * contains nodes which have zones of 0...ZONE_HIGHMEM,
-     * set zone_last to ZONE_HIGHMEM.
-     *
-     * If we don't have movable node, node_states[N_NORMAL_MEMORY]
-     * contains nodes which have zones of 0...ZONE_MOVABLE,
-     * set zone_last to ZONE_MOVABLE.
-     */
-    zone_last = ZONE_HIGHMEM;
-    if (N_MEMORY == N_HIGH_MEMORY)
-        zone_last = ZONE_MOVABLE;
-
-    if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
-        arg->status_change_nid_high = nid;
-    else
-        arg->status_change_nid_high = -1;
-#else
-    arg->status_change_nid_high = arg->status_change_nid_normal;
-#endif
-
-    /*
-     * if the node don't have memory befor online, we will need to
-     * set the node to node_states[N_MEMORY] after the memory
-     * is online.
-     */
     if (!node_state(nid, N_MEMORY))
         arg->status_change_nid = nid;
-    else
-        arg->status_change_nid = -1;
+    if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
+        arg->status_change_nid_normal = nid;
+#ifdef CONFIG_HIGHMEM
+    if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
+        arg->status_change_nid_high = nid;
+#endif
 }

 static void node_states_set_node(int node, struct memory_notify *arg)
@@ -681,7 +652,8 @@
     if (arg->status_change_nid_high >= 0)
         node_set_state(node, N_HIGH_MEMORY);

-    node_set_state(node, N_MEMORY);
+    if (arg->status_change_nid >= 0)
+        node_set_state(node, N_MEMORY);
 }

 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -704,23 +676,32 @@
     pgdat->node_start_pfn = start_pfn;

     pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}

+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
-                  unsigned long nr_pages, struct vmem_altmap *altmap)
+                  unsigned long nr_pages,
+                  struct vmem_altmap *altmap, int migratetype)
 {
     struct pglist_data *pgdat = zone->zone_pgdat;
     int nid = pgdat->node_id;
     unsigned long flags;
-
-    if (zone_is_empty(zone))
-        init_currently_empty_zone(zone, start_pfn, nr_pages);

     clear_zone_contiguous(zone);

     /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
     pgdat_resize_lock(pgdat, &flags);
     zone_span_writelock(zone);
+    if (zone_is_empty(zone))
+        init_currently_empty_zone(zone, start_pfn, nr_pages);
     resize_zone_range(zone, start_pfn, nr_pages);
     zone_span_writeunlock(zone);
     resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -732,8 +713,8 @@
      * expects the zone spans the pfn range. All the pages in the range
      * are reserved so nobody should be touching them so we should be safe
      */
-    memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
-             MEMINIT_HOTPLUG, altmap);
+    memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+             MEMINIT_HOTPLUG, altmap, migratetype);

     set_zone_contiguous(zone);
 }
@@ -795,43 +776,25 @@
     return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }

-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
-        unsigned long start_pfn, unsigned long nr_pages)
-{
-    struct zone *zone;
-
-    zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
-    move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
-    return zone;
-}
-
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+               int online_type, int nid)
 {
     unsigned long flags;
-    unsigned long onlined_pages = 0;
     struct zone *zone;
     int need_zonelists_rebuild = 0;
-    int nid;
     int ret;
     struct memory_notify arg;
-    struct memory_block *mem;
+
+    /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+    if (WARN_ON_ONCE(!nr_pages ||
+             !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+        return -EINVAL;

     mem_hotplug_begin();

-    /*
-     * We can't use pfn_to_nid() because nid might be stored in struct page
-     * which is not yet initialized. Instead, we find nid from memory block.
-     */
-    mem = find_memory_block(__pfn_to_section(pfn));
-    nid = mem->nid;
-    put_device(&mem->dev);
-
     /* associate pfn range with the zone */
-    zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+    zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+    move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);

     arg.start_pfn = pfn;
     arg.nr_pages = nr_pages;
@@ -843,6 +806,14 @@
         goto failed_addition;

     /*
+     * Fixup the number of isolated pageblocks before marking the sections
+     * onlining, such that undo_isolate_page_range() works correctly.
+     */
+    spin_lock_irqsave(&zone->lock, flags);
+    zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+    spin_unlock_irqrestore(&zone->lock, flags);
+
+    /*
      * If this zone is not populated, then it is not in zonelist.
      * This means the page allocator ignores this zone.
      * So, zonelist must be updated after online.
@@ -852,41 +823,37 @@
         setup_zone_pageset(zone);
     }

-    ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
-        online_pages_range);
-    if (ret) {
-        if (need_zonelists_rebuild)
-            zone_pcp_reset(zone);
-        goto failed_addition;
-    }
-
-    zone->present_pages += onlined_pages;
+    online_pages_range(pfn, nr_pages);
+    zone->present_pages += nr_pages;

     pgdat_resize_lock(zone->zone_pgdat, &flags);
-    zone->zone_pgdat->node_present_pages += onlined_pages;
+    zone->zone_pgdat->node_present_pages += nr_pages;
     pgdat_resize_unlock(zone->zone_pgdat, &flags);

-    if (onlined_pages) {
-        node_states_set_node(nid, &arg);
-        if (need_zonelists_rebuild)
-            build_all_zonelists(NULL);
-        else
-            zone_pcp_update(zone);
-    }
+    node_states_set_node(nid, &arg);
+    if (need_zonelists_rebuild)
+        build_all_zonelists(NULL);
+    zone_pcp_update(zone);
+
+    /* Basic onlining is complete, allow allocation of onlined pages. */
+    undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+    /*
+     * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+     * the tail of the freelist when undoing isolation). Shuffle the whole
+     * zone to make sure the just onlined pages are properly distributed
+     * across the whole freelist - to create an initial shuffle.
+     */
+    shuffle_zone(zone);

     init_per_zone_wmark_min();

-    if (onlined_pages) {
-        kswapd_run(nid);
-        kcompactd_run(nid);
-    }
-
-    vm_total_pages = nr_free_pagecache_pages();
+    kswapd_run(nid);
+    kcompactd_run(nid);

     writeback_set_ratelimit();

-    if (onlined_pages)
-        memory_notify(MEM_ONLINE, &arg);
+    memory_notify(MEM_ONLINE, &arg);
     mem_hotplug_done();
     return 0;

@@ -912,10 +879,9 @@
 }

 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
 {
     struct pglist_data *pgdat;
-    unsigned long start_pfn = PFN_DOWN(start);

     pgdat = NODE_DATA(nid);
     if (!pgdat) {
@@ -923,26 +889,33 @@
         if (!pgdat)
             return NULL;

+        pgdat->per_cpu_nodestats =
+            alloc_percpu(struct per_cpu_nodestat);
         arch_refresh_nodedata(nid, pgdat);
     } else {
+        int cpu;
         /*
-         * Reset the nr_zones, order and classzone_idx before reuse.
-         * Note that kswapd will init kswapd_classzone_idx properly
+         * Reset the nr_zones, order and highest_zoneidx before reuse.
+         * Note that kswapd will init kswapd_highest_zoneidx properly
          * when it starts in the near future.
         */
         pgdat->nr_zones = 0;
         pgdat->kswapd_order = 0;
-        pgdat->kswapd_classzone_idx = 0;
+        pgdat->kswapd_highest_zoneidx = 0;
+        for_each_online_cpu(cpu) {
+            struct per_cpu_nodestat *p;
+
+            p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+            memset(p, 0, sizeof(*p));
+        }
     }

     /* we can use NODE_DATA(nid) from here */
-
     pgdat->node_id = nid;
-    pgdat->node_start_pfn = start_pfn;
+    pgdat->node_start_pfn = 0;

     /* init node's zones as empty zones, we don't have any present pages.*/
     free_area_init_core_hotplug(nid);
-    pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

     /*
      * The node we allocated has no zone fallback lists. For avoiding
@@ -968,14 +941,12 @@
     arch_refresh_nodedata(nid, NULL);
     free_percpu(pgdat->per_cpu_nodestats);
     arch_free_nodedata(pgdat);
-    return;
 }


 /**
  * try_online_node - online a node if offlined
  * @nid: the node ID
- * @start: start addr of the node
  * @set_node_online: Whether we want to online the node
  * called by cpu_up() to online a node without onlined memory.
  *
@@ -984,7 +955,7 @@
  * 0 -> the node is already online
  * -ENOMEM -> the node could not be allocated
  */
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
 {
     pg_data_t *pgdat;
     int ret = 1;
@@ -992,7 +963,7 @@
     if (node_online(nid))
         return 0;

-    pgdat = hotadd_new_pgdat(nid, start);
+    pgdat = hotadd_new_pgdat(nid);
     if (!pgdat) {
         pr_err("Cannot online node %d due to NULL pgdat\n", nid);
         ret = -ENOMEM;
@@ -1016,23 +987,18 @@
     int ret;

     mem_hotplug_begin();
-    ret = __try_online_node(nid, 0, true);
+    ret = __try_online_node(nid, true);
     mem_hotplug_done();
     return ret;
 }

 static int check_hotplug_memory_range(u64 start, u64 size)
 {
-    unsigned long block_sz = memory_block_size_bytes();
-    u64 block_nr_pages = block_sz >> PAGE_SHIFT;
-    u64 nr_pages = size >> PAGE_SHIFT;
-    u64 start_pfn = PFN_DOWN(start);
-
     /* memory range must be block size aligned */
-    if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
-        !IS_ALIGNED(nr_pages, block_nr_pages)) {
+    if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+        !IS_ALIGNED(size, memory_block_size_bytes())) {
         pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
-               block_sz, start, size);
+               memory_block_size_bytes(), start, size);
         return -EINVAL;
     }

@@ -1041,6 +1007,7 @@

 static int online_memory_block(struct memory_block *mem, void *arg)
 {
+    mem->online_type = memhp_default_online_type;
     return device_online(&mem->dev);
 }

@@ -1050,8 +1017,9 @@
  *
  * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
  */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
+    struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
     u64 start, size;
     bool new_node = false;
     int ret;
@@ -1063,23 +1031,23 @@
     if (ret)
         return ret;

+    if (!node_possible(nid)) {
+        WARN(1, "node %d was absent from the node_possible_map\n", nid);
+        return -EINVAL;
+    }
+
     mem_hotplug_begin();

-    /*
-     * Add new range to memblock so that when hotadd_new_pgdat() is called
-     * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
-     * this new range and calculate total pages correctly. The range will
-     * be removed at hot-remove time.
-     */
-    memblock_add_node(start, size, nid);
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_add_node(start, size, nid);

-    ret = __try_online_node(nid, start, false);
+    ret = __try_online_node(nid, false);
     if (ret < 0)
         goto error;
     new_node = ret;

     /* call arch's memory hotadd */
-    ret = arch_add_memory(nid, start, size, NULL, true);
+    ret = arch_add_memory(nid, start, size, &params);
     if (ret < 0)
         goto error;

@@ -1102,143 +1070,170 @@
     }

     /* link memory sections under this node.*/
-    ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
-                MEMINIT_HOTPLUG);
-    BUG_ON(ret);
+    link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+              MEMINIT_HOTPLUG);

     /* create new memmap entry */
-    firmware_map_add_hotplug(start, start + size, "System RAM");
+    if (!strcmp(res->name, "System RAM"))
+        firmware_map_add_hotplug(start, start + size, "System RAM");

     /* device_online() will take the lock when calling online_pages() */
     mem_hotplug_done();

+    /*
+     * In case we're allowed to merge the resource, flag it and trigger
+     * merging now that adding succeeded.
+     */
+    if (mhp_flags & MEMHP_MERGE_RESOURCE)
+        merge_system_ram_resource(res);
+
     /* online pages if requested */
-    if (online)
-        walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
-                  NULL, online_memory_block);
+    if (memhp_default_online_type != MMOP_OFFLINE)
+        walk_memory_blocks(start, size, NULL, online_memory_block);

     return ret;
 error:
     /* rollback pgdat allocation and others */
     if (new_node)
         rollback_node_hotadd(nid);
-    memblock_remove(start, size);
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_remove(start, size);
     mem_hotplug_done();
     return ret;
 }

 /* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 {
     struct resource *res;
     int ret;

-    res = register_memory_resource(start, size);
+    res = register_memory_resource(start, size, "System RAM");
     if (IS_ERR(res))
         return PTR_ERR(res);

-    ret = add_memory_resource(nid, res, memhp_auto_online);
+    ret = add_memory_resource(nid, res, mhp_flags);
     if (ret < 0)
         release_memory_resource(res);
     return ret;
 }

-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
 {
     int rc;

     lock_device_hotplug();
-    rc = __add_memory(nid, start, size);
+    rc = __add_memory(nid, start, size, mhp_flags);
     unlock_device_hotplug();

     return rc;
 }
 EXPORT_SYMBOL_GPL(add_memory);

+int add_memory_subsection(int nid, u64 start, u64 size)
+{
+    struct mhp_params params = { .pgprot = PAGE_KERNEL };
+    struct resource *res;
+    int ret;
+
+    if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
+        !IS_ALIGNED(size, SUBSECTION_SIZE)) {
+        pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
+               __func__, start, size);
+        return -EINVAL;
+    }
+
+    res = register_memory_resource(start, size, "System RAM");
+    if (IS_ERR(res))
+        return PTR_ERR(res);
+
+    mem_hotplug_begin();
+
+    nid = memory_add_physaddr_to_nid(start);
+
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_add_node(start, size, nid);
+
+    ret = arch_add_memory(nid, start, size, &params);
+    if (ret) {
+        pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n",
+               __func__, start, size);
+        goto err_add_memory;
+    }
+    mem_hotplug_done();
+
+    return ret;
+
+err_add_memory:
+    if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+        memblock_remove(start, size);
+
+    mem_hotplug_done();
+
+    release_memory_resource(res);
+    return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory_subsection);
+
+/*
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ *   used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ *   can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
+ */
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+                  const char *resource_name, mhp_t mhp_flags)
+{
+    struct resource *res;
+    int rc;
+
+    if (!resource_name ||
+        strstr(resource_name, "System RAM (") != resource_name ||
+        resource_name[strlen(resource_name) - 1] != ')')
+        return -EINVAL;
+
+    lock_device_hotplug();
+
+    res = register_memory_resource(start, size, resource_name);
+    if (IS_ERR(res)) {
+        rc = PTR_ERR(res);
+        goto out_unlock;
+    }
+
+    rc = add_memory_resource(nid, res, mhp_flags);
+    if (rc < 0)
+        release_memory_resource(res);
+
+out_unlock:
+    unlock_device_hotplug();
+    return rc;
+}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
  */
-static inline int pageblock_free(struct page *page)
-{
-    return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
-
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
-    struct page *page = pfn_to_page(pfn);
-
-    /* Ensure the starting page is pageblock-aligned */
-    BUG_ON(pfn & (pageblock_nr_pages - 1));
-
-    /* If the entire pageblock is free, move to the end of free page */
-    if (pageblock_free(page)) {
-        int order;
-        /* be careful. we don't have locks, page_order can be changed.*/
-        order = page_order(page);
-        if ((order < MAX_ORDER) && (order >= pageblock_order))
-            return pfn + (1 << order);
-    }
-
-    return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
-    struct page *page = pfn_to_page(pfn);
-    struct zone *zone;
-
-    /*
-     * We have to be careful here because we are iterating over memory
-     * sections which are not zone aware so we might end up outside of
-     * the zone but still within the section.
-     * We have to take care about the node as well. If the node is offline
-     * its NODE_DATA will be NULL - see page_zone.
-     */
-    if (!node_online(page_to_nid(page)))
-        return false;
-
-    zone = page_zone(page);
-    pfn = page_to_pfn(page);
-    if (!zone_spans_pfn(zone, pfn))
-        return false;
-
-    return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
-    unsigned long end_pfn, pfn;
-
-    end_pfn = min(start_pfn + nr_pages,
-              zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
-    /* Check the starting page of each pageblock within the range */
-    for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
-        if (!is_pageblock_removable_nolock(pfn))
-            return false;
-        cond_resched();
-    }
-
-    /* All pageblocks in the memory block are likely to be hot-removable */
-    return true;
-}
-
-/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
- */
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
-             unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+                  unsigned long end_pfn)
 {
     unsigned long pfn, sec_end_pfn;
-    unsigned long start, end;
     struct zone *zone = NULL;
     struct page *page;
     int i;
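
The hunk above also adds add_memory_driver_managed(), which enforces the "System RAM ($DRIVER)" naming rule from its comment. A hedged usage sketch; the node id, range and driver name are invented, and MHP_NONE is assumed to be the "no special flags" value from the same mhp_t series (upstream callers of this interface include, e.g., dax/kmem and virtio-mem):

/* Sketch: hand a driver-detected range to the buddy as driver-managed RAM. */
static int example_add_range(int nid, u64 start, u64 size)
{
    /*
     * The name must be exactly "System RAM (" + driver + ")" or the call is
     * rejected with -EINVAL; the range is also kept out of
     * /sys/firmware/memmap so kexec will not treat it as boot memory.
     */
    return add_memory_driver_managed(nid, start, size,
                     "System RAM (example_driver)",
                     MHP_NONE);
}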
@@ -1259,33 +1254,30 @@
                 continue;
             /* Check if we got outside of the zone */
             if (zone && !zone_spans_pfn(zone, pfn + i))
-                return 0;
+                return NULL;
             page = pfn_to_page(pfn + i);
             if (zone && page_zone(page) != zone)
-                return 0;
-            if (!zone)
-                start = pfn + i;
+                return NULL;
             zone = page_zone(page);
-            end = pfn + MAX_ORDER_NR_PAGES;
         }
     }

-    if (zone) {
-        *valid_start = start;
-        *valid_end = min(end, end_pfn);
-        return 1;
-    } else {
-        return 0;
-    }
+    return zone;
 }

 /*
  * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). We scan pfn because it's much
- * easier than scanning over linked list. This function returns the pfn
- * of the first found movable page if it's found, otherwise 0.
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ *  0 in case a movable page is found and movable_pfn was updated.
+ *  -ENOENT in case no movable page was found.
+ *  -EBUSY in case a definitely unmovable page was found.
  */
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static int scan_movable_pages(unsigned long start, unsigned long end,
+                  unsigned long *movable_pfn)
 {
     unsigned long pfn;

@@ -1297,68 +1289,55 @@
             continue;
         page = pfn_to_page(pfn);
         if (PageLRU(page))
-            return pfn;
+            goto found;
         if (__PageMovable(page))
-            return pfn;
+            goto found;
+
+        /*
+         * PageOffline() pages that are not marked __PageMovable() and
+         * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+         * definitely unmovable. If their reference count would be 0,
+         * they could at least be skipped when offlining memory.
+         */
+        if (PageOffline(page) && page_count(page))
+            return -EBUSY;

         if (!PageHuge(page))
             continue;
         head = compound_head(page);
-        if (hugepage_migration_supported(page_hstate(head)) &&
-            page_huge_active(head))
-            return pfn;
-        skip = (1 << compound_order(head)) - (page - head);
+        if (page_huge_active(head))
+            goto found;
+        skip = compound_nr(head) - (page - head);
         pfn += skip - 1;
     }
+    return -ENOENT;
+found:
+    *movable_pfn = pfn;
     return 0;
 }

-static struct page *new_node_page(struct page *page, unsigned long private)
-{
-    int nid = page_to_nid(page);
-    nodemask_t nmask = node_states[N_MEMORY];
-
-    /*
-     * try to allocate from a different node but reuse this node if there
-     * are no other online nodes to be used (e.g. we are offlining a part
-     * of the only existing node)
-     */
-    node_clear(nid, nmask);
-    if (nodes_empty(nmask))
-        node_set(nid, nmask);
-
-    return new_page_nodemask(page, nid, &nmask);
-}
-
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
     unsigned long pfn;
-    struct page *page;
-    int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
-    int not_managed = 0;
+    struct page *page, *head;
     int ret = 0;
     LIST_HEAD(source);
+    static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+                      DEFAULT_RATELIMIT_BURST);

-    for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
         if (!pfn_valid(pfn))
             continue;
         page = pfn_to_page(pfn);
+        head = compound_head(page);

         if (PageHuge(page)) {
-            struct page *head = compound_head(page);
-            pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
-            if (compound_order(head) > PFN_SECTION_SHIFT) {
-                ret = -EBUSY;
-                break;
-            }
-            if (isolate_huge_page(page, &source))
-                move_pages -= 1 << compound_order(head);
+            pfn = page_to_pfn(head) + compound_nr(head) - 1;
+            isolate_hugetlb(head, &source);
             continue;
         } else if (PageTransHuge(page))
-            pfn = page_to_pfn(compound_head(page))
-                + hpage_nr_pages(page) - 1;
+            pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;

         /*
          * HWPoison pages have elevated reference counts so the migration would
@@ -1371,7 +1350,7 @@
             if (WARN_ON(PageLRU(page)))
                 isolate_lru_page(page);
             if (page_mapped(page))
-                try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+                try_to_unmap(page, TTU_IGNORE_MLOCK);
             continue;
         }

@@ -1386,98 +1365,60 @@
         else
             ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
         if (!ret) { /* Success */
-            put_page(page);
             list_add_tail(&page->lru, &source);
-            move_pages--;
             if (!__PageMovable(page))
                 inc_node_page_state(page, NR_ISOLATED_ANON +
-                            page_is_file_cache(page));
+                            page_is_file_lru(page));

         } else {
-#ifdef CONFIG_DEBUG_VM
-            pr_alert("failed to isolate pfn %lx\n", pfn);
-            dump_page(page, "isolation failed");
-#endif
-            put_page(page);
-            /* Because we don't have big zone->lock. we should
-               check this again here. */
-            if (page_count(page)) {
-                not_managed++;
-                ret = -EBUSY;
-                break;
+            if (__ratelimit(&migrate_rs)) {
+                pr_warn("failed to isolate pfn %lx\n", pfn);
+                dump_page(page, "isolation failed");
             }
         }
+        put_page(page);
     }
     if (!list_empty(&source)) {
-        if (not_managed) {
+        nodemask_t nmask = node_states[N_MEMORY];
+        struct migration_target_control mtc = {
+            .nmask = &nmask,
+            .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+        };
+
+        /*
+         * We have checked that migration range is on a single zone so
+         * we can use the nid of the first page to all the others.
+         */
+        mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+        /*
+         * try to allocate from a different node but reuse this node
+         * if there are no other online nodes to be used (e.g. we are
+         * offlining a part of the only existing node)
+         */
+        node_clear(mtc.nid, nmask);
+        if (nodes_empty(nmask))
+            node_set(mtc.nid, nmask);
+        ret = migrate_pages(&source, alloc_migration_target, NULL,
+                (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+        if (ret) {
+            list_for_each_entry(page, &source, lru) {
+                if (__ratelimit(&migrate_rs)) {
+                    pr_warn("migrating pfn %lx failed ret:%d\n",
+                        page_to_pfn(page), ret);
+                    dump_page(page, "migration failure");
+                }
+            }
             putback_movable_pages(&source);
-            goto out;
         }
-
-        /* Allocate a new page from the nearest neighbor node */
-        ret = migrate_pages(&source, new_node_page, NULL, 0,
-                    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
-        if (ret)
-            putback_movable_pages(&source);
     }
-out:
+
     return ret;
-}
-
-/*
- * remove from free_area[] and mark all as Reserved.
- */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
-            void *data)
-{
-    __offline_isolated_pages(start, start + nr_pages);
-    return 0;
-}
-
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
-    walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
-                offline_isolated_pages_cb);
-}
-
-/*
- * Check all pages in range, recoreded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
-            void *data)
-{
-    int ret;
-    long offlined = *(long *)data;
-    ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
-    offlined = nr_pages;
-    if (!ret)
-        *(long *)data += offlined;
-    return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
-    long offlined = 0;
-    int ret;
-
-    ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
-                check_pages_isolated_cb);
-    if (ret < 0)
-        offlined = (long)ret;
-    return offlined;
 }

 static int __init cmdline_parse_movable_node(char *p)
 {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
     movable_node_enabled = true;
-#else
-    pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
     return 0;
 }
 early_param("movable_node", cmdline_parse_movable_node);
....@@ -1488,75 +1429,53 @@
14881429 {
14891430 struct pglist_data *pgdat = zone->zone_pgdat;
14901431 unsigned long present_pages = 0;
1491
- enum zone_type zt, zone_last = ZONE_NORMAL;
1432
+ enum zone_type zt;
1433
+
1434
+ arg->status_change_nid = NUMA_NO_NODE;
1435
+ arg->status_change_nid_normal = NUMA_NO_NODE;
1436
+ arg->status_change_nid_high = NUMA_NO_NODE;
14921437
14931438 /*
1494
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1495
- * contains nodes which have zones of 0...ZONE_NORMAL,
1496
- * set zone_last to ZONE_NORMAL.
1497
- *
1498
- * If we don't have HIGHMEM nor movable node,
1499
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1500
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1439
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
1440
+ * If the memory to be offline is within the range
1441
+ * [0..ZONE_NORMAL], and it is the last present memory there,
1442
+ * the zones in that range will become empty after the offlining,
1443
+ * thus we can determine that we need to clear the node from
1444
+ * node_states[N_NORMAL_MEMORY].
15011445 */
1502
- if (N_MEMORY == N_NORMAL_MEMORY)
1503
- zone_last = ZONE_MOVABLE;
1504
-
1505
- /*
1506
- * check whether node_states[N_NORMAL_MEMORY] will be changed.
1507
- * If the memory to be offline is in a zone of 0...zone_last,
1508
- * and it is the last present memory, 0...zone_last will
1509
- * become empty after offline , thus we can determind we will
1510
- * need to clear the node from node_states[N_NORMAL_MEMORY].
1511
- */
1512
- for (zt = 0; zt <= zone_last; zt++)
1446
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
15131447 present_pages += pgdat->node_zones[zt].present_pages;
1514
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1448
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
15151449 arg->status_change_nid_normal = zone_to_nid(zone);
1516
- else
1517
- arg->status_change_nid_normal = -1;
15181450
15191451 #ifdef CONFIG_HIGHMEM
15201452 /*
1521
- * If we have movable node, node_states[N_HIGH_MEMORY]
1522
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
1523
- * set zone_last to ZONE_HIGHMEM.
1524
- *
1525
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1526
- * contains nodes which have zones of 0...ZONE_MOVABLE,
1527
- * set zone_last to ZONE_MOVABLE.
1453
+ * node_states[N_HIGH_MEMORY] contains nodes which
1454
+ * have normal memory or high memory.
1455
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
1456
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and
1457
+ * we determine that the zones in that range become empty,
1458
+ * we need to clear the node for N_HIGH_MEMORY.
15281459 */
1529
- zone_last = ZONE_HIGHMEM;
1530
- if (N_MEMORY == N_HIGH_MEMORY)
1531
- zone_last = ZONE_MOVABLE;
1532
-
1533
- for (; zt <= zone_last; zt++)
1534
- present_pages += pgdat->node_zones[zt].present_pages;
1535
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1460
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1461
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
15361462 arg->status_change_nid_high = zone_to_nid(zone);
1537
- else
1538
- arg->status_change_nid_high = -1;
1539
-#else
1540
- arg->status_change_nid_high = arg->status_change_nid_normal;
15411463 #endif
15421464
15431465 /*
1544
- * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1466
+ * We have accounted the pages from [0..ZONE_NORMAL), and
1467
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
1468
+ * as well.
1469
+ * Here we count the possible pages from ZONE_MOVABLE.
1470
+ * If after having accounted all the pages, we see that the nr_pages
1471
+ * to be offlined is over or equal to the accounted pages,
1472
+ * we know that the node will become empty, and so, we can clear
1473
+ * it for N_MEMORY as well.
15451474 */
1546
- zone_last = ZONE_MOVABLE;
1475
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
15471476
1548
- /*
1549
- * check whether node_states[N_HIGH_MEMORY] will be changed
1550
- * If we try to offline the last present @nr_pages from the node,
1551
- * we can determind we will need to clear the node from
1552
- * node_states[N_HIGH_MEMORY].
1553
- */
1554
- for (; zt <= zone_last; zt++)
1555
- present_pages += pgdat->node_zones[zt].present_pages;
15561477 if (nr_pages >= present_pages)
15571478 arg->status_change_nid = zone_to_nid(zone);
1558
- else
1559
- arg->status_change_nid = -1;
15601479 }
15611480
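The rewritten accounting above boils down to one cumulative check per node state: sum present_pages over a zone range of the node and see whether the pages being offlined are the last ones there. A minimal sketch of that check, using a hypothetical helper name that is not part of this file (the real code additionally requires the offlined zone itself to fall inside the range):

static bool node_range_becomes_empty(pg_data_t *pgdat, enum zone_type max_zone,
				     unsigned long nr_pages)
{
	unsigned long present_pages = 0;
	enum zone_type zt;

	/* Pages still present in zones 0..max_zone of this node. */
	for (zt = 0; zt <= max_zone; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/* Offlining nr_pages would leave the whole range empty. */
	return nr_pages >= present_pages;
}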
15621481 static void node_states_clear_node(int node, struct memory_notify *arg)
....@@ -1564,53 +1483,76 @@
15641483 if (arg->status_change_nid_normal >= 0)
15651484 node_clear_state(node, N_NORMAL_MEMORY);
15661485
1567
- if ((N_MEMORY != N_NORMAL_MEMORY) &&
1568
- (arg->status_change_nid_high >= 0))
1486
+ if (arg->status_change_nid_high >= 0)
15691487 node_clear_state(node, N_HIGH_MEMORY);
15701488
1571
- if ((N_MEMORY != N_HIGH_MEMORY) &&
1572
- (arg->status_change_nid >= 0))
1489
+ if (arg->status_change_nid >= 0)
15731490 node_clear_state(node, N_MEMORY);
15741491 }
15751492
1576
-static int __ref __offline_pages(unsigned long start_pfn,
1577
- unsigned long end_pfn)
1493
+static int count_system_ram_pages_cb(unsigned long start_pfn,
1494
+ unsigned long nr_pages, void *data)
15781495 {
1579
- unsigned long pfn, nr_pages;
1580
- long offlined_pages;
1581
- int ret, node;
1496
+ unsigned long *nr_system_ram_pages = data;
1497
+
1498
+ *nr_system_ram_pages += nr_pages;
1499
+ return 0;
1500
+}
1501
+
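count_system_ram_pages_cb() simply accumulates the lengths of the walked System RAM ranges; offline_pages() below compares that sum against nr_pages to reject ranges containing holes. A hypothetical wrapper (not in the file) showing the same pattern:

/* Sketch only: true iff every pfn in the range is backed by System RAM. */
static bool range_is_hole_free(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long present = 0;

	walk_system_ram_range(start_pfn, nr_pages, &present,
			      count_system_ram_pages_cb);
	return present == nr_pages;
}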
1502
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1503
+{
1504
+ const unsigned long end_pfn = start_pfn + nr_pages;
1505
+ unsigned long pfn, system_ram_pages = 0;
15821506 unsigned long flags;
1583
- unsigned long valid_start, valid_end;
15841507 struct zone *zone;
15851508 struct memory_notify arg;
1509
+ int ret, node;
1510
+ char *reason;
15861511
1587
- /* at least, alignment against pageblock is necessary */
1588
- if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1589
- return -EINVAL;
1590
- if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1512
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
1513
+ if (WARN_ON_ONCE(!nr_pages ||
1514
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
15911515 return -EINVAL;
15921516
15931517 mem_hotplug_begin();
15941518
1519
+ /*
1520
+ * Don't allow to offline memory blocks that contain holes.
1521
+ * Consequently, memory blocks with holes can never get onlined
1522
+ * via the hotplug path - online_pages() - as hotplugged memory has
1523
+ * no holes. This way, we e.g., don't have to worry about marking
1524
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
1525
+ * avoid using walk_system_ram_range() later.
1526
+ */
1527
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
1528
+ count_system_ram_pages_cb);
1529
+ if (system_ram_pages != nr_pages) {
1530
+ ret = -EINVAL;
1531
+ reason = "memory holes";
1532
+ goto failed_removal;
1533
+ }
1534
+
15951535 /* This makes hotplug much easier...and readable.
15961536 we assume this for now. .*/
1597
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
1598
- &valid_end)) {
1599
- mem_hotplug_done();
1600
- return -EINVAL;
1537
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
1538
+ if (!zone) {
1539
+ ret = -EINVAL;
1540
+ reason = "multizone range";
1541
+ goto failed_removal;
16011542 }
1602
-
1603
- zone = page_zone(pfn_to_page(valid_start));
16041543 node = zone_to_nid(zone);
1605
- nr_pages = end_pfn - start_pfn;
16061544
1545
+ lru_cache_disable();
16071546 /* set above range as isolated */
16081547 ret = start_isolate_page_range(start_pfn, end_pfn,
1609
- MIGRATE_MOVABLE, true);
1548
+ MIGRATE_MOVABLE,
1549
+ MEMORY_OFFLINE | REPORT_FAILURE, NULL);
16101550 if (ret) {
1611
- mem_hotplug_done();
1612
- return ret;
1551
+ reason = "failure to isolate range";
1552
+ goto failed_removal_lru_cache_disabled;
16131553 }
1554
+
1555
+ drain_all_pages(zone);
16141556
16151557 arg.start_pfn = start_pfn;
16161558 arg.nr_pages = nr_pages;
....@@ -1618,49 +1560,84 @@
16181560
16191561 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
16201562 ret = notifier_to_errno(ret);
1621
- if (ret)
1622
- goto failed_removal;
1623
-
1624
- pfn = start_pfn;
1625
-repeat:
1626
- /* start memory hot removal */
1627
- ret = -EINTR;
1628
- if (signal_pending(current))
1629
- goto failed_removal;
1630
-
1631
- cond_resched();
1632
- lru_add_drain_all();
1633
- drain_all_pages(zone);
1634
-
1635
- pfn = scan_movable_pages(start_pfn, end_pfn);
1636
- if (pfn) { /* We have movable pages */
1637
- ret = do_migrate_range(pfn, end_pfn);
1638
- goto repeat;
1563
+ if (ret) {
1564
+ reason = "notifier failure";
1565
+ goto failed_removal_isolated;
16391566 }
16401567
1568
+ do {
1569
+ pfn = start_pfn;
1570
+ do {
1571
+ if (signal_pending(current)) {
1572
+ ret = -EINTR;
1573
+ reason = "signal backoff";
1574
+ goto failed_removal_isolated;
1575
+ }
1576
+
1577
+ cond_resched();
1578
+
1579
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
1580
+ if (!ret) {
1581
+ /*
1582
+ * TODO: fatal migration failures should bail
1583
+ * out
1584
+ */
1585
+ do_migrate_range(pfn, end_pfn);
1586
+ }
1587
+ } while (!ret);
1588
+
1589
+ if (ret != -ENOENT) {
1590
+ reason = "unmovable page";
1591
+ goto failed_removal_isolated;
1592
+ }
1593
+
1594
+ /*
1595
+ * Dissolve free hugepages in the memory block before actually
1596
+ * offlining, in order to keep hugetlbfs's object
1597
+ * counting consistent.
1598
+ */
1599
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1600
+ if (ret) {
1601
+ reason = "failure to dissolve huge pages";
1602
+ goto failed_removal_isolated;
1603
+ }
1604
+
1605
+ /*
1606
+ * per-cpu pages are drained after start_isolate_page_range, but
1607
+ * if there are still pages that are not free, make sure that we
1608
+ * drain again, because when we isolated range we might have
1609
+ * raced with another thread that was adding pages to pcp list.
1610
+ *
1611
+ * Forward progress should be still guaranteed because
1612
+ * pages on the pcp list can only belong to MOVABLE_ZONE
1613
+ * because has_unmovable_pages explicitly checks for
1614
+ * PageBuddy on freed pages on other zones.
1615
+ */
1616
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL);
1617
+ if (ret)
1618
+ drain_all_pages(zone);
1619
+ } while (ret);
1620
+
1621
+ /* Mark all sections offline and remove free pages from the buddy. */
1622
+ __offline_isolated_pages(start_pfn, end_pfn);
1623
+ pr_info("Offlined Pages %ld\n", nr_pages);
1624
+
16411625 /*
1642
- * dissolve free hugepages in the memory block before doing offlining
1643
- * actually in order to make hugetlbfs's object counting consistent.
1626
+ * The memory sections are marked offline, and the pageblock flags
1627
+ * effectively stale; nobody should be touching them. Fixup the number
1628
+ * of isolated pageblocks, memory onlining will properly revert this.
16441629 */
1645
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1646
- if (ret)
1647
- goto failed_removal;
1648
- /* check again */
1649
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1650
- if (offlined_pages < 0)
1651
- goto repeat;
1652
- pr_info("Offlined Pages %ld\n", offlined_pages);
1653
- /* Ok, all of our target is isolated.
1654
- We cannot do rollback at this point. */
1655
- offline_isolated_pages(start_pfn, end_pfn);
1656
- /* reset pagetype flags and makes migrate type to be MOVABLE */
1657
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1630
+ spin_lock_irqsave(&zone->lock, flags);
1631
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
1632
+ spin_unlock_irqrestore(&zone->lock, flags);
1633
+
1634
+ lru_cache_enable();
16581635 /* removal success */
1659
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1660
- zone->present_pages -= offlined_pages;
1636
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
1637
+ zone->present_pages -= nr_pages;
16611638
16621639 pgdat_resize_lock(zone->zone_pgdat, &flags);
1663
- zone->zone_pgdat->node_present_pages -= offlined_pages;
1640
+ zone->zone_pgdat->node_present_pages -= nr_pages;
16641641 pgdat_resize_unlock(zone->zone_pgdat, &flags);
16651642
16661643 init_per_zone_wmark_min();
....@@ -1677,7 +1654,6 @@
16771654 kcompactd_stop(node);
16781655 }
16791656
1680
- vm_total_pages = nr_free_pagecache_pages();
16811657 writeback_set_ratelimit();
16821658
16831659 memory_notify(MEM_OFFLINE, &arg);
....@@ -1685,73 +1661,21 @@
16851661 mem_hotplug_done();
16861662 return 0;
16871663
1688
-failed_removal:
1689
- pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1690
- (unsigned long long) start_pfn << PAGE_SHIFT,
1691
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1692
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
1693
- /* pushback to free area */
1664
+failed_removal_isolated:
16941665 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1666
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
1667
+failed_removal_lru_cache_disabled:
1668
+ lru_cache_enable();
1669
+failed_removal:
1670
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
1671
+ (unsigned long long) start_pfn << PAGE_SHIFT,
1672
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
1673
+ reason);
1674
+ /* pushback to free area */
16951675 mem_hotplug_done();
16961676 return ret;
16971677 }
16981678
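With the old __offline_pages() wrapper removed, offline_pages() takes (start_pfn, nr_pages) directly and only accepts non-empty, fully section-aligned ranges. An illustrative caller with a hypothetical name; in-tree the real caller is the memory block device layer, which already guarantees this alignment:

/* Sketch only: offline one section worth of memory. */
static int example_offline_section(unsigned long start_pfn)
{
	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
		return -EINVAL;

	/* 0 on success, negative errno (e.g. -EINTR, -EINVAL) on failure. */
	return offline_pages(start_pfn, PAGES_PER_SECTION);
}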
1699
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1700
-{
1701
- return __offline_pages(start_pfn, start_pfn + nr_pages);
1702
-}
1703
-#endif /* CONFIG_MEMORY_HOTREMOVE */
1704
-
1705
-/**
1706
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1707
- * @start_pfn: start pfn of the memory range
1708
- * @end_pfn: end pfn of the memory range
1709
- * @arg: argument passed to func
1710
- * @func: callback for each memory section walked
1711
- *
1712
- * This function walks through all present mem sections in range
1713
- * [start_pfn, end_pfn) and call func on each mem section.
1714
- *
1715
- * Returns the return value of func.
1716
- */
1717
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1718
- void *arg, int (*func)(struct memory_block *, void *))
1719
-{
1720
- struct memory_block *mem = NULL;
1721
- struct mem_section *section;
1722
- unsigned long pfn, section_nr;
1723
- int ret;
1724
-
1725
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1726
- section_nr = pfn_to_section_nr(pfn);
1727
- if (!present_section_nr(section_nr))
1728
- continue;
1729
-
1730
- section = __nr_to_section(section_nr);
1731
- /* same memblock? */
1732
- if (mem)
1733
- if ((section_nr >= mem->start_section_nr) &&
1734
- (section_nr <= mem->end_section_nr))
1735
- continue;
1736
-
1737
- mem = find_memory_block_hinted(section, mem);
1738
- if (!mem)
1739
- continue;
1740
-
1741
- ret = func(mem, arg);
1742
- if (ret) {
1743
- kobject_put(&mem->dev.kobj);
1744
- return ret;
1745
- }
1746
- }
1747
-
1748
- if (mem)
1749
- kobject_put(&mem->dev.kobj);
1750
-
1751
- return 0;
1752
-}
1753
-
1754
-#ifdef CONFIG_MEMORY_HOTREMOVE
17551679 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
17561680 {
17571681 int ret = !is_memblock_offlined(mem);
....@@ -1760,12 +1684,13 @@
17601684 phys_addr_t beginpa, endpa;
17611685
17621686 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1763
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1687
+ endpa = beginpa + memory_block_size_bytes() - 1;
17641688 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
17651689 &beginpa, &endpa);
1766
- }
17671690
1768
- return ret;
1691
+ return -EBUSY;
1692
+ }
1693
+ return 0;
17691694 }
17701695
17711696 static int check_cpu_on_node(pg_data_t *pgdat)
....@@ -1781,34 +1706,6 @@
17811706 return -EBUSY;
17821707 }
17831708
1784
- return 0;
1785
-}
1786
-
1787
-static void unmap_cpu_on_node(pg_data_t *pgdat)
1788
-{
1789
-#ifdef CONFIG_ACPI_NUMA
1790
- int cpu;
1791
-
1792
- for_each_possible_cpu(cpu)
1793
- if (cpu_to_node(cpu) == pgdat->node_id)
1794
- numa_clear_node(cpu);
1795
-#endif
1796
-}
1797
-
1798
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1799
-{
1800
- int ret;
1801
-
1802
- ret = check_cpu_on_node(pgdat);
1803
- if (ret)
1804
- return ret;
1805
-
1806
- /*
1807
- * the node will be offlined when we come here, so we can clear
1808
- * the cpu_to_node() now.
1809
- */
1810
-
1811
- unmap_cpu_on_node(pgdat);
18121709 return 0;
18131710 }
18141711
....@@ -1855,7 +1752,7 @@
18551752 if (rc)
18561753 return;
18571754
1858
- if (check_and_unmap_cpu_on_node(pgdat))
1755
+ if (check_cpu_on_node(pgdat))
18591756 return;
18601757
18611758 /*
....@@ -1867,24 +1764,45 @@
18671764 }
18681765 EXPORT_SYMBOL(try_offline_node);
18691766
1870
-static void __release_memory_resource(resource_size_t start,
1871
- resource_size_t size)
1767
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
18721768 {
1873
- int ret;
1769
+ int rc = 0;
1770
+
1771
+ BUG_ON(check_hotplug_memory_range(start, size));
18741772
18751773 /*
1876
- * When removing memory in the same granularity as it was added,
1877
- * this function never fails. It might only fail if resources
1878
- * have to be adjusted or split. We'll ignore the error, as
1879
- * removing of memory cannot fail.
1774
+ * All memory blocks must be offlined before removing memory. Check
1775
+ * whether all memory blocks in question are offline and return error
1776
+ * if this is not the case.
18801777 */
1881
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
1882
- if (ret) {
1883
- resource_size_t endres = start + size - 1;
1778
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
1779
+ if (rc)
1780
+ return rc;
18841781
1885
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
1886
- &start, &endres, ret);
1782
+ /* remove memmap entry */
1783
+ firmware_map_remove(start, start + size, "System RAM");
1784
+
1785
+ /*
1786
+ * Memory block device removal under the device_hotplug_lock is
1787
+ * a barrier against racing online attempts.
1788
+ */
1789
+ remove_memory_block_devices(start, size);
1790
+
1791
+ mem_hotplug_begin();
1792
+
1793
+ arch_remove_memory(nid, start, size, NULL);
1794
+
1795
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
1796
+ memblock_free(start, size);
1797
+ memblock_remove(start, size);
18871798 }
1799
+
1800
+ release_mem_region_adjustable(start, size);
1801
+
1802
+ try_offline_node(nid);
1803
+
1804
+ mem_hotplug_done();
1805
+ return 0;
18881806 }
18891807
18901808 /**
....@@ -1897,48 +1815,163 @@
18971815 * and online/offline operations before this call, as required by
18981816 * try_offline_node().
18991817 */
1900
-void __ref __remove_memory(int nid, u64 start, u64 size)
1818
+void __remove_memory(int nid, u64 start, u64 size)
19011819 {
1902
- int ret;
1903
-
1904
- BUG_ON(check_hotplug_memory_range(start, size));
19051820
19061821 /*
1907
- * All memory blocks must be offlined before removing memory. Check
1908
- * whether all memory blocks in question are offline and trigger a BUG()
1909
- * if this is not the case.
1822
+ * trigger BUG() if some memory is not offlined prior to calling this
1823
+ * function
19101824 */
1911
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1912
- check_memblock_offlined_cb);
1913
- if (ret)
1825
+ if (try_remove_memory(nid, start, size))
19141826 BUG();
1915
-
1916
- /* remove memmap entry */
1917
- firmware_map_remove(start, start + size, "System RAM");
1918
- memblock_free(start, size);
1919
- memblock_remove(start, size);
1920
-
1921
- /*
1922
- * Memory block device removal under the device_hotplug_lock is
1923
- * a barrier against racing online attempts.
1924
- */
1925
- remove_memory_block_devices(start, size);
1926
-
1927
- mem_hotplug_begin();
1928
-
1929
- arch_remove_memory(nid, start, size, NULL);
1930
- __release_memory_resource(start, size);
1931
-
1932
- try_offline_node(nid);
1933
-
1934
- mem_hotplug_done();
19351827 }
19361828
1937
-void remove_memory(int nid, u64 start, u64 size)
1829
+/*
1830
+ * Remove memory if every memory block is offline, otherwise return -EBUSY is
1831
+ * some memory is not offline
1832
+ */
1833
+int remove_memory(int nid, u64 start, u64 size)
19381834 {
1835
+ int rc;
1836
+
19391837 lock_device_hotplug();
1940
- __remove_memory(nid, start, size);
1838
+ rc = try_remove_memory(nid, start, size);
19411839 unlock_device_hotplug();
1840
+
1841
+ return rc;
19421842 }
19431843 EXPORT_SYMBOL_GPL(remove_memory);
1844
+
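remove_memory() is the self-locking variant: it takes device_hotplug_lock around try_remove_memory() and returns -EBUSY if any memory block is still online, whereas __remove_memory() BUG()s on failure and, per its kerneldoc above, expects the caller to have serialized hotplug operations already. A sketch of a driver-side caller, assuming a hypothetical function name:

/* Sketch only: tear down a previously hot-added, fully offlined range. */
static int example_remove(int nid, u64 start, u64 size)
{
	int rc;

	rc = remove_memory(nid, start, size);
	if (rc)
		pr_warn("removing memory %#llx-%#llx failed: %d\n",
			start, start + size - 1, rc);
	return rc;
}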
1845
+int remove_memory_subsection(int nid, u64 start, u64 size)
1846
+{
1847
+ if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
1848
+ !IS_ALIGNED(size, SUBSECTION_SIZE)) {
1849
+ pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
1850
+ __func__, start, size);
1851
+ return -EINVAL;
1852
+ }
1853
+
1854
+ mem_hotplug_begin();
1855
+ arch_remove_memory(nid, start, size, NULL);
1856
+
1857
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
1858
+ memblock_remove(start, size);
1859
+
1860
+ release_mem_region_adjustable(start, size);
1861
+
1862
+ mem_hotplug_done();
1863
+
1864
+ return 0;
1865
+}
1866
+EXPORT_SYMBOL_GPL(remove_memory_subsection);
1867
+
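remove_memory_subsection() relaxes the removal granularity from whole memory blocks to sub-sections, but the alignment contract stays explicit. An illustrative caller with a hypothetical name, assuming the range was added at the same sub-section granularity:

/* Sketch only: remove a sub-section-aligned range added earlier. */
static int example_remove_subsection(int nid, u64 start, u64 size)
{
	if (!IS_ALIGNED(start, SUBSECTION_SIZE) ||
	    !IS_ALIGNED(size, SUBSECTION_SIZE))
		return -EINVAL;

	return remove_memory_subsection(nid, start, size);
}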
1868
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
1869
+{
1870
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
1871
+ uint8_t **online_types = arg;
1872
+ struct page *page;
1873
+ int rc;
1874
+
1875
+ /*
1876
+ * Sense the online_type via the zone of the memory block. Offlining
1877
+ * with multiple zones within one memory block will be rejected
1878
+ * by offlining code ... so we don't care about that.
1879
+ */
1880
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
1881
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
1882
+ online_type = MMOP_ONLINE_MOVABLE;
1883
+
1884
+ rc = device_offline(&mem->dev);
1885
+ /*
1886
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
1887
+ * so try_reonline_memory_block() can do the right thing.
1888
+ */
1889
+ if (!rc)
1890
+ **online_types = online_type;
1891
+
1892
+ (*online_types)++;
1893
+ /* Ignore if already offline. */
1894
+ return rc < 0 ? rc : 0;
1895
+}
1896
+
1897
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
1898
+{
1899
+ uint8_t **online_types = arg;
1900
+ int rc;
1901
+
1902
+ if (**online_types != MMOP_OFFLINE) {
1903
+ mem->online_type = **online_types;
1904
+ rc = device_online(&mem->dev);
1905
+ if (rc < 0)
1906
+ pr_warn("%s: Failed to re-online memory: %d",
1907
+ __func__, rc);
1908
+ }
1909
+
1910
+ /* Continue processing all remaining memory blocks. */
1911
+ (*online_types)++;
1912
+ return 0;
1913
+}
1914
+
1915
+/*
1916
+ * Try to offline and remove memory. Might take a long time to finish in case
1917
+ * memory is still in use. Primarily useful for memory devices that logically
1918
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
1919
+ * that memory.
1920
+ */
1921
+int offline_and_remove_memory(int nid, u64 start, u64 size)
1922
+{
1923
+ const unsigned long mb_count = size / memory_block_size_bytes();
1924
+ uint8_t *online_types, *tmp;
1925
+ int rc;
1926
+
1927
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
1928
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
1929
+ return -EINVAL;
1930
+
1931
+ /*
1932
+ * We'll remember the old online type of each memory block, so we can
1933
+ * try to revert whatever we did when offlining one memory block fails
1934
+ * after offlining some others succeeded.
1935
+ */
1936
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
1937
+ GFP_KERNEL);
1938
+ if (!online_types)
1939
+ return -ENOMEM;
1940
+ /*
1941
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
1942
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
1943
+ * try_reonline_memory_block().
1944
+ */
1945
+ memset(online_types, MMOP_OFFLINE, mb_count);
1946
+
1947
+ lock_device_hotplug();
1948
+
1949
+ tmp = online_types;
1950
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
1951
+
1952
+ /*
1953
+ * In case we succeeded to offline all memory, remove it.
1954
+ * This cannot fail as it cannot get onlined in the meantime.
1955
+ */
1956
+ if (!rc) {
1957
+ rc = try_remove_memory(nid, start, size);
1958
+ if (rc)
1959
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
1960
+ }
1961
+
1962
+ /*
1963
+ * Rollback what we did. While memory onlining might theoretically fail
1964
+ * (nacked by a notifier), it barely ever happens.
1965
+ */
1966
+ if (rc) {
1967
+ tmp = online_types;
1968
+ walk_memory_blocks(start, size, &tmp,
1969
+ try_reonline_memory_block);
1970
+ }
1971
+ unlock_device_hotplug();
1972
+
1973
+ kfree(online_types);
1974
+ return rc;
1975
+}
1976
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
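offline_and_remove_memory() bundles offlining and removal for drivers that have already logically unplugged a block-aligned range; if any block fails to offline, the blocks that did go down are re-onlined with the types remembered in online_types. A hypothetical driver-side wrapper for a single memory block:

/* Sketch only: hand one fully unused memory block back to the system. */
static int example_unplug_one_block(int nid, u64 addr)
{
	const u64 block_size = memory_block_size_bytes();

	if (!IS_ALIGNED(addr, block_size))
		return -EINVAL;

	return offline_and_remove_memory(nid, addr, block_size);
}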
19441977 #endif /* CONFIG_MEMORY_HOTREMOVE */