2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/mm/sparse.c
@@ -5,17 +5,17 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include "internal.h"
 #include <asm/dma.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
 
 /*
  * Permanent SPARSEMEM data:
@@ -65,10 +65,15 @@
         unsigned long array_size = SECTIONS_PER_ROOT *
                                    sizeof(struct mem_section);
 
-        if (slab_is_available())
+        if (slab_is_available()) {
                 section = kzalloc_node(array_size, GFP_KERNEL, nid);
-        else
-                section = memblock_virt_alloc_node(array_size, nid);
+        } else {
+                section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
+                                              nid);
+                if (!section)
+                        panic("%s: Failed to allocate %lu bytes nid=%d\n",
+                              __func__, array_size, nid);
+        }
 
         return section;
 }
@@ -78,8 +83,15 @@
         unsigned long root = SECTION_NR_TO_ROOT(section_nr);
         struct mem_section *section;
 
+        /*
+         * An existing section is possible in the sub-section hotplug
+         * case. First hot-add instantiates, follow-on hot-add reuses
+         * the existing section.
+         *
+         * The mem_hotplug_lock resolves the apparent race below.
+         */
         if (mem_section[root])
-                return -EEXIST;
+                return 0;
 
         section = sparse_index_alloc(nid);
         if (!section)
@@ -97,7 +109,7 @@
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
         unsigned long root_nr;
         struct mem_section *root = NULL;
@@ -116,9 +128,9 @@
         return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 #else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
-        return (int)(ms - mem_section[0]);
+        return (unsigned long)(ms - mem_section[0]);
 }
 #endif
 
@@ -173,10 +185,10 @@
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  */
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
 static void section_mark_present(struct mem_section *ms)
 {
-        int section_nr = __section_nr(ms);
+        unsigned long section_nr = __section_nr(ms);
 
         if (section_nr > __highest_present_section_nr)
                 __highest_present_section_nr = section_nr;
@@ -184,16 +196,6 @@
         ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-static inline int next_present_section_nr(int section_nr)
-{
-        do {
-                section_nr++;
-                if (present_section_nr(section_nr))
-                        return section_nr;
-        } while ((section_nr <= __highest_present_section_nr));
-
-        return -1;
-}
 #define for_each_present_section_nr(start, section_nr)         \
         for (section_nr = next_present_section_nr(start-1);    \
              ((section_nr != -1) &&                             \
@@ -205,8 +207,49 @@
         return next_present_section_nr(-1);
 }
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
+                unsigned long nr_pages)
+{
+        int idx = subsection_map_index(pfn);
+        int end = subsection_map_index(pfn + nr_pages - 1);
+
+        bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+        int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+        unsigned long nr, start_sec = pfn_to_section_nr(pfn);
+
+        if (!nr_pages)
+                return;
+
+        for (nr = start_sec; nr <= end_sec; nr++) {
+                struct mem_section *ms;
+                unsigned long pfns;
+
+                pfns = min(nr_pages, PAGES_PER_SECTION
+                                - (pfn & ~PAGE_SECTION_MASK));
+                ms = __nr_to_section(nr);
+                subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+                pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+                                pfns, subsection_map_index(pfn),
+                                subsection_map_index(pfn + pfns - 1));
+
+                pfn += pfns;
+                nr_pages -= pfns;
+        }
+}
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
+
 /* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
         unsigned long pfn;
 
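The new subsection bookkeeping above is pure index arithmetic inside one section. A standalone sketch (not kernel code), assuming the common x86_64 layout of 128 MiB sections, 2 MiB subsections and 4 KiB pages (32768 pages per section, 512 per subsection, 64 subsections per section):

/* Sketch of the subsection index math used by subsection_mask_set() above. */
#include <stdio.h>

#define PAGES_PER_SECTION     (1UL << (27 - 12))   /* 32768 pages, 128 MiB */
#define PAGES_PER_SUBSECTION  (1UL << (21 - 12))   /* 512 pages, 2 MiB     */
#define PAGE_SECTION_MASK     (~(PAGES_PER_SECTION - 1))

static unsigned long subsection_index(unsigned long pfn)
{
        /* offset of the pfn inside its section, in units of subsections */
        return (pfn & ~PAGE_SECTION_MASK) / PAGES_PER_SUBSECTION;
}

int main(void)
{
        /* a 16 MiB (4096-page) hot-added range starting 2 MiB into a section */
        unsigned long pfn = 0x40000 + 512, nr_pages = 4096;
        unsigned long start = subsection_index(pfn);
        unsigned long end = subsection_index(pfn + nr_pages - 1);

        /* subsection_mask_set() would set bits start..end in the map */
        printf("set bits %lu..%lu (%lu subsections)\n", start, end, end - start + 1);
        return 0;
}

Compiled with any C compiler this prints "set bits 1..8 (8 subsections)": a 16 MiB hot-add that begins one subsection into a section touches exactly eight subsection bits.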
@@ -216,7 +259,10 @@
 
         size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
         align = 1 << (INTERNODE_CACHE_SHIFT);
-        mem_section = memblock_virt_alloc(size, align);
+        mem_section = memblock_alloc(size, align);
+        if (!mem_section)
+                panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+                      __func__, size, align);
 }
 #endif
 
@@ -239,6 +285,20 @@
 }
 
 /*
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the systems
+ * memory as present during initialization.
+ */
+static void __init memblocks_present(void)
+{
+        unsigned long start, end;
+        int i, nid;
+
+        for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+                memory_present(nid, start, end);
+}
+
+/*
  * Subtle, we encode the real pfn into the mem_map such that
  * the identity pfn - section_mem_map will return the actual
  * physical page frame number.
@@ -252,6 +312,7 @@
         return coded_mem_map;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Decode mem_map from the coded memmap
  */
@@ -261,36 +322,35 @@
         coded_mem_map &= SECTION_MAP_MASK;
         return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
 }
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __meminit sparse_init_one_section(struct mem_section *ms,
                 unsigned long pnum, struct page *mem_map,
-                unsigned long *pageblock_bitmap)
+                struct mem_section_usage *usage, unsigned long flags)
 {
         ms->section_mem_map &= ~SECTION_MAP_MASK;
-        ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
-                                                        SECTION_HAS_MEM_MAP;
-        ms->pageblock_flags = pageblock_bitmap;
+        ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+                | SECTION_HAS_MEM_MAP | flags;
+        ms->usage = usage;
 }
 
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
 {
         return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
 {
-        return kmalloc(usemap_size(), GFP_KERNEL);
+        return sizeof(struct mem_section_usage) + usemap_size();
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                          unsigned long size)
 {
+        struct mem_section_usage *usage;
         unsigned long goal, limit;
-        unsigned long *p;
         int nid;
         /*
          * A page may contain usemaps for other sections preventing the
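mem_section_usage_size() above sizes one combined allocation: the fixed head of struct mem_section_usage (the subsection bitmap) plus the variable pageblock bitmap that usemap_size() reports. An illustrative userspace mock (not the kernel definition), with sizes that assume 64 subsections per section and a 256-bit pageblock bitmap, as on a typical x86_64 config:

/* Mock of how the usage allocation is composed; constants are assumptions. */
#include <stdio.h>

#define SUBSECTIONS_PER_SECTION   64
#define SECTION_BLOCKFLAGS_BITS   256
#define BITS_TO_LONGS(n)          (((n) + 63) / 64)

struct mem_section_usage_mock {
        unsigned long subsection_map[BITS_TO_LONGS(SUBSECTIONS_PER_SECTION)];
        unsigned long pageblock_flags[];   /* usemap_size() bytes follow */
};

int main(void)
{
        size_t usemap = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
        size_t total = sizeof(struct mem_section_usage_mock) + usemap;

        printf("usemap_size() = %zu, mem_section_usage_size() = %zu\n", usemap, total);
        return 0;
}

With those assumptions this prints usemap_size() = 32 and mem_section_usage_size() = 40, which is why the boot path can simply step through one large allocation in mem_section_usage_size() strides.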
@@ -306,17 +366,16 @@
         limit = goal + (1UL << PA_SECTION_SHIFT);
         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-        p = memblock_virt_alloc_try_nid_nopanic(size,
-                                                SMP_CACHE_BYTES, goal, limit,
-                                                nid);
-        if (!p && limit) {
+        usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+        if (!usage && limit) {
                 limit = 0;
                 goto again;
         }
-        return p;
+        return usage;
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+                struct mem_section_usage *usage)
 {
         unsigned long usemap_snr, pgdat_snr;
         static unsigned long old_usemap_snr;
@@ -330,7 +389,7 @@
                 old_pgdat_snr = NR_MEM_SECTIONS;
         }
 
-        usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+        usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
         pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
         if (usemap_snr == pgdat_snr)
                 return;
@@ -358,14 +417,15 @@
                 usemap_snr, pgdat_snr, nid);
 }
 #else
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                          unsigned long size)
 {
-        return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+        return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+                struct mem_section_usage *usage)
 {
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -382,18 +442,22 @@
         return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 }
 
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
-                struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
         unsigned long size = section_map_size();
         struct page *map = sparse_buffer_alloc(size);
+        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
 
         if (map)
                 return map;
 
-        map = memblock_virt_alloc_try_nid(size,
-                                          PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-                                          BOOTMEM_ALLOC_ACCESSIBLE, nid);
+        map = memblock_alloc_try_nid_raw(size, size, addr,
+                                          MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+        if (!map)
+                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
+                      __func__, size, PAGE_SIZE, nid, &addr);
+
         return map;
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -401,13 +465,23 @@
 static void *sparsemap_buf __meminitdata;
 static void *sparsemap_buf_end __meminitdata;
 
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+        WARN_ON(!sparsemap_buf || size == 0);
+        memblock_free_early(__pa(sparsemap_buf), size);
+}
+
 static void __init sparse_buffer_init(unsigned long size, int nid)
 {
+        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
         WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
-        sparsemap_buf =
-                memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
-                                                __pa(MAX_DMA_ADDRESS),
-                                                BOOTMEM_ALLOC_ACCESSIBLE, nid);
+        /*
+         * Pre-allocated buffer is mainly used by __populate_section_memmap
+         * and we want it to be properly aligned to the section size - this is
+         * especially the case for VMEMMAP which maps memmap to PMDs
+         */
+        sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
+                addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
         sparsemap_buf_end = sparsemap_buf + size;
 }
 
@@ -416,7 +490,7 @@
         unsigned long size = sparsemap_buf_end - sparsemap_buf;
 
         if (sparsemap_buf && size > 0)
-                memblock_free_early(__pa(sparsemap_buf), size);
+                sparse_buffer_free(size);
         sparsemap_buf = NULL;
 }
 
@@ -425,11 +499,15 @@
         void *ptr = NULL;
 
         if (sparsemap_buf) {
-                ptr = PTR_ALIGN(sparsemap_buf, size);
+                ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
                 if (ptr + size > sparsemap_buf_end)
                         ptr = NULL;
-                else
+                else {
+                        /* Free redundant aligned space */
+                        if ((unsigned long)(ptr - sparsemap_buf) > 0)
+                                sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
                         sparsemap_buf = ptr + size;
+                }
         }
         return ptr;
 }
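sparse_buffer_alloc() above now rounds the cursor up to a multiple of the request size and hands the skipped-over gap back to memblock instead of leaking it. A standalone sketch (not kernel code) of just that arithmetic, assuming a 2 MiB section memmap:

/* Sketch of the roundup-and-free-the-gap logic in sparse_buffer_alloc(). */
#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long long buf = 0x100234000ULL;   /* current sparsemap_buf */
        unsigned long long size = 0x200000ULL;     /* one 2 MiB section memmap */
        unsigned long long ptr = ROUNDUP(buf, size);

        /* the [buf, ptr) gap would be returned to memblock */
        printf("aligned ptr = %#llx, freed gap = %#llx bytes\n", ptr, ptr - buf);
        return 0;
}

Here the roundup skips 0x1cc000 bytes; in the kernel that gap is handed back via sparse_buffer_free() so later memblock allocations can reuse it.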
@@ -446,23 +524,25 @@
                                    unsigned long pnum_end,
                                    unsigned long map_count)
 {
-        unsigned long pnum, usemap_longs, *usemap;
+        struct mem_section_usage *usage;
+        unsigned long pnum;
         struct page *map;
 
-        usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
-        usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
-                                                          usemap_size() *
-                                                          map_count);
-        if (!usemap) {
+        usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+                        mem_section_usage_size() * map_count);
+        if (!usage) {
                 pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
                 goto failed;
         }
         sparse_buffer_init(map_count * section_map_size(), nid);
         for_each_present_section_nr(pnum_begin, pnum) {
+                unsigned long pfn = section_nr_to_pfn(pnum);
+
                 if (pnum >= pnum_end)
                         break;
 
-                map = sparse_mem_map_populate(pnum, nid, NULL);
+                map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+                                nid, NULL);
                 if (!map) {
                         pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
                                __func__, nid);
@@ -470,9 +550,10 @@
                         sparse_buffer_fini();
                         goto failed;
                 }
-                check_usemap_section_nr(nid, usemap);
-                sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
-                usemap += usemap_longs;
+                check_usemap_section_nr(nid, usage);
+                sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+                                SECTION_IS_EARLY);
+                usage = (void *) usage + mem_section_usage_size();
         }
         sparse_buffer_fini();
         return;
@@ -494,9 +575,13 @@
  */
 void __init sparse_init(void)
 {
-        unsigned long pnum_begin = first_present_section_nr();
-        int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
-        unsigned long pnum_end, map_count = 1;
+        unsigned long pnum_end, pnum_begin, map_count = 1;
+        int nid_begin;
+
+        memblocks_present();
+
+        pnum_begin = first_present_section_nr();
+        nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
 
         /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
         set_pageblock_order();
@@ -540,7 +625,7 @@
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-/* Mark all memory sections within the pfn range as online */
+/* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
         unsigned long pfn;
@@ -563,17 +648,17 @@
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
-                struct vmem_altmap *altmap)
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
+                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
-        /* This will make the necessary allocations eventually. */
-        return sparse_mem_map_populate(pnum, nid, altmap);
+        return __populate_section_memmap(pfn, nr_pages, nid, altmap);
 }
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
                 struct vmem_altmap *altmap)
 {
-        unsigned long start = (unsigned long)memmap;
-        unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+        unsigned long start = (unsigned long) pfn_to_page(pfn);
+        unsigned long end = start + nr_pages * sizeof(struct page);
 
         vmemmap_free(start, end, altmap);
 }
@@ -584,42 +669,67 @@
 
         vmemmap_free(start, end, NULL);
 }
-#else
-static struct page *__kmalloc_section_memmap(void)
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
 {
-        struct page *page, *ret;
-        unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
+        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+        DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+        struct mem_section *ms = __pfn_to_section(pfn);
+        unsigned long *subsection_map = ms->usage
+                ? &ms->usage->subsection_map[0] : NULL;
 
-        page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
-        if (page)
-                goto got_map_page;
+        subsection_mask_set(map, pfn, nr_pages);
+        if (subsection_map)
+                bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
 
-        ret = vmalloc(memmap_size);
-        if (ret)
-                goto got_map_ptr;
+        if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+                                "section already deactivated (%#lx + %ld)\n",
+                                pfn, nr_pages))
+                return -EINVAL;
 
-        return NULL;
-got_map_page:
-        ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-
-        return ret;
+        bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+        return 0;
 }
 
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
-                struct vmem_altmap *altmap)
+static bool is_subsection_map_empty(struct mem_section *ms)
 {
-        return __kmalloc_section_memmap();
+        return bitmap_empty(&ms->usage->subsection_map[0],
+                            SUBSECTIONS_PER_SECTION);
 }
 
-static void __kfree_section_memmap(struct page *memmap,
-                struct vmem_altmap *altmap)
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
 {
-        if (is_vmalloc_addr(memmap))
-                vfree(memmap);
+        struct mem_section *ms = __pfn_to_section(pfn);
+        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+        unsigned long *subsection_map;
+        int rc = 0;
+
+        subsection_mask_set(map, pfn, nr_pages);
+
+        subsection_map = &ms->usage->subsection_map[0];
+
+        if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+                rc = -EINVAL;
+        else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+                rc = -EEXIST;
         else
-                free_pages((unsigned long)memmap,
-                           get_order(sizeof(struct page) * PAGES_PER_SECTION));
+                bitmap_or(subsection_map, map, subsection_map,
+                                SUBSECTIONS_PER_SECTION);
+
+        return rc;
+}
+#else
+struct page * __meminit populate_section_memmap(unsigned long pfn,
+                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+{
+        return kvmalloc_node(array_size(sizeof(struct page),
+                                        PAGES_PER_SECTION), GFP_KERNEL, nid);
+}
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
+                struct vmem_altmap *altmap)
+{
+        kvfree(pfn_to_page(pfn));
 }
 
 static void free_map_bootmem(struct page *memmap)
@@ -651,62 +761,179 @@
                 put_page_bootmem(page);
         }
 }
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+        return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+        return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+        return 0;
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 /*
- * returns the number of sections whose mem_maps were properly
- * set. If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ *    the SPARSEMEM_VMEMMAP=y case).
+ *      a) section was present at memory init.
+ *      b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when subsection_map does not empty we will not be freeing the
+ * usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
  */
-int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
-                                     struct vmem_altmap *altmap)
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+                struct vmem_altmap *altmap)
+{
+        struct mem_section *ms = __pfn_to_section(pfn);
+        bool section_is_early = early_section(ms);
+        struct page *memmap = NULL;
+        bool empty;
+
+        if (clear_subsection_map(pfn, nr_pages))
+                return;
+
+        empty = is_subsection_map_empty(ms);
+        if (empty) {
+                unsigned long section_nr = pfn_to_section_nr(pfn);
+
+                /*
+                 * When removing an early section, the usage map is kept (as the
+                 * usage maps of other sections fall into the same page). It
+                 * will be re-used when re-adding the section - which is then no
+                 * longer an early section. If the usage map is PageReserved, it
+                 * was allocated during boot.
+                 */
+                if (!PageReserved(virt_to_page(ms->usage))) {
+                        kfree(ms->usage);
+                        ms->usage = NULL;
+                }
+                memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+                /*
+                 * Mark the section invalid so that valid_section()
+                 * return false. This prevents code from dereferencing
+                 * ms->usage array.
+                 */
+                ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
+        }
+
+        /*
+         * The memmap of early sections is always fully populated. See
+         * section_activate() and pfn_valid() .
+         */
+        if (!section_is_early)
+                depopulate_section_memmap(pfn, nr_pages, altmap);
+        else if (memmap)
+                free_map_bootmem(memmap);
+
+        if (empty)
+                ms->section_mem_map = (unsigned long)NULL;
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+                unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+        struct mem_section *ms = __pfn_to_section(pfn);
+        struct mem_section_usage *usage = NULL;
+        struct page *memmap;
+        int rc = 0;
+
+        if (!ms->usage) {
+                usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+                if (!usage)
+                        return ERR_PTR(-ENOMEM);
+                ms->usage = usage;
+        }
+
+        rc = fill_subsection_map(pfn, nr_pages);
+        if (rc) {
+                if (usage)
+                        ms->usage = NULL;
+                kfree(usage);
+                return ERR_PTR(rc);
+        }
+
+        /*
+         * The early init code does not consider partially populated
+         * initial sections, it simply assumes that memory will never be
+         * referenced.  If we hot-add memory into such a section then we
+         * do not need to populate the memmap and can simply reuse what
+         * is already there.
+         */
+        if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+                return pfn_to_page(pfn);
+
+        memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+        if (!memmap) {
+                section_deactivate(pfn, nr_pages, altmap);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        return memmap;
+}
+
+/**
+ * sparse_add_section - add a memory section, or populate an existing one
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Note that only VMEMMAP supports sub-section aligned hotplug,
+ * the proper alignment and size are gated by check_pfn_span().
+ *
+ *
+ * Return:
+ * * 0          - On success.
+ * * -EEXIST    - Section has been present.
+ * * -ENOMEM    - Out of memory.
+ */
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+                unsigned long nr_pages, struct vmem_altmap *altmap)
 {
         unsigned long section_nr = pfn_to_section_nr(start_pfn);
         struct mem_section *ms;
         struct page *memmap;
-        unsigned long *usemap;
         int ret;
 
-        /*
-         * no locking for this, because it does its own
-         * plus, it does a kmalloc
-         */
         ret = sparse_index_init(section_nr, nid);
-        if (ret < 0 && ret != -EEXIST)
+        if (ret < 0)
                 return ret;
-        ret = 0;
-        memmap = kmalloc_section_memmap(section_nr, nid, altmap);
-        if (!memmap)
-                return -ENOMEM;
-        usemap = __kmalloc_section_usemap();
-        if (!usemap) {
-                __kfree_section_memmap(memmap, altmap);
-                return -ENOMEM;
-        }
 
-        ms = __pfn_to_section(start_pfn);
-        if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-                ret = -EEXIST;
-                goto out;
-        }
+        memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+        if (IS_ERR(memmap))
+                return PTR_ERR(memmap);
 
-#ifdef CONFIG_DEBUG_VM
         /*
          * Poison uninitialized struct pages in order to catch invalid flags
          * combinations.
          */
-        memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
-#endif
+        page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
+        ms = __nr_to_section(section_nr);
+        set_section_nid(section_nr, nid);
         section_mark_present(ms);
-        sparse_init_one_section(ms, section_nr, memmap, usemap);
 
-out:
-        if (ret < 0) {
-                kfree(usemap);
-                __kfree_section_memmap(memmap, altmap);
-        }
-        return ret;
+        /* Align memmap to section boundary in the subsection case */
+        if (section_nr_to_pfn(section_nr) != start_pfn)
+                memmap = pfn_to_page(section_nr_to_pfn(section_nr));
+        sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+        return 0;
 }
 
 #ifdef CONFIG_MEMORY_FAILURE
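Per the kernel-doc above, sparse_add_section() handles at most one section per call, so a hotplug caller is expected to split its pfn range at section boundaries first. A standalone sketch (not the kernel's memory_hotplug.c) of that splitting, assuming 32768 pages per section:

/* Sketch of splitting a hot-added pfn range into per-section chunks. */
#include <stdio.h>

#define PAGES_PER_SECTION 32768UL

int main(void)
{
        unsigned long pfn = 32768UL * 5 + 1024; /* starts 4 MiB into section 5 */
        unsigned long nr_pages = 65536;         /* 256 MiB worth of pages */
        unsigned long end = pfn + nr_pages;

        while (pfn < end) {
                unsigned long next = (pfn / PAGES_PER_SECTION + 1) * PAGES_PER_SECTION;
                unsigned long pfns = (next < end ? next : end) - pfn;

                /* the kernel would call sparse_add_section(nid, pfn, pfns, altmap) here */
                printf("section %lu: pfn %lu, %lu pages\n",
                       pfn / PAGES_PER_SECTION, pfn, pfns);
                pfn += pfns;
        }
        return 0;
}

Each printed chunk corresponds to one sparse_add_section() call; with SPARSEMEM_VMEMMAP the first and last chunks may legitimately be smaller than a full section.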
@@ -714,12 +941,18 @@
 {
         int i;
 
-        if (!memmap)
+        /*
+         * A further optimization is to have per section refcounted
+         * num_poisoned_pages.  But that would need more space per memmap, so
+         * for now just do a quick global check to speed up this routine in the
+         * absence of bad pages.
+         */
+        if (atomic_long_read(&num_poisoned_pages) == 0)
                 return;
 
         for (i = 0; i < nr_pages; i++) {
                 if (PageHWPoison(&memmap[i])) {
-                        atomic_long_sub(1, &num_poisoned_pages);
+                        num_poisoned_pages_dec();
                         ClearPageHWPoison(&memmap[i]);
                 }
         }
@@ -730,50 +963,12 @@
 }
 #endif
 
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
+                unsigned long nr_pages, unsigned long map_offset,
                 struct vmem_altmap *altmap)
 {
-        struct page *usemap_page;
-
-        if (!usemap)
-                return;
-
-        usemap_page = virt_to_page(usemap);
-        /*
-         * Check to see if allocation came from hot-plug-add
-         */
-        if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
-                kfree(usemap);
-                if (memmap)
-                        __kfree_section_memmap(memmap, altmap);
-                return;
-        }
-
-        /*
-         * The usemap came from bootmem. This is packed with other usemaps
-         * on the section which has pgdat at boot time. Just keep it as is now.
-         */
-
-        if (memmap)
-                free_map_bootmem(memmap);
-}
-
-void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
-                               struct vmem_altmap *altmap)
-{
-        struct page *memmap = NULL;
-        unsigned long *usemap = NULL;
-
-        if (ms->section_mem_map) {
-                usemap = ms->pageblock_flags;
-                memmap = sparse_decode_mem_map(ms->section_mem_map,
-                                                __section_nr(ms));
-                ms->section_mem_map = 0;
-                ms->pageblock_flags = NULL;
-        }
-
-        clear_hwpoisoned_pages(memmap + map_offset,
-                        PAGES_PER_SECTION - map_offset);
-        free_section_usemap(memmap, usemap, altmap);
+        clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+                        nr_pages - map_offset);
+        section_deactivate(pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */