2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm64/mm/mmu.c
@@ -1,20 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Based on arch/arm/mm/mmu.c
  *
  * Copyright (C) 1995-2005 Russell King
  * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <linux/cache.h>
@@ -28,12 +17,11 @@
 #include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/memblock.h>
+#include <linux/memory.h>
 #include <linux/fs.h>
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
-#include <linux/dma-contiguous.h>
-#include <linux/cma.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -42,18 +30,21 @@
 #include <asm/kernel-pgtable.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/tlb.h>
-#include <asm/memblock.h>
 #include <asm/mmu_context.h>
 #include <asm/ptdump.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #define NO_BLOCK_MAPPINGS	BIT(0)
 #define NO_CONT_MAPPINGS	BIT(1)
 
-u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
+u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
 u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
+
+u64 __section(".mmuoff.data.write") vabits_actual;
+EXPORT_SYMBOL(vabits_actual);
 
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
@@ -69,38 +60,23 @@
 static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 
-struct dma_contig_early_reserve {
-	phys_addr_t base;
-	unsigned long size;
-};
+static DEFINE_SPINLOCK(swapper_pgdir_lock);
+static DEFINE_MUTEX(fixmap_lock);
 
-static struct dma_contig_early_reserve dma_mmu_remap[MAX_CMA_AREAS];
-static int dma_mmu_remap_num;
-
-void __init dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
+void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-	if (dma_mmu_remap_num >= ARRAY_SIZE(dma_mmu_remap)) {
-		pr_err("ARM64: Not enough slots for DMA fixup reserved regions!\n");
-		return;
-	}
-	dma_mmu_remap[dma_mmu_remap_num].base = base;
-	dma_mmu_remap[dma_mmu_remap_num].size = size;
-	dma_mmu_remap_num++;
-}
+	pgd_t *fixmap_pgdp;
 
-static bool dma_overlap(phys_addr_t start, phys_addr_t end)
-{
-	int i;
-
-	for (i = 0; i < dma_mmu_remap_num; i++) {
-		phys_addr_t dma_base = dma_mmu_remap[i].base;
-		phys_addr_t dma_end = dma_mmu_remap[i].base +
-				      dma_mmu_remap[i].size;
-
-		if ((dma_base < end) && (dma_end > start))
-			return true;
-	}
-	return false;
+	spin_lock(&swapper_pgdir_lock);
+	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
+	WRITE_ONCE(*fixmap_pgdp, pgd);
+	/*
+	 * We need dsb(ishst) here to ensure the page-table-walker sees
+	 * our new entry before set_p?d() returns. The fixmap's
+	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
+	 */
+	pgd_clear_fixmap();
+	spin_unlock(&swapper_pgdir_lock);
 }
 
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -114,12 +90,14 @@
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
 
-static phys_addr_t __init early_pgtable_alloc(void)
+static phys_addr_t __init early_pgtable_alloc(int shift)
 {
 	phys_addr_t phys;
 	void *ptr;
 
-	phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+	phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+	if (!phys)
+		panic("Failed to allocate page table page\n");
 
 	/*
 	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
@@ -145,7 +123,7 @@
 	 * The following mapping attributes may be updated in live
 	 * kernel mappings without the need for break-before-make.
 	 */
-	static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
+	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
 
 	/* creating or taking down mappings is always safe */
 	if (old == 0 || new == 0)
@@ -158,6 +136,17 @@
 	/* Transitioning from Non-Global to Global is unsafe */
 	if (old & ~new & PTE_NG)
 		return false;
+
+	/*
+	 * Changing the memory type between Normal and Normal-Tagged is safe
+	 * since Tagged is considered a permission attribute from the
+	 * mismatched attribute aliases perspective.
+	 */
+	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
+	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
+		mask |= PTE_ATTRINDX_MASK;
 
 	return ((old ^ new) & ~mask) == 0;
 }
@@ -189,7 +178,7 @@
 static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(void),
+				phys_addr_t (*pgtable_alloc)(int),
 				int flags)
 {
 	unsigned long next;
@@ -199,7 +188,7 @@
 	if (pmd_none(pmd)) {
 		phys_addr_t pte_phys;
 		BUG_ON(!pgtable_alloc);
-		pte_phys = pgtable_alloc();
+		pte_phys = pgtable_alloc(PAGE_SHIFT);
 		__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
 		pmd = READ_ONCE(*pmdp);
 	}
@@ -223,7 +212,7 @@
 
 static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 		     phys_addr_t phys, pgprot_t prot,
-		     phys_addr_t (*pgtable_alloc)(void), int flags)
+		     phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long next;
 	pmd_t *pmdp;
@@ -236,8 +225,7 @@
 
 		/* try section mapping first */
 		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
-		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
-		    !dma_overlap(phys, phys + next - addr)) {
+		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 			pmd_set_huge(pmdp, phys, prot);
 
 			/*
@@ -262,7 +250,7 @@
 static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(void), int flags)
+				phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long next;
 	pud_t pud = READ_ONCE(*pudp);
@@ -274,7 +262,7 @@
 	if (pud_none(pud)) {
 		phys_addr_t pmd_phys;
 		BUG_ON(!pgtable_alloc);
-		pmd_phys = pgtable_alloc();
+		pmd_phys = pgtable_alloc(PMD_SHIFT);
 		__pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
 		pud = READ_ONCE(*pudp);
 	}
@@ -310,23 +298,30 @@
 
 static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 			   phys_addr_t phys, pgprot_t prot,
-			   phys_addr_t (*pgtable_alloc)(void),
+			   phys_addr_t (*pgtable_alloc)(int),
 			   int flags)
 {
 	unsigned long next;
 	pud_t *pudp;
-	pgd_t pgd = READ_ONCE(*pgdp);
+	p4d_t *p4dp = p4d_offset(pgdp, addr);
+	p4d_t p4d = READ_ONCE(*p4dp);
 
-	if (pgd_none(pgd)) {
+	if (p4d_none(p4d)) {
 		phys_addr_t pud_phys;
 		BUG_ON(!pgtable_alloc);
-		pud_phys = pgtable_alloc();
-		__pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
-		pgd = READ_ONCE(*pgdp);
+		pud_phys = pgtable_alloc(PUD_SHIFT);
+		__p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE);
+		p4d = READ_ONCE(*p4dp);
 	}
-	BUG_ON(pgd_bad(pgd));
+	BUG_ON(p4d_bad(p4d));
 
-	pudp = pud_set_fixmap_offset(pgdp, addr);
+	/*
+	 * No need for locking during early boot. And it doesn't work as
+	 * expected with KASLR enabled.
+	 */
+	if (system_state != SYSTEM_BOOTING)
+		mutex_lock(&fixmap_lock);
+	pudp = pud_set_fixmap_offset(p4dp, addr);
 	do {
 		pud_t old_pud = READ_ONCE(*pudp);
 
@@ -336,8 +331,7 @@
 		 * For 4K granule only, attempt to put down a 1GB block
 		 */
 		if (use_1G_block(addr, next, phys) &&
-		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
-		    !dma_overlap(phys, phys + next - addr)) {
+		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 			pud_set_huge(pudp, phys, prot);
 
 			/*
@@ -357,16 +351,18 @@
 	} while (pudp++, addr = next, addr != end);
 
 	pud_clear_fixmap();
+	if (system_state != SYSTEM_BOOTING)
+		mutex_unlock(&fixmap_lock);
 }
 
 static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				 unsigned long virt, phys_addr_t size,
 				 pgprot_t prot,
-				 phys_addr_t (*pgtable_alloc)(void),
+				 phys_addr_t (*pgtable_alloc)(int),
 				 int flags)
 {
-	unsigned long addr, length, end, next;
-	pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
+	unsigned long addr, end, next;
+	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
 
 	/*
 	 * If the virtual and physical address don't have the same offset
@@ -377,9 +373,8 @@
 
 	phys &= PAGE_MASK;
 	addr = virt & PAGE_MASK;
-	length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
+	end = PAGE_ALIGN(virt + size);
 
-	end = addr + length;
 	do {
 		next = pgd_addr_end(addr, end);
 		alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
@@ -388,37 +383,35 @@
 	} while (pgdp++, addr = next, addr != end);
 }
 
-static phys_addr_t pgd_pgtable_alloc(void)
+static phys_addr_t __pgd_pgtable_alloc(int shift)
 {
-	void *ptr = (void *)__get_free_page(PGALLOC_GFP);
-	if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
-		BUG();
+	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
+	BUG_ON(!ptr);
 
 	/* Ensure the zeroed page is visible to the page table walker */
 	dsb(ishst);
 	return __pa(ptr);
 }
 
-/**
- * create_pgtable_mapping - create a pagetable mapping for given
- * physical start and end addresses.
- * @start: physical start address.
- * @end: physical end address.
- */
-void create_pgtable_mapping(phys_addr_t start, phys_addr_t end)
+static phys_addr_t pgd_pgtable_alloc(int shift)
 {
-	unsigned long virt = (unsigned long)phys_to_virt(start);
+	phys_addr_t pa = __pgd_pgtable_alloc(shift);
 
-	if (virt < VMALLOC_START) {
-		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
-			&start, virt);
-		return;
-	}
+	/*
+	 * Call proper page table ctor in case later we need to
+	 * call core mm functions like apply_to_page_range() on
+	 * this pre-allocated page table.
+	 *
+	 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
+	 * folded, and if so pgtable_pmd_page_ctor() becomes nop.
+	 */
+	if (shift == PAGE_SHIFT)
+		BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
+	else if (shift == PMD_SHIFT)
+		BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 
-	__create_pgd_mapping(init_mm.pgd, start, virt, end - start,
-			     PAGE_KERNEL, NULL, 0);
+	return pa;
 }
-EXPORT_SYMBOL_GPL(create_pgtable_mapping);
 
 /*
  * This function can only be used to modify existing table entries,
@@ -428,7 +421,7 @@
 static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 				  phys_addr_t size, pgprot_t prot)
 {
-	if (virt < VMALLOC_START) {
+	if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
 			&phys, virt);
 		return;
@@ -455,7 +448,7 @@
 static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 				phys_addr_t size, pgprot_t prot)
 {
-	if (virt < VMALLOC_START) {
+	if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
 			&phys, virt);
 		return;
@@ -485,14 +478,31 @@
 			    PAGE_KERNEL_RO);
 }
 
+static bool crash_mem_map __initdata;
+
+static int __init enable_crash_mem_map(char *arg)
+{
+	/*
+	 * Proper parameter parsing is done by reserve_crashkernel(). We only
+	 * need to know if the linear map has to avoid block mappings so that
+	 * the crashkernel reservations can be unmapped later.
+	 */
+	crash_mem_map = true;
+
+	return 0;
+}
+early_param("crashkernel", enable_crash_mem_map);
+
 static void __init map_mem(pgd_t *pgdp)
 {
 	phys_addr_t kernel_start = __pa_symbol(_text);
 	phys_addr_t kernel_end = __pa_symbol(__init_begin);
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	int flags = 0;
+	u64 i;
 
-	if (debug_pagealloc_enabled())
+	if (rodata_full || debug_pagealloc_enabled() ||
+	    IS_ENABLED(CONFIG_KFENCE))
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
@@ -502,23 +512,29 @@
 	 * the following for-loop
 	 */
 	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+
 #ifdef CONFIG_KEXEC_CORE
-	if (crashk_res.end)
-		memblock_mark_nomap(crashk_res.start,
-				    resource_size(&crashk_res));
+	if (crash_mem_map) {
+		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
+		    IS_ENABLED(CONFIG_ZONE_DMA32))
+			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+		else if (crashk_res.end)
+			memblock_mark_nomap(crashk_res.start,
+					    resource_size(&crashk_res));
+	}
 #endif
 
 	/* map all the memory banks */
-	for_each_memblock(memory, reg) {
-		phys_addr_t start = reg->base;
-		phys_addr_t end = start + reg->size;
-
+	for_each_mem_range(i, &start, &end) {
 		if (start >= end)
 			break;
-		if (memblock_is_nomap(reg))
-			continue;
-
-		__map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
+		/*
+		 * The linear map must allow allocation tags reading/writing
+		 * if MTE is present. Otherwise, it has the same attributes as
+		 * PAGE_KERNEL.
+		 */
+		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
+			       flags);
 	}
 
 	/*
@@ -535,18 +551,22 @@
 		       PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 
-#ifdef CONFIG_KEXEC_CORE
 	/*
 	 * Use page-level mappings here so that we can shrink the region
 	 * in page granularity and put back unused memory to buddy system
 	 * through /sys/kernel/kexec_crash_size interface.
 	 */
-	if (crashk_res.end) {
-		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
-			       PAGE_KERNEL,
-			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
-		memblock_clear_nomap(crashk_res.start,
-				     resource_size(&crashk_res));
+#ifdef CONFIG_KEXEC_CORE
+	if (crash_mem_map &&
+	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
+		if (crashk_res.end) {
+			__map_memblock(pgdp, crashk_res.start,
+				       crashk_res.end + 1,
+				       PAGE_KERNEL,
+				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+			memblock_clear_nomap(crashk_res.start,
+					     resource_size(&crashk_res));
+		}
 	}
 #endif
 }
@@ -593,13 +613,27 @@
 
 static int __init parse_rodata(char *arg)
 {
-	return strtobool(arg, &rodata_enabled);
+	int ret = strtobool(arg, &rodata_enabled);
+	if (!ret) {
+		rodata_full = false;
+		return 0;
+	}
+
+	/* permit 'full' in addition to boolean options */
+	if (strcmp(arg, "full"))
+		return -EINVAL;
+
+	rodata_enabled = true;
+	rodata_full = true;
+	return 0;
 }
 early_param("rodata", parse_rodata);
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 static int __init map_entry_trampoline(void)
 {
+	int i;
+
 	pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
 
@@ -608,11 +642,15 @@
 
 	/* Map only the text into the trampoline page table */
 	memset(tramp_pg_dir, 0, PGD_SIZE);
-	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
-			     prot, pgd_pgtable_alloc, 0);
+	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
+			     entry_tramp_text_size(), prot,
+			     __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
 
 	/* Map both the text and data into the kernel page table */
-	__set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
+	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
+		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
+			     pa_start + i * PAGE_SIZE, prot);
+
 	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
 		extern char __entry_tramp_data_start[];
 
@@ -625,6 +663,22 @@
 }
 core_initcall(map_entry_trampoline);
 #endif
+
+/*
+ * Open coded check for BTI, only for use to determine configuration
+ * for early mappings for before the cpufeature code has run.
+ */
+static bool arm64_early_this_cpu_has_bti(void)
+{
+	u64 pfr1;
+
+	if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
+		return false;
+
+	pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1);
+	return cpuid_feature_extract_unsigned_field(pfr1,
+						    ID_AA64PFR1_BT_SHIFT);
+}
 
 /*
  * Create fine-grained mappings for the kernel.
@@ -642,6 +696,14 @@
 	pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 
 	/*
+	 * If we have a CPU that supports BTI and a kernel built for
+	 * BTI then mark the kernel executable text as guarded pages
+	 * now so we don't have to rewrite the page tables later.
+	 */
+	if (arm64_early_this_cpu_has_bti())
+		text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
+
+	/*
 	 * Only rodata will be remapped with different permissions later on,
 	 * all other segments are allowed to use contiguous mappings.
 	 */
@@ -655,15 +717,18 @@
 			   &vmlinux_initdata, 0, VM_NO_GUARD);
 	map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 
-	if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
+	if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
 		/*
 		 * The fixmap falls in a separate pgd to the kernel, and doesn't
 		 * live in the carveout for the swapper_pg_dir. We can simply
 		 * re-use the existing dir for the fixmap.
 		 */
-		set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
+		set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START),
 			READ_ONCE(*pgd_offset_k(FIXADDR_START)));
 	} else if (CONFIG_PGTABLE_LEVELS > 3) {
+		pgd_t *bm_pgdp;
+		p4d_t *bm_p4dp;
+		pud_t *bm_pudp;
 		/*
 		 * The fixmap shares its top level pgd entry with the kernel
 		 * mapping. This can really only occur when we are running
@@ -671,9 +736,10 @@
 		 * entry instead.
 		 */
 		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		pud_populate(&init_mm,
-			     pud_set_fixmap_offset(pgdp, FIXADDR_START),
-			     lm_alias(bm_pmd));
+		bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
+		bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
+		bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
+		pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
 		pud_clear_fixmap();
 	} else {
 		BUG();
@@ -682,40 +748,22 @@
 	kasan_copy_shadow(pgdp);
 }
 
-/*
- * paging_init() sets up the page tables, initialises the zone memory
- * maps and sets up the zero page.
- */
 void __init paging_init(void)
 {
-	phys_addr_t pgd_phys = early_pgtable_alloc();
-	pgd_t *pgdp = pgd_set_fixmap(pgd_phys);
+	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
 
 	map_kernel(pgdp);
 	map_mem(pgdp);
 
-	/*
-	 * We want to reuse the original swapper_pg_dir so we don't have to
-	 * communicate the new address to non-coherent secondaries in
-	 * secondary_entry, and so cpu_switch_mm can generate the address with
-	 * adrp+add rather than a load from some global variable.
-	 *
-	 * To do this we need to go via a temporary pgd.
-	 */
-	cpu_replace_ttbr1(__va(pgd_phys));
-	memcpy(swapper_pg_dir, pgdp, PGD_SIZE);
-	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
-
 	pgd_clear_fixmap();
-	memblock_free(pgd_phys, PAGE_SIZE);
 
-	/*
-	 * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
-	 * allocated with it.
-	 */
-	memblock_free(__pa_symbol(swapper_pg_dir) + PAGE_SIZE,
-		      __pa_symbol(swapper_pg_end) - __pa_symbol(swapper_pg_dir)
-		      - PAGE_SIZE);
+	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+	init_mm.pgd = swapper_pg_dir;
+
+	memblock_free(__pa_symbol(init_pg_dir),
+		      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
+
+	memblock_allow_resize();
 }
 
 /*
@@ -724,10 +772,12 @@
 int kern_addr_valid(unsigned long addr)
 {
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp, pud;
 	pmd_t *pmdp, pmd;
 	pte_t *ptep, pte;
 
+	addr = arch_kasan_reset_tag(addr);
 	if ((((long)addr) >> VA_BITS) != -1UL)
 		return 0;
 
@@ -735,7 +785,11 @@
 	if (pgd_none(READ_ONCE(*pgdp)))
 		return 0;
 
-	pudp = pud_offset(pgdp, addr);
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none(READ_ONCE(*p4dp)))
+		return 0;
+
+	pudp = pud_offset(p4dp, addr);
 	pud = READ_ONCE(*pudp);
 	if (pud_none(pud))
 		return 0;
@@ -758,13 +812,336 @@
 
 	return pfn_valid(pte_pfn(pte));
 }
-EXPORT_SYMBOL_GPL(kern_addr_valid);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size,
+				    struct vmem_altmap *altmap)
+{
+	if (altmap) {
+		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+	} else {
+		WARN_ON(PageReserved(page));
+		free_pages((unsigned long)page_address(page), get_order(size));
+	}
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+	free_hotplug_page_range(page, PAGE_SIZE, NULL);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+				  unsigned long floor, unsigned long ceiling,
+				  unsigned long mask)
+{
+	start &= mask;
+	if (start < floor)
+		return false;
+
+	if (ceiling) {
+		ceiling &= mask;
+		if (!ceiling)
+			return false;
+	}
+
+	if (end - 1 > ceiling - 1)
+		return false;
+	return true;
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+				    unsigned long end, bool free_mapped,
+				    struct vmem_altmap *altmap)
+{
+	pte_t *ptep, pte;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+		if (pte_none(pte))
+			continue;
+
+		WARN_ON(!pte_present(pte));
+		pte_clear(&init_mm, addr, ptep);
+		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+		if (free_mapped)
+			free_hotplug_page_range(pte_page(pte),
+						PAGE_SIZE, altmap);
+	} while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+				    unsigned long end, bool free_mapped,
+				    struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd));
+		if (pmd_sect(pmd)) {
+			pmd_clear(pmdp);
+
+			/*
+			 * One TLBI should be sufficient here as the PMD_SIZE
+			 * range is mapped with a single block entry.
+			 */
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+			if (free_mapped)
+				free_hotplug_page_range(pmd_page(pmd),
+							PMD_SIZE, altmap);
+			continue;
+		}
+		WARN_ON(!pmd_table(pmd));
+		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+				    unsigned long end, bool free_mapped,
+				    struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pud_t *pudp, pud;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(p4dp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud));
+		if (pud_sect(pud)) {
+			pud_clear(pudp);
+
+			/*
+			 * One TLBI should be sufficient here as the PUD_SIZE
+			 * range is mapped with a single block entry.
+			 */
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+			if (free_mapped)
+				free_hotplug_page_range(pud_page(pud),
+							PUD_SIZE, altmap);
+			continue;
+		}
+		WARN_ON(!pud_table(pud));
+		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+				    unsigned long end, bool free_mapped,
+				    struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+
+	do {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		if (p4d_none(p4d))
+			continue;
+
+		WARN_ON(!p4d_present(p4d));
+		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
+	} while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+				bool free_mapped, struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	/*
+	 * altmap can only be used as vmemmap mapping backing memory.
+	 * In case the backing memory itself is not being freed, then
+	 * altmap is irrelevant. Warn about this inconsistency when
+	 * encountered.
+	 */
+	WARN_ON(!free_mapped && altmap);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pte_t *ptep, pte;
+	unsigned long i, start = addr;
+
+	do {
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+
+		/*
+		 * This is just a sanity check here which verifies that
+		 * pte clearing has been done by earlier unmap loops.
+		 */
+		WARN_ON(!pte_none(pte));
+	} while (addr += PAGE_SIZE, addr < end);
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pte page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	ptep = pte_offset_kernel(pmdp, 0UL);
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		if (!pte_none(READ_ONCE(ptep[i])))
+			return;
+	}
+
+	pmd_clear(pmdp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(ptep));
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pmd_t *pmdp, pmd;
+	unsigned long i, next, start = addr;
+
+	do {
+		next = pmd_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_none(pmd))
+			continue;
+
+		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+
+	if (CONFIG_PGTABLE_LEVELS <= 2)
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pmd page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	pmdp = pmd_offset(pudp, 0UL);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(READ_ONCE(pmdp[i])))
+			return;
+	}
+
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(pmdp));
+}
+
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	pud_t *pudp, pud;
+	unsigned long i, next, start = addr;
+
+	do {
+		next = pud_addr_end(addr, end);
+		pudp = pud_offset(p4dp, addr);
+		pud = READ_ONCE(*pudp);
+		if (pud_none(pud))
+			continue;
+
+		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+
+	if (CONFIG_PGTABLE_LEVELS <= 3)
+		return;
+
+	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+		return;
+
+	/*
+	 * Check whether we can free the pud page if the rest of the
+	 * entries are empty. Overlap with other regions have been
+	 * handled by the floor/ceiling check.
+	 */
+	pudp = pud_offset(p4dp, 0UL);
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		if (!pud_none(READ_ONCE(pudp[i])))
+			return;
+	}
+
+	p4d_clear(p4dp);
+	__flush_tlb_kernel_pgtable(start);
+	free_hotplug_pgtable_page(virt_to_page(pudp));
+}
+
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
+				 unsigned long end, unsigned long floor,
+				 unsigned long ceiling)
+{
+	unsigned long next;
+	p4d_t *p4dp, p4d;
+
+	do {
+		next = p4d_addr_end(addr, end);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		if (p4d_none(p4d))
+			continue;
+
+		WARN_ON(!p4d_present(p4d));
+		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end,
+			      unsigned long floor, unsigned long ceiling)
+{
+	unsigned long next;
+	pgd_t *pgdp, pgd;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgdp = pgd_offset_k(addr);
+		pgd = READ_ONCE(*pgdp);
+		if (pgd_none(pgd))
+			continue;
+
+		WARN_ON(!pgd_present(pgd));
+		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
+	} while (addr = next, addr < end);
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
-	return vmemmap_populate_basepages(start, end, node);
+	return vmemmap_populate_basepages(start, end, node, altmap);
 }
 #else /* !ARM64_SWAPPER_USES_SECTION_MAPS */
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -773,6 +1150,7 @@
 	unsigned long addr = start;
 	unsigned long next;
 	pgd_t *pgdp;
+	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 
@@ -783,7 +1161,11 @@
 		if (!pgdp)
 			return -ENOMEM;
 
-		pudp = vmemmap_pud_populate(pgdp, addr, node);
+		p4dp = vmemmap_p4d_populate(pgdp, addr, node);
+		if (!p4dp)
+			return -ENOMEM;
+
+		pudp = vmemmap_pud_populate(p4dp, addr, node);
 		if (!pudp)
 			return -ENOMEM;
 
@@ -791,9 +1173,12 @@
 		if (pmd_none(READ_ONCE(*pmdp))) {
 			void *p = NULL;
 
-			p = vmemmap_alloc_block_buf(PMD_SIZE, node);
-			if (!p)
-				return -ENOMEM;
+			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+			if (!p) {
+				if (vmemmap_populate_basepages(addr, next, node, altmap))
+					return -ENOMEM;
+				continue;
+			}
 
 			pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
 		} else
@@ -802,21 +1187,28 @@
 
 	return 0;
 }
-#endif /* CONFIG_ARM64_64K_PAGES */
+#endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
+	unmap_hotplug_range(start, end, true, altmap);
+	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
+#endif
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static inline pud_t * fixmap_pud(unsigned long addr)
 {
 	pgd_t *pgdp = pgd_offset_k(addr);
-	pgd_t pgd = READ_ONCE(*pgdp);
+	p4d_t *p4dp = p4d_offset(pgdp, addr);
+	p4d_t p4d = READ_ONCE(*p4dp);
 
-	BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
+	BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
 
-	return pud_offset_kimg(pgdp, addr);
+	return pud_offset_kimg(p4dp, addr);
 }
 
 static inline pmd_t * fixmap_pmd(unsigned long addr)
@@ -842,25 +1234,27 @@
  */
 void __init early_fixmap_init(void)
 {
-	pgd_t *pgdp, pgd;
+	pgd_t *pgdp;
+	p4d_t *p4dp, p4d;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	unsigned long addr = FIXADDR_START;
 
 	pgdp = pgd_offset_k(addr);
-	pgd = READ_ONCE(*pgdp);
+	p4dp = p4d_offset(pgdp, addr);
+	p4d = READ_ONCE(*p4dp);
 	if (CONFIG_PGTABLE_LEVELS > 3 &&
-	    !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
+	    !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
 		/*
 		 * We only end up here if the kernel mapping and the fixmap
 		 * share the top level pgd entry, which should only happen on
 		 * 16k/4 levels configurations.
 		 */
		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		pudp = pud_offset_kimg(pgdp, addr);
+		pudp = pud_offset_kimg(p4dp, addr);
 	} else {
-		if (pgd_none(pgd))
-			__pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
+		if (p4d_none(p4d))
+			__p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
 		pudp = fixmap_pud(addr);
 	}
 	if (pud_none(READ_ONCE(*pudp)))
@@ -978,43 +1372,39 @@
 	 * SW table walks can't handle removal of intermediate entries.
 	 */
 	return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
-	       !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS);
+	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
 int __init arch_ioremap_pmd_supported(void)
 {
 	/* See arch_ioremap_pud_supported() */
-	return !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS);
+	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
-	pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
-					pgprot_val(mk_sect_prot(prot)));
-	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), sect_prot);
+	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
 
 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
 				   pud_val(new_pud)))
 		return 0;
 
-	BUG_ON(phys & ~PUD_MASK);
+	VM_BUG_ON(phys & ~PUD_MASK);
 	set_pud(pudp, new_pud);
 	return 1;
 }
 
 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
-	pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
-					pgprot_val(mk_sect_prot(prot)));
-	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), sect_prot);
+	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
 
 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
 				   pmd_val(new_pmd)))
 		return 0;
 
-	BUG_ON(phys & ~PMD_MASK);
+	VM_BUG_ON(phys & ~PMD_MASK);
 	set_pmd(pmdp, new_pmd);
 	return 1;
 }
@@ -1042,10 +1432,8 @@
 
 	pmd = READ_ONCE(*pmdp);
 
-	if (!pmd_present(pmd))
-		return 1;
 	if (!pmd_table(pmd)) {
-		VM_WARN_ON(!pmd_table(pmd));
+		VM_WARN_ON(1);
 		return 1;
 	}
 
@@ -1065,10 +1453,8 @@
 
 	pud = READ_ONCE(*pudp);
 
-	if (!pud_present(pud))
-		return 1;
 	if (!pud_table(pud)) {
-		VM_WARN_ON(!pud_table(pud));
+		VM_WARN_ON(1);
 		return 1;
 	}
 
@@ -1085,3 +1471,199 @@
 	pmd_free(NULL, table);
 	return 1;
 }
+
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+	return 0; /* Don't attempt a block mapping */
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+	unsigned long end = start + size;
+
+	WARN_ON(pgdir != init_mm.pgd);
+	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
+
+	unmap_hotplug_range(start, end, false, NULL);
+	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
+}
+
+static bool inside_linear_region(u64 start, u64 size)
+{
+	u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+	u64 end_linear_pa = __pa(PAGE_END - 1);
+
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+		/*
+		 * Check for a wrap, it is possible because of randomized linear
+		 * mapping the start physical address is actually bigger than
+		 * the end physical address. In this case set start to zero
+		 * because [0, end_linear_pa] range must still be able to cover
+		 * all addressable physical addresses.
+		 */
+		if (start_linear_pa > end_linear_pa)
+			start_linear_pa = 0;
+	}
+
+	WARN_ON(start_linear_pa > end_linear_pa);
+
+	/*
+	 * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
+	 * accommodating both its ends but excluding PAGE_END. Max physical
+	 * range which can be mapped inside this linear mapping range, must
+	 * also be derived from its end points.
+	 */
+	return start >= start_linear_pa && (start + size - 1) <= end_linear_pa;
+}
+
+int arch_add_memory(int nid, u64 start, u64 size,
+		    struct mhp_params *params)
+{
+	int ret, flags = 0;
+
+	if (!inside_linear_region(start, size)) {
+		pr_err("[%llx %llx] is outside linear mapping region\n", start, start + size);
+		return -EINVAL;
+	}
+
+	/*
+	 * KFENCE requires linear map to be mapped at page granularity, so that
+	 * it is possible to protect/unprotect single pages in the KFENCE pool.
+	 */
+	if (rodata_full || debug_pagealloc_enabled() ||
+	    IS_ENABLED(CONFIG_KFENCE))
+		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+
+	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+			     size, params->pgprot, __pgd_pgtable_alloc,
+			     flags);
+
+	memblock_clear_nomap(start, size);
+
+	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+			  params);
+	if (ret)
+		__remove_pgd_mapping(swapper_pg_dir,
+				     __phys_to_virt(start), size);
+	else {
+		max_pfn = PFN_UP(start + size);
+		max_low_pfn = max_pfn;
+	}
+
+	return ret;
+}
+
+void arch_remove_memory(int nid, u64 start, u64 size,
+			struct vmem_altmap *altmap)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	__remove_pages(start_pfn, nr_pages, altmap);
+	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
+}
+
+int check_range_driver_managed(u64 start, u64 size, const char *resource_name)
+{
+	struct mem_section *ms;
+	unsigned long pfn = __phys_to_pfn(start);
+	unsigned long end_pfn = __phys_to_pfn(start + size);
+	struct resource *res;
+	unsigned long flags;
+
+	res = lookup_resource(&iomem_resource, start);
+	if (!res) {
+		pr_err("%s: couldn't find memory resource for start 0x%llx\n",
+		       __func__, start);
+		return -EINVAL;
+	}
+
+	flags = res->flags;
+
+	if (!(flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) ||
+	    strstr(resource_name, "System RAM (") != resource_name)
+		return -EINVAL;
+
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+		if (early_section(ms))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+int populate_range_driver_managed(u64 start, u64 size,
+				  const char *resource_name)
+{
+	unsigned long virt = (unsigned long)phys_to_virt(start);
+	int flags = 0;
+
+	if (check_range_driver_managed(start, size, resource_name))
+		return -EINVAL;
+
+	/*
+	 * When rodata_full is enabled, memory is mapped at page size granule,
+	 * as opposed to block mapping.
+	 */
+	if (rodata_full || debug_pagealloc_enabled())
+		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+
+	__create_pgd_mapping(init_mm.pgd, start, virt, size,
+			     PAGE_KERNEL, NULL, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(populate_range_driver_managed);
+
+int depopulate_range_driver_managed(u64 start, u64 size,
+				    const char *resource_name)
+{
+	if (check_range_driver_managed(start, size, resource_name))
+		return -EINVAL;
+
+	unmap_hotplug_range(start, start + size, false, NULL);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(depopulate_range_driver_managed);
+
+/*
+ * This memory hotplug notifier helps prevent boot memory from being
+ * inadvertently removed as it blocks pfn range offlining process in
+ * __offline_pages(). Hence this prevents both offlining as well as
+ * removal process for boot memory which is initially always online.
+ * In future if and when boot memory could be removed, this notifier
+ * should be dropped and free_hotplug_page_range() should handle any
+ * reserved pages allocated during boot.
+ */
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+					   unsigned long action, void *data)
+{
+	struct mem_section *ms;
+	struct memory_notify *arg = data;
+	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+	unsigned long pfn = arg->start_pfn;
+
+	if (action != MEM_GOING_OFFLINE)
+		return NOTIFY_OK;
+
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+		if (early_section(ms))
+			return NOTIFY_BAD;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block prevent_bootmem_remove_nb = {
+	.notifier_call = prevent_bootmem_remove_notifier,
+};
+
+static int __init prevent_bootmem_remove_init(void)
+{
+	return register_memory_notifier(&prevent_bootmem_remove_nb);
+}
+device_initcall(prevent_bootmem_remove_init);
+#endif
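
Usage note (not part of the patch): a minimal sketch of how a driver might call the two helpers this diff exports, populate_range_driver_managed() and depopulate_range_driver_managed(). The module wrapper, the extern declarations, the base/size values and the resource name below are illustrative assumptions; the range must already be registered as an IORESOURCE_SYSRAM_DRIVER_MANAGED "System RAM (...)" resource and must not be early boot memory, otherwise the helpers return -EINVAL.

/*
 * Illustrative sketch only; assumes the helpers are declared in a header
 * provided elsewhere (extern declarations used here for brevity).
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sizes.h>
#include <linux/types.h>

extern int populate_range_driver_managed(u64 start, u64 size,
					 const char *resource_name);
extern int depopulate_range_driver_managed(u64 start, u64 size,
					   const char *resource_name);

/* Hypothetical driver-managed RAM range; must match a registered resource. */
static u64 example_base = 0x880000000ULL;
static u64 example_size = SZ_128M;

static int __init example_init(void)
{
	/* Map the range into the linear map (page granular if rodata_full). */
	return populate_range_driver_managed(example_base, example_size,
					     "System RAM (example driver)");
}

static void __exit example_exit(void)
{
	/* Tear the linear-map entries down again; the memory itself is untouched. */
	depopulate_range_driver_managed(example_base, example_size,
					"System RAM (example driver)");
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");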