```diff
@@ -3,12 +3,14 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>	/* for max_low_pfn */
 #include <linux/swapfile.h>
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 
 #include <asm/set_memory.h>
+#include <asm/cpu_device_id.h>
 #include <asm/e820/api.h>
 #include <asm/init.h>
 #include <asm/page.h>
@@ -24,6 +26,9 @@
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
 #include <asm/pti.h>
+#include <asm/text-patching.h>
+#include <asm/memtype.h>
+#include <asm/paravirt.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
```
```diff
@@ -48,7 +53,7 @@
  * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
  * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
 	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
@@ -56,9 +61,16 @@
 	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
-EXPORT_SYMBOL(__cachemode2pte_tbl);
 
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+	if (likely(pcm == 0))
+		return 0;
+	return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
 	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -68,7 +80,32 @@
 	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/*
+ * Check that the write-protect PAT entry is set for write-protect.
+ * To do this without making assumptions how PAT has been set up (Xen has
+ * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
+ * mode via the __cachemode2pte_tbl[] into protection bits (those protection
+ * bits will select a cache mode of WP or better), and then translate the
+ * protection bits back into the cache mode using __pte2cm_idx() and the
+ * __pte2cachemode_tbl[] array. This will return the really used cache mode.
+ */
+bool x86_has_pat_wp(void)
+{
+	uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
+
+	return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+	unsigned long masked;
+
+	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+	if (likely(masked == 0))
+		return 0;
+	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
 
 static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
```
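For orientation, a minimal usage sketch of the helpers added above; the function below, its name, and its UC- fallback policy are illustrative assumptions, not part of the patch:

```c
/*
 * Illustrative only; assumes the usual x86 page-table headers are in scope.
 * Pick WP caching when PAT really provides it, otherwise fall back to UC-
 * (an example policy, not kernel code).
 */
static pgprot_t example_wp_prot(pgprot_t base)
{
	/*
	 * cachemode2protval() turns a cache mode into PTE protection bits;
	 * x86_has_pat_wp() reports whether PAT really provides a WP slot.
	 */
	enum page_cache_mode pcm = x86_has_pat_wp() ? _PAGE_CACHE_MODE_WP
						    : _PAGE_CACHE_MODE_UC_MINUS;

	return __pgprot((pgprot_val(base) & ~_PAGE_CACHE_MASK) |
			cachemode2protval(pcm));
}
```

Going the other way, pgprot2cachemode(example_wp_prot(PAGE_KERNEL)) reports _PAGE_CACHE_MODE_WP exactly when x86_has_pat_wp() is true, which is the round trip the comment above describes.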
```diff
@@ -77,6 +114,12 @@
 static unsigned long min_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
 
 /*
  * Pages returned are already directly mapped.
```
```diff
@@ -169,6 +212,19 @@
 
 static int page_size_mask;
 
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
```
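The new helper sits directly in front of probe_page_size_mask(), whose body is collapsed out of this excerpt; in mainline kernels that function is its caller. A condensed sketch of that style of caller (the wrapper name is invented and the real function does more than this):

```c
/* Condensed, illustrative caller; not the actual probe_page_size_mask(). */
static void __init example_enable_cr4_features(void)
{
	/*
	 * Large (4MB/2MB) pages: set CR4.PSE and mirror it into the
	 * trampoline copy so CPUs booting later see the same flags.
	 */
	if (boot_cpu_has(X86_FEATURE_PSE))
		cr4_set_bits_and_update_boot(X86_CR4_PSE);

	/* Global pages: the same dance for CR4.PGE. */
	if (boot_cpu_has(X86_FEATURE_PGE))
		cr4_set_bits_and_update_boot(X86_CR4_PGE);
}
```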
```diff
@@ -207,6 +263,24 @@
 	}
 }
 
+#define INTEL_MATCH(_model) { .vendor  = X86_VENDOR_INTEL,	\
+			      .family  = 6,			\
+			      .model   = _model,		\
+			    }
+/*
+ * INVLPG may not properly flush Global entries
+ * on these CPUs when PCIDs are enabled.
+ */
+static const struct x86_cpu_id invlpg_miss_ids[] = {
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE   ),
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE  ),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+	{}
+};
+
 static void setup_pcid(void)
 {
 	if (!IS_ENABLED(CONFIG_X86_64))
@@ -214,6 +288,12 @@
 
 	if (!boot_cpu_has(X86_FEATURE_PCID))
 		return;
+
+	if (x86_match_cpu(invlpg_miss_ids)) {
+		pr_info("Incomplete global flushes, disabling PCID");
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
+		return;
+	}
 
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		/*
```
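A side note on the open-coded INTEL_MATCH() initializer: <asm/cpu_device_id.h>, newly included at the top of this diff, also provides X86_MATCH_INTEL_FAM6_MODEL(), so the same table could be spelled as below. This is only an alternative sketch, under the assumption that the macro is available in the target tree; the patch itself keeps INTEL_MATCH().

```c
/* Alternative spelling, sketch only; the patch uses INTEL_MATCH() above. */
static const struct x86_cpu_id invlpg_miss_ids_alt[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,    NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,  NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,  NULL),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,   NULL),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, NULL),
	{}
};
```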
```diff
@@ -464,7 +544,7 @@
  * the physical memory. To access them they are temporarily mapped.
  */
 unsigned long __ref init_memory_mapping(unsigned long start,
-					unsigned long end)
+					unsigned long end, pgprot_t prot)
 {
 	struct map_range mr[NR_RANGE_MR];
 	unsigned long ret = 0;
@@ -478,7 +558,8 @@
 
 	for (i = 0; i < nr_range; i++)
 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-						   mr[i].page_size_mask);
+						   mr[i].page_size_mask,
+						   prot);
 
 	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -518,7 +599,7 @@
 	 */
 	can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
 				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-	init_memory_mapping(start, end);
+	init_memory_mapping(start, end, PAGE_KERNEL);
 	mapped_ram_size += end - start;
 	can_use_brk_pgt = true;
 }
```
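The three hunks above thread a pgprot_t through init_memory_mapping(): the boot-time call sites keep passing PAGE_KERNEL, while other callers can now request a different protection for the ranges they map. A minimal sketch of such a caller; the function name and its (absent) error handling are invented for illustration:

```c
/* Illustrative caller; not part of this patch. */
static int example_map_new_range(u64 start, u64 size, pgprot_t prot)
{
	/*
	 * Map the range into the kernel direct map with the caller's
	 * protection instead of the hard-coded PAGE_KERNEL used at boot.
	 */
	init_memory_mapping(start, start + size, prot);
	return 0;
}
```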
```diff
@@ -643,6 +724,28 @@
 	}
 }
 
+/*
+ * The real mode trampoline, which is required for bootstrapping CPUs
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
+ */
+static void __init init_trampoline(void)
+{
+#ifdef CONFIG_X86_64
+	if (!kaslr_memory_enabled())
+		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+	else
+		init_trampoline_kaslr();
+#endif
+}
+
 void __init init_mem_mapping(void)
 {
 	unsigned long end;
@@ -658,7 +761,7 @@
 #endif
 
 	/* the ISA range is always mapped regardless of memory holes */
-	init_memory_mapping(0, ISA_END_ADDRESS);
+	init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
 
 	/* Init the trampoline, possibly with KASLR memory offset */
 	init_trampoline();
```
```diff
@@ -698,6 +801,44 @@
 	x86_init.hyper.init_mem_mapping();
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * Initialize an mm_struct to be used during poking and a pointer to be used
+ * during patching.
+ */
+void __init poking_init(void)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	poking_mm = mm_alloc();
+	BUG_ON(!poking_mm);
+
+	/* Xen PV guests need the PGD to be pinned. */
+	paravirt_arch_dup_mmap(NULL, poking_mm);
+
+	/*
+	 * Randomize the poking address, but make sure that the following page
+	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
+	 * and adjust the address if the PMD ends after the first one.
+	 */
+	poking_addr = TASK_UNMAPPED_BASE;
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
+
+	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+		poking_addr += PAGE_SIZE;
+
+	/*
+	 * We need to trigger the allocation of the page-tables that will be
+	 * needed for poking now. Later, poking may be performed in an atomic
+	 * section, which might cause allocation to fail.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+	BUG_ON(!ptep);
+	pte_unmap_unlock(ptep, ptl);
 }
 
 /*
```
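The PMD-boundary check in poking_init() above is easiest to see with numbers. A small stand-alone illustration, user-space style with example constants (the kernel's PMD_MASK has the same meaning, the mask of everything above the in-PMD offset):

```c
#include <stdio.h>

#define EX_PAGE_SIZE	0x1000UL		/* 4 KiB */
#define EX_PMD_SIZE	0x200000UL		/* 2 MiB */
#define EX_PMD_MASK	(~(EX_PMD_SIZE - 1))

int main(void)
{
	/* Suppose randomization landed on the last 4 KiB page of a PMD. */
	unsigned long poking_addr = 0x7f80001ff000UL;

	/* The second poking page would start in the next PMD ... */
	if (((poking_addr + EX_PAGE_SIZE) & ~EX_PMD_MASK) == 0)
		poking_addr += EX_PAGE_SIZE;	/* ... so move both pages there. */

	/* Prints 0x7f8000200000: both pages now share a single PMD. */
	printf("%#lx\n", poking_addr);
	return 0;
}
```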
```diff
@@ -742,7 +883,7 @@
 	return 1;
 }
 
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+void free_init_pages(const char *what, unsigned long begin, unsigned long end)
 {
 	unsigned long begin_aligned, end_aligned;
 
@@ -791,14 +932,13 @@
  * used for the kernel image only. free_init_pages() will do the
  * right thing for either kind of address.
  */
-void free_kernel_image_pages(void *begin, void *end)
+void free_kernel_image_pages(const char *what, void *begin, void *end)
 {
 	unsigned long begin_ul = (unsigned long)begin;
 	unsigned long end_ul = (unsigned long)end;
 	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
 
-
-	free_init_pages("unused kernel image", begin_ul, end_ul);
+	free_init_pages(what, begin_ul, end_ul);
 
 	/*
 	 * PTI maps some of the kernel into userspace. For performance,
@@ -819,15 +959,14 @@
 	set_memory_np_noalias(begin_ul, len_pages);
 }
 
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
 void __ref free_initmem(void)
 {
 	e820__reallocate_tables();
 
 	mem_encrypt_free_decrypted_mem();
 
-	free_kernel_image_pages(&__init_begin, &__init_end);
+	free_kernel_image_pages("unused kernel image (initmem)",
+				&__init_begin, &__init_end);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
```
```diff
@@ -903,22 +1042,33 @@
 	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
 #endif
 #ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
+	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
 #endif
 	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
 #endif
 
-	free_area_init_nodes(max_zone_pfns);
+	free_area_init(max_zone_pfns);
 }
+
+static int __init early_disable_dma32(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		disable_dma32 = true;
+
+	return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
 
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
```
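Usage note for the new early parameter: booting with disable_dma32=on on the kernel command line makes the hunk above report an empty ZONE_DMA32 (its maximum PFN becomes 0), so low memory outside ZONE_DMA is accounted to ZONE_NORMAL instead; any other value, or omitting the parameter, leaves the zone layout unchanged.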