| .. | .. |
| 3 | 3 | #include <linux/ioport.h> |
| 4 | 4 | #include <linux/swap.h> |
| 5 | 5 | #include <linux/memblock.h> |
| 6 | | -#include <linux/bootmem.h> /* for max_low_pfn */ |
| 7 | 6 | #include <linux/swapfile.h> |
| 8 | 7 | #include <linux/swapops.h> |
| 9 | 8 | #include <linux/kmemleak.h> |
| 9 | +#include <linux/sched/task.h> |
| 10 | +#include <linux/sched/mm.h> |
| 10 | 11 | |
| 11 | 12 | #include <asm/set_memory.h> |
| 13 | +#include <asm/cpu_device_id.h> |
| 12 | 14 | #include <asm/e820/api.h> |
| 13 | 15 | #include <asm/init.h> |
| 14 | 16 | #include <asm/page.h> |
| .. | .. |
| 24 | 26 | #include <asm/hypervisor.h> |
| 25 | 27 | #include <asm/cpufeature.h> |
| 26 | 28 | #include <asm/pti.h> |
| 29 | +#include <asm/text-patching.h> |
| 30 | +#include <asm/memtype.h> |
| 31 | +#include <asm/paravirt.h> |
| 27 | 32 | |
| 28 | 33 | /* |
| 29 | 34 | * We need to define the tracepoints somewhere, and tlb.c |
| .. | .. |
| 48 | 53 | * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte |
| 49 | 54 | * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. |
| 50 | 55 | */ |
| 51 | | -uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { |
| 56 | +static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { |
| 52 | 57 | [_PAGE_CACHE_MODE_WB ] = 0 | 0 , |
| 53 | 58 | [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, |
| 54 | 59 | [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, |
| .. | .. |
| 56 | 61 | [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, |
| 57 | 62 | [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, |
| 58 | 63 | }; |
| 59 | | -EXPORT_SYMBOL(__cachemode2pte_tbl); |
| 60 | 64 | |
| 61 | | -uint8_t __pte2cachemode_tbl[8] = { |
| 65 | +unsigned long cachemode2protval(enum page_cache_mode pcm) |
| 66 | +{ |
| 67 | + if (likely(pcm == 0)) |
| 68 | + return 0; |
| 69 | + return __cachemode2pte_tbl[pcm]; |
| 70 | +} |
| 71 | +EXPORT_SYMBOL(cachemode2protval); |
| 72 | + |
| 73 | +static uint8_t __pte2cachemode_tbl[8] = { |
| 62 | 74 | [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, |
| 63 | 75 | [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, |
| 64 | 76 | [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, |
| .. | .. |
| 68 | 80 | [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, |
| 69 | 81 | [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, |
| 70 | 82 | }; |
| 71 | | -EXPORT_SYMBOL(__pte2cachemode_tbl); |
| 83 | + |
| 84 | +/* |
| 85 | + * Check that the write-protect PAT entry is set for write-protect. |
| 86 | + * To do this without making assumptions how PAT has been set up (Xen has |
| 87 | + * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache |
| 88 | + * mode via the __cachemode2pte_tbl[] into protection bits (those protection |
| 89 | + * bits will select a cache mode of WP or better), and then translate the |
| 90 | + * protection bits back into the cache mode using __pte2cm_idx() and the |
| 91 | + * __pte2cachemode_tbl[] array. This will return the really used cache mode. |
| 92 | + */ |
| 93 | +bool x86_has_pat_wp(void) |
| 94 | +{ |
| 95 | + uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; |
| 96 | + |
| 97 | + return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; |
| 98 | +} |
| 99 | + |
| 100 | +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) |
| 101 | +{ |
| 102 | + unsigned long masked; |
| 103 | + |
| 104 | + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; |
| 105 | + if (likely(masked == 0)) |
| 106 | + return 0; |
| 107 | + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; |
| 108 | +} |
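
The pair of tables above translate in opposite directions: __cachemode2pte_tbl[] turns a cache mode into PTE protection bits, while __pte2cachemode_tbl[] maps the PWT/PCD/PAT bits, compressed by __pte2cm_idx() into a 3-bit index, back to a cache mode. The standalone sketch below models that round trip in userspace, using the standard x86 PTE bit positions (PWT = bit 3, PCD = bit 4, PAT = bit 7) and only the table entries visible in this hunk; all names in it are made up for illustration. It also shows why x86_has_pat_wp() reports false until the tables are rewritten for a PAT layout that actually contains a WP entry: with the defaults, WP encodes as PCD and decodes back as UC-.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Standard x86 PTE cache-control bits: PWT, PCD and PAT. */
#define PTE_PWT (1u << 3)
#define PTE_PCD (1u << 4)
#define PTE_PAT (1u << 7)

enum cm { CM_WB, CM_WC, CM_UC_MINUS, CM_UC, CM_WT, CM_WP, CM_NUM = 8 };

/* Compress the PWT/PCD/PAT bits (3, 4, 7) into index bits 0, 1, 2. */
static unsigned int pte2cm_idx(unsigned int prot)
{
	return ((prot >> 3) & 1) | ((prot >> 3) & 2) | ((prot >> 5) & 4);
}

/* Mirrors the default __cachemode2pte_tbl[] entries visible above. */
static const uint16_t cm2pte[CM_NUM] = {
	[CM_WB]       = 0,
	[CM_WC]       = PTE_PCD,
	[CM_UC_MINUS] = PTE_PCD,
	[CM_WT]       = PTE_PCD,
	[CM_WP]       = PTE_PCD,
};

/* Mirrors the __pte2cachemode_tbl[] entries needed for this demo. */
static const uint8_t pte2cm[8] = {
	[0] = CM_WB,
	[1] = CM_UC_MINUS,	/* PWT */
	[2] = CM_UC_MINUS,	/* PCD */
};

/* Same round trip as x86_has_pat_wp(). */
static bool has_pat_wp(void)
{
	return pte2cm[pte2cm_idx(cm2pte[CM_WP])] == CM_WP;
}

int main(void)
{
	/* WP encodes as PCD, which decodes back to UC- with the defaults. */
	printf("WP -> prot 0x%x -> cache mode %u\n",
	       (unsigned int)cm2pte[CM_WP],
	       (unsigned int)pte2cm[pte2cm_idx(cm2pte[CM_WP])]);
	printf("has_pat_wp() = %d\n", has_pat_wp());	/* prints 0 */
	return 0;
}
```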
| 72 | 109 | |
| 73 | 110 | static unsigned long __initdata pgt_buf_start; |
| 74 | 111 | static unsigned long __initdata pgt_buf_end; |
| .. | .. |
| 77 | 114 | static unsigned long min_pfn_mapped; |
| 78 | 115 | |
| 79 | 116 | static bool __initdata can_use_brk_pgt = true; |
| 117 | + |
| 118 | +/* |
| 119 | + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via |
| 120 | + * CONFIG_ZONE_DMA32. |
| 121 | + */ |
| 122 | +static bool disable_dma32 __ro_after_init; |
| 80 | 123 | |
| 81 | 124 | /* |
| 82 | 125 | * Pages returned are already directly mapped. |
| .. | .. |
| 169 | 212 | |
| 170 | 213 | static int page_size_mask; |
| 171 | 214 | |
| 215 | +/* |
| 216 | + * Save some of cr4 feature set we're using (e.g. Pentium 4MB |
| 217 | + * enable and PPro Global page enable), so that any CPU's that boot |
| 218 | + * up after us can get the correct flags. Invoked on the boot CPU. |
| 219 | + */ |
| 220 | +static inline void cr4_set_bits_and_update_boot(unsigned long mask) |
| 221 | +{ |
| 222 | + mmu_cr4_features |= mask; |
| 223 | + if (trampoline_cr4_features) |
| 224 | + *trampoline_cr4_features = mmu_cr4_features; |
| 225 | + cr4_set_bits(mask); |
| 226 | +} |
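
This helper runs once on the boot CPU; secondary CPUs and the resume path then pick the same bits up from mmu_cr4_features via the trampoline copy. As a hedged illustration of the call pattern (the function name is invented; the real caller is probe_page_size_mask() just below, which enables PSE and PGE this way):

```c
/*
 * Illustrative sketch, not part of this hunk: enable large pages and
 * global pages on the boot CPU so later CPUs inherit the same CR4 bits.
 */
static void __init example_enable_cr4_features(void)
{
	if (boot_cpu_has(X86_FEATURE_PSE))
		cr4_set_bits_and_update_boot(X86_CR4_PSE);

	if (boot_cpu_has(X86_FEATURE_PGE)) {
		cr4_set_bits_and_update_boot(X86_CR4_PGE);
		__supported_pte_mask |= _PAGE_GLOBAL;
	}
}
```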
| 227 | + |
| 172 | 228 | static void __init probe_page_size_mask(void) |
| 173 | 229 | { |
| 174 | 230 | /* |
| .. | .. |
| 207 | 263 | } |
| 208 | 264 | } |
| 209 | 265 | |
| 266 | +#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \ |
| 267 | + .family = 6, \ |
| 268 | + .model = _model, \ |
| 269 | + } |
| 270 | +/* |
| 271 | + * INVLPG may not properly flush Global entries |
| 272 | + * on these CPUs when PCIDs are enabled. |
| 273 | + */ |
| 274 | +static const struct x86_cpu_id invlpg_miss_ids[] = { |
| 275 | + INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), |
| 276 | + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), |
| 277 | + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ), |
| 278 | + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), |
| 279 | + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), |
| 280 | + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), |
| 281 | + {} |
| 282 | +}; |
| 283 | + |
| 210 | 284 | static void setup_pcid(void) |
| 211 | 285 | { |
| 212 | 286 | if (!IS_ENABLED(CONFIG_X86_64)) |
| .. | .. |
| 214 | 288 | |
| 215 | 289 | if (!boot_cpu_has(X86_FEATURE_PCID)) |
| 216 | 290 | return; |
| 291 | + |
| 292 | + if (x86_match_cpu(invlpg_miss_ids)) { |
| 293 | + pr_info("Incomplete global flushes, disabling PCID"); |
| 294 | + setup_clear_cpu_cap(X86_FEATURE_PCID); |
| 295 | + return; |
| 296 | + } |
| 217 | 297 | |
| 218 | 298 | if (boot_cpu_has(X86_FEATURE_PGE)) { |
| 219 | 299 | /* |
| .. | .. |
| 464 | 544 | * the physical memory. To access them they are temporarily mapped. |
| 465 | 545 | */ |
| 466 | 546 | unsigned long __ref init_memory_mapping(unsigned long start, |
| 467 | | - unsigned long end) |
| 547 | + unsigned long end, pgprot_t prot) |
| 468 | 548 | { |
| 469 | 549 | struct map_range mr[NR_RANGE_MR]; |
| 470 | 550 | unsigned long ret = 0; |
| .. | .. |
| 478 | 558 | |
| 479 | 559 | for (i = 0; i < nr_range; i++) |
| 480 | 560 | ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, |
| 481 | | - mr[i].page_size_mask); |
| 561 | + mr[i].page_size_mask, |
| 562 | + prot); |
| 482 | 563 | |
| 483 | 564 | add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); |
| 484 | 565 | |
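
Every boot-time caller in this file passes PAGE_KERNEL for the new prot argument, so the early direct mapping is unchanged; the point of threading pgprot_t through is to let other callers (memory hotplug, for example) create the direct map for a range with a protection of their choosing. A rough sketch of such a caller, with an invented name and signature:

```c
/*
 * Sketch only, not part of this hunk: map a newly added physical range into
 * the direct map with a caller-chosen protection instead of PAGE_KERNEL.
 */
static void __ref example_map_new_range(u64 start, u64 size, pgprot_t prot)
{
	init_memory_mapping(start, start + size, prot);
}
```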
| .. | .. |
| 518 | 599 | */ |
| 519 | 600 | can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= |
| 520 | 601 | min(end, (u64)pgt_buf_top<<PAGE_SHIFT); |
| 521 | | - init_memory_mapping(start, end); |
| 602 | + init_memory_mapping(start, end, PAGE_KERNEL); |
| 522 | 603 | mapped_ram_size += end - start; |
| 523 | 604 | can_use_brk_pgt = true; |
| 524 | 605 | } |
| .. | .. |
| 643 | 724 | } |
| 644 | 725 | } |
| 645 | 726 | |
| 727 | +/* |
| 728 | + * The real mode trampoline, which is required for bootstrapping CPUs |
| 729 | + * occupies only a small area under the low 1MB. See reserve_real_mode() |
| 730 | + * for details. |
| 731 | + * |
| 732 | + * If KASLR is disabled the first PGD entry of the direct mapping is copied |
| 733 | + * to map the real mode trampoline. |
| 734 | + * |
| 735 | + * If KASLR is enabled, copy only the PUD which covers the low 1MB |
| 736 | + * area. This limits the randomization granularity to 1GB for both 4-level |
| 737 | + * and 5-level paging. |
| 738 | + */ |
| 739 | +static void __init init_trampoline(void) |
| 740 | +{ |
| 741 | +#ifdef CONFIG_X86_64 |
| 742 | + if (!kaslr_memory_enabled()) |
| 743 | + trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; |
| 744 | + else |
| 745 | + init_trampoline_kaslr(); |
| 746 | +#endif |
| 747 | +} |
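
The 1GB figure in the comment follows directly from the paging geometry: with 9 index bits per level and 4KB pages, a PUD entry maps 2^30 bytes regardless of whether 4-level or 5-level paging is in use, whereas a PGD entry maps 2^39 bytes (4-level) or 2^48 bytes (5-level). Copying at PUD granularity therefore pins the alias of the low 1MB, and with it the direct-map randomization, to 1GB steps in both configurations. A trivial standalone check of those sizes:

```c
#include <stdio.h>

int main(void)
{
	/* x86-64 paging: 12-bit page offset, 9 index bits per table level. */
	unsigned long long pud  = 1ULL << (12 + 2 * 9);	/* PUD entry coverage */
	unsigned long long pgd4 = 1ULL << (12 + 3 * 9);	/* PGD entry, 4-level */
	unsigned long long pgd5 = 1ULL << (12 + 4 * 9);	/* PGD entry, 5-level */

	printf("PUD entry:          %llu GiB\n", pud >> 30);	/* 1   */
	printf("PGD entry, 4-level: %llu GiB\n", pgd4 >> 30);	/* 512 */
	printf("PGD entry, 5-level: %llu TiB\n", pgd5 >> 40);	/* 256 */
	return 0;
}
```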
| 748 | + |
| 646 | 749 | void __init init_mem_mapping(void) |
| 647 | 750 | { |
| 648 | 751 | unsigned long end; |
| .. | .. |
| 658 | 761 | #endif |
| 659 | 762 | |
| 660 | 763 | /* the ISA range is always mapped regardless of memory holes */ |
| 661 | | - init_memory_mapping(0, ISA_END_ADDRESS); |
| 764 | + init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL); |
| 662 | 765 | |
| 663 | 766 | /* Init the trampoline, possibly with KASLR memory offset */ |
| 664 | 767 | init_trampoline(); |
| .. | .. |
| 698 | 801 | x86_init.hyper.init_mem_mapping(); |
| 699 | 802 | |
| 700 | 803 | early_memtest(0, max_pfn_mapped << PAGE_SHIFT); |
| 804 | +} |
| 805 | + |
| 806 | +/* |
| 807 | + * Initialize an mm_struct to be used during poking and a pointer to be used |
| 808 | + * during patching. |
| 809 | + */ |
| 810 | +void __init poking_init(void) |
| 811 | +{ |
| 812 | + spinlock_t *ptl; |
| 813 | + pte_t *ptep; |
| 814 | + |
| 815 | + poking_mm = mm_alloc(); |
| 816 | + BUG_ON(!poking_mm); |
| 817 | + |
| 818 | + /* Xen PV guests need the PGD to be pinned. */ |
| 819 | + paravirt_arch_dup_mmap(NULL, poking_mm); |
| 820 | + |
| 821 | + /* |
| 822 | + * Randomize the poking address, but make sure that the following page |
| 823 | + * will be mapped at the same PMD. We need 2 pages, so find space for 3, |
| 824 | + * and adjust the address if the PMD ends after the first one. |
| 825 | + */ |
| 826 | + poking_addr = TASK_UNMAPPED_BASE; |
| 827 | + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) |
| 828 | + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % |
| 829 | + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); |
| 830 | + |
| 831 | + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) |
| 832 | + poking_addr += PAGE_SIZE; |
| 833 | + |
| 834 | + /* |
| 835 | + * We need to trigger the allocation of the page-tables that will be |
| 836 | + * needed for poking now. Later, poking may be performed in an atomic |
| 837 | + * section, which might cause allocation to fail. |
| 838 | + */ |
| 839 | + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); |
| 840 | + BUG_ON(!ptep); |
| 841 | + pte_unmap_unlock(ptep, ptl); |
| 701 | 842 | } |
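
The poking_addr fixup above is what keeps the two poking pages, poking_addr and poking_addr + PAGE_SIZE, under a single PMD: the pair can only straddle a boundary when the second page starts exactly on one, which is what ((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0 detects, and bumping the base by one page then moves both pages into the next PMD. A small userspace model of that invariant, assuming the usual x86-64 4KB pages and 2MB PMDs:

```c
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(2UL * 1024 * 1024)	/* 2 MiB */
#define PMD_MASK	(~(PMD_SIZE - 1))

/* Mirror of the adjustment done in poking_init(). */
static unsigned long fixup(unsigned long addr)
{
	if (((addr + PAGE_SIZE) & ~PMD_MASK) == 0)
		addr += PAGE_SIZE;
	return addr;
}

int main(void)
{
	/* Try every page-aligned offset within one PMD-sized window. */
	for (unsigned long a = 0; a < PMD_SIZE; a += PAGE_SIZE) {
		unsigned long p = fixup(0x100000000000UL + a);

		/* Both pages must share a PMD after the fixup. */
		assert((p & PMD_MASK) == ((p + PAGE_SIZE) & PMD_MASK));
	}
	printf("both poking pages always share a PMD\n");
	return 0;
}
```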
| 702 | 843 | |
| 703 | 844 | /* |
| .. | .. |
| 742 | 883 | return 1; |
| 743 | 884 | } |
| 744 | 885 | |
| 745 | | -void free_init_pages(char *what, unsigned long begin, unsigned long end) |
| 886 | +void free_init_pages(const char *what, unsigned long begin, unsigned long end) |
| 746 | 887 | { |
| 747 | 888 | unsigned long begin_aligned, end_aligned; |
| 748 | 889 | |
| .. | .. |
| 791 | 932 | * used for the kernel image only. free_init_pages() will do the |
| 792 | 933 | * right thing for either kind of address. |
| 793 | 934 | */ |
| 794 | | -void free_kernel_image_pages(void *begin, void *end) |
| 935 | +void free_kernel_image_pages(const char *what, void *begin, void *end) |
| 795 | 936 | { |
| 796 | 937 | unsigned long begin_ul = (unsigned long)begin; |
| 797 | 938 | unsigned long end_ul = (unsigned long)end; |
| 798 | 939 | unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; |
| 799 | 940 | |
| 800 | | - |
| 801 | | - free_init_pages("unused kernel image", begin_ul, end_ul); |
| 941 | + free_init_pages(what, begin_ul, end_ul); |
| 802 | 942 | |
| 803 | 943 | /* |
| 804 | 944 | * PTI maps some of the kernel into userspace. For performance, |
| .. | .. |
| 819 | 959 | set_memory_np_noalias(begin_ul, len_pages); |
| 820 | 960 | } |
| 821 | 961 | |
| 822 | | -void __weak mem_encrypt_free_decrypted_mem(void) { } |
| 823 | | - |
| 824 | 962 | void __ref free_initmem(void) |
| 825 | 963 | { |
| 826 | 964 | e820__reallocate_tables(); |
| 827 | 965 | |
| 828 | 966 | mem_encrypt_free_decrypted_mem(); |
| 829 | 967 | |
| 830 | | - free_kernel_image_pages(&__init_begin, &__init_end); |
| 968 | + free_kernel_image_pages("unused kernel image (initmem)", |
| 969 | + &__init_begin, &__init_end); |
| 831 | 970 | } |
| 832 | 971 | |
| 833 | 972 | #ifdef CONFIG_BLK_DEV_INITRD |
| .. | .. |
| 903 | 1042 | max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); |
| 904 | 1043 | #endif |
| 905 | 1044 | #ifdef CONFIG_ZONE_DMA32 |
| 906 | | - max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); |
| 1045 | + max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn); |
| 907 | 1046 | #endif |
| 908 | 1047 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; |
| 909 | 1048 | #ifdef CONFIG_HIGHMEM |
| 910 | 1049 | max_zone_pfns[ZONE_HIGHMEM] = max_pfn; |
| 911 | 1050 | #endif |
| 912 | 1051 | |
| 913 | | - free_area_init_nodes(max_zone_pfns); |
| 1052 | + free_area_init(max_zone_pfns); |
| 914 | 1053 | } |
| 1054 | + |
| 1055 | +static int __init early_disable_dma32(char *buf) |
| 1056 | +{ |
| 1057 | + if (!buf) |
| 1058 | + return -EINVAL; |
| 1059 | + |
| 1060 | + if (!strcmp(buf, "on")) |
| 1061 | + disable_dma32 = true; |
| 1062 | + |
| 1063 | + return 0; |
| 1064 | +} |
| 1065 | +early_param("disable_dma32", early_disable_dma32); |
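
Booting with disable_dma32=on now empties ZONE_DMA32, so the memory below 4GB is managed in ZONE_NORMAL instead. The clamping in zone_sizes_init() is plain pfn arithmetic; below is a standalone model of it, assuming the usual x86 limits (16MB for ZONE_DMA, 4GB for ZONE_DMA32, 4KB pages) and a made-up amount of RAM:

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA_PFN	((16ULL << 20) >> PAGE_SHIFT)	/* 16 MiB */
#define MAX_DMA32_PFN	((4ULL << 30) >> PAGE_SHIFT)	/*  4 GiB */

static unsigned long long min_ull(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long long max_low_pfn = (32ULL << 30) >> PAGE_SHIFT; /* 32 GiB box */
	bool disable_dma32 = false;	/* flip to true for disable_dma32=on */

	unsigned long long dma_end   = min_ull(MAX_DMA_PFN, max_low_pfn);
	unsigned long long dma32_end = disable_dma32 ? 0
						     : min_ull(MAX_DMA32_PFN, max_low_pfn);

	printf("ZONE_DMA    ends at pfn %#llx\n", dma_end);
	printf("ZONE_DMA32  ends at pfn %#llx (0 = zone empty)\n", dma32_end);
	printf("ZONE_NORMAL ends at pfn %#llx\n", max_low_pfn);
	return 0;
}
```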
| 915 | 1066 | |
| 916 | 1067 | __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { |
| 917 | 1068 | .loaded_mm = &init_mm, |
| 918 | 1069 | .next_asid = 1, |
| 919 | 1070 | .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ |
| 920 | 1071 | }; |
| 921 | | -EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); |
| 922 | 1072 | |
| 923 | 1073 | void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) |
| 924 | 1074 | { |