```diff
@@ -3,10 +3,10 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>	/* for max_low_pfn */
 #include <linux/swapfile.h>
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
+#include <linux/sched/task.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -24,6 +24,8 @@
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
 #include <asm/pti.h>
+#include <asm/text-patching.h>
+#include <asm/memtype.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -48,7 +50,7 @@
  * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
  * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
 	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
@@ -56,9 +58,16 @@
 	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
-EXPORT_SYMBOL(__cachemode2pte_tbl);
 
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+	if (likely(pcm == 0))
+		return 0;
+	return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
 	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -68,7 +77,32 @@
 	[__pte2cm_idx( 0        | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/*
+ * Check that the write-protect PAT entry is set for write-protect.
+ * To do this without making assumptions how PAT has been set up (Xen has
+ * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
+ * mode via the __cachemode2pte_tbl[] into protection bits (those protection
+ * bits will select a cache mode of WP or better), and then translate the
+ * protection bits back into the cache mode using __pte2cm_idx() and the
+ * __pte2cachemode_tbl[] array. This will return the really used cache mode.
+ */
+bool x86_has_pat_wp(void)
+{
+	uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
+
+	return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+	unsigned long masked;
+
+	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+	if (likely(masked == 0))
+		return 0;
+	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
 
 static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
```
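For readers unfamiliar with the two lookup tables above: `__pte2cm_idx()` packs the PTE's PWT, PCD and PAT bits into a 0-7 index (bit positions 0, 1, 2, as the quoted comment says), and `x86_has_pat_wp()` simply round-trips `_PAGE_CACHE_MODE_WP` through both tables to see whether WP survives. The stand-alone user-space sketch below mimics that round trip. The macro names, bit positions, enum constants and helper names (`PWT`, `PCD`, `PAT`, `pte2cm_idx()`, `has_pat_wp()`) are illustrative stand-ins rather than the kernel's definitions, and only the table entries visible in the hunk are reproduced.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for _PAGE_PWT/_PAGE_PCD/_PAGE_PAT; the real
 * definitions live in asm/pgtable_types.h. */
#define PWT (1u << 3)
#define PCD (1u << 4)
#define PAT (1u << 7)

enum cache_mode { WB, WC, UC_MINUS, UC, WT, WP, CM_NUM = 8 };

/* Pack the three caching bits into a 0..7 index: PWT -> bit 0,
 * PCD -> bit 1, PAT -> bit 2, as described in the quoted comment. */
static unsigned int pte2cm_idx(unsigned int prot)
{
	return ((prot & PWT) ? 1 : 0) |
	       ((prot & PCD) ? 2 : 0) |
	       ((prot & PAT) ? 4 : 0);
}

/* Only the table entries visible in the hunk are reproduced. */
static const uint16_t cachemode2pte[CM_NUM] = {
	[WB]       = 0,
	[WC]       = PCD,
	[UC_MINUS] = PCD,
	[WT]       = PCD,
	[WP]       = PCD,	/* no dedicated WP encoding by default */
};

static uint8_t pte2cachemode[8];

/* Same round trip as x86_has_pat_wp(): cache mode -> protection bits ->
 * cache mode; true only if WP survives the trip. */
static bool has_pat_wp(void)
{
	return pte2cachemode[pte2cm_idx(cachemode2pte[WP])] == WP;
}

int main(void)
{
	pte2cachemode[pte2cm_idx(0)]         = WB;
	pte2cachemode[pte2cm_idx(PWT)]       = UC_MINUS;
	pte2cachemode[pte2cm_idx(PCD)]       = UC_MINUS;
	pte2cachemode[pte2cm_idx(PWT | PCD)] = UC;

	/* WP maps to PCD, which reads back as UC-, so this prints "no". */
	printf("has_pat_wp: %s\n", has_pat_wp() ? "yes" : "no");
	return 0;
}
```

With the default tables shown in the hunk, WP is encoded as PCD and reads back as UC-, so the check reports false until the tables are rewritten to match the real PAT configuration, which is the job of the update_cache_mode_entry() helper whose signature closes this excerpt.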
```diff
@@ -77,6 +111,12 @@
 static unsigned long min_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
 
 /*
  * Pages returned are already directly mapped.
@@ -168,6 +208,19 @@
 };
 
 static int page_size_mask;
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
 
 static void __init probe_page_size_mask(void)
 {
@@ -464,7 +517,7 @@
  * the physical memory. To access them they are temporarily mapped.
  */
 unsigned long __ref init_memory_mapping(unsigned long start,
-					unsigned long end)
+					unsigned long end, pgprot_t prot)
 {
 	struct map_range mr[NR_RANGE_MR];
 	unsigned long ret = 0;
@@ -478,7 +531,8 @@
 
 	for (i = 0; i < nr_range; i++)
 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-						   mr[i].page_size_mask);
+						   mr[i].page_size_mask,
+						   prot);
 
 	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -518,7 +572,7 @@
 		 */
 		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
				  min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-		init_memory_mapping(start, end);
+		init_memory_mapping(start, end, PAGE_KERNEL);
 		mapped_ram_size += end - start;
 		can_use_brk_pgt = true;
 	}
@@ -643,6 +697,28 @@
 	}
 }
 
+/*
+ * The real mode trampoline, which is required for bootstrapping CPUs
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
+ */
+static void __init init_trampoline(void)
+{
+#ifdef CONFIG_X86_64
+	if (!kaslr_memory_enabled())
+		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+	else
+		init_trampoline_kaslr();
+#endif
+}
+
 void __init init_mem_mapping(void)
 {
 	unsigned long end;
```
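The "1GB" figure in the new init_trampoline() comment is simply the span of a single PUD entry, and the PUD level exists in both 4-level and 5-level paging, which is why the limit is the same for both. A back-of-envelope check, assuming the usual 4 KiB pages and 9 page-table index bits per level (the constants here are plain numbers, not the kernel's macros):

```c
#include <stdio.h>

int main(void)
{
	/* One PUD entry maps 2^(12 + 9 + 9) bytes: a 4 KiB page times
	 * 512 PTEs per page table times 512 PMD entries per PMD table. */
	unsigned long long pud_span = 1ULL << (12 + 9 + 9);

	printf("PUD entry spans %llu MiB\n", pud_span >> 20);	/* 1024 MiB = 1 GiB */
	return 0;
}
```

Copying a whole PUD to cover the low 1MB therefore ties the randomized direct-mapping base to PUD-sized (1 GiB) steps, which appears to be the granularity loss the comment refers to.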
```diff
@@ -658,7 +734,7 @@
 #endif
 
 	/* the ISA range is always mapped regardless of memory holes */
-	init_memory_mapping(0, ISA_END_ADDRESS);
+	init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
 
 	/* Init the trampoline, possibly with KASLR memory offset */
 	init_trampoline();
@@ -698,6 +774,41 @@
 	x86_init.hyper.init_mem_mapping();
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * Initialize an mm_struct to be used during poking and a pointer to be used
+ * during patching.
+ */
+void __init poking_init(void)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	poking_mm = copy_init_mm();
+	BUG_ON(!poking_mm);
+
+	/*
+	 * Randomize the poking address, but make sure that the following page
+	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
+	 * and adjust the address if the PMD ends after the first one.
+	 */
+	poking_addr = TASK_UNMAPPED_BASE;
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
+
+	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+		poking_addr += PAGE_SIZE;
+
+	/*
+	 * We need to trigger the allocation of the page-tables that will be
+	 * needed for poking now. Later, poking may be performed in an atomic
+	 * section, which might cause allocation to fail.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+	BUG_ON(!ptep);
+	pte_unmap_unlock(ptep, ptl);
 }
 
 /*
```
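The test `((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0` in poking_init() asks whether the page after the first poking page would start a new PMD; if so the address is bumped by one page so that both pages used for patching sit under a single PMD, with the third reserved page providing the slack. A small user-space sketch of the same arithmetic, using 4 KiB / 2 MiB constants as stand-ins for PAGE_SIZE and PMD_SIZE and a hypothetical `fixup()` helper:

```c
#include <stdio.h>

#define PAGE_SIZE 0x1000UL		/* 4 KiB, illustrative stand-in */
#define PMD_SIZE  0x200000UL		/* 2 MiB, illustrative stand-in */
#define PMD_MASK  (~(PMD_SIZE - 1))

/* Mirror the adjustment in poking_init(): if the page following 'addr'
 * begins a new PMD, move 'addr' forward so both poking pages share one
 * PMD (and hence one PTE page). */
static unsigned long fixup(unsigned long addr)
{
	if (((addr + PAGE_SIZE) & ~PMD_MASK) == 0)
		addr += PAGE_SIZE;
	return addr;
}

int main(void)
{
	/* Next page would be 0x7f200000, a PMD boundary: bumped by a page. */
	printf("%#lx -> %#lx\n", 0x7f1ff000UL, fixup(0x7f1ff000UL));
	/* Both pages already fall inside the same 2 MiB PMD: unchanged. */
	printf("%#lx -> %#lx\n", 0x7f100000UL, fixup(0x7f100000UL));
	return 0;
}
```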
```diff
@@ -742,7 +853,7 @@
 	return 1;
 }
 
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+void free_init_pages(const char *what, unsigned long begin, unsigned long end)
 {
 	unsigned long begin_aligned, end_aligned;
 
@@ -791,14 +902,13 @@
  * used for the kernel image only. free_init_pages() will do the
  * right thing for either kind of address.
  */
-void free_kernel_image_pages(void *begin, void *end)
+void free_kernel_image_pages(const char *what, void *begin, void *end)
 {
 	unsigned long begin_ul = (unsigned long)begin;
 	unsigned long end_ul = (unsigned long)end;
 	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
 
-
-	free_init_pages("unused kernel image", begin_ul, end_ul);
+	free_init_pages(what, begin_ul, end_ul);
 
 	/*
 	 * PTI maps some of the kernel into userspace. For performance,
@@ -819,15 +929,14 @@
 	set_memory_np_noalias(begin_ul, len_pages);
 }
 
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
 void __ref free_initmem(void)
 {
 	e820__reallocate_tables();
 
 	mem_encrypt_free_decrypted_mem();
 
-	free_kernel_image_pages(&__init_begin, &__init_end);
+	free_kernel_image_pages("unused kernel image (initmem)",
+				&__init_begin, &__init_end);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -903,22 +1012,33 @@
 	max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
 #endif
 #ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
+	max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
 #endif
 
-	free_area_init_nodes(max_zone_pfns);
+	free_area_init(max_zone_pfns);
 }
+
+static int __init early_disable_dma32(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		disable_dma32 = true;
+
+	return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
 
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
	.next_asid = 1,
	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {
```
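For completeness, the switch registered via early_param() above is driven from the kernel command line: per the handler, only the literal value "on" enables it (other values are ignored, and a missing value is rejected with -EINVAL), in which case max_zone_pfns[ZONE_DMA32] is forced to 0 and ZONE_DMA32 ends up empty. A usage example (how the option is appended depends on your bootloader setup):

```
# kernel command line, e.g. appended via the bootloader configuration
disable_dma32=on
```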