2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/arch/x86/mm/init.c
@@ -3,12 +3,14 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>	/* for max_low_pfn */
 #include <linux/swapfile.h>
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 
 #include <asm/set_memory.h>
+#include <asm/cpu_device_id.h>
 #include <asm/e820/api.h>
 #include <asm/init.h>
 #include <asm/page.h>
@@ -24,6 +26,9 @@
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
 #include <asm/pti.h>
+#include <asm/text-patching.h>
+#include <asm/memtype.h>
+#include <asm/paravirt.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -48,7 +53,7 @@
  * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
  * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
 	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
@@ -56,9 +61,16 @@
 	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
-EXPORT_SYMBOL(__cachemode2pte_tbl);
 
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+	if (likely(pcm == 0))
+		return 0;
+	return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
 	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -68,7 +80,32 @@
 	[__pte2cm_idx( 0        | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/*
+ * Check that the write-protect PAT entry is set for write-protect.
+ * To do this without making assumptions about how PAT has been set up (Xen
+ * uses a different layout than the kernel), translate _PAGE_CACHE_MODE_WP
+ * via __cachemode2pte_tbl[] into protection bits (those protection bits
+ * will select a cache mode of WP or better), and then translate the
+ * protection bits back into a cache mode using __pte2cm_idx() and the
+ * __pte2cachemode_tbl[] array. This returns the cache mode actually in use.
+ */
+bool x86_has_pat_wp(void)
+{
+	uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
+
+	return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+	unsigned long masked;
+
+	masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+	if (likely(masked == 0))
+		return 0;
+	return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
 
 static unsigned long __initdata pgt_buf_start;
 static unsigned long __initdata pgt_buf_end;
@@ -77,6 +114,12 @@
 static unsigned long min_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Provide a run-time means of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
 
 /*
  * Pages returned are already directly mapped.
@@ -169,6 +212,19 @@
 
 static int page_size_mask;
 
+/*
+ * Save some of the cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPUs that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+	mmu_cr4_features |= mask;
+	if (trampoline_cr4_features)
+		*trampoline_cr4_features = mmu_cr4_features;
+	cr4_set_bits(mask);
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -207,6 +263,24 @@
 	}
 }
 
+#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL,	\
+			      .family = 6,			\
+			      .model  = _model,			\
+			    }
+/*
+ * INVLPG may not properly flush Global entries
+ * on these CPUs when PCIDs are enabled.
+ */
+static const struct x86_cpu_id invlpg_miss_ids[] = {
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE   ),
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
+	INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE  ),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
+	INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+	{}
+};
+
 static void setup_pcid(void)
 {
 	if (!IS_ENABLED(CONFIG_X86_64))
@@ -214,6 +288,12 @@
 
 	if (!boot_cpu_has(X86_FEATURE_PCID))
 		return;
+
+	if (x86_match_cpu(invlpg_miss_ids)) {
+		pr_info("Incomplete global flushes, disabling PCID");
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
+		return;
+	}
 
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		/*
@@ -464,7 +544,7 @@
  * the physical memory. To access them they are temporarily mapped.
  */
 unsigned long __ref init_memory_mapping(unsigned long start,
-					unsigned long end)
+					unsigned long end, pgprot_t prot)
 {
 	struct map_range mr[NR_RANGE_MR];
 	unsigned long ret = 0;
@@ -478,7 +558,8 @@
 
 	for (i = 0; i < nr_range; i++)
 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-						   mr[i].page_size_mask);
+						   mr[i].page_size_mask,
+						   prot);
 
 	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -518,7 +599,7 @@
 	 */
 	can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-	init_memory_mapping(start, end);
+	init_memory_mapping(start, end, PAGE_KERNEL);
 	mapped_ram_size += end - start;
 	can_use_brk_pgt = true;
 }
@@ -643,6 +724,28 @@
 	}
 }
 
+/*
+ * The real mode trampoline, which is required for bootstrapping CPUs,
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled, the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
+ */
+static void __init init_trampoline(void)
+{
+#ifdef CONFIG_X86_64
+	if (!kaslr_memory_enabled())
+		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+	else
+		init_trampoline_kaslr();
+#endif
+}
+
 void __init init_mem_mapping(void)
 {
 	unsigned long end;
@@ -658,7 +761,7 @@
 #endif
 
 	/* the ISA range is always mapped regardless of memory holes */
-	init_memory_mapping(0, ISA_END_ADDRESS);
+	init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
 
 	/* Init the trampoline, possibly with KASLR memory offset */
 	init_trampoline();
@@ -698,6 +801,44 @@
 	x86_init.hyper.init_mem_mapping();
 
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * Initialize an mm_struct to be used during poking and a pointer to be used
+ * during patching.
+ */
+void __init poking_init(void)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	poking_mm = mm_alloc();
+	BUG_ON(!poking_mm);
+
+	/* Xen PV guests need the PGD to be pinned. */
+	paravirt_arch_dup_mmap(NULL, poking_mm);
+
+	/*
+	 * Randomize the poking address, but make sure that the following page
+	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
+	 * and adjust the address if the PMD ends after the first one.
+	 */
+	poking_addr = TASK_UNMAPPED_BASE;
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
+			(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
+
+	if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
+		poking_addr += PAGE_SIZE;
+
+	/*
+	 * We need to trigger the allocation of the page-tables that will be
+	 * needed for poking now. Later, poking may be performed in an atomic
+	 * section, which might cause allocation to fail.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+	BUG_ON(!ptep);
+	pte_unmap_unlock(ptep, ptl);
 }
 
 /*
@@ -742,7 +883,7 @@
 	return 1;
 }
 
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+void free_init_pages(const char *what, unsigned long begin, unsigned long end)
 {
 	unsigned long begin_aligned, end_aligned;
 
@@ -791,14 +932,13 @@
  * used for the kernel image only. free_init_pages() will do the
  * right thing for either kind of address.
  */
-void free_kernel_image_pages(void *begin, void *end)
+void free_kernel_image_pages(const char *what, void *begin, void *end)
 {
 	unsigned long begin_ul = (unsigned long)begin;
 	unsigned long end_ul = (unsigned long)end;
 	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
 
-
-	free_init_pages("unused kernel image", begin_ul, end_ul);
+	free_init_pages(what, begin_ul, end_ul);
 
 	/*
 	 * PTI maps some of the kernel into userspace. For performance,
@@ -819,15 +959,14 @@
 	set_memory_np_noalias(begin_ul, len_pages);
 }
 
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
 void __ref free_initmem(void)
 {
 	e820__reallocate_tables();
 
 	mem_encrypt_free_decrypted_mem();
 
-	free_kernel_image_pages(&__init_begin, &__init_end);
+	free_kernel_image_pages("unused kernel image (initmem)",
+				&__init_begin, &__init_end);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -903,22 +1042,33 @@
 	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
 #endif
 #ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
+	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
 #endif
 	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
 #endif
 
-	free_area_init_nodes(max_zone_pfns);
+	free_area_init(max_zone_pfns);
 }
+
+static int __init early_disable_dma32(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		disable_dma32 = true;
+
+	return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
 
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
 {