2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/kernel/head_64.S
@@ -13,8 +13,8 @@
 #include <linux/linkage.h>
 #include <linux/threads.h>
 #include <linux/init.h>
+#include <linux/pgtable.h>
 #include <asm/segment.h>
-#include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
@@ -26,20 +26,19 @@
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
 
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
 #else
-#define GET_CR2_INTO(reg) movq %cr2, reg
 #define INTERRUPT_RETURN iretq
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
 #endif
 
-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
  * because we need identity-mapped pages.
- *
  */
-
 #define l4_index(x)	(((x) >> 39) & 511)
 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 
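
Note: the two index macros above each pull a 9-bit page-table slot out of a virtual address: bits 39-47 select the level-4 (PGD) entry and, with 4-level paging (PUD_SHIFT = 30, PTRS_PER_PUD = 512), bits 30-38 select the PUD entry. A minimal userspace sketch of the arithmetic; the two constants are assumed here, since they live in kernel headers rather than in this file:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed x86-64 4-level paging values for the two kernel-header
     * constants the macro relies on. */
    #define PUD_SHIFT       30
    #define PTRS_PER_PUD    512

    #define l4_index(x)     (((x) >> 39) & 511)
    #define pud_index(x)    (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

    int main(void)
    {
        /* __START_KERNEL_map, the base of the kernel text mapping */
        uint64_t va = 0xffffffff80000000ULL;

        printf("l4_index  = %llu\n", (unsigned long long)l4_index(va));
        printf("pud_index = %llu\n", (unsigned long long)pud_index(va));
        return 0;
    }

Run against __START_KERNEL_map this prints 511 and 510, the same slot numbers quoted in the comments next to init_top_pgt and level3_kernel_pgt later in this diff.
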
@@ -51,8 +50,7 @@
 	.text
 	__HEAD
 	.code64
-	.globl startup_64
-startup_64:
+SYM_CODE_START_NOALIGN(startup_64)
 	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -75,6 +73,20 @@
 	/* Set up the stack for verify_cpu(), similar to initial_stack below */
 	leaq	(__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
 
+	leaq	_text(%rip), %rdi
+	pushq	%rsi
+	call	startup_64_setup_env
+	popq	%rsi
+
+	/* Now switch to __KERNEL_CS so IRET works reliably */
+	pushq	$__KERNEL_CS
+	leaq	.Lon_kernel_cs(%rip), %rax
+	pushq	%rax
+	lretq
+
+.Lon_kernel_cs:
+	UNWIND_HINT_EMPTY
+
 	/* Sanitize CPU configuration */
 	call	verify_cpu
 
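
Note: the lretq sequence added above is how CS gets reloaded; there is no direct mov to %cs, so the code builds a two-slot far-return frame by hand and lets lretq pop a new RIP and then a new CS. A hedged C sketch of that frame, top of stack first (the 0x10 selector value is an illustrative assumption, not taken from this diff):

    #include <stdint.h>
    #include <stdio.h>

    /* The 16 bytes lretq pops in 64-bit mode: each slot is 8 bytes,
     * RIP comes off first and CS second, which is why the assembly
     * pushes $__KERNEL_CS before pushing the target address. */
    struct far_return_frame {
        uint64_t rip;   /* popped first: .Lon_kernel_cs */
        uint64_t cs;    /* popped second: only the low 16 bits are used */
    };

    int main(void)
    {
        struct far_return_frame f = { .rip = 0, .cs = 0x10 /* assumed */ };
        printf("frame is %zu bytes; cs slot holds 0x%llx\n",
               sizeof(f), (unsigned long long)f.cs);
        return 0;
    }
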
@@ -92,7 +104,9 @@
 	/* Form the CR3 value being sure to include the CR3 modifier */
 	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
-ENTRY(secondary_startup_64)
+SYM_CODE_END(startup_64)
+
+SYM_CODE_START(secondary_startup_64)
 	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -110,6 +124,18 @@
 
 	/* Sanitize CPU configuration */
 	call verify_cpu
+
+	/*
+	 * The secondary_startup_64_no_verify entry point is only used by
+	 * SEV-ES guests. In those guests the call to verify_cpu() would cause
+	 * #VC exceptions which can not be handled at this stage of secondary
+	 * CPU bringup.
+	 *
+	 * All non SEV-ES systems, especially Intel systems, need to execute
+	 * verify_cpu() above to make sure NX is enabled.
+	 */
+SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
+	UNWIND_HINT_EMPTY
 
 	/*
 	 * Retrieve the modifier (SME encryption mask if SME is active) to be
@@ -135,6 +161,21 @@
 
 	/* Setup early boot stage 4-/5-level pagetables. */
 	addq	phys_base(%rip), %rax
+
+	/*
+	 * For SEV guests: Verify that the C-bit is correct. A malicious
+	 * hypervisor could lie about the C-bit position to perform a ROP
+	 * attack on the guest by writing to the unencrypted stack and wait for
+	 * the next RET instruction.
+	 * %rsi carries pointer to realmode data and is callee-clobbered. Save
+	 * and restore it.
+	 */
+	pushq	%rsi
+	movq	%rax, %rdi
+	call	sev_verify_cbit
+	popq	%rsi
+
+	/* Switch to new page-table */
 	movq	%rax, %cr3
 
 	/* Ensure I am executing from virtual addresses */
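
Note: sev_verify_cbit has to distrust the hypervisor because the C-bit position is itself reported through firmware/hypervisor-observable CPUID state. On AMD hardware the position comes from CPUID leaf 0x8000001f, EBX bits 5:0 (leaf layout per AMD's SEV documentation; an assumption, not stated in this diff). A userspace sketch that reads it:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x8000001f: AMD memory encryption capabilities.
         * EAX bit 1 = SEV supported, EBX[5:0] = C-bit position in a
         * page-table entry. */
        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
            puts("leaf 0x8000001f not supported");
            return 1;
        }
        printf("SEV supported: %s, C-bit position: %u\n",
               (eax & 2) ? "yes" : "no", ebx & 0x3f);
        return 0;
    }
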
@@ -143,33 +184,6 @@
 	jmp	*%rax
 1:
 	UNWIND_HINT_EMPTY
-
-	/* Check if nx is implemented */
-	movl	$0x80000001, %eax
-	cpuid
-	movl	%edx,%edi
-
-	/* Setup EFER (Extended Feature Enable Register) */
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	btsl	$_EFER_SCE, %eax	/* Enable System Call */
-	btl	$20,%edi		/* No Execute supported? */
-	jnc     1f
-	btsl	$_EFER_NX, %eax
-	btsq	$_PAGE_BIT_NX,early_pmd_flags(%rip)
-1:	wrmsr				/* Make changes effective */
-
-	/* Setup cr0 */
-	movl	$CR0_STATE, %eax
-	/* Make changes effective */
-	movq	%rax, %cr0
-
-	/* Setup a boot time stack */
-	movq initial_stack(%rip), %rsp
-
-	/* zero EFLAGS after setting rsp */
-	pushq	$0
-	popfq
 
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
@@ -195,15 +209,50 @@
 
 	/* Set up %gs.
 	 *
-	 * The base of %gs always points to the bottom of the irqstack
-	 * union. If the stack protector canary is enabled, it is
-	 * located at %gs:40. Note that, on SMP, the boot cpu uses
-	 * init data section till per cpu areas are set up.
+	 * The base of %gs always points to fixed_percpu_data. If the
+	 * stack protector canary is enabled, it is located at %gs:40.
+	 * Note that, on SMP, the boot cpu uses init data section until
+	 * the per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
 	movl	initial_gs(%rip),%eax
 	movl	initial_gs+4(%rip),%edx
 	wrmsr
+
+	/*
+	 * Setup a boot time stack - Any secondary CPU will have lost its stack
+	 * by now because the cr3-switch above unmaps the real-mode stack
+	 */
+	movq initial_stack(%rip), %rsp
+
+	/* Setup and Load IDT */
+	pushq	%rsi
+	call	early_setup_idt
+	popq	%rsi
+
+	/* Check if nx is implemented */
+	movl	$0x80000001, %eax
+	cpuid
+	movl	%edx,%edi
+
+	/* Setup EFER (Extended Feature Enable Register) */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_SCE, %eax	/* Enable System Call */
+	btl	$20,%edi		/* No Execute supported? */
+	jnc     1f
+	btsl	$_EFER_NX, %eax
+	btsq	$_PAGE_BIT_NX,early_pmd_flags(%rip)
+1:	wrmsr				/* Make changes effective */
+
+	/* Setup cr0 */
+	movl	$CR0_STATE, %eax
+	/* Make changes effective */
+	movq	%rax, %cr0
+
+	/* zero EFLAGS after setting rsp */
+	pushq	$0
+	popfq
 
 	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
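
Note: the NX probe moved in this hunk reads CPUID leaf 0x80000001 and tests EDX bit 20, exactly what the btl $20,%edi above does before setting _EFER_NX. The same probe from userspace, as a minimal sketch:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Extended leaf 0x80000001: EDX bit 20 is the NX/XD flag,
         * the same bit the assembly tests in %edi. */
        if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
            puts("extended CPUID leaf not supported");
            return 1;
        }
        printf("NX supported: %s\n", (edx & (1u << 20)) ? "yes" : "no");
        return 0;
    }
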
@@ -242,9 +291,10 @@
 	pushq	%rax		# target address in negative space
 	lretq
 .Lafter_lret:
-END(secondary_startup_64)
+SYM_CODE_END(secondary_startup_64)
 
 #include "verify_cpu.S"
+#include "sev_verify_cbit.S"
 
 #ifdef CONFIG_HOTPLUG_CPU
 /*
@@ -252,30 +302,66 @@
  * up already except stack. We just set up stack here. Then call
  * start_secondary() via .Ljump_to_C_code.
  */
-ENTRY(start_cpu0)
-	movq	initial_stack(%rip), %rsp
+SYM_CODE_START(start_cpu0)
 	UNWIND_HINT_EMPTY
+	movq	initial_stack(%rip), %rsp
 	jmp	.Ljump_to_C_code
-ENDPROC(start_cpu0)
+SYM_CODE_END(start_cpu0)
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during early boot when running on kernel
+ * addresses, but before the switch to the idt_table can be made.
+ * The early_idt_handler_array can't be used here because it calls into a lot
+ * of __init code and this handler is also used during CPU offlining/onlining.
+ * Therefore this handler ends up in the .text section so that it stays around
+ * when .init.text is freed.
+ */
+SYM_CODE_START_NOALIGN(vc_boot_ghcb)
+	UNWIND_HINT_IRET_REGS offset=8
+
+	ANNOTATE_UNRET_END
+
+	/* Build pt_regs */
+	PUSH_AND_CLEAR_REGS
+
+	/* Call C handler */
+	movq	%rsp, %rdi
+	movq	ORIG_RAX(%rsp), %rsi
+	movq	initial_vc_handler(%rip), %rax
+	ANNOTATE_RETPOLINE_SAFE
+	call	*%rax
+
+	/* Unwind pt_regs */
+	POP_REGS
+
+	/* Remove Error Code */
+	addq	$8, %rsp
+
+	/* Pure iret required here - don't use INTERRUPT_RETURN */
+	iretq
+SYM_CODE_END(vc_boot_ghcb)
 #endif
 
 /* Both SMP bootup and ACPI suspend change these variables */
 __REFDATA
 	.balign	8
-	GLOBAL(initial_code)
-	.quad	x86_64_start_kernel
-	GLOBAL(initial_gs)
-	.quad	INIT_PER_CPU_VAR(irq_stack_union)
-	GLOBAL(initial_stack)
-	/*
-	 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
-	 * unwinder reliably detect the end of the stack.
-	 */
-	.quad	init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
+SYM_DATA(initial_code,	.quad x86_64_start_kernel)
+SYM_DATA(initial_gs,	.quad INIT_PER_CPU_VAR(fixed_percpu_data))
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb)
+#endif
+
+/*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * reliably detect the end of the stack.
+ */
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
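
Note: initial_stack deliberately stops SIZEOF_PTREGS short of the top of the thread stack, so the unwinder can treat "top minus one pt_regs" as the end-of-stack marker on every task stack. A sketch of the arithmetic; THREAD_SIZE and the pt_regs size are config-dependent, and the values below are illustrative assumptions:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed example values: 16 KiB thread stack, 21 saved registers. */
    #define THREAD_SIZE     (16 * 1024)
    #define SIZEOF_PTREGS   (21 * 8)

    int main(void)
    {
        uint64_t init_thread_union = 0x100000;  /* arbitrary stack base */
        uint64_t initial_stack =
            init_thread_union + THREAD_SIZE - SIZEOF_PTREGS;

        /* Top of stack, the pointer actually loaded into %rsp, and
         * the fixed gap between them that marks the stack's end. */
        printf("stack top 0x%llx, initial_stack 0x%llx, gap %d bytes\n",
               (unsigned long long)(init_thread_union + THREAD_SIZE),
               (unsigned long long)initial_stack, SIZEOF_PTREGS);
        return 0;
    }
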
 	__FINITDATA
 
 	__INIT
-ENTRY(early_idt_handler_array)
+SYM_CODE_START(early_idt_handler_array)
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
 	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
@@ -291,9 +377,10 @@
 	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
 	UNWIND_HINT_IRET_REGS offset=16
-END(early_idt_handler_array)
+SYM_CODE_END(early_idt_handler_array)
 
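
Note: every exception vector gets a fixed-size stub; the .fill above pads each .rept iteration out to EARLY_IDT_HANDLER_SIZE bytes, so the early IDT installer can locate the stub for vector n at base + n * EARLY_IDT_HANDLER_SIZE instead of keeping a table of pointers. A sketch of that addressing; the stub size below is an assumed illustrative value, the real constant lives in a kernel header:

    #include <stdio.h>

    /* Assumed values; both live in kernel headers, not in this diff. */
    #define NUM_EXCEPTION_VECTORS   32
    #define EARLY_IDT_HANDLER_SIZE  9   /* push + jmp, padded with 0xcc */

    int main(void)
    {
        unsigned long base = 0x1000; /* stand-in for the array's address */

        /* Vector n maps to a computed offset, not a stored pointer. */
        for (int n = 0; n < NUM_EXCEPTION_VECTORS; n += 8)
            printf("vector %2d -> stub at 0x%lx\n",
                   n, base + n * EARLY_IDT_HANDLER_SIZE);
        return 0;
    }
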
-early_idt_handler_common:
+SYM_CODE_START_LOCAL(early_idt_handler_common)
+	ANNOTATE_UNRET_END
 	/*
 	 * The stack is the hardware frame, an error code or zero, and the
 	 * vector number.
@@ -321,31 +408,48 @@
 	pushq %r15			/* pt_regs->r15 */
 	UNWIND_HINT_REGS
 
-	cmpq $14,%rsi		/* Page fault? */
-	jnz 10f
-	GET_CR2_INTO(%rdi)	/* Can clobber any volatile register if pv */
-	call early_make_pgtable
-	andl %eax,%eax
-	jz 20f			/* All good */
-
-10:
 	movq %rsp,%rdi		/* RDI = pt_regs; RSI is already trapnr */
-	call early_fixup_exception
+	call do_early_exception
 
-20:
 	decl early_recursion_flag(%rip)
 	jmp restore_regs_and_return_to_kernel
-END(early_idt_handler_common)
+SYM_CODE_END(early_idt_handler_common)
 
-	__INITDATA
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during very early boot. The
+ * early_idt_handler_array can't be used because it returns via the
+ * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
+ *
+ * This handler will end up in the .init.text section and not be
+ * available to boot secondary CPUs.
+ */
+SYM_CODE_START_NOALIGN(vc_no_ghcb)
+	UNWIND_HINT_IRET_REGS offset=8
 
-	.balign 4
-GLOBAL(early_recursion_flag)
-	.long 0
+	ANNOTATE_UNRET_END
 
-#define NEXT_PAGE(name) \
-	.balign	PAGE_SIZE; \
-GLOBAL(name)
+	/* Build pt_regs */
+	PUSH_AND_CLEAR_REGS
+
+	/* Call C handler */
+	movq	%rsp, %rdi
+	movq	ORIG_RAX(%rsp), %rsi
+	call	do_vc_no_ghcb
+
+	/* Unwind pt_regs */
+	POP_REGS
+
+	/* Remove Error Code */
+	addq	$8, %rsp
+
+	/* Pure iret required here - don't use INTERRUPT_RETURN */
+	iretq
+SYM_CODE_END(vc_no_ghcb)
+#endif
+
+#define SYM_DATA_START_PAGE_ALIGNED(name)			\
+	SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
 /*
360464 */
361465 #define PTI_USER_PGD_FILL 512
362466 /* This ensures they are 8k-aligned: */
363
-#define NEXT_PGD_PAGE(name) \
364
- .balign 2 * PAGE_SIZE; \
365
-GLOBAL(name)
467
+#define SYM_DATA_START_PTI_ALIGNED(name) \
468
+ SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE)
366469 #else
367
-#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
470
+#define SYM_DATA_START_PTI_ALIGNED(name) \
471
+ SYM_DATA_START_PAGE_ALIGNED(name)
368472 #define PTI_USER_PGD_FILL 0
369473 #endif
370474
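
Note: the 2 * PAGE_SIZE alignment plus the 512-entry PTI_USER_PGD_FILL tail give each top-level table a kernel half and a user half in two adjacent 4K pages, so the entry/exit code can switch between them by toggling a single address bit instead of carrying a second pointer. A sketch of that convention (toggling bit 12, i.e. PAGE_SIZE, is the usual PTI scheme and is assumed here):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        /* 8 KiB-aligned stand-in for a kernel top-level page table. */
        uint64_t kernel_pgd = 0x2000;

        /* The user copy sits in the very next page, so switching is
         * just an OR/AND of PAGE_SIZE (bit 12) on the table address. */
        uint64_t user_pgd = kernel_pgd | PAGE_SIZE;

        printf("kernel pgd 0x%llx, user pgd 0x%llx\n",
               (unsigned long long)kernel_pgd,
               (unsigned long long)user_pgd);
        return 0;
    }
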
@@ -377,17 +481,23 @@
 	.endr
 
 	__INITDATA
-NEXT_PGD_PAGE(early_top_pgt)
+	.balign 4
+
+SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
 	.fill	512,8,0
 	.fill	PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(early_top_pgt)
 
-NEXT_PAGE(early_dynamic_pgts)
+SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+SYM_DATA_END(early_dynamic_pgts)
+
+SYM_DATA(early_recursion_flag, .long 0)
 
 	.data
 
-#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PGD_PAGE(init_top_pgt)
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org	init_top_pgt + L4_PAGE_OFFSET*8, 0
 	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -395,11 +505,13 @@
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 	.fill	PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
 
-NEXT_PAGE(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.fill	511, 8, 0
-NEXT_PAGE(level2_ident_pgt)
+SYM_DATA_END(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
 	/*
 	 * Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
@@ -409,25 +521,29 @@
 	 * the CPU should ignore the bit.
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(level2_ident_pgt)
 #else
-NEXT_PGD_PAGE(init_top_pgt)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
 	.fill	512,8,0
 	.fill	PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
 #endif
 
 #ifdef CONFIG_X86_5LEVEL
-NEXT_PAGE(level4_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
 	.fill	511,8,0
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level4_kernel_pgt)
 #endif
 
-NEXT_PAGE(level3_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
 	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level3_kernel_pgt)
 
-NEXT_PAGE(level2_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
 	/*
 	 * 512 MB kernel mapping. We spend a full page on this pagetable
 	 * anyway.
@@ -444,8 +560,9 @@
 	 */
 	PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
 		KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(level2_kernel_pgt)
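
Note: the PMDS() expansion above emits KERNEL_IMAGE_SIZE/PMD_SIZE entries. With the 512 MB mapping the comment describes and 2 MB large pages that is 256 of the page's 512 slots, which is why the comment shrugs at spending a full page on this table. The arithmetic, with both sizes as assumed config-dependent values:

    #include <stdio.h>

    /* Assumed values: 512 MiB kernel image window, 2 MiB large pages. */
    #define KERNEL_IMAGE_SIZE   (512UL * 1024 * 1024)
    #define PMD_SIZE            (2UL * 1024 * 1024)

    int main(void)
    {
        /* Number of 2 MiB PMD entries the PMDS() macro has to emit. */
        unsigned long entries = KERNEL_IMAGE_SIZE / PMD_SIZE;

        printf("%lu PMD entries out of 512 slots in the page\n", entries);
        return 0;
    }
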
 
-NEXT_PAGE(level2_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
 	.fill	(512 - 4 - FIXMAP_PMD_NUM),8,0
 	pgtno = 0
 	.rept (FIXMAP_PMD_NUM)
@@ -455,31 +572,32 @@
 	.endr
 	/* 6 MB reserved space + a 2MB hole */
 	.fill	4,8,0
+SYM_DATA_END(level2_fixmap_pgt)
 
-NEXT_PAGE(level1_fixmap_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
 	.rept (FIXMAP_PMD_NUM)
 	.fill	512,8,0
 	.endr
+SYM_DATA_END(level1_fixmap_pgt)
 
 #undef PMDS
 
 	.data
 	.align 16
-	.globl early_gdt_descr
-early_gdt_descr:
-	.word	GDT_ENTRIES*8-1
-early_gdt_descr_base:
-	.quad	INIT_PER_CPU_VAR(gdt_page)
 
-ENTRY(phys_base)
-	/* This must match the first entry in level2_kernel_pgt */
-	.quad	0x0000000000000000
+SYM_DATA(early_gdt_descr,		.word GDT_ENTRIES*8-1)
+SYM_DATA_LOCAL(early_gdt_descr_base,	.quad INIT_PER_CPU_VAR(gdt_page))
+
+	.align 16
+/* This must match the first entry in level2_kernel_pgt */
+SYM_DATA(phys_base, .quad 0x0)
 EXPORT_SYMBOL(phys_base)
 
 #include "../../x86/xen/xen-head.S"
 
 __PAGE_ALIGNED_BSS
-NEXT_PAGE(empty_zero_page)
+SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
 	.skip PAGE_SIZE
+SYM_DATA_END(empty_zero_page)
 EXPORT_SYMBOL(empty_zero_page)
 