2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/entry/entry_64.S
@@ -8,15 +8,14 @@
88 *
99 * entry.S contains the system-call and fault low-level handling routines.
1010 *
11
- * Some of this is documented in Documentation/x86/entry_64.txt
11
+ * Some of this is documented in Documentation/x86/entry_64.rst
1212 *
1313 * A note on terminology:
1414 * - iret frame: Architecture defined interrupt frame from SS to RIP
1515 * at the top of the kernel process stack.
1616 *
1717 * Some macro usage:
18
- * - ENTRY/END: Define functions in the symbol table.
19
- * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
18
+ * - SYM_FUNC_START/END:Define functions in the symbol table.
2019 * - idtentry: Define exception entry points.
2120 */
2221 #include <linux/linkage.h>
@@ -37,7 +36,9 @@
3736 #include <asm/pgtable_types.h>
3837 #include <asm/export.h>
3938 #include <asm/frame.h>
39
+#include <asm/trapnr.h>
4040 #include <asm/nospec-branch.h>
41
+#include <asm/fsgsbase.h>
4142 #include <linux/err.h>
4243
4344 #include "calling.h"
@@ -45,64 +46,13 @@
4546 .code64
4647 .section .entry.text, "ax"
4748
48
-#ifdef CONFIG_PARAVIRT
49
-ENTRY(native_usergs_sysret64)
49
+#ifdef CONFIG_PARAVIRT_XXL
50
+SYM_CODE_START(native_usergs_sysret64)
5051 UNWIND_HINT_EMPTY
5152 swapgs
5253 sysretq
53
-END(native_usergs_sysret64)
54
-#endif /* CONFIG_PARAVIRT */
55
-
56
-.macro TRACE_IRQS_FLAGS flags:req
57
-#ifdef CONFIG_TRACE_IRQFLAGS
58
- btl $9, \flags /* interrupts off? */
59
- jnc 1f
60
- TRACE_IRQS_ON
61
-1:
62
-#endif
63
-.endm
64
-
65
-.macro TRACE_IRQS_IRETQ
66
- TRACE_IRQS_FLAGS EFLAGS(%rsp)
67
-.endm
68
-
69
-/*
70
- * When dynamic function tracer is enabled it will add a breakpoint
71
- * to all locations that it is about to modify, sync CPUs, update
72
- * all the code, sync CPUs, then remove the breakpoints. In this time
73
- * if lockdep is enabled, it might jump back into the debug handler
74
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
75
- *
76
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
77
- * make sure the stack pointer does not get reset back to the top
78
- * of the debug stack, and instead just reuses the current stack.
79
- */
80
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
81
-
82
-.macro TRACE_IRQS_OFF_DEBUG
83
- call debug_stack_set_zero
84
- TRACE_IRQS_OFF
85
- call debug_stack_reset
86
-.endm
87
-
88
-.macro TRACE_IRQS_ON_DEBUG
89
- call debug_stack_set_zero
90
- TRACE_IRQS_ON
91
- call debug_stack_reset
92
-.endm
93
-
94
-.macro TRACE_IRQS_IRETQ_DEBUG
95
- btl $9, EFLAGS(%rsp) /* interrupts off? */
96
- jnc 1f
97
- TRACE_IRQS_ON_DEBUG
98
-1:
99
-.endm
100
-
101
-#else
102
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
103
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
104
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
105
-#endif
54
+SYM_CODE_END(native_usergs_sysret64)
55
+#endif /* CONFIG_PARAVIRT_XXL */
10656
10757 /*
10858 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
@@ -142,102 +92,37 @@
14292 * with them due to bugs in both AMD and Intel CPUs.
14393 */
14494
145
- .pushsection .entry_trampoline, "ax"
95
+SYM_CODE_START(entry_SYSCALL_64)
96
+ UNWIND_HINT_ENTRY
14697
147
-/*
148
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
149
- * that the assembler and linker have the wrong idea as to where this code
150
- * lives (and, in fact, it's mapped more than once, so it's not even at a
151
- * fixed address). So we can't reference any symbols outside the entry
152
- * trampoline and expect it to work.
153
- *
154
- * Instead, we carefully abuse %rip-relative addressing.
155
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
156
- * trampoline. We can thus find cpu_entry_area with this macro:
157
- */
158
-
159
-#define CPU_ENTRY_AREA \
160
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
161
-
162
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
163
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
164
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
165
-
166
-ENTRY(entry_SYSCALL_64_trampoline)
167
- UNWIND_HINT_EMPTY
16898 swapgs
169
-
170
- /* Stash the user RSP. */
171
- movq %rsp, RSP_SCRATCH
172
-
173
- /* Note: using %rsp as a scratch reg. */
99
+ /* tss.sp2 is scratch space. */
100
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
174101 SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
175
-
176
- /* Load the top of the task stack into RSP */
177
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
178
-
179
- /* Start building the simulated IRET frame. */
180
- pushq $__USER_DS /* pt_regs->ss */
181
- pushq RSP_SCRATCH /* pt_regs->sp */
182
- pushq %r11 /* pt_regs->flags */
183
- pushq $__USER_CS /* pt_regs->cs */
184
- pushq %rcx /* pt_regs->ip */
185
-
186
- /*
187
- * x86 lacks a near absolute jump, and we can't jump to the real
188
- * entry text with a relative jump. We could push the target
189
- * address and then use retq, but this destroys the pipeline on
190
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
191
- * spill RDI and restore it in a second-stage trampoline.
192
- */
193
- pushq %rdi
194
- movq $entry_SYSCALL_64_stage2, %rdi
195
- JMP_NOSPEC %rdi
196
-END(entry_SYSCALL_64_trampoline)
197
-
198
- .popsection
199
-
200
-ENTRY(entry_SYSCALL_64_stage2)
201
- UNWIND_HINT_EMPTY
202
- popq %rdi
203
- jmp entry_SYSCALL_64_after_hwframe
204
-END(entry_SYSCALL_64_stage2)
205
-
206
-ENTRY(entry_SYSCALL_64)
207
- UNWIND_HINT_EMPTY
208
- /*
209
- * Interrupts are off on entry.
210
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
211
- * it is too small to ever cause noticeable irq latency.
212
- */
213
-
214
- swapgs
215
- /*
216
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
217
- * is not required to switch CR3.
218
- */
219
- movq %rsp, PER_CPU_VAR(rsp_scratch)
220102 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
221103
104
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
105
+
222106 /* Construct struct pt_regs on stack */
223
- pushq $__USER_DS /* pt_regs->ss */
224
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
225
- pushq %r11 /* pt_regs->flags */
226
- pushq $__USER_CS /* pt_regs->cs */
227
- pushq %rcx /* pt_regs->ip */
228
-GLOBAL(entry_SYSCALL_64_after_hwframe)
229
- pushq %rax /* pt_regs->orig_ax */
107
+ pushq $__USER_DS /* pt_regs->ss */
108
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
109
+ pushq %r11 /* pt_regs->flags */
110
+ pushq $__USER_CS /* pt_regs->cs */
111
+ pushq %rcx /* pt_regs->ip */
112
+SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
113
+ pushq %rax /* pt_regs->orig_ax */
230114
231115 PUSH_AND_CLEAR_REGS rax=$-ENOSYS
232
-
233
- TRACE_IRQS_OFF
234116
235117 /* IRQs are off. */
236118 movq %rax, %rdi
237119 movq %rsp, %rsi
238
- call do_syscall_64 /* returns with IRQs disabled */
239120
240
- TRACE_IRQS_IRETQ /* we're about to change IF */
121
+ /* clobbers %rax, make sure it is after saving the syscall nr */
122
+ IBRS_ENTER
123
+ UNTRAIN_RET
124
+
125
+ call do_syscall_64 /* returns with IRQs disabled */
241126
242127 /*
243128 * Try to use SYSRET instead of IRET if we're returning to
@@ -311,8 +196,8 @@
311196 * perf profiles. Nothing jumps here.
312197 */
313198 syscall_return_via_sysret:
314
- /* rcx and r11 are already restored (see code above) */
315
- POP_REGS pop_rdi=0 skip_r11rcx=1
199
+ IBRS_EXIT
200
+ POP_REGS pop_rdi=0
316201
317202 /*
318203 * Now all regs are restored except RSP and RDI.
@@ -329,19 +214,21 @@
329214 * We are on the trampoline stack. All regs except RDI are live.
330215 * We can do future final exit work right here.
331216 */
217
+ STACKLEAK_ERASE_NOCLOBBER
218
+
332219 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
333220
334221 popq %rdi
335222 popq %rsp
336223 USERGS_SYSRET64
337
-END(entry_SYSCALL_64)
224
+SYM_CODE_END(entry_SYSCALL_64)
338225
339226 /*
340227 * %rdi: prev task
341228 * %rsi: next task
342229 */
343
-ENTRY(__switch_to_asm)
344
- UNWIND_HINT_FUNC
230
+.pushsection .text, "ax"
231
+SYM_FUNC_START(__switch_to_asm)
345232 /*
346233 * Save callee-saved registers
347234 * This must match the order in inactive_task_frame
@@ -352,7 +239,6 @@
352239 pushq %r13
353240 pushq %r14
354241 pushq %r15
355
- pushfq
356242
357243 /* switch stack */
358244 movq %rsp, TASK_threadsp(%rdi)
@@ -360,10 +246,9 @@
360246
361247 #ifdef CONFIG_STACKPROTECTOR
362248 movq TASK_stack_canary(%rsi), %rbx
363
- movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
249
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
364250 #endif
365251
366
-#ifdef CONFIG_RETPOLINE
367252 /*
368253 * When switching from a shallower to a deeper call stack
369254 * the RSB may either underflow or use entries populated
@@ -372,10 +257,8 @@
372257 * speculative execution to prevent attack.
373258 */
374259 FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
375
-#endif
376260
377261 /* restore callee-saved registers */
378
- popfq
379262 popq %r15
380263 popq %r14
381264 popq %r13
@@ -384,7 +267,8 @@
384267 popq %rbp
385268
386269 jmp __switch_to
387
-END(__switch_to_asm)
270
+SYM_FUNC_END(__switch_to_asm)
271
+.popsection
388272
389273 /*
390274 * A newly forked process directly context switches into this address.
@@ -393,7 +277,8 @@
393277 * rbx: kernel thread func (NULL for user thread)
394278 * r12: kernel thread arg
395279 */
396
-ENTRY(ret_from_fork)
280
+.pushsection .text, "ax"
281
+SYM_CODE_START(ret_from_fork)
397282 UNWIND_HINT_EMPTY
398283 movq %rax, %rdi
399284 call schedule_tail /* rdi: 'prev' task parameter */
@@ -404,51 +289,23 @@
404289 2:
405290 UNWIND_HINT_REGS
406291 movq %rsp, %rdi
407
- call syscall_return_slowpath /* returns with IRQs disabled */
408
- TRACE_IRQS_ON /* user mode is traced as IRQS on */
292
+ call syscall_exit_to_user_mode /* returns with IRQs disabled */
409293 jmp swapgs_restore_regs_and_return_to_usermode
410294
411295 1:
412296 /* kernel thread */
413297 UNWIND_HINT_EMPTY
414298 movq %r12, %rdi
415
- CALL_NOSPEC %rbx
299
+ CALL_NOSPEC rbx
416300 /*
417301 * A kernel thread is allowed to return here after successfully
418
- * calling do_execve(). Exit to userspace to complete the execve()
302
+ * calling kernel_execve(). Exit to userspace to complete the execve()
419303 * syscall.
420304 */
421305 movq $0, RAX(%rsp)
422306 jmp 2b
423
-END(ret_from_fork)
424
-
425
-/*
426
- * Build the entry stubs with some assembler magic.
427
- * We pack 1 stub into every 8-byte block.
428
- */
429
- .align 8
430
-ENTRY(irq_entries_start)
431
- vector=FIRST_EXTERNAL_VECTOR
432
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
433
- UNWIND_HINT_IRET_REGS
434
- pushq $(~vector+0x80) /* Note: always in signed byte range */
435
- jmp common_interrupt
436
- .align 8
437
- vector=vector+1
438
- .endr
439
-END(irq_entries_start)
440
-
441
- .align 8
442
-ENTRY(spurious_entries_start)
443
- vector=FIRST_SYSTEM_VECTOR
444
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
445
- UNWIND_HINT_IRET_REGS
446
- pushq $(~vector+0x80) /* Note: always in signed byte range */
447
- jmp common_spurious
448
- .align 8
449
- vector=vector+1
450
- .endr
451
-END(spurious_entries_start)
307
+SYM_CODE_END(ret_from_fork)
308
+.popsection
452309
453310 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
454311 #ifdef CONFIG_DEBUG_ENTRY
@@ -462,229 +319,260 @@
462319 #endif
463320 .endm
464321
465
-/*
466
- * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
467
- * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
468
- * Requires kernel GSBASE.
469
- *
470
- * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
322
+/**
323
+ * idtentry_body - Macro to emit code calling the C function
324
+ * @cfunc: C function to be called
325
+ * @has_error_code: Hardware pushed error code on stack
471326 */
472
-.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
473
- DEBUG_ENTRY_ASSERT_IRQS_OFF
327
+.macro idtentry_body cfunc has_error_code:req
474328
475
- .if \save_ret
476
- /*
477
- * If save_ret is set, the original stack contains one additional
478
- * entry -- the return address. Therefore, move the address one
479
- * entry below %rsp to \old_rsp.
480
- */
481
- leaq 8(%rsp), \old_rsp
482
- .else
483
- movq %rsp, \old_rsp
329
+ call error_entry
330
+ UNWIND_HINT_REGS
331
+
332
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
333
+
334
+ .if \has_error_code == 1
335
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
336
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
484337 .endif
485338
486
- .if \regs
487
- UNWIND_HINT_REGS base=\old_rsp
339
+ call \cfunc
340
+
341
+ jmp error_return
342
+.endm
343
+
344
+/**
345
+ * idtentry - Macro to generate entry stubs for simple IDT entries
346
+ * @vector: Vector number
347
+ * @asmsym: ASM symbol for the entry point
348
+ * @cfunc: C function to be called
349
+ * @has_error_code: Hardware pushed error code on stack
350
+ *
351
+ * The macro emits code to set up the kernel context for straight forward
352
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
353
+ */
354
+.macro idtentry vector asmsym cfunc has_error_code:req
355
+SYM_CODE_START(\asmsym)
356
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
357
+ ASM_CLAC
358
+
359
+ .if \has_error_code == 0
360
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
488361 .endif
489362
490
- incl PER_CPU_VAR(irq_count)
491
- jnz .Lirq_stack_push_old_rsp_\@
363
+ .if \vector == X86_TRAP_BP
364
+ /*
365
+ * If coming from kernel space, create a 6-word gap to allow the
366
+ * int3 handler to emulate a call instruction.
367
+ */
368
+ testb $3, CS-ORIG_RAX(%rsp)
369
+ jnz .Lfrom_usermode_no_gap_\@
370
+ .rept 6
371
+ pushq 5*8(%rsp)
372
+ .endr
373
+ UNWIND_HINT_IRET_REGS offset=8
374
+.Lfrom_usermode_no_gap_\@:
375
+ .endif
376
+
377
+ idtentry_body \cfunc \has_error_code
378
+
379
+_ASM_NOKPROBE(\asmsym)
380
+SYM_CODE_END(\asmsym)
381
+.endm
382
+
383
+/*
384
+ * Interrupt entry/exit.
385
+ *
386
+ + The interrupt stubs push (vector) onto the stack, which is the error_code
387
+ * position of idtentry exceptions, and jump to one of the two idtentry points
388
+ * (common/spurious).
389
+ *
390
+ * common_interrupt is a hotpath, align it to a cache line
391
+ */
392
+.macro idtentry_irq vector cfunc
393
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
394
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
395
+.endm
396
+
397
+/*
398
+ * System vectors which invoke their handlers directly and are not
399
+ * going through the regular common device interrupt handling code.
400
+ */
401
+.macro idtentry_sysvec vector cfunc
402
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
403
+.endm
404
+
405
+/**
406
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
407
+ * @vector: Vector number
408
+ * @asmsym: ASM symbol for the entry point
409
+ * @cfunc: C function to be called
410
+ *
411
+ * The macro emits code to set up the kernel context for #MC and #DB
412
+ *
413
+ * If the entry comes from user space it uses the normal entry path
414
+ * including the return to user space work and preemption checks on
415
+ * exit.
416
+ *
417
+ * If hits in kernel mode then it needs to go through the paranoid
418
+ * entry as the exception can hit any random state. No preemption
419
+ * check on exit to keep the paranoid path simple.
420
+ */
421
+.macro idtentry_mce_db vector asmsym cfunc
422
+SYM_CODE_START(\asmsym)
423
+ UNWIND_HINT_IRET_REGS
424
+ ASM_CLAC
425
+
426
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
492427
493428 /*
494
- * Right now, if we just incremented irq_count to zero, we've
495
- * claimed the IRQ stack but we haven't switched to it yet.
496
- *
497
- * If anything is added that can interrupt us here without using IST,
498
- * it must be *extremely* careful to limit its stack usage. This
499
- * could include kprobes and a hypothetical future IST-less #DB
500
- * handler.
501
- *
502
- * The OOPS unwinder relies on the word at the top of the IRQ
503
- * stack linking back to the previous RSP for the entire time we're
504
- * on the IRQ stack. For this to work reliably, we need to write
505
- * it before we actually move ourselves to the IRQ stack.
429
+ * If the entry is from userspace, switch stacks and treat it as
430
+ * a normal entry.
506431 */
432
+ testb $3, CS-ORIG_RAX(%rsp)
433
+ jnz .Lfrom_usermode_switch_stack_\@
507434
508
- movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
509
- movq PER_CPU_VAR(irq_stack_ptr), %rsp
435
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
436
+ call paranoid_entry
510437
511
-#ifdef CONFIG_DEBUG_ENTRY
438
+ UNWIND_HINT_REGS
439
+
440
+ movq %rsp, %rdi /* pt_regs pointer */
441
+
442
+ call \cfunc
443
+
444
+ jmp paranoid_exit
445
+
446
+ /* Switch to the regular task stack and use the noist entry point */
447
+.Lfrom_usermode_switch_stack_\@:
448
+ idtentry_body noist_\cfunc, has_error_code=0
449
+
450
+_ASM_NOKPROBE(\asmsym)
451
+SYM_CODE_END(\asmsym)
452
+.endm
453
+
454
+#ifdef CONFIG_AMD_MEM_ENCRYPT
455
+/**
456
+ * idtentry_vc - Macro to generate entry stub for #VC
457
+ * @vector: Vector number
458
+ * @asmsym: ASM symbol for the entry point
459
+ * @cfunc: C function to be called
460
+ *
461
+ * The macro emits code to set up the kernel context for #VC. The #VC handler
462
+ * runs on an IST stack and needs to be able to cause nested #VC exceptions.
463
+ *
464
+ * To make this work the #VC entry code tries its best to pretend it doesn't use
465
+ * an IST stack by switching to the task stack if coming from user-space (which
466
+ * includes early SYSCALL entry path) or back to the stack in the IRET frame if
467
+ * entered from kernel-mode.
468
+ *
469
+ * If entered from kernel-mode the return stack is validated first, and if it is
470
+ * not safe to use (e.g. because it points to the entry stack) the #VC handler
471
+ * will switch to a fall-back stack (VC2) and call a special handler function.
472
+ *
473
+ * The macro is only used for one vector, but it is planned to be extended in
474
+ * the future for the #HV exception.
475
+ */
476
+.macro idtentry_vc vector asmsym cfunc
477
+SYM_CODE_START(\asmsym)
478
+ UNWIND_HINT_IRET_REGS
479
+ ASM_CLAC
480
+
512481 /*
513
- * If the first movq above becomes wrong due to IRQ stack layout
514
- * changes, the only way we'll notice is if we try to unwind right
515
- * here. Assert that we set up the stack right to catch this type
516
- * of bug quickly.
482
+ * If the entry is from userspace, switch stacks and treat it as
483
+ * a normal entry.
517484 */
518
- cmpq -8(%rsp), \old_rsp
519
- je .Lirq_stack_okay\@
520
- ud2
521
- .Lirq_stack_okay\@:
485
+ testb $3, CS-ORIG_RAX(%rsp)
486
+ jnz .Lfrom_usermode_switch_stack_\@
487
+
488
+ /*
489
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
490
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
491
+ */
492
+ call paranoid_entry
493
+
494
+ UNWIND_HINT_REGS
495
+
496
+ /*
497
+ * Switch off the IST stack to make it free for nested exceptions. The
498
+ * vc_switch_off_ist() function will switch back to the interrupted
499
+ * stack if it is safe to do so. If not it switches to the VC fall-back
500
+ * stack.
501
+ */
502
+ movq %rsp, %rdi /* pt_regs pointer */
503
+ call vc_switch_off_ist
504
+ movq %rax, %rsp /* Switch to new stack */
505
+
506
+ ENCODE_FRAME_POINTER
507
+ UNWIND_HINT_REGS
508
+
509
+ /* Update pt_regs */
510
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
511
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
512
+
513
+ movq %rsp, %rdi /* pt_regs pointer */
514
+
515
+ call kernel_\cfunc
516
+
517
+ /*
518
+ * No need to switch back to the IST stack. The current stack is either
519
+ * identical to the stack in the IRET frame or the VC fall-back stack,
520
+ * so it is definitly mapped even with PTI enabled.
521
+ */
522
+ jmp paranoid_exit
523
+
524
+ /* Switch to the regular task stack */
525
+.Lfrom_usermode_switch_stack_\@:
526
+ idtentry_body user_\cfunc, has_error_code=1
527
+
528
+_ASM_NOKPROBE(\asmsym)
529
+SYM_CODE_END(\asmsym)
530
+.endm
522531 #endif
523532
524
-.Lirq_stack_push_old_rsp_\@:
525
- pushq \old_rsp
526
-
527
- .if \regs
528
- UNWIND_HINT_REGS indirect=1
529
- .endif
530
-
531
- .if \save_ret
532
- /*
533
- * Push the return address to the stack. This return address can
534
- * be found at the "real" original RSP, which was offset by 8 at
535
- * the beginning of this macro.
536
- */
537
- pushq -8(\old_rsp)
538
- .endif
539
-.endm
540
-
541533 /*
542
- * Undoes ENTER_IRQ_STACK.
534
+ * Double fault entry. Straight paranoid. No checks from which context
535
+ * this comes because for the espfix induced #DF this would do the wrong
536
+ * thing.
543537 */
544
-.macro LEAVE_IRQ_STACK regs=1
545
- DEBUG_ENTRY_ASSERT_IRQS_OFF
546
- /* We need to be off the IRQ stack before decrementing irq_count. */
547
- popq %rsp
548
-
549
- .if \regs
550
- UNWIND_HINT_REGS
551
- .endif
552
-
553
- /*
554
- * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
555
- * the irq stack but we're not on it.
556
- */
557
-
558
- decl PER_CPU_VAR(irq_count)
559
-.endm
560
-
561
-/*
562
- * Interrupt entry helper function.
563
- *
564
- * Entry runs with interrupts off. Stack layout at entry:
565
- * +----------------------------------------------------+
566
- * | regs->ss |
567
- * | regs->rsp |
568
- * | regs->eflags |
569
- * | regs->cs |
570
- * | regs->ip |
571
- * +----------------------------------------------------+
572
- * | regs->orig_ax = ~(interrupt number) |
573
- * +----------------------------------------------------+
574
- * | return address |
575
- * +----------------------------------------------------+
576
- */
577
-ENTRY(interrupt_entry)
578
- UNWIND_HINT_IRET_REGS offset=16
538
+.macro idtentry_df vector asmsym cfunc
539
+SYM_CODE_START(\asmsym)
540
+ UNWIND_HINT_IRET_REGS offset=8
579541 ASM_CLAC
580
- cld
581542
582
- testb $3, CS-ORIG_RAX+8(%rsp)
583
- jz 1f
584
- SWAPGS
585
- FENCE_SWAPGS_USER_ENTRY
586
- /*
587
- * Switch to the thread stack. The IRET frame and orig_ax are
588
- * on the stack, as well as the return address. RDI..R12 are
589
- * not (yet) on the stack and space has not (yet) been
590
- * allocated for them.
591
- */
592
- pushq %rdi
543
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
544
+ call paranoid_entry
545
+ UNWIND_HINT_REGS
593546
594
- /* Need to switch before accessing the thread stack. */
595
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
596
- movq %rsp, %rdi
597
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
547
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
548
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
549
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
550
+ call \cfunc
598551
599
- /*
600
- * We have RDI, return address, and orig_ax on the stack on
601
- * top of the IRET frame. That means offset=24
602
- */
603
- UNWIND_HINT_IRET_REGS base=%rdi offset=24
552
+ jmp paranoid_exit
604553
605
- pushq 7*8(%rdi) /* regs->ss */
606
- pushq 6*8(%rdi) /* regs->rsp */
607
- pushq 5*8(%rdi) /* regs->eflags */
608
- pushq 4*8(%rdi) /* regs->cs */
609
- pushq 3*8(%rdi) /* regs->ip */
610
- UNWIND_HINT_IRET_REGS
611
- pushq 2*8(%rdi) /* regs->orig_ax */
612
- pushq 8(%rdi) /* return address */
613
-
614
- movq (%rdi), %rdi
615
- jmp 2f
616
-1:
617
- FENCE_SWAPGS_KERNEL_ENTRY
618
-2:
619
- PUSH_AND_CLEAR_REGS save_ret=1
620
- ENCODE_FRAME_POINTER 8
621
-
622
- testb $3, CS+8(%rsp)
623
- jz 1f
624
-
625
- /*
626
- * IRQ from user mode.
627
- *
628
- * We need to tell lockdep that IRQs are off. We can't do this until
629
- * we fix gsbase, and we should do it before enter_from_user_mode
630
- * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
631
- * the simplest way to handle it is to just call it twice if
632
- * we enter from user mode. There's no reason to optimize this since
633
- * TRACE_IRQS_OFF is a no-op if lockdep is off.
634
- */
635
- TRACE_IRQS_OFF
636
-
637
- CALL_enter_from_user_mode
638
-
639
-1:
640
- ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
641
- /* We entered an interrupt context - irqs are off: */
642
- TRACE_IRQS_OFF
643
-
644
- ret
645
-END(interrupt_entry)
646
-_ASM_NOKPROBE(interrupt_entry)
647
-
648
-
649
-/* Interrupt entry/exit. */
554
+_ASM_NOKPROBE(\asmsym)
555
+SYM_CODE_END(\asmsym)
556
+.endm
650557
651558 /*
652
- * The interrupt stubs push (~vector+0x80) onto the stack and
653
- * then jump to common_spurious/interrupt.
559
+ * Include the defines which emit the idt entries which are shared
560
+ * shared between 32 and 64 bit and emit the __irqentry_text_* markers
561
+ * so the stacktrace boundary checks work.
654562 */
655
-common_spurious:
656
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
657
- call interrupt_entry
658
- UNWIND_HINT_REGS indirect=1
659
- call smp_spurious_interrupt /* rdi points to pt_regs */
660
- jmp ret_from_intr
661
-END(common_spurious)
662
-_ASM_NOKPROBE(common_spurious)
563
+ .align 16
564
+ .globl __irqentry_text_start
565
+__irqentry_text_start:
663566
664
-/* common_interrupt is a hotpath. Align it */
665
- .p2align CONFIG_X86_L1_CACHE_SHIFT
666
-common_interrupt:
667
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
668
- call interrupt_entry
669
- UNWIND_HINT_REGS indirect=1
670
- call do_IRQ /* rdi points to pt_regs */
671
- /* 0(%rsp): old RSP */
672
-ret_from_intr:
673
- DISABLE_INTERRUPTS(CLBR_ANY)
674
- TRACE_IRQS_OFF
567
+#include <asm/idtentry.h>
675568
676
- LEAVE_IRQ_STACK
569
+ .align 16
570
+ .globl __irqentry_text_end
571
+__irqentry_text_end:
677572
678
- testb $3, CS(%rsp)
679
- jz retint_kernel
680
-
681
- /* Interrupt came from user space */
682
-GLOBAL(retint_user)
683
- mov %rsp,%rdi
684
- call prepare_exit_to_usermode
685
- TRACE_IRQS_IRETQ
686
-
687
-GLOBAL(swapgs_restore_regs_and_return_to_usermode)
573
+SYM_CODE_START_LOCAL(common_interrupt_return)
574
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
575
+ IBRS_EXIT
688576 #ifdef CONFIG_DEBUG_ENTRY
689577 /* Assert that pt_regs indicates user mode. */
690578 testb $3, CS(%rsp)
@@ -692,6 +580,10 @@
692580 ud2
693581 1:
694582 #endif
583
+#ifdef CONFIG_XEN_PV
584
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
585
+#endif
586
+
695587 POP_REGS pop_rdi=0
696588
697589 /*
@@ -716,6 +608,7 @@
716608 * We are on the trampoline stack. All regs except RDI are live.
717609 * We can do future final exit work right here.
718610 */
611
+ STACKLEAK_ERASE_NOCLOBBER
719612
720613 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
721614
@@ -725,25 +618,7 @@
725618 INTERRUPT_RETURN
726619
727620
728
-/* Returning to kernel space */
729
-retint_kernel:
730
-#ifdef CONFIG_PREEMPT
731
- /* Interrupts are off */
732
- /* Check if we need preemption */
733
- btl $9, EFLAGS(%rsp) /* were interrupts off? */
734
- jnc 1f
735
-0: cmpl $0, PER_CPU_VAR(__preempt_count)
736
- jnz 1f
737
- call preempt_schedule_irq
738
- jmp 0b
739
-1:
740
-#endif
741
- /*
742
- * The iretq could re-enable interrupts:
743
- */
744
- TRACE_IRQS_IRETQ
745
-
746
-GLOBAL(restore_regs_and_return_to_kernel)
621
+SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
747622 #ifdef CONFIG_DEBUG_ENTRY
748623 /* Assert that pt_regs indicates kernel mode. */
749624 testb $3, CS(%rsp)
@@ -759,7 +634,7 @@
759634 */
760635 INTERRUPT_RETURN
761636
762
-ENTRY(native_iret)
637
+SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
763638 UNWIND_HINT_IRET_REGS
764639 /*
765640 * Are we returning to a stack segment from the LDT? Note: in
@@ -770,12 +645,11 @@
770645 jnz native_irq_return_ldt
771646 #endif
772647
773
-.global native_irq_return_iret
774
-native_irq_return_iret:
648
+SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
775649 /*
776650 * This may fault. Non-paranoid faults on return to userspace are
777651 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
778
- * Double-faults due to espfix64 are handled in do_double_fault.
652
+ * Double-faults due to espfix64 are handled in exc_double_fault.
779653 * Other faults here are fatal.
780654 */
781655 iretq
@@ -804,8 +678,9 @@
804678 */
805679
806680 pushq %rdi /* Stash user RDI */
807
- SWAPGS /* to kernel GS */
681
+ swapgs /* to kernel GS */
808682 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
683
+ UNTRAIN_RET
809684
810685 movq PER_CPU_VAR(espfix_waddr), %rdi
811686 movq %rax, (0*8)(%rdi) /* user RAX */
@@ -834,7 +709,7 @@
834709 orq PER_CPU_VAR(espfix_stack), %rax
835710
836711 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
837
- SWAPGS /* to user GS */
712
+ swapgs /* to user GS */
838713 popq %rdi /* Restore user RDI */
839714
840715 movq %rax, %rsp
@@ -853,226 +728,32 @@
853728 */
854729 jmp native_irq_return_iret
855730 #endif
856
-END(common_interrupt)
857
-_ASM_NOKPROBE(common_interrupt)
731
+SYM_CODE_END(common_interrupt_return)
732
+_ASM_NOKPROBE(common_interrupt_return)
858733
859734 /*
860
- * APIC interrupts.
735
+ * Reload gs selector with exception handling
736
+ * edi: new selector
737
+ *
738
+ * Is in entry.text as it shouldn't be instrumented.
861739 */
862
-.macro apicinterrupt3 num sym do_sym
863
-ENTRY(\sym)
864
- UNWIND_HINT_IRET_REGS
865
- pushq $~(\num)
866
-.Lcommon_\sym:
867
- call interrupt_entry
868
- UNWIND_HINT_REGS indirect=1
869
- call \do_sym /* rdi points to pt_regs */
870
- jmp ret_from_intr
871
-END(\sym)
872
-_ASM_NOKPROBE(\sym)
873
-.endm
874
-
875
-/* Make sure APIC interrupt handlers end up in the irqentry section: */
876
-#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
877
-#define POP_SECTION_IRQENTRY .popsection
878
-
879
-.macro apicinterrupt num sym do_sym
880
-PUSH_SECTION_IRQENTRY
881
-apicinterrupt3 \num \sym \do_sym
882
-POP_SECTION_IRQENTRY
883
-.endm
884
-
885
-#ifdef CONFIG_SMP
886
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
887
-apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
888
-#endif
889
-
890
-#ifdef CONFIG_X86_UV
891
-apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
892
-#endif
893
-
894
-apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
895
-apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
896
-
897
-#ifdef CONFIG_HAVE_KVM
898
-apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
899
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
900
-apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
901
-#endif
902
-
903
-#ifdef CONFIG_X86_MCE_THRESHOLD
904
-apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
905
-#endif
906
-
907
-#ifdef CONFIG_X86_MCE_AMD
908
-apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
909
-#endif
910
-
911
-#ifdef CONFIG_X86_THERMAL_VECTOR
912
-apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
913
-#endif
914
-
915
-#ifdef CONFIG_SMP
916
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
917
-apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
918
-apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
919
-#endif
920
-
921
-apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
922
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
923
-
924
-#ifdef CONFIG_IRQ_WORK
925
-apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
926
-#endif
927
-
928
-/*
929
- * Exception entry points.
930
- */
931
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
932
-
933
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
934
-ENTRY(\sym)
935
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
936
-
937
- /* Sanity check */
938
- .if \shift_ist != -1 && \paranoid == 0
939
- .error "using shift_ist requires paranoid=1"
940
- .endif
941
-
942
- ASM_CLAC
943
-
944
- .if \has_error_code == 0
945
- pushq $-1 /* ORIG_RAX: no syscall to restart */
946
- .endif
947
-
948
- .if \paranoid == 1
949
- testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
950
- jnz .Lfrom_usermode_switch_stack_\@
951
- .endif
952
-
953
- .if \create_gap == 1
954
- /*
955
- * If coming from kernel space, create a 6-word gap to allow the
956
- * int3 handler to emulate a call instruction.
957
- */
958
- testb $3, CS-ORIG_RAX(%rsp)
959
- jnz .Lfrom_usermode_no_gap_\@
960
- .rept 6
961
- pushq 5*8(%rsp)
962
- .endr
963
- UNWIND_HINT_IRET_REGS offset=8
964
-.Lfrom_usermode_no_gap_\@:
965
- .endif
966
-
967
- .if \paranoid
968
- call paranoid_entry
969
- .else
970
- call error_entry
971
- .endif
972
- UNWIND_HINT_REGS
973
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
974
-
975
- .if \paranoid
976
- .if \shift_ist != -1
977
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
978
- .else
979
- TRACE_IRQS_OFF
980
- .endif
981
- .endif
982
-
983
- movq %rsp, %rdi /* pt_regs pointer */
984
-
985
- .if \has_error_code
986
- movq ORIG_RAX(%rsp), %rsi /* get error code */
987
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
988
- .else
989
- xorl %esi, %esi /* no error code */
990
- .endif
991
-
992
- .if \shift_ist != -1
993
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
994
- .endif
995
-
996
- call \do_sym
997
-
998
- .if \shift_ist != -1
999
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1000
- .endif
1001
-
1002
- /* these procedures expect "no swapgs" flag in ebx */
1003
- .if \paranoid
1004
- jmp paranoid_exit
1005
- .else
1006
- jmp error_exit
1007
- .endif
1008
-
1009
- .if \paranoid == 1
1010
- /*
1011
- * Entry from userspace. Switch stacks and treat it
1012
- * as a normal entry. This means that paranoid handlers
1013
- * run in real process context if user_mode(regs).
1014
- */
1015
-.Lfrom_usermode_switch_stack_\@:
1016
- call error_entry
1017
-
1018
- movq %rsp, %rdi /* pt_regs pointer */
1019
-
1020
- .if \has_error_code
1021
- movq ORIG_RAX(%rsp), %rsi /* get error code */
1022
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
1023
- .else
1024
- xorl %esi, %esi /* no error code */
1025
- .endif
1026
-
1027
- call \do_sym
1028
-
1029
- jmp error_exit
1030
- .endif
1031
-_ASM_NOKPROBE(\sym)
1032
-END(\sym)
1033
-.endm
1034
-
1035
-idtentry divide_error do_divide_error has_error_code=0
1036
-idtentry overflow do_overflow has_error_code=0
1037
-idtentry bounds do_bounds has_error_code=0
1038
-idtentry invalid_op do_invalid_op has_error_code=0
1039
-idtentry device_not_available do_device_not_available has_error_code=0
1040
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
1041
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
1042
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
1043
-idtentry segment_not_present do_segment_not_present has_error_code=1
1044
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
1045
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
1046
-idtentry alignment_check do_alignment_check has_error_code=1
1047
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
1048
-
1049
-
1050
- /*
1051
- * Reload gs selector with exception handling
1052
- * edi: new selector
1053
- */
1054
-ENTRY(native_load_gs_index)
740
+SYM_FUNC_START(asm_load_gs_index)
1055741 FRAME_BEGIN
1056
- pushfq
1057
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1058
- TRACE_IRQS_OFF
1059
- SWAPGS
742
+ swapgs
1060743 .Lgs_change:
1061744 movl %edi, %gs
1062745 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
1063
- SWAPGS
1064
- TRACE_IRQS_FLAGS (%rsp)
1065
- popfq
746
+ swapgs
1066747 FRAME_END
1067
- ret
1068
-ENDPROC(native_load_gs_index)
1069
-EXPORT_SYMBOL(native_load_gs_index)
748
+ RET
749
+SYM_FUNC_END(asm_load_gs_index)
750
+EXPORT_SYMBOL(asm_load_gs_index)
1070751
1071
- _ASM_EXTABLE(.Lgs_change, bad_gs)
752
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
1072753 .section .fixup, "ax"
1073754 /* running with kernelgs */
1074
-bad_gs:
1075
- SWAPGS /* switch back to user gs */
755
+SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
756
+ swapgs /* switch back to user gs */
1076757 .macro ZAP_GS
1077758 /* This can't be a string because the preprocessor needs to see it. */
1078759 movl $__USER_DS, %eax
....@@ -1082,22 +763,51 @@
1082763 xorl %eax, %eax
1083764 movl %eax, %gs
1084765 jmp 2b
766
+SYM_CODE_END(.Lbad_gs)
1085767 .previous
1086768
1087
-/* Call softirq on interrupt stack. Interrupts are off. */
1088
-ENTRY(do_softirq_own_stack)
1089
- pushq %rbp
1090
- mov %rsp, %rbp
1091
- ENTER_IRQ_STACK regs=0 old_rsp=%r11
1092
- call __do_softirq
1093
- LEAVE_IRQ_STACK regs=0
769
+/*
770
+ * rdi: New stack pointer points to the top word of the stack
771
+ * rsi: Function pointer
772
+ * rdx: Function argument (can be NULL if none)
773
+ */
774
+SYM_FUNC_START(asm_call_on_stack)
775
+SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
776
+SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
777
+ /*
778
+ * Save the frame pointer unconditionally. This allows the ORC
779
+ * unwinder to handle the stack switch.
780
+ */
781
+ pushq %rbp
782
+ mov %rsp, %rbp
783
+
784
+ /*
785
+ * The unwinder relies on the word at the top of the new stack
786
+ * page linking back to the previous RSP.
787
+ */
788
+ mov %rsp, (%rdi)
789
+ mov %rdi, %rsp
790
+ /* Move the argument to the right place */
791
+ mov %rdx, %rdi
792
+
793
+1:
794
+ .pushsection .discard.instr_begin
795
+ .long 1b - .
796
+ .popsection
797
+
798
+ CALL_NOSPEC rsi
799
+
800
+2:
801
+ .pushsection .discard.instr_end
802
+ .long 2b - .
803
+ .popsection
804
+
805
+ /* Restore the previous stack pointer from RBP. */
1094806 leaveq
1095
- ret
1096
-ENDPROC(do_softirq_own_stack)
807
+ RET
808
+SYM_FUNC_END(asm_call_on_stack)
1097809
1098
-#ifdef CONFIG_XEN
1099
-idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
1100
-
810
+#ifdef CONFIG_XEN_PV
1101811 /*
1102812 * A note on the "critical region" in our callback handler.
1103813 * We want to avoid stacking callback handlers due to events occurring
@@ -1110,8 +820,10 @@
1110820 * So, on entry to the handler we detect whether we interrupted an
1111821 * existing activation in its critical region -- if so, we pop the current
1112822 * activation and restart the handler using the previous one.
823
+ *
824
+ * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
1113825 */
1114
-ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
826
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
1115827
1116828 /*
1117829 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -1121,15 +833,10 @@
1121833 movq %rdi, %rsp /* we don't return, adjust the stack frame */
1122834 UNWIND_HINT_REGS
1123835
1124
- ENTER_IRQ_STACK old_rsp=%r10
1125
- call xen_evtchn_do_upcall
1126
- LEAVE_IRQ_STACK
836
+ call xen_pv_evtchn_do_upcall
1127837
1128
-#ifndef CONFIG_PREEMPT
1129
- call xen_maybe_preempt_hcall
1130
-#endif
1131
- jmp error_exit
1132
-END(xen_do_hypervisor_callback)
838
+ jmp error_return
839
+SYM_CODE_END(exc_xen_hypervisor_callback)
1133840
1134841 /*
1135842 * Hypervisor uses this for application faults while it executes.
@@ -1144,7 +851,7 @@
1144851 * We distinguish between categories by comparing each saved segment register
1145852 * with its current contents: any discrepancy means we in category 1.
1146853 */
1147
-ENTRY(xen_failsafe_callback)
854
+SYM_CODE_START(xen_failsafe_callback)
1148855 UNWIND_HINT_EMPTY
1149856 movl %ds, %ecx
1150857 cmpw %cx, 0x10(%rsp)
@@ -1164,7 +871,7 @@
1164871 addq $0x30, %rsp
1165872 pushq $0 /* RIP */
1166873 UNWIND_HINT_IRET_REGS offset=8
1167
- jmp general_protection
874
+ jmp asm_exc_general_protection
1168875 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1169876 movq (%rsp), %rcx
1170877 movq 8(%rsp), %r11
@@ -1173,64 +880,29 @@
1173880 pushq $-1 /* orig_ax = -1 => not a system call */
1174881 PUSH_AND_CLEAR_REGS
1175882 ENCODE_FRAME_POINTER
1176
- jmp error_exit
1177
-END(xen_failsafe_callback)
1178
-
1179
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
1180
- xen_hvm_callback_vector xen_evtchn_do_upcall
1181
-
1182
-#endif /* CONFIG_XEN */
1183
-
1184
-#if IS_ENABLED(CONFIG_HYPERV)
1185
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
1186
- hyperv_callback_vector hyperv_vector_handler
1187
-
1188
-apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
1189
- hyperv_reenlightenment_vector hyperv_reenlightenment_intr
1190
-
1191
-apicinterrupt3 HYPERV_STIMER0_VECTOR \
1192
- hv_stimer0_callback_vector hv_stimer0_vector_handler
1193
-#endif /* CONFIG_HYPERV */
1194
-
1195
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
1196
-idtentry int3 do_int3 has_error_code=0 create_gap=1
1197
-idtentry stack_segment do_stack_segment has_error_code=1
1198
-
1199
-#ifdef CONFIG_XEN
1200
-idtentry xennmi do_nmi has_error_code=0
1201
-idtentry xendebug do_debug has_error_code=0
1202
-#endif
1203
-
1204
-idtentry general_protection do_general_protection has_error_code=1
1205
-idtentry page_fault do_page_fault has_error_code=1
1206
-
1207
-#ifdef CONFIG_KVM_GUEST
1208
-idtentry async_page_fault do_async_page_fault has_error_code=1
1209
-#endif
1210
-
1211
-#ifdef CONFIG_X86_MCE
1212
-idtentry machine_check do_mce has_error_code=0 paranoid=1
1213
-#endif
883
+ jmp error_return
884
+SYM_CODE_END(xen_failsafe_callback)
885
+#endif /* CONFIG_XEN_PV */
1214886
1215887 /*
1216
- * Save all registers in pt_regs, and switch gs if needed.
1217
- * Use slow, but surefire "are we in kernel?" check.
1218
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
888
+ * Save all registers in pt_regs. Return GSBASE related information
889
+ * in EBX depending on the availability of the FSGSBASE instructions:
890
+ *
891
+ * FSGSBASE R/EBX
892
+ * N 0 -> SWAPGS on exit
893
+ * 1 -> no SWAPGS on exit
894
+ *
895
+ * Y GSBASE value at entry, must be restored in paranoid_exit
896
+ *
897
+ * R14 - old CR3
898
+ * R15 - old SPEC_CTRL
1219899 */
1220
-ENTRY(paranoid_entry)
900
+SYM_CODE_START_LOCAL(paranoid_entry)
1221901 UNWIND_HINT_FUNC
1222902 cld
1223903 PUSH_AND_CLEAR_REGS save_ret=1
1224904 ENCODE_FRAME_POINTER 8
1225
- movl $1, %ebx
1226
- movl $MSR_GS_BASE, %ecx
1227
- rdmsr
1228
- testl %edx, %edx
1229
- js 1f /* negative -> in kernel */
1230
- SWAPGS
1231
- xorl %ebx, %ebx
1232905
1233
-1:
1234906 /*
1235907 * Always stash CR3 in %r14. This value will be restored,
1236908 * verbatim, at exit. Needed if paranoid_entry interrupted
@@ -1240,18 +912,65 @@
1240912 * This is also why CS (stashed in the "iret frame" by the
1241913 * hardware at entry) can not be used: this may be a return
1242914 * to kernel code, but with a user CR3 value.
915
+ *
916
+ * Switching CR3 does not depend on kernel GSBASE so it can
917
+ * be done before switching to the kernel GSBASE. This is
918
+ * required for FSGSBASE because the kernel GSBASE has to
919
+ * be retrieved from a kernel internal table.
1243920 */
1244921 SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
1245922
1246923 /*
1247
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
1248
- * unconditional CR3 write, even in the PTI case. So do an lfence
1249
- * to prevent GS speculation, regardless of whether PTI is enabled.
924
+ * Handling GSBASE depends on the availability of FSGSBASE.
925
+ *
926
+ * Without FSGSBASE the kernel enforces that negative GSBASE
927
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
928
+ * can be made about the GSBASE value when entering from user
929
+ * space.
1250930 */
1251
- FENCE_SWAPGS_KERNEL_ENTRY
931
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
1252932
1253
- ret
1254
-END(paranoid_entry)
933
+ /*
934
+ * Read the current GSBASE and store it in %rbx unconditionally,
935
+ * retrieve and set the current CPUs kernel GSBASE. The stored value
936
+ * has to be restored in paranoid_exit unconditionally.
937
+ *
938
+ * The unconditional write to GS base below ensures that no subsequent
939
+ * loads based on a mispredicted GS base can happen, therefore no LFENCE
940
+ * is needed here.
941
+ */
942
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
943
+ jmp .Lparanoid_gsbase_done
944
+
945
+.Lparanoid_entry_checkgs:
946
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
947
+ movl $1, %ebx
948
+
949
+ /*
950
+ * The kernel-enforced convention is a negative GSBASE indicates
951
+ * a kernel value. No SWAPGS needed on entry and exit.
952
+ */
953
+ movl $MSR_GS_BASE, %ecx
954
+ rdmsr
955
+ testl %edx, %edx
956
+ js .Lparanoid_kernel_gsbase
957
+
958
+ /* EBX = 0 -> SWAPGS required on exit */
959
+ xorl %ebx, %ebx
960
+ swapgs
961
+.Lparanoid_kernel_gsbase:
962
+ FENCE_SWAPGS_KERNEL_ENTRY
963
+.Lparanoid_gsbase_done:
964
+
965
+ /*
966
+ * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
967
+ * CR3 above, keep the old value in a callee saved register.
968
+ */
969
+ IBRS_ENTER save_reg=%r15
970
+ UNTRAIN_RET
971
+
972
+ RET
973
+SYM_CODE_END(paranoid_entry)
1255974
1256975 /*
1257976 * "Paranoid" exit path from exception stack. This is invoked
@@ -1260,34 +979,61 @@
1260979 *
1261980 * We may be returning to very strange contexts (e.g. very early
1262981 * in syscall entry), so checking for preemption here would
1263
- * be complicated. Fortunately, we there's no good reason
1264
- * to try to handle preemption here.
982
+ * be complicated. Fortunately, there's no good reason to try
983
+ * to handle preemption here.
1265984 *
1266
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
985
+ * R/EBX contains the GSBASE related information depending on the
986
+ * availability of the FSGSBASE instructions:
987
+ *
988
+ * FSGSBASE R/EBX
989
+ * N 0 -> SWAPGS on exit
990
+ * 1 -> no SWAPGS on exit
991
+ *
992
+ * Y User space GSBASE, must be restored unconditionally
993
+ *
994
+ * R14 - old CR3
995
+ * R15 - old SPEC_CTRL
1267996 */
1268
-ENTRY(paranoid_exit)
997
+SYM_CODE_START_LOCAL(paranoid_exit)
1269998 UNWIND_HINT_REGS
1270
- DISABLE_INTERRUPTS(CLBR_ANY)
1271
- TRACE_IRQS_OFF_DEBUG
1272
- testl %ebx, %ebx /* swapgs needed? */
1273
- jnz .Lparanoid_exit_no_swapgs
1274
- TRACE_IRQS_IRETQ
1275
- /* Always restore stashed CR3 value (see paranoid_entry) */
1276
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
1277
- SWAPGS_UNSAFE_STACK
1278
- jmp .Lparanoid_exit_restore
1279
-.Lparanoid_exit_no_swapgs:
1280
- TRACE_IRQS_IRETQ_DEBUG
1281
- /* Always restore stashed CR3 value (see paranoid_entry) */
1282
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
1283
-.Lparanoid_exit_restore:
1284
- jmp restore_regs_and_return_to_kernel
1285
-END(paranoid_exit)
999
+
1000
+ /*
1001
+ * Must restore IBRS state before both CR3 and %GS since we need access
1002
+ * to the per-CPU x86_spec_ctrl_shadow variable.
1003
+ */
1004
+ IBRS_EXIT save_reg=%r15
1005
+
1006
+ /*
1007
+ * The order of operations is important. RESTORE_CR3 requires
1008
+ * kernel GSBASE.
1009
+ *
1010
+ * NB to anyone to try to optimize this code: this code does
1011
+ * not execute at all for exceptions from user mode. Those
1012
+ * exceptions go through error_exit instead.
1013
+ */
1014
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
1015
+
1016
+ /* Handle the three GSBASE cases */
1017
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
1018
+
1019
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
1020
+ wrgsbase %rbx
1021
+ jmp restore_regs_and_return_to_kernel
1022
+
1023
+.Lparanoid_exit_checkgs:
1024
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
1025
+ testl %ebx, %ebx
1026
+ jnz restore_regs_and_return_to_kernel
1027
+
1028
+ /* We are returning to a context with user GSBASE */
1029
+ swapgs
1030
+ jmp restore_regs_and_return_to_kernel
1031
+SYM_CODE_END(paranoid_exit)
12861032
12871033 /*
12881034 * Save all registers in pt_regs, and switch GS if needed.
12891035 */
1290
-ENTRY(error_entry)
1036
+SYM_CODE_START_LOCAL(error_entry)
12911037 UNWIND_HINT_FUNC
12921038 cld
12931039 PUSH_AND_CLEAR_REGS save_ret=1
@@ -1303,8 +1049,11 @@
13031049 FENCE_SWAPGS_USER_ENTRY
13041050 /* We have user CR3. Change to kernel CR3. */
13051051 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1052
+ IBRS_ENTER
1053
+ UNTRAIN_RET
13061054
13071055 .Lerror_entry_from_usermode_after_swapgs:
1056
+
13081057 /* Put us onto the real thread stack. */
13091058 popq %r12 /* save return addr in %12 */
13101059 movq %rsp, %rdi /* arg0 = pt_regs pointer */
@@ -1312,21 +1061,7 @@
13121061 movq %rax, %rsp /* switch stack */
13131062 ENCODE_FRAME_POINTER
13141063 pushq %r12
1315
-
1316
- /*
1317
- * We need to tell lockdep that IRQs are off. We can't do this until
1318
- * we fix gsbase, and we should do it before enter_from_user_mode
1319
- * (which can take locks).
1320
- */
1321
- TRACE_IRQS_OFF
1322
- CALL_enter_from_user_mode
1323
- ret
1324
-
1325
-.Lerror_entry_done_lfence:
1326
- FENCE_SWAPGS_KERNEL_ENTRY
1327
-.Lerror_entry_done:
1328
- TRACE_IRQS_OFF
1329
- ret
1064
+ RET
13301065
13311066 /*
13321067 * There are two places in the kernel that can potentially fault with
@@ -1350,9 +1085,15 @@
13501085 * .Lgs_change's error handler with kernel gsbase.
13511086 */
13521087 SWAPGS
1353
- FENCE_SWAPGS_USER_ENTRY
1354
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1355
- jmp .Lerror_entry_done
1088
+
1089
+ /*
1090
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
1091
+ * kernel or user gsbase.
1092
+ */
1093
+.Lerror_entry_done_lfence:
1094
+ FENCE_SWAPGS_KERNEL_ENTRY
1095
+ ANNOTATE_UNRET_END
1096
+ RET
13561097
13571098 .Lbstep_iret:
13581099 /* Fix truncated RIP */
@@ -1367,6 +1108,8 @@
13671108 SWAPGS
13681109 FENCE_SWAPGS_USER_ENTRY
13691110 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1111
+ IBRS_ENTER
1112
+ UNTRAIN_RET
13701113
13711114 /*
13721115 * Pretend that the exception came from user mode: set up pt_regs
@@ -1376,16 +1119,15 @@
13761119 call fixup_bad_iret
13771120 mov %rax, %rsp
13781121 jmp .Lerror_entry_from_usermode_after_swapgs
1379
-END(error_entry)
1122
+SYM_CODE_END(error_entry)
13801123
1381
-ENTRY(error_exit)
1124
+SYM_CODE_START_LOCAL(error_return)
13821125 UNWIND_HINT_REGS
1383
- DISABLE_INTERRUPTS(CLBR_ANY)
1384
- TRACE_IRQS_OFF
1126
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
13851127 testb $3, CS(%rsp)
1386
- jz retint_kernel
1387
- jmp retint_user
1388
-END(error_exit)
1128
+ jz restore_regs_and_return_to_kernel
1129
+ jmp swapgs_restore_regs_and_return_to_usermode
1130
+SYM_CODE_END(error_return)
13891131
13901132 /*
13911133 * Runs on exception stack. Xen PV does not go through this path at all,
@@ -1395,7 +1137,7 @@
13951137 * %r14: Used to save/restore the CR3 of the interrupted context
13961138 * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
13971139 */
1398
-ENTRY(nmi)
1140
+SYM_CODE_START(asm_exc_nmi)
13991141 UNWIND_HINT_IRET_REGS
14001142
14011143 /*
@@ -1472,6 +1214,9 @@
14721214 PUSH_AND_CLEAR_REGS rdx=(%rdx)
14731215 ENCODE_FRAME_POINTER
14741216
1217
+ IBRS_ENTER
1218
+ UNTRAIN_RET
1219
+
14751220 /*
14761221 * At this point we no longer need to worry about stack damage
14771222 * due to nesting -- we're on the normal thread stack and we're
@@ -1480,7 +1225,7 @@
14801225
14811226 movq %rsp, %rdi
14821227 movq $-1, %rsi
1483
- call do_nmi
1228
+ call exc_nmi
14841229
14851230 /*
14861231 * Return back to user mode. We must *not* do the normal exit
@@ -1537,7 +1282,7 @@
15371282 * end_repeat_nmi, then we are a nested NMI. We must not
15381283 * modify the "iret" frame because it's being written by
15391284 * the outer NMI. That's okay; the outer NMI handler is
1540
- * about to about to call do_nmi anyway, so we can just
1285
+ * about to about to call exc_nmi() anyway, so we can just
15411286 * resume the outer NMI.
15421287 */
15431288
@@ -1656,7 +1401,7 @@
16561401 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
16571402 * we're repeating an NMI, gsbase has the same value that it had on
16581403 * the first iteration. paranoid_entry will load the kernel
1659
- * gsbase if needed before we call do_nmi. "NMI executing"
1404
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
16601405 * is zero.
16611406 */
16621407 movq $1, 10*8(%rsp) /* Set "NMI executing". */
@@ -1690,18 +1435,37 @@
16901435 call paranoid_entry
16911436 UNWIND_HINT_REGS
16921437
1693
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
16941438 movq %rsp, %rdi
16951439 movq $-1, %rsi
1696
- call do_nmi
1440
+ call exc_nmi
1441
+
1442
+ /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
1443
+ IBRS_EXIT save_reg=%r15
16971444
16981445 /* Always restore stashed CR3 value (see paranoid_entry) */
16991446 RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
17001447
1701
- testl %ebx, %ebx /* swapgs needed? */
1448
+ /*
1449
+ * The above invocation of paranoid_entry stored the GSBASE
1450
+ * related information in R/EBX depending on the availability
1451
+ * of FSGSBASE.
1452
+ *
1453
+ * If FSGSBASE is enabled, restore the saved GSBASE value
1454
+ * unconditionally, otherwise take the conditional SWAPGS path.
1455
+ */
1456
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
1457
+
1458
+ wrgsbase %rbx
1459
+ jmp nmi_restore
1460
+
1461
+nmi_no_fsgsbase:
1462
+ /* EBX == 0 -> invoke SWAPGS */
1463
+ testl %ebx, %ebx
17021464 jnz nmi_restore
1465
+
17031466 nmi_swapgs:
1704
- SWAPGS_UNSAFE_STACK
1467
+ swapgs
1468
+
17051469 nmi_restore:
17061470 POP_REGS
17071471
@@ -1730,15 +1494,22 @@
17301494 * about espfix64 on the way back to kernel mode.
17311495 */
17321496 iretq
1733
-END(nmi)
1497
+SYM_CODE_END(asm_exc_nmi)
17341498
1735
-ENTRY(ignore_sysret)
1499
+#ifndef CONFIG_IA32_EMULATION
1500
+/*
1501
+ * This handles SYSCALL from 32-bit code. There is no way to program
1502
+ * MSRs to fully disable 32-bit SYSCALL.
1503
+ */
1504
+SYM_CODE_START(ignore_sysret)
17361505 UNWIND_HINT_EMPTY
17371506 mov $-ENOSYS, %eax
1738
- sysret
1739
-END(ignore_sysret)
1507
+ sysretl
1508
+SYM_CODE_END(ignore_sysret)
1509
+#endif
17401510
1741
-ENTRY(rewind_stack_do_exit)
1511
+.pushsection .text, "ax"
1512
+SYM_CODE_START(rewind_stack_do_exit)
17421513 UNWIND_HINT_FUNC
17431514 /* Prevent any naive code from trying to unwind to our caller. */
17441515 xorl %ebp, %ebp
@@ -1748,4 +1519,5 @@
17481519 UNWIND_HINT_REGS
17491520
17501521 call do_exit
1751
-END(rewind_stack_do_exit)
1522
+SYM_CODE_END(rewind_stack_do_exit)
1523
+.popsection