2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/entry/entry_64.S
....@@ -8,15 +8,14 @@
88 *
99 * entry.S contains the system-call and fault low-level handling routines.
1010 *
11
- * Some of this is documented in Documentation/x86/entry_64.txt
11
+ * Some of this is documented in Documentation/x86/entry_64.rst
1212 *
1313 * A note on terminology:
1414 * - iret frame: Architecture defined interrupt frame from SS to RIP
1515 * at the top of the kernel process stack.
1616 *
1717 * Some macro usage:
18
- * - ENTRY/END: Define functions in the symbol table.
19
- * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
18
+ * - SYM_FUNC_START/END: Define functions in the symbol table.
2019 * - idtentry: Define exception entry points.
2120 */
2221 #include <linux/linkage.h>
....@@ -37,7 +36,9 @@
3736 #include <asm/pgtable_types.h>
3837 #include <asm/export.h>
3938 #include <asm/frame.h>
39
+#include <asm/trapnr.h>
4040 #include <asm/nospec-branch.h>
41
+#include <asm/fsgsbase.h>
4142 #include <linux/err.h>
4243
4344 #include "calling.h"
....@@ -45,64 +46,13 @@
4546 .code64
4647 .section .entry.text, "ax"
4748
48
-#ifdef CONFIG_PARAVIRT
49
-ENTRY(native_usergs_sysret64)
49
+#ifdef CONFIG_PARAVIRT_XXL
50
+SYM_CODE_START(native_usergs_sysret64)
5051 UNWIND_HINT_EMPTY
5152 swapgs
5253 sysretq
53
-END(native_usergs_sysret64)
54
-#endif /* CONFIG_PARAVIRT */
55
-
56
-.macro TRACE_IRQS_FLAGS flags:req
57
-#ifdef CONFIG_TRACE_IRQFLAGS
58
- btl $9, \flags /* interrupts off? */
59
- jnc 1f
60
- TRACE_IRQS_ON
61
-1:
62
-#endif
63
-.endm
64
-
65
-.macro TRACE_IRQS_IRETQ
66
- TRACE_IRQS_FLAGS EFLAGS(%rsp)
67
-.endm
68
-
69
-/*
70
- * When dynamic function tracer is enabled it will add a breakpoint
71
- * to all locations that it is about to modify, sync CPUs, update
72
- * all the code, sync CPUs, then remove the breakpoints. In this time
73
- * if lockdep is enabled, it might jump back into the debug handler
74
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
75
- *
76
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
77
- * make sure the stack pointer does not get reset back to the top
78
- * of the debug stack, and instead just reuses the current stack.
79
- */
80
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
81
-
82
-.macro TRACE_IRQS_OFF_DEBUG
83
- call debug_stack_set_zero
84
- TRACE_IRQS_OFF
85
- call debug_stack_reset
86
-.endm
87
-
88
-.macro TRACE_IRQS_ON_DEBUG
89
- call debug_stack_set_zero
90
- TRACE_IRQS_ON
91
- call debug_stack_reset
92
-.endm
93
-
94
-.macro TRACE_IRQS_IRETQ_DEBUG
95
- btl $9, EFLAGS(%rsp) /* interrupts off? */
96
- jnc 1f
97
- TRACE_IRQS_ON_DEBUG
98
-1:
99
-.endm
100
-
101
-#else
102
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
103
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
104
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
105
-#endif
54
+SYM_CODE_END(native_usergs_sysret64)
55
+#endif /* CONFIG_PARAVIRT_XXL */
10656
10757 /*
10858 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
....@@ -142,102 +92,37 @@
14292 * with them due to bugs in both AMD and Intel CPUs.
14393 */
14494
145
- .pushsection .entry_trampoline, "ax"
95
+SYM_CODE_START(entry_SYSCALL_64)
96
+ UNWIND_HINT_ENTRY
14697
147
-/*
148
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
149
- * that the assembler and linker have the wrong idea as to where this code
150
- * lives (and, in fact, it's mapped more than once, so it's not even at a
151
- * fixed address). So we can't reference any symbols outside the entry
152
- * trampoline and expect it to work.
153
- *
154
- * Instead, we carefully abuse %rip-relative addressing.
155
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
156
- * trampoline. We can thus find cpu_entry_area with this macro:
157
- */
158
-
159
-#define CPU_ENTRY_AREA \
160
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
161
-
162
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
163
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
164
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
165
-
166
-ENTRY(entry_SYSCALL_64_trampoline)
167
- UNWIND_HINT_EMPTY
16898 swapgs
169
-
170
- /* Stash the user RSP. */
171
- movq %rsp, RSP_SCRATCH
172
-
173
- /* Note: using %rsp as a scratch reg. */
99
+ /* tss.sp2 is scratch space. */
100
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
174101 SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
175
-
176
- /* Load the top of the task stack into RSP */
177
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
178
-
179
- /* Start building the simulated IRET frame. */
180
- pushq $__USER_DS /* pt_regs->ss */
181
- pushq RSP_SCRATCH /* pt_regs->sp */
182
- pushq %r11 /* pt_regs->flags */
183
- pushq $__USER_CS /* pt_regs->cs */
184
- pushq %rcx /* pt_regs->ip */
185
-
186
- /*
187
- * x86 lacks a near absolute jump, and we can't jump to the real
188
- * entry text with a relative jump. We could push the target
189
- * address and then use retq, but this destroys the pipeline on
190
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
191
- * spill RDI and restore it in a second-stage trampoline.
192
- */
193
- pushq %rdi
194
- movq $entry_SYSCALL_64_stage2, %rdi
195
- JMP_NOSPEC %rdi
196
-END(entry_SYSCALL_64_trampoline)
197
-
198
- .popsection
199
-
200
-ENTRY(entry_SYSCALL_64_stage2)
201
- UNWIND_HINT_EMPTY
202
- popq %rdi
203
- jmp entry_SYSCALL_64_after_hwframe
204
-END(entry_SYSCALL_64_stage2)
205
-
206
-ENTRY(entry_SYSCALL_64)
207
- UNWIND_HINT_EMPTY
208
- /*
209
- * Interrupts are off on entry.
210
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
211
- * it is too small to ever cause noticeable irq latency.
212
- */
213
-
214
- swapgs
215
- /*
216
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
217
- * is not required to switch CR3.
218
- */
219
- movq %rsp, PER_CPU_VAR(rsp_scratch)
220102 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
221103
104
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
105
+
222106 /* Construct struct pt_regs on stack */
223
- pushq $__USER_DS /* pt_regs->ss */
224
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
225
- pushq %r11 /* pt_regs->flags */
226
- pushq $__USER_CS /* pt_regs->cs */
227
- pushq %rcx /* pt_regs->ip */
228
-GLOBAL(entry_SYSCALL_64_after_hwframe)
229
- pushq %rax /* pt_regs->orig_ax */
107
+ pushq $__USER_DS /* pt_regs->ss */
108
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
109
+ pushq %r11 /* pt_regs->flags */
110
+ pushq $__USER_CS /* pt_regs->cs */
111
+ pushq %rcx /* pt_regs->ip */
112
+SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
113
+ pushq %rax /* pt_regs->orig_ax */
230114
231115 PUSH_AND_CLEAR_REGS rax=$-ENOSYS
232
-
233
- TRACE_IRQS_OFF
234116
235117 /* IRQs are off. */
236118 movq %rax, %rdi
237119 movq %rsp, %rsi
238
- call do_syscall_64 /* returns with IRQs disabled */
239120
240
- TRACE_IRQS_IRETQ /* we're about to change IF */
121
+ /* clobbers %rax, make sure it is after saving the syscall nr */
122
+ IBRS_ENTER
123
+ UNTRAIN_RET
124
+
125
+ call do_syscall_64 /* returns with IRQs disabled */
241126
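A note on the frame built above: SYSCALL itself pushes nothing; the CPU only stashes the user RIP in RCX and RFLAGS in R11, so the five pushes before entry_SYSCALL_64_after_hwframe hand-build an iret-style frame with the user RSP taken from tss.sp2, followed by orig_ax and PUSH_AND_CLEAR_REGS. A small user-space C sketch of that layout (field names are illustrative and only model the frame, not the kernel's struct pt_regs):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Fields in ascending address order; SS ends up at the highest address
     * because it is pushed first.
     */
    struct syscall_hw_frame {
        uint64_t rip;    /* pushq %rcx  (SYSCALL stashed the user RIP in RCX)    */
        uint64_t cs;     /* pushq $__USER_CS                                     */
        uint64_t rflags; /* pushq %r11  (SYSCALL stashed the user RFLAGS in R11) */
        uint64_t rsp;    /* pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)              */
        uint64_t ss;     /* pushq $__USER_DS                                     */
    };

    int main(void)
    {
        /* orig_ax (the syscall number in RAX) is pushed just below this frame,
         * then PUSH_AND_CLEAR_REGS completes the rest of pt_regs. */
        printf("simulated iret frame: %zu bytes\n", sizeof(struct syscall_hw_frame));
        return 0;
    }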
242127 /*
243128 * Try to use SYSRET instead of IRET if we're returning to
....@@ -311,8 +196,8 @@
311196 * perf profiles. Nothing jumps here.
312197 */
313198 syscall_return_via_sysret:
314
- /* rcx and r11 are already restored (see code above) */
315
- POP_REGS pop_rdi=0 skip_r11rcx=1
199
+ IBRS_EXIT
200
+ POP_REGS pop_rdi=0
316201
317202 /*
318203 * Now all regs are restored except RSP and RDI.
....@@ -329,19 +214,21 @@
329214 * We are on the trampoline stack. All regs except RDI are live.
330215 * We can do future final exit work right here.
331216 */
217
+ STACKLEAK_ERASE_NOCLOBBER
218
+
332219 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
333220
334221 popq %rdi
335222 popq %rsp
336223 USERGS_SYSRET64
337
-END(entry_SYSCALL_64)
224
+SYM_CODE_END(entry_SYSCALL_64)
338225
339226 /*
340227 * %rdi: prev task
341228 * %rsi: next task
342229 */
343
-ENTRY(__switch_to_asm)
344
- UNWIND_HINT_FUNC
230
+.pushsection .text, "ax"
231
+SYM_FUNC_START(__switch_to_asm)
345232 /*
346233 * Save callee-saved registers
347234 * This must match the order in inactive_task_frame
....@@ -352,7 +239,6 @@
352239 pushq %r13
353240 pushq %r14
354241 pushq %r15
355
- pushfq
356242
357243 /* switch stack */
358244 movq %rsp, TASK_threadsp(%rdi)
....@@ -360,10 +246,9 @@
360246
361247 #ifdef CONFIG_STACKPROTECTOR
362248 movq TASK_stack_canary(%rsi), %rbx
363
- movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
249
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
364250 #endif
365251
366
-#ifdef CONFIG_RETPOLINE
367252 /*
368253 * When switching from a shallower to a deeper call stack
369254 * the RSB may either underflow or use entries populated
....@@ -372,10 +257,8 @@
372257 * speculative execution to prevent attack.
373258 */
374259 FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
375
-#endif
376260
377261 /* restore callee-saved registers */
378
- popfq
379262 popq %r15
380263 popq %r14
381264 popq %r13
....@@ -384,7 +267,8 @@
384267 popq %rbp
385268
386269 jmp __switch_to
387
-END(__switch_to_asm)
270
+SYM_FUNC_END(__switch_to_asm)
271
+.popsection
388272
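The push/pop order in __switch_to_asm above is what makes the stack switch work: the scheduler resumes a task simply by loading the RSP saved in TASK_threadsp and popping the same frame back. A hedged C model of the frame left at that saved RSP (field names are illustrative; the kernel's own declaration of this layout is struct inactive_task_frame):

    #include <stdint.h>

    struct switched_out_frame {
        uint64_t r15;      /* pushed last, so at the lowest address            */
        uint64_t r14;
        uint64_t r13;
        uint64_t r12;
        uint64_t rbx;
        uint64_t rbp;      /* pushed first                                     */
        uint64_t ret_addr; /* return into the C code that called __switch_to_asm */
    };

    _Static_assert(sizeof(struct switched_out_frame) == 7 * 8,
                   "frame is seven 8-byte words");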
389273 /*
390274 * A newly forked process directly context switches into this address.
....@@ -393,7 +277,8 @@
393277 * rbx: kernel thread func (NULL for user thread)
394278 * r12: kernel thread arg
395279 */
396
-ENTRY(ret_from_fork)
280
+.pushsection .text, "ax"
281
+SYM_CODE_START(ret_from_fork)
397282 UNWIND_HINT_EMPTY
398283 movq %rax, %rdi
399284 call schedule_tail /* rdi: 'prev' task parameter */
....@@ -404,51 +289,23 @@
404289 2:
405290 UNWIND_HINT_REGS
406291 movq %rsp, %rdi
407
- call syscall_return_slowpath /* returns with IRQs disabled */
408
- TRACE_IRQS_ON /* user mode is traced as IRQS on */
292
+ call syscall_exit_to_user_mode /* returns with IRQs disabled */
409293 jmp swapgs_restore_regs_and_return_to_usermode
410294
411295 1:
412296 /* kernel thread */
413297 UNWIND_HINT_EMPTY
414298 movq %r12, %rdi
415
- CALL_NOSPEC %rbx
299
+ CALL_NOSPEC rbx
416300 /*
417301 * A kernel thread is allowed to return here after successfully
418
- * calling do_execve(). Exit to userspace to complete the execve()
302
+ * calling kernel_execve(). Exit to userspace to complete the execve()
419303 * syscall.
420304 */
421305 movq $0, RAX(%rsp)
422306 jmp 2b
423
-END(ret_from_fork)
424
-
425
-/*
426
- * Build the entry stubs with some assembler magic.
427
- * We pack 1 stub into every 8-byte block.
428
- */
429
- .align 8
430
-ENTRY(irq_entries_start)
431
- vector=FIRST_EXTERNAL_VECTOR
432
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
433
- UNWIND_HINT_IRET_REGS
434
- pushq $(~vector+0x80) /* Note: always in signed byte range */
435
- jmp common_interrupt
436
- .align 8
437
- vector=vector+1
438
- .endr
439
-END(irq_entries_start)
440
-
441
- .align 8
442
-ENTRY(spurious_entries_start)
443
- vector=FIRST_SYSTEM_VECTOR
444
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
445
- UNWIND_HINT_IRET_REGS
446
- pushq $(~vector+0x80) /* Note: always in signed byte range */
447
- jmp common_spurious
448
- .align 8
449
- vector=vector+1
450
- .endr
451
-END(spurious_entries_start)
307
+SYM_CODE_END(ret_from_fork)
308
+.popsection
452309
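The rbx/r12 convention from the header comment drives the branch in ret_from_fork: a NULL function pointer means the child returns to user space, anything else is a kernel thread whose only legitimate way back onto the exit path is kernel_execve(). A purely illustrative C sketch of that dispatch (names invented for the example):

    #include <stdio.h>

    typedef int (*kthread_fn_t)(void *arg);

    /* Mirrors the rbx/r12 convention: fn == NULL means "user thread". */
    static void ret_from_fork_model(kthread_fn_t fn, void *arg)
    {
        if (!fn) {
            puts("user thread: run exit-to-user work, return to user mode");
            return;
        }
        fn(arg);    /* CALL_NOSPEC rbx */
        /* Only legitimate if the kernel thread called kernel_execve(): */
        puts("kernel thread became a user task: return to user mode with RAX = 0");
    }

    static int demo_kthread(void *arg)
    {
        printf("kernel thread running, arg=%p\n", arg);
        return 0;
    }

    int main(void)
    {
        ret_from_fork_model(NULL, NULL);
        ret_from_fork_model(demo_kthread, (void *)0x1234);
        return 0;
    }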
453310 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
454311 #ifdef CONFIG_DEBUG_ENTRY
....@@ -462,229 +319,260 @@
462319 #endif
463320 .endm
464321
465
-/*
466
- * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
467
- * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
468
- * Requires kernel GSBASE.
469
- *
470
- * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
322
+/**
323
+ * idtentry_body - Macro to emit code calling the C function
324
+ * @cfunc: C function to be called
325
+ * @has_error_code: Hardware pushed error code on stack
471326 */
472
-.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
473
- DEBUG_ENTRY_ASSERT_IRQS_OFF
327
+.macro idtentry_body cfunc has_error_code:req
474328
475
- .if \save_ret
476
- /*
477
- * If save_ret is set, the original stack contains one additional
478
- * entry -- the return address. Therefore, move the address one
479
- * entry below %rsp to \old_rsp.
480
- */
481
- leaq 8(%rsp), \old_rsp
482
- .else
483
- movq %rsp, \old_rsp
329
+ call error_entry
330
+ UNWIND_HINT_REGS
331
+
332
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
333
+
334
+ .if \has_error_code == 1
335
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
336
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
484337 .endif
485338
486
- .if \regs
487
- UNWIND_HINT_REGS base=\old_rsp
339
+ call \cfunc
340
+
341
+ jmp error_return
342
+.endm
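idtentry_body fixes the C calling convention for all non-paranoid entries: pt_regs always goes in the first argument, and when the CPU pushed an error code it is read from ORIG_RAX and that slot is poisoned with -1 so nothing later mistakes the frame for an interrupted syscall. A minimal user-space model of that argument shuffling (struct and handler names are placeholders, not the kernel's DEFINE_IDTENTRY machinery):

    #include <stdio.h>

    struct pt_regs { unsigned long orig_ax; /* ... saved registers elided ... */ };

    /* has_error_code == 1 shape: the error code arrives as the 2nd argument. */
    static void example_handler(struct pt_regs *regs, unsigned long error_code)
    {
        printf("error_code=%#lx, orig_ax now %#lx\n", error_code, regs->orig_ax);
    }

    /* Models the movq ORIG_RAX(%rsp), %rsi / movq $-1, ORIG_RAX(%rsp) pair above. */
    static void idtentry_body_model(struct pt_regs *regs)
    {
        unsigned long error_code = regs->orig_ax; /* what the CPU pushed      */

        regs->orig_ax = -1UL;                     /* "no syscall to restart"  */
        example_handler(regs, error_code);
    }

    int main(void)
    {
        struct pt_regs regs = { .orig_ax = 0x10 };

        idtentry_body_model(&regs);
        return 0;
    }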
343
+
344
+/**
345
+ * idtentry - Macro to generate entry stubs for simple IDT entries
346
+ * @vector: Vector number
347
+ * @asmsym: ASM symbol for the entry point
348
+ * @cfunc: C function to be called
349
+ * @has_error_code: Hardware pushed error code on stack
350
+ *
351
+ * The macro emits code to set up the kernel context for straightforward
352
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
353
+ */
354
+.macro idtentry vector asmsym cfunc has_error_code:req
355
+SYM_CODE_START(\asmsym)
356
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
357
+ ASM_CLAC
358
+
359
+ .if \has_error_code == 0
360
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
488361 .endif
489362
490
- incl PER_CPU_VAR(irq_count)
491
- jnz .Lirq_stack_push_old_rsp_\@
363
+ .if \vector == X86_TRAP_BP
364
+ /*
365
+ * If coming from kernel space, create a 6-word gap to allow the
366
+ * int3 handler to emulate a call instruction.
367
+ */
368
+ testb $3, CS-ORIG_RAX(%rsp)
369
+ jnz .Lfrom_usermode_no_gap_\@
370
+ .rept 6
371
+ pushq 5*8(%rsp)
372
+ .endr
373
+ UNWIND_HINT_IRET_REGS offset=8
374
+.Lfrom_usermode_no_gap_\@:
375
+ .endif
376
+
377
+ idtentry_body \cfunc \has_error_code
378
+
379
+_ASM_NOKPROBE(\asmsym)
380
+SYM_CODE_END(\asmsym)
381
+.endm
382
+
383
+/*
384
+ * Interrupt entry/exit.
385
+ *
386
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
387
+ * position of idtentry exceptions, and jump to one of the two idtentry points
388
+ * (common/spurious).
389
+ *
390
+ * common_interrupt is a hotpath, align it to a cache line
391
+ */
392
+.macro idtentry_irq vector cfunc
393
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
394
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
395
+.endm
396
+
397
+/*
398
+ * System vectors which invoke their handlers directly and are not
399
+ * going through the regular common device interrupt handling code.
400
+ */
401
+.macro idtentry_sysvec vector cfunc
402
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
403
+.endm
404
+
405
+/**
406
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
407
+ * @vector: Vector number
408
+ * @asmsym: ASM symbol for the entry point
409
+ * @cfunc: C function to be called
410
+ *
411
+ * The macro emits code to set up the kernel context for #MC and #DB
412
+ *
413
+ * If the entry comes from user space it uses the normal entry path
414
+ * including the return to user space work and preemption checks on
415
+ * exit.
416
+ *
417
+ * If it hits in kernel mode then it needs to go through the paranoid
418
+ * entry as the exception can hit any random state. No preemption
419
+ * check on exit to keep the paranoid path simple.
420
+ */
421
+.macro idtentry_mce_db vector asmsym cfunc
422
+SYM_CODE_START(\asmsym)
423
+ UNWIND_HINT_IRET_REGS
424
+ ASM_CLAC
425
+
426
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
492427
493428 /*
494
- * Right now, if we just incremented irq_count to zero, we've
495
- * claimed the IRQ stack but we haven't switched to it yet.
496
- *
497
- * If anything is added that can interrupt us here without using IST,
498
- * it must be *extremely* careful to limit its stack usage. This
499
- * could include kprobes and a hypothetical future IST-less #DB
500
- * handler.
501
- *
502
- * The OOPS unwinder relies on the word at the top of the IRQ
503
- * stack linking back to the previous RSP for the entire time we're
504
- * on the IRQ stack. For this to work reliably, we need to write
505
- * it before we actually move ourselves to the IRQ stack.
429
+ * If the entry is from userspace, switch stacks and treat it as
430
+ * a normal entry.
506431 */
432
+ testb $3, CS-ORIG_RAX(%rsp)
433
+ jnz .Lfrom_usermode_switch_stack_\@
507434
508
- movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
509
- movq PER_CPU_VAR(irq_stack_ptr), %rsp
435
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
436
+ call paranoid_entry
510437
511
-#ifdef CONFIG_DEBUG_ENTRY
438
+ UNWIND_HINT_REGS
439
+
440
+ movq %rsp, %rdi /* pt_regs pointer */
441
+
442
+ call \cfunc
443
+
444
+ jmp paranoid_exit
445
+
446
+ /* Switch to the regular task stack and use the noist entry point */
447
+.Lfrom_usermode_switch_stack_\@:
448
+ idtentry_body noist_\cfunc, has_error_code=0
449
+
450
+_ASM_NOKPROBE(\asmsym)
451
+SYM_CODE_END(\asmsym)
452
+.endm
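The deciding test in both idtentry and idtentry_mce_db is testb $3, CS-ORIG_RAX(%rsp): the low two bits of the saved CS hold the interrupted privilege level, so a non-zero result means user mode and the cheap task-stack (noist) path, zero means kernel mode and the paranoid path. A tiny C sketch of that check, assuming the conventional CS selector values:

    #include <stdio.h>

    /* The low two bits of the saved CS are the privilege level of the
     * interrupted context: 3 means user mode, 0 means kernel mode. */
    static int came_from_user_mode(unsigned long saved_cs)
    {
        return (saved_cs & 3) != 0;    /* testb $3, CS-ORIG_RAX(%rsp) */
    }

    int main(void)
    {
        printf("CS=0x33 -> %s\n",
               came_from_user_mode(0x33) ? "task-stack (noist) entry" : "paranoid entry");
        printf("CS=0x10 -> %s\n",
               came_from_user_mode(0x10) ? "task-stack (noist) entry" : "paranoid entry");
        return 0;
    }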
453
+
454
+#ifdef CONFIG_AMD_MEM_ENCRYPT
455
+/**
456
+ * idtentry_vc - Macro to generate entry stub for #VC
457
+ * @vector: Vector number
458
+ * @asmsym: ASM symbol for the entry point
459
+ * @cfunc: C function to be called
460
+ *
461
+ * The macro emits code to set up the kernel context for #VC. The #VC handler
462
+ * runs on an IST stack and needs to be able to cause nested #VC exceptions.
463
+ *
464
+ * To make this work the #VC entry code tries its best to pretend it doesn't use
465
+ * an IST stack by switching to the task stack if coming from user-space (which
466
+ * includes early SYSCALL entry path) or back to the stack in the IRET frame if
467
+ * entered from kernel-mode.
468
+ *
469
+ * If entered from kernel-mode the return stack is validated first, and if it is
470
+ * not safe to use (e.g. because it points to the entry stack) the #VC handler
471
+ * will switch to a fall-back stack (VC2) and call a special handler function.
472
+ *
473
+ * The macro is only used for one vector, but it is planned to be extended in
474
+ * the future for the #HV exception.
475
+ */
476
+.macro idtentry_vc vector asmsym cfunc
477
+SYM_CODE_START(\asmsym)
478
+ UNWIND_HINT_IRET_REGS
479
+ ASM_CLAC
480
+
512481 /*
513
- * If the first movq above becomes wrong due to IRQ stack layout
514
- * changes, the only way we'll notice is if we try to unwind right
515
- * here. Assert that we set up the stack right to catch this type
516
- * of bug quickly.
482
+ * If the entry is from userspace, switch stacks and treat it as
483
+ * a normal entry.
517484 */
518
- cmpq -8(%rsp), \old_rsp
519
- je .Lirq_stack_okay\@
520
- ud2
521
- .Lirq_stack_okay\@:
485
+ testb $3, CS-ORIG_RAX(%rsp)
486
+ jnz .Lfrom_usermode_switch_stack_\@
487
+
488
+ /*
489
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
490
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
491
+ */
492
+ call paranoid_entry
493
+
494
+ UNWIND_HINT_REGS
495
+
496
+ /*
497
+ * Switch off the IST stack to make it free for nested exceptions. The
498
+ * vc_switch_off_ist() function will switch back to the interrupted
499
+ * stack if it is safe to do so. If not it switches to the VC fall-back
500
+ * stack.
501
+ */
502
+ movq %rsp, %rdi /* pt_regs pointer */
503
+ call vc_switch_off_ist
504
+ movq %rax, %rsp /* Switch to new stack */
505
+
506
+ ENCODE_FRAME_POINTER
507
+ UNWIND_HINT_REGS
508
+
509
+ /* Update pt_regs */
510
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
511
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
512
+
513
+ movq %rsp, %rdi /* pt_regs pointer */
514
+
515
+ call kernel_\cfunc
516
+
517
+ /*
518
+ * No need to switch back to the IST stack. The current stack is either
519
+ * identical to the stack in the IRET frame or the VC fall-back stack,
520
+ * so it is definitely mapped even with PTI enabled.
521
+ */
522
+ jmp paranoid_exit
523
+
524
+ /* Switch to the regular task stack */
525
+.Lfrom_usermode_switch_stack_\@:
526
+ idtentry_body user_\cfunc, has_error_code=1
527
+
528
+_ASM_NOKPROBE(\asmsym)
529
+SYM_CODE_END(\asmsym)
530
+.endm
522531 #endif
523532
524
-.Lirq_stack_push_old_rsp_\@:
525
- pushq \old_rsp
526
-
527
- .if \regs
528
- UNWIND_HINT_REGS indirect=1
529
- .endif
530
-
531
- .if \save_ret
532
- /*
533
- * Push the return address to the stack. This return address can
534
- * be found at the "real" original RSP, which was offset by 8 at
535
- * the beginning of this macro.
536
- */
537
- pushq -8(\old_rsp)
538
- .endif
539
-.endm
540
-
541533 /*
542
- * Undoes ENTER_IRQ_STACK.
534
+ * Double fault entry. Straight paranoid. No checks from which context
535
+ * this comes because for the espfix induced #DF this would do the wrong
536
+ * thing.
543537 */
544
-.macro LEAVE_IRQ_STACK regs=1
545
- DEBUG_ENTRY_ASSERT_IRQS_OFF
546
- /* We need to be off the IRQ stack before decrementing irq_count. */
547
- popq %rsp
548
-
549
- .if \regs
550
- UNWIND_HINT_REGS
551
- .endif
552
-
553
- /*
554
- * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
555
- * the irq stack but we're not on it.
556
- */
557
-
558
- decl PER_CPU_VAR(irq_count)
559
-.endm
560
-
561
-/*
562
- * Interrupt entry helper function.
563
- *
564
- * Entry runs with interrupts off. Stack layout at entry:
565
- * +----------------------------------------------------+
566
- * | regs->ss |
567
- * | regs->rsp |
568
- * | regs->eflags |
569
- * | regs->cs |
570
- * | regs->ip |
571
- * +----------------------------------------------------+
572
- * | regs->orig_ax = ~(interrupt number) |
573
- * +----------------------------------------------------+
574
- * | return address |
575
- * +----------------------------------------------------+
576
- */
577
-ENTRY(interrupt_entry)
578
- UNWIND_HINT_IRET_REGS offset=16
538
+.macro idtentry_df vector asmsym cfunc
539
+SYM_CODE_START(\asmsym)
540
+ UNWIND_HINT_IRET_REGS offset=8
579541 ASM_CLAC
580
- cld
581542
582
- testb $3, CS-ORIG_RAX+8(%rsp)
583
- jz 1f
584
- SWAPGS
585
- FENCE_SWAPGS_USER_ENTRY
586
- /*
587
- * Switch to the thread stack. The IRET frame and orig_ax are
588
- * on the stack, as well as the return address. RDI..R12 are
589
- * not (yet) on the stack and space has not (yet) been
590
- * allocated for them.
591
- */
592
- pushq %rdi
543
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
544
+ call paranoid_entry
545
+ UNWIND_HINT_REGS
593546
594
- /* Need to switch before accessing the thread stack. */
595
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
596
- movq %rsp, %rdi
597
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
547
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
548
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
549
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
550
+ call \cfunc
598551
599
- /*
600
- * We have RDI, return address, and orig_ax on the stack on
601
- * top of the IRET frame. That means offset=24
602
- */
603
- UNWIND_HINT_IRET_REGS base=%rdi offset=24
552
+ jmp paranoid_exit
604553
605
- pushq 7*8(%rdi) /* regs->ss */
606
- pushq 6*8(%rdi) /* regs->rsp */
607
- pushq 5*8(%rdi) /* regs->eflags */
608
- pushq 4*8(%rdi) /* regs->cs */
609
- pushq 3*8(%rdi) /* regs->ip */
610
- UNWIND_HINT_IRET_REGS
611
- pushq 2*8(%rdi) /* regs->orig_ax */
612
- pushq 8(%rdi) /* return address */
613
-
614
- movq (%rdi), %rdi
615
- jmp 2f
616
-1:
617
- FENCE_SWAPGS_KERNEL_ENTRY
618
-2:
619
- PUSH_AND_CLEAR_REGS save_ret=1
620
- ENCODE_FRAME_POINTER 8
621
-
622
- testb $3, CS+8(%rsp)
623
- jz 1f
624
-
625
- /*
626
- * IRQ from user mode.
627
- *
628
- * We need to tell lockdep that IRQs are off. We can't do this until
629
- * we fix gsbase, and we should do it before enter_from_user_mode
630
- * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
631
- * the simplest way to handle it is to just call it twice if
632
- * we enter from user mode. There's no reason to optimize this since
633
- * TRACE_IRQS_OFF is a no-op if lockdep is off.
634
- */
635
- TRACE_IRQS_OFF
636
-
637
- CALL_enter_from_user_mode
638
-
639
-1:
640
- ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
641
- /* We entered an interrupt context - irqs are off: */
642
- TRACE_IRQS_OFF
643
-
644
- ret
645
-END(interrupt_entry)
646
-_ASM_NOKPROBE(interrupt_entry)
647
-
648
-
649
-/* Interrupt entry/exit. */
554
+_ASM_NOKPROBE(\asmsym)
555
+SYM_CODE_END(\asmsym)
556
+.endm
650557
651558 /*
652
- * The interrupt stubs push (~vector+0x80) onto the stack and
653
- * then jump to common_spurious/interrupt.
559
+ * Include the defines which emit the idt entries which are shared
560
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
561
+ * so the stacktrace boundary checks work.
654562 */
655
-common_spurious:
656
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
657
- call interrupt_entry
658
- UNWIND_HINT_REGS indirect=1
659
- call smp_spurious_interrupt /* rdi points to pt_regs */
660
- jmp ret_from_intr
661
-END(common_spurious)
662
-_ASM_NOKPROBE(common_spurious)
563
+ .align 16
564
+ .globl __irqentry_text_start
565
+__irqentry_text_start:
663566
664
-/* common_interrupt is a hotpath. Align it */
665
- .p2align CONFIG_X86_L1_CACHE_SHIFT
666
-common_interrupt:
667
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
668
- call interrupt_entry
669
- UNWIND_HINT_REGS indirect=1
670
- call do_IRQ /* rdi points to pt_regs */
671
- /* 0(%rsp): old RSP */
672
-ret_from_intr:
673
- DISABLE_INTERRUPTS(CLBR_ANY)
674
- TRACE_IRQS_OFF
567
+#include <asm/idtentry.h>
675568
676
- LEAVE_IRQ_STACK
569
+ .align 16
570
+ .globl __irqentry_text_end
571
+__irqentry_text_end:
677572
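The __irqentry_text_start/__irqentry_text_end labels exist only so stack-trace code can test whether a return address lies inside the entry stubs emitted from asm/idtentry.h. A hedged sketch of that containment check, with the linker-visible labels stood in by an ordinary array:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for the linker-visible labels emitted above. */
    static char irqentry_text[256];
    #define __irqentry_text_start (irqentry_text)
    #define __irqentry_text_end   (irqentry_text + sizeof(irqentry_text))

    static bool in_irqentry_text(unsigned long addr)
    {
        return addr >= (unsigned long)__irqentry_text_start &&
               addr <  (unsigned long)__irqentry_text_end;
    }

    int main(void)
    {
        unsigned long inside  = (unsigned long)irqentry_text + 8;
        unsigned long outside = (unsigned long)irqentry_text + sizeof(irqentry_text);

        printf("inside:  %d\n", in_irqentry_text(inside));   /* 1 */
        printf("outside: %d\n", in_irqentry_text(outside));  /* 0 */
        return 0;
    }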
678
- testb $3, CS(%rsp)
679
- jz retint_kernel
680
-
681
- /* Interrupt came from user space */
682
-GLOBAL(retint_user)
683
- mov %rsp,%rdi
684
- call prepare_exit_to_usermode
685
- TRACE_IRQS_IRETQ
686
-
687
-GLOBAL(swapgs_restore_regs_and_return_to_usermode)
573
+SYM_CODE_START_LOCAL(common_interrupt_return)
574
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
575
+ IBRS_EXIT
688576 #ifdef CONFIG_DEBUG_ENTRY
689577 /* Assert that pt_regs indicates user mode. */
690578 testb $3, CS(%rsp)
....@@ -692,6 +580,10 @@
692580 ud2
693581 1:
694582 #endif
583
+#ifdef CONFIG_XEN_PV
584
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
585
+#endif
586
+
695587 POP_REGS pop_rdi=0
696588
697589 /*
....@@ -716,6 +608,7 @@
716608 * We are on the trampoline stack. All regs except RDI are live.
717609 * We can do future final exit work right here.
718610 */
611
+ STACKLEAK_ERASE_NOCLOBBER
719612
720613 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
721614
....@@ -725,41 +618,7 @@
725618 INTERRUPT_RETURN
726619
727620
728
-/* Returning to kernel space */
729
-retint_kernel:
730
-#ifdef CONFIG_PREEMPT
731
- /* Interrupts are off */
732
- /* Check if we need preemption */
733
- btl $9, EFLAGS(%rsp) /* were interrupts off? */
734
- jnc 1f
735
-0: cmpl $0, PER_CPU_VAR(__preempt_count)
736
-#ifndef CONFIG_PREEMPT_LAZY
737
- jnz 1f
738
-#else
739
- jz do_preempt_schedule_irq
740
-
741
- # atleast preempt count == 0 ?
742
- cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
743
- jnz 1f
744
-
745
- movq PER_CPU_VAR(current_task), %rcx
746
- cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
747
- jnz 1f
748
-
749
- btl $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
750
- jnc 1f
751
-do_preempt_schedule_irq:
752
-#endif
753
- call preempt_schedule_irq
754
- jmp 0b
755
-1:
756
-#endif
757
- /*
758
- * The iretq could re-enable interrupts:
759
- */
760
- TRACE_IRQS_IRETQ
761
-
762
-GLOBAL(restore_regs_and_return_to_kernel)
621
+SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
763622 #ifdef CONFIG_DEBUG_ENTRY
764623 /* Assert that pt_regs indicates kernel mode. */
765624 testb $3, CS(%rsp)
....@@ -775,7 +634,7 @@
775634 */
776635 INTERRUPT_RETURN
777636
778
-ENTRY(native_iret)
637
+SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
779638 UNWIND_HINT_IRET_REGS
780639 /*
781640 * Are we returning to a stack segment from the LDT? Note: in
....@@ -786,12 +645,11 @@
786645 jnz native_irq_return_ldt
787646 #endif
788647
789
-.global native_irq_return_iret
790
-native_irq_return_iret:
648
+SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
791649 /*
792650 * This may fault. Non-paranoid faults on return to userspace are
793651 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
794
- * Double-faults due to espfix64 are handled in do_double_fault.
652
+ * Double-faults due to espfix64 are handled in exc_double_fault.
795653 * Other faults here are fatal.
796654 */
797655 iretq
....@@ -820,8 +678,9 @@
820678 */
821679
822680 pushq %rdi /* Stash user RDI */
823
- SWAPGS /* to kernel GS */
681
+ swapgs /* to kernel GS */
824682 SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
683
+ UNTRAIN_RET
825684
826685 movq PER_CPU_VAR(espfix_waddr), %rdi
827686 movq %rax, (0*8)(%rdi) /* user RAX */
....@@ -850,7 +709,7 @@
850709 orq PER_CPU_VAR(espfix_stack), %rax
851710
852711 SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
853
- SWAPGS /* to user GS */
712
+ swapgs /* to user GS */
854713 popq %rdi /* Restore user RDI */
855714
856715 movq %rax, %rsp
....@@ -869,226 +728,32 @@
869728 */
870729 jmp native_irq_return_iret
871730 #endif
872
-END(common_interrupt)
873
-_ASM_NOKPROBE(common_interrupt)
731
+SYM_CODE_END(common_interrupt_return)
732
+_ASM_NOKPROBE(common_interrupt_return)
874733
875734 /*
876
- * APIC interrupts.
735
+ * Reload gs selector with exception handling
736
+ * edi: new selector
737
+ *
738
+ * Is in entry.text as it shouldn't be instrumented.
877739 */
878
-.macro apicinterrupt3 num sym do_sym
879
-ENTRY(\sym)
880
- UNWIND_HINT_IRET_REGS
881
- pushq $~(\num)
882
-.Lcommon_\sym:
883
- call interrupt_entry
884
- UNWIND_HINT_REGS indirect=1
885
- call \do_sym /* rdi points to pt_regs */
886
- jmp ret_from_intr
887
-END(\sym)
888
-_ASM_NOKPROBE(\sym)
889
-.endm
890
-
891
-/* Make sure APIC interrupt handlers end up in the irqentry section: */
892
-#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
893
-#define POP_SECTION_IRQENTRY .popsection
894
-
895
-.macro apicinterrupt num sym do_sym
896
-PUSH_SECTION_IRQENTRY
897
-apicinterrupt3 \num \sym \do_sym
898
-POP_SECTION_IRQENTRY
899
-.endm
900
-
901
-#ifdef CONFIG_SMP
902
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
903
-apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
904
-#endif
905
-
906
-#ifdef CONFIG_X86_UV
907
-apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
908
-#endif
909
-
910
-apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
911
-apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
912
-
913
-#ifdef CONFIG_HAVE_KVM
914
-apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
915
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
916
-apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
917
-#endif
918
-
919
-#ifdef CONFIG_X86_MCE_THRESHOLD
920
-apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
921
-#endif
922
-
923
-#ifdef CONFIG_X86_MCE_AMD
924
-apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
925
-#endif
926
-
927
-#ifdef CONFIG_X86_THERMAL_VECTOR
928
-apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
929
-#endif
930
-
931
-#ifdef CONFIG_SMP
932
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
933
-apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
934
-apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
935
-#endif
936
-
937
-apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
938
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
939
-
940
-#ifdef CONFIG_IRQ_WORK
941
-apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
942
-#endif
943
-
944
-/*
945
- * Exception entry points.
946
- */
947
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
948
-
949
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
950
-ENTRY(\sym)
951
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
952
-
953
- /* Sanity check */
954
- .if \shift_ist != -1 && \paranoid == 0
955
- .error "using shift_ist requires paranoid=1"
956
- .endif
957
-
958
- ASM_CLAC
959
-
960
- .if \has_error_code == 0
961
- pushq $-1 /* ORIG_RAX: no syscall to restart */
962
- .endif
963
-
964
- .if \paranoid == 1
965
- testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
966
- jnz .Lfrom_usermode_switch_stack_\@
967
- .endif
968
-
969
- .if \create_gap == 1
970
- /*
971
- * If coming from kernel space, create a 6-word gap to allow the
972
- * int3 handler to emulate a call instruction.
973
- */
974
- testb $3, CS-ORIG_RAX(%rsp)
975
- jnz .Lfrom_usermode_no_gap_\@
976
- .rept 6
977
- pushq 5*8(%rsp)
978
- .endr
979
- UNWIND_HINT_IRET_REGS offset=8
980
-.Lfrom_usermode_no_gap_\@:
981
- .endif
982
-
983
- .if \paranoid
984
- call paranoid_entry
985
- .else
986
- call error_entry
987
- .endif
988
- UNWIND_HINT_REGS
989
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
990
-
991
- .if \paranoid
992
- .if \shift_ist != -1
993
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
994
- .else
995
- TRACE_IRQS_OFF
996
- .endif
997
- .endif
998
-
999
- movq %rsp, %rdi /* pt_regs pointer */
1000
-
1001
- .if \has_error_code
1002
- movq ORIG_RAX(%rsp), %rsi /* get error code */
1003
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
1004
- .else
1005
- xorl %esi, %esi /* no error code */
1006
- .endif
1007
-
1008
- .if \shift_ist != -1
1009
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1010
- .endif
1011
-
1012
- call \do_sym
1013
-
1014
- .if \shift_ist != -1
1015
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
1016
- .endif
1017
-
1018
- /* these procedures expect "no swapgs" flag in ebx */
1019
- .if \paranoid
1020
- jmp paranoid_exit
1021
- .else
1022
- jmp error_exit
1023
- .endif
1024
-
1025
- .if \paranoid == 1
1026
- /*
1027
- * Entry from userspace. Switch stacks and treat it
1028
- * as a normal entry. This means that paranoid handlers
1029
- * run in real process context if user_mode(regs).
1030
- */
1031
-.Lfrom_usermode_switch_stack_\@:
1032
- call error_entry
1033
-
1034
- movq %rsp, %rdi /* pt_regs pointer */
1035
-
1036
- .if \has_error_code
1037
- movq ORIG_RAX(%rsp), %rsi /* get error code */
1038
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
1039
- .else
1040
- xorl %esi, %esi /* no error code */
1041
- .endif
1042
-
1043
- call \do_sym
1044
-
1045
- jmp error_exit
1046
- .endif
1047
-_ASM_NOKPROBE(\sym)
1048
-END(\sym)
1049
-.endm
1050
-
1051
-idtentry divide_error do_divide_error has_error_code=0
1052
-idtentry overflow do_overflow has_error_code=0
1053
-idtentry bounds do_bounds has_error_code=0
1054
-idtentry invalid_op do_invalid_op has_error_code=0
1055
-idtentry device_not_available do_device_not_available has_error_code=0
1056
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
1057
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
1058
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
1059
-idtentry segment_not_present do_segment_not_present has_error_code=1
1060
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
1061
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
1062
-idtentry alignment_check do_alignment_check has_error_code=1
1063
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
1064
-
1065
-
1066
- /*
1067
- * Reload gs selector with exception handling
1068
- * edi: new selector
1069
- */
1070
-ENTRY(native_load_gs_index)
740
+SYM_FUNC_START(asm_load_gs_index)
1071741 FRAME_BEGIN
1072
- pushfq
1073
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1074
- TRACE_IRQS_OFF
1075
- SWAPGS
742
+ swapgs
1076743 .Lgs_change:
1077744 movl %edi, %gs
1078745 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
1079
- SWAPGS
1080
- TRACE_IRQS_FLAGS (%rsp)
1081
- popfq
746
+ swapgs
1082747 FRAME_END
1083
- ret
1084
-ENDPROC(native_load_gs_index)
1085
-EXPORT_SYMBOL(native_load_gs_index)
748
+ RET
749
+SYM_FUNC_END(asm_load_gs_index)
750
+EXPORT_SYMBOL(asm_load_gs_index)
1086751
1087
- _ASM_EXTABLE(.Lgs_change, bad_gs)
752
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
1088753 .section .fixup, "ax"
1089754 /* running with kernelgs */
1090
-bad_gs:
1091
- SWAPGS /* switch back to user gs */
755
+SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
756
+ swapgs /* switch back to user gs */
1092757 .macro ZAP_GS
1093758 /* This can't be a string because the preprocessor needs to see it. */
1094759 movl $__USER_DS, %eax
....@@ -1098,24 +763,51 @@
1098763 xorl %eax, %eax
1099764 movl %eax, %gs
1100765 jmp 2b
766
+SYM_CODE_END(.Lbad_gs)
1101767 .previous
1102768
1103
-#ifndef CONFIG_PREEMPT_RT_FULL
1104
-/* Call softirq on interrupt stack. Interrupts are off. */
1105
-ENTRY(do_softirq_own_stack)
1106
- pushq %rbp
1107
- mov %rsp, %rbp
1108
- ENTER_IRQ_STACK regs=0 old_rsp=%r11
1109
- call __do_softirq
1110
- LEAVE_IRQ_STACK regs=0
769
+/*
770
+ * rdi: New stack pointer points to the top word of the stack
771
+ * rsi: Function pointer
772
+ * rdx: Function argument (can be NULL if none)
773
+ */
774
+SYM_FUNC_START(asm_call_on_stack)
775
+SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
776
+SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
777
+ /*
778
+ * Save the frame pointer unconditionally. This allows the ORC
779
+ * unwinder to handle the stack switch.
780
+ */
781
+ pushq %rbp
782
+ mov %rsp, %rbp
783
+
784
+ /*
785
+ * The unwinder relies on the word at the top of the new stack
786
+ * page linking back to the previous RSP.
787
+ */
788
+ mov %rsp, (%rdi)
789
+ mov %rdi, %rsp
790
+ /* Move the argument to the right place */
791
+ mov %rdx, %rdi
792
+
793
+1:
794
+ .pushsection .discard.instr_begin
795
+ .long 1b - .
796
+ .popsection
797
+
798
+ CALL_NOSPEC rsi
799
+
800
+2:
801
+ .pushsection .discard.instr_end
802
+ .long 2b - .
803
+ .popsection
804
+
805
+ /* Restore the previous stack pointer from RBP. */
1111806 leaveq
1112
- ret
1113
-ENDPROC(do_softirq_own_stack)
1114
-#endif
807
+ RET
808
+SYM_FUNC_END(asm_call_on_stack)
1115809
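The load-bearing detail in asm_call_on_stack is the back-link: the old RSP is written to the top word of the new stack before RSP is switched, so that word plus the frame pointer pushed just before let the ORC unwinder walk across the switch. A user-space approximation of just that bookkeeping, using an array as the stand-in stack (it models the linkage only, not a real stack switch):

    #include <stdint.h>
    #include <stdio.h>

    #define STACK_WORDS 64

    int main(void)
    {
        uint64_t new_stack[STACK_WORDS];
        uint64_t *top = &new_stack[STACK_WORDS - 1]; /* rdi: top word of new stack */
        uint64_t old_rsp = 0xffffc90000000ff8ull;    /* pretend this is %rsp       */

        *top = old_rsp;                      /* mov %rsp, (%rdi): unwinder back-link */
        uint64_t rsp = (uint64_t)(uintptr_t)top;     /* mov %rdi, %rsp              */

        /* ... CALL_NOSPEC rsi would run the handler on the new stack here ... */

        printf("new rsp=%#llx back-link=%#llx\n",
               (unsigned long long)rsp,
               (unsigned long long)*(uint64_t *)(uintptr_t)rsp);
        return 0;    /* leaveq / RET would restore the old stack via RBP */
    }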
1116
-#ifdef CONFIG_XEN
1117
-idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
1118
-
810
+#ifdef CONFIG_XEN_PV
1119811 /*
1120812 * A note on the "critical region" in our callback handler.
1121813 * We want to avoid stacking callback handlers due to events occurring
....@@ -1128,8 +820,10 @@
1128820 * So, on entry to the handler we detect whether we interrupted an
1129821 * existing activation in its critical region -- if so, we pop the current
1130822 * activation and restart the handler using the previous one.
823
+ *
824
+ * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
1131825 */
1132
-ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
826
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
1133827
1134828 /*
1135829 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
....@@ -1139,15 +833,10 @@
1139833 movq %rdi, %rsp /* we don't return, adjust the stack frame */
1140834 UNWIND_HINT_REGS
1141835
1142
- ENTER_IRQ_STACK old_rsp=%r10
1143
- call xen_evtchn_do_upcall
1144
- LEAVE_IRQ_STACK
836
+ call xen_pv_evtchn_do_upcall
1145837
1146
-#ifndef CONFIG_PREEMPT
1147
- call xen_maybe_preempt_hcall
1148
-#endif
1149
- jmp error_exit
1150
-END(xen_do_hypervisor_callback)
838
+ jmp error_return
839
+SYM_CODE_END(exc_xen_hypervisor_callback)
1151840
1152841 /*
1153842 * Hypervisor uses this for application faults while it executes.
....@@ -1162,7 +851,7 @@
1162851 * We distinguish between categories by comparing each saved segment register
1163852 * with its current contents: any discrepancy means we are in category 1.
1164853 */
1165
-ENTRY(xen_failsafe_callback)
854
+SYM_CODE_START(xen_failsafe_callback)
1166855 UNWIND_HINT_EMPTY
1167856 movl %ds, %ecx
1168857 cmpw %cx, 0x10(%rsp)
....@@ -1182,7 +871,7 @@
1182871 addq $0x30, %rsp
1183872 pushq $0 /* RIP */
1184873 UNWIND_HINT_IRET_REGS offset=8
1185
- jmp general_protection
874
+ jmp asm_exc_general_protection
1186875 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1187876 movq (%rsp), %rcx
1188877 movq 8(%rsp), %r11
....@@ -1191,64 +880,29 @@
1191880 pushq $-1 /* orig_ax = -1 => not a system call */
1192881 PUSH_AND_CLEAR_REGS
1193882 ENCODE_FRAME_POINTER
1194
- jmp error_exit
1195
-END(xen_failsafe_callback)
1196
-
1197
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
1198
- xen_hvm_callback_vector xen_evtchn_do_upcall
1199
-
1200
-#endif /* CONFIG_XEN */
1201
-
1202
-#if IS_ENABLED(CONFIG_HYPERV)
1203
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
1204
- hyperv_callback_vector hyperv_vector_handler
1205
-
1206
-apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
1207
- hyperv_reenlightenment_vector hyperv_reenlightenment_intr
1208
-
1209
-apicinterrupt3 HYPERV_STIMER0_VECTOR \
1210
- hv_stimer0_callback_vector hv_stimer0_vector_handler
1211
-#endif /* CONFIG_HYPERV */
1212
-
1213
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
1214
-idtentry int3 do_int3 has_error_code=0 create_gap=1
1215
-idtentry stack_segment do_stack_segment has_error_code=1
1216
-
1217
-#ifdef CONFIG_XEN
1218
-idtentry xennmi do_nmi has_error_code=0
1219
-idtentry xendebug do_debug has_error_code=0
1220
-#endif
1221
-
1222
-idtentry general_protection do_general_protection has_error_code=1
1223
-idtentry page_fault do_page_fault has_error_code=1
1224
-
1225
-#ifdef CONFIG_KVM_GUEST
1226
-idtentry async_page_fault do_async_page_fault has_error_code=1
1227
-#endif
1228
-
1229
-#ifdef CONFIG_X86_MCE
1230
-idtentry machine_check do_mce has_error_code=0 paranoid=1
1231
-#endif
883
+ jmp error_return
884
+SYM_CODE_END(xen_failsafe_callback)
885
+#endif /* CONFIG_XEN_PV */
1232886
1233887 /*
1234
- * Save all registers in pt_regs, and switch gs if needed.
1235
- * Use slow, but surefire "are we in kernel?" check.
1236
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
888
+ * Save all registers in pt_regs. Return GSBASE related information
889
+ * in EBX depending on the availability of the FSGSBASE instructions:
890
+ *
891
+ * FSGSBASE R/EBX
892
+ * N 0 -> SWAPGS on exit
893
+ * 1 -> no SWAPGS on exit
894
+ *
895
+ * Y GSBASE value at entry, must be restored in paranoid_exit
896
+ *
897
+ * R14 - old CR3
898
+ * R15 - old SPEC_CTRL
1237899 */
1238
-ENTRY(paranoid_entry)
900
+SYM_CODE_START_LOCAL(paranoid_entry)
1239901 UNWIND_HINT_FUNC
1240902 cld
1241903 PUSH_AND_CLEAR_REGS save_ret=1
1242904 ENCODE_FRAME_POINTER 8
1243
- movl $1, %ebx
1244
- movl $MSR_GS_BASE, %ecx
1245
- rdmsr
1246
- testl %edx, %edx
1247
- js 1f /* negative -> in kernel */
1248
- SWAPGS
1249
- xorl %ebx, %ebx
1250905
1251
-1:
1252906 /*
1253907 * Always stash CR3 in %r14. This value will be restored,
1254908 * verbatim, at exit. Needed if paranoid_entry interrupted
....@@ -1258,18 +912,65 @@
1258912 * This is also why CS (stashed in the "iret frame" by the
1259913 * hardware at entry) can not be used: this may be a return
1260914 * to kernel code, but with a user CR3 value.
915
+ *
916
+ * Switching CR3 does not depend on kernel GSBASE so it can
917
+ * be done before switching to the kernel GSBASE. This is
918
+ * required for FSGSBASE because the kernel GSBASE has to
919
+ * be retrieved from a kernel internal table.
1261920 */
1262921 SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
1263922
1264923 /*
1265
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
1266
- * unconditional CR3 write, even in the PTI case. So do an lfence
1267
- * to prevent GS speculation, regardless of whether PTI is enabled.
924
+ * Handling GSBASE depends on the availability of FSGSBASE.
925
+ *
926
+ * Without FSGSBASE the kernel enforces that negative GSBASE
927
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
928
+ * can be made about the GSBASE value when entering from user
929
+ * space.
1268930 */
1269
- FENCE_SWAPGS_KERNEL_ENTRY
931
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
1270932
1271
- ret
1272
-END(paranoid_entry)
933
+ /*
934
+ * Read the current GSBASE and store it in %rbx unconditionally,
935
+ * retrieve and set the current CPU's kernel GSBASE. The stored value
936
+ * has to be restored in paranoid_exit unconditionally.
937
+ *
938
+ * The unconditional write to GS base below ensures that no subsequent
939
+ * loads based on a mispredicted GS base can happen, therefore no LFENCE
940
+ * is needed here.
941
+ */
942
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
943
+ jmp .Lparanoid_gsbase_done
944
+
945
+.Lparanoid_entry_checkgs:
946
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
947
+ movl $1, %ebx
948
+
949
+ /*
950
+ * The kernel-enforced convention is a negative GSBASE indicates
951
+ * a kernel value. No SWAPGS needed on entry and exit.
952
+ */
953
+ movl $MSR_GS_BASE, %ecx
954
+ rdmsr
955
+ testl %edx, %edx
956
+ js .Lparanoid_kernel_gsbase
957
+
958
+ /* EBX = 0 -> SWAPGS required on exit */
959
+ xorl %ebx, %ebx
960
+ swapgs
961
+.Lparanoid_kernel_gsbase:
962
+ FENCE_SWAPGS_KERNEL_ENTRY
963
+.Lparanoid_gsbase_done:
964
+
965
+ /*
966
+ * Once we have CR3 and %GS set up, save and set SPEC_CTRL. Just like
967
+ * CR3 above, keep the old value in a callee saved register.
968
+ */
969
+ IBRS_ENTER save_reg=%r15
970
+ UNTRAIN_RET
971
+
972
+ RET
973
+SYM_CODE_END(paranoid_entry)
1273974
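The R/EBX contract documented above is easy to get wrong, so here is a compact C model of both halves of it: with FSGSBASE, RBX carries the full saved GSBASE value to be written back unconditionally; without it, EBX is a flag saying whether kernel GSBASE was already active and hence whether SWAPGS is needed on exit. The struct and helper names below are invented for the sketch:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct paranoid_state {
        bool     fsgsbase;  /* X86_FEATURE_FSGSBASE available?                 */
        uint64_t rbx;       /* saved GSBASE (FSGSBASE) or a 0/1 flag otherwise */
    };

    /* paranoid_entry side: record what paranoid_exit must undo. */
    static void model_entry(struct paranoid_state *s, uint64_t cur_gsbase)
    {
        if (s->fsgsbase)
            s->rbx = cur_gsbase;    /* SAVE_AND_SET_GSBASE: restore unconditionally   */
        else if ((int64_t)cur_gsbase < 0)
            s->rbx = 1;             /* kernel GSBASE already active, no SWAPGS needed */
        else
            s->rbx = 0;             /* user GSBASE: swapgs now, swapgs again on exit  */
    }

    /* paranoid_exit side: mirror the entry decision exactly. */
    static const char *model_exit(const struct paranoid_state *s)
    {
        if (s->fsgsbase)
            return "wrgsbase %rbx (unconditional restore)";
        return s->rbx ? "keep kernel GSBASE, no swapgs" : "swapgs back to user GSBASE";
    }

    int main(void)
    {
        struct paranoid_state legacy = { .fsgsbase = false };
        struct paranoid_state fsgs   = { .fsgsbase = true };

        model_entry(&legacy, 0x00007f1234560000ull);    /* user GSBASE at entry */
        model_entry(&fsgs,   0x00007f1234560000ull);

        printf("legacy exit:   %s\n", model_exit(&legacy));
        printf("fsgsbase exit: %s\n", model_exit(&fsgs));
        return 0;
    }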
1274975 /*
1275976 * "Paranoid" exit path from exception stack. This is invoked
....@@ -1278,34 +979,61 @@
1278979 *
1279980 * We may be returning to very strange contexts (e.g. very early
1280981 * in syscall entry), so checking for preemption here would
1281
- * be complicated. Fortunately, we there's no good reason
1282
- * to try to handle preemption here.
982
+ * be complicated. Fortunately, there's no good reason to try
983
+ * to handle preemption here.
1283984 *
1284
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
985
+ * R/EBX contains the GSBASE related information depending on the
986
+ * availability of the FSGSBASE instructions:
987
+ *
988
+ * FSGSBASE R/EBX
989
+ * N 0 -> SWAPGS on exit
990
+ * 1 -> no SWAPGS on exit
991
+ *
992
+ * Y User space GSBASE, must be restored unconditionally
993
+ *
994
+ * R14 - old CR3
995
+ * R15 - old SPEC_CTRL
1285996 */
1286
-ENTRY(paranoid_exit)
997
+SYM_CODE_START_LOCAL(paranoid_exit)
1287998 UNWIND_HINT_REGS
1288
- DISABLE_INTERRUPTS(CLBR_ANY)
1289
- TRACE_IRQS_OFF_DEBUG
1290
- testl %ebx, %ebx /* swapgs needed? */
1291
- jnz .Lparanoid_exit_no_swapgs
1292
- TRACE_IRQS_IRETQ
1293
- /* Always restore stashed CR3 value (see paranoid_entry) */
1294
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
1295
- SWAPGS_UNSAFE_STACK
1296
- jmp .Lparanoid_exit_restore
1297
-.Lparanoid_exit_no_swapgs:
1298
- TRACE_IRQS_IRETQ_DEBUG
1299
- /* Always restore stashed CR3 value (see paranoid_entry) */
1300
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
1301
-.Lparanoid_exit_restore:
1302
- jmp restore_regs_and_return_to_kernel
1303
-END(paranoid_exit)
999
+
1000
+ /*
1001
+ * Must restore IBRS state before both CR3 and %GS since we need access
1002
+ * to the per-CPU x86_spec_ctrl_shadow variable.
1003
+ */
1004
+ IBRS_EXIT save_reg=%r15
1005
+
1006
+ /*
1007
+ * The order of operations is important. RESTORE_CR3 requires
1008
+ * kernel GSBASE.
1009
+ *
1010
+ * NB to anyone trying to optimize this code: this code does
1011
+ * not execute at all for exceptions from user mode. Those
1012
+ * exceptions go through error_exit instead.
1013
+ */
1014
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
1015
+
1016
+ /* Handle the three GSBASE cases */
1017
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
1018
+
1019
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
1020
+ wrgsbase %rbx
1021
+ jmp restore_regs_and_return_to_kernel
1022
+
1023
+.Lparanoid_exit_checkgs:
1024
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
1025
+ testl %ebx, %ebx
1026
+ jnz restore_regs_and_return_to_kernel
1027
+
1028
+ /* We are returning to a context with user GSBASE */
1029
+ swapgs
1030
+ jmp restore_regs_and_return_to_kernel
1031
+SYM_CODE_END(paranoid_exit)
13041032
13051033 /*
13061034 * Save all registers in pt_regs, and switch GS if needed.
13071035 */
1308
-ENTRY(error_entry)
1036
+SYM_CODE_START_LOCAL(error_entry)
13091037 UNWIND_HINT_FUNC
13101038 cld
13111039 PUSH_AND_CLEAR_REGS save_ret=1
....@@ -1321,8 +1049,11 @@
13211049 FENCE_SWAPGS_USER_ENTRY
13221050 /* We have user CR3. Change to kernel CR3. */
13231051 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1052
+ IBRS_ENTER
1053
+ UNTRAIN_RET
13241054
13251055 .Lerror_entry_from_usermode_after_swapgs:
1056
+
13261057 /* Put us onto the real thread stack. */
13271058 popq %r12 /* save return addr in %12 */
13281059 movq %rsp, %rdi /* arg0 = pt_regs pointer */
....@@ -1330,21 +1061,7 @@
13301061 movq %rax, %rsp /* switch stack */
13311062 ENCODE_FRAME_POINTER
13321063 pushq %r12
1333
-
1334
- /*
1335
- * We need to tell lockdep that IRQs are off. We can't do this until
1336
- * we fix gsbase, and we should do it before enter_from_user_mode
1337
- * (which can take locks).
1338
- */
1339
- TRACE_IRQS_OFF
1340
- CALL_enter_from_user_mode
1341
- ret
1342
-
1343
-.Lerror_entry_done_lfence:
1344
- FENCE_SWAPGS_KERNEL_ENTRY
1345
-.Lerror_entry_done:
1346
- TRACE_IRQS_OFF
1347
- ret
1064
+ RET
13481065
13491066 /*
13501067 * There are two places in the kernel that can potentially fault with
....@@ -1368,9 +1085,15 @@
13681085 * .Lgs_change's error handler with kernel gsbase.
13691086 */
13701087 SWAPGS
1371
- FENCE_SWAPGS_USER_ENTRY
1372
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1373
- jmp .Lerror_entry_done
1088
+
1089
+ /*
1090
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
1091
+ * kernel or user gsbase.
1092
+ */
1093
+.Lerror_entry_done_lfence:
1094
+ FENCE_SWAPGS_KERNEL_ENTRY
1095
+ ANNOTATE_UNRET_END
1096
+ RET
13741097
13751098 .Lbstep_iret:
13761099 /* Fix truncated RIP */
....@@ -1385,6 +1108,8 @@
13851108 SWAPGS
13861109 FENCE_SWAPGS_USER_ENTRY
13871110 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1111
+ IBRS_ENTER
1112
+ UNTRAIN_RET
13881113
13891114 /*
13901115 * Pretend that the exception came from user mode: set up pt_regs
....@@ -1394,16 +1119,15 @@
13941119 call fixup_bad_iret
13951120 mov %rax, %rsp
13961121 jmp .Lerror_entry_from_usermode_after_swapgs
1397
-END(error_entry)
1122
+SYM_CODE_END(error_entry)
13981123
1399
-ENTRY(error_exit)
1124
+SYM_CODE_START_LOCAL(error_return)
14001125 UNWIND_HINT_REGS
1401
- DISABLE_INTERRUPTS(CLBR_ANY)
1402
- TRACE_IRQS_OFF
1126
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
14031127 testb $3, CS(%rsp)
1404
- jz retint_kernel
1405
- jmp retint_user
1406
-END(error_exit)
1128
+ jz restore_regs_and_return_to_kernel
1129
+ jmp swapgs_restore_regs_and_return_to_usermode
1130
+SYM_CODE_END(error_return)
14071131
14081132 /*
14091133 * Runs on exception stack. Xen PV does not go through this path at all,
....@@ -1413,7 +1137,7 @@
14131137 * %r14: Used to save/restore the CR3 of the interrupted context
14141138 * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
14151139 */
1416
-ENTRY(nmi)
1140
+SYM_CODE_START(asm_exc_nmi)
14171141 UNWIND_HINT_IRET_REGS
14181142
14191143 /*
....@@ -1490,6 +1214,9 @@
14901214 PUSH_AND_CLEAR_REGS rdx=(%rdx)
14911215 ENCODE_FRAME_POINTER
14921216
1217
+ IBRS_ENTER
1218
+ UNTRAIN_RET
1219
+
14931220 /*
14941221 * At this point we no longer need to worry about stack damage
14951222 * due to nesting -- we're on the normal thread stack and we're
....@@ -1498,7 +1225,7 @@
14981225
14991226 movq %rsp, %rdi
15001227 movq $-1, %rsi
1501
- call do_nmi
1228
+ call exc_nmi
15021229
15031230 /*
15041231 * Return back to user mode. We must *not* do the normal exit
....@@ -1555,7 +1282,7 @@
15551282 * end_repeat_nmi, then we are a nested NMI. We must not
15561283 * modify the "iret" frame because it's being written by
15571284 * the outer NMI. That's okay; the outer NMI handler is
1558
- * about to about to call do_nmi anyway, so we can just
1285
+ * about to call exc_nmi() anyway, so we can just
15591286 * resume the outer NMI.
15601287 */
15611288
....@@ -1674,7 +1401,7 @@
16741401 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
16751402 * we're repeating an NMI, gsbase has the same value that it had on
16761403 * the first iteration. paranoid_entry will load the kernel
1677
- * gsbase if needed before we call do_nmi. "NMI executing"
1404
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
16781405 * is zero.
16791406 */
16801407 movq $1, 10*8(%rsp) /* Set "NMI executing". */
....@@ -1708,18 +1435,37 @@
17081435 call paranoid_entry
17091436 UNWIND_HINT_REGS
17101437
1711
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
17121438 movq %rsp, %rdi
17131439 movq $-1, %rsi
1714
- call do_nmi
1440
+ call exc_nmi
1441
+
1442
+ /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
1443
+ IBRS_EXIT save_reg=%r15
17151444
17161445 /* Always restore stashed CR3 value (see paranoid_entry) */
17171446 RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
17181447
1719
- testl %ebx, %ebx /* swapgs needed? */
1448
+ /*
1449
+ * The above invocation of paranoid_entry stored the GSBASE
1450
+ * related information in R/EBX depending on the availability
1451
+ * of FSGSBASE.
1452
+ *
1453
+ * If FSGSBASE is enabled, restore the saved GSBASE value
1454
+ * unconditionally, otherwise take the conditional SWAPGS path.
1455
+ */
1456
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
1457
+
1458
+ wrgsbase %rbx
1459
+ jmp nmi_restore
1460
+
1461
+nmi_no_fsgsbase:
1462
+ /* EBX == 0 -> invoke SWAPGS */
1463
+ testl %ebx, %ebx
17201464 jnz nmi_restore
1465
+
17211466 nmi_swapgs:
1722
- SWAPGS_UNSAFE_STACK
1467
+ swapgs
1468
+
17231469 nmi_restore:
17241470 POP_REGS
17251471
....@@ -1748,15 +1494,22 @@
17481494 * about espfix64 on the way back to kernel mode.
17491495 */
17501496 iretq
1751
-END(nmi)
1497
+SYM_CODE_END(asm_exc_nmi)
17521498
1753
-ENTRY(ignore_sysret)
1499
+#ifndef CONFIG_IA32_EMULATION
1500
+/*
1501
+ * This handles SYSCALL from 32-bit code. There is no way to program
1502
+ * MSRs to fully disable 32-bit SYSCALL.
1503
+ */
1504
+SYM_CODE_START(ignore_sysret)
17541505 UNWIND_HINT_EMPTY
17551506 mov $-ENOSYS, %eax
1756
- sysret
1757
-END(ignore_sysret)
1507
+ sysretl
1508
+SYM_CODE_END(ignore_sysret)
1509
+#endif
17581510
1759
-ENTRY(rewind_stack_do_exit)
1511
+.pushsection .text, "ax"
1512
+SYM_CODE_START(rewind_stack_do_exit)
17601513 UNWIND_HINT_FUNC
17611514 /* Prevent any naive code from trying to unwind to our caller. */
17621515 xorl %ebp, %ebp
....@@ -1766,4 +1519,5 @@
17661519 UNWIND_HINT_REGS
17671520
17681521 call do_exit
1769
-END(rewind_stack_do_exit)
1522
+SYM_CODE_END(rewind_stack_do_exit)
1523
+.popsection