From d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 11 Dec 2023 02:45:28 +0000 Subject: [PATCH] add boot partition size --- kernel/arch/x86/entry/entry_64.S | 1298 +++++++++++++++++++++++----------------------------------- 1 files changed, 526 insertions(+), 772 deletions(-) diff --git a/kernel/arch/x86/entry/entry_64.S b/kernel/arch/x86/entry/entry_64.S index 94fccaa..559c82b 100644 --- a/kernel/arch/x86/entry/entry_64.S +++ b/kernel/arch/x86/entry/entry_64.S @@ -8,15 +8,14 @@ * * entry.S contains the system-call and fault low-level handling routines. * - * Some of this is documented in Documentation/x86/entry_64.txt + * Some of this is documented in Documentation/x86/entry_64.rst * * A note on terminology: * - iret frame: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. * * Some macro usage: - * - ENTRY/END: Define functions in the symbol table. - * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - SYM_FUNC_START/END:Define functions in the symbol table. * - idtentry: Define exception entry points. */ #include <linux/linkage.h> @@ -37,7 +36,9 @@ #include <asm/pgtable_types.h> #include <asm/export.h> #include <asm/frame.h> +#include <asm/trapnr.h> #include <asm/nospec-branch.h> +#include <asm/fsgsbase.h> #include <linux/err.h> #include "calling.h" @@ -45,64 +46,13 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret64) +#ifdef CONFIG_PARAVIRT_XXL +SYM_CODE_START(native_usergs_sysret64) UNWIND_HINT_EMPTY swapgs sysretq -END(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT */ - -.macro TRACE_IRQS_FLAGS flags:req -#ifdef CONFIG_TRACE_IRQFLAGS - btl $9, \flags /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -.macro TRACE_IRQS_IRETQ - TRACE_IRQS_FLAGS EFLAGS(%rsp) -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. - */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - btl $9, EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif +SYM_CODE_END(native_usergs_sysret64) +#endif /* CONFIG_PARAVIRT_XXL */ /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. @@ -142,102 +92,37 @@ * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" +SYM_CODE_START(entry_SYSCALL_64) + UNWIND_HINT_ENTRY -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ - SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - -ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY swapgs - - /* Stash the user RSP. */ - movq %rsp, RSP_SCRATCH - - /* Note: using %rsp as a scratch reg. */ + /* tss.sp2 is scratch space. */ + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - - /* Load the top of the task stack into RSP */ - movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp - - /* Start building the simulated IRET frame. */ - pushq $__USER_DS /* pt_regs->ss */ - pushq RSP_SCRATCH /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - - /* - * x86 lacks a near absolute jump, and we can't jump to the real - * entry text with a relative jump. We could push the target - * address and then use retq, but this destroys the pipeline on - * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, - * spill RDI and restore it in a second-stage trampoline. - */ - pushq %rdi - movq $entry_SYSCALL_64_stage2, %rdi - JMP_NOSPEC %rdi -END(entry_SYSCALL_64_trampoline) - - .popsection - -ENTRY(entry_SYSCALL_64_stage2) - UNWIND_HINT_EMPTY - popq %rdi - jmp entry_SYSCALL_64_after_hwframe -END(entry_SYSCALL_64_stage2) - -ENTRY(entry_SYSCALL_64) - UNWIND_HINT_EMPTY - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - - swapgs - /* - * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it - * is not required to switch CR3. - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ -GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ +SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS - - TRACE_IRQS_OFF /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi - call do_syscall_64 /* returns with IRQs disabled */ - TRACE_IRQS_IRETQ /* we're about to change IF */ + /* clobbers %rax, make sure it is after saving the syscall nr */ + IBRS_ENTER + UNTRAIN_RET + + call do_syscall_64 /* returns with IRQs disabled */ /* * Try to use SYSRET instead of IRET if we're returning to @@ -311,8 +196,8 @@ * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - POP_REGS pop_rdi=0 skip_r11rcx=1 + IBRS_EXIT + POP_REGS pop_rdi=0 /* * Now all regs are restored except RSP and RDI. @@ -329,19 +214,21 @@ * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ + STACKLEAK_ERASE_NOCLOBBER + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi popq %rsp USERGS_SYSRET64 -END(entry_SYSCALL_64) +SYM_CODE_END(entry_SYSCALL_64) /* * %rdi: prev task * %rsi: next task */ -ENTRY(__switch_to_asm) - UNWIND_HINT_FUNC +.pushsection .text, "ax" +SYM_FUNC_START(__switch_to_asm) /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -352,7 +239,6 @@ pushq %r13 pushq %r14 pushq %r15 - pushfq /* switch stack */ movq %rsp, TASK_threadsp(%rdi) @@ -360,10 +246,9 @@ #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif -#ifdef CONFIG_RETPOLINE /* * When switching from a shallower to a deeper call stack * the RSB may either underflow or use entries populated @@ -372,10 +257,8 @@ * speculative execution to prevent attack. */ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW -#endif /* restore callee-saved registers */ - popfq popq %r15 popq %r14 popq %r13 @@ -384,7 +267,8 @@ popq %rbp jmp __switch_to -END(__switch_to_asm) +SYM_FUNC_END(__switch_to_asm) +.popsection /* * A newly forked process directly context switches into this address. @@ -393,7 +277,8 @@ * rbx: kernel thread func (NULL for user thread) * r12: kernel thread arg */ -ENTRY(ret_from_fork) +.pushsection .text, "ax" +SYM_CODE_START(ret_from_fork) UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -404,51 +289,23 @@ 2: UNWIND_HINT_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ + call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movq $0, RAX(%rsp) jmp 2b -END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_interrupt - .align 8 - vector=vector+1 - .endr -END(irq_entries_start) - - .align 8 -ENTRY(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_spurious - .align 8 - vector=vector+1 - .endr -END(spurious_entries_start) +SYM_CODE_END(ret_from_fork) +.popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY @@ -462,229 +319,260 @@ #endif .endm -/* - * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers - * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. - * Requires kernel GSBASE. - * - * The invariant is that, if irq_count != -1, then the IRQ stack is in use. +/** + * idtentry_body - Macro to emit code calling the C function + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack */ -.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 - DEBUG_ENTRY_ASSERT_IRQS_OFF +.macro idtentry_body cfunc has_error_code:req - .if \save_ret - /* - * If save_ret is set, the original stack contains one additional - * entry -- the return address. Therefore, move the address one - * entry below %rsp to \old_rsp. - */ - leaq 8(%rsp), \old_rsp - .else - movq %rsp, \old_rsp + call error_entry + UNWIND_HINT_REGS + + movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ + + .if \has_error_code == 1 + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - .if \regs - UNWIND_HINT_REGS base=\old_rsp + call \cfunc + + jmp error_return +.endm + +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + * + * The macro emits code to set up the kernel context for straight forward + * and simple IDT entries. No IST stack, no paranoid entry checks. + */ +.macro idtentry vector asmsym cfunc has_error_code:req +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + ASM_CLAC + + .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif - incl PER_CPU_VAR(irq_count) - jnz .Lirq_stack_push_old_rsp_\@ + .if \vector == X86_TRAP_BP + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: + .endif + + idtentry_body \cfunc \has_error_code + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm + +/* + * Interrupt entry/exit. + * + + The interrupt stubs push (vector) onto the stack, which is the error_code + * position of idtentry exceptions, and jump to one of the two idtentry points + * (common/spurious). + * + * common_interrupt is a hotpath, align it to a cache line + */ +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT + idtentry \vector asm_\cfunc \cfunc has_error_code=1 +.endm + +/* + * System vectors which invoke their handlers directly and are not + * going through the regular common device interrupt handling code. + */ +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + +/** + * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #MC and #DB + * + * If the entry comes from user space it uses the normal entry path + * including the return to user space work and preemption checks on + * exit. + * + * If hits in kernel mode then it needs to go through the paranoid + * entry as the exception can hit any random state. No preemption + * check on exit to keep the paranoid path simple. + */ +.macro idtentry_mce_db vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS + ASM_CLAC + + pushq $-1 /* ORIG_RAX: no syscall to restart */ /* - * Right now, if we just incremented irq_count to zero, we've - * claimed the IRQ stack but we haven't switched to it yet. - * - * If anything is added that can interrupt us here without using IST, - * it must be *extremely* careful to limit its stack usage. This - * could include kprobes and a hypothetical future IST-less #DB - * handler. - * - * The OOPS unwinder relies on the word at the top of the IRQ - * stack linking back to the previous RSP for the entire time we're - * on the IRQ stack. For this to work reliably, we need to write - * it before we actually move ourselves to the IRQ stack. + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ + call paranoid_entry -#ifdef CONFIG_DEBUG_ENTRY + UNWIND_HINT_REGS + + movq %rsp, %rdi /* pt_regs pointer */ + + call \cfunc + + jmp paranoid_exit + + /* Switch to the regular task stack and use the noist entry point */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body noist_\cfunc, has_error_code=0 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/** + * idtentry_vc - Macro to generate entry stub for #VC + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #VC. The #VC handler + * runs on an IST stack and needs to be able to cause nested #VC exceptions. + * + * To make this work the #VC entry code tries its best to pretend it doesn't use + * an IST stack by switching to the task stack if coming from user-space (which + * includes early SYSCALL entry path) or back to the stack in the IRET frame if + * entered from kernel-mode. + * + * If entered from kernel-mode the return stack is validated first, and if it is + * not safe to use (e.g. because it points to the entry stack) the #VC handler + * will switch to a fall-back stack (VC2) and call a special handler function. + * + * The macro is only used for one vector, but it is planned to be extended in + * the future for the #HV exception. + */ +.macro idtentry_vc vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS + ASM_CLAC + /* - * If the first movq above becomes wrong due to IRQ stack layout - * changes, the only way we'll notice is if we try to unwind right - * here. Assert that we set up the stack right to catch this type - * of bug quickly. + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. */ - cmpq -8(%rsp), \old_rsp - je .Lirq_stack_okay\@ - ud2 - .Lirq_stack_okay\@: + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. + * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + + UNWIND_HINT_REGS + + /* + * Switch off the IST stack to make it free for nested exceptions. The + * vc_switch_off_ist() function will switch back to the interrupted + * stack if it is safe to do so. If not it switches to the VC fall-back + * stack. + */ + movq %rsp, %rdi /* pt_regs pointer */ + call vc_switch_off_ist + movq %rax, %rsp /* Switch to new stack */ + + ENCODE_FRAME_POINTER + UNWIND_HINT_REGS + + /* Update pt_regs */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + + movq %rsp, %rdi /* pt_regs pointer */ + + call kernel_\cfunc + + /* + * No need to switch back to the IST stack. The current stack is either + * identical to the stack in the IRET frame or the VC fall-back stack, + * so it is definitly mapped even with PTI enabled. + */ + jmp paranoid_exit + + /* Switch to the regular task stack */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body user_\cfunc, has_error_code=1 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm #endif -.Lirq_stack_push_old_rsp_\@: - pushq \old_rsp - - .if \regs - UNWIND_HINT_REGS indirect=1 - .endif - - .if \save_ret - /* - * Push the return address to the stack. This return address can - * be found at the "real" original RSP, which was offset by 8 at - * the beginning of this macro. - */ - pushq -8(\old_rsp) - .endif -.endm - /* - * Undoes ENTER_IRQ_STACK. + * Double fault entry. Straight paranoid. No checks from which context + * this comes because for the espfix induced #DF this would do the wrong + * thing. */ -.macro LEAVE_IRQ_STACK regs=1 - DEBUG_ENTRY_ASSERT_IRQS_OFF - /* We need to be off the IRQ stack before decrementing irq_count. */ - popq %rsp - - .if \regs - UNWIND_HINT_REGS - .endif - - /* - * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming - * the irq stack but we're not on it. - */ - - decl PER_CPU_VAR(irq_count) -.endm - -/* - * Interrupt entry helper function. - * - * Entry runs with interrupts off. Stack layout at entry: - * +----------------------------------------------------+ - * | regs->ss | - * | regs->rsp | - * | regs->eflags | - * | regs->cs | - * | regs->ip | - * +----------------------------------------------------+ - * | regs->orig_ax = ~(interrupt number) | - * +----------------------------------------------------+ - * | return address | - * +----------------------------------------------------+ - */ -ENTRY(interrupt_entry) - UNWIND_HINT_IRET_REGS offset=16 +.macro idtentry_df vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=8 ASM_CLAC - cld - testb $3, CS-ORIG_RAX+8(%rsp) - jz 1f - SWAPGS - FENCE_SWAPGS_USER_ENTRY - /* - * Switch to the thread stack. The IRET frame and orig_ax are - * on the stack, as well as the return address. RDI..R12 are - * not (yet) on the stack and space has not (yet) been - * allocated for them. - */ - pushq %rdi + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ + call paranoid_entry + UNWIND_HINT_REGS - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - movq %rsp, %rdi - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + movq %rsp, %rdi /* pt_regs pointer into first argument */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + call \cfunc - /* - * We have RDI, return address, and orig_ax on the stack on - * top of the IRET frame. That means offset=24 - */ - UNWIND_HINT_IRET_REGS base=%rdi offset=24 + jmp paranoid_exit - pushq 7*8(%rdi) /* regs->ss */ - pushq 6*8(%rdi) /* regs->rsp */ - pushq 5*8(%rdi) /* regs->eflags */ - pushq 4*8(%rdi) /* regs->cs */ - pushq 3*8(%rdi) /* regs->ip */ - UNWIND_HINT_IRET_REGS - pushq 2*8(%rdi) /* regs->orig_ax */ - pushq 8(%rdi) /* return address */ - - movq (%rdi), %rdi - jmp 2f -1: - FENCE_SWAPGS_KERNEL_ENTRY -2: - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 - - testb $3, CS+8(%rsp) - jz 1f - - /* - * IRQ from user mode. - * - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF is idempotent, - * the simplest way to handle it is to just call it twice if - * we enter from user mode. There's no reason to optimize this since - * TRACE_IRQS_OFF is a no-op if lockdep is off. - */ - TRACE_IRQS_OFF - - CALL_enter_from_user_mode - -1: - ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - - ret -END(interrupt_entry) -_ASM_NOKPROBE(interrupt_entry) - - -/* Interrupt entry/exit. */ +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_spurious/interrupt. + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. */ -common_spurious: - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call smp_spurious_interrupt /* rdi points to pt_regs */ - jmp ret_from_intr -END(common_spurious) -_ASM_NOKPROBE(common_spurious) + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: -/* common_interrupt is a hotpath. Align it */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call do_IRQ /* rdi points to pt_regs */ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF +#include <asm/idtentry.h> - LEAVE_IRQ_STACK + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: - testb $3, CS(%rsp) - jz retint_kernel - - /* Interrupt came from user space */ -GLOBAL(retint_user) - mov %rsp,%rdi - call prepare_exit_to_usermode - TRACE_IRQS_IRETQ - -GLOBAL(swapgs_restore_regs_and_return_to_usermode) +SYM_CODE_START_LOCAL(common_interrupt_return) +SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) + IBRS_EXIT #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. */ testb $3, CS(%rsp) @@ -692,6 +580,10 @@ ud2 1: #endif +#ifdef CONFIG_XEN_PV + ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV +#endif + POP_REGS pop_rdi=0 /* @@ -716,6 +608,7 @@ * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ + STACKLEAK_ERASE_NOCLOBBER SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi @@ -725,41 +618,7 @@ INTERRUPT_RETURN -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPT - /* Interrupts are off */ - /* Check if we need preemption */ - btl $9, EFLAGS(%rsp) /* were interrupts off? */ - jnc 1f -0: cmpl $0, PER_CPU_VAR(__preempt_count) -#ifndef CONFIG_PREEMPT_LAZY - jnz 1f -#else - jz do_preempt_schedule_irq - - # atleast preempt count == 0 ? - cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) - jnz 1f - - movq PER_CPU_VAR(current_task), %rcx - cmpl $0, TASK_TI_preempt_lazy_count(%rcx) - jnz 1f - - btl $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx) - jnc 1f -do_preempt_schedule_irq: -#endif - call preempt_schedule_irq - jmp 0b -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - -GLOBAL(restore_regs_and_return_to_kernel) +SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ testb $3, CS(%rsp) @@ -775,7 +634,7 @@ */ INTERRUPT_RETURN -ENTRY(native_iret) +SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL) UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in @@ -786,12 +645,11 @@ jnz native_irq_return_ldt #endif -.global native_irq_return_iret -native_irq_return_iret: +SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) /* * This may fault. Non-paranoid faults on return to userspace are * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. + * Double-faults due to espfix64 are handled in exc_double_fault. * Other faults here are fatal. */ iretq @@ -820,8 +678,9 @@ */ pushq %rdi /* Stash user RDI */ - SWAPGS /* to kernel GS */ + swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + UNTRAIN_RET movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ @@ -850,7 +709,7 @@ orq PER_CPU_VAR(espfix_stack), %rax SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - SWAPGS /* to user GS */ + swapgs /* to user GS */ popq %rdi /* Restore user RDI */ movq %rax, %rsp @@ -869,226 +728,32 @@ */ jmp native_irq_return_iret #endif -END(common_interrupt) -_ASM_NOKPROBE(common_interrupt) +SYM_CODE_END(common_interrupt_return) +_ASM_NOKPROBE(common_interrupt_return) /* - * APIC interrupts. + * Reload gs selector with exception handling + * edi: new selector + * + * Is in entry.text as it shouldn't be instrumented. */ -.macro apicinterrupt3 num sym do_sym -ENTRY(\sym) - UNWIND_HINT_IRET_REGS - pushq $~(\num) -.Lcommon_\sym: - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call \do_sym /* rdi points to pt_regs */ - jmp ret_from_intr -END(\sym) -_ASM_NOKPROBE(\sym) -.endm - -/* Make sure APIC interrupt handlers end up in the irqentry section: */ -#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -#define POP_SECTION_IRQENTRY .popsection - -.macro apicinterrupt num sym do_sym -PUSH_SECTION_IRQENTRY -apicinterrupt3 \num \sym \do_sym -POP_SECTION_IRQENTRY -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt -#endif - -apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi -apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt -#endif - -/* - * Exception entry points. - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) - -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0 -ENTRY(\sym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 - - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 - .error "using shift_ist requires paranoid=1" - .endif - - ASM_CLAC - - .if \has_error_code == 0 - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - .if \paranoid == 1 - testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ - jnz .Lfrom_usermode_switch_stack_\@ - .endif - - .if \create_gap == 1 - /* - * If coming from kernel space, create a 6-word gap to allow the - * int3 handler to emulate a call instruction. - */ - testb $3, CS-ORIG_RAX(%rsp) - jnz .Lfrom_usermode_no_gap_\@ - .rept 6 - pushq 5*8(%rsp) - .endr - UNWIND_HINT_IRET_REGS offset=8 -.Lfrom_usermode_no_gap_\@: - .endif - - .if \paranoid - call paranoid_entry - .else - call error_entry - .endif - UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif - - .if \paranoid == 1 - /* - * Entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). - */ -.Lfrom_usermode_switch_stack_\@: - call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - call \do_sym - - jmp error_exit - .endif -_ASM_NOKPROBE(\sym) -END(\sym) -.endm - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* - * Reload gs selector with exception handling - * edi: new selector - */ -ENTRY(native_load_gs_index) +SYM_FUNC_START(asm_load_gs_index) FRAME_BEGIN - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - TRACE_IRQS_OFF - SWAPGS + swapgs .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE - SWAPGS - TRACE_IRQS_FLAGS (%rsp) - popfq + swapgs FRAME_END - ret -ENDPROC(native_load_gs_index) -EXPORT_SYMBOL(native_load_gs_index) + RET +SYM_FUNC_END(asm_load_gs_index) +EXPORT_SYMBOL(asm_load_gs_index) - _ASM_EXTABLE(.Lgs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ -bad_gs: - SWAPGS /* switch back to user gs */ +SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) + swapgs /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ movl $__USER_DS, %eax @@ -1098,24 +763,51 @@ xorl %eax, %eax movl %eax, %gs jmp 2b +SYM_CODE_END(.Lbad_gs) .previous -#ifndef CONFIG_PREEMPT_RT_FULL -/* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp - ENTER_IRQ_STACK regs=0 old_rsp=%r11 - call __do_softirq - LEAVE_IRQ_STACK regs=0 +/* + * rdi: New stack pointer points to the top word of the stack + * rsi: Function pointer + * rdx: Function argument (can be NULL if none) + */ +SYM_FUNC_START(asm_call_on_stack) +SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) +SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) + /* + * Save the frame pointer unconditionally. This allows the ORC + * unwinder to handle the stack switch. + */ + pushq %rbp + mov %rsp, %rbp + + /* + * The unwinder relies on the word at the top of the new stack + * page linking back to the previous RSP. + */ + mov %rsp, (%rdi) + mov %rdi, %rsp + /* Move the argument to the right place */ + mov %rdx, %rdi + +1: + .pushsection .discard.instr_begin + .long 1b - . + .popsection + + CALL_NOSPEC rsi + +2: + .pushsection .discard.instr_end + .long 2b - . + .popsection + + /* Restore the previous stack pointer from RBP. */ leaveq - ret -ENDPROC(do_softirq_own_stack) -#endif + RET +SYM_FUNC_END(asm_call_on_stack) -#ifdef CONFIG_XEN -idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - +#ifdef CONFIG_XEN_PV /* * A note on the "critical region" in our callback handler. * We want to avoid stacking callback handlers due to events occurring @@ -1128,8 +820,10 @@ * So, on entry to the handler we detect whether we interrupted an * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. + * + * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ +SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -1139,15 +833,10 @@ movq %rdi, %rsp /* we don't return, adjust the stack frame */ UNWIND_HINT_REGS - ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall - LEAVE_IRQ_STACK + call xen_pv_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp error_exit -END(xen_do_hypervisor_callback) + jmp error_return +SYM_CODE_END(exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1162,7 +851,7 @@ * We distinguish between categories by comparing each saved segment register * with its current contents: any discrepancy means we in category 1. */ -ENTRY(xen_failsafe_callback) +SYM_CODE_START(xen_failsafe_callback) UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) @@ -1182,7 +871,7 @@ addq $0x30, %rsp pushq $0 /* RIP */ UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection + jmp asm_exc_general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 @@ -1191,64 +880,29 @@ pushq $-1 /* orig_ax = -1 => not a system call */ PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER - jmp error_exit -END(xen_failsafe_callback) - -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler - -apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ - hyperv_reenlightenment_vector hyperv_reenlightenment_intr - -apicinterrupt3 HYPERV_STIMER0_VECTOR \ - hv_stimer0_callback_vector hv_stimer0_vector_handler -#endif /* CONFIG_HYPERV */ - -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 create_gap=1 -idtentry stack_segment do_stack_segment has_error_code=1 - -#ifdef CONFIG_XEN -idtentry xennmi do_nmi has_error_code=0 -idtentry xendebug do_debug has_error_code=0 -#endif - -idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 - -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 -#endif - -#ifdef CONFIG_X86_MCE -idtentry machine_check do_mce has_error_code=0 paranoid=1 -#endif + jmp error_return +SYM_CODE_END(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit + * + * R14 - old CR3 + * R15 - old SPEC_CTRL */ -ENTRY(paranoid_entry) +SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1258,18 +912,65 @@ * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 /* - * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an - * unconditional CR3 write, even in the PTI case. So do an lfence - * to prevent GS speculation, regardless of whether PTI is enabled. + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. */ - FENCE_SWAPGS_KERNEL_ENTRY + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE - ret -END(paranoid_entry) + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The unconditional write to GS base below ensures that no subsequent + * loads based on a mispredicted GS base can happen, therefore no LFENCE + * is needed here. + */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + jmp .Lparanoid_gsbase_done + +.Lparanoid_entry_checkgs: + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js .Lparanoid_kernel_gsbase + + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx + swapgs +.Lparanoid_kernel_gsbase: + FENCE_SWAPGS_KERNEL_ENTRY +.Lparanoid_gsbase_done: + + /* + * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like + * CR3 above, keep the old value in a callee saved register. + */ + IBRS_ENTER save_reg=%r15 + UNTRAIN_RET + + RET +SYM_CODE_END(paranoid_entry) /* * "Paranoid" exit path from exception stack. This is invoked @@ -1278,34 +979,61 @@ * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y User space GSBASE, must be restored unconditionally + * + * R14 - old CR3 + * R15 - old SPEC_CTRL */ -ENTRY(paranoid_exit) +SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - SWAPGS_UNSAFE_STACK - jmp .Lparanoid_exit_restore -.Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 -.Lparanoid_exit_restore: - jmp restore_regs_and_return_to_kernel -END(paranoid_exit) + + /* + * Must restore IBRS state before both CR3 and %GS since we need access + * to the per-CPU x86_spec_ctrl_shadow variable. + */ + IBRS_EXIT save_reg=%r15 + + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. + */ + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ + swapgs + jmp restore_regs_and_return_to_kernel +SYM_CODE_END(paranoid_exit) /* * Save all registers in pt_regs, and switch GS if needed. */ -ENTRY(error_entry) +SYM_CODE_START_LOCAL(error_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 @@ -1321,8 +1049,11 @@ FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER + UNTRAIN_RET .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ popq %r12 /* save return addr in %12 */ movq %rsp, %rdi /* arg0 = pt_regs pointer */ @@ -1330,21 +1061,7 @@ movq %rax, %rsp /* switch stack */ ENCODE_FRAME_POINTER pushq %r12 - - /* - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). - */ - TRACE_IRQS_OFF - CALL_enter_from_user_mode - ret - -.Lerror_entry_done_lfence: - FENCE_SWAPGS_KERNEL_ENTRY -.Lerror_entry_done: - TRACE_IRQS_OFF - ret + RET /* * There are two places in the kernel that can potentially fault with @@ -1368,9 +1085,15 @@ * .Lgs_change's error handler with kernel gsbase. */ SWAPGS - FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - jmp .Lerror_entry_done + + /* + * Issue an LFENCE to prevent GS speculation, regardless of whether it is a + * kernel or user gsbase. + */ +.Lerror_entry_done_lfence: + FENCE_SWAPGS_KERNEL_ENTRY + ANNOTATE_UNRET_END + RET .Lbstep_iret: /* Fix truncated RIP */ @@ -1385,6 +1108,8 @@ SWAPGS FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + IBRS_ENTER + UNTRAIN_RET /* * Pretend that the exception came from user mode: set up pt_regs @@ -1394,16 +1119,15 @@ call fixup_bad_iret mov %rax, %rsp jmp .Lerror_entry_from_usermode_after_swapgs -END(error_entry) +SYM_CODE_END(error_entry) -ENTRY(error_exit) +SYM_CODE_START_LOCAL(error_return) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF + DEBUG_ENTRY_ASSERT_IRQS_OFF testb $3, CS(%rsp) - jz retint_kernel - jmp retint_user -END(error_exit) + jz restore_regs_and_return_to_kernel + jmp swapgs_restore_regs_and_return_to_usermode +SYM_CODE_END(error_return) /* * Runs on exception stack. Xen PV does not go through this path at all, @@ -1413,7 +1137,7 @@ * %r14: Used to save/restore the CR3 of the interrupted context * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ -ENTRY(nmi) +SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_REGS /* @@ -1490,6 +1214,9 @@ PUSH_AND_CLEAR_REGS rdx=(%rdx) ENCODE_FRAME_POINTER + IBRS_ENTER + UNTRAIN_RET + /* * At this point we no longer need to worry about stack damage * due to nesting -- we're on the normal thread stack and we're @@ -1498,7 +1225,7 @@ movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* * Return back to user mode. We must *not* do the normal exit @@ -1555,7 +1282,7 @@ * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call do_nmi anyway, so we can just + * about to about to call exc_nmi() anyway, so we can just * resume the outer NMI. */ @@ -1674,7 +1401,7 @@ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. "NMI executing" + * gsbase if needed before we call exc_nmi(). "NMI executing" * is zero. */ movq $1, 10*8(%rsp) /* Set "NMI executing". */ @@ -1708,18 +1435,37 @@ call paranoid_entry UNWIND_HINT_REGS - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi + + /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ + IBRS_EXIT save_reg=%r15 /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? */ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. + */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: - SWAPGS_UNSAFE_STACK + swapgs + nmi_restore: POP_REGS @@ -1748,15 +1494,22 @@ * about espfix64 on the way back to kernel mode. */ iretq -END(nmi) +SYM_CODE_END(asm_exc_nmi) -ENTRY(ignore_sysret) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. + */ +SYM_CODE_START(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax - sysret -END(ignore_sysret) + sysretl +SYM_CODE_END(ignore_sysret) +#endif -ENTRY(rewind_stack_do_exit) +.pushsection .text, "ax" +SYM_CODE_START(rewind_stack_do_exit) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1766,4 +1519,5 @@ UNWIND_HINT_REGS call do_exit -END(rewind_stack_do_exit) +SYM_CODE_END(rewind_stack_do_exit) +.popsection -- Gitblit v1.6.2