From 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 07:44:59 +0000 Subject: [PATCH] gmac get mac form eeprom --- kernel/arch/x86/kernel/alternative.c | 1207 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 files changed, 1,034 insertions(+), 173 deletions(-) diff --git a/kernel/arch/x86/kernel/alternative.c b/kernel/arch/x86/kernel/alternative.c index 918a237..92f0a97 100644 --- a/kernel/arch/x86/kernel/alternative.c +++ b/kernel/arch/x86/kernel/alternative.c @@ -1,26 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> +#include <linux/highmem.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/stop_machine.h> #include <linux/slab.h> #include <linux/kdebug.h> +#include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/bsearch.h> +#include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> -#include <asm/pgtable.h> #include <asm/mce.h> #include <asm/nmi.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -49,7 +57,7 @@ #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(buf, len, fmt, args...) \ @@ -60,7 +68,7 @@ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ @@ -222,13 +230,17 @@ } break; + case X86_VENDOR_HYGON: + ideal_nops = p6_nops; + return; + case X86_VENDOR_AMD: if (boot_cpu_data.x86 > 0xf) { ideal_nops = p6_nops; return; } - /* fall through */ + fallthrough; default: #ifdef CONFIG_X86_64 @@ -257,9 +269,11 @@ } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void *text_poke_early(void *addr, const void *opcode, size_t len); +void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -270,7 +284,7 @@ } static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) { u8 *next_rip, *tgt_rip; s32 n_dspl, o_dspl; @@ -279,7 +293,7 @@ if (a->replacementlen != 5) return; - o_dspl = *(s32 *)(insnbuf + 1); + o_dspl = *(s32 *)(insn_buff + 1); /* next_rip of the replacement JMP */ next_rip = repl_insn + a->replacementlen; @@ -305,9 +319,9 @@ two_byte_jmp: n_dspl -= 2; - insnbuf[0] = 0xeb; - insnbuf[1] = (s8)n_dspl; - add_nops(insnbuf + 2, 3); + insn_buff[0] = 0xeb; + insn_buff[1] = (s8)n_dspl; + add_nops(insn_buff + 2, 3); repl_len = 2; goto done; @@ -315,8 +329,8 @@ five_byte_jmp: n_dspl -= 5; - insnbuf[0] = 0xe9; - *(s32 *)&insnbuf[1] = n_dspl; + insn_buff[0] = 0xe9; + *(s32 *)&insn_buff[1] = n_dspl; repl_len = 5; @@ -327,25 +341,69 @@ } /* + * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * + * @instr: instruction byte stream + * @instrlen: length of the above + * @off: offset within @instr where the first NOP has been detected + * + * Return: number of NOPs found (and replaced). + */ +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) +{ + unsigned long flags; + int i = off, nnops; + + while (i < instrlen) { + if (instr[i] != 0x90) + break; + + i++; + } + + nnops = i - off; + + if (nnops <= 1) + return nnops; + + local_irq_save(flags); + add_nops(instr + off, nnops); + local_irq_restore(flags); + + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + + return nnops; +} + +/* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { - unsigned long flags; - int i; + struct insn insn; + int i = 0; - for (i = 0; i < a->padlen; i++) { - if (instr[i] != 0x90) + /* + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger + * ones. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + /* + * See if this and any potentially following NOPs can be + * optimized. + */ + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + i += optimize_nops_range(instr, len, i); + else + i += insn.length; + + if (i >= len) return; } - - local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); - local_irq_restore(flags); - - DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); } /* @@ -363,7 +421,7 @@ { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; + u8 insn_buff[MAX_PATCH_LEN]; DPRINTK("alt table %px, -> %px", start, end); /* @@ -376,30 +434,36 @@ * order. */ for (a = start; a < end; a++) { - int insnbuf_sz = 0; + int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->instrlen > sizeof(insnbuf)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { - if (a->padlen > 1) - optimize_nops(a, instr); + BUG_ON(a->instrlen > sizeof(insn_buff)); + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); - continue; - } + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, - instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", + feature >> 5, + feature & 0x1f, + instr, instr, a->instrlen, + replacement, a->replacementlen); DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; + memcpy(insn_buff, replacement, a->replacementlen); + insn_buff_sz = a->replacementlen; /* * 0xe8 is a relative jump; fix the offset. @@ -407,26 +471,271 @@ * Instruction length is checked before the opcode to avoid * accessing uninitialized bytes for zero-length replacements. */ - if (a->replacementlen == 5 && *insnbuf == 0xe8) { - *(s32 *)(insnbuf + 1) += replacement - instr; + if (a->replacementlen == 5 && *insn_buff == 0xe8) { + *(s32 *)(insn_buff + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insnbuf + 1), - (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + *(s32 *)(insn_buff + 1), + (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); } if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insnbuf); + recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; - text_poke_early(instr, insnbuf, insnbuf_sz); + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + + text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(instr, a->instrlen); } } + +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = 0x90; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#ifdef CONFIG_RETHUNK +/* + * Rewrite the compiler generated return thunk tail-calls. + * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return -1; + + bytes[i++] = RET_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -586,45 +895,125 @@ struct paravirt_patch_site *end) { struct paravirt_patch_site *p; - char insnbuf[MAX_PATCH_LEN]; + char insn_buff[MAX_PATCH_LEN]; for (p = start; p < end; p++) { unsigned int used; BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ - memcpy(insnbuf, p->instr, p->len); - used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, - (unsigned long)p->instr, p->len); + memcpy(insn_buff, p->instr, p->len); + used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); /* Pad the rest with nops */ - add_nops(insnbuf + used, p->len - used); - text_poke_early(p->instr, insnbuf, p->len); + add_nops(insn_buff + used, p->len - used); + text_poke_early(p->instr, insn_buff, p->len); } } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ +static void __init __no_sanitize_address notrace int3_magic(unsigned int *ptr) +{ + *ptr = 1; +} + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,\"aw\"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : ASM_CALL_CONSTRAINT + : __ASM_SEL_RAW(a, D) (&val) + : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + void __init alternative_instructions(void) { - /* The patching is not fully atomic, so try to avoid local interruptions - that might execute the to be patched code. - Other CPUs are not running. */ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ stop_nmi(); /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. + * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. */ + + /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -637,10 +1026,11 @@ _text, _etext); } - if (!uniproc_patched || num_possible_cpus() == 1) + if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); + } #endif apply_paravirt(__parainstructions, __parainstructions_end); @@ -658,11 +1048,11 @@ * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these - * instructions. And on the local CPU you need to be protected again NMI or MCE - * handlers seeing an inconsistent instruction while you patch. + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. */ -void *__init_or_module text_poke_early(void *addr, const void *opcode, - size_t len) +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; @@ -685,6 +1075,176 @@ * that causes hangs on some VIA CPUs. */ } +} + +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate.is_lazy)) + leave_mm(smp_processor_id()); + + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + local_irq_save(flags); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + local_irq_restore(flags); + pte_unmap_unlock(ptep, ptl); return addr; } @@ -698,48 +1258,36 @@ * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * trough a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; - char *vaddr; - struct page *pages[2]; - int i; - - /* - * While boot memory allocator is runnig we cannot use struct - * pages as they are not yet initialized. - */ - BUG_ON(!after_bootmem); - lockdep_assert_held(&text_mutex); - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - BUG_ON(!pages[0]); - local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); - if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); - if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); - local_flush_tlb(); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ - for (i = 0; i < len; i++) - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); - local_irq_restore(flags); - return addr; + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); } static void do_sync_core(void *info) @@ -747,34 +1295,390 @@ sync_core(); } -static bool bp_patching_in_progress; -static void *bp_int3_handler, *bp_int3_addr; - -int poke_int3_handler(struct pt_regs *regs) +void text_poke_sync(void) { + on_each_cpu(do_sync_core, NULL, 1); +} + +struct text_poke_loc { + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ + u8 old; +}; + +struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; + atomic_t refs; +}; + +static struct bp_patching_desc bp_desc; + +static __always_inline +struct bp_patching_desc *try_get_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + if (!arch_atomic_inc_not_zero(&desc->refs)) + return NULL; + + return desc; +} + +static __always_inline void put_desc(void) +{ + struct bp_patching_desc *desc = &bp_desc; + + smp_mb__before_atomic(); + arch_atomic_dec(&desc->refs); +} + +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +{ + return _stext + tp->rel_addr; +} + +static __always_inline int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < text_poke_addr(tp)) + return -1; + if (key > text_poke_addr(tp)) + return 1; + return 0; +} + +noinstr int poke_int3_handler(struct pt_regs *regs) +{ + struct bp_patching_desc *desc; + struct text_poke_loc *tp; + int ret = 0; + void *ip; + + if (user_mode(regs)) + return 0; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching_in_progress. + * bp_desc with non-zero refcount: * - * in_progress = TRUE INT3 - * WMB RMB - * write INT3 if (in_progress) - * - * Idem for bp_int3_handler. + * bp_desc.refs = 1 INT3 + * WMB RMB + * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); - if (likely(!bp_patching_in_progress)) + desc = try_get_desc(); + if (!desc) return 0; - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) - return 0; + /* + * Discount the INT3. See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - INT3_INSN_SIZE; - /* set up the specified breakpoint handler */ - regs->ip = (unsigned long) bp_int3_handler; + /* + * Skip the binary search if there is a single member in the vector. + */ + if (unlikely(desc->nr_entries > 1)) { + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + goto out_put; + } else { + tp = desc->vec; + if (text_poke_addr(tp) != ip) + goto out_put; + } - return 1; + ip += tp->len; + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + goto out_put; + + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->disp); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->disp); + break; + + default: + BUG(); + } + + ret = 1; + +out_put: + put_desc(); + return ret; +} + +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + +/** + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector + * + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * We completely avoid stop_machine() here, and achieve the + * synchronization using int3 breakpoint. + * + * The way it is done: + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched + * - sync cores + * - For each entry in the vector: + * - update all but the first byte of the patched range + * - sync cores + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + */ +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +{ + unsigned char int3 = INT3_INSN_OPCODE; + unsigned int i; + int do_sync; + + lockdep_assert_held(&text_mutex); + + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. + */ + atomic_set_release(&bp_desc.refs, 1); + + /* + * Corresponding read barrier in int3 notifier for making sure the + * nr_entries and handler are correctly ordered wrt. patching. + */ + smp_wmb(); + + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } + + text_poke_sync(); + + /* + * Second step: update all but the first byte of the patched range. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; + int len = tp[i].len; + + if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + (const char *)tp[i].text + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); + do_sync++; + } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); + } + + if (do_sync) { + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + text_poke_sync(); + } + + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. + */ + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); + do_sync++; + } + + if (do_sync) + text_poke_sync(); + + /* + * Remove and wait for refs to be zero. + */ + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); +} + +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + int ret, i; + + memcpy((void *)tp->text, opcode, len); + if (!emulate) + emulate = opcode; + + ret = insn_decode_kernel(&insn, emulate); + BUG_ON(ret < 0); + + tp->rel_addr = addr - (void *)_stext; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. + */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + }; + + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->disp = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->disp = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->disp = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + +/* + * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * early if needed. + */ +static bool tp_order_fail(void *addr) +{ + struct text_poke_loc *tp; + + if (!tp_vec_nr) + return false; + + if (!addr) /* force */ + return true; + + tp = &tp_vec[tp_vec_nr - 1]; + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + return true; + + return false; +} + +static void text_poke_flush(void *addr) +{ + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { + text_poke_bp_batch(tp_vec, tp_vec_nr); + tp_vec_nr = 0; + } +} + +void text_poke_finish(void) +{ + text_poke_flush(NULL); +} + +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +{ + struct text_poke_loc *tp; + + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; + } + + text_poke_flush(addr); + + tp = &tp_vec[tp_vec_nr++]; + text_poke_loc_init(tp, addr, opcode, len, emulate); } /** @@ -784,62 +1688,19 @@ * @len: length to copy * @handler: address to jump to when the temporary breakpoint is hit * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. - * - * The way it is done: - * - add a int3 trap to the address that will be patched - * - sync cores - * - update all but the first byte of the patched range - * - sync cores - * - replace the first byte (int3) by the first byte of - * replacing opcode - * - sync cores + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. */ -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - unsigned char int3 = 0xcc; + struct text_poke_loc tp; - bp_int3_handler = handler; - bp_int3_addr = (u8 *)addr + sizeof(int3); - bp_patching_in_progress = true; - - lockdep_assert_held(&text_mutex); - - /* - * Corresponding read barrier in int3 notifier for making sure the - * in_progress and handler are correctly ordered wrt. patching. - */ - smp_wmb(); - - text_poke(addr, &int3, sizeof(int3)); - - on_each_cpu(do_sync_core, NULL, 1); - - if (len - sizeof(int3) > 0) { - /* patch all but the first byte */ - text_poke((char *)addr + sizeof(int3), - (const char *) opcode + sizeof(int3), - len - sizeof(int3)); - /* - * According to Intel, this core syncing is very likely - * not necessary and we'd be safe even without it. But - * better safe than sorry (plus there's not only Intel). - */ - on_each_cpu(do_sync_core, NULL, 1); + if (unlikely(system_state == SYSTEM_BOOTING)) { + text_poke_early(addr, opcode, len); + return; } - /* patch the first byte */ - text_poke(addr, opcode, sizeof(int3)); - - on_each_cpu(do_sync_core, NULL, 1); - /* - * sync_core() implies an smp_mb() and orders this store against - * the writing of the new instruction. - */ - bp_patching_in_progress = false; - - return addr; + text_poke_loc_init(&tp, addr, opcode, len, emulate); + text_poke_bp_batch(&tp, 1); } - -- Gitblit v1.6.2