.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | #define pr_fmt(fmt) "SMP alternatives: " fmt |
---|
2 | 3 | |
---|
3 | 4 | #include <linux/module.h> |
---|
4 | 5 | #include <linux/sched.h> |
---|
| 6 | +#include <linux/perf_event.h> |
---|
5 | 7 | #include <linux/mutex.h> |
---|
6 | 8 | #include <linux/list.h> |
---|
7 | 9 | #include <linux/stringify.h> |
---|
| 10 | +#include <linux/highmem.h> |
---|
8 | 11 | #include <linux/mm.h> |
---|
9 | 12 | #include <linux/vmalloc.h> |
---|
10 | 13 | #include <linux/memory.h> |
---|
11 | 14 | #include <linux/stop_machine.h> |
---|
12 | 15 | #include <linux/slab.h> |
---|
13 | 16 | #include <linux/kdebug.h> |
---|
| 17 | +#include <linux/kprobes.h> |
---|
| 18 | +#include <linux/mmu_context.h> |
---|
| 19 | +#include <linux/bsearch.h> |
---|
| 20 | +#include <linux/sync_core.h> |
---|
14 | 21 | #include <asm/text-patching.h> |
---|
15 | 22 | #include <asm/alternative.h> |
---|
16 | 23 | #include <asm/sections.h> |
---|
17 | | -#include <asm/pgtable.h> |
---|
18 | 24 | #include <asm/mce.h> |
---|
19 | 25 | #include <asm/nmi.h> |
---|
20 | 26 | #include <asm/cacheflush.h> |
---|
21 | 27 | #include <asm/tlbflush.h> |
---|
| 28 | +#include <asm/insn.h> |
---|
22 | 29 | #include <asm/io.h> |
---|
23 | 30 | #include <asm/fixmap.h> |
---|
| 31 | +#include <asm/asm-prototypes.h> |
---|
24 | 32 | |
---|
25 | 33 | int __read_mostly alternatives_patched; |
---|
26 | 34 | |
---|
.. | .. |
---|
49 | 57 | #define DPRINTK(fmt, args...) \ |
---|
50 | 58 | do { \ |
---|
51 | 59 | if (debug_alternative) \ |
---|
52 | | - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ |
---|
| 60 | + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ |
---|
53 | 61 | } while (0) |
---|
54 | 62 | |
---|
55 | 63 | #define DUMP_BYTES(buf, len, fmt, args...) \ |
---|
.. | .. |
---|
60 | 68 | if (!(len)) \ |
---|
61 | 69 | break; \ |
---|
62 | 70 | \ |
---|
63 | | - printk(KERN_DEBUG fmt, ##args); \ |
---|
| 71 | + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ |
---|
64 | 72 | for (j = 0; j < (len) - 1; j++) \ |
---|
65 | 73 | printk(KERN_CONT "%02hhx ", buf[j]); \ |
---|
66 | 74 | printk(KERN_CONT "%02hhx\n", buf[j]); \ |
---|
.. | .. |
---|
222 | 230 | } |
---|
223 | 231 | break; |
---|
224 | 232 | |
---|
| 233 | + case X86_VENDOR_HYGON: |
---|
| 234 | + ideal_nops = p6_nops; |
---|
| 235 | + return; |
---|
| 236 | + |
---|
225 | 237 | case X86_VENDOR_AMD: |
---|
226 | 238 | if (boot_cpu_data.x86 > 0xf) { |
---|
227 | 239 | ideal_nops = p6_nops; |
---|
228 | 240 | return; |
---|
229 | 241 | } |
---|
230 | 242 | |
---|
231 | | - /* fall through */ |
---|
| 243 | + fallthrough; |
---|
232 | 244 | |
---|
233 | 245 | default: |
---|
234 | 246 | #ifdef CONFIG_X86_64 |
---|
.. | .. |
---|
257 | 269 | } |
---|
258 | 270 | } |
---|
259 | 271 | |
---|
| 272 | +extern s32 __retpoline_sites[], __retpoline_sites_end[]; |
---|
| 273 | +extern s32 __return_sites[], __return_sites_end[]; |
---|
260 | 274 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
---|
261 | 275 | extern s32 __smp_locks[], __smp_locks_end[]; |
---|
262 | | -void *text_poke_early(void *addr, const void *opcode, size_t len); |
---|
| 276 | +void text_poke_early(void *addr, const void *opcode, size_t len); |
---|
263 | 277 | |
---|
264 | 278 | /* |
---|
265 | 279 | * Are we looking at a near JMP with a 1 or 4-byte displacement? |
---|
.. | .. |
---|
270 | 284 | } |
---|
271 | 285 | |
---|
272 | 286 | static void __init_or_module |
---|
273 | | -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) |
---|
| 287 | +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) |
---|
274 | 288 | { |
---|
275 | 289 | u8 *next_rip, *tgt_rip; |
---|
276 | 290 | s32 n_dspl, o_dspl; |
---|
.. | .. |
---|
279 | 293 | if (a->replacementlen != 5) |
---|
280 | 294 | return; |
---|
281 | 295 | |
---|
282 | | - o_dspl = *(s32 *)(insnbuf + 1); |
---|
| 296 | + o_dspl = *(s32 *)(insn_buff + 1); |
---|
283 | 297 | |
---|
284 | 298 | /* next_rip of the replacement JMP */ |
---|
285 | 299 | next_rip = repl_insn + a->replacementlen; |
---|
.. | .. |
---|
305 | 319 | two_byte_jmp: |
---|
306 | 320 | n_dspl -= 2; |
---|
307 | 321 | |
---|
308 | | - insnbuf[0] = 0xeb; |
---|
309 | | - insnbuf[1] = (s8)n_dspl; |
---|
310 | | - add_nops(insnbuf + 2, 3); |
---|
| 322 | + insn_buff[0] = 0xeb; |
---|
| 323 | + insn_buff[1] = (s8)n_dspl; |
---|
| 324 | + add_nops(insn_buff + 2, 3); |
---|
311 | 325 | |
---|
312 | 326 | repl_len = 2; |
---|
313 | 327 | goto done; |
---|
.. | .. |
---|
315 | 329 | five_byte_jmp: |
---|
316 | 330 | n_dspl -= 5; |
---|
317 | 331 | |
---|
318 | | - insnbuf[0] = 0xe9; |
---|
319 | | - *(s32 *)&insnbuf[1] = n_dspl; |
---|
| 332 | + insn_buff[0] = 0xe9; |
---|
| 333 | + *(s32 *)&insn_buff[1] = n_dspl; |
---|
320 | 334 | |
---|
321 | 335 | repl_len = 5; |
---|
322 | 336 | |
---|
.. | .. |
---|
327 | 341 | } |
---|
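A minimal worked example of the displacement fixup done by recompute_jump(), assuming a replacement JMP.d32 whose recomputed target lands 16 bytes past the start of the original site, so the displacement fits in a signed byte (bytes are illustrative, not from the source):

```c
/*
 *   replacement:  e9 0b 00 00 00    JMP.d32  (n_dspl = 16 - 5 = 11)
 *   patched to:   eb 0e 90 90 90    JMP.d8   (n_dspl = 16 - 2 = 14),
 *                                   padded with 3 NOPs by add_nops()
 */
```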
328 | 342 | |
---|
329 | 343 | /* |
---|
| 344 | + * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) |
---|
| 345 | + * |
---|
| 346 | + * @instr: instruction byte stream |
---|
| 347 | + * @instrlen: length of the above |
---|
| 348 | + * @off: offset within @instr where the first NOP has been detected |
---|
| 349 | + * |
---|
| 350 | + * Return: number of NOPs found (and replaced). |
---|
| 351 | + */ |
---|
| 352 | +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) |
---|
| 353 | +{ |
---|
| 354 | + unsigned long flags; |
---|
| 355 | + int i = off, nnops; |
---|
| 356 | + |
---|
| 357 | + while (i < instrlen) { |
---|
| 358 | + if (instr[i] != 0x90) |
---|
| 359 | + break; |
---|
| 360 | + |
---|
| 361 | + i++; |
---|
| 362 | + } |
---|
| 363 | + |
---|
| 364 | + nnops = i - off; |
---|
| 365 | + |
---|
| 366 | + if (nnops <= 1) |
---|
| 367 | + return nnops; |
---|
| 368 | + |
---|
| 369 | + local_irq_save(flags); |
---|
| 370 | + add_nops(instr + off, nnops); |
---|
| 371 | + local_irq_restore(flags); |
---|
| 372 | + |
---|
| 373 | + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); |
---|
| 374 | + |
---|
| 375 | + return nnops; |
---|
| 376 | +} |
---|
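For instance, assuming the 64-bit p6_nops table, a run of four single-byte NOPs detected at @off is rewritten in place as one 4-byte NOP:

```c
/*
 * Before:  90 90 90 90            four NOP1s
 * After:   0f 1f 40 00            NOPL 0x0(%rax), via add_nops()
 *
 * optimize_nops_range() then returns 4, so the caller's scan
 * resumes past the rewritten range.
 */
```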
| 377 | + |
---|
| 378 | +/* |
---|
330 | 379 | * "noinline" to cause control flow change and thus invalidate I$ and |
---|
331 | 380 | * cause refetch after modification. |
---|
332 | 381 | */ |
---|
333 | | -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) |
---|
| 382 | +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) |
---|
334 | 383 | { |
---|
335 | | - unsigned long flags; |
---|
336 | | - int i; |
---|
| 384 | + struct insn insn; |
---|
| 385 | + int i = 0; |
---|
337 | 386 | |
---|
338 | | - for (i = 0; i < a->padlen; i++) { |
---|
339 | | - if (instr[i] != 0x90) |
---|
| 387 | + /* |
---|
| 388 | + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger |
---|
| 389 | + * ones. |
---|
| 390 | + */ |
---|
| 391 | + for (;;) { |
---|
| 392 | + if (insn_decode_kernel(&insn, &instr[i])) |
---|
| 393 | + return; |
---|
| 394 | + |
---|
| 395 | + /* |
---|
| 396 | + * See if this and any potentially following NOPs can be |
---|
| 397 | + * optimized. |
---|
| 398 | + */ |
---|
| 399 | + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) |
---|
| 400 | + i += optimize_nops_range(instr, len, i); |
---|
| 401 | + else |
---|
| 402 | + i += insn.length; |
---|
| 403 | + |
---|
| 404 | + if (i >= len) |
---|
340 | 405 | return; |
---|
341 | 406 | } |
---|
342 | | - |
---|
343 | | - local_irq_save(flags); |
---|
344 | | - add_nops(instr + (a->instrlen - a->padlen), a->padlen); |
---|
345 | | - local_irq_restore(flags); |
---|
346 | | - |
---|
347 | | - DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", |
---|
348 | | - instr, a->instrlen - a->padlen, a->padlen); |
---|
349 | 407 | } |
---|
350 | 408 | |
---|
351 | 409 | /* |
---|
.. | .. |
---|
363 | 421 | { |
---|
364 | 422 | struct alt_instr *a; |
---|
365 | 423 | u8 *instr, *replacement; |
---|
366 | | - u8 insnbuf[MAX_PATCH_LEN]; |
---|
| 424 | + u8 insn_buff[MAX_PATCH_LEN]; |
---|
367 | 425 | |
---|
368 | 426 | DPRINTK("alt table %px, -> %px", start, end); |
---|
369 | 427 | /* |
---|
.. | .. |
---|
376 | 434 | * order. |
---|
377 | 435 | */ |
---|
378 | 436 | for (a = start; a < end; a++) { |
---|
379 | | - int insnbuf_sz = 0; |
---|
| 437 | + int insn_buff_sz = 0; |
---|
| 438 | + /* Mask away "NOT" flag bit for feature to test. */ |
---|
| 439 | + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; |
---|
380 | 440 | |
---|
381 | 441 | instr = (u8 *)&a->instr_offset + a->instr_offset; |
---|
382 | 442 | replacement = (u8 *)&a->repl_offset + a->repl_offset; |
---|
383 | | - BUG_ON(a->instrlen > sizeof(insnbuf)); |
---|
384 | | - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); |
---|
385 | | - if (!boot_cpu_has(a->cpuid)) { |
---|
386 | | - if (a->padlen > 1) |
---|
387 | | - optimize_nops(a, instr); |
---|
| 443 | + BUG_ON(a->instrlen > sizeof(insn_buff)); |
---|
| 444 | + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); |
---|
388 | 445 | |
---|
389 | | - continue; |
---|
390 | | - } |
---|
| 446 | + /* |
---|
| 447 | + * Patch if either: |
---|
| 448 | + * - feature is present |
---|
| 449 | + * - feature not present but ALTINSTR_FLAG_INV is set to mean, |
---|
| 450 | + * patch if feature is *NOT* present. |
---|
| 451 | + */ |
---|
| 452 | + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) |
---|
| 453 | + goto next; |
---|
391 | 454 | |
---|
392 | | - DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", |
---|
393 | | - a->cpuid >> 5, |
---|
394 | | - a->cpuid & 0x1f, |
---|
395 | | - instr, a->instrlen, |
---|
396 | | - replacement, a->replacementlen, a->padlen); |
---|
| 455 | + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", |
---|
| 456 | + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", |
---|
| 457 | + feature >> 5, |
---|
| 458 | + feature & 0x1f, |
---|
| 459 | + instr, instr, a->instrlen, |
---|
| 460 | + replacement, a->replacementlen); |
---|
397 | 461 | |
---|
398 | 462 | DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); |
---|
399 | 463 | DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); |
---|
400 | 464 | |
---|
401 | | - memcpy(insnbuf, replacement, a->replacementlen); |
---|
402 | | - insnbuf_sz = a->replacementlen; |
---|
| 465 | + memcpy(insn_buff, replacement, a->replacementlen); |
---|
| 466 | + insn_buff_sz = a->replacementlen; |
---|
403 | 467 | |
---|
404 | 468 | /* |
---|
405 | 469 | * 0xe8 is a relative jump; fix the offset. |
---|
.. | .. |
---|
407 | 471 | * Instruction length is checked before the opcode to avoid |
---|
408 | 472 | * accessing uninitialized bytes for zero-length replacements. |
---|
409 | 473 | */ |
---|
410 | | - if (a->replacementlen == 5 && *insnbuf == 0xe8) { |
---|
411 | | - *(s32 *)(insnbuf + 1) += replacement - instr; |
---|
| 474 | + if (a->replacementlen == 5 && *insn_buff == 0xe8) { |
---|
| 475 | + *(s32 *)(insn_buff + 1) += replacement - instr; |
---|
412 | 476 | DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", |
---|
413 | | - *(s32 *)(insnbuf + 1), |
---|
414 | | - (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); |
---|
| 477 | + *(s32 *)(insn_buff + 1), |
---|
| 478 | + (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); |
---|
415 | 479 | } |
---|
416 | 480 | |
---|
417 | 481 | if (a->replacementlen && is_jmp(replacement[0])) |
---|
418 | | - recompute_jump(a, instr, replacement, insnbuf); |
---|
| 482 | + recompute_jump(a, instr, replacement, insn_buff); |
---|
419 | 483 | |
---|
420 | | - if (a->instrlen > a->replacementlen) { |
---|
421 | | - add_nops(insnbuf + a->replacementlen, |
---|
422 | | - a->instrlen - a->replacementlen); |
---|
423 | | - insnbuf_sz += a->instrlen - a->replacementlen; |
---|
424 | | - } |
---|
425 | | - DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); |
---|
| 484 | + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) |
---|
| 485 | + insn_buff[insn_buff_sz] = 0x90; |
---|
426 | 486 | |
---|
427 | | - text_poke_early(instr, insnbuf, insnbuf_sz); |
---|
| 487 | + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); |
---|
| 488 | + |
---|
| 489 | + text_poke_early(instr, insn_buff, insn_buff_sz); |
---|
| 490 | + |
---|
| 491 | +next: |
---|
| 492 | + optimize_nops(instr, a->instrlen); |
---|
428 | 493 | } |
---|
429 | 494 | } |
---|
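The double-negation test in apply_alternatives() reduces to an XOR of "feature present" with the inversion flag. A standalone sketch (the helper name is ours, not the kernel's):

```c
/* Patch when presence and inversion disagree: present && !INV, or absent && INV. */
static bool should_patch(bool feature_present, bool inverted)
{
	return feature_present != inverted;
}
```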
| 495 | + |
---|
| 496 | +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) |
---|
| 497 | + |
---|
| 498 | +/* |
---|
| 499 | + * CALL/JMP *%\reg |
---|
| 500 | + */ |
---|
| 501 | +static int emit_indirect(int op, int reg, u8 *bytes) |
---|
| 502 | +{ |
---|
| 503 | + int i = 0; |
---|
| 504 | + u8 modrm; |
---|
| 505 | + |
---|
| 506 | + switch (op) { |
---|
| 507 | + case CALL_INSN_OPCODE: |
---|
| 508 | + modrm = 0x10; /* Reg = 2; CALL r/m */ |
---|
| 509 | + break; |
---|
| 510 | + |
---|
| 511 | + case JMP32_INSN_OPCODE: |
---|
| 512 | + modrm = 0x20; /* Reg = 4; JMP r/m */ |
---|
| 513 | + break; |
---|
| 514 | + |
---|
| 515 | + default: |
---|
| 516 | + WARN_ON_ONCE(1); |
---|
| 517 | + return -1; |
---|
| 518 | + } |
---|
| 519 | + |
---|
| 520 | + if (reg >= 8) { |
---|
| 521 | + bytes[i++] = 0x41; /* REX.B prefix */ |
---|
| 522 | + reg -= 8; |
---|
| 523 | + } |
---|
| 524 | + |
---|
| 525 | + modrm |= 0xc0; /* Mod = 3 */ |
---|
| 526 | + modrm += reg; |
---|
| 527 | + |
---|
| 528 | + bytes[i++] = 0xff; /* opcode */ |
---|
| 529 | + bytes[i++] = modrm; |
---|
| 530 | + |
---|
| 531 | + return i; |
---|
| 532 | +} |
---|
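Some example encodings emit_indirect() produces (AT&T syntax; registers beyond the first eight cost one extra byte for the REX.B prefix):

```c
/*
 *   emit_indirect(CALL_INSN_OPCODE,  0, bytes)  ->  ff d0       call *%rax
 *   emit_indirect(JMP32_INSN_OPCODE, 3, bytes)  ->  ff e3       jmp  *%rbx
 *   emit_indirect(CALL_INSN_OPCODE, 11, bytes)  ->  41 ff d3    call *%r11
 */
```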
| 533 | + |
---|
| 534 | +/* |
---|
| 535 | + * Rewrite the compiler generated retpoline thunk calls. |
---|
| 536 | + * |
---|
| 537 | + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate |
---|
| 538 | + * indirect instructions, avoiding the extra indirection. |
---|
| 539 | + * |
---|
| 540 | + * For example, convert: |
---|
| 541 | + * |
---|
| 542 | + * CALL __x86_indirect_thunk_\reg |
---|
| 543 | + * |
---|
| 544 | + * into: |
---|
| 545 | + * |
---|
| 546 | + * CALL *%\reg |
---|
| 547 | + * |
---|
| 548 | + * It also tries to inline spectre_v2=retpoline,amd when size permits. |
---|
| 549 | + */ |
---|
| 550 | +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) |
---|
| 551 | +{ |
---|
| 552 | + retpoline_thunk_t *target; |
---|
| 553 | + int reg, ret, i = 0; |
---|
| 554 | + u8 op, cc; |
---|
| 555 | + |
---|
| 556 | + target = addr + insn->length + insn->immediate.value; |
---|
| 557 | + reg = target - __x86_indirect_thunk_array; |
---|
| 558 | + |
---|
| 559 | + if (WARN_ON_ONCE(reg & ~0xf)) |
---|
| 560 | + return -1; |
---|
| 561 | + |
---|
| 562 | + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ |
---|
| 563 | + BUG_ON(reg == 4); |
---|
| 564 | + |
---|
| 565 | + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && |
---|
| 566 | + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) |
---|
| 567 | + return -1; |
---|
| 568 | + |
---|
| 569 | + op = insn->opcode.bytes[0]; |
---|
| 570 | + |
---|
| 571 | + /* |
---|
| 572 | + * Convert: |
---|
| 573 | + * |
---|
| 574 | + * Jcc.d32 __x86_indirect_thunk_\reg |
---|
| 575 | + * |
---|
| 576 | + * into: |
---|
| 577 | + * |
---|
| 578 | + * Jncc.d8 1f |
---|
| 579 | + * [ LFENCE ] |
---|
| 580 | + * JMP *%\reg |
---|
| 581 | + * [ NOP ] |
---|
| 582 | + * 1: |
---|
| 583 | + */ |
---|
| 584 | + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ |
---|
| 585 | + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { |
---|
| 586 | + cc = insn->opcode.bytes[1] & 0xf; |
---|
| 587 | + cc ^= 1; /* invert condition */ |
---|
| 588 | + |
---|
| 589 | + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ |
---|
| 590 | + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ |
---|
| 591 | + |
---|
| 592 | + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ |
---|
| 593 | + op = JMP32_INSN_OPCODE; |
---|
| 594 | + } |
---|
| 595 | + |
---|
| 596 | + /* |
---|
| 597 | + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. |
---|
| 598 | + */ |
---|
| 599 | + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { |
---|
| 600 | + bytes[i++] = 0x0f; |
---|
| 601 | + bytes[i++] = 0xae; |
---|
| 602 | + bytes[i++] = 0xe8; /* LFENCE */ |
---|
| 603 | + } |
---|
| 604 | + |
---|
| 605 | + ret = emit_indirect(op, reg, bytes + i); |
---|
| 606 | + if (ret < 0) |
---|
| 607 | + return ret; |
---|
| 608 | + i += ret; |
---|
| 609 | + |
---|
| 610 | + for (; i < insn->length;) |
---|
| 611 | + bytes[i++] = 0x90; |
---|
| 612 | + |
---|
| 613 | + return i; |
---|
| 614 | +} |
---|
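Two worked byte sequences for the rewrite above, assuming a 5-byte compiler-generated thunk call or jump:

```c
/*
 * Retpolines off:
 *   e8 xx xx xx xx      call __x86_indirect_thunk_r11
 * becomes, NOP-padded to the original length:
 *   41 ff d3 90 90      call *%r11; nop; nop
 *
 * RETPOLINE_LFENCE, jump through %rax, exactly five bytes:
 *   0f ae e8 ff e0      lfence; jmp *%rax
 */
```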
| 615 | + |
---|
| 616 | +/* |
---|
| 617 | + * Generated by 'objtool --retpoline'. |
---|
| 618 | + */ |
---|
| 619 | +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) |
---|
| 620 | +{ |
---|
| 621 | + s32 *s; |
---|
| 622 | + |
---|
| 623 | + for (s = start; s < end; s++) { |
---|
| 624 | + void *addr = (void *)s + *s; |
---|
| 625 | + struct insn insn; |
---|
| 626 | + int len, ret; |
---|
| 627 | + u8 bytes[16]; |
---|
| 628 | + u8 op1, op2; |
---|
| 629 | + |
---|
| 630 | + ret = insn_decode_kernel(&insn, addr); |
---|
| 631 | + if (WARN_ON_ONCE(ret < 0)) |
---|
| 632 | + continue; |
---|
| 633 | + |
---|
| 634 | + op1 = insn.opcode.bytes[0]; |
---|
| 635 | + op2 = insn.opcode.bytes[1]; |
---|
| 636 | + |
---|
| 637 | + switch (op1) { |
---|
| 638 | + case CALL_INSN_OPCODE: |
---|
| 639 | + case JMP32_INSN_OPCODE: |
---|
| 640 | + break; |
---|
| 641 | + |
---|
| 642 | + case 0x0f: /* escape */ |
---|
| 643 | + if (op2 >= 0x80 && op2 <= 0x8f) |
---|
| 644 | + break; |
---|
| 645 | + fallthrough; |
---|
| 646 | + default: |
---|
| 647 | + WARN_ON_ONCE(1); |
---|
| 648 | + continue; |
---|
| 649 | + } |
---|
| 650 | + |
---|
| 651 | + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", |
---|
| 652 | + addr, addr, insn.length, |
---|
| 653 | + addr + insn.length + insn.immediate.value); |
---|
| 654 | + |
---|
| 655 | + len = patch_retpoline(addr, &insn, bytes); |
---|
| 656 | + if (len == insn.length) { |
---|
| 657 | + optimize_nops(bytes, len); |
---|
| 658 | + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); |
---|
| 659 | + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); |
---|
| 660 | + text_poke_early(addr, bytes, len); |
---|
| 661 | + } |
---|
| 662 | + } |
---|
| 663 | +} |
---|
| 664 | + |
---|
| 665 | +#ifdef CONFIG_RETHUNK |
---|
| 666 | +/* |
---|
| 667 | + * Rewrite the compiler generated return thunk tail-calls. |
---|
| 668 | + * |
---|
| 669 | + * For example, convert: |
---|
| 670 | + * |
---|
| 671 | + * JMP __x86_return_thunk |
---|
| 672 | + * |
---|
| 673 | + * into: |
---|
| 674 | + * |
---|
| 675 | + * RET |
---|
| 676 | + */ |
---|
| 677 | +static int patch_return(void *addr, struct insn *insn, u8 *bytes) |
---|
| 678 | +{ |
---|
| 679 | + int i = 0; |
---|
| 680 | + |
---|
| 681 | + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) |
---|
| 682 | + return -1; |
---|
| 683 | + |
---|
| 684 | + bytes[i++] = RET_INSN_OPCODE; |
---|
| 685 | + |
---|
| 686 | + for (; i < insn->length;) |
---|
| 687 | + bytes[i++] = INT3_INSN_OPCODE; |
---|
| 688 | + |
---|
| 689 | + return i; |
---|
| 690 | +} |
---|
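Concretely, a 5-byte tail-call to the return thunk becomes a bare RET with INT3 padding:

```c
/*
 *   e9 xx xx xx xx      jmp __x86_return_thunk
 * becomes:
 *   c3 cc cc cc cc      ret; int3; int3; int3; int3
 */
```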
| 691 | + |
---|
| 692 | +void __init_or_module noinline apply_returns(s32 *start, s32 *end) |
---|
| 693 | +{ |
---|
| 694 | + s32 *s; |
---|
| 695 | + |
---|
| 696 | + for (s = start; s < end; s++) { |
---|
| 697 | + void *dest = NULL, *addr = (void *)s + *s; |
---|
| 698 | + struct insn insn; |
---|
| 699 | + int len, ret; |
---|
| 700 | + u8 bytes[16]; |
---|
| 701 | + u8 op; |
---|
| 702 | + |
---|
| 703 | + ret = insn_decode_kernel(&insn, addr); |
---|
| 704 | + if (WARN_ON_ONCE(ret < 0)) |
---|
| 705 | + continue; |
---|
| 706 | + |
---|
| 707 | + op = insn.opcode.bytes[0]; |
---|
| 708 | + if (op == JMP32_INSN_OPCODE) |
---|
| 709 | + dest = addr + insn.length + insn.immediate.value; |
---|
| 710 | + |
---|
| 711 | + if (__static_call_fixup(addr, op, dest) || |
---|
| 712 | + WARN_ONCE(dest != &__x86_return_thunk, |
---|
| 713 | + "missing return thunk: %pS-%pS: %*ph", |
---|
| 714 | + addr, dest, 5, addr)) |
---|
| 715 | + continue; |
---|
| 716 | + |
---|
| 717 | + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", |
---|
| 718 | + addr, addr, insn.length, |
---|
| 719 | + addr + insn.length + insn.immediate.value); |
---|
| 720 | + |
---|
| 721 | + len = patch_return(addr, &insn, bytes); |
---|
| 722 | + if (len == insn.length) { |
---|
| 723 | + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); |
---|
| 724 | + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); |
---|
| 725 | + text_poke_early(addr, bytes, len); |
---|
| 726 | + } |
---|
| 727 | + } |
---|
| 728 | +} |
---|
| 729 | +#else |
---|
| 730 | +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } |
---|
| 731 | +#endif /* CONFIG_RETHUNK */ |
---|
| 732 | + |
---|
| 733 | +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ |
---|
| 734 | + |
---|
| 735 | +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } |
---|
| 736 | +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } |
---|
| 737 | + |
---|
| 738 | +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ |
---|
430 | 739 | |
---|
431 | 740 | #ifdef CONFIG_SMP |
---|
432 | 741 | static void alternatives_smp_lock(const s32 *start, const s32 *end, |
---|
.. | .. |
---|
586 | 895 | struct paravirt_patch_site *end) |
---|
587 | 896 | { |
---|
588 | 897 | struct paravirt_patch_site *p; |
---|
589 | | - char insnbuf[MAX_PATCH_LEN]; |
---|
| 898 | + char insn_buff[MAX_PATCH_LEN]; |
---|
590 | 899 | |
---|
591 | 900 | for (p = start; p < end; p++) { |
---|
592 | 901 | unsigned int used; |
---|
593 | 902 | |
---|
594 | 903 | BUG_ON(p->len > MAX_PATCH_LEN); |
---|
595 | 904 | /* prep the buffer with the original instructions */ |
---|
596 | | - memcpy(insnbuf, p->instr, p->len); |
---|
597 | | - used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, |
---|
598 | | - (unsigned long)p->instr, p->len); |
---|
| 905 | + memcpy(insn_buff, p->instr, p->len); |
---|
| 906 | + used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); |
---|
599 | 907 | |
---|
600 | 908 | BUG_ON(used > p->len); |
---|
601 | 909 | |
---|
602 | 910 | /* Pad the rest with nops */ |
---|
603 | | - add_nops(insnbuf + used, p->len - used); |
---|
604 | | - text_poke_early(p->instr, insnbuf, p->len); |
---|
| 911 | + add_nops(insn_buff + used, p->len - used); |
---|
| 912 | + text_poke_early(p->instr, insn_buff, p->len); |
---|
605 | 913 | } |
---|
606 | 914 | } |
---|
607 | 915 | extern struct paravirt_patch_site __start_parainstructions[], |
---|
608 | 916 | __stop_parainstructions[]; |
---|
609 | 917 | #endif /* CONFIG_PARAVIRT */ |
---|
610 | 918 | |
---|
| 919 | +/* |
---|
| 920 | + * Self-test for the INT3 based CALL emulation code. |
---|
| 921 | + * |
---|
| 922 | + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up |
---|
| 923 | + * properly and that there is a stack gap between the INT3 frame and the |
---|
| 925 | + * previous context. Without this gap, doing a virtual PUSH on the interrupted |
---|
| 925 | + * stack would corrupt the INT3 IRET frame. |
---|
| 926 | + * |
---|
| 927 | + * See entry_{32,64}.S for more details. |
---|
| 928 | + */ |
---|
| 929 | +static void __init __no_sanitize_address notrace int3_magic(unsigned int *ptr) |
---|
| 930 | +{ |
---|
| 931 | + *ptr = 1; |
---|
| 932 | +} |
---|
| 933 | + |
---|
| 934 | +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ |
---|
| 935 | + |
---|
| 936 | +static int __init |
---|
| 937 | +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) |
---|
| 938 | +{ |
---|
| 939 | + struct die_args *args = data; |
---|
| 940 | + struct pt_regs *regs = args->regs; |
---|
| 941 | + |
---|
| 942 | + if (!regs || user_mode(regs)) |
---|
| 943 | + return NOTIFY_DONE; |
---|
| 944 | + |
---|
| 945 | + if (val != DIE_INT3) |
---|
| 946 | + return NOTIFY_DONE; |
---|
| 947 | + |
---|
| 948 | + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) |
---|
| 949 | + return NOTIFY_DONE; |
---|
| 950 | + |
---|
| 951 | + int3_emulate_call(regs, (unsigned long)&int3_magic); |
---|
| 952 | + return NOTIFY_STOP; |
---|
| 953 | +} |
---|
| 954 | + |
---|
| 955 | +static void __init int3_selftest(void) |
---|
| 956 | +{ |
---|
| 957 | + static __initdata struct notifier_block int3_exception_nb = { |
---|
| 958 | + .notifier_call = int3_exception_notify, |
---|
| 959 | + .priority = INT_MAX-1, /* last */ |
---|
| 960 | + }; |
---|
| 961 | + unsigned int val = 0; |
---|
| 962 | + |
---|
| 963 | + BUG_ON(register_die_notifier(&int3_exception_nb)); |
---|
| 964 | + |
---|
| 965 | + /* |
---|
| 966 | + * Basically: int3_magic(&val); but really complicated :-) |
---|
| 967 | + * |
---|
| 968 | + * Stick the address of the INT3 instruction into int3_selftest_ip, |
---|
| 969 | + * then trigger the INT3, padded with NOPs to match a CALL instruction |
---|
| 970 | + * length. |
---|
| 971 | + */ |
---|
| 972 | + asm volatile ("1: int3; nop; nop; nop; nop\n\t" |
---|
| 973 | + ".pushsection .init.data,\"aw\"\n\t" |
---|
| 974 | + ".align " __ASM_SEL(4, 8) "\n\t" |
---|
| 975 | + ".type int3_selftest_ip, @object\n\t" |
---|
| 976 | + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" |
---|
| 977 | + "int3_selftest_ip:\n\t" |
---|
| 978 | + __ASM_SEL(.long, .quad) " 1b\n\t" |
---|
| 979 | + ".popsection\n\t" |
---|
| 980 | + : ASM_CALL_CONSTRAINT |
---|
| 981 | + : __ASM_SEL_RAW(a, D) (&val) |
---|
| 982 | + : "memory"); |
---|
| 983 | + |
---|
| 984 | + BUG_ON(val != 1); |
---|
| 985 | + |
---|
| 986 | + unregister_die_notifier(&int3_exception_nb); |
---|
| 987 | +} |
---|
| 988 | + |
---|
611 | 989 | void __init alternative_instructions(void) |
---|
612 | 990 | { |
---|
613 | | - /* The patching is not fully atomic, so try to avoid local interruptions |
---|
614 | | - that might execute the to be patched code. |
---|
615 | | - Other CPUs are not running. */ |
---|
| 991 | + int3_selftest(); |
---|
| 992 | + |
---|
| 993 | + /* |
---|
| 994 | + * The patching is not fully atomic, so try to avoid local |
---|
| 995 | + * interruptions that might execute the to be patched code. |
---|
| 996 | + * Other CPUs are not running. |
---|
| 997 | + */ |
---|
616 | 998 | stop_nmi(); |
---|
617 | 999 | |
---|
618 | 1000 | /* |
---|
619 | 1001 | * Don't stop machine check exceptions while patching. |
---|
620 | 1002 | * MCEs only happen when something got corrupted and in this |
---|
621 | 1003 | * case we must do something about the corruption. |
---|
622 | | - * Ignoring it is worse than a unlikely patching race. |
---|
| 1004 | + * Ignoring it is worse than an unlikely patching race. |
---|
623 | 1005 | * Also machine checks tend to be broadcast and if one CPU |
---|
624 | 1006 | * goes into machine check the others follow quickly, so we don't |
---|
625 | 1007 | * expect a machine check to cause undue problems during code |
---|
626 | 1008 | * patching. |
---|
627 | 1009 | */ |
---|
| 1010 | + |
---|
| 1011 | + /* |
---|
| 1012 | + * Rewrite the retpolines, must be done before alternatives since |
---|
| 1013 | + * those can rewrite the retpoline thunks. |
---|
| 1014 | + */ |
---|
| 1015 | + apply_retpolines(__retpoline_sites, __retpoline_sites_end); |
---|
| 1016 | + apply_returns(__return_sites, __return_sites_end); |
---|
628 | 1017 | |
---|
629 | 1018 | apply_alternatives(__alt_instructions, __alt_instructions_end); |
---|
630 | 1019 | |
---|
.. | .. |
---|
637 | 1026 | _text, _etext); |
---|
638 | 1027 | } |
---|
639 | 1028 | |
---|
640 | | - if (!uniproc_patched || num_possible_cpus() == 1) |
---|
| 1029 | + if (!uniproc_patched || num_possible_cpus() == 1) { |
---|
641 | 1030 | free_init_pages("SMP alternatives", |
---|
642 | 1031 | (unsigned long)__smp_locks, |
---|
643 | 1032 | (unsigned long)__smp_locks_end); |
---|
| 1033 | + } |
---|
644 | 1034 | #endif |
---|
645 | 1035 | |
---|
646 | 1036 | apply_paravirt(__parainstructions, __parainstructions_end); |
---|
.. | .. |
---|
658 | 1048 | * When you use this code to patch more than one byte of an instruction |
---|
659 | 1049 | * you need to make sure that other CPUs cannot execute this code in parallel. |
---|
660 | 1050 | * Also no thread must be currently preempted in the middle of these |
---|
661 | | - * instructions. And on the local CPU you need to be protected again NMI or MCE |
---|
662 | | - * handlers seeing an inconsistent instruction while you patch. |
---|
| 1051 | + * instructions. And on the local CPU you need to be protected against NMI or |
---|
| 1052 | + * MCE handlers seeing an inconsistent instruction while you patch. |
---|
663 | 1053 | */ |
---|
664 | | -void *__init_or_module text_poke_early(void *addr, const void *opcode, |
---|
665 | | - size_t len) |
---|
| 1054 | +void __init_or_module text_poke_early(void *addr, const void *opcode, |
---|
| 1055 | + size_t len) |
---|
666 | 1056 | { |
---|
667 | 1057 | unsigned long flags; |
---|
668 | 1058 | |
---|
.. | .. |
---|
685 | 1075 | * that causes hangs on some VIA CPUs. |
---|
686 | 1076 | */ |
---|
687 | 1077 | } |
---|
| 1078 | +} |
---|
| 1079 | + |
---|
| 1080 | +typedef struct { |
---|
| 1081 | + struct mm_struct *mm; |
---|
| 1082 | +} temp_mm_state_t; |
---|
| 1083 | + |
---|
| 1084 | +/* |
---|
| 1085 | + * Using a temporary mm allows setting temporary mappings that are not accessible |
---|
| 1086 | + * by other CPUs. Such mappings are needed to perform sensitive memory writes |
---|
| 1087 | + * that override the kernel memory protections (e.g., W^X), without exposing the |
---|
| 1088 | + * temporary page-table mappings that are required for these write operations to |
---|
| 1089 | + * other CPUs. Using a temporary mm also allows avoiding TLB shootdowns when the |
---|
| 1090 | + * mapping is torn down. |
---|
| 1091 | + * |
---|
| 1092 | + * Context: The temporary mm needs to be used exclusively by a single core. To |
---|
| 1093 | + * harden security, IRQs must be disabled while the temporary mm is |
---|
| 1094 | + * loaded, thereby preventing interrupt handler bugs from overriding |
---|
| 1095 | + * the kernel memory protection. |
---|
| 1096 | + */ |
---|
| 1097 | +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) |
---|
| 1098 | +{ |
---|
| 1099 | + temp_mm_state_t temp_state; |
---|
| 1100 | + |
---|
| 1101 | + lockdep_assert_irqs_disabled(); |
---|
| 1102 | + |
---|
| 1103 | + /* |
---|
| 1104 | + * Make sure not to be in TLB lazy mode, as otherwise we'll end up |
---|
| 1105 | + * with a stale address space WITHOUT being in lazy mode after |
---|
| 1106 | + * restoring the previous mm. |
---|
| 1107 | + */ |
---|
| 1108 | + if (this_cpu_read(cpu_tlbstate.is_lazy)) |
---|
| 1109 | + leave_mm(smp_processor_id()); |
---|
| 1110 | + |
---|
| 1111 | + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); |
---|
| 1112 | + switch_mm_irqs_off(NULL, mm, current); |
---|
| 1113 | + |
---|
| 1114 | + /* |
---|
| 1115 | + * If breakpoints are enabled, disable them while the temporary mm is |
---|
| 1116 | + * used. Userspace might set up watchpoints on addresses that are used |
---|
| 1117 | + * in the temporary mm, which would lead to wrong signals being sent or |
---|
| 1118 | + * crashes. |
---|
| 1119 | + * |
---|
| 1120 | + * Note that breakpoints are not disabled selectively, which also causes |
---|
| 1121 | + * kernel breakpoints (e.g., perf's) to be disabled. This might be |
---|
| 1122 | + * undesirable, but still seems reasonable as the code that runs in the |
---|
| 1123 | + * temporary mm should be short. |
---|
| 1124 | + */ |
---|
| 1125 | + if (hw_breakpoint_active()) |
---|
| 1126 | + hw_breakpoint_disable(); |
---|
| 1127 | + |
---|
| 1128 | + return temp_state; |
---|
| 1129 | +} |
---|
| 1130 | + |
---|
| 1131 | +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) |
---|
| 1132 | +{ |
---|
| 1133 | + lockdep_assert_irqs_disabled(); |
---|
| 1134 | + switch_mm_irqs_off(NULL, prev_state.mm, current); |
---|
| 1135 | + |
---|
| 1136 | + /* |
---|
| 1137 | + * Restore the breakpoints if they were disabled before the temporary mm |
---|
| 1138 | + * was loaded. |
---|
| 1139 | + */ |
---|
| 1140 | + if (hw_breakpoint_active()) |
---|
| 1141 | + hw_breakpoint_restore(); |
---|
| 1142 | +} |
---|
| 1143 | + |
---|
| 1144 | +__ro_after_init struct mm_struct *poking_mm; |
---|
| 1145 | +__ro_after_init unsigned long poking_addr; |
---|
| 1146 | + |
---|
| 1147 | +static void *__text_poke(void *addr, const void *opcode, size_t len) |
---|
| 1148 | +{ |
---|
| 1149 | + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; |
---|
| 1150 | + struct page *pages[2] = {NULL}; |
---|
| 1151 | + temp_mm_state_t prev; |
---|
| 1152 | + unsigned long flags; |
---|
| 1153 | + pte_t pte, *ptep; |
---|
| 1154 | + spinlock_t *ptl; |
---|
| 1155 | + pgprot_t pgprot; |
---|
| 1156 | + |
---|
| 1157 | + /* |
---|
| 1158 | + * While the boot memory allocator is running we cannot use struct pages as |
---|
| 1159 | + * they are not yet initialized. There is no way to recover. |
---|
| 1160 | + */ |
---|
| 1161 | + BUG_ON(!after_bootmem); |
---|
| 1162 | + |
---|
| 1163 | + if (!core_kernel_text((unsigned long)addr)) { |
---|
| 1164 | + pages[0] = vmalloc_to_page(addr); |
---|
| 1165 | + if (cross_page_boundary) |
---|
| 1166 | + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); |
---|
| 1167 | + } else { |
---|
| 1168 | + pages[0] = virt_to_page(addr); |
---|
| 1169 | + WARN_ON(!PageReserved(pages[0])); |
---|
| 1170 | + if (cross_page_boundary) |
---|
| 1171 | + pages[1] = virt_to_page(addr + PAGE_SIZE); |
---|
| 1172 | + } |
---|
| 1173 | + /* |
---|
| 1174 | + * If something went wrong, crash and burn since recovery paths are not |
---|
| 1175 | + * implemented. |
---|
| 1176 | + */ |
---|
| 1177 | + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); |
---|
| 1178 | + |
---|
| 1179 | + /* |
---|
| 1180 | + * Map the page without the global bit, as TLB flushing is done with |
---|
| 1181 | + * flush_tlb_mm_range(), which is intended for non-global PTEs. |
---|
| 1182 | + */ |
---|
| 1183 | + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); |
---|
| 1184 | + |
---|
| 1185 | + /* |
---|
| 1186 | + * The lock is not really needed, but it avoids open-coding. |
---|
| 1187 | + */ |
---|
| 1188 | + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); |
---|
| 1189 | + |
---|
| 1190 | + /* |
---|
| 1191 | + * This must not fail; preallocated in poking_init(). |
---|
| 1192 | + */ |
---|
| 1193 | + VM_BUG_ON(!ptep); |
---|
| 1194 | + |
---|
| 1195 | + local_irq_save(flags); |
---|
| 1196 | + |
---|
| 1197 | + pte = mk_pte(pages[0], pgprot); |
---|
| 1198 | + set_pte_at(poking_mm, poking_addr, ptep, pte); |
---|
| 1199 | + |
---|
| 1200 | + if (cross_page_boundary) { |
---|
| 1201 | + pte = mk_pte(pages[1], pgprot); |
---|
| 1202 | + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); |
---|
| 1203 | + } |
---|
| 1204 | + |
---|
| 1205 | + /* |
---|
| 1206 | + * Loading the temporary mm behaves as a compiler barrier, which |
---|
| 1207 | + * guarantees that the PTE will be set at the time memcpy() is done. |
---|
| 1208 | + */ |
---|
| 1209 | + prev = use_temporary_mm(poking_mm); |
---|
| 1210 | + |
---|
| 1211 | + kasan_disable_current(); |
---|
| 1212 | + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); |
---|
| 1213 | + kasan_enable_current(); |
---|
| 1214 | + |
---|
| 1215 | + /* |
---|
| 1216 | + * Ensure that the PTE is only cleared after the instructions of memcpy |
---|
| 1217 | + * were issued by using a compiler barrier. |
---|
| 1218 | + */ |
---|
| 1219 | + barrier(); |
---|
| 1220 | + |
---|
| 1221 | + pte_clear(poking_mm, poking_addr, ptep); |
---|
| 1222 | + if (cross_page_boundary) |
---|
| 1223 | + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); |
---|
| 1224 | + |
---|
| 1225 | + /* |
---|
| 1226 | + * Loading the previous page-table hierarchy requires a serializing |
---|
| 1227 | + * instruction that already allows the core to see the updated version. |
---|
| 1228 | + * Xen-PV is assumed to serialize execution in a similar manner. |
---|
| 1229 | + */ |
---|
| 1230 | + unuse_temporary_mm(prev); |
---|
| 1231 | + |
---|
| 1232 | + /* |
---|
| 1233 | + * Flushing the TLB might involve IPIs, which would require enabled |
---|
| 1234 | + * IRQs, but not if the mm is not used, as it is at this point. |
---|
| 1235 | + */ |
---|
| 1236 | + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + |
---|
| 1237 | + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, |
---|
| 1238 | + PAGE_SHIFT, false); |
---|
| 1239 | + |
---|
| 1240 | + /* |
---|
| 1241 | + * If the text does not match what we just wrote then something is |
---|
| 1242 | + * fundamentally screwy; there's nothing we can really do about that. |
---|
| 1243 | + */ |
---|
| 1244 | + BUG_ON(memcmp(addr, opcode, len)); |
---|
| 1245 | + |
---|
| 1246 | + local_irq_restore(flags); |
---|
| 1247 | + pte_unmap_unlock(ptep, ptl); |
---|
688 | 1248 | return addr; |
---|
689 | 1249 | } |
---|
690 | 1250 | |
---|
.. | .. |
---|
698 | 1258 | * It means the size must be writable atomically and the address must be aligned |
---|
699 | 1259 | * in a way that permits an atomic write. It also makes sure we fit on a single |
---|
700 | 1260 | * page. |
---|
| 1261 | + * |
---|
| 1262 | + * Note that the caller must ensure that if the modified code is part of a |
---|
| 1263 | + * module, the module is not removed during poking. This can be achieved |
---|
| 1264 | + * by registering a module notifier, and ordering module removal and patching |
---|
| 1265 | + * through a mutex. |
---|
701 | 1266 | */ |
---|
702 | 1267 | void *text_poke(void *addr, const void *opcode, size_t len) |
---|
703 | 1268 | { |
---|
704 | | - unsigned long flags; |
---|
705 | | - char *vaddr; |
---|
706 | | - struct page *pages[2]; |
---|
707 | | - int i; |
---|
708 | | - |
---|
709 | | - /* |
---|
710 | | - * While boot memory allocator is runnig we cannot use struct |
---|
711 | | - * pages as they are not yet initialized. |
---|
712 | | - */ |
---|
713 | | - BUG_ON(!after_bootmem); |
---|
714 | | - |
---|
715 | 1269 | lockdep_assert_held(&text_mutex); |
---|
716 | 1270 | |
---|
717 | | - if (!core_kernel_text((unsigned long)addr)) { |
---|
718 | | - pages[0] = vmalloc_to_page(addr); |
---|
719 | | - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); |
---|
720 | | - } else { |
---|
721 | | - pages[0] = virt_to_page(addr); |
---|
722 | | - WARN_ON(!PageReserved(pages[0])); |
---|
723 | | - pages[1] = virt_to_page(addr + PAGE_SIZE); |
---|
724 | | - } |
---|
725 | | - BUG_ON(!pages[0]); |
---|
726 | | - local_irq_save(flags); |
---|
727 | | - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); |
---|
728 | | - if (pages[1]) |
---|
729 | | - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); |
---|
730 | | - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); |
---|
731 | | - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); |
---|
732 | | - clear_fixmap(FIX_TEXT_POKE0); |
---|
733 | | - if (pages[1]) |
---|
734 | | - clear_fixmap(FIX_TEXT_POKE1); |
---|
735 | | - local_flush_tlb(); |
---|
736 | | - sync_core(); |
---|
737 | | - /* Could also do a CLFLUSH here to speed up CPU recovery; but |
---|
738 | | - that causes hangs on some VIA CPUs. */ |
---|
739 | | - for (i = 0; i < len; i++) |
---|
740 | | - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); |
---|
741 | | - local_irq_restore(flags); |
---|
742 | | - return addr; |
---|
| 1271 | + return __text_poke(addr, opcode, len); |
---|
| 1272 | +} |
---|
| 1273 | + |
---|
| 1274 | +/** |
---|
| 1275 | + * text_poke_kgdb - Update instructions on a live kernel by kgdb |
---|
| 1276 | + * @addr: address to modify |
---|
| 1277 | + * @opcode: source of the copy |
---|
| 1278 | + * @len: length to copy |
---|
| 1279 | + * |
---|
| 1280 | + * Only atomic text poke/set should be allowed when not doing early patching. |
---|
| 1281 | + * It means the size must be writable atomically and the address must be aligned |
---|
| 1282 | + * in a way that permits an atomic write. It also makes sure we fit on a single |
---|
| 1283 | + * page. |
---|
| 1284 | + * |
---|
| 1285 | + * Context: should only be used by kgdb, which ensures no other core is running, |
---|
| 1286 | + * despite the fact it does not hold the text_mutex. |
---|
| 1287 | + */ |
---|
| 1288 | +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) |
---|
| 1289 | +{ |
---|
| 1290 | + return __text_poke(addr, opcode, len); |
---|
743 | 1291 | } |
---|
744 | 1292 | |
---|
745 | 1293 | static void do_sync_core(void *info) |
---|
.. | .. |
---|
747 | 1295 | sync_core(); |
---|
748 | 1296 | } |
---|
749 | 1297 | |
---|
750 | | -static bool bp_patching_in_progress; |
---|
751 | | -static void *bp_int3_handler, *bp_int3_addr; |
---|
752 | | - |
---|
753 | | -int poke_int3_handler(struct pt_regs *regs) |
---|
| 1298 | +void text_poke_sync(void) |
---|
754 | 1299 | { |
---|
| 1300 | + on_each_cpu(do_sync_core, NULL, 1); |
---|
| 1301 | +} |
---|
| 1302 | + |
---|
| 1303 | +struct text_poke_loc { |
---|
| 1304 | + /* addr := _stext + rel_addr */ |
---|
| 1305 | + s32 rel_addr; |
---|
| 1306 | + s32 disp; |
---|
| 1307 | + u8 len; |
---|
| 1308 | + u8 opcode; |
---|
| 1309 | + const u8 text[POKE_MAX_OPCODE_SIZE]; |
---|
| 1310 | + /* see text_poke_bp_batch() */ |
---|
| 1311 | + u8 old; |
---|
| 1312 | +}; |
---|
| 1313 | + |
---|
| 1314 | +struct bp_patching_desc { |
---|
| 1315 | + struct text_poke_loc *vec; |
---|
| 1316 | + int nr_entries; |
---|
| 1317 | + atomic_t refs; |
---|
| 1318 | +}; |
---|
| 1319 | + |
---|
| 1320 | +static struct bp_patching_desc bp_desc; |
---|
| 1321 | + |
---|
| 1322 | +static __always_inline |
---|
| 1323 | +struct bp_patching_desc *try_get_desc(void) |
---|
| 1324 | +{ |
---|
| 1325 | + struct bp_patching_desc *desc = &bp_desc; |
---|
| 1326 | + |
---|
| 1327 | + if (!arch_atomic_inc_not_zero(&desc->refs)) |
---|
| 1328 | + return NULL; |
---|
| 1329 | + |
---|
| 1330 | + return desc; |
---|
| 1331 | +} |
---|
| 1332 | + |
---|
| 1333 | +static __always_inline void put_desc(void) |
---|
| 1334 | +{ |
---|
| 1335 | + struct bp_patching_desc *desc = &bp_desc; |
---|
| 1336 | + |
---|
| 1337 | + smp_mb__before_atomic(); |
---|
| 1338 | + arch_atomic_dec(&desc->refs); |
---|
| 1339 | +} |
---|
| 1340 | + |
---|
| 1341 | +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) |
---|
| 1342 | +{ |
---|
| 1343 | + return _stext + tp->rel_addr; |
---|
| 1344 | +} |
---|
| 1345 | + |
---|
| 1346 | +static __always_inline int patch_cmp(const void *key, const void *elt) |
---|
| 1347 | +{ |
---|
| 1348 | + struct text_poke_loc *tp = (struct text_poke_loc *) elt; |
---|
| 1349 | + |
---|
| 1350 | + if (key < text_poke_addr(tp)) |
---|
| 1351 | + return -1; |
---|
| 1352 | + if (key > text_poke_addr(tp)) |
---|
| 1353 | + return 1; |
---|
| 1354 | + return 0; |
---|
| 1355 | +} |
---|
| 1356 | + |
---|
| 1357 | +noinstr int poke_int3_handler(struct pt_regs *regs) |
---|
| 1358 | +{ |
---|
| 1359 | + struct bp_patching_desc *desc; |
---|
| 1360 | + struct text_poke_loc *tp; |
---|
| 1361 | + int ret = 0; |
---|
| 1362 | + void *ip; |
---|
| 1363 | + |
---|
| 1364 | + if (user_mode(regs)) |
---|
| 1365 | + return 0; |
---|
| 1366 | + |
---|
755 | 1367 | /* |
---|
756 | 1368 | * Having observed our INT3 instruction, we now must observe |
---|
757 | | - * bp_patching_in_progress. |
---|
| 1369 | + * bp_desc with non-zero refcount: |
---|
758 | 1370 | * |
---|
759 | | - * in_progress = TRUE INT3 |
---|
760 | | - * WMB RMB |
---|
761 | | - * write INT3 if (in_progress) |
---|
762 | | - * |
---|
763 | | - * Idem for bp_int3_handler. |
---|
| 1371 | + * bp_desc.refs = 1 INT3 |
---|
| 1372 | + * WMB RMB |
---|
| 1373 | + * write INT3 if (bp_desc.refs != 0) |
---|
764 | 1374 | */ |
---|
765 | 1375 | smp_rmb(); |
---|
766 | 1376 | |
---|
767 | | - if (likely(!bp_patching_in_progress)) |
---|
| 1377 | + desc = try_get_desc(); |
---|
| 1378 | + if (!desc) |
---|
768 | 1379 | return 0; |
---|
769 | 1380 | |
---|
770 | | - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) |
---|
771 | | - return 0; |
---|
| 1381 | + /* |
---|
| 1382 | + * Discount the INT3. See text_poke_bp_batch(). |
---|
| 1383 | + */ |
---|
| 1384 | + ip = (void *) regs->ip - INT3_INSN_SIZE; |
---|
772 | 1385 | |
---|
773 | | - /* set up the specified breakpoint handler */ |
---|
774 | | - regs->ip = (unsigned long) bp_int3_handler; |
---|
| 1386 | + /* |
---|
| 1387 | + * Skip the binary search if there is a single member in the vector. |
---|
| 1388 | + */ |
---|
| 1389 | + if (unlikely(desc->nr_entries > 1)) { |
---|
| 1390 | + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, |
---|
| 1391 | + sizeof(struct text_poke_loc), |
---|
| 1392 | + patch_cmp); |
---|
| 1393 | + if (!tp) |
---|
| 1394 | + goto out_put; |
---|
| 1395 | + } else { |
---|
| 1396 | + tp = desc->vec; |
---|
| 1397 | + if (text_poke_addr(tp) != ip) |
---|
| 1398 | + goto out_put; |
---|
| 1399 | + } |
---|
775 | 1400 | |
---|
776 | | - return 1; |
---|
| 1401 | + ip += tp->len; |
---|
777 | 1402 | |
---|
| 1403 | + switch (tp->opcode) { |
---|
| 1404 | + case INT3_INSN_OPCODE: |
---|
| 1405 | + /* |
---|
| 1406 | + * Someone poked an explicit INT3, they'll want to handle it, |
---|
| 1407 | + * do not consume. |
---|
| 1408 | + */ |
---|
| 1409 | + goto out_put; |
---|
| 1410 | + |
---|
| 1411 | + case RET_INSN_OPCODE: |
---|
| 1412 | + int3_emulate_ret(regs); |
---|
| 1413 | + break; |
---|
| 1414 | + |
---|
| 1415 | + case CALL_INSN_OPCODE: |
---|
| 1416 | + int3_emulate_call(regs, (long)ip + tp->disp); |
---|
| 1417 | + break; |
---|
| 1418 | + |
---|
| 1419 | + case JMP32_INSN_OPCODE: |
---|
| 1420 | + case JMP8_INSN_OPCODE: |
---|
| 1421 | + int3_emulate_jmp(regs, (long)ip + tp->disp); |
---|
| 1422 | + break; |
---|
| 1423 | + |
---|
| 1424 | + default: |
---|
| 1425 | + BUG(); |
---|
| 1426 | + } |
---|
| 1427 | + |
---|
| 1428 | + ret = 1; |
---|
| 1429 | + |
---|
| 1430 | +out_put: |
---|
| 1431 | + put_desc(); |
---|
| 1432 | + return ret; |
---|
| 1433 | +} |
---|
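To make the emulation above concrete, for a queued CALL the handler reconstructs the finished instruction's view of the instruction pointer (a sketch of the arithmetic, not extra code):

```c
/*
 *   ip     = (regs->ip - INT3_INSN_SIZE) + tp->len;  // end of the insn
 *   target = ip + tp->disp;                          // disp is end-relative
 *   int3_emulate_call(regs, target);                 // push ip, jump to target
 */
```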
| 1434 | + |
---|
| 1435 | +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) |
---|
| 1436 | +static struct text_poke_loc tp_vec[TP_VEC_MAX]; |
---|
| 1437 | +static int tp_vec_nr; |
---|
| 1438 | + |
---|
| 1439 | +/** |
---|
| 1440 | + * text_poke_bp_batch() -- update instructions on live kernel on SMP |
---|
| 1441 | + * @tp: vector of instructions to patch |
---|
| 1442 | + * @nr_entries: number of entries in the vector |
---|
| 1443 | + * |
---|
| 1444 | + * Modify multi-byte instructions by using an int3 breakpoint on SMP. |
---|
| 1445 | + * We completely avoid stop_machine() here, and achieve the |
---|
| 1446 | + * synchronization using int3 breakpoint. |
---|
| 1447 | + * |
---|
| 1448 | + * The way it is done: |
---|
| 1449 | + * - For each entry in the vector: |
---|
| 1450 | + * - add an int3 trap to the address that will be patched |
---|
| 1451 | + * - sync cores |
---|
| 1452 | + * - For each entry in the vector: |
---|
| 1453 | + * - update all but the first byte of the patched range |
---|
| 1454 | + * - sync cores |
---|
| 1455 | + * - For each entry in the vector: |
---|
| 1456 | + * - replace the first byte (int3) by the first byte of |
---|
| 1457 | + * replacing opcode |
---|
| 1458 | + * - sync cores |
---|
| 1459 | + */ |
---|
| 1460 | +static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) |
---|
| 1461 | +{ |
---|
| 1462 | + unsigned char int3 = INT3_INSN_OPCODE; |
---|
| 1463 | + unsigned int i; |
---|
| 1464 | + int do_sync; |
---|
| 1465 | + |
---|
| 1466 | + lockdep_assert_held(&text_mutex); |
---|
| 1467 | + |
---|
| 1468 | + bp_desc.vec = tp; |
---|
| 1469 | + bp_desc.nr_entries = nr_entries; |
---|
| 1470 | + |
---|
| 1471 | + /* |
---|
| 1472 | + * Corresponds to the implicit memory barrier in try_get_desc() to |
---|
| 1473 | + * ensure reading a non-zero refcount provides up to date bp_desc data. |
---|
| 1474 | + */ |
---|
| 1475 | + atomic_set_release(&bp_desc.refs, 1); |
---|
| 1476 | + |
---|
| 1477 | + /* |
---|
| 1478 | + * Corresponding read barrier in int3 notifier for making sure the |
---|
| 1479 | + * nr_entries and handler are correctly ordered wrt. patching. |
---|
| 1480 | + */ |
---|
| 1481 | + smp_wmb(); |
---|
| 1482 | + |
---|
| 1483 | + /* |
---|
| 1484 | + * First step: add an int3 trap to the address that will be patched. |
---|
| 1485 | + */ |
---|
| 1486 | + for (i = 0; i < nr_entries; i++) { |
---|
| 1487 | + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); |
---|
| 1488 | + text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); |
---|
| 1489 | + } |
---|
| 1490 | + |
---|
| 1491 | + text_poke_sync(); |
---|
| 1492 | + |
---|
| 1493 | + /* |
---|
| 1494 | + * Second step: update all but the first byte of the patched range. |
---|
| 1495 | + */ |
---|
| 1496 | + for (do_sync = 0, i = 0; i < nr_entries; i++) { |
---|
| 1497 | + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; |
---|
| 1498 | + int len = tp[i].len; |
---|
| 1499 | + |
---|
| 1500 | + if (len - INT3_INSN_SIZE > 0) { |
---|
| 1501 | + memcpy(old + INT3_INSN_SIZE, |
---|
| 1502 | + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, |
---|
| 1503 | + len - INT3_INSN_SIZE); |
---|
| 1504 | + text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, |
---|
| 1505 | + (const char *)tp[i].text + INT3_INSN_SIZE, |
---|
| 1506 | + len - INT3_INSN_SIZE); |
---|
| 1507 | + do_sync++; |
---|
| 1508 | + } |
---|
| 1509 | + |
---|
| 1510 | + /* |
---|
| 1511 | + * Emit a perf event to record the text poke, primarily to |
---|
| 1512 | + * support Intel PT decoding which must walk the executable code |
---|
| 1513 | + * to reconstruct the trace. The flow up to here is: |
---|
| 1514 | + * - write INT3 byte |
---|
| 1515 | + * - IPI-SYNC |
---|
| 1516 | + * - write instruction tail |
---|
| 1517 | + * At this point the actual control flow will be through the |
---|
| 1518 | + * INT3 and handler and not hit the old or new instruction. |
---|
| 1519 | + * Intel PT outputs FUP/TIP packets for the INT3, so the flow |
---|
| 1520 | + * can still be decoded. Subsequently: |
---|
| 1521 | + * - emit RECORD_TEXT_POKE with the new instruction |
---|
| 1522 | + * - IPI-SYNC |
---|
| 1523 | + * - write first byte |
---|
| 1524 | + * - IPI-SYNC |
---|
| 1525 | + * So before the text poke event timestamp, the decoder will see |
---|
| 1526 | + * either the old instruction flow or FUP/TIP of INT3. After the |
---|
| 1527 | + * text poke event timestamp, the decoder will see either the |
---|
| 1528 | + * new instruction flow or FUP/TIP of INT3. Thus decoders can |
---|
| 1529 | + * use the timestamp as the point at which to modify the |
---|
| 1530 | + * executable code. |
---|
| 1531 | + * The old instruction is recorded so that the event can be |
---|
| 1532 | + * processed forwards or backwards. |
---|
| 1533 | + */ |
---|
| 1534 | + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, |
---|
| 1535 | + tp[i].text, len); |
---|
| 1536 | + } |
---|
| 1537 | + |
---|
| 1538 | + if (do_sync) { |
---|
| 1539 | + /* |
---|
| 1540 | + * According to Intel, this core syncing is very likely |
---|
| 1541 | + * not necessary and we'd be safe even without it. But |
---|
| 1542 | + * better safe than sorry (plus there's not only Intel). |
---|
| 1543 | + */ |
---|
| 1544 | + text_poke_sync(); |
---|
| 1545 | + } |
---|
| 1546 | + |
---|
| 1547 | + /* |
---|
| 1548 | + * Third step: replace the first byte (int3) by the first byte of |
---|
| 1549 | + * replacing opcode. |
---|
| 1550 | + */ |
---|
| 1551 | + for (do_sync = 0, i = 0; i < nr_entries; i++) { |
---|
| 1552 | + if (tp[i].text[0] == INT3_INSN_OPCODE) |
---|
| 1553 | + continue; |
---|
| 1554 | + |
---|
| 1555 | + text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); |
---|
| 1556 | + do_sync++; |
---|
| 1557 | + } |
---|
| 1558 | + |
---|
| 1559 | + if (do_sync) |
---|
| 1560 | + text_poke_sync(); |
---|
| 1561 | + |
---|
| 1562 | + /* |
---|
| 1563 | + * Remove and wait for refs to be zero. |
---|
| 1564 | + */ |
---|
| 1565 | + if (!atomic_dec_and_test(&bp_desc.refs)) |
---|
| 1566 | + atomic_cond_read_acquire(&bp_desc.refs, !VAL); |
---|
| 1567 | +} |
---|
| 1568 | + |
---|
| 1569 | +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, |
---|
| 1570 | + const void *opcode, size_t len, const void *emulate) |
---|
| 1571 | +{ |
---|
| 1572 | + struct insn insn; |
---|
| 1573 | + int ret, i; |
---|
| 1574 | + |
---|
| 1575 | + memcpy((void *)tp->text, opcode, len); |
---|
| 1576 | + if (!emulate) |
---|
| 1577 | + emulate = opcode; |
---|
| 1578 | + |
---|
| 1579 | + ret = insn_decode_kernel(&insn, emulate); |
---|
| 1580 | + BUG_ON(ret < 0); |
---|
| 1581 | + |
---|
| 1582 | + tp->rel_addr = addr - (void *)_stext; |
---|
| 1583 | + tp->len = len; |
---|
| 1584 | + tp->opcode = insn.opcode.bytes[0]; |
---|
| 1585 | + |
---|
| 1586 | + switch (tp->opcode) { |
---|
| 1587 | + case RET_INSN_OPCODE: |
---|
| 1588 | + case JMP32_INSN_OPCODE: |
---|
| 1589 | + case JMP8_INSN_OPCODE: |
---|
| 1590 | + /* |
---|
| 1591 | + * Control flow instructions without implied execution of the |
---|
| 1592 | + * next instruction can be padded with INT3. |
---|
| 1593 | + */ |
---|
| 1594 | + for (i = insn.length; i < len; i++) |
---|
| 1595 | + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); |
---|
| 1596 | + break; |
---|
| 1597 | + |
---|
| 1598 | + default: |
---|
| 1599 | + BUG_ON(len != insn.length); |
---|
| 1600 | + } |
---|
| 1601 | + |
---|
| 1602 | + |
---|
| 1603 | + switch (tp->opcode) { |
---|
| 1604 | + case INT3_INSN_OPCODE: |
---|
| 1605 | + case RET_INSN_OPCODE: |
---|
| 1606 | + break; |
---|
| 1607 | + |
---|
| 1608 | + case CALL_INSN_OPCODE: |
---|
| 1609 | + case JMP32_INSN_OPCODE: |
---|
| 1610 | + case JMP8_INSN_OPCODE: |
---|
| 1611 | + tp->disp = insn.immediate.value; |
---|
| 1612 | + break; |
---|
| 1613 | + |
---|
| 1614 | + default: /* assume NOP */ |
---|
| 1615 | + switch (len) { |
---|
| 1616 | + case 2: /* NOP2 -- emulate as JMP8+0 */ |
---|
| 1617 | + BUG_ON(memcmp(emulate, ideal_nops[len], len)); |
---|
| 1618 | + tp->opcode = JMP8_INSN_OPCODE; |
---|
| 1619 | + tp->disp = 0; |
---|
| 1620 | + break; |
---|
| 1621 | + |
---|
| 1622 | + case 5: /* NOP5 -- emulate as JMP32+0 */ |
---|
| 1623 | + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); |
---|
| 1624 | + tp->opcode = JMP32_INSN_OPCODE; |
---|
| 1625 | + tp->disp = 0; |
---|
| 1626 | + break; |
---|
| 1627 | + |
---|
| 1628 | + default: /* unknown instruction */ |
---|
| 1629 | + BUG(); |
---|
| 1630 | + } |
---|
| 1631 | + break; |
---|
| 1632 | + } |
---|
| 1633 | +} |
---|
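The NOP fallback above works because a JMP with zero displacement is a semantic NOP, so poke_int3_handler() simply jumps to the next instruction while the poke is in flight (byte patterns assume the p6_nops table):

```c
/*
 *   66 90            NOP2  -> emulated as  eb 00           JMP8  +0
 *   0f 1f 44 00 00   NOP5  -> emulated as  e9 00 00 00 00  JMP32 +0
 */
```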
| 1634 | + |
---|
| 1635 | +/* |
---|
| 1636 | + * We hard rely on the tp_vec being ordered; ensure this is so by flushing |
---|
| 1637 | + * early if needed. |
---|
| 1638 | + */ |
---|
| 1639 | +static bool tp_order_fail(void *addr) |
---|
| 1640 | +{ |
---|
| 1641 | + struct text_poke_loc *tp; |
---|
| 1642 | + |
---|
| 1643 | + if (!tp_vec_nr) |
---|
| 1644 | + return false; |
---|
| 1645 | + |
---|
| 1646 | + if (!addr) /* force */ |
---|
| 1647 | + return true; |
---|
| 1648 | + |
---|
| 1649 | + tp = &tp_vec[tp_vec_nr - 1]; |
---|
| 1650 | + if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) |
---|
| 1651 | + return true; |
---|
| 1652 | + |
---|
| 1653 | + return false; |
---|
| 1654 | +} |
---|
| 1655 | + |
---|
| 1656 | +static void text_poke_flush(void *addr) |
---|
| 1657 | +{ |
---|
| 1658 | + if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { |
---|
| 1659 | + text_poke_bp_batch(tp_vec, tp_vec_nr); |
---|
| 1660 | + tp_vec_nr = 0; |
---|
| 1661 | + } |
---|
| 1662 | +} |
---|
| 1663 | + |
---|
| 1664 | +void text_poke_finish(void) |
---|
| 1665 | +{ |
---|
| 1666 | + text_poke_flush(NULL); |
---|
| 1667 | +} |
---|
| 1668 | + |
---|
| 1669 | +void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) |
---|
| 1670 | +{ |
---|
| 1671 | + struct text_poke_loc *tp; |
---|
| 1672 | + |
---|
| 1673 | + if (unlikely(system_state == SYSTEM_BOOTING)) { |
---|
| 1674 | + text_poke_early(addr, opcode, len); |
---|
| 1675 | + return; |
---|
| 1676 | + } |
---|
| 1677 | + |
---|
| 1678 | + text_poke_flush(addr); |
---|
| 1679 | + |
---|
| 1680 | + tp = &tp_vec[tp_vec_nr++]; |
---|
| 1681 | + text_poke_loc_init(tp, addr, opcode, len, emulate); |
---|
778 | 1682 | } |
---|
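A hypothetical caller sketch (the helper and its addr/insn parameters are ours, not the kernel's): queue pokes in ascending address order so tp_vec stays bsearch-able, then flush the whole batch, paying at most three IPI-syncs total rather than three per site:

```c
/* Hypothetical helper; assumes <linux/memory.h> and <asm/text-patching.h>. */
static void patch_two_sites(void *addr_lo, const u8 *insn_lo, size_t len_lo,
			    void *addr_hi, const u8 *insn_hi, size_t len_hi)
{
	mutex_lock(&text_mutex);
	text_poke_queue(addr_lo, insn_lo, len_lo, NULL);
	text_poke_queue(addr_hi, insn_hi, len_hi, NULL);	/* addr_hi > addr_lo */
	text_poke_finish();
	mutex_unlock(&text_mutex);
}
```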
779 | 1683 | |
---|
780 | 1684 | /** |
---|
.. | .. |
---|
784 | 1688 | * @len: length to copy |
---|
785 | | - * @handler: address to jump to when the temporary breakpoint is hit |
---|
| 1689 | + * @emulate: instruction to be emulated |
---|
786 | 1690 | * |
---|
787 | | - * Modify multi-byte instruction by using int3 breakpoint on SMP. |
---|
788 | | - * We completely avoid stop_machine() here, and achieve the |
---|
789 | | - * synchronization using int3 breakpoint. |
---|
790 | | - * |
---|
791 | | - * The way it is done: |
---|
792 | | - * - add a int3 trap to the address that will be patched |
---|
793 | | - * - sync cores |
---|
794 | | - * - update all but the first byte of the patched range |
---|
795 | | - * - sync cores |
---|
796 | | - * - replace the first byte (int3) by the first byte of |
---|
797 | | - * replacing opcode |
---|
798 | | - * - sync cores |
---|
| 1691 | + * Update a single instruction with the vector in the stack, avoiding |
---|
| 1692 | + * dynamically allocated memory. This function should be used when it is |
---|
| 1693 | + * not possible to allocate memory. |
---|
799 | 1694 | */ |
---|
800 | | -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) |
---|
| 1695 | +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) |
---|
801 | 1696 | { |
---|
802 | | - unsigned char int3 = 0xcc; |
---|
| 1697 | + struct text_poke_loc tp; |
---|
803 | 1698 | |
---|
804 | | - bp_int3_handler = handler; |
---|
805 | | - bp_int3_addr = (u8 *)addr + sizeof(int3); |
---|
806 | | - bp_patching_in_progress = true; |
---|
807 | | - |
---|
808 | | - lockdep_assert_held(&text_mutex); |
---|
809 | | - |
---|
810 | | - /* |
---|
811 | | - * Corresponding read barrier in int3 notifier for making sure the |
---|
812 | | - * in_progress and handler are correctly ordered wrt. patching. |
---|
813 | | - */ |
---|
814 | | - smp_wmb(); |
---|
815 | | - |
---|
816 | | - text_poke(addr, &int3, sizeof(int3)); |
---|
817 | | - |
---|
818 | | - on_each_cpu(do_sync_core, NULL, 1); |
---|
819 | | - |
---|
820 | | - if (len - sizeof(int3) > 0) { |
---|
821 | | - /* patch all but the first byte */ |
---|
822 | | - text_poke((char *)addr + sizeof(int3), |
---|
823 | | - (const char *) opcode + sizeof(int3), |
---|
824 | | - len - sizeof(int3)); |
---|
825 | | - /* |
---|
826 | | - * According to Intel, this core syncing is very likely |
---|
827 | | - * not necessary and we'd be safe even without it. But |
---|
828 | | - * better safe than sorry (plus there's not only Intel). |
---|
829 | | - */ |
---|
830 | | - on_each_cpu(do_sync_core, NULL, 1); |
---|
| 1699 | + if (unlikely(system_state == SYSTEM_BOOTING)) { |
---|
| 1700 | + text_poke_early(addr, opcode, len); |
---|
| 1701 | + return; |
---|
831 | 1702 | } |
---|
832 | 1703 | |
---|
833 | | - /* patch the first byte */ |
---|
834 | | - text_poke(addr, opcode, sizeof(int3)); |
---|
835 | | - |
---|
836 | | - on_each_cpu(do_sync_core, NULL, 1); |
---|
837 | | - /* |
---|
838 | | - * sync_core() implies an smp_mb() and orders this store against |
---|
839 | | - * the writing of the new instruction. |
---|
840 | | - */ |
---|
841 | | - bp_patching_in_progress = false; |
---|
842 | | - |
---|
843 | | - return addr; |
---|
| 1704 | + text_poke_loc_init(&tp, addr, opcode, len, emulate); |
---|
| 1705 | + text_poke_bp_batch(&tp, 1); |
---|
844 | 1706 | } |
---|
845 | | - |
---|
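Finally, a hypothetical single-site use of text_poke_bp(): redirect a known 5-byte CALL on a live kernel. The function, @site and @new_target are illustrative, not kernel symbols:

```c
/* Hypothetical sketch; a real caller must know @site holds a 5-byte CALL. */
static void redirect_call_site(void *site, void *new_target)
{
	u8 insn[5] = { CALL_INSN_OPCODE, };	/* 0xe8 */

	/* rel32 is relative to the end of the 5-byte CALL. */
	*(s32 *)&insn[1] = (s32)((long)new_target - ((long)site + 5));

	mutex_lock(&text_mutex);		/* text_poke_bp_batch() asserts this */
	text_poke_bp(site, insn, 5, NULL);	/* NULL: emulate @opcode itself */
	mutex_unlock(&text_mutex);
}
```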