| .. | .. |
| 2 | 2 | |
| 3 | 3 | #include <linux/context_tracking.h> |
| 4 | 4 | #include <linux/entry-common.h> |
| 5 | +#include <linux/irq_pipeline.h> |
| 5 | 6 | #include <linux/livepatch.h> |
| 6 | 7 | #include <linux/audit.h> |
| 7 | 8 | |
| .. | .. |
| 71 | 72 | return ret ? : syscall; |
| 72 | 73 | } |
| 73 | 74 | |
| 75 | +static __always_inline void |
| 76 | +syscall_enter_from_user_enable_irqs(void) |
| 77 | +{ |
| 78 | + if (running_inband()) { |
| 79 | + /* |
| 80 | + * If pipelining interrupts, prepare for emulating a |
| 81 | + * stall -> unstall transition (we are currently |
| 82 | + * unstalled), fixing up the IRQ trace state in order |
| 83 | + * to keep lockdep happy (and silent). |
| 84 | + */ |
| 85 | + stall_inband_nocheck(); |
| 86 | + hard_cond_local_irq_enable(); |
| 87 | + local_irq_enable(); |
| 88 | + } else { |
| 89 | + /* |
| 90 | + * We are running on the out-of-band stage, don't mess |
| 91 | + * with the in-band interrupt state. This is none of |
| 92 | + * our business. We may manipulate the hardware state |
| 93 | + * only. |
| 94 | + */ |
| 95 | + hard_local_irq_enable(); |
| 96 | + } |
| 97 | +} |
| 98 | + |
| 74 | 99 | static __always_inline long |
| 75 | 100 | __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) |
| 76 | 101 | { |
| 77 | 102 | unsigned long ti_work; |
| 103 | + int ret; |
| 104 | + |
| 105 | + /* |
| 106 | + * Pipeline the syscall to the companion core if the current |
| 107 | + * task wants this. Compiled out if not dovetailing. |
| 108 | + */ |
| 109 | + ret = pipeline_syscall(syscall, regs); |
| 110 | + if (ret > 0) /* out-of-band, bail out. */ |
| 111 | + return EXIT_SYSCALL_OOB; |
| 112 | + if (ret < 0) /* in-band, tail work only. */ |
| 113 | + return EXIT_SYSCALL_TAIL; |
| 78 | 114 | |
| 79 | 115 | ti_work = READ_ONCE(current_thread_info()->flags); |
| 80 | 116 | if (ti_work & SYSCALL_ENTER_WORK) |
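
The hunk above makes `__syscall_enter_from_user_work()` branch three ways on `pipeline_syscall()`: a positive return means the companion core handled the syscall out-of-band, a negative one means only tail work is wanted, and zero falls through to the usual `SYSCALL_ENTER_WORK` handling. The following standalone sketch models that tri-state dispatch in userspace; all `demo_*` names are invented for illustration, only the sign convention mirrors the patch.

```c
/* Userspace model of the tri-state syscall dispatch; not kernel code. */
#include <stdio.h>

enum demo_exit {
	DEMO_EXIT_OOB,	/* handled out-of-band, bail out early */
	DEMO_EXIT_TAIL,	/* in-band, run tail work only */
	DEMO_EXIT_WORK,	/* regular in-band path, run entry work */
};

/* Stand-in for pipeline_syscall(): >0 oob, <0 in-band tail, 0 proceed. */
static int demo_pipeline_syscall(long nr)
{
	if (nr >= 1000)		/* pretend high numbers go to the companion core */
		return 1;
	if (nr == 42)		/* pretend this one only needs tail work */
		return -1;
	return 0;
}

static enum demo_exit demo_enter(long nr)
{
	int ret = demo_pipeline_syscall(nr);

	if (ret > 0)
		return DEMO_EXIT_OOB;
	if (ret < 0)
		return DEMO_EXIT_TAIL;
	return DEMO_EXIT_WORK;	/* fall through to the entry-work handling */
}

int main(void)
{
	long nrs[] = { 1, 42, 1001 };

	for (unsigned int i = 0; i < sizeof(nrs) / sizeof(nrs[0]); i++)
		printf("syscall %ld -> %d\n", nrs[i], demo_enter(nrs[i]));
	return 0;
}
```
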
| .. | .. |
| 95 | 131 | enter_from_user_mode(regs); |
| 96 | 132 | |
| 97 | 133 | instrumentation_begin(); |
| 98 | | - local_irq_enable(); |
| 134 | + syscall_enter_from_user_enable_irqs(); |
| 99 | 135 | ret = __syscall_enter_from_user_work(regs, syscall); |
| 100 | 136 | instrumentation_end(); |
| 101 | 137 | |
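
`syscall_enter_from_user()` now calls `syscall_enter_from_user_enable_irqs()` instead of a plain `local_irq_enable()`. The sketch below models the two branches of that helper with plain booleans standing in for the virtual stall bit and the hardware IRQ line; it illustrates the ordering only and is not kernel code.

```c
/* Minimal userspace model of the IRQ-enable helper's two branches. */
#include <stdbool.h>
#include <stdio.h>

struct demo_cpu {
	bool inband_stalled;	/* virtual interrupt disable bit */
	bool hard_irqs_on;	/* hardware interrupt state */
};

static void demo_enable_irqs(struct demo_cpu *cpu, bool running_inband)
{
	if (running_inband) {
		/* Emulate a stall -> unstall transition so the tracer
		 * (lockdep in the kernel) sees a consistent sequence. */
		cpu->inband_stalled = true;	/* stall_inband_nocheck() */
		cpu->hard_irqs_on = true;	/* hard_cond_local_irq_enable() */
		cpu->inband_stalled = false;	/* local_irq_enable() unstalls */
	} else {
		/* Out-of-band: touch only the hardware state. */
		cpu->hard_irqs_on = true;	/* hard_local_irq_enable() */
	}
}

int main(void)
{
	struct demo_cpu cpu = { .inband_stalled = false, .hard_irqs_on = false };

	demo_enable_irqs(&cpu, true);
	printf("in-band: stalled=%d hard_on=%d\n", cpu.inband_stalled, cpu.hard_irqs_on);

	cpu = (struct demo_cpu){ .inband_stalled = true, .hard_irqs_on = false };
	demo_enable_irqs(&cpu, false);
	printf("oob:     stalled=%d hard_on=%d\n", cpu.inband_stalled, cpu.hard_irqs_on);
	return 0;
}
```
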
| .. | .. |
| 106 | 142 | { |
| 107 | 143 | enter_from_user_mode(regs); |
| 108 | 144 | instrumentation_begin(); |
| 109 | | - local_irq_enable(); |
| 145 | + syscall_enter_from_user_enable_irqs(); |
| 110 | 146 | instrumentation_end(); |
| 111 | 147 | } |
| 112 | 148 | |
| .. | .. |
| 121 | 157 | * 3) Invoke architecture specific last minute exit code, e.g. speculation |
| 122 | 158 | * mitigations, etc. |
| 123 | 159 | * 4) Tell lockdep that interrupts are enabled |
| 160 | + * 5) Unstall the in-band stage of the interrupt pipeline if running in-band |
| 124 | 161 | */ |
| 125 | 162 | static __always_inline void exit_to_user_mode(void) |
| 126 | 163 | { |
| .. | .. |
| 132 | 169 | user_enter_irqoff(); |
| 133 | 170 | arch_exit_to_user_mode(); |
| 134 | 171 | lockdep_hardirqs_on(CALLER_ADDR0); |
| 172 | + if (running_inband()) |
| 173 | + unstall_inband(); |
| 135 | 174 | } |
| 136 | 175 | |
| 137 | 176 | /* Workaround to allow gradual conversion of architecture code */ |
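
`exit_to_user_mode()` gains step 5: after lockdep has been told that interrupts are on, the in-band stage is unstalled, but only when the task is actually running in-band. A toy model of that ordering follows; the step strings merely stand in for the real operations listed in the comment, and the stall bit is a plain boolean.

```c
/* Userspace model of the exit-to-user ordering; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool demo_inband_stalled = true;	/* stalled while exiting with IRQs off */

static void demo_step(const char *what) { printf("%s\n", what); }

static void demo_exit_to_user_mode(bool running_inband)
{
	demo_step("1) trace hardirqs on (prepare)");
	demo_step("2) invoke context tracking / user enter");
	demo_step("3) arch specific last-minute exit code");
	demo_step("4) tell lockdep that interrupts are enabled");
	/* 5) unstall the in-band stage, but only if we are in-band */
	if (running_inband)
		demo_inband_stalled = false;
}

int main(void)
{
	demo_exit_to_user_mode(true);
	printf("in-band stage stalled on return: %d\n", demo_inband_stalled);
	return 0;
}
```
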
| .. | .. |
| 155 | 194 | while (ti_work & EXIT_TO_USER_MODE_WORK) { |
| 156 | 195 | |
| 157 | 196 | local_irq_enable_exit_to_user(ti_work); |
| 197 | + |
| 198 | + /* |
| 199 | + * Check that local_irq_enable_exit_to_user() does the |
| 200 | + * right thing when pipelining. |
| 201 | + */ |
| 202 | + WARN_ON_ONCE(irq_pipeline_debug() && hard_irqs_disabled()); |
| 158 | 203 | |
| 159 | 204 | if (ti_work & _TIF_NEED_RESCHED) |
| 160 | 205 | schedule(); |
| .. | .. |
| 182 | 227 | * enabled above. |
| 183 | 228 | */ |
| 184 | 229 | local_irq_disable_exit_to_user(); |
| 230 | + WARN_ON_ONCE(irq_pipeline_debug() && !hard_irqs_disabled()); |
| 185 | 231 | ti_work = READ_ONCE(current_thread_info()->flags); |
| 186 | 232 | } |
| 187 | 233 | |
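
Both new `WARN_ON_ONCE()` checks assert that the virtual enable/disable done by `local_irq_enable_exit_to_user()` and `local_irq_disable_exit_to_user()` keeps the hardware state in sync when pipelining: hard IRQs must be on inside the work loop and off again after the final disable. A userspace model of that debug pattern, with a boolean for the hard IRQ line and a warn-once helper; all names are illustrative.

```c
/* Userspace model of the hard-vs-virtual IRQ state debug checks. */
#include <stdbool.h>
#include <stdio.h>

static bool hard_irqs_on;
static bool pipeline_debug = true;

static void warn_once(bool cond, const char *msg)
{
	static bool warned;

	if (pipeline_debug && cond && !warned) {
		warned = true;
		fprintf(stderr, "WARNING: %s\n", msg);
	}
}

static void demo_irq_enable_exit_to_user(void)  { hard_irqs_on = true; }
static void demo_irq_disable_exit_to_user(void) { hard_irqs_on = false; }

int main(void)
{
	demo_irq_enable_exit_to_user();
	warn_once(!hard_irqs_on, "hard IRQs still off in the work loop");

	/* ... handle TIF work here, e.g. reschedule or signals ... */

	demo_irq_disable_exit_to_user();
	warn_once(hard_irqs_on, "hard IRQs left on after the final disable");

	printf("checks passed\n");
	return 0;
}
```
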
| .. | .. |
| 189 | 235 | return ti_work; |
| 190 | 236 | } |
| 191 | 237 | |
| 238 | +static inline bool do_retuser(unsigned long ti_work) |
| 239 | +{ |
| 240 | + if (dovetailing() && (ti_work & _TIF_RETUSER)) { |
| 241 | + hard_local_irq_enable(); |
| 242 | + inband_retuser_notify(); |
| 243 | + hard_local_irq_disable(); |
| 244 | + /* RETUSER might have switched oob */ |
| 245 | + return running_inband(); |
| 246 | + } |
| 247 | + |
| 248 | + return false; |
| 249 | +} |
| 250 | + |
| 192 | 251 | static void exit_to_user_mode_prepare(struct pt_regs *regs) |
| 193 | 252 | { |
| 194 | | - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); |
| 253 | + unsigned long ti_work; |
| 254 | + |
| 255 | + check_hard_irqs_disabled(); |
| 195 | 256 | |
| 196 | 257 | lockdep_assert_irqs_disabled(); |
| 258 | +again: |
| 259 | + ti_work = READ_ONCE(current_thread_info()->flags); |
| 197 | 260 | |
| 198 | 261 | if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) |
| 199 | 262 | ti_work = exit_to_user_mode_loop(regs, ti_work); |
| 200 | 263 | |
| 201 | 264 | arch_exit_to_user_mode_prepare(regs, ti_work); |
| 265 | + |
| 266 | + if (do_retuser(ti_work)) |
| 267 | + goto again; |
| 202 | 268 | |
| 203 | 269 | /* Ensure that the address limit is intact and no locks are held */ |
| 204 | 270 | addr_limit_user_check(); |
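
`exit_to_user_mode_prepare()` now loops: if a `_TIF_RETUSER` notification was pending, `do_retuser()` delivers it with hard IRQs enabled and the work flags are re-read, since the notifier may have raised new work (and the task may even have switched out-of-band). A self-contained model of that retry; the flag values and the notifier body are invented.

```c
/* Userspace model of the RETUSER retry in exit_to_user_mode_prepare(). */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_TIF_WORK		0x1
#define DEMO_TIF_RETUSER	0x2

static unsigned long demo_flags = DEMO_TIF_WORK | DEMO_TIF_RETUSER;

static unsigned long demo_handle_work(unsigned long flags)
{
	/* Pretend the exit work consumed the generic work bit. */
	demo_flags &= ~DEMO_TIF_WORK;
	return demo_flags;
}

static bool demo_do_retuser(unsigned long flags)
{
	if (flags & DEMO_TIF_RETUSER) {
		/* Notifier runs with IRQs enabled and may queue new work. */
		demo_flags &= ~DEMO_TIF_RETUSER;
		demo_flags |= DEMO_TIF_WORK;
		return true;	/* still in-band: re-evaluate the work flags */
	}
	return false;
}

int main(void)
{
	unsigned long flags;
	int rounds = 0;

again:
	rounds++;
	flags = demo_flags;
	if (flags & DEMO_TIF_WORK)
		flags = demo_handle_work(flags);
	if (demo_do_retuser(flags))
		goto again;

	printf("exit prepared after %d round(s), flags=%#lx\n", rounds, demo_flags);
	return 0;
}
```
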
| .. | .. |
| 252 | 318 | |
| 253 | 319 | if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { |
| 254 | 320 | if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) |
| 255 | | - local_irq_enable(); |
| 321 | + local_irq_enable_full(); |
| 256 | 322 | } |
| 257 | 323 | |
| 258 | 324 | rseq_syscall(regs); |
| .. | .. |
| 261 | 327 | * Do one-time syscall specific work. If these work items are |
| 262 | 328 | * enabled, we want to run them exactly once per syscall exit with |
| 263 | 329 | * interrupts enabled. |
| 330 | + * |
| 331 | + * Dovetail: if this does not look like an in-band syscall, it |
| 332 | + * has to belong to the companion core. Typically, |
| 333 | + * __OOB_SYSCALL_BIT would be set in this value. Skip the |
| 334 | + * work for those syscalls. |
| 264 | 335 | */ |
| 265 | | - if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) |
| 336 | + if (unlikely((cached_flags & SYSCALL_EXIT_WORK) && |
| 337 | + (!irqs_pipelined() || |
| 338 | + syscall_get_nr(current, regs) < NR_syscalls))) |
| 266 | 339 | syscall_exit_work(regs, cached_flags); |
| 267 | 340 | } |
| 268 | 341 | |
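
The widened condition skips the one-time exit work for syscalls that carry the companion core's marker bit, whose numbers therefore fall outside `NR_syscalls`. Roughly, the filter behaves like the sketch below; the bit position and table size are made up for illustration.

```c
/* Userspace illustration of the exit-work filter for oob-marked syscalls. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_SYSCALLS	450
#define DEMO_OOB_SYSCALL_BIT	(1u << 28)	/* illustrative marker bit */

static bool demo_runs_exit_work(bool irqs_pipelined, unsigned int nr,
				unsigned int exit_work_flags)
{
	if (!exit_work_flags)
		return false;
	/* Without pipelining there is nothing to filter out. */
	if (!irqs_pipelined)
		return true;
	/* Only genuine in-band syscall numbers get the one-time exit work. */
	return nr < DEMO_NR_SYSCALLS;
}

int main(void)
{
	printf("in-band nr 17:     %d\n",
	       demo_runs_exit_work(true, 17, 0x1));
	printf("oob-marked nr 17:  %d\n",
	       demo_runs_exit_work(true, 17 | DEMO_OOB_SYSCALL_BIT, 0x1));
	return 0;
}
```
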
| .. | .. |
| 278 | 351 | |
| 279 | 352 | noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) |
| 280 | 353 | { |
| 354 | + WARN_ON_ONCE(irq_pipeline_debug() && irqs_disabled()); |
| 355 | + stall_inband_nocheck(); |
| 281 | 356 | enter_from_user_mode(regs); |
| 282 | 357 | } |
| 283 | 358 | |
| .. | .. |
| 293 | 368 | { |
| 294 | 369 | irqentry_state_t ret = { |
| 295 | 370 | .exit_rcu = false, |
| 371 | +#ifdef CONFIG_IRQ_PIPELINE |
| 372 | + .stage_info = IRQENTRY_INBAND_STALLED, |
| 373 | +#endif |
| 296 | 374 | }; |
| 297 | 375 | |
| 376 | +#ifdef CONFIG_IRQ_PIPELINE |
| 377 | + if (running_oob()) { |
| 378 | + WARN_ON_ONCE(irq_pipeline_debug() && oob_irqs_disabled()); |
| 379 | + ret.stage_info = IRQENTRY_OOB; |
| 380 | + return ret; |
| 381 | + } |
| 382 | +#endif |
| 383 | + |
| 298 | 384 | if (user_mode(regs)) { |
| 385 | +#ifdef CONFIG_IRQ_PIPELINE |
| 386 | + ret.stage_info = IRQENTRY_INBAND_UNSTALLED; |
| 387 | +#endif |
| 299 | 388 | irqentry_enter_from_user_mode(regs); |
| 300 | 389 | return ret; |
| 301 | 390 | } |
| 391 | + |
| 392 | +#ifdef CONFIG_IRQ_PIPELINE |
| 393 | + /* |
| 394 | + * IRQ pipeline: If we trapped from kernel space, the virtual |
| 395 | + * state may or may not match the hardware state. Since hard |
| 396 | + * irqs are off on entry, we have to stall the in-band stage. |
| 397 | + */ |
| 398 | + if (!test_and_stall_inband_nocheck()) |
| 399 | + ret.stage_info = IRQENTRY_INBAND_UNSTALLED; |
| 400 | +#endif |
| 302 | 401 | |
| 303 | 402 | /* |
| 304 | 403 | * If this entry hit the idle task invoke rcu_irq_enter() whether |
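
`irqentry_enter()` now records which stage the interrupt hit: out-of-band, in-band with the stage already stalled, or in-band unstalled (in which case the stage is stalled on the spot to mirror the hardware). The classification can be modeled in userspace as below; the enum values echo the patch, but `running_oob()`, `user_mode()` and the stall bit are plain parameters and booleans here.

```c
/* Userspace model of the entry-stage classification; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

enum demo_stage {
	DEMO_INBAND_STALLED,	/* in-band stage was already stalled on entry */
	DEMO_INBAND_UNSTALLED,	/* we stalled it ourselves (or came from user) */
	DEMO_OOB,		/* interrupt taken over the out-of-band stage */
};

static bool demo_inband_stalled;	/* virtual interrupt disable bit */

/* Returns the previous value, stalling the stage as a side effect. */
static bool demo_test_and_stall_inband(void)
{
	bool old = demo_inband_stalled;

	demo_inband_stalled = true;
	return old;
}

static enum demo_stage demo_irqentry_enter(bool running_oob, bool from_user)
{
	if (running_oob)
		return DEMO_OOB;

	if (from_user) {
		/* User context is always unstalled; stall it now. */
		demo_inband_stalled = true;
		return DEMO_INBAND_UNSTALLED;
	}

	/* Kernel trap: the virtual state may lag the hardware state. */
	return demo_test_and_stall_inband() ?
		DEMO_INBAND_STALLED : DEMO_INBAND_UNSTALLED;
}

int main(void)
{
	demo_inband_stalled = false;
	printf("kernel trap, unstalled -> %d\n", demo_irqentry_enter(false, false));
	printf("kernel trap, stalled   -> %d\n", demo_irqentry_enter(false, false));
	return 0;
}
```
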
| .. | .. |
| 366 | 465 | } |
| 367 | 466 | } |
| 368 | 467 | |
| 468 | +#ifdef CONFIG_IRQ_PIPELINE |
| 469 | + |
| 470 | +static inline |
| 471 | +bool irqexit_may_preempt_schedule(irqentry_state_t state, |
| 472 | + struct pt_regs *regs) |
| 473 | +{ |
| 474 | + return state.stage_info == IRQENTRY_INBAND_UNSTALLED; |
| 475 | +} |
| 476 | + |
| 477 | +#else |
| 478 | + |
| 479 | +static inline |
| 480 | +bool irqexit_may_preempt_schedule(irqentry_state_t state, |
| 481 | + struct pt_regs *regs) |
| 482 | +{ |
| 483 | + return !regs_irqs_disabled(regs); |
| 484 | +} |
| 485 | + |
| 486 | +#endif |
| 487 | + |
| 488 | +#ifdef CONFIG_IRQ_PIPELINE |
| 489 | + |
| 490 | +static bool irqentry_syncstage(irqentry_state_t state) /* hard irqs off */ |
| 491 | +{ |
| 492 | + /* |
| 493 | + * If pipelining interrupts, enable in-band IRQs then |
| 494 | + * synchronize the interrupt log on exit if: |
| 495 | + * |
| 496 | + * - irqentry_enter() stalled the stage in order to mirror the |
| 497 | + * hardware state. |
| 498 | + * |
| 499 | + * - we were coming from oob, thus went through a stage migration |
| 500 | + * that was caused by taking a CPU exception, e.g., a fault. |
| 501 | + * |
| 502 | + * We run before preempt_schedule_irq() may be called later on |
| 503 | + * by preemptible kernels, so that any rescheduling request |
| 504 | + * triggered by in-band IRQ handlers is considered. |
| 505 | + */ |
| 506 | + if (state.stage_info == IRQENTRY_INBAND_UNSTALLED || |
| 507 | + state.stage_info == IRQENTRY_OOB) { |
| 508 | + unstall_inband_nocheck(); |
| 509 | + synchronize_pipeline_on_irq(); |
| 510 | + stall_inband_nocheck(); |
| 511 | + return true; |
| 512 | + } |
| 513 | + |
| 514 | + return false; |
| 515 | +} |
| 516 | + |
| 517 | +static void irqentry_unstall(void) |
| 518 | +{ |
| 519 | + unstall_inband_nocheck(); |
| 520 | +} |
| 521 | + |
| 522 | +#else |
| 523 | + |
| 524 | +static bool irqentry_syncstage(irqentry_state_t state) |
| 525 | +{ |
| 526 | + return false; |
| 527 | +} |
| 528 | + |
| 529 | +static void irqentry_unstall(void) |
| 530 | +{ |
| 531 | +} |
| 532 | + |
| 533 | +#endif |
| 534 | + |
| 369 | 535 | noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) |
| 370 | 536 | { |
| 537 | + bool synchronized = false; |
| 538 | + |
| 539 | + if (running_oob()) |
| 540 | + return; |
| 541 | + |
| 371 | 542 | lockdep_assert_irqs_disabled(); |
| 372 | 543 | |
| 373 | 544 | /* Check whether this returns to user mode */ |
| 374 | 545 | if (user_mode(regs)) { |
| 375 | 546 | irqentry_exit_to_user_mode(regs); |
| 376 | | - } else if (!regs_irqs_disabled(regs)) { |
| 547 | + return; |
| 548 | + } |
| 549 | + |
| 550 | + synchronized = irqentry_syncstage(state); |
| 551 | + |
| 552 | + if (irqexit_may_preempt_schedule(state, regs)) { |
| 377 | 553 | /* |
| 378 | 554 | * If RCU was not watching on entry this needs to be done |
| 379 | 555 | * carefully and needs the same ordering of lockdep/tracing |
| .. | .. |
| 387 | 563 | instrumentation_end(); |
| 388 | 564 | rcu_irq_exit(); |
| 389 | 565 | lockdep_hardirqs_on(CALLER_ADDR0); |
| 390 | | - return; |
| 566 | + goto out; |
| 391 | 567 | } |
| 392 | 568 | |
| 393 | 569 | instrumentation_begin(); |
| .. | .. |
| 404 | 580 | if (state.exit_rcu) |
| 405 | 581 | rcu_irq_exit(); |
| 406 | 582 | } |
| 583 | + |
| 584 | +out: |
| 585 | + if (synchronized) |
| 586 | + irqentry_unstall(); |
| 587 | + |
| 588 | + return; |
| 407 | 589 | } |
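
The reworked `irqentry_exit()` pairs with the entry-side classification: when `irqentry_enter()` found the in-band stage unstalled (or the trap came from oob), `irqentry_syncstage()` briefly opens the stage to replay any interrupts logged while it was stalled, and the final `irqentry_unstall()` leaves it open again on return. A toy model of that pairing, with a counter standing in for the interrupt log; all helpers are illustrative.

```c
/* Userspace model of the sync-then-unstall pairing on IRQ exit. */
#include <stdbool.h>
#include <stdio.h>

enum demo_stage { DEMO_INBAND_STALLED, DEMO_INBAND_UNSTALLED, DEMO_OOB };

static bool demo_inband_stalled = true;	/* stalled during the handler */
static int demo_pending_irqs = 2;	/* IRQs logged while stalled */

static void demo_sync_pipeline(void)
{
	while (demo_pending_irqs > 0) {
		demo_pending_irqs--;
		printf("replaying one deferred in-band IRQ\n");
	}
}

static void demo_irqentry_exit(enum demo_stage stage)
{
	bool synchronized = false;

	if (stage == DEMO_INBAND_UNSTALLED || stage == DEMO_OOB) {
		/* Open the stage just long enough to flush the log. */
		demo_inband_stalled = false;
		demo_sync_pipeline();
		demo_inband_stalled = true;
		synchronized = true;
	}

	/* ... regular exit work (preemption, RCU, lockdep) goes here ... */

	if (synchronized)
		demo_inband_stalled = false;	/* leave the stage as we found it */
}

int main(void)
{
	demo_irqentry_exit(DEMO_INBAND_UNSTALLED);
	printf("stalled on return: %d, pending: %d\n",
	       demo_inband_stalled, demo_pending_irqs);
	return 0;
}
```
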
| 408 | 590 | |
| 409 | 591 | irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) |
|---|