 .. |  .. |
  2 |   2 |
  3 |   3 | #include <linux/context_tracking.h>
  4 |   4 | #include <linux/entry-common.h>
    |   5 | +#include <linux/irq_pipeline.h>
  5 |   6 | #include <linux/livepatch.h>
  6 |   7 | #include <linux/audit.h>
  7 |   8 |
 .. |  .. |
 71 |  72 |         return ret ? : syscall;
 72 |  73 | }
 73 |  74 |
    |  75 | +static __always_inline void
    |  76 | +syscall_enter_from_user_enable_irqs(void)
    |  77 | +{
    |  78 | +        if (running_inband()) {
    |  79 | +                /*
    |  80 | +                 * If pipelining interrupts, prepare for emulating a
    |  81 | +                 * stall -> unstall transition (we are currently
    |  82 | +                 * unstalled), fixing up the IRQ trace state in order
    |  83 | +                 * to keep lockdep happy (and silent).
    |  84 | +                 */
    |  85 | +                stall_inband_nocheck();
    |  86 | +                hard_cond_local_irq_enable();
    |  87 | +                local_irq_enable();
    |  88 | +        } else {
    |  89 | +                /*
    |  90 | +                 * We are running on the out-of-band stage, don't mess
    |  91 | +                 * with the in-band interrupt state. This is none of
    |  92 | +                 * our business. We may manipulate the hardware state
    |  93 | +                 * only.
    |  94 | +                 */
    |  95 | +                hard_local_irq_enable();
    |  96 | +        }
    |  97 | +}
    |  98 | +
 74 |  99 | static __always_inline long
 75 | 100 | __syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
 76 | 101 | {
 77 | 102 |         unsigned long ti_work;
    | 103 | +        int ret;
    | 104 | +
    | 105 | +        /*
    | 106 | +         * Pipeline the syscall to the companion core if the current
    | 107 | +         * task wants this. Compiled out if not dovetailing.
    | 108 | +         */
    | 109 | +        ret = pipeline_syscall(syscall, regs);
    | 110 | +        if (ret > 0)    /* out-of-band, bail out. */
    | 111 | +                return EXIT_SYSCALL_OOB;
    | 112 | +        if (ret < 0)    /* in-band, tail work only. */
    | 113 | +                return EXIT_SYSCALL_TAIL;
 78 | 114 |
 79 | 115 |         ti_work = READ_ONCE(current_thread_info()->flags);
 80 | 116 |         if (ti_work & SYSCALL_ENTER_WORK)
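The stall/unstall handling above is easier to follow with a toy model of the two states it juggles: the in-band stage carries a software "stall" bit tracked separately from the hardware interrupt mask, so interrupts arriving while the stage is stalled are only logged, then replayed when it is unstalled. The sketch below is purely illustrative and is not Dovetail code; every name in it (toy_stall_inband(), toy_unstall_inband(), toy_irq_arrives(), ...) is invented for this example.

/* Toy model of the virtual "stall" bit vs. the hardware IRQ mask. */
#include <stdbool.h>
#include <stdio.h>

static bool toy_inband_stalled;  /* software (virtual) IRQ disable */
static bool toy_hard_irqs_off;   /* real CPU interrupt mask */
static int toy_pending_irqs;     /* IRQs logged while stalled */

static void toy_stall_inband(void)
{
        toy_inband_stalled = true;
}

static void toy_unstall_inband(void)
{
        toy_inband_stalled = false;
        /* Synchronize: replay whatever was logged while stalled. */
        while (toy_pending_irqs > 0) {
                toy_pending_irqs--;
                printf("replaying one in-band IRQ\n");
        }
}

static void toy_irq_arrives(void)
{
        if (toy_hard_irqs_off)
                return;                 /* masked in hardware, not even seen */
        if (toy_inband_stalled)
                toy_pending_irqs++;     /* log it, deliver later */
        else
                printf("delivering in-band IRQ immediately\n");
}

int main(void)
{
        toy_stall_inband();     /* like the stall -> unstall emulation above */
        toy_irq_arrives();      /* logged, not delivered */
        toy_unstall_inband();   /* the pending IRQ is replayed here */
        toy_irq_arrives();      /* delivered immediately now */
        return 0;
}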
 .. |  .. |
 95 | 131 |         enter_from_user_mode(regs);
 96 | 132 |
 97 | 133 |         instrumentation_begin();
 98 |     | -        local_irq_enable();
    | 134 | +        syscall_enter_from_user_enable_irqs();
 99 | 135 |         ret = __syscall_enter_from_user_work(regs, syscall);
100 | 136 |         instrumentation_end();
101 | 137 |
 .. |  .. |
106 | 142 | {
107 | 143 |         enter_from_user_mode(regs);
108 | 144 |         instrumentation_begin();
109 |     | -        local_irq_enable();
    | 145 | +        syscall_enter_from_user_enable_irqs();
110 | 146 |         instrumentation_end();
111 | 147 | }
112 | 148 |
 .. |  .. |
121 | 157 |  * 3) Invoke architecture specific last minute exit code, e.g. speculation
122 | 158 |  *    mitigations, etc.
123 | 159 |  * 4) Tell lockdep that interrupts are enabled
    | 160 | + * 5) Unstall the in-band stage of the interrupt pipeline if current
124 | 161 |  */
 .. |  .. |
132 | 169 |         user_enter_irqoff();
133 | 170 |         arch_exit_to_user_mode();
134 | 171 |         lockdep_hardirqs_on(CALLER_ADDR0);
    | 172 | +        if (running_inband())
    | 173 | +                unstall_inband();
135 | 174 | }
136 | 175 |
137 | 176 | /* Workaround to allow gradual conversion of architecture code */
 .. |  .. |
155 | 194 |         while (ti_work & EXIT_TO_USER_MODE_WORK) {
156 | 195 |
157 | 196 |                 local_irq_enable_exit_to_user(ti_work);
    | 197 | +
    | 198 | +                /*
    | 199 | +                 * Check that local_irq_enable_exit_to_user() does the
    | 200 | +                 * right thing when pipelining.
    | 201 | +                 */
    | 202 | +                WARN_ON_ONCE(irq_pipeline_debug() && hard_irqs_disabled());
158 | 203 |
159 | 204 |                 if (ti_work & _TIF_NEED_RESCHED)
160 | 205 |                         schedule();
 .. |  .. |
182 | 227 |                  * enabled above.
183 | 228 |                  */
184 | 229 |                 local_irq_disable_exit_to_user();
    | 230 | +                WARN_ON_ONCE(irq_pipeline_debug() && !hard_irqs_disabled());
185 | 231 |                 ti_work = READ_ONCE(current_thread_info()->flags);
186 | 232 |         }
187 | 233 |
 .. |  .. |
189 | 235 |         return ti_work;
190 | 236 | }
191 | 237 |
    | 238 | +static inline bool do_retuser(unsigned long ti_work)
    | 239 | +{
    | 240 | +        if (dovetailing() && (ti_work & _TIF_RETUSER)) {
    | 241 | +                hard_local_irq_enable();
    | 242 | +                inband_retuser_notify();
    | 243 | +                hard_local_irq_disable();
    | 244 | +                /* RETUSER might have switched oob */
    | 245 | +                return running_inband();
    | 246 | +        }
    | 247 | +
    | 248 | +        return false;
    | 249 | +}
    | 250 | +
192 | 251 | static void exit_to_user_mode_prepare(struct pt_regs *regs)
193 | 252 | {
194 |     | -        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
    | 253 | +        unsigned long ti_work;
    | 254 | +
    | 255 | +        check_hard_irqs_disabled();
195 | 256 |
196 | 257 |         lockdep_assert_irqs_disabled();
    | 258 | +again:
    | 259 | +        ti_work = READ_ONCE(current_thread_info()->flags);
197 | 260 |
198 | 261 |         if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
199 | 262 |                 ti_work = exit_to_user_mode_loop(regs, ti_work);
200 | 263 |
201 | 264 |         arch_exit_to_user_mode_prepare(regs, ti_work);
    | 265 | +
    | 266 | +        if (do_retuser(ti_work))
    | 267 | +                goto again;
202 | 268 |
203 | 269 |         /* Ensure that the address limit is intact and no locks are held */
204 | 270 |         addr_limit_user_check();
 .. |  .. |
252 | 318 |
253 | 319 |         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
254 | 320 |                 if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
255 |     | -                        local_irq_enable();
    | 321 | +                        local_irq_enable_full();
256 | 322 |         }
257 | 323 |
258 | 324 |         rseq_syscall(regs);
 .. |  .. |
261 | 327 |          * Do one-time syscall specific work. If these work items are
262 | 328 |          * enabled, we want to run them exactly once per syscall exit with
263 | 329 |          * interrupts enabled.
    | 330 | +         *
    | 331 | +         * Dovetail: if this does not look like an in-band syscall, it
    | 332 | +         * has to belong to the companion core. Typically,
    | 333 | +         * __OOB_SYSCALL_BIT would be set in this value. Skip the
    | 334 | +         * work for those syscalls.
264 | 335 |          */
265 |     | -        if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
    | 336 | +        if (unlikely((cached_flags & SYSCALL_EXIT_WORK) &&
    | 337 | +                     (!irqs_pipelined() ||
    | 338 | +                      syscall_get_nr(current, regs) < NR_syscalls)))
266 | 339 |                 syscall_exit_work(regs, cached_flags);
267 | 340 | }
268 | 341 |
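The widened condition above relies on companion-core syscall numbers carrying a flag bit that pushes them beyond NR_syscalls, so syscall_get_nr() filters them out of the in-band exit work. A minimal standalone illustration of that comparison, using made-up values for NR_syscalls and __OOB_SYSCALL_BIT (the real definitions come from the Dovetail and arch headers), could look like this:

#include <stdio.h>

#define TOY_NR_SYSCALLS     450          /* placeholder, not the real count */
#define TOY_OOB_SYSCALL_BIT (1L << 30)   /* placeholder flag bit */

int main(void)
{
        long inband_nr = 17;                        /* ordinary syscall */
        long oob_nr = TOY_OOB_SYSCALL_BIT | 3;      /* companion-core syscall */

        /* Only the in-band number passes the "< NR_syscalls" filter. */
        printf("in-band runs exit work: %d\n", inband_nr < TOY_NR_SYSCALLS); /* 1 */
        printf("oob runs exit work:     %d\n", oob_nr < TOY_NR_SYSCALLS);    /* 0 */
        return 0;
}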
 .. |  .. |
278 | 351 |
279 | 352 | noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
280 | 353 | {
    | 354 | +        WARN_ON_ONCE(irq_pipeline_debug() && irqs_disabled());
    | 355 | +        stall_inband_nocheck();
281 | 356 |         enter_from_user_mode(regs);
282 | 357 | }
283 | 358 |
 .. |  .. |
293 | 368 | {
294 | 369 |         irqentry_state_t ret = {
295 | 370 |                 .exit_rcu = false,
    | 371 | +#ifdef CONFIG_IRQ_PIPELINE
    | 372 | +                .stage_info = IRQENTRY_INBAND_STALLED,
    | 373 | +#endif
296 | 374 |         };
297 | 375 |
    | 376 | +#ifdef CONFIG_IRQ_PIPELINE
    | 377 | +        if (running_oob()) {
    | 378 | +                WARN_ON_ONCE(irq_pipeline_debug() && oob_irqs_disabled());
    | 379 | +                ret.stage_info = IRQENTRY_OOB;
    | 380 | +                return ret;
    | 381 | +        }
    | 382 | +#endif
    | 383 | +
298 | 384 |         if (user_mode(regs)) {
    | 385 | +#ifdef CONFIG_IRQ_PIPELINE
    | 386 | +                ret.stage_info = IRQENTRY_INBAND_UNSTALLED;
    | 387 | +#endif
299 | 388 |                 irqentry_enter_from_user_mode(regs);
300 | 389 |                 return ret;
301 | 390 |         }
    | 391 | +
    | 392 | +#ifdef CONFIG_IRQ_PIPELINE
    | 393 | +        /*
    | 394 | +         * IRQ pipeline: If we trapped from kernel space, the virtual
    | 395 | +         * state may or may not match the hardware state. Since hard
    | 396 | +         * irqs are off on entry, we have to stall the in-band stage.
    | 397 | +         */
    | 398 | +        if (!test_and_stall_inband_nocheck())
    | 399 | +                ret.stage_info = IRQENTRY_INBAND_UNSTALLED;
    | 400 | +#endif
302 | 401 |
303 | 402 |         /*
304 | 403 |          * If this entry hit the idle task invoke rcu_irq_enter() whether
 .. |  .. |
366 | 465 |         }
367 | 466 | }
368 | 467 |
    | 468 | +#ifdef CONFIG_IRQ_PIPELINE
    | 469 | +
    | 470 | +static inline
    | 471 | +bool irqexit_may_preempt_schedule(irqentry_state_t state,
    | 472 | +                                  struct pt_regs *regs)
    | 473 | +{
    | 474 | +        return state.stage_info == IRQENTRY_INBAND_UNSTALLED;
    | 475 | +}
    | 476 | +
    | 477 | +#else
    | 478 | +
    | 479 | +static inline
    | 480 | +bool irqexit_may_preempt_schedule(irqentry_state_t state,
    | 481 | +                                  struct pt_regs *regs)
    | 482 | +{
    | 483 | +        return !regs_irqs_disabled(regs);
    | 484 | +}
    | 485 | +
    | 486 | +#endif
    | 487 | +
    | 488 | +#ifdef CONFIG_IRQ_PIPELINE
    | 489 | +
    | 490 | +static bool irqentry_syncstage(irqentry_state_t state) /* hard irqs off */
    | 491 | +{
    | 492 | +        /*
    | 493 | +         * If pipelining interrupts, enable in-band IRQs then
    | 494 | +         * synchronize the interrupt log on exit if:
    | 495 | +         *
    | 496 | +         * - irqentry_enter() stalled the stage in order to mirror the
    | 497 | +         *   hardware state.
    | 498 | +         *
    | 499 | +         * - we were coming from oob, thus went through a stage migration
    | 500 | +         *   that was caused by taking a CPU exception, e.g., a fault.
    | 501 | +         *
    | 502 | +         * We run before preempt_schedule_irq() may be called later on
    | 503 | +         * by preemptible kernels, so that any rescheduling request
    | 504 | +         * triggered by in-band IRQ handlers is considered.
    | 505 | +         */
    | 506 | +        if (state.stage_info == IRQENTRY_INBAND_UNSTALLED ||
    | 507 | +            state.stage_info == IRQENTRY_OOB) {
    | 508 | +                unstall_inband_nocheck();
    | 509 | +                synchronize_pipeline_on_irq();
    | 510 | +                stall_inband_nocheck();
    | 511 | +                return true;
    | 512 | +        }
    | 513 | +
    | 514 | +        return false;
    | 515 | +}
    | 516 | +
    | 517 | +static void irqentry_unstall(void)
    | 518 | +{
    | 519 | +        unstall_inband_nocheck();
    | 520 | +}
    | 521 | +
    | 522 | +#else
    | 523 | +
    | 524 | +static bool irqentry_syncstage(irqentry_state_t state)
    | 525 | +{
    | 526 | +        return false;
    | 527 | +}
    | 528 | +
    | 529 | +static void irqentry_unstall(void)
    | 530 | +{
    | 531 | +}
    | 532 | +
    | 533 | +#endif
    | 534 | +
369 | 535 | noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
370 | 536 | {
    | 537 | +        bool synchronized = false;
    | 538 | +
    | 539 | +        if (running_oob())
    | 540 | +                return;
    | 541 | +
371 | 542 |         lockdep_assert_irqs_disabled();
372 | 543 |
373 | 544 |         /* Check whether this returns to user mode */
374 | 545 |         if (user_mode(regs)) {
375 | 546 |                 irqentry_exit_to_user_mode(regs);
376 |     | -        } else if (!regs_irqs_disabled(regs)) {
    | 547 | +                return;
    | 548 | +        }
    | 549 | +
    | 550 | +        synchronized = irqentry_syncstage(state);
    | 551 | +
    | 552 | +        if (irqexit_may_preempt_schedule(state, regs)) {
377 | 553 |                 /*
378 | 554 |                  * If RCU was not watching on entry this needs to be done
379 | 555 |                  * carefully and needs the same ordering of lockdep/tracing
 .. |  .. |
387 | 563 |                 instrumentation_end();
388 | 564 |                 rcu_irq_exit();
389 | 565 |                 lockdep_hardirqs_on(CALLER_ADDR0);
390 |     | -                return;
    | 566 | +                goto out;
391 | 567 |         }
392 | 568 |
393 | 569 |         instrumentation_begin();
 .. |  .. |
404 | 580 |                 if (state.exit_rcu)
405 | 581 |                         rcu_irq_exit();
406 | 582 |         }
    | 583 | +
    | 584 | +out:
    | 585 | +        if (synchronized)
    | 586 | +                irqentry_unstall();
    | 587 | +
    | 588 | +        return;
407 | 589 | }
408 | 590 |
409 | 591 | irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)