2024-11-01 2f529f9b558ca1c1bd74be7437a84e4711743404
kernel/kernel/entry/common.c
@@ -2,6 +2,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
+#include <linux/irq_pipeline.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 
@@ -71,10 +72,45 @@
         return ret ? : syscall;
 }
 
+static __always_inline void
+syscall_enter_from_user_enable_irqs(void)
+{
+        if (running_inband()) {
+                /*
+                 * If pipelining interrupts, prepare for emulating a
+                 * stall -> unstall transition (we are currently
+                 * unstalled), fixing up the IRQ trace state in order
+                 * to keep lockdep happy (and silent).
+                 */
+                stall_inband_nocheck();
+                hard_cond_local_irq_enable();
+                local_irq_enable();
+        } else {
+                /*
+                 * We are running on the out-of-band stage, don't mess
+                 * with the in-band interrupt state. This is none of
+                 * our business. We may manipulate the hardware state
+                 * only.
+                 */
+                hard_local_irq_enable();
+        }
+}
+
 static __always_inline long
 __syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
 {
         unsigned long ti_work;
+        int ret;
+
+        /*
+         * Pipeline the syscall to the companion core if the current
+         * task wants this. Compiled out if not dovetailing.
+         */
+        ret = pipeline_syscall(syscall, regs);
+        if (ret > 0)    /* out-of-band, bail out. */
+                return EXIT_SYSCALL_OOB;
+        if (ret < 0)    /* in-band, tail work only. */
+                return EXIT_SYSCALL_TAIL;
 
         ti_work = READ_ONCE(current_thread_info()->flags);
         if (ti_work & SYSCALL_ENTER_WORK)
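
For readers unfamiliar with Dovetail, the three-way return convention that __syscall_enter_from_user_work() relies on above can be modelled in ordinary user-space C: a positive value means the syscall was diverted out-of-band, a negative value means only the in-band tail work should run, and zero means normal in-band handling. The sketch below only mirrors that decision; the marker bit and helper names are made up for illustration and are not the Dovetail API.

/*
 * Illustrative model of the pipeline_syscall() return convention used
 * by the hunk above. Stand-in names only.
 */
#include <stdio.h>

enum enter_result { ENTER_NORMAL, ENTER_OOB, ENTER_TAIL_ONLY };

/* Pretend syscalls carrying a high marker bit belong to a companion core. */
static int model_pipeline_syscall(long syscall)
{
        const long OOB_BIT = 1L << 30;  /* hypothetical marker bit */

        if (syscall & OOB_BIT)
                return 1;       /* handled out-of-band, bail out */
        if (syscall < 0)
                return -1;      /* in-band, tail work only */
        return 0;               /* plain in-band syscall */
}

static enum enter_result model_enter(long syscall)
{
        int ret = model_pipeline_syscall(syscall);

        if (ret > 0)
                return ENTER_OOB;
        if (ret < 0)
                return ENTER_TAIL_ONLY;
        return ENTER_NORMAL;
}

int main(void)
{
        printf("plain: %d\n", model_enter(42));
        printf("oob:   %d\n", model_enter(42 | (1L << 30)));
        printf("tail:  %d\n", model_enter(-1));
        return 0;
}
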
@@ -95,7 +131,7 @@
         enter_from_user_mode(regs);
 
         instrumentation_begin();
-        local_irq_enable();
+        syscall_enter_from_user_enable_irqs();
         ret = __syscall_enter_from_user_work(regs, syscall);
         instrumentation_end();
 
@@ -106,7 +142,7 @@
 {
         enter_from_user_mode(regs);
         instrumentation_begin();
-        local_irq_enable();
+        syscall_enter_from_user_enable_irqs();
         instrumentation_end();
 }
 
@@ -121,6 +157,7 @@
  * 3) Invoke architecture specific last minute exit code, e.g. speculation
  *    mitigations, etc.
  * 4) Tell lockdep that interrupts are enabled
+ * 5) Unstall the in-band stage of the interrupt pipeline if current
  */
 static __always_inline void exit_to_user_mode(void)
 {
@@ -132,6 +169,8 @@
         user_enter_irqoff();
         arch_exit_to_user_mode();
         lockdep_hardirqs_on(CALLER_ADDR0);
+        if (running_inband())
+                unstall_inband();
 }
 
 /* Workaround to allow gradual conversion of architecture code */
@@ -155,6 +194,12 @@
         while (ti_work & EXIT_TO_USER_MODE_WORK) {
 
                 local_irq_enable_exit_to_user(ti_work);
+
+                /*
+                 * Check that local_irq_enable_exit_to_user() does the
+                 * right thing when pipelining.
+                 */
+                WARN_ON_ONCE(irq_pipeline_debug() && hard_irqs_disabled());
 
                 if (ti_work & _TIF_NEED_RESCHED)
                         schedule();
@@ -182,6 +227,7 @@
                  * enabled above.
                  */
                 local_irq_disable_exit_to_user();
+                WARN_ON_ONCE(irq_pipeline_debug() && !hard_irqs_disabled());
                 ti_work = READ_ONCE(current_thread_info()->flags);
         }
 
@@ -189,16 +235,36 @@
         return ti_work;
 }
 
+static inline bool do_retuser(unsigned long ti_work)
+{
+        if (dovetailing() && (ti_work & _TIF_RETUSER)) {
+                hard_local_irq_enable();
+                inband_retuser_notify();
+                hard_local_irq_disable();
+                /* RETUSER might have switched oob */
+                return running_inband();
+        }
+
+        return false;
+}
+
 static void exit_to_user_mode_prepare(struct pt_regs *regs)
 {
-        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+        unsigned long ti_work;
+
+        check_hard_irqs_disabled();
 
         lockdep_assert_irqs_disabled();
+again:
+        ti_work = READ_ONCE(current_thread_info()->flags);
 
         if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                 ti_work = exit_to_user_mode_loop(regs, ti_work);
 
         arch_exit_to_user_mode_prepare(regs, ti_work);
+
+        if (do_retuser(ti_work))
+                goto again;
 
         /* Ensure that the address limit is intact and no locks are held */
         addr_limit_user_check();
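
The do_retuser()/goto again pattern above boils down to: after the regular exit work, run an optional notifier that may raise new work bits, and if it ran, re-read the flags rather than leaving with a stale snapshot. A minimal user-space sketch of that retry loop, with stand-in names only:

/*
 * Illustrative retry loop for the exit_to_user_mode_prepare() change
 * above. All identifiers are stand-ins, not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

#define WORK_RESCHED    0x1
#define WORK_RETUSER    0x2

static unsigned long thread_flags = WORK_RESCHED | WORK_RETUSER;

static void handle_exit_work(unsigned long flags)
{
        if (flags & WORK_RESCHED) {
                printf("rescheduling\n");
                thread_flags &= ~WORK_RESCHED;
        }
}

/* The notifier may queue more work, which is why the caller retries. */
static bool run_retuser(unsigned long flags)
{
        if (!(flags & WORK_RETUSER))
                return false;
        thread_flags &= ~WORK_RETUSER;
        thread_flags |= WORK_RESCHED;   /* pretend the notifier raised new work */
        return true;
}

static void exit_prepare(void)
{
        unsigned long flags;
again:
        flags = thread_flags;
        handle_exit_work(flags);
        if (run_retuser(flags))
                goto again;     /* snapshot may be stale, start over */
}

int main(void)
{
        exit_prepare();
        return 0;
}
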
@@ -252,7 +318,7 @@
 
         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                 if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
-                        local_irq_enable();
+                        local_irq_enable_full();
         }
 
         rseq_syscall(regs);
@@ -261,8 +327,15 @@
          * Do one-time syscall specific work. If these work items are
          * enabled, we want to run them exactly once per syscall exit with
          * interrupts enabled.
+         *
+         * Dovetail: if this does not look like an in-band syscall, it
+         * has to belong to the companion core. Typically,
+         * __OOB_SYSCALL_BIT would be set in this value. Skip the
+         * work for those syscalls.
          */
-        if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
+        if (unlikely((cached_flags & SYSCALL_EXIT_WORK) &&
+                     (!irqs_pipelined() ||
+                      syscall_get_nr(current, regs) < NR_syscalls)))
                 syscall_exit_work(regs, cached_flags);
 }
 
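
The filter above works because out-of-band syscall numbers carry a marker bit that pushes them outside the in-band table, so syscall_get_nr() can never be below NR_syscalls for them. A tiny model of that range check, using made-up constants rather than the real __OOB_SYSCALL_BIT and NR_syscalls:

/* Illustrative range check; constants and names are hypothetical. */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_NR_SYSCALLS       450
#define MODEL_OOB_BIT           (1U << 28)      /* hypothetical marker bit */

static bool wants_inband_exit_work(unsigned int nr)
{
        /* Only syscalls that belong to the in-band table get the exit work. */
        return nr < MODEL_NR_SYSCALLS;
}

int main(void)
{
        printf("nr=17:     %d\n", wants_inband_exit_work(17));
        printf("nr=17|OOB: %d\n", wants_inband_exit_work(17 | MODEL_OOB_BIT));
        return 0;
}
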
@@ -278,6 +351,8 @@
 
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
+        WARN_ON_ONCE(irq_pipeline_debug() && irqs_disabled());
+        stall_inband_nocheck();
         enter_from_user_mode(regs);
 }
 
@@ -293,12 +368,36 @@
 {
         irqentry_state_t ret = {
                 .exit_rcu = false,
+#ifdef CONFIG_IRQ_PIPELINE
+                .stage_info = IRQENTRY_INBAND_STALLED,
+#endif
         };
 
+#ifdef CONFIG_IRQ_PIPELINE
+        if (running_oob()) {
+                WARN_ON_ONCE(irq_pipeline_debug() && oob_irqs_disabled());
+                ret.stage_info = IRQENTRY_OOB;
+                return ret;
+        }
+#endif
+
         if (user_mode(regs)) {
+#ifdef CONFIG_IRQ_PIPELINE
+                ret.stage_info = IRQENTRY_INBAND_UNSTALLED;
+#endif
                 irqentry_enter_from_user_mode(regs);
                 return ret;
         }
+
+#ifdef CONFIG_IRQ_PIPELINE
+        /*
+         * IRQ pipeline: If we trapped from kernel space, the virtual
+         * state may or may not match the hardware state. Since hard
+         * irqs are off on entry, we have to stall the in-band stage.
+         */
+        if (!test_and_stall_inband_nocheck())
+                ret.stage_info = IRQENTRY_INBAND_UNSTALLED;
+#endif
 
         /*
          * If this entry hit the idle task invoke rcu_irq_enter() whether
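
The stage_info bookkeeping in this hunk hinges on test-and-stall semantics: irqentry_enter() records IRQENTRY_INBAND_UNSTALLED only when it was the one that stalled the in-band stage, so the exit path knows whether to undo it. A single-threaded user-space sketch of that idea; the flag and helpers are stand-ins for the per-CPU state, not the Dovetail implementation:

/* Illustrative model of the entry-side stall bookkeeping. */
#include <stdbool.h>
#include <stdio.h>

enum stage_info { INBAND_STALLED, INBAND_UNSTALLED, OOB };

static bool inband_stall_bit;   /* stand-in for the per-CPU stall flag */

static bool test_and_stall(void)
{
        bool was_stalled = inband_stall_bit;

        inband_stall_bit = true;
        return was_stalled;
}

static enum stage_info model_irqentry_enter(bool from_oob)
{
        if (from_oob)
                return OOB;     /* never touch the in-band state */

        /* Stall the stage; remember whether it was already stalled. */
        return test_and_stall() ? INBAND_STALLED : INBAND_UNSTALLED;
}

int main(void)
{
        printf("first entry:  %d\n", model_irqentry_enter(false)); /* UNSTALLED */
        printf("nested entry: %d\n", model_irqentry_enter(false)); /* STALLED */
        return 0;
}
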
@@ -366,14 +465,91 @@
         }
 }
 
+#ifdef CONFIG_IRQ_PIPELINE
+
+static inline
+bool irqexit_may_preempt_schedule(irqentry_state_t state,
+                                  struct pt_regs *regs)
+{
+        return state.stage_info == IRQENTRY_INBAND_UNSTALLED;
+}
+
+#else
+
+static inline
+bool irqexit_may_preempt_schedule(irqentry_state_t state,
+                                  struct pt_regs *regs)
+{
+        return !regs_irqs_disabled(regs);
+}
+
+#endif
+
+#ifdef CONFIG_IRQ_PIPELINE
+
+static bool irqentry_syncstage(irqentry_state_t state) /* hard irqs off */
+{
+        /*
+         * If pipelining interrupts, enable in-band IRQs then
+         * synchronize the interrupt log on exit if:
+         *
+         * - irqentry_enter() stalled the stage in order to mirror the
+         *   hardware state.
+         *
+         * - we were coming from oob, thus went through a stage migration
+         *   that was caused by taking a CPU exception, e.g., a fault.
+         *
+         * We run before preempt_schedule_irq() may be called later on
+         * by preemptible kernels, so that any rescheduling request
+         * triggered by in-band IRQ handlers is considered.
+         */
+        if (state.stage_info == IRQENTRY_INBAND_UNSTALLED ||
+            state.stage_info == IRQENTRY_OOB) {
+                unstall_inband_nocheck();
+                synchronize_pipeline_on_irq();
+                stall_inband_nocheck();
+                return true;
+        }
+
+        return false;
+}
+
+static void irqentry_unstall(void)
+{
+        unstall_inband_nocheck();
+}
+
+#else
+
+static bool irqentry_syncstage(irqentry_state_t state)
+{
+        return false;
+}
+
+static void irqentry_unstall(void)
+{
+}
+
+#endif
+
 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 {
+        bool synchronized = false;
+
+        if (running_oob())
+                return;
+
         lockdep_assert_irqs_disabled();
 
         /* Check whether this returns to user mode */
         if (user_mode(regs)) {
                 irqentry_exit_to_user_mode(regs);
-        } else if (!regs_irqs_disabled(regs)) {
+                return;
+        }
+
+        synchronized = irqentry_syncstage(state);
+
+        if (irqexit_may_preempt_schedule(state, regs)) {
                 /*
                  * If RCU was not watching on entry this needs to be done
                  * carefully and needs the same ordering of lockdep/tracing
@@ -387,7 +563,7 @@
                 instrumentation_end();
                 rcu_irq_exit();
                 lockdep_hardirqs_on(CALLER_ADDR0);
-                return;
+                goto out;
         }
 
         instrumentation_begin();
@@ -404,6 +580,12 @@
                 if (state.exit_rcu)
                         rcu_irq_exit();
         }
+
+out:
+        if (synchronized)
+                irqentry_unstall();
+
+        return;
 }
 
 irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
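
Taken together, irqentry_syncstage() and the out: epilogue in irqentry_exit() above implement an unstall/sync/re-stall bracket: interrupts logged while the in-band stage was stalled are played back once the stall bit is lifted, the bit is put back for the remaining exit work, and the stage is only left unstalled at the very end. A small user-space sketch of that flow; names and data are illustrative only:

/* Illustrative model of the unstall/sync/re-stall bracket on IRQ exit. */
#include <stdbool.h>
#include <stdio.h>

static bool stalled = true;     /* virtually disabled on entry */
static int pending_irqs = 2;    /* IRQs logged while stalled */

static void sync_pipeline(void)
{
        while (!stalled && pending_irqs > 0) {
                printf("playing back logged IRQ\n");
                pending_irqs--;
        }
}

static bool syncstage(void)
{
        stalled = false;        /* unstall_inband_nocheck() */
        sync_pipeline();        /* synchronize_pipeline_on_irq() */
        stalled = true;         /* stall_inband_nocheck() */
        return true;            /* caller must unstall on its way out */
}

int main(void)
{
        bool synchronized = syncstage();

        /* ...preemption / RCU exit work would run here, still stalled... */
        if (synchronized)
                stalled = false;        /* irqentry_unstall() */
        printf("pending=%d stalled=%d\n", pending_irqs, stalled);
        return 0;
}
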