2024-11-01 2f529f9b558ca1c1bd74be7437a84e4711743404
kernel/kernel/sched/core.c
@@ -2045,6 +2045,7 @@
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
+	inband_migration_notify(p, dest_cpu);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
@@ -3065,7 +3066,7 @@
 	 * - we're serialized against set_special_state() by virtue of
 	 *   it disabling IRQs (this allows not taking ->pi_lock).
 	 */
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto out;
 
 	success = 1;
@@ -3083,7 +3084,7 @@
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto unlock;
 
 #ifdef CONFIG_FREEZER
@@ -3348,6 +3349,9 @@
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
+#ifdef CONFIG_IRQ_PIPELINE
+	init_task_stall_bits(p);
 #endif
 }
 
@@ -3816,6 +3820,13 @@
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_task(next);
+	prepare_inband_switch(next);
+	/*
+	 * Do not fold the following hard irq disabling into
+	 * prepare_inband_switch(); it is required when pipelining
+	 * interrupts, not only by alternate scheduling.
+	 */
+	hard_cond_local_irq_disable();
 	prepare_arch_switch(next);
 }
 
@@ -3973,8 +3984,19 @@
 	 * finish_task_switch() will drop rq->lock() and lower preempt_count
 	 * and the preempt_enable() will end up enabling preemption (on
 	 * PREEMPT_COUNT kernels).
+	 *
+	 * If interrupts are pipelined, we may enable hard irqs since
+	 * the in-band stage is stalled. If dovetailing is enabled
+	 * too, schedule_tail() is the place where transitions of
+	 * tasks from the in-band to the oob stage complete. The
+	 * companion core is notified that 'prev' is now suspended in
+	 * the in-band stage, and can be safely resumed in the oob
+	 * stage.
 	 */
 
+	WARN_ON_ONCE(irq_pipeline_debug() && !irqs_disabled());
+	hard_cond_local_irq_enable();
+	oob_trampoline();
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
@@ -4028,6 +4050,20 @@
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
+		/*
+		 * If dovetail is enabled, insert a short window of
+		 * opportunity for preemption by out-of-band IRQs
+		 * before finalizing the context switch.
+		 * dovetail_context_switch() can deal with preempting
+		 * partially switched in-band contexts.
+		 */
+		if (dovetailing()) {
+			struct mm_struct *oldmm = prev->active_mm;
+			prev->active_mm = next->mm;
+			hard_local_irq_sync();
+			prev->active_mm = oldmm;
+		}
+
 		if (!prev->mm) {			// from kernel
 			/* will mmdrop() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
@@ -4042,6 +4078,15 @@
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
+
+	/*
+	 * If 'next' is on its way to the oob stage, don't run the
+	 * context switch epilogue just yet. We will do that at some
+	 * point later, when the task switches back to the in-band
+	 * stage.
+	 */
+	if (unlikely(inband_switch_tail()))
+		return NULL;
 
 	return finish_task_switch(prev);
 }
@@ -4557,6 +4602,8 @@
 		panic("corrupted shadow stack detected inside scheduler\n");
 #endif
 
+	check_inband_stage();
+
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	if (!preempt && prev->state && prev->non_block_count) {
 		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
@@ -4682,7 +4729,7 @@
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(bool preempt)
+static int __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -4802,12 +4849,17 @@
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
+		if (dovetailing() && rq == NULL)
+			/* Task moved to the oob stage. */
+			return 1;
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 		rq_unlock_irq(rq, &rf);
 	}
 
 	balance_callback(rq);
+
+	return 0;
 }
 
 void __noreturn do_task_dead(void)
@@ -4879,7 +4931,8 @@
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(false);
+		if (__schedule(false))
+			return;
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	sched_update_worker(tsk);
@@ -4960,7 +5013,8 @@
 		 */
 		preempt_disable_notrace();
 		preempt_latency_start(1);
-		__schedule(true);
+		if (__schedule(true))
+			return;
 		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
@@ -4982,7 +5036,7 @@
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	preempt_schedule_common();
@@ -5008,7 +5062,7 @@
 {
 	enum ctx_state prev_ctx;
 
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	do {
@@ -5049,23 +5103,41 @@
  * off of irq context.
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
+ *
+ * IRQ pipeline: we are called with hard irqs off, synchronize the
+ * pipeline then return the same way, so that the in-band log is
+ * guaranteed empty and further interrupt delivery is postponed by the
+ * hardware until we have exited the kernel.
  */
 asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
+
+	if (irq_pipeline_debug()) {
+		/* Catch any weirdness in pipelined entry code. */
+		if (WARN_ON_ONCE(!running_inband()))
+			return;
+		WARN_ON_ONCE(!hard_irqs_disabled());
+	}
+
+	hard_cond_local_irq_enable();
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(preempt_count() || !irqs_disabled());
 
 	prev_state = exception_enter();
 
-	do {
+	for (;;) {
 		preempt_disable();
 		local_irq_enable();
 		__schedule(true);
+		sync_inband_irqs();
 		local_irq_disable();
 		sched_preempt_enable_no_resched();
-	} while (need_resched());
+		if (!need_resched())
+			break;
+		hard_cond_local_irq_enable();
+	}
 
 	exception_exit(prev_state);
 }
@@ -8892,6 +8964,233 @@
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_DOVETAIL
+
+int dovetail_leave_inband(void)
+{
+	struct task_struct *p = current;
+	struct irq_pipeline_data *pd;
+	unsigned long flags;
+
+	preempt_disable();
+
+	pd = raw_cpu_ptr(&irq_pipeline);
+
+	if (WARN_ON_ONCE(dovetail_debug() && pd->task_inflight))
+		goto out;	/* Paranoid. */
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	pd->task_inflight = p;
+	/*
+	 * The scope of the off-stage state is broader than _TLF_OOB,
+	 * in that it includes the transition path from the in-band
+	 * context to the oob stage.
+	 */
+	set_thread_local_flags(_TLF_OFFSTAGE);
+	set_current_state(TASK_INTERRUPTIBLE);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	sched_submit_work(p);
+	/*
+	 * The current task is scheduled out from the inband stage,
+	 * before resuming on the oob stage. Since this code stands
+	 * for the scheduling tail of the oob scheduler,
+	 * arch_dovetail_switch_finish() is called to perform
+	 * architecture-specific fixups (e.g. fpu context reload).
+	 */
+	if (likely(__schedule(false))) {
+		arch_dovetail_switch_finish(false);
+		return 0;
+	}
+
+	clear_thread_local_flags(_TLF_OFFSTAGE);
+	pd->task_inflight = NULL;
+out:
+	preempt_enable();
+
+	return -ERESTARTSYS;
+}
+EXPORT_SYMBOL_GPL(dovetail_leave_inband);
+
+void dovetail_resume_inband(void)
+{
+	struct task_struct *p;
+	struct rq *rq;
+
+	p = __this_cpu_read(irq_pipeline.rqlock_owner);
+	if (WARN_ON_ONCE(dovetail_debug() && p == NULL))
+		return;
+
+	if (WARN_ON_ONCE(dovetail_debug() && (preempt_count() & STAGE_MASK)))
+		return;
+
+	rq = finish_task_switch(p);
+	balance_callback(rq);
+	preempt_enable();
+	oob_trampoline();
+}
+EXPORT_SYMBOL_GPL(dovetail_resume_inband);
+
+#ifdef CONFIG_KVM
+
+#include <linux/kvm_host.h>
+
+static inline void notify_guest_preempt(void)
+{
+	struct kvm_oob_notifier *nfy;
+	struct irq_pipeline_data *p;
+
+	p = raw_cpu_ptr(&irq_pipeline);
+	nfy = p->vcpu_notify;
+	if (unlikely(nfy))
+		nfy->handler(nfy);
+}
+#else
+static inline void notify_guest_preempt(void)
+{ }
+#endif
+
+bool dovetail_context_switch(struct dovetail_altsched_context *out,
+			     struct dovetail_altsched_context *in,
+			     bool leave_inband)
+{
+	unsigned long pc __maybe_unused, lockdep_irqs;
+	struct task_struct *next, *prev, *last;
+	struct mm_struct *prev_mm, *next_mm;
+	bool inband_tail = false;
+
+	WARN_ON_ONCE(dovetail_debug() && on_pipeline_entry());
+
+	if (leave_inband) {
+		struct task_struct *tsk = current;
+		/*
+		 * We are about to leave the current inband context
+		 * for switching to an out-of-band task; save the
+		 * preempted context information.
+		 */
+		out->task = tsk;
+		out->active_mm = tsk->active_mm;
+		/*
+		 * Switching out-of-band may require some housekeeping
+		 * from a kernel VM which might currently run guest
+		 * code; notify it about the upcoming preemption.
+		 */
+		notify_guest_preempt();
+	}
+
+	arch_dovetail_switch_prepare(leave_inband);
+
+	next = in->task;
+	prev = out->task;
+	prev_mm = out->active_mm;
+	next_mm = in->active_mm;
+
+	if (next_mm == NULL) {
+		in->active_mm = prev_mm;
+		in->borrowed_mm = true;
+		enter_lazy_tlb(prev_mm, next);
+	} else {
+		switch_oob_mm(prev_mm, next_mm, next);
+		/*
+		 * We might be switching back to the inband context
+		 * which we preempted earlier, shortly after "current"
+		 * dropped its mm context in the do_exit() path
+		 * (next->mm == NULL). In such a case, a lazy TLB
+		 * state is expected when leaving the mm.
+		 */
+		if (next->mm == NULL)
+			enter_lazy_tlb(prev_mm, next);
+	}
+
+	if (out->borrowed_mm) {
+		out->borrowed_mm = false;
+		out->active_mm = NULL;
+	}
+
+	/*
+	 * Tasks running out-of-band may alter the (in-band)
+	 * preemption count as long as they don't trigger an in-band
+	 * rescheduling, which Dovetail properly blocks.
+	 *
+	 * If the preemption count is not stack-based but a global
+	 * per-cpu variable instead, changing it has a globally
+	 * visible side-effect though, which is a problem if the
+	 * out-of-band task is preempted and schedules away before the
+	 * change is rolled back: this may cause the in-band context
+	 * to later resume with a broken preemption count.
+	 *
+	 * For this reason, the preemption count of any context which
+	 * blocks from the out-of-band stage is carried over and
+	 * restored across switches, emulating a stack-based
+	 * storage.
+	 *
+	 * Eventually, the count is reset to FORK_PREEMPT_COUNT upon
+	 * transition from out-of-band to in-band stage, reinstating
+	 * the value in effect when the converse transition happened
+	 * at some point before.
+	 */
+	if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+		pc = preempt_count();
+
+	/*
+	 * Like the preemption count and for the same reason, the irq
+	 * state maintained by lockdep must be preserved across
+	 * switches.
+	 */
+	lockdep_irqs = lockdep_read_irqs_state();
+
+	switch_to(prev, next, last);
+	barrier();
+
+	if (check_hard_irqs_disabled())
+		hard_local_irq_disable();
+
+	/*
+	 * If we entered this routine for switching to an out-of-band
+	 * task but don't have _TLF_OOB set for the current context
+	 * when resuming, this portion of code is the switch tail of
+	 * the inband schedule() routine, finalizing a transition to
+	 * the inband stage for the current task. Update the stage
+	 * level as/if required.
+	 */
+	if (unlikely(!leave_inband && !test_thread_local_flags(_TLF_OOB))) {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(FORK_PREEMPT_COUNT);
+		else if (unlikely(dovetail_debug() &&
+				  !(preempt_count() & STAGE_MASK)))
+			WARN_ON_ONCE(1);
+		else
+			preempt_count_sub(STAGE_OFFSET);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+
+		/*
+		 * Fixup the interrupt state conversely to what
+		 * inband_switch_tail() does for the opposite stage
+		 * switching direction.
+		 */
+		stall_inband();
+		trace_hardirqs_off();
+		inband_tail = true;
+	} else {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(pc);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+	}
+
+	arch_dovetail_switch_finish(leave_inband);
+
+	/*
+	 * inband_tail is true whenever we are finalizing a transition
+	 * to the inband stage from the oob context for current. See
+	 * above.
+	 */
+	return inband_tail;
+}
+EXPORT_SYMBOL_GPL(dovetail_context_switch);
+
+#endif /* CONFIG_DOVETAIL */
+
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);