From 2f529f9b558ca1c1bd74be7437a84e4711743404 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 01 Nov 2024 02:11:33 +0000
Subject: [PATCH] add xenomai

---
 kernel/kernel/sched/core.c | 317 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 308 insertions(+), 9 deletions(-)

diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c
index 7359375..b14a6fb 100644
--- a/kernel/kernel/sched/core.c
+++ b/kernel/kernel/sched/core.c
@@ -2045,6 +2045,7 @@
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
+	inband_migration_notify(p, dest_cpu);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
@@ -3065,7 +3066,7 @@
 	 *  - we're serialized against set_special_state() by virtue of
 	 *    it disabling IRQs (this allows not taking ->pi_lock).
 	 */
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto out;
 
 	success = 1;
@@ -3083,7 +3084,7 @@
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto unlock;
 
 #ifdef CONFIG_FREEZER
@@ -3348,6 +3349,9 @@
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
+#ifdef CONFIG_IRQ_PIPELINE
+	init_task_stall_bits(p);
 #endif
 }
 
@@ -3816,6 +3820,13 @@
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_task(next);
+	prepare_inband_switch(next);
+	/*
+	 * Do not fold the following hard irqs disabling into
+	 * prepare_inband_switch(), this is required when pipelining
+	 * interrupts, not only by alternate scheduling.
+	 */
+	hard_cond_local_irq_disable();
 	prepare_arch_switch(next);
 }
 
@@ -3973,8 +3984,19 @@
 	 * finish_task_switch() will drop rq->lock() and lower preempt_count
 	 * and the preempt_enable() will end up enabling preemption (on
 	 * PREEMPT_COUNT kernels).
+	 *
+	 * If interrupts are pipelined, we may enable hard irqs since
+	 * the in-band stage is stalled. If dovetailing is enabled
+	 * too, schedule_tail() is the place where transitions of
+	 * tasks from the in-band to the oob stage complete. The
+	 * companion core is notified that 'prev' is now suspended in
+	 * the in-band stage, and can be safely resumed in the oob
+	 * stage.
 	 */
 
+	WARN_ON_ONCE(irq_pipeline_debug() && !irqs_disabled());
+	hard_cond_local_irq_enable();
+	oob_trampoline();
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
@@ -4028,6 +4050,20 @@
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
+		/*
+		 * If dovetail is enabled, insert a short window of
+		 * opportunity for preemption by out-of-band IRQs
+		 * before finalizing the context switch.
+		 * dovetail_context_switch() can deal with preempting
+		 * partially switched in-band contexts.
+		 */
+		if (dovetailing()) {
+			struct mm_struct *oldmm = prev->active_mm;
+			prev->active_mm = next->mm;
+			hard_local_irq_sync();
+			prev->active_mm = oldmm;
+		}
+
 		if (!prev->mm) {			// from kernel
 			/* will mmdrop() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
@@ -4042,6 +4078,15 @@
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
+
+	/*
+	 * If 'next' is on its way to the oob stage, don't run the
+	 * context switch epilogue just yet. We will do that at some
+	 * point later, when the task switches back to the in-band
+	 * stage.
+	 */
+	if (unlikely(inband_switch_tail()))
+		return NULL;
 
 	return finish_task_switch(prev);
 }
@@ -4557,6 +4602,8 @@
 		panic("corrupted shadow stack detected inside scheduler\n");
 #endif
 
+	check_inband_stage();
+
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	if (!preempt && prev->state && prev->non_block_count) {
 		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
@@ -4682,7 +4729,7 @@
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(bool preempt)
+static int __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -4802,12 +4849,17 @@
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
+		if (dovetailing() && rq == NULL)
+			/* Task moved to the oob stage. */
+			return 1;
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 		rq_unlock_irq(rq, &rf);
 	}
 
 	balance_callback(rq);
+
+	return 0;
 }
 
 void __noreturn do_task_dead(void)
@@ -4879,7 +4931,8 @@
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(false);
+		if (__schedule(false))
+			return;
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	sched_update_worker(tsk);
@@ -4960,7 +5013,8 @@
 		 */
 		preempt_disable_notrace();
 		preempt_latency_start(1);
-		__schedule(true);
+		if (__schedule(true))
+			return;
 		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
@@ -4982,7 +5036,7 @@
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	preempt_schedule_common();
@@ -5008,7 +5062,7 @@
 {
 	enum ctx_state prev_ctx;
 
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	do {
@@ -5049,23 +5103,41 @@
  * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
+ *
+ * IRQ pipeline: we are called with hard irqs off, synchronize the
+ * pipeline then return the same way, so that the in-band log is
+ * guaranteed empty and further interrupt delivery is postponed by the
+ * hardware until we have exited the kernel.
 */
 asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
+
+	if (irq_pipeline_debug()) {
+		/* Catch any weirdness in pipelined entry code. */
+		if (WARN_ON_ONCE(!running_inband()))
+			return;
+		WARN_ON_ONCE(!hard_irqs_disabled());
+	}
+
+	hard_cond_local_irq_enable();
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(preempt_count() || !irqs_disabled());
 
 	prev_state = exception_enter();
 
-	do {
+	for (;;) {
 		preempt_disable();
 		local_irq_enable();
 		__schedule(true);
+		sync_inband_irqs();
 		local_irq_disable();
 		sched_preempt_enable_no_resched();
-	} while (need_resched());
+		if (!need_resched())
+			break;
+		hard_cond_local_irq_enable();
+	}
 
 	exception_exit(prev_state);
 }
@@ -8892,6 +8964,233 @@
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_DOVETAIL
+
+int dovetail_leave_inband(void)
+{
+	struct task_struct *p = current;
+	struct irq_pipeline_data *pd;
+	unsigned long flags;
+
+	preempt_disable();
+
+	pd = raw_cpu_ptr(&irq_pipeline);
+
+	if (WARN_ON_ONCE(dovetail_debug() && pd->task_inflight))
+		goto out;	/* Paranoid. */
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	pd->task_inflight = p;
+	/*
+	 * The scope of the off-stage state is broader than _TLF_OOB,
+	 * in that it includes the transition path from the in-band
+	 * context to the oob stage.
+	 */
+	set_thread_local_flags(_TLF_OFFSTAGE);
+	set_current_state(TASK_INTERRUPTIBLE);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	sched_submit_work(p);
+	/*
+	 * The current task is scheduled out from the inband stage,
+	 * before resuming on the oob stage. Since this code stands
+	 * for the scheduling tail of the oob scheduler,
+	 * arch_dovetail_switch_finish() is called to perform
+	 * architecture-specific fixups (e.g. fpu context reload).
+	 */
+	if (likely(__schedule(false))) {
+		arch_dovetail_switch_finish(false);
+		return 0;
+	}
+
+	clear_thread_local_flags(_TLF_OFFSTAGE);
+	pd->task_inflight = NULL;
+out:
+	preempt_enable();
+
+	return -ERESTARTSYS;
+}
+EXPORT_SYMBOL_GPL(dovetail_leave_inband);
+
+void dovetail_resume_inband(void)
+{
+	struct task_struct *p;
+	struct rq *rq;
+
+	p = __this_cpu_read(irq_pipeline.rqlock_owner);
+	if (WARN_ON_ONCE(dovetail_debug() && p == NULL))
+		return;
+
+	if (WARN_ON_ONCE(dovetail_debug() && (preempt_count() & STAGE_MASK)))
+		return;
+
+	rq = finish_task_switch(p);
+	balance_callback(rq);
+	preempt_enable();
+	oob_trampoline();
+}
+EXPORT_SYMBOL_GPL(dovetail_resume_inband);
+
+#ifdef CONFIG_KVM
+
+#include <linux/kvm_host.h>
+
+static inline void notify_guest_preempt(void)
+{
+	struct kvm_oob_notifier *nfy;
+	struct irq_pipeline_data *p;
+
+	p = raw_cpu_ptr(&irq_pipeline);
+	nfy = p->vcpu_notify;
+	if (unlikely(nfy))
+		nfy->handler(nfy);
+}
+#else
+static inline void notify_guest_preempt(void)
+{ }
+#endif
+
+bool dovetail_context_switch(struct dovetail_altsched_context *out,
+			     struct dovetail_altsched_context *in,
+			     bool leave_inband)
+{
+	unsigned long pc __maybe_unused, lockdep_irqs;
+	struct task_struct *next, *prev, *last;
+	struct mm_struct *prev_mm, *next_mm;
+	bool inband_tail = false;
+
+	WARN_ON_ONCE(dovetail_debug() && on_pipeline_entry());
+
+	if (leave_inband) {
+		struct task_struct *tsk = current;
+		/*
+		 * We are about to leave the current inband context
+		 * for switching to an out-of-band task, save the
+		 * preempted context information.
+		 */
+		out->task = tsk;
+		out->active_mm = tsk->active_mm;
+		/*
+		 * Switching out-of-band may require some housekeeping
+		 * from a kernel VM which might currently run guest
+		 * code, notify it about the upcoming preemption.
+		 */
+		notify_guest_preempt();
+	}
+
+	arch_dovetail_switch_prepare(leave_inband);
+
+	next = in->task;
+	prev = out->task;
+	prev_mm = out->active_mm;
+	next_mm = in->active_mm;
+
+	if (next_mm == NULL) {
+		in->active_mm = prev_mm;
+		in->borrowed_mm = true;
+		enter_lazy_tlb(prev_mm, next);
+	} else {
+		switch_oob_mm(prev_mm, next_mm, next);
+		/*
+		 * We might be switching back to the inband context
+		 * which we preempted earlier, shortly after "current"
+		 * dropped its mm context in the do_exit() path
+		 * (next->mm == NULL). In such a case, a lazy TLB
+		 * state is expected when leaving the mm.
+		 */
+		if (next->mm == NULL)
+			enter_lazy_tlb(prev_mm, next);
+	}
+
+	if (out->borrowed_mm) {
+		out->borrowed_mm = false;
+		out->active_mm = NULL;
+	}
+
+	/*
+	 * Tasks running out-of-band may alter the (in-band)
+	 * preemption count as long as they don't trigger an in-band
+	 * rescheduling, which Dovetail properly blocks.
+	 *
+	 * If the preemption count is not stack-based but a global
+	 * per-cpu variable instead, changing it has a globally
+	 * visible side-effect though, which is a problem if the
+	 * out-of-band task is preempted and schedules away before the
+	 * change is rolled back: this may cause the in-band context
+	 * to later resume with a broken preemption count.
+	 *
+	 * For this reason, the preemption count of any context which
+	 * blocks from the out-of-band stage is carried over and
+	 * restored across switches, emulating a stack-based
+	 * storage.
+	 *
+	 * Eventually, the count is reset to FORK_PREEMPT_COUNT upon
+	 * transition from out-of-band to in-band stage, reinstating
+	 * the value in effect when the converse transition happened
+	 * at some point before.
+	 */
+	if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+		pc = preempt_count();
+
+	/*
+	 * Like the preemption count and for the same reason, the irq
+	 * state maintained by lockdep must be preserved across
+	 * switches.
+	 */
+	lockdep_irqs = lockdep_read_irqs_state();
+
+	switch_to(prev, next, last);
+	barrier();
+
+	if (check_hard_irqs_disabled())
+		hard_local_irq_disable();
+
+	/*
+	 * If we entered this routine for switching to an out-of-band
+	 * task but don't have _TLF_OOB set for the current context
+	 * when resuming, this portion of code is the switch tail of
+	 * the inband schedule() routine, finalizing a transition to
+	 * the inband stage for the current task. Update the stage
+	 * level as/if required.
+	 */
+	if (unlikely(!leave_inband && !test_thread_local_flags(_TLF_OOB))) {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(FORK_PREEMPT_COUNT);
+		else if (unlikely(dovetail_debug() &&
+				  !(preempt_count() & STAGE_MASK)))
+			WARN_ON_ONCE(1);
+		else
+			preempt_count_sub(STAGE_OFFSET);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+
+		/*
+		 * Fixup the interrupt state conversely to what
+		 * inband_switch_tail() does for the opposite stage
+		 * switching direction.
+		 */
+		stall_inband();
+		trace_hardirqs_off();
+		inband_tail = true;
+	} else {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(pc);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+	}
+
+	arch_dovetail_switch_finish(leave_inband);
+
+	/*
+	 * inband_tail is true whenever we are finalizing a transition
+	 * to the inband stage from the oob context for current. See
+	 * above.
+	 */
+	return inband_tail;
+}
+EXPORT_SYMBOL_GPL(dovetail_context_switch);
+
+#endif /* CONFIG_DOVETAIL */
+
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);
-- 
Gitblit v1.6.2
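
Usage note (editor's sketch, not part of the patch): dovetail_leave_inband() and dovetail_resume_inband() added above are the entry points a companion core such as Xenomai is expected to call when moving the current thread off the in-band stage and when later finalizing its return to it, with __schedule() now reporting through its return value that the task was handed over to the oob stage. The fragment below only illustrates that calling pattern; the my_core_*() wrappers are hypothetical names, and it assumes the <linux/dovetail.h> declarations shipped with the same patch series.

/*
 * Hypothetical companion-core glue built on the primitives added by
 * this patch; illustrative only.
 */
#include <linux/dovetail.h>
#include <linux/sched.h>

/* Move the current thread to the out-of-band stage. */
static int my_core_switch_oob(void)
{
	int ret;

	/*
	 * Blocks in the in-band scheduler; once __schedule() reports
	 * that the task was handed over to the oob stage, execution
	 * resumes here under the out-of-band scheduler.
	 */
	ret = dovetail_leave_inband();
	if (ret)
		/* -ERESTARTSYS: the stage switch did not happen. */
		return ret;

	return 0;	/* Now running out-of-band. */
}

/* Tail of a later transition back to the in-band stage. */
static void my_core_finish_inband_switch(void)
{
	/*
	 * Completes the pending in-band switch for the task resumed
	 * by dovetail_context_switch(), running finish_task_switch()
	 * and the balance callbacks on its behalf.
	 */
	dovetail_resume_inband();
}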