From 2f529f9b558ca1c1bd74be7437a84e4711743404 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 01 Nov 2024 02:11:33 +0000
Subject: [PATCH] add xenomai

---
 kernel/kernel/sched/core.c | 317 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 308 insertions(+), 9 deletions(-)

diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c
index 7359375..b14a6fb 100644
--- a/kernel/kernel/sched/core.c
+++ b/kernel/kernel/sched/core.c
@@ -2045,6 +2045,7 @@
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
+	inband_migration_notify(p, dest_cpu);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
@@ -3065,7 +3066,7 @@
 	 *  - we're serialized against set_special_state() by virtue of
 	 *    it disabling IRQs (this allows not taking ->pi_lock).
 	 */
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto out;
 
 	success = 1;
@@ -3083,7 +3084,7 @@
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state))
+	if (!(p->state & state) || task_is_off_stage(p))
 		goto unlock;
 
 #ifdef CONFIG_FREEZER
@@ -3348,6 +3349,9 @@
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
+#endif
+#ifdef CONFIG_IRQ_PIPELINE
+	init_task_stall_bits(p);
 #endif
 }
 
@@ -3816,6 +3820,13 @@
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_task(next);
+	prepare_inband_switch(next);
+	/*
+	 * Do not fold the following hard irqs disabling into
+	 * prepare_inband_switch(), this is required when pipelining
+	 * interrupts, not only by alternate scheduling.
+	 */
+	hard_cond_local_irq_disable();
 	prepare_arch_switch(next);
 }
 
@@ -3973,8 +3984,19 @@
 	 * finish_task_switch() will drop rq->lock() and lower preempt_count
 	 * and the preempt_enable() will end up enabling preemption (on
 	 * PREEMPT_COUNT kernels).
+	 *
+	 * If interrupts are pipelined, we may enable hard irqs since
+	 * the in-band stage is stalled. If dovetailing is enabled
+	 * too, schedule_tail() is the place where transitions of
+	 * tasks from the in-band to the oob stage complete. The
+	 * companion core is notified that 'prev' is now suspended in
+	 * the in-band stage, and can be safely resumed in the oob
+	 * stage.
 	 */
 
+	WARN_ON_ONCE(irq_pipeline_debug() && !irqs_disabled());
+	hard_cond_local_irq_enable();
+	oob_trampoline();
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
@@ -4028,6 +4050,20 @@
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
+		/*
+		 * If dovetail is enabled, insert a short window of
+		 * opportunity for preemption by out-of-band IRQs
+		 * before finalizing the context switch.
+		 * dovetail_context_switch() can deal with preempting
+		 * partially switched in-band contexts.
+		 */
+		if (dovetailing()) {
+			struct mm_struct *oldmm = prev->active_mm;
+			prev->active_mm = next->mm;
+			hard_local_irq_sync();
+			prev->active_mm = oldmm;
+		}
+
 		if (!prev->mm) {			// from kernel
 			/* will mmdrop() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
@@ -4042,6 +4078,15 @@
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
+
+	/*
+	 * If 'next' is on its way to the oob stage, don't run the
+	 * context switch epilogue just yet. We will do that at some
+	 * point later, when the task switches back to the in-band
+	 * stage.
+	 */
+	if (unlikely(inband_switch_tail()))
+		return NULL;
 
 	return finish_task_switch(prev);
 }
@@ -4557,6 +4602,8 @@
 		panic("corrupted shadow stack detected inside scheduler\n");
 #endif
 
+	check_inband_stage();
+
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	if (!preempt && prev->state && prev->non_block_count) {
 		printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
@@ -4682,7 +4729,7 @@
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(bool preempt)
+static int __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -4802,12 +4849,17 @@
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
+		if (dovetailing() && rq == NULL)
+			/* Task moved to the oob stage. */
+			return 1;
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 		rq_unlock_irq(rq, &rf);
 	}
 
 	balance_callback(rq);
+
+	return 0;
 }
 
 void __noreturn do_task_dead(void)
@@ -4879,7 +4931,8 @@
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(false);
+		if (__schedule(false))
+			return;
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	sched_update_worker(tsk);
@@ -4960,7 +5013,8 @@
 		 */
 		preempt_disable_notrace();
 		preempt_latency_start(1);
-		__schedule(true);
+		if (__schedule(true))
+			return;
 		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
@@ -4982,7 +5036,7 @@
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	preempt_schedule_common();
@@ -5008,7 +5062,7 @@
 {
 	enum ctx_state prev_ctx;
 
-	if (likely(!preemptible()))
+	if (likely(!running_inband() || !preemptible()))
 		return;
 
 	do {
@@ -5049,23 +5103,41 @@
  * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
+ *
+ * IRQ pipeline: we are called with hard irqs off, synchronize the
+ * pipeline then return the same way, so that the in-band log is
+ * guaranteed empty and further interrupt delivery is postponed by the
+ * hardware until we have exited the kernel.
 */
 asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
+
+	if (irq_pipeline_debug()) {
+		/* Catch any weirdness in pipelined entry code. */
+		if (WARN_ON_ONCE(!running_inband()))
+			return;
+		WARN_ON_ONCE(!hard_irqs_disabled());
+	}
+
+	hard_cond_local_irq_enable();
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(preempt_count() || !irqs_disabled());
 
 	prev_state = exception_enter();
 
-	do {
+	for (;;) {
 		preempt_disable();
 		local_irq_enable();
 		__schedule(true);
+		sync_inband_irqs();
 		local_irq_disable();
 		sched_preempt_enable_no_resched();
-	} while (need_resched());
+		if (!need_resched())
+			break;
+		hard_cond_local_irq_enable();
+	}
 
 	exception_exit(prev_state);
 }
@@ -8892,6 +8964,233 @@
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_DOVETAIL
+
+int dovetail_leave_inband(void)
+{
+	struct task_struct *p = current;
+	struct irq_pipeline_data *pd;
+	unsigned long flags;
+
+	preempt_disable();
+
+	pd = raw_cpu_ptr(&irq_pipeline);
+
+	if (WARN_ON_ONCE(dovetail_debug() && pd->task_inflight))
+		goto out;	/* Paranoid. */
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	pd->task_inflight = p;
+	/*
+	 * The scope of the off-stage state is broader than _TLF_OOB,
+	 * in that it includes the transition path from the in-band
+	 * context to the oob stage.
+	 */
+	set_thread_local_flags(_TLF_OFFSTAGE);
+	set_current_state(TASK_INTERRUPTIBLE);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	sched_submit_work(p);
+	/*
+	 * The current task is scheduled out from the inband stage,
+	 * before resuming on the oob stage. Since this code stands
+	 * for the scheduling tail of the oob scheduler,
+	 * arch_dovetail_switch_finish() is called to perform
+	 * architecture-specific fixups (e.g. fpu context reload).
+	 */
+	if (likely(__schedule(false))) {
+		arch_dovetail_switch_finish(false);
+		return 0;
+	}
+
+	clear_thread_local_flags(_TLF_OFFSTAGE);
+	pd->task_inflight = NULL;
+out:
+	preempt_enable();
+
+	return -ERESTARTSYS;
+}
+EXPORT_SYMBOL_GPL(dovetail_leave_inband);
+
+void dovetail_resume_inband(void)
+{
+	struct task_struct *p;
+	struct rq *rq;
+
+	p = __this_cpu_read(irq_pipeline.rqlock_owner);
+	if (WARN_ON_ONCE(dovetail_debug() && p == NULL))
+		return;
+
+	if (WARN_ON_ONCE(dovetail_debug() && (preempt_count() & STAGE_MASK)))
+		return;
+
+	rq = finish_task_switch(p);
+	balance_callback(rq);
+	preempt_enable();
+	oob_trampoline();
+}
+EXPORT_SYMBOL_GPL(dovetail_resume_inband);
+
+#ifdef CONFIG_KVM
+
+#include <linux/kvm_host.h>
+
+static inline void notify_guest_preempt(void)
+{
+	struct kvm_oob_notifier *nfy;
+	struct irq_pipeline_data *p;
+
+	p = raw_cpu_ptr(&irq_pipeline);
+	nfy = p->vcpu_notify;
+	if (unlikely(nfy))
+		nfy->handler(nfy);
+}
+#else
+static inline void notify_guest_preempt(void)
+{ }
+#endif
+
+bool dovetail_context_switch(struct dovetail_altsched_context *out,
+			     struct dovetail_altsched_context *in,
+			     bool leave_inband)
+{
+	unsigned long pc __maybe_unused, lockdep_irqs;
+	struct task_struct *next, *prev, *last;
+	struct mm_struct *prev_mm, *next_mm;
+	bool inband_tail = false;
+
+	WARN_ON_ONCE(dovetail_debug() && on_pipeline_entry());
+
+	if (leave_inband) {
+		struct task_struct *tsk = current;
+		/*
+		 * We are about to leave the current inband context
+		 * for switching to an out-of-band task, save the
+		 * preempted context information.
+		 */
+		out->task = tsk;
+		out->active_mm = tsk->active_mm;
+		/*
+		 * Switching out-of-band may require some housekeeping
+		 * from a kernel VM which might currently run guest
+		 * code, notify it about the upcoming preemption.
+		 */
+		notify_guest_preempt();
+	}
+
+	arch_dovetail_switch_prepare(leave_inband);
+
+	next = in->task;
+	prev = out->task;
+	prev_mm = out->active_mm;
+	next_mm = in->active_mm;
+
+	if (next_mm == NULL) {
+		in->active_mm = prev_mm;
+		in->borrowed_mm = true;
+		enter_lazy_tlb(prev_mm, next);
+	} else {
+		switch_oob_mm(prev_mm, next_mm, next);
+		/*
+		 * We might be switching back to the inband context
+		 * which we preempted earlier, shortly after "current"
+		 * dropped its mm context in the do_exit() path
+		 * (next->mm == NULL). In such a case, a lazy TLB
+		 * state is expected when leaving the mm.
+		 */
+		if (next->mm == NULL)
+			enter_lazy_tlb(prev_mm, next);
+	}
+
+	if (out->borrowed_mm) {
+		out->borrowed_mm = false;
+		out->active_mm = NULL;
+	}
+
+	/*
+	 * Tasks running out-of-band may alter the (in-band)
+	 * preemption count as long as they don't trigger an in-band
+	 * rescheduling, which Dovetail properly blocks.
+	 *
+	 * If the preemption count is not stack-based but a global
+	 * per-cpu variable instead, changing it has a globally
+	 * visible side-effect though, which is a problem if the
+	 * out-of-band task is preempted and schedules away before the
+	 * change is rolled back: this may cause the in-band context
+	 * to later resume with a broken preemption count.
+	 *
+	 * For this reason, the preemption count of any context which
+	 * blocks from the out-of-band stage is carried over and
+	 * restored across switches, emulating a stack-based
+	 * storage.
+	 *
+	 * Eventually, the count is reset to FORK_PREEMPT_COUNT upon
+	 * transition from out-of-band to in-band stage, reinstating
+	 * the value in effect when the converse transition happened
+	 * at some point before.
+	 */
+	if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+		pc = preempt_count();
+
+	/*
+	 * Like the preemption count and for the same reason, the irq
+	 * state maintained by lockdep must be preserved across
+	 * switches.
+	 */
+	lockdep_irqs = lockdep_read_irqs_state();
+
+	switch_to(prev, next, last);
+	barrier();
+
+	if (check_hard_irqs_disabled())
+		hard_local_irq_disable();
+
+	/*
+	 * If we entered this routine for switching to an out-of-band
+	 * task but don't have _TLF_OOB set for the current context
+	 * when resuming, this portion of code is the switch tail of
+	 * the inband schedule() routine, finalizing a transition to
+	 * the inband stage for the current task. Update the stage
+	 * level as/if required.
+	 */
+	if (unlikely(!leave_inband && !test_thread_local_flags(_TLF_OOB))) {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(FORK_PREEMPT_COUNT);
+		else if (unlikely(dovetail_debug() &&
+				  !(preempt_count() & STAGE_MASK)))
+			WARN_ON_ONCE(1);
+		else
+			preempt_count_sub(STAGE_OFFSET);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+
+		/*
+		 * Fixup the interrupt state conversely to what
+		 * inband_switch_tail() does for the opposite stage
+		 * switching direction.
+		 */
+		stall_inband();
+		trace_hardirqs_off();
+		inband_tail = true;
+	} else {
+		if (IS_ENABLED(CONFIG_HAVE_PERCPU_PREEMPT_COUNT))
+			preempt_count_set(pc);
+
+		lockdep_write_irqs_state(lockdep_irqs);
+	}
+
+	arch_dovetail_switch_finish(leave_inband);
+
+	/*
+	 * inband_tail is true whenever we are finalizing a transition
+	 * to the inband stage from the oob context for current. See
+	 * above.
+	 */
+	return inband_tail;
+}
+EXPORT_SYMBOL_GPL(dovetail_context_switch);
+
+#endif /* CONFIG_DOVETAIL */
+
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);
-- 
Gitblit v1.6.2
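
Usage note (editor's sketch, not part of the patch): dovetail_leave_inband() and dovetail_resume_inband() added above are the entry points a companion core such as Xenomai is expected to call when moving the current thread off the in-band stage and when later finalizing its return to it, with __schedule() now reporting through its return value that the task was handed over to the oob stage. The fragment below only illustrates that calling pattern; the my_core_*() wrappers are hypothetical names, and it assumes the <linux/dovetail.h> declarations shipped with the same patch series.

/*
 * Hypothetical companion-core glue built on the primitives added by
 * this patch; illustrative only.
 */
#include <linux/dovetail.h>
#include <linux/sched.h>

/* Move the current thread to the out-of-band stage. */
static int my_core_switch_oob(void)
{
	int ret;

	/*
	 * Blocks in the in-band scheduler; once __schedule() reports
	 * that the task was handed over to the oob stage, execution
	 * resumes here under the out-of-band scheduler.
	 */
	ret = dovetail_leave_inband();
	if (ret)
		/* -ERESTARTSYS: the stage switch did not happen. */
		return ret;

	return 0;	/* Now running out-of-band. */
}

/* Tail of a later transition back to the in-band stage. */
static void my_core_finish_inband_switch(void)
{
	/*
	 * Completes the pending in-band switch for the task resumed
	 * by dovetail_context_switch(), running finish_task_switch()
	 * and the balance callbacks on its behalf.
	 */
	dovetail_resume_inband();
}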