From 6778948f9de86c3cfaf36725a7c87dcff9ba247f Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 08:20:59 +0000
Subject: [PATCH] sched/core: remove the PREEMPT_RT specific changes for kernel 5.10

Drop the PREEMPT_RT / PREEMPT_LAZY specific code from
kernel/kernel/sched/core.c and restore the non-RT implementations.
In particular, this removes the wake_q "sleeper" variants and
wake_up_lock_sleeper(), lazy preemption (resched_curr_lazy(),
preempt_lazy_count, preemptible_lazy()), migrate_disable()/
migrate_enable() together with struct set_affinity_pending,
affine_move_task() and the SCA_* affinity flags, and the
balance_push()/balance_hotplug_wait() CPU hotplug machinery. It brings
back the bool "check" argument for __set_cpus_allowed_ptr(),
migrate_tasks() based hotplug migration, the simple balance_callback(),
and mmdrop()/kprobe_flush_task()/put_task_stack() in
finish_task_switch().
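
As context for reviewers (not part of the change itself), a minimal,
hypothetical caller sketch of the wake_q API as it stands after this
patch, now that the RT "sleeper" queue is gone and every user goes
through the single wake_q_add()/wake_up_q() path; my_waiter and
my_waiter_list are made-up types used only for illustration:

	static void wake_all_waiters(struct my_waiter_list *list)
	{
		DEFINE_WAKE_Q(wake_q);
		struct my_waiter *w;

		/* queue wakeups while holding the (hypothetical) list lock */
		spin_lock(&list->lock);
		list_for_each_entry(w, &list->head, node)
			wake_q_add(&wake_q, w->task);	/* takes a task reference */
		spin_unlock(&list->lock);

		/* wake outside the lock; drops the references taken above */
		wake_up_q(&wake_q);
	}
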
---
 kernel/kernel/sched/core.c | 1275 ++++++++++++---------------------------------------------
 1 file changed, 269 insertions(+), 1006 deletions(-)

diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c
index e00ae06..7359375 100644
--- a/kernel/kernel/sched/core.c
+++ b/kernel/kernel/sched/core.c
@@ -78,11 +78,7 @@
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
 
 /*
  * period over which we measure -rt task CPU usage in us.
@@ -531,15 +527,9 @@
 #endif
 #endif
 
-static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task,
-			 bool sleeper)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
-	struct wake_q_node *node;
-
-	if (sleeper)
-		node = &task->wake_q_sleeper;
-	else
-		node = &task->wake_q;
+	struct wake_q_node *node = &task->wake_q;
 
 	/*
 	 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -576,13 +566,7 @@
  */
 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
-	if (__wake_q_add(head, task, false))
-		get_task_struct(task);
-}
-
-void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task)
-{
-	if (__wake_q_add(head, task, true))
+	if (__wake_q_add(head, task))
 		get_task_struct(task);
 }
 
@@ -605,40 +589,29 @@
  */
 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 {
-	if (!__wake_q_add(head, task, false))
+	if (!__wake_q_add(head, task))
 		put_task_struct(task);
 }
 
-void __wake_up_q(struct wake_q_head *head, bool sleeper)
+void wake_up_q(struct wake_q_head *head)
 {
 	struct wake_q_node *node = head->first;
 
 	while (node != WAKE_Q_TAIL) {
 		struct task_struct *task;
 
-		if (sleeper)
-			task = container_of(node, struct task_struct, wake_q_sleeper);
-		else
-			task = container_of(node, struct task_struct, wake_q);
-
+		task = container_of(node, struct task_struct, wake_q);
 		BUG_ON(!task);
 		/* Task can safely be re-inserted now: */
 		node = node->next;
+		task->wake_q.next = NULL;
 		task->wake_q_count = head->count;
-		if (sleeper)
-			task->wake_q_sleeper.next = NULL;
-		else
-			task->wake_q.next = NULL;
 
 		/*
 		 * wake_up_process() executes a full barrier, which pairs with
 		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
-		if (sleeper)
-			wake_up_lock_sleeper(task);
-		else
-			wake_up_process(task);
-
+		wake_up_process(task);
 		task->wake_q_count = 0;
 		put_task_struct(task);
 	}
@@ -675,48 +648,6 @@
 		trace_sched_wake_idle_without_ipi(cpu);
 }
 EXPORT_SYMBOL_GPL(resched_curr);
-
-#ifdef CONFIG_PREEMPT_LAZY
-
-static int tsk_is_polling(struct task_struct *p)
-{
-#ifdef TIF_POLLING_NRFLAG
-	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-#else
-	return 0;
-#endif
-}
-
-void resched_curr_lazy(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	int cpu;
-
-	if (!sched_feat(PREEMPT_LAZY)) {
-		resched_curr(rq);
-		return;
-	}
-
-	lockdep_assert_held(&rq->lock);
-
-	if (test_tsk_need_resched(curr))
-		return;
-
-	if (test_tsk_need_resched_lazy(curr))
-		return;
-
-	set_tsk_need_resched_lazy(curr);
-
-	cpu = cpu_of(rq);
-	if (cpu == smp_processor_id())
-		return;
-
-	/* NEED_RESCHED_LAZY must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(curr))
-		smp_send_reschedule(cpu);
-}
-#endif
 
 void resched_cpu(int cpu)
 {
@@ -1870,82 +1801,6 @@
 
 #ifdef CONFIG_SMP
 
-static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
-
-static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask,
-				  u32 flags);
-
-static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
-{
-	if (likely(!p->migration_disabled))
-		return;
-
-	if (p->cpus_ptr != &p->cpus_mask)
-		return;
-
-	/*
-	 * Violates locking rules! see comment in __do_set_cpus_allowed().
-	 */
-	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
-}
-
-void migrate_disable(void)
-{
-	struct task_struct *p = current;
-
-	if (p->migration_disabled) {
-		p->migration_disabled++;
-		return;
-	}
-
-	trace_sched_migrate_disable_tp(p);
-
-	preempt_disable();
-	this_rq()->nr_pinned++;
-	p->migration_disabled = 1;
-	preempt_lazy_disable();
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
-{
-	struct task_struct *p = current;
-
-	if (p->migration_disabled > 1) {
-		p->migration_disabled--;
-		return;
-	}
-
-	/*
-	 * Ensure stop_task runs either before or after this, and that
-	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
-	 */
-	preempt_disable();
-	if (p->cpus_ptr != &p->cpus_mask)
-		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
-	/*
-	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
-	 * regular cpus_mask, otherwise things that race (eg.
-	 * select_fallback_rq) get confused.
-	 */
-	barrier();
-	p->migration_disabled = 0;
-	this_rq()->nr_pinned--;
-	preempt_lazy_enable();
-	preempt_enable();
-
-	trace_sched_migrate_enable_tp(p);
-}
-EXPORT_SYMBOL_GPL(migrate_enable);
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
-	return rq->nr_pinned;
-}
-
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1955,7 +1810,7 @@
 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+	if (is_per_cpu_kthread(p))
 		return cpu_online(cpu);
 
 	if (!cpu_active(cpu))
@@ -2015,21 +1870,8 @@
 }
 
 struct migration_arg {
-	struct task_struct		*task;
-	int				dest_cpu;
-	struct set_affinity_pending	*pending;
-};
-
-/*
- * @refs: number of wait_for_completion()
- * @stop_pending: is @stop_work in use
- */
-struct set_affinity_pending {
-	refcount_t		refs;
-	unsigned int		stop_pending;
-	struct completion	done;
-	struct cpu_stop_work	stop_work;
-	struct migration_arg	arg;
+	struct task_struct *task;
+	int dest_cpu;
 };
 
 /*
@@ -2062,17 +1904,15 @@
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
-	struct set_affinity_pending *pending = arg->pending;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
-	bool complete = false;
 	struct rq_flags rf;
 
 	/*
 	 * The original target CPU might have gone down and we might
 	 * be on another CPU but it doesn't matter.
 	 */
-	local_irq_save(rf.flags);
+	local_irq_disable();
 	/*
 	 * We need to explicitly wake pending tasks before running
 	 * __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -2082,121 +1922,21 @@
 
 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
-
 	/*
 	 * If task_rq(p) != rq, it cannot be migrated here, because we're
 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
 	 * we're holding p->pi_lock.
 	 */
 	if (task_rq(p) == rq) {
-		if (is_migration_disabled(p))
-			goto out;
-
-		if (pending) {
-			if (p->migration_pending == pending)
-				p->migration_pending = NULL;
-			complete = true;
-
-			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
-				goto out;
-		}
-
 		if (task_on_rq_queued(p))
 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
 		else
 			p->wake_cpu = arg->dest_cpu;
-
-		/*
-		 * XXX __migrate_task() can fail, at which point we might end
-		 * up running on a dodgy CPU, AFAICT this can only happen
-		 * during CPU hotplug, at which point we'll get pushed out
-		 * anyway, so it's probably not a big deal.
-		 */
-
-	} else if (pending) {
-		/*
-		 * This happens when we get migrated between migrate_enable()'s
-		 * preempt_enable() and scheduling the stopper task. At that
-		 * point we're a regular task again and not current anymore.
-		 *
-		 * A !PREEMPT kernel has a giant hole here, which makes it far
-		 * more likely.
-		 */
-
-		/*
-		 * The task moved before the stopper got to run. We're holding
-		 * ->pi_lock, so the allowed mask is stable - if it got
-		 * somewhere allowed, we're done.
-		 */
-		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
-			if (p->migration_pending == pending)
-				p->migration_pending = NULL;
-			complete = true;
-			goto out;
-		}
-
-		/*
-		 * When migrate_enable() hits a rq mis-match we can't reliably
-		 * determine is_migration_disabled() and so have to chase after
-		 * it.
-		 */
-		WARN_ON_ONCE(!pending->stop_pending);
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
-				    &pending->arg, &pending->stop_work);
-		return 0;
 	}
-out:
-	if (pending)
-		pending->stop_pending = false;
-	task_rq_unlock(rq, p, &rf);
+	rq_unlock(rq, &rf);
+	raw_spin_unlock(&p->pi_lock);
 
-	if (complete)
-		complete_all(&pending->done);
-
-	return 0;
-}
-
-int push_cpu_stop(void *arg)
-{
-	struct rq *lowest_rq = NULL, *rq = this_rq();
-	struct task_struct *p = arg;
-
-	raw_spin_lock_irq(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-
-	if (task_rq(p) != rq)
-		goto out_unlock;
-
-	if (is_migration_disabled(p)) {
-		p->migration_flags |= MDF_PUSH;
-		goto out_unlock;
-	}
-
-	p->migration_flags &= ~MDF_PUSH;
-
-	if (p->sched_class->find_lock_rq)
-		lowest_rq = p->sched_class->find_lock_rq(p, rq);
-
-	if (!lowest_rq)
-		goto out_unlock;
-
-	// XXX validate p is still the highest prio task
-	if (task_rq(p) == rq) {
-		deactivate_task(rq, p, 0);
-		set_task_cpu(p, lowest_rq->cpu);
-		activate_task(lowest_rq, p, 0);
-		resched_curr(lowest_rq);
-	}
-
-	double_unlock_balance(rq, lowest_rq);
-
-out_unlock:
-	rq->push_busy = false;
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock_irq(&p->pi_lock);
-
-	put_task_struct(p);
+	local_irq_enable();
 	return 0;
 }
 
@@ -2204,40 +1944,19 @@
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
-		p->cpus_ptr = new_mask;
-		return;
-	}
-
 	cpumask_copy(&p->cpus_mask, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 	trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
 }
 
-static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct rq *rq = task_rq(p);
 	bool queued, running;
 
-	/*
-	 * This here violates the locking rules for affinity, since we're only
-	 * supposed to change these variables while holding both rq->lock and
-	 * p->pi_lock.
-	 *
-	 * HOWEVER, it magically works, because ttwu() is the only code that
-	 * accesses these variables under p->pi_lock and only does so after
-	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
-	 * before finish_task().
-	 *
-	 * XXX do further audits, this smells like something putrid.
-	 */
-	if (flags & SCA_MIGRATE_DISABLE)
-		SCHED_WARN_ON(!p->on_cpu);
-	else
-		lockdep_assert_held(&p->pi_lock);
+	lockdep_assert_held(&p->pi_lock);
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
@@ -2253,7 +1972,7 @@
 	if (running)
 		put_prev_task(rq, p);
 
-	p->sched_class->set_cpus_allowed(p, new_mask, flags);
+	p->sched_class->set_cpus_allowed(p, new_mask);
 
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -2261,14 +1980,12 @@
 		set_next_task(rq, p);
 }
 
-static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
-			    int dest_cpu, unsigned int flags);
 /*
  * Called with both p->pi_lock and rq->lock held; drops both before returning.
  */
 static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 					 const struct cpumask *new_mask,
-					 u32 flags,
+					 bool check,
 					 struct rq *rq,
 					 struct rq_flags *rf)
 {
@@ -2279,14 +1996,9 @@
 
 	update_rq_clock(rq);
 
-	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+	if (p->flags & PF_KTHREAD) {
 		/*
-		 * Kernel threads are allowed on online && !active CPUs.
-		 *
-		 * Specifically, migration_disabled() tasks must not fail the
-		 * cpumask_any_and_distribute() pick below, esp. so on
-		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
-		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
+		 * Kernel threads are allowed on online && !active CPUs
 		 */
 		cpu_valid_mask = cpu_online_mask;
 	} else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
@@ -2298,22 +2010,13 @@
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
 	 */
-	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	if (!(flags & SCA_MIGRATE_ENABLE)) {
-		if (cpumask_equal(&p->cpus_mask, new_mask))
-			goto out;
-
-		if (WARN_ON_ONCE(p == current &&
-				 is_migration_disabled(p) &&
-				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
+	if (cpumask_equal(&p->cpus_mask, new_mask))
+		goto out;
 
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity
@@ -2326,7 +2029,7 @@
 		goto out;
 	}
 
-	__do_set_cpus_allowed(p, new_mask, flags);
+	do_set_cpus_allowed(p, new_mask);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -2338,227 +2041,27 @@
 			p->nr_cpus_allowed != 1);
 	}
 
-	return affine_move_task(rq, p, rf, dest_cpu, flags);
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		return 0;
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		rq = move_queued_task(rq, rf, p, dest_cpu);
+	}
 out:
 	task_rq_unlock(rq, p, rf);
 
 	return ret;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	__do_set_cpus_allowed(p, new_mask, 0);
-}
-
-/*
- * This function is wildly self concurrent; here be dragons.
- *
- *
- * When given a valid mask, __set_cpus_allowed_ptr() must block until the
- * designated task is enqueued on an allowed CPU. If that task is currently
- * running, we have to kick it out using the CPU stopper.
- *
- * Migrate-Disable comes along and tramples all over our nice sandcastle.
- * Consider:
- *
- *     Initial conditions: P0->cpus_mask = [0, 1]
- *
- *     P0@CPU0                  P1
- *
- *     migrate_disable();
- *     <preempted>
- *                              set_cpus_allowed_ptr(P0, [1]);
- *
- * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
- * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
- * This means we need the following scheme:
- *
- *     P0@CPU0                  P1
- *
- *     migrate_disable();
- *     <preempted>
- *                              set_cpus_allowed_ptr(P0, [1]);
- *                                <blocks>
- *     <resumes>
- *     migrate_enable();
- *       __set_cpus_allowed_ptr();
- *       <wakes local stopper>
- *                         `--> <woken on migration completion>
- *
- * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
- * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
- * task p are serialized by p->pi_lock, which we can leverage: the one that
- * should come into effect at the end of the Migrate-Disable region is the last
- * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
- * but we still need to properly signal those waiting tasks at the appropriate
- * moment.
- *
- * This is implemented using struct set_affinity_pending. The first
- * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
- * setup an instance of that struct and install it on the targeted task_struct.
- * Any and all further callers will reuse that instance. Those then wait for
- * a completion signaled at the tail of the CPU stopper callback (1), triggered
- * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
- *
- *
- * (1) In the cases covered above. There is one more where the completion is
- * signaled within affine_move_task() itself: when a subsequent affinity request
- * cancels the need for an active migration. Consider:
- *
- *     Initial conditions: P0->cpus_mask = [0, 1]
- *
- *     P0@CPU0            P1                             P2
- *
- *     migrate_disable();
- *     <preempted>
- *                        set_cpus_allowed_ptr(P0, [1]);
- *                          <blocks>
- *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
- *                                                         <signal completion>
- *                          <awakes>
- *
- * Note that the above is safe vs a concurrent migrate_enable(), as any
- * pending affinity completion is preceded an uninstallion of
- * p->migration_pending done with p->pi_lock held.
- */
-static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
-			    int dest_cpu, unsigned int flags)
-{
-	struct set_affinity_pending my_pending = { }, *pending = NULL;
-	bool stop_pending, complete = false;
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
-		struct task_struct *push_task = NULL;
-
-		if ((flags & SCA_MIGRATE_ENABLE) &&
-		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
-			rq->push_busy = true;
-			push_task = get_task_struct(p);
-		}
-
-		/*
-		 * If there are pending waiters, but no pending stop_work,
-		 * then complete now.
-		 */
-		pending = p->migration_pending;
-		if (pending && !pending->stop_pending) {
-			p->migration_pending = NULL;
-			complete = true;
-		}
-
-		task_rq_unlock(rq, p, rf);
-
-		if (push_task) {
-			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
-					    p, &rq->push_work);
-		}
-
-		if (complete)
-			complete_all(&pending->done);
-
-		return 0;
-	}
-
-	if (!(flags & SCA_MIGRATE_ENABLE)) {
-		/* serialized by p->pi_lock */
-		if (!p->migration_pending) {
-			/* Install the request */
-			refcount_set(&my_pending.refs, 1);
-			init_completion(&my_pending.done);
-			my_pending.arg = (struct migration_arg) {
-				.task = p,
-				.dest_cpu = dest_cpu,
-				.pending = &my_pending,
-			};
-
-			p->migration_pending = &my_pending;
-		} else {
-			pending = p->migration_pending;
-			refcount_inc(&pending->refs);
-			/*
-			 * Affinity has changed, but we've already installed a
-			 * pending. migration_cpu_stop() *must* see this, else
-			 * we risk a completion of the pending despite having a
-			 * task on a disallowed CPU.
-			 *
-			 * Serialized by p->pi_lock, so this is safe.
-			 */
-			pending->arg.dest_cpu = dest_cpu;
-		}
-	}
-	pending = p->migration_pending;
-	/*
-	 * - !MIGRATE_ENABLE:
-	 *   we'll have installed a pending if there wasn't one already.
-	 *
-	 * - MIGRATE_ENABLE:
-	 *   we're here because the current CPU isn't matching anymore,
-	 *   the only way that can happen is because of a concurrent
-	 *   set_cpus_allowed_ptr() call, which should then still be
-	 *   pending completion.
-	 *
-	 * Either way, we really should have a @pending here.
-	 */
-	if (WARN_ON_ONCE(!pending)) {
-		task_rq_unlock(rq, p, rf);
-		return -EINVAL;
-	}
-
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		/*
-		 * MIGRATE_ENABLE gets here because 'p == current', but for
-		 * anything else we cannot do is_migration_disabled(), punt
-		 * and have the stopper function handle it all race-free.
-		 */
-		stop_pending = pending->stop_pending;
-		if (!stop_pending)
-			pending->stop_pending = true;
-
-		if (flags & SCA_MIGRATE_ENABLE)
-			p->migration_flags &= ~MDF_PUSH;
-
-		task_rq_unlock(rq, p, rf);
-
-		if (!stop_pending) {
-			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
-					    &pending->arg, &pending->stop_work);
-		}
-
-		if (flags & SCA_MIGRATE_ENABLE)
-			return 0;
-	} else {
-
-		if (!is_migration_disabled(p)) {
-			if (task_on_rq_queued(p))
-				rq = move_queued_task(rq, rf, p, dest_cpu);
-
-			if (!pending->stop_pending) {
-				p->migration_pending = NULL;
-				complete = true;
-			}
-		}
-		task_rq_unlock(rq, p, rf);
-
-		if (complete)
-			complete_all(&pending->done);
-	}
-
-	wait_for_completion(&pending->done);
-
-	if (refcount_dec_and_test(&pending->refs))
-		wake_up_var(&pending->refs); /* No UaF, just an address */
-
-	/*
-	 * Block the original owner of &pending until all subsequent callers
-	 * have seen the completion and decremented the refcount
-	 */
-	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
-
-	/* ARGH */
-	WARN_ON_ONCE(my_pending.stop_pending);
-
-	return 0;
 }
 
 /*
@@ -2571,19 +2074,18 @@
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask,
-				  u32 flags)
+				  const struct cpumask *new_mask, bool check)
 {
 	struct rq_flags rf;
 	struct rq *rq;
 
 	rq = task_rq_lock(p, &rf);
-	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+	return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
 }
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, 0);
+	return __set_cpus_allowed_ptr(p, new_mask, false);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -2692,8 +2194,6 @@
 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
 	 */
 	WARN_ON_ONCE(!cpu_online(new_cpu));
-
-	WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2827,18 +2327,6 @@
 }
 EXPORT_SYMBOL_GPL(migrate_swap);
 
-static bool check_task_state(struct task_struct *p, long match_state)
-{
-	bool match = false;
-
-	raw_spin_lock_irq(&p->pi_lock);
-	if (p->state == match_state || p->saved_state == match_state)
-		match = true;
-	raw_spin_unlock_irq(&p->pi_lock);
-
-	return match;
-}
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -2883,7 +2371,7 @@
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && !check_task_state(p, match_state))
+			if (match_state && unlikely(p->state != match_state))
 				return 0;
 			cpu_relax();
 		}
@@ -2898,8 +2386,7 @@
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || p->state == match_state ||
-		    p->saved_state == match_state)
+		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@@ -2933,7 +2420,7 @@
 			ktime_t to = NSEC_PER_SEC / HZ;
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
-			schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
@@ -3040,12 +2527,6 @@
 			}
 			fallthrough;
 		case possible:
-			/*
-			 * XXX When called from select_task_rq() we only
-			 * hold p->pi_lock and again violate locking order.
-			 *
-			 * More yuck to audit.
-			 */
 			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
 			state = fail;
 			break;
@@ -3079,7 +2560,7 @@
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(p->cpus_ptr);
@@ -3102,7 +2583,6 @@
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
-	static struct lock_class_key stop_pi_lock;
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -3118,20 +2598,6 @@
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
 		stop->sched_class = &stop_sched_class;
-
-		/*
-		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
-		 * adjust the effective priority of a task. As a result,
-		 * rt_mutex_setprio() can trigger (RT) balancing operations,
-		 * which can then trigger wakeups of the stop thread to push
-		 * around the current task.
-		 *
-		 * The stop task itself will never be part of the PI-chain, it
-		 * never blocks, therefore that ->pi_lock recursion is safe.
-		 * Tell lockdep about this by placing the stop->pi_lock in its
-		 * own class.
-		 */
-		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
 	}
 
 	cpu_rq(cpu)->stop = stop;
@@ -3145,23 +2611,15 @@
 	}
 }
 
-#else /* CONFIG_SMP */
+#else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 const struct cpumask *new_mask,
-					 u32 flags)
+					 const struct cpumask *new_mask, bool check)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
 
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
-	return false;
-}
-
-#endif /* !CONFIG_SMP */
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -3595,7 +3053,7 @@
 	int cpu, success = 0;
 
 	preempt_disable();
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) {
+	if (p == current) {
 		/*
 		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
 		 * == smp_processor_id()'. Together this means we can special
@@ -3625,26 +3083,8 @@
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state)) {
-		/*
-		 * The task might be running due to a spinlock sleeper
-		 * wakeup. Check the saved state and set it to running
-		 * if the wakeup condition is true.
-		 */
-		if (!(wake_flags & WF_LOCK_SLEEPER)) {
-			if (p->saved_state & state) {
-				p->saved_state = TASK_RUNNING;
-				success = 1;
-			}
-		}
+	if (!(p->state & state))
 		goto unlock;
-	}
-	/*
-	 * If this is a regular wakeup, then we can unconditionally
-	 * clear the saved state of a "lock sleeper".
-	 */
-	if (!(wake_flags & WF_LOCK_SLEEPER))
-		p->saved_state = TASK_RUNNING;
 
 #ifdef CONFIG_FREEZER
 	/*
@@ -3853,18 +3293,6 @@
 }
 EXPORT_SYMBOL(wake_up_process);
 
-/**
- * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
- * @p: The process to be woken up.
- *
- * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
- * the nature of the wakeup.
- */
-int wake_up_lock_sleeper(struct task_struct *p)
-{
-	return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
-}
-
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
 	return try_to_wake_up(p, state, 0);
@@ -3920,7 +3348,6 @@
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
-	p->migration_pending = NULL;
 #endif
 }
 
@@ -4099,9 +3526,6 @@
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
-	task_thread_info(p)->preempt_lazy_count = 0;
-#endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -4329,90 +3753,6 @@
 #endif
 }
 
-#ifdef CONFIG_SMP
-
-static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
-{
-	void (*func)(struct rq *rq);
-	struct callback_head *next;
-
-	lockdep_assert_held(&rq->lock);
-
-	while (head) {
-		func = (void (*)(struct rq *))head->func;
-		next = head->next;
-		head->next = NULL;
-		head = next;
-
-		func(rq);
-	}
-}
-
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
-{
-	struct callback_head *head = rq->balance_callback;
-
-	lockdep_assert_held(&rq->lock);
-	if (head) {
-		rq->balance_callback = NULL;
-		rq->balance_flags &= ~BALANCE_WORK;
-	}
-
-	return head;
-}
-
-static void __balance_callbacks(struct rq *rq)
-{
-	do_balance_callbacks(rq, splice_balance_callbacks(rq));
-}
-
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
-{
-	unsigned long flags;
-
-	if (unlikely(head)) {
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		do_balance_callbacks(rq, head);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
-
-static void balance_push(struct rq *rq);
-
-static inline void balance_switch(struct rq *rq)
-{
-	if (likely(!rq->balance_flags))
-		return;
-
-	if (rq->balance_flags & BALANCE_PUSH) {
-		balance_push(rq);
-		return;
-	}
-
-	__balance_callbacks(rq);
-}
-
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
-{
-	return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
-{
-}
-
-static inline void balance_switch(struct rq *rq)
-{
-}
-
-#endif
-
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -4438,7 +3778,6 @@
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-	balance_switch(rq);
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -4453,22 +3792,6 @@
 #ifndef finish_arch_post_lock_switch
 # define finish_arch_post_lock_switch()	do { } while (0)
 #endif
-
-static inline void kmap_local_sched_out(void)
-{
-#ifdef CONFIG_KMAP_LOCAL
-	if (unlikely(current->kmap_ctrl.idx))
-		__kmap_local_sched_out();
-#endif
-}
-
-static inline void kmap_local_sched_in(void)
-{
-#ifdef CONFIG_KMAP_LOCAL
-	if (unlikely(current->kmap_ctrl.idx))
-		__kmap_local_sched_in();
-#endif
-}
 
 /**
  * prepare_task_switch - prepare to switch tasks
@@ -4492,7 +3815,6 @@
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
-	kmap_local_sched_out();
 	prepare_task(next);
 	prepare_arch_switch(next);
 }
@@ -4559,7 +3881,6 @@
 	finish_lock_switch(rq);
 	finish_arch_post_lock_switch();
 	kcov_finish_switch(current);
-	kmap_local_sched_in();
 
 	fire_sched_in_preempt_notifiers(current);
 	/*
@@ -4574,17 +3895,23 @@
 	 *   provided by mmdrop(),
 	 * - a sync_core for SYNC_CORE.
 	 */
-	/*
-	 * We use mmdrop_delayed() here so we don't have to do the
-	 * full __mmdrop() when we are the last user.
-	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop_delayed(mm);
+		mmdrop(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
+
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		trace_android_rvh_flush_task(prev);
+
+		/* Task is done with its stack. */
+		put_task_stack(prev);
 
 		put_task_struct_rcu_user(prev);
 	}
@@ -4592,6 +3919,43 @@
 	tick_nohz_task_switch();
 	return rq;
 }
+
+#ifdef CONFIG_SMP
+
+/* rq->lock is NOT held, but preemption is disabled */
+static void __balance_callback(struct rq *rq)
+{
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+
+		func(rq);
+	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
+}
+
+#else
+
+static inline void balance_callback(struct rq *rq)
+{
+}
+
+#endif
 
 /**
  * schedule_tail - first thing a freshly forked thread must call.
@@ -4612,6 +3976,7 @@
 	 */
 
 	rq = finish_task_switch(prev);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -5317,7 +4682,7 @@
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(bool preempt, bool spinning_lock)
+static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -5370,7 +4735,7 @@
 	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
 	 */
 	prev_state = prev->state;
-	if ((!preempt || spinning_lock) && prev_state) {
+	if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			prev->state = TASK_RUNNING;
 		} else {
@@ -5405,7 +4770,6 @@
 
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
-	clear_tsk_need_resched_lazy(prev);
 	clear_preempt_need_resched();
 
 	trace_android_rvh_schedule(prev, next, rq);
@@ -5432,7 +4796,6 @@
 		 */
 		++*switch_count;
 
-		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
@@ -5441,11 +4804,10 @@
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
-		rq_unpin_lock(rq, &rf);
-		__balance_callbacks(rq);
-		raw_spin_unlock_irq(&rq->lock);
+		rq_unlock_irq(rq, &rf);
 	}
+
+	balance_callback(rq);
 }
 
 void __noreturn do_task_dead(void)
@@ -5456,7 +4818,7 @@
 	/* Tell freezer to ignore us: */
 	current->flags |= PF_NOFREEZE;
 
-	__schedule(false, false);
+	__schedule(false);
 	BUG();
 
 	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -5489,6 +4851,9 @@
 		preempt_enable_no_resched();
 	}
 
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
@@ -5514,7 +4879,7 @@
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(false, false);
+		__schedule(false);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	sched_update_worker(tsk);
@@ -5542,7 +4907,7 @@
 	 */
 	WARN_ON_ONCE(current->state);
 	do {
-		__schedule(false, false);
+		__schedule(false);
 	} while (need_resched());
 }
 
@@ -5595,7 +4960,7 @@
 		 */
 		preempt_disable_notrace();
 		preempt_latency_start(1);
-		__schedule(true, false);
+		__schedule(true);
 		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
@@ -5605,30 +4970,6 @@
 		 */
 	} while (need_resched());
 }
-
-#ifdef CONFIG_PREEMPT_LAZY
-/*
- * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
- * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
- * preempt_lazy_count counter >0.
- */
-static __always_inline int preemptible_lazy(void)
-{
-	if (test_thread_flag(TIF_NEED_RESCHED))
-		return 1;
-	if (current_thread_info()->preempt_lazy_count)
-		return 0;
-	return 1;
-}
-
-#else
-
-static inline int preemptible_lazy(void)
-{
-	return 1;
-}
-
-#endif
 
 #ifdef CONFIG_PREEMPTION
 /*
@@ -5643,25 +4984,11 @@
 	 */
 	if (likely(!preemptible()))
 		return;
-	if (!preemptible_lazy())
-		return;
+
 	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
-
-#ifdef CONFIG_PREEMPT_RT
-void __sched notrace preempt_schedule_lock(void)
-{
-	do {
-		preempt_disable();
-		__schedule(true, true);
-		sched_preempt_enable_no_resched();
-	} while (need_resched());
-}
-NOKPROBE_SYMBOL(preempt_schedule_lock);
-EXPORT_SYMBOL(preempt_schedule_lock);
-#endif
 
 /**
  * preempt_schedule_notrace - preempt_schedule called by tracing
@@ -5682,9 +5009,6 @@
 	enum ctx_state prev_ctx;
 
 	if (likely(!preemptible()))
-		return;
-
-	if (!preemptible_lazy())
 		return;
 
 	do {
@@ -5709,7 +5033,7 @@
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
-		__schedule(true, false);
+		__schedule(true);
 		exception_exit(prev_ctx);
 
 		preempt_latency_stop(1);
@@ -5738,7 +5062,7 @@
 	do {
 		preempt_disable();
 		local_irq_enable();
-		__schedule(true, false);
+		__schedule(true);
 		local_irq_disable();
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
@@ -5905,11 +5229,9 @@
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
+	__task_rq_unlock(rq, &rf);
 
-	rq_unpin_lock(rq, &rf);
-	__balance_callbacks(rq);
-	raw_spin_unlock(&rq->lock);
-
+	balance_callback(rq);
 	preempt_enable();
 }
 #else
@@ -6154,7 +5476,6 @@
 	int oldpolicy = -1, policy = attr->sched_policy;
 	int retval, oldprio, newprio, queued, running;
 	const struct sched_class *prev_class;
-	struct callback_head *head;
 	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -6397,14 +5718,13 @@
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();
-	head = splice_balance_callbacks(rq);
 	task_rq_unlock(rq, p, &rf);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
 	/* Run balance callbacks after we've adjusted the PI chain: */
-	balance_callbacks(rq, head);
+	balance_callback(rq);
 	preempt_enable();
 
 	return 0;
@@ -6916,7 +6236,7 @@
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -7498,7 +6818,7 @@
 	 *
 	 * And since this is boot we can forgo the serialization.
 	 */
-	set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
 #endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
@@ -7525,9 +6845,7 @@
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
-	task_thread_info(idle)->preempt_lazy_count = 0;
-#endif
+
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
@@ -7637,7 +6955,6 @@
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensure that the idle task is using init_mm right before its CPU goes
  * offline.
@@ -7657,124 +6974,166 @@
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-static int __balance_push_cpu_stop(void *arg)
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
 {
-	struct task_struct *p = arg;
-	struct rq *rq = this_rq();
-	struct rq_flags rf;
-	int cpu;
+	long delta = calc_load_fold_active(rq, 1);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
 
-	raw_spin_lock_irq(&p->pi_lock);
-	rq_lock(rq, &rf);
+static struct task_struct *__pick_migrate_task(struct rq *rq)
+{
+	const struct sched_class *class;
+	struct task_struct *next;
 
+	for_each_class(class) {
+		next = class->pick_next_task(rq);
+		if (next) {
+			next->sched_class->put_prev_task(rq, next);
+			return next;
+		}
+	}
+
+	/* The idle class should always have a runnable task */
+	BUG();
+}
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ *
+ * force: if false, the function will skip CPU pinned kthreads.
+ */
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
+{
+	struct rq *rq = dead_rq;
+	struct task_struct *next, *tmp, *stop = rq->stop;
+	LIST_HEAD(percpu_kthreads);
+	struct rq_flags orf = *rf;
+	int dest_cpu;
+
+	/*
+	 * Fudge the rq selection such that the below task selection loop
+	 * doesn't get stuck on the currently eligible stop task.
+	 *
+	 * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
+	 * either way we should never end up calling schedule() until we're
+	 * done here.
+	 */
+	rq->stop = NULL;
+
+	/*
+	 * put_prev_task() and pick_next_task() sched
+	 * class method both need to have an up-to-date
+	 * value of rq->clock[_task]
+	 */
 	update_rq_clock(rq);
 
-	if (task_rq(p) == rq && task_on_rq_queued(p)) {
-		cpu = select_fallback_rq(rq->cpu, p);
-		rq = __migrate_task(rq, &rf, p, cpu);
-	}
+#ifdef CONFIG_SCHED_DEBUG
+	/* note the clock update in orf */
+	orf.clock_update_flags |= RQCF_UPDATED;
+#endif
 
-	rq_unlock(rq, &rf);
-	raw_spin_unlock_irq(&p->pi_lock);
-
-	put_task_struct(p);
-
-	return 0;
-}
-
-static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
-
-/*
- * Ensure we only run per-cpu kthreads once the CPU goes !active.
- */
-
-
-static void balance_push(struct rq *rq)
-{
-	struct task_struct *push_task = rq->curr;
-
-	lockdep_assert_held(&rq->lock);
-	SCHED_WARN_ON(rq->cpu != smp_processor_id());
-
-	/*
-	 * Both the cpu-hotplug and stop task are in this case and are
-	 * required to complete the hotplug process.
-	 */
-	if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+	for (;;) {
 		/*
-		 * If this is the idle task on the outgoing CPU try to wake
-		 * up the hotplug control thread which might wait for the
-		 * last task to vanish. The rcuwait_active() check is
-		 * accurate here because the waiter is pinned on this CPU
-		 * and can't obviously be running in parallel.
-		 *
-		 * On RT kernels this also has to check whether there are
-		 * pinned and scheduled out tasks on the runqueue. They
-		 * need to leave the migrate disabled section first.
+		 * There's this thread running, bail when that's the only
+		 * remaining thread:
 		 */
-		if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
-		    rcuwait_active(&rq->hotplug_wait)) {
-			raw_spin_unlock(&rq->lock);
-			rcuwait_wake_up(&rq->hotplug_wait);
-			raw_spin_lock(&rq->lock);
+		if (rq->nr_running == 1)
+			break;
+
+		next = __pick_migrate_task(rq);
+
+		/*
+		 * Argh ... no iterator for tasks, we need to remove the
+		 * kthread from the run-queue to continue.
+		 */
+		if (!force && is_per_cpu_kthread(next)) {
+			INIT_LIST_HEAD(&next->percpu_kthread_node);
+			list_add(&next->percpu_kthread_node, &percpu_kthreads);
+
+			/* DEQUEUE_SAVE not used due to move_entity in rt */
+			deactivate_task(rq, next,
+					DEQUEUE_NOCLOCK);
+			continue;
 		}
-		return;
+
+		/*
+		 * Rules for changing task_struct::cpus_mask are holding
+		 * both pi_lock and rq->lock, such that holding either
+		 * stabilizes the mask.
+		 *
+		 * Drop rq->lock is not quite as disastrous as it usually is
+		 * because !cpu_active at this point, which means load-balance
+		 * will not interfere. Also, stop-machine.
+		 */
+		rq_unlock(rq, rf);
+		raw_spin_lock(&next->pi_lock);
+		rq_relock(rq, rf);
+
+		/*
+		 * Since we're inside stop-machine, _nothing_ should have
+		 * changed the task, WARN if weird stuff happened, because in
+		 * that case the above rq->lock drop is a fail too.
+		 */
+		if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+			/*
+			 * In the !force case, there is a hole between
+			 * rq_unlock() and rq_relock(), where another CPU might
+			 * not observe an up to date cpu_active_mask and try to
+			 * move tasks around.
+			 */
+			WARN_ON(force);
+			raw_spin_unlock(&next->pi_lock);
+			continue;
+		}
+
+		/* Find suitable destination for @next, with force if needed. */
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+		rq = __migrate_task(rq, rf, next, dest_cpu);
+		if (rq != dead_rq) {
+			rq_unlock(rq, rf);
+			rq = dead_rq;
+			*rf = orf;
+			rq_relock(rq, rf);
+		}
+		raw_spin_unlock(&next->pi_lock);
 	}
 
-	get_task_struct(push_task);
-	/*
-	 * Temporarily drop rq->lock such that we can wake-up the stop task.
-	 * Both preemption and IRQs are still disabled.
-	 */
-	raw_spin_unlock(&rq->lock);
-	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
-			    this_cpu_ptr(&push_work));
-	/*
-	 * At this point need_resched() is true and we'll take the loop in
-	 * schedule(). The next pick is obviously going to be the stop task
-	 * which is_per_cpu_kthread() and will push this task away.
-	 */
-	raw_spin_lock(&rq->lock);
-}
+	list_for_each_entry_safe(next, tmp, &percpu_kthreads,
+				 percpu_kthread_node) {
 
-static void balance_push_set(int cpu, bool on)
-{
-	struct rq *rq = cpu_rq(cpu);
-	struct rq_flags rf;
+		/* ENQUEUE_RESTORE not used due to move_entity in rt */
+		activate_task(rq, next, ENQUEUE_NOCLOCK);
+		list_del(&next->percpu_kthread_node);
+	}
 
-	rq_lock_irqsave(rq, &rf);
-	if (on)
-		rq->balance_flags |= BALANCE_PUSH;
-	else
-		rq->balance_flags &= ~BALANCE_PUSH;
-	rq_unlock_irqrestore(rq, &rf);
-}
-
-/*
- * Invoked from a CPUs hotplug control thread after the CPU has been marked
- * inactive. All tasks which are not per CPU kernel threads are either
- * pushed off this CPU now via balance_push() or placed on a different CPU
- * during wakeup. Wait until the CPU is quiescent.
- */
-static void balance_hotplug_wait(void)
-{
-	struct rq *rq = this_rq();
-
-	rcuwait_wait_event(&rq->hotplug_wait,
-			   rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
-			   TASK_UNINTERRUPTIBLE);
+	rq->stop = stop;
 }
 
 static int drain_rq_cpu_stop(void *data)
 {
-#ifndef CONFIG_PREEMPT_RT
 	struct rq *rq = this_rq();
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
 	migrate_tasks(rq, &rf, false);
 	rq_unlock_irqrestore(rq, &rf);
-#endif
+
 	return 0;
 }
 
@@ -7799,21 +7158,6 @@
 	if (rq_drain->done)
 		cpu_stop_work_wait(rq_drain);
 }
-
-#else
-
-static inline void balance_push(struct rq *rq)
-{
-}
-
-static inline void balance_push_set(int cpu, bool on)
-{
-}
-
-static inline void balance_hotplug_wait(void)
-{
-}
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -7901,8 +7245,6 @@
 	struct rq *rq = cpu_rq(cpu);
 	struct rq_flags rf;
 
-	balance_push_set(cpu, false);
-
 #ifdef CONFIG_SCHED_SMT
 	/*
 	 * When going up, increment the number of cores with SMT present.
@@ -7956,21 +7298,9 @@
 
 int _sched_cpu_deactivate(unsigned int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-	struct rq_flags rf;
 	int ret;
 
 	set_cpu_active(cpu, false);
-
-	balance_push_set(cpu, true);
-
-	rq_lock_irqsave(rq, &rf);
-	if (rq->rd) {
-		update_rq_clock(rq);
-		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-		set_rq_offline(rq);
-	}
-	rq_unlock_irqrestore(rq, &rf);
 
 #ifdef CONFIG_SCHED_SMT
 	/*
@@ -7985,7 +7315,6 @@
 
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
-		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
 		return ret;
 	}
@@ -8049,41 +7378,6 @@
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Invoked immediately before the stopper thread is invoked to bring the
- * CPU down completely. At this point all per CPU kthreads except the
- * hotplug thread (current) and the stopper thread (inactive) have been
- * either parked or have been unbound from the outgoing CPU. Ensure that
- * any of those which might be on the way out are gone.
- *
- * If after this point a bound task is being woken on this CPU then the
- * responsible hotplug callback has failed to do it's job.
- * sched_cpu_dying() will catch it with the appropriate fireworks.
- */
-int sched_cpu_wait_empty(unsigned int cpu)
-{
-	balance_hotplug_wait();
-	return 0;
-}
-
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta we
- * might have. Called from the CPU stopper task after ensuring that the
- * stopper is the last running task on the CPU, so nr_active count is
- * stable. We need to take the teardown thread which is calling this into
- * account, so we hand in adjust = 1 to the load calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -8093,7 +7387,12 @@
 	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
-	BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	migrate_tasks(rq, &rf, true);
+	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
 	trace_android_rvh_sched_cpu_dying(cpu);
@@ -8304,9 +7603,6 @@
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
-#ifdef CONFIG_HOTPLUG_CPU
-		rcuwait_init(&rq->hotplug_wait);
-#endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -8347,7 +7643,7 @@
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = preempt_count() + sched_rcu_preempt_depth();
+	int nested = preempt_count() + rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
@@ -8447,39 +7743,6 @@
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL_GPL(__cant_sleep);
-
-#ifdef CONFIG_SMP
-void __cant_migrate(const char *file, int line)
-{
-	static unsigned long prev_jiffy;
-
-	if (irqs_disabled())
-		return;
-
-	if (is_migration_disabled(current))
-		return;
-
-	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
-		return;
-
-	if (preempt_count() > 0)
-		return;
-
-	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-		return;
-	prev_jiffy = jiffies;
-
-	pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
-	pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
-	       in_atomic(), irqs_disabled(), is_migration_disabled(current),
-	       current->pid, current->comm);
-
-	debug_show_held_locks(current);
-	dump_stack();
-	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
-}
-EXPORT_SYMBOL_GPL(__cant_migrate);
-#endif
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ

--
Gitblit v1.6.2