2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * kernel/sched/core.c
34 *
@@ -5,6 +6,10 @@
56 *
67 * Copyright (C) 1991-2002 Linus Torvalds
78 */
9
+#define CREATE_TRACE_POINTS
10
+#include <trace/events/sched.h>
11
+#undef CREATE_TRACE_POINTS
12
+
813 #include "sched.h"
914
1015 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
1621 #include <asm/tlb.h>
1722
1823 #include "../workqueue_internal.h"
24
+#include "../../io_uring/io-wq.h"
1925 #include "../smpboot.h"
2026
2127 #include "pelt.h"
28
+#include "smp.h"
2229
23
-#define CREATE_TRACE_POINTS
24
-#include <trace/events/sched.h>
30
+#include <trace/hooks/sched.h>
31
+#include <trace/hooks/dtask.h>
32
+
33
+/*
34
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
35
+ * associated with them) to allow external modules to probe them.
36
+ */
37
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
38
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
39
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
40
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
41
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
42
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
43
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
44
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
45
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
46
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
47
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
48
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
49
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50
+#ifdef CONFIG_SCHEDSTATS
51
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55
+#endif
2556
2657 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58
+EXPORT_SYMBOL_GPL(runqueues);
2759
2860 #ifdef CONFIG_SCHED_DEBUG
2961 /*
@@ -38,6 +70,7 @@
3870 const_debug unsigned int sysctl_sched_features =
3971 #include "features.h"
4072 0;
73
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
4174 #undef SCHED_FEAT
4275 #endif
4376
@@ -45,7 +78,11 @@
4578 * Number of tasks to iterate in a single balance run.
4679 * Limited because this is done with IRQs disabled.
4780 */
81
+#ifdef CONFIG_PREEMPT_RT
82
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
83
+#else
4884 const_debug unsigned int sysctl_sched_nr_migrate = 32;
85
+#endif
4986
5087 /*
5188 * period over which we measure -rt task CPU usage in us.
@@ -60,6 +97,100 @@
6097 * default: 0.95s
6198 */
6299 int sysctl_sched_rt_runtime = 950000;
100
+
101
+
102
+/*
103
+ * Serialization rules:
104
+ *
105
+ * Lock order:
106
+ *
107
+ * p->pi_lock
108
+ * rq->lock
109
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
110
+ *
111
+ * rq1->lock
112
+ * rq2->lock where: rq1 < rq2
113
+ *
114
+ * Regular state:
115
+ *
116
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
117
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
118
+ * always looks at the local rq data structures to find the most eligible task
119
+ * to run next.
120
+ *
121
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
122
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
123
+ * the local CPU to avoid bouncing the runqueue state around [ see
124
+ * ttwu_queue_wakelist() ]
125
+ *
126
+ * Task wakeup, specifically wakeups that involve migration, are horribly
127
+ * complicated to avoid having to take two rq->locks.
128
+ *
129
+ * Special state:
130
+ *
131
+ * System-calls and anything external will use task_rq_lock() which acquires
132
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
133
+ * stable while holding either lock:
134
+ *
135
+ * - sched_setaffinity()/
136
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
137
+ * - set_user_nice(): p->se.load, p->*prio
138
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
139
+ * p->se.load, p->rt_priority,
140
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
141
+ * - sched_setnuma(): p->numa_preferred_nid
142
+ * - sched_move_task()/
143
+ * cpu_cgroup_fork(): p->sched_task_group
144
+ * - uclamp_update_active() p->uclamp*
145
+ *
146
+ * p->state <- TASK_*:
147
+ *
148
+ * is changed locklessly using set_current_state(), __set_current_state() or
149
+ * set_special_state(), see their respective comments, or by
150
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
151
+ * concurrent self.
152
+ *
153
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
154
+ *
155
+ * is set by activate_task() and cleared by deactivate_task(), under
156
+ * rq->lock. Non-zero indicates the task is runnable, the special
157
+ * ON_RQ_MIGRATING state is used for migration without holding both
158
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
159
+ *
160
+ * p->on_cpu <- { 0, 1 }:
161
+ *
162
+ * is set by prepare_task() and cleared by finish_task() such that it will be
163
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
164
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
165
+ *
166
+ * [ The astute reader will observe that it is possible for two tasks on one
167
+ * CPU to have ->on_cpu = 1 at the same time. ]
168
+ *
169
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
170
+ *
171
+ * - Don't call set_task_cpu() on a blocked task:
172
+ *
173
+ * We don't care what CPU we're not running on, this simplifies hotplug,
174
+ * the CPU assignment of blocked tasks isn't required to be valid.
175
+ *
176
+ * - for try_to_wake_up(), called under p->pi_lock:
177
+ *
178
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
179
+ *
180
+ * - for migration called under rq->lock:
181
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
182
+ *
183
+ * o move_queued_task()
184
+ * o detach_task()
185
+ *
186
+ * - for migration called under double_rq_lock():
187
+ *
188
+ * o __migrate_swap_task()
189
+ * o push_rt_task() / pull_rt_task()
190
+ * o push_dl_task() / pull_dl_task()
191
+ * o dl_task_offline_migration()
192
+ *
193
+ */
63194
64195 /*
65196 * __task_rq_lock - lock the rq @p resides on.
@@ -84,6 +215,7 @@
84215 cpu_relax();
85216 }
86217 }
218
+EXPORT_SYMBOL_GPL(__task_rq_lock);
87219
88220 /*
89221 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
@@ -126,6 +258,7 @@
126258 cpu_relax();
127259 }
128260 }
261
+EXPORT_SYMBOL_GPL(task_rq_lock);
129262
130263 /*
131264 * RQ-clock updating methods:
@@ -206,7 +339,15 @@
206339 rq->clock += delta;
207340 update_rq_clock_task(rq, delta);
208341 }
342
+EXPORT_SYMBOL_GPL(update_rq_clock);
209343
344
+static inline void
345
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
346
+{
347
+ csd->flags = 0;
348
+ csd->func = func;
349
+ csd->info = rq;
350
+}
210351
211352 #ifdef CONFIG_SCHED_HRTICK
212353 /*
@@ -243,8 +384,9 @@
243384 static void __hrtick_restart(struct rq *rq)
244385 {
245386 struct hrtimer *timer = &rq->hrtick_timer;
387
+ ktime_t time = rq->hrtick_time;
246388
247
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
389
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
248390 }
249391
250392 /*
@@ -257,7 +399,6 @@
257399
258400 rq_lock(rq, &rf);
259401 __hrtick_restart(rq);
260
- rq->hrtick_csd_pending = 0;
261402 rq_unlock(rq, &rf);
262403 }
263404
@@ -269,7 +410,6 @@
269410 void hrtick_start(struct rq *rq, u64 delay)
270411 {
271412 struct hrtimer *timer = &rq->hrtick_timer;
272
- ktime_t time;
273413 s64 delta;
274414
275415 /*
@@ -277,16 +417,12 @@
277417 * doesn't make sense and can cause timer DoS.
278418 */
279419 delta = max_t(s64, delay, 10000LL);
280
- time = ktime_add_ns(timer->base->get_time(), delta);
420
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
281421
282
- hrtimer_set_expires(timer, time);
283
-
284
- if (rq == this_rq()) {
422
+ if (rq == this_rq())
285423 __hrtick_restart(rq);
286
- } else if (!rq->hrtick_csd_pending) {
424
+ else
287425 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
288
- rq->hrtick_csd_pending = 1;
289
- }
290426 }
291427
292428 #else
@@ -303,21 +439,17 @@
303439 */
304440 delay = max_t(u64, delay, 10000LL);
305441 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
306
- HRTIMER_MODE_REL_PINNED);
442
+ HRTIMER_MODE_REL_PINNED_HARD);
307443 }
444
+
308445 #endif /* CONFIG_SMP */
309446
310447 static void hrtick_rq_init(struct rq *rq)
311448 {
312449 #ifdef CONFIG_SMP
313
- rq->hrtick_csd_pending = 0;
314
-
315
- rq->hrtick_csd.flags = 0;
316
- rq->hrtick_csd.func = __hrtick_start;
317
- rq->hrtick_csd.info = rq;
450
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
318451 #endif
319
-
320
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
452
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
321453 rq->hrtick_timer.function = hrtick;
322454 }
323455 #else /* CONFIG_SCHED_HRTICK */
@@ -399,9 +531,15 @@
399531 #endif
400532 #endif
401533
402
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
534
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task,
535
+ bool sleeper)
403536 {
404
- struct wake_q_node *node = &task->wake_q;
537
+ struct wake_q_node *node;
538
+
539
+ if (sleeper)
540
+ node = &task->wake_q_sleeper;
541
+ else
542
+ node = &task->wake_q;
405543
406544 /*
407545 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -412,42 +550,96 @@
412550 * state, even in the failed case, an explicit smp_mb() must be used.
413551 */
414552 smp_mb__before_atomic();
415
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
416
- return;
417
-
418
- head->count++;
419
-
420
- get_task_struct(task);
553
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
554
+ return false;
421555
422556 /*
423557 * The head is context local, there can be no concurrency.
424558 */
425559 *head->lastp = node;
426560 head->lastp = &node->next;
561
+ head->count++;
562
+ return true;
427563 }
428564
429
-static int
430
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
431
- int sibling_count_hint);
565
+/**
566
+ * wake_q_add() - queue a wakeup for 'later' waking.
567
+ * @head: the wake_q_head to add @task to
568
+ * @task: the task to queue for 'later' wakeup
569
+ *
570
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
571
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
572
+ * instantly.
573
+ *
574
+ * This function must be used as-if it were wake_up_process(); IOW the task
575
+ * must be ready to be woken at this location.
576
+ */
577
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
578
+{
579
+ if (__wake_q_add(head, task, false))
580
+ get_task_struct(task);
581
+}
432582
433
-void wake_up_q(struct wake_q_head *head)
583
+void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task)
584
+{
585
+ if (__wake_q_add(head, task, true))
586
+ get_task_struct(task);
587
+}
588
+
589
+/**
590
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
591
+ * @head: the wake_q_head to add @task to
592
+ * @task: the task to queue for 'later' wakeup
593
+ *
594
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
595
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
596
+ * instantly.
597
+ *
598
+ * This function must be used as-if it were wake_up_process(); IOW the task
599
+ * must be ready to be woken at this location.
600
+ *
601
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
602
+ * that already hold reference to @task can call the 'safe' version and trust
603
+ * wake_q to do the right thing depending whether or not the @task is already
604
+ * queued for wakeup.
605
+ */
606
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
607
+{
608
+ if (!__wake_q_add(head, task, false))
609
+ put_task_struct(task);
610
+}
611
+
612
+void __wake_up_q(struct wake_q_head *head, bool sleeper)
434613 {
435614 struct wake_q_node *node = head->first;
436615
437616 while (node != WAKE_Q_TAIL) {
438617 struct task_struct *task;
439618
440
- task = container_of(node, struct task_struct, wake_q);
619
+ if (sleeper)
620
+ task = container_of(node, struct task_struct, wake_q_sleeper);
621
+ else
622
+ task = container_of(node, struct task_struct, wake_q);
623
+
441624 BUG_ON(!task);
442625 /* Task can safely be re-inserted now: */
443626 node = node->next;
444
- task->wake_q.next = NULL;
627
+ task->wake_q_count = head->count;
628
+ if (sleeper)
629
+ task->wake_q_sleeper.next = NULL;
630
+ else
631
+ task->wake_q.next = NULL;
445632
446633 /*
447
- * try_to_wake_up() executes a full barrier, which pairs with
634
+ * wake_up_process() executes a full barrier, which pairs with
448635 * the queueing in wake_q_add() so as not to miss wakeups.
449636 */
450
- try_to_wake_up(task, TASK_NORMAL, 0, head->count);
637
+ if (sleeper)
638
+ wake_up_lock_sleeper(task);
639
+ else
640
+ wake_up_process(task);
641
+
642
+ task->wake_q_count = 0;
451643 put_task_struct(task);
452644 }
453645 }
@@ -477,15 +669,54 @@
477669 return;
478670 }
479671
480
-#ifdef CONFIG_PREEMPT
481672 if (set_nr_and_not_polling(curr))
482
-#else
483
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
484
-#endif
485673 smp_send_reschedule(cpu);
486674 else
487675 trace_sched_wake_idle_without_ipi(cpu);
488676 }
677
+EXPORT_SYMBOL_GPL(resched_curr);
678
+
679
+#ifdef CONFIG_PREEMPT_LAZY
680
+
681
+static int tsk_is_polling(struct task_struct *p)
682
+{
683
+#ifdef TIF_POLLING_NRFLAG
684
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
685
+#else
686
+ return 0;
687
+#endif
688
+}
689
+
690
+void resched_curr_lazy(struct rq *rq)
691
+{
692
+ struct task_struct *curr = rq->curr;
693
+ int cpu;
694
+
695
+ if (!sched_feat(PREEMPT_LAZY)) {
696
+ resched_curr(rq);
697
+ return;
698
+ }
699
+
700
+ lockdep_assert_held(&rq->lock);
701
+
702
+ if (test_tsk_need_resched(curr))
703
+ return;
704
+
705
+ if (test_tsk_need_resched_lazy(curr))
706
+ return;
707
+
708
+ set_tsk_need_resched_lazy(curr);
709
+
710
+ cpu = cpu_of(rq);
711
+ if (cpu == smp_processor_id())
712
+ return;
713
+
714
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
715
+ smp_mb();
716
+ if (!tsk_is_polling(curr))
717
+ smp_send_reschedule(cpu);
718
+}
719
+#endif
489720
490721 void resched_cpu(int cpu)
491722 {
@@ -510,27 +741,49 @@
510741 */
511742 int get_nohz_timer_target(void)
512743 {
513
- int i, cpu = smp_processor_id();
744
+ int i, cpu = smp_processor_id(), default_cpu = -1;
514745 struct sched_domain *sd;
515746
516
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
517
- return cpu;
747
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
748
+ if (!idle_cpu(cpu))
749
+ return cpu;
750
+ default_cpu = cpu;
751
+ }
518752
519753 rcu_read_lock();
520754 for_each_domain(cpu, sd) {
521
- for_each_cpu(i, sched_domain_span(sd)) {
755
+ for_each_cpu_and(i, sched_domain_span(sd),
756
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
522757 if (cpu == i)
523758 continue;
524759
525
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
760
+ if (!idle_cpu(i)) {
526761 cpu = i;
527762 goto unlock;
528763 }
529764 }
530765 }
531766
532
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
533
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
767
+ if (default_cpu == -1) {
768
+ for_each_cpu_and(i, cpu_active_mask,
769
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
770
+ if (cpu == i)
771
+ continue;
772
+
773
+ if (!idle_cpu(i)) {
774
+ cpu = i;
775
+ goto unlock;
776
+ }
777
+ }
778
+
779
+ /* no active, not-idle, housekeeping CPU found. */
780
+ default_cpu = cpumask_any(cpu_active_mask);
781
+
782
+ if (unlikely(default_cpu >= nr_cpu_ids))
783
+ goto unlock;
784
+ }
785
+
786
+ cpu = default_cpu;
534787 unlock:
535788 rcu_read_unlock();
536789 return cpu;
@@ -590,29 +843,23 @@
590843 wake_up_idle_cpu(cpu);
591844 }
592845
593
-static inline bool got_nohz_idle_kick(void)
846
+static void nohz_csd_func(void *info)
594847 {
595
- int cpu = smp_processor_id();
596
-
597
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
598
- return false;
599
-
600
- if (idle_cpu(cpu) && !need_resched())
601
- return true;
848
+ struct rq *rq = info;
849
+ int cpu = cpu_of(rq);
850
+ unsigned int flags;
602851
603852 /*
604
- * We can't run Idle Load Balance on this CPU for this time so we
605
- * cancel it and clear NOHZ_BALANCE_KICK
853
+ * Release the rq::nohz_csd.
606854 */
607
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
608
- return false;
609
-}
855
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
856
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
610857
611
-#else /* CONFIG_NO_HZ_COMMON */
612
-
613
-static inline bool got_nohz_idle_kick(void)
614
-{
615
- return false;
858
+ rq->idle_balance = idle_cpu(cpu);
859
+ if (rq->idle_balance && !need_resched()) {
860
+ rq->nohz_idle_balance = flags;
861
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
862
+ }
616863 }
617864
618865 #endif /* CONFIG_NO_HZ_COMMON */
@@ -703,18 +950,18 @@
703950 }
704951 #endif
705952
706
-static void set_load_weight(struct task_struct *p, bool update_load)
953
+static void set_load_weight(struct task_struct *p)
707954 {
955
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
708956 int prio = p->static_prio - MAX_RT_PRIO;
709957 struct load_weight *load = &p->se.load;
710958
711959 /*
712960 * SCHED_IDLE tasks get minimal weight:
713961 */
714
- if (idle_policy(p->policy)) {
962
+ if (task_has_idle_policy(p)) {
715963 load->weight = scale_load(WEIGHT_IDLEPRIO);
716964 load->inv_weight = WMULT_IDLEPRIO;
717
- p->se.runnable_weight = load->weight;
718965 return;
719966 }
720967
@@ -727,7 +974,6 @@
727974 } else {
728975 load->weight = scale_load(sched_prio_to_weight[prio]);
729976 load->inv_weight = sched_prio_to_wmult[prio];
730
- p->se.runnable_weight = load->weight;
731977 }
732978 }
733979
@@ -750,8 +996,46 @@
750996 /* Max allowed maximum utilization */
751997 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
752998
999
+/*
1000
+ * By default RT tasks run at the maximum performance point/capacity of the
1001
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1002
+ * SCHED_CAPACITY_SCALE.
1003
+ *
1004
+ * This knob allows admins to change the default behavior when uclamp is being
1005
+ * used. In battery powered devices, particularly, running at the maximum
1006
+ * capacity and frequency will increase energy consumption and shorten the
1007
+ * battery life.
1008
+ *
1009
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
1010
+ *
1011
+ * This knob will not override the system default sched_util_clamp_min defined
1012
+ * above.
1013
+ */
1014
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1015
+
7531016 /* All clamps are required to be less or equal than these values */
7541017 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1018
+
1019
+/*
1020
+ * This static key is used to reduce the uclamp overhead in the fast path. It
1021
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
1022
+ * enqueue/dequeue_task().
1023
+ *
1024
+ * This allows users to continue to enable uclamp in their kernel config with
1025
+ * minimum uclamp overhead in the fast path.
1026
+ *
1027
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
1028
+ * enabled, since we have actual users that make use of uclamp
1029
+ * functionality.
1030
+ *
1031
+ * The knobs that would enable this static key are:
1032
+ *
1033
+ * * A task modifying its uclamp value with sched_setattr().
1034
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1035
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
1036
+ */
1037
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1038
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
7551039
7561040 /* Integer rounded range for each bucket */
7571041 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -762,11 +1046,6 @@
7621046 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
7631047 {
7641048 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
765
-}
766
-
767
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
768
-{
769
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
7701049 }
7711050
7721051 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -832,12 +1111,79 @@
8321111 return uclamp_idle_value(rq, clamp_id, clamp_value);
8331112 }
8341113
1114
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1115
+{
1116
+ unsigned int default_util_min;
1117
+ struct uclamp_se *uc_se;
1118
+
1119
+ lockdep_assert_held(&p->pi_lock);
1120
+
1121
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1122
+
1123
+ /* Only sync if user didn't override the default */
1124
+ if (uc_se->user_defined)
1125
+ return;
1126
+
1127
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1128
+ uclamp_se_set(uc_se, default_util_min, false);
1129
+}
1130
+
1131
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1132
+{
1133
+ struct rq_flags rf;
1134
+ struct rq *rq;
1135
+
1136
+ if (!rt_task(p))
1137
+ return;
1138
+
1139
+ /* Protect updates to p->uclamp_* */
1140
+ rq = task_rq_lock(p, &rf);
1141
+ __uclamp_update_util_min_rt_default(p);
1142
+ task_rq_unlock(rq, p, &rf);
1143
+}
1144
+
1145
+static void uclamp_sync_util_min_rt_default(void)
1146
+{
1147
+ struct task_struct *g, *p;
1148
+
1149
+ /*
1150
+ * copy_process() sysctl_uclamp
1151
+ * uclamp_min_rt = X;
1152
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1153
+ * // link thread smp_mb__after_spinlock()
1154
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1155
+ * sched_post_fork() for_each_process_thread()
1156
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1157
+ *
1158
+ * Ensures that either sched_post_fork() will observe the new
1159
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1160
+ * task.
1161
+ */
1162
+ read_lock(&tasklist_lock);
1163
+ smp_mb__after_spinlock();
1164
+ read_unlock(&tasklist_lock);
1165
+
1166
+ rcu_read_lock();
1167
+ for_each_process_thread(g, p)
1168
+ uclamp_update_util_min_rt_default(p);
1169
+ rcu_read_unlock();
1170
+}
1171
+
1172
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1173
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1174
+{
1175
+ uclamp_sync_util_min_rt_default();
1176
+}
1177
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1178
+#endif
1179
+
8351180 static inline struct uclamp_se
8361181 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8371182 {
1183
+ /* Copy by value as we could modify it */
8381184 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8391185 #ifdef CONFIG_UCLAMP_TASK_GROUP
840
- struct uclamp_se uc_max;
1186
+ unsigned int tg_min, tg_max, value;
8411187
8421188 /*
8431189 * Tasks in autogroups or root task group will be
@@ -848,9 +1194,11 @@
8481194 if (task_group(p) == &root_task_group)
8491195 return uc_req;
8501196
851
- uc_max = task_group(p)->uclamp[clamp_id];
852
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
853
- return uc_max;
1197
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1198
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1199
+ value = uc_req.value;
1200
+ value = clamp(value, tg_min, tg_max);
1201
+ uclamp_se_set(&uc_req, value, false);
8541202 #endif
8551203
8561204 return uc_req;
@@ -869,6 +1217,12 @@
8691217 {
8701218 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
8711219 struct uclamp_se uc_max = uclamp_default[clamp_id];
1220
+ struct uclamp_se uc_eff;
1221
+ int ret = 0;
1222
+
1223
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1224
+ if (ret)
1225
+ return uc_eff;
8721226
8731227 /* System default restrictions always apply */
8741228 if (unlikely(uc_req.value > uc_max.value))
@@ -889,6 +1243,7 @@
8891243
8901244 return (unsigned long)uc_eff.value;
8911245 }
1246
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
8921247
8931248 /*
8941249 * When a task is enqueued on a rq, the clamp bucket currently defined by the
@@ -949,10 +1304,38 @@
9491304
9501305 lockdep_assert_held(&rq->lock);
9511306
1307
+ /*
1308
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1309
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1310
+ *
1311
+ * In this case the uc_se->active flag should be false since no uclamp
1312
+ * accounting was performed at enqueue time and we can just return
1313
+ * here.
1314
+ *
1315
+ * Need to be careful of the following enqeueue/dequeue ordering
1316
+ * problem too
1317
+ *
1318
+ * enqueue(taskA)
1319
+ * // sched_uclamp_used gets enabled
1320
+ * enqueue(taskB)
1321
+ * dequeue(taskA)
1322
+ * // Must not decrement bukcet->tasks here
1323
+ * dequeue(taskB)
1324
+ *
1325
+ * where we could end up with stale data in uc_se and
1326
+ * bucket[uc_se->bucket_id].
1327
+ *
1328
+ * The following check here eliminates the possibility of such race.
1329
+ */
1330
+ if (unlikely(!uc_se->active))
1331
+ return;
1332
+
9521333 bucket = &uc_rq->bucket[uc_se->bucket_id];
1334
+
9531335 SCHED_WARN_ON(!bucket->tasks);
9541336 if (likely(bucket->tasks))
9551337 bucket->tasks--;
1338
+
9561339 uc_se->active = false;
9571340
9581341 /*
@@ -980,6 +1363,15 @@
9801363 {
9811364 enum uclamp_id clamp_id;
9821365
1366
+ /*
1367
+ * Avoid any overhead until uclamp is actually used by the userspace.
1368
+ *
1369
+ * The condition is constructed such that a NOP is generated when
1370
+ * sched_uclamp_used is disabled.
1371
+ */
1372
+ if (!static_branch_unlikely(&sched_uclamp_used))
1373
+ return;
1374
+
9831375 if (unlikely(!p->sched_class->uclamp_enabled))
9841376 return;
9851377
@@ -995,6 +1387,15 @@
9951387 {
9961388 enum uclamp_id clamp_id;
9971389
1390
+ /*
1391
+ * Avoid any overhead until uclamp is actually used by the userspace.
1392
+ *
1393
+ * The condition is constructed such that a NOP is generated when
1394
+ * sched_uclamp_used is disabled.
1395
+ */
1396
+ if (!static_branch_unlikely(&sched_uclamp_used))
1397
+ return;
1398
+
9981399 if (unlikely(!p->sched_class->uclamp_enabled))
9991400 return;
10001401
@@ -1002,9 +1403,27 @@
10021403 uclamp_rq_dec_id(rq, p, clamp_id);
10031404 }
10041405
1005
-static inline void
1006
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1406
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1407
+ enum uclamp_id clamp_id)
10071408 {
1409
+ if (!p->uclamp[clamp_id].active)
1410
+ return;
1411
+
1412
+ uclamp_rq_dec_id(rq, p, clamp_id);
1413
+ uclamp_rq_inc_id(rq, p, clamp_id);
1414
+
1415
+ /*
1416
+ * Make sure to clear the idle flag if we've transiently reached 0
1417
+ * active tasks on rq.
1418
+ */
1419
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1420
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1421
+}
1422
+
1423
+static inline void
1424
+uclamp_update_active(struct task_struct *p)
1425
+{
1426
+ enum uclamp_id clamp_id;
10081427 struct rq_flags rf;
10091428 struct rq *rq;
10101429
@@ -1024,30 +1443,22 @@
10241443 * affecting a valid clamp bucket, the next time it's enqueued,
10251444 * it will already see the updated clamp bucket value.
10261445 */
1027
- if (p->uclamp[clamp_id].active) {
1028
- uclamp_rq_dec_id(rq, p, clamp_id);
1029
- uclamp_rq_inc_id(rq, p, clamp_id);
1030
- }
1446
+ for_each_clamp_id(clamp_id)
1447
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10311448
10321449 task_rq_unlock(rq, p, &rf);
10331450 }
10341451
10351452 #ifdef CONFIG_UCLAMP_TASK_GROUP
10361453 static inline void
1037
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1038
- unsigned int clamps)
1454
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10391455 {
1040
- enum uclamp_id clamp_id;
10411456 struct css_task_iter it;
10421457 struct task_struct *p;
10431458
10441459 css_task_iter_start(css, 0, &it);
1045
- while ((p = css_task_iter_next(&it))) {
1046
- for_each_clamp_id(clamp_id) {
1047
- if ((0x1 << clamp_id) & clamps)
1048
- uclamp_update_active(p, clamp_id);
1049
- }
1050
- }
1460
+ while ((p = css_task_iter_next(&it)))
1461
+ uclamp_update_active(p);
10511462 css_task_iter_end(&it);
10521463 }
10531464
@@ -1070,16 +1481,16 @@
10701481 #endif
10711482
10721483 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1073
- void __user *buffer, size_t *lenp,
1074
- loff_t *ppos)
1484
+ void *buffer, size_t *lenp, loff_t *ppos)
10751485 {
10761486 bool update_root_tg = false;
1077
- int old_min, old_max;
1487
+ int old_min, old_max, old_min_rt;
10781488 int result;
10791489
10801490 mutex_lock(&uclamp_mutex);
10811491 old_min = sysctl_sched_uclamp_util_min;
10821492 old_max = sysctl_sched_uclamp_util_max;
1493
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
10831494
10841495 result = proc_dointvec(table, write, buffer, lenp, ppos);
10851496 if (result)
@@ -1088,7 +1499,9 @@
10881499 goto done;
10891500
10901501 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1091
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1502
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1503
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1504
+
10921505 result = -EINVAL;
10931506 goto undo;
10941507 }
@@ -1104,8 +1517,15 @@
11041517 update_root_tg = true;
11051518 }
11061519
1107
- if (update_root_tg)
1520
+ if (update_root_tg) {
1521
+ static_branch_enable(&sched_uclamp_used);
11081522 uclamp_update_root_tg();
1523
+ }
1524
+
1525
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1526
+ static_branch_enable(&sched_uclamp_used);
1527
+ uclamp_sync_util_min_rt_default();
1528
+ }
11091529
11101530 /*
11111531 * We update all RUNNABLE tasks only when task groups are in use.
@@ -1118,6 +1538,7 @@
11181538 undo:
11191539 sysctl_sched_uclamp_util_min = old_min;
11201540 sysctl_sched_uclamp_util_max = old_max;
1541
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11211542 done:
11221543 mutex_unlock(&uclamp_mutex);
11231544
@@ -1127,20 +1548,61 @@
11271548 static int uclamp_validate(struct task_struct *p,
11281549 const struct sched_attr *attr)
11291550 {
1130
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1131
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1551
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1552
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11321553
1133
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1134
- lower_bound = attr->sched_util_min;
1135
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1136
- upper_bound = attr->sched_util_max;
1554
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1555
+ util_min = attr->sched_util_min;
11371556
1138
- if (lower_bound > upper_bound)
1557
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1558
+ return -EINVAL;
1559
+ }
1560
+
1561
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1562
+ util_max = attr->sched_util_max;
1563
+
1564
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1565
+ return -EINVAL;
1566
+ }
1567
+
1568
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11391569 return -EINVAL;
1140
- if (upper_bound > SCHED_CAPACITY_SCALE)
1141
- return -EINVAL;
1570
+
1571
+ /*
1572
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1573
+ *
1574
+ * We need to do that here, because enabling static branches is a
1575
+ * blocking operation which obviously cannot be done while holding
1576
+ * scheduler locks.
1577
+ */
1578
+ static_branch_enable(&sched_uclamp_used);
11421579
11431580 return 0;
1581
+}
1582
+
1583
+static bool uclamp_reset(const struct sched_attr *attr,
1584
+ enum uclamp_id clamp_id,
1585
+ struct uclamp_se *uc_se)
1586
+{
1587
+ /* Reset on sched class change for a non user-defined clamp value. */
1588
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1589
+ !uc_se->user_defined)
1590
+ return true;
1591
+
1592
+ /* Reset on sched_util_{min,max} == -1. */
1593
+ if (clamp_id == UCLAMP_MIN &&
1594
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1595
+ attr->sched_util_min == -1) {
1596
+ return true;
1597
+ }
1598
+
1599
+ if (clamp_id == UCLAMP_MAX &&
1600
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1601
+ attr->sched_util_max == -1) {
1602
+ return true;
1603
+ }
1604
+
1605
+ return false;
11441606 }
11451607
11461608 static void __setscheduler_uclamp(struct task_struct *p,
@@ -1148,40 +1610,41 @@
11481610 {
11491611 enum uclamp_id clamp_id;
11501612
1151
- /*
1152
- * On scheduling class change, reset to default clamps for tasks
1153
- * without a task-specific value.
1154
- */
11551613 for_each_clamp_id(clamp_id) {
11561614 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1157
- unsigned int clamp_value = uclamp_none(clamp_id);
1615
+ unsigned int value;
11581616
1159
- /* Keep using defined clamps across class changes */
1160
- if (uc_se->user_defined)
1617
+ if (!uclamp_reset(attr, clamp_id, uc_se))
11611618 continue;
11621619
1163
- /* By default, RT tasks always get 100% boost */
1164
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1165
- unlikely(rt_task(p) &&
1166
- clamp_id == UCLAMP_MIN)) {
1620
+ /*
1621
+ * RT by default have a 100% boost value that could be modified
1622
+ * at runtime.
1623
+ */
1624
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1625
+ value = sysctl_sched_uclamp_util_min_rt_default;
1626
+ else
1627
+ value = uclamp_none(clamp_id);
11671628
1168
- clamp_value = uclamp_none(UCLAMP_MAX);
1169
- }
1629
+ uclamp_se_set(uc_se, value, false);
11701630
1171
- uclamp_se_set(uc_se, clamp_value, false);
11721631 }
11731632
11741633 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
11751634 return;
11761635
1177
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1636
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1637
+ attr->sched_util_min != -1) {
11781638 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
11791639 attr->sched_util_min, true);
1640
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
11801641 }
11811642
1182
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1643
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1644
+ attr->sched_util_max != -1) {
11831645 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
11841646 attr->sched_util_max, true);
1647
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
11851648 }
11861649 }
11871650
@@ -1189,6 +1652,10 @@
11891652 {
11901653 enum uclamp_id clamp_id;
11911654
1655
+ /*
1656
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1657
+ * as the task is still at its early fork stages.
1658
+ */
11921659 for_each_clamp_id(clamp_id)
11931660 p->uclamp[clamp_id].active = false;
11941661
@@ -1201,39 +1668,24 @@
12011668 }
12021669 }
12031670
1204
-#ifdef CONFIG_SMP
1205
-unsigned int uclamp_task(struct task_struct *p)
1671
+static void uclamp_post_fork(struct task_struct *p)
12061672 {
1207
- unsigned long util;
1208
-
1209
- util = task_util_est(p);
1210
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1211
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1212
-
1213
- return util;
1673
+ uclamp_update_util_min_rt_default(p);
12141674 }
12151675
1216
-bool uclamp_boosted(struct task_struct *p)
1676
+static void __init init_uclamp_rq(struct rq *rq)
12171677 {
1218
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1678
+ enum uclamp_id clamp_id;
1679
+ struct uclamp_rq *uc_rq = rq->uclamp;
1680
+
1681
+ for_each_clamp_id(clamp_id) {
1682
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1683
+ .value = uclamp_none(clamp_id)
1684
+ };
1685
+ }
1686
+
1687
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12191688 }
1220
-
1221
-bool uclamp_latency_sensitive(struct task_struct *p)
1222
-{
1223
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1224
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1225
- struct task_group *tg;
1226
-
1227
- if (!css)
1228
- return false;
1229
- tg = container_of(css, struct task_group, css);
1230
-
1231
- return tg->latency_sensitive;
1232
-#else
1233
- return false;
1234
-#endif
1235
-}
1236
-#endif /* CONFIG_SMP */
12371689
12381690 static void __init init_uclamp(void)
12391691 {
@@ -1241,13 +1693,8 @@
12411693 enum uclamp_id clamp_id;
12421694 int cpu;
12431695
1244
- mutex_init(&uclamp_mutex);
1245
-
1246
- for_each_possible_cpu(cpu) {
1247
- memset(&cpu_rq(cpu)->uclamp, 0,
1248
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1249
- cpu_rq(cpu)->uclamp_flags = 0;
1250
- }
1696
+ for_each_possible_cpu(cpu)
1697
+ init_uclamp_rq(cpu_rq(cpu));
12511698
12521699 for_each_clamp_id(clamp_id) {
12531700 uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1276,41 +1723,7 @@
12761723 static void __setscheduler_uclamp(struct task_struct *p,
12771724 const struct sched_attr *attr) { }
12781725 static inline void uclamp_fork(struct task_struct *p) { }
1279
-
1280
-long schedtune_task_margin(struct task_struct *task);
1281
-
1282
-#ifdef CONFIG_SMP
1283
-unsigned int uclamp_task(struct task_struct *p)
1284
-{
1285
- unsigned long util = task_util_est(p);
1286
-#ifdef CONFIG_SCHED_TUNE
1287
- long margin = schedtune_task_margin(p);
1288
-
1289
- trace_sched_boost_task(p, util, margin);
1290
-
1291
- util += margin;
1292
-#endif
1293
-
1294
- return util;
1295
-}
1296
-
1297
-bool uclamp_boosted(struct task_struct *p)
1298
-{
1299
-#ifdef CONFIG_SCHED_TUNE
1300
- return schedtune_task_boost(p) > 0;
1301
-#endif
1302
- return false;
1303
-}
1304
-
1305
-bool uclamp_latency_sensitive(struct task_struct *p)
1306
-{
1307
-#ifdef CONFIG_SCHED_TUNE
1308
- return schedtune_prefer_idle(p) != 0;
1309
-#endif
1310
- return false;
1311
-}
1312
-#endif /* CONFIG_SMP */
1313
-
1726
+static inline void uclamp_post_fork(struct task_struct *p) { }
13141727 static inline void init_uclamp(void) { }
13151728 #endif /* CONFIG_UCLAMP_TASK */
13161729
@@ -1325,7 +1738,9 @@
13251738 }
13261739
13271740 uclamp_rq_inc(rq, p);
1741
+ trace_android_rvh_enqueue_task(rq, p, flags);
13281742 p->sched_class->enqueue_task(rq, p, flags);
1743
+ trace_android_rvh_after_enqueue_task(rq, p);
13291744 }
13301745
13311746 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1339,31 +1754,39 @@
13391754 }
13401755
13411756 uclamp_rq_dec(rq, p);
1757
+ trace_android_rvh_dequeue_task(rq, p, flags);
13421758 p->sched_class->dequeue_task(rq, p, flags);
1759
+ trace_android_rvh_after_dequeue_task(rq, p);
13431760 }
13441761
13451762 void activate_task(struct rq *rq, struct task_struct *p, int flags)
13461763 {
1347
- if (task_contributes_to_load(p))
1348
- rq->nr_uninterruptible--;
1349
-
13501764 enqueue_task(rq, p, flags);
1765
+
1766
+ p->on_rq = TASK_ON_RQ_QUEUED;
13511767 }
1768
+EXPORT_SYMBOL_GPL(activate_task);
13521769
13531770 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
13541771 {
1355
- if (task_contributes_to_load(p))
1356
- rq->nr_uninterruptible++;
1772
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
13571773
13581774 dequeue_task(rq, p, flags);
13591775 }
1776
+EXPORT_SYMBOL_GPL(deactivate_task);
13601777
1361
-/*
1362
- * __normal_prio - return the priority that is based on the static prio
1363
- */
1364
-static inline int __normal_prio(struct task_struct *p)
1778
+static inline int __normal_prio(int policy, int rt_prio, int nice)
13651779 {
1366
- return p->static_prio;
1780
+ int prio;
1781
+
1782
+ if (dl_policy(policy))
1783
+ prio = MAX_DL_PRIO - 1;
1784
+ else if (rt_policy(policy))
1785
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1786
+ else
1787
+ prio = NICE_TO_PRIO(nice);
1788
+
1789
+ return prio;
13671790 }
13681791
13691792 /*
@@ -1375,15 +1798,7 @@
13751798 */
13761799 static inline int normal_prio(struct task_struct *p)
13771800 {
1378
- int prio;
1379
-
1380
- if (task_has_dl_policy(p))
1381
- prio = MAX_DL_PRIO-1;
1382
- else if (task_has_rt_policy(p))
1383
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1384
- else
1385
- prio = __normal_prio(p);
1386
- return prio;
1801
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
13871802 }
13881803
13891804 /*
@@ -1439,20 +1854,10 @@
14391854
14401855 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
14411856 {
1442
- const struct sched_class *class;
1443
-
1444
- if (p->sched_class == rq->curr->sched_class) {
1857
+ if (p->sched_class == rq->curr->sched_class)
14451858 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1446
- } else {
1447
- for_each_class(class) {
1448
- if (class == rq->curr->sched_class)
1449
- break;
1450
- if (class == p->sched_class) {
1451
- resched_curr(rq);
1452
- break;
1453
- }
1454
- }
1455
- }
1859
+ else if (p->sched_class > rq->curr->sched_class)
1860
+ resched_curr(rq);
14561861
14571862 /*
14581863 * A queue event has occurred, and we're going to schedule. In
@@ -1461,33 +1866,102 @@
14611866 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
14621867 rq_clock_skip_update(rq);
14631868 }
1869
+EXPORT_SYMBOL_GPL(check_preempt_curr);
14641870
14651871 #ifdef CONFIG_SMP
14661872
1467
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1873
+static void
1874
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
1875
+
1876
+static int __set_cpus_allowed_ptr(struct task_struct *p,
1877
+ const struct cpumask *new_mask,
1878
+ u32 flags);
1879
+
1880
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
14681881 {
1469
- if (!(p->flags & PF_KTHREAD))
1470
- return false;
1882
+ if (likely(!p->migration_disabled))
1883
+ return;
14711884
1472
- if (p->nr_cpus_allowed != 1)
1473
- return false;
1885
+ if (p->cpus_ptr != &p->cpus_mask)
1886
+ return;
14741887
1475
- return true;
1888
+ /*
1889
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
1890
+ */
1891
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
1892
+}
1893
+
1894
+void migrate_disable(void)
1895
+{
1896
+ struct task_struct *p = current;
1897
+
1898
+ if (p->migration_disabled) {
1899
+ p->migration_disabled++;
1900
+ return;
1901
+ }
1902
+
1903
+ trace_sched_migrate_disable_tp(p);
1904
+
1905
+ preempt_disable();
1906
+ this_rq()->nr_pinned++;
1907
+ p->migration_disabled = 1;
1908
+ preempt_lazy_disable();
1909
+ preempt_enable();
1910
+}
1911
+EXPORT_SYMBOL_GPL(migrate_disable);
1912
+
1913
+void migrate_enable(void)
1914
+{
1915
+ struct task_struct *p = current;
1916
+
1917
+ if (p->migration_disabled > 1) {
1918
+ p->migration_disabled--;
1919
+ return;
1920
+ }
1921
+
1922
+ /*
1923
+ * Ensure stop_task runs either before or after this, and that
1924
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
1925
+ */
1926
+ preempt_disable();
1927
+ if (p->cpus_ptr != &p->cpus_mask)
1928
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
1929
+ /*
1930
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
1931
+ * regular cpus_mask, otherwise things that race (eg.
1932
+ * select_fallback_rq) get confused.
1933
+ */
1934
+ barrier();
1935
+ p->migration_disabled = 0;
1936
+ this_rq()->nr_pinned--;
1937
+ preempt_lazy_enable();
1938
+ preempt_enable();
1939
+
1940
+ trace_sched_migrate_enable_tp(p);
1941
+}
1942
+EXPORT_SYMBOL_GPL(migrate_enable);
1943
+
1944
+static inline bool rq_has_pinned_tasks(struct rq *rq)
1945
+{
1946
+ return rq->nr_pinned;
14761947 }
14771948
14781949 /*
1479
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1950
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
14801951 * __set_cpus_allowed_ptr() and select_fallback_rq().
14811952 */
14821953 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
14831954 {
1484
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
1955
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
14851956 return false;
14861957
1487
- if (is_per_cpu_kthread(p))
1958
+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
14881959 return cpu_online(cpu);
14891960
1490
- return cpu_active(cpu);
1961
+ if (!cpu_active(cpu))
1962
+ return false;
1963
+
1964
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
14911965 }
14921966
14931967 /*
@@ -1512,27 +1986,50 @@
15121986 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15131987 struct task_struct *p, int new_cpu)
15141988 {
1989
+ int detached = 0;
1990
+
15151991 lockdep_assert_held(&rq->lock);
15161992
1517
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1518
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1519
- set_task_cpu(p, new_cpu);
1520
- rq_unlock(rq, rf);
1993
+ /*
1994
+ * The vendor hook may drop the lock temporarily, so
1995
+ * pass the rq flags to unpin lock. We expect the
1996
+ * rq lock to be held after return.
1997
+ */
1998
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1999
+ if (detached)
2000
+ goto attach;
15212001
2002
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2003
+ set_task_cpu(p, new_cpu);
2004
+
2005
+attach:
2006
+ rq_unlock(rq, rf);
15222007 rq = cpu_rq(new_cpu);
15232008
15242009 rq_lock(rq, rf);
15252010 BUG_ON(task_cpu(p) != new_cpu);
1526
- enqueue_task(rq, p, 0);
1527
- p->on_rq = TASK_ON_RQ_QUEUED;
2011
+ activate_task(rq, p, 0);
15282012 check_preempt_curr(rq, p, 0);
15292013
15302014 return rq;
15312015 }
15322016
15332017 struct migration_arg {
1534
- struct task_struct *task;
1535
- int dest_cpu;
2018
+ struct task_struct *task;
2019
+ int dest_cpu;
2020
+ struct set_affinity_pending *pending;
2021
+};
2022
+
2023
+/*
2024
+ * @refs: number of wait_for_completion()
2025
+ * @stop_pending: is @stop_work in use
2026
+ */
2027
+struct set_affinity_pending {
2028
+ refcount_t refs;
2029
+ unsigned int stop_pending;
2030
+ struct completion done;
2031
+ struct cpu_stop_work stop_work;
2032
+ struct migration_arg arg;
15362033 };
15372034
15382035 /*
@@ -1565,39 +2062,141 @@
15652062 static int migration_cpu_stop(void *data)
15662063 {
15672064 struct migration_arg *arg = data;
2065
+ struct set_affinity_pending *pending = arg->pending;
15682066 struct task_struct *p = arg->task;
15692067 struct rq *rq = this_rq();
2068
+ bool complete = false;
15702069 struct rq_flags rf;
15712070
15722071 /*
15732072 * The original target CPU might have gone down and we might
15742073 * be on another CPU but it doesn't matter.
15752074 */
1576
- local_irq_disable();
2075
+ local_irq_save(rf.flags);
15772076 /*
15782077 * We need to explicitly wake pending tasks before running
1579
- * __migrate_task() such that we will not miss enforcing cpus_allowed
2078
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
15802079 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
15812080 */
1582
- sched_ttwu_pending();
2081
+ flush_smp_call_function_from_idle();
15832082
15842083 raw_spin_lock(&p->pi_lock);
15852084 rq_lock(rq, &rf);
2085
+
15862086 /*
15872087 * If task_rq(p) != rq, it cannot be migrated here, because we're
15882088 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
15892089 * we're holding p->pi_lock.
15902090 */
15912091 if (task_rq(p) == rq) {
2092
+ if (is_migration_disabled(p))
2093
+ goto out;
2094
+
2095
+ if (pending) {
2096
+ if (p->migration_pending == pending)
2097
+ p->migration_pending = NULL;
2098
+ complete = true;
2099
+
2100
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2101
+ goto out;
2102
+ }
2103
+
15922104 if (task_on_rq_queued(p))
15932105 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
15942106 else
15952107 p->wake_cpu = arg->dest_cpu;
1596
- }
1597
- rq_unlock(rq, &rf);
1598
- raw_spin_unlock(&p->pi_lock);
15992108
1600
- local_irq_enable();
2109
+ /*
2110
+ * XXX __migrate_task() can fail, at which point we might end
2111
+ * up running on a dodgy CPU, AFAICT this can only happen
2112
+ * during CPU hotplug, at which point we'll get pushed out
2113
+ * anyway, so it's probably not a big deal.
2114
+ */
2115
+
2116
+ } else if (pending) {
2117
+ /*
2118
+ * This happens when we get migrated between migrate_enable()'s
2119
+ * preempt_enable() and scheduling the stopper task. At that
2120
+ * point we're a regular task again and not current anymore.
2121
+ *
2122
+ * A !PREEMPT kernel has a giant hole here, which makes it far
2123
+ * more likely.
2124
+ */
2125
+
2126
+ /*
2127
+ * The task moved before the stopper got to run. We're holding
2128
+ * ->pi_lock, so the allowed mask is stable - if it got
2129
+ * somewhere allowed, we're done.
2130
+ */
2131
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2132
+ if (p->migration_pending == pending)
2133
+ p->migration_pending = NULL;
2134
+ complete = true;
2135
+ goto out;
2136
+ }
2137
+
2138
+ /*
2139
+ * When migrate_enable() hits a rq mis-match we can't reliably
2140
+ * determine is_migration_disabled() and so have to chase after
2141
+ * it.
2142
+ */
2143
+ WARN_ON_ONCE(!pending->stop_pending);
2144
+ task_rq_unlock(rq, p, &rf);
2145
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2146
+ &pending->arg, &pending->stop_work);
2147
+ return 0;
2148
+ }
2149
+out:
2150
+ if (pending)
2151
+ pending->stop_pending = false;
2152
+ task_rq_unlock(rq, p, &rf);
2153
+
2154
+ if (complete)
2155
+ complete_all(&pending->done);
2156
+
2157
+ return 0;
2158
+}
2159
+
2160
+int push_cpu_stop(void *arg)
2161
+{
2162
+ struct rq *lowest_rq = NULL, *rq = this_rq();
2163
+ struct task_struct *p = arg;
2164
+
2165
+ raw_spin_lock_irq(&p->pi_lock);
2166
+ raw_spin_lock(&rq->lock);
2167
+
2168
+ if (task_rq(p) != rq)
2169
+ goto out_unlock;
2170
+
2171
+ if (is_migration_disabled(p)) {
2172
+ p->migration_flags |= MDF_PUSH;
2173
+ goto out_unlock;
2174
+ }
2175
+
2176
+ p->migration_flags &= ~MDF_PUSH;
2177
+
2178
+ if (p->sched_class->find_lock_rq)
2179
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
2180
+
2181
+ if (!lowest_rq)
2182
+ goto out_unlock;
2183
+
2184
+ // XXX validate p is still the highest prio task
2185
+ if (task_rq(p) == rq) {
2186
+ deactivate_task(rq, p, 0);
2187
+ set_task_cpu(p, lowest_rq->cpu);
2188
+ activate_task(lowest_rq, p, 0);
2189
+ resched_curr(lowest_rq);
2190
+ }
2191
+
2192
+ double_unlock_balance(rq, lowest_rq);
2193
+
2194
+out_unlock:
2195
+ rq->push_busy = false;
2196
+ raw_spin_unlock(&rq->lock);
2197
+ raw_spin_unlock_irq(&p->pi_lock);
2198
+
2199
+ put_task_struct(p);
16012200 return 0;
16022201 }
16032202
@@ -1605,18 +2204,40 @@
16052204 * sched_class::set_cpus_allowed must do the below, but is not required to
16062205 * actually call this function.
16072206 */
1608
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
2207
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16092208 {
1610
- cpumask_copy(&p->cpus_allowed, new_mask);
2209
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2210
+ p->cpus_ptr = new_mask;
2211
+ return;
2212
+ }
2213
+
2214
+ cpumask_copy(&p->cpus_mask, new_mask);
16112215 p->nr_cpus_allowed = cpumask_weight(new_mask);
2216
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16122217 }
16132218
1614
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2219
+static void
2220
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16152221 {
16162222 struct rq *rq = task_rq(p);
16172223 bool queued, running;
16182224
1619
- lockdep_assert_held(&p->pi_lock);
2225
+ /*
2226
+ * This here violates the locking rules for affinity, since we're only
2227
+ * supposed to change these variables while holding both rq->lock and
2228
+ * p->pi_lock.
2229
+ *
2230
+ * HOWEVER, it magically works, because ttwu() is the only code that
2231
+ * accesses these variables under p->pi_lock and only does so after
2232
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2233
+ * before finish_task().
2234
+ *
2235
+ * XXX do further audits, this smells like something putrid.
2236
+ */
2237
+ if (flags & SCA_MIGRATE_DISABLE)
2238
+ SCHED_WARN_ON(!p->on_cpu);
2239
+ else
2240
+ lockdep_assert_held(&p->pi_lock);
16202241
16212242 queued = task_on_rq_queued(p);
16222243 running = task_current(rq, p);
@@ -1632,12 +2253,312 @@
16322253 if (running)
16332254 put_prev_task(rq, p);
16342255
1635
- p->sched_class->set_cpus_allowed(p, new_mask);
2256
+ p->sched_class->set_cpus_allowed(p, new_mask, flags);
16362257
16372258 if (queued)
16382259 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
16392260 if (running)
1640
- set_curr_task(rq, p);
2261
+ set_next_task(rq, p);
2262
+}
2263
+
2264
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2265
+ int dest_cpu, unsigned int flags);
2266
+/*
2267
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
2268
+ */
2269
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2270
+ const struct cpumask *new_mask,
2271
+ u32 flags,
2272
+ struct rq *rq,
2273
+ struct rq_flags *rf)
2274
+{
2275
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2276
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2277
+ unsigned int dest_cpu;
2278
+ int ret = 0;
2279
+
2280
+ update_rq_clock(rq);
2281
+
2282
+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2283
+ /*
2284
+ * Kernel threads are allowed on online && !active CPUs.
2285
+ *
2286
+ * Specifically, migration_disabled() tasks must not fail the
2287
+ * cpumask_any_and_distribute() pick below, esp. so on
2288
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2289
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2290
+ */
2291
+ cpu_valid_mask = cpu_online_mask;
2292
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2293
+ ret = -EINVAL;
2294
+ goto out;
2295
+ }
2296
+
2297
+ /*
2298
+ * Must re-check here, to close a race against __kthread_bind(),
2299
+ * sched_setaffinity() is not guaranteed to observe the flag.
2300
+ */
2301
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2302
+ ret = -EINVAL;
2303
+ goto out;
2304
+ }
2305
+
2306
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2307
+ if (cpumask_equal(&p->cpus_mask, new_mask))
2308
+ goto out;
2309
+
2310
+ if (WARN_ON_ONCE(p == current &&
2311
+ is_migration_disabled(p) &&
2312
+ !cpumask_test_cpu(task_cpu(p), new_mask))) {
2313
+ ret = -EBUSY;
2314
+ goto out;
2315
+ }
2316
+ }
2317
+
2318
+ /*
2319
+ * Picking a ~random cpu helps in cases where we are changing affinity
2320
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2321
+ * immediately required to distribute the tasks within their new mask.
2322
+ */
2323
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2324
+ if (dest_cpu >= nr_cpu_ids) {
2325
+ ret = -EINVAL;
2326
+ goto out;
2327
+ }
2328
+
2329
+ __do_set_cpus_allowed(p, new_mask, flags);
2330
+
2331
+ if (p->flags & PF_KTHREAD) {
2332
+ /*
2333
+ * For kernel threads that do indeed end up on online &&
2334
+ * !active we want to ensure they are strict per-CPU threads.
2335
+ */
2336
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2337
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
2338
+ p->nr_cpus_allowed != 1);
2339
+ }
2340
+
2341
+ return affine_move_task(rq, p, rf, dest_cpu, flags);
2342
+out:
2343
+ task_rq_unlock(rq, p, rf);
2344
+
2345
+ return ret;
2346
+}
2347
+
2348
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2349
+{
2350
+ __do_set_cpus_allowed(p, new_mask, 0);
2351
+}
2352
+
2353
+/*
2354
+ * This function is wildly self concurrent; here be dragons.
2355
+ *
2356
+ *
2357
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2358
+ * designated task is enqueued on an allowed CPU. If that task is currently
2359
+ * running, we have to kick it out using the CPU stopper.
2360
+ *
2361
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
2362
+ * Consider:
2363
+ *
2364
+ * Initial conditions: P0->cpus_mask = [0, 1]
2365
+ *
2366
+ * P0@CPU0 P1
2367
+ *
2368
+ * migrate_disable();
2369
+ * <preempted>
2370
+ * set_cpus_allowed_ptr(P0, [1]);
2371
+ *
2372
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2373
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2374
+ * This means we need the following scheme:
2375
+ *
2376
+ * P0@CPU0 P1
2377
+ *
2378
+ * migrate_disable();
2379
+ * <preempted>
2380
+ * set_cpus_allowed_ptr(P0, [1]);
2381
+ * <blocks>
2382
+ * <resumes>
2383
+ * migrate_enable();
2384
+ * __set_cpus_allowed_ptr();
2385
+ * <wakes local stopper>
2386
+ * `--> <woken on migration completion>
2387
+ *
2388
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2389
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2390
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
2391
+ * should come into effect at the end of the Migrate-Disable region is the last
2392
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2393
+ * but we still need to properly signal those waiting tasks at the appropriate
2394
+ * moment.
2395
+ *
2396
+ * This is implemented using struct set_affinity_pending. The first
2397
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2398
+ * set up an instance of that struct and install it on the targeted task_struct.
2399
+ * Any and all further callers will reuse that instance. Those then wait for
2400
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
2401
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2402
+ *
2403
+ *
2404
+ * (1) In the cases covered above. There is one more where the completion is
2405
+ * signaled within affine_move_task() itself: when a subsequent affinity request
2406
+ * cancels the need for an active migration. Consider:
2407
+ *
2408
+ * Initial conditions: P0->cpus_mask = [0, 1]
2409
+ *
2410
+ * P0@CPU0 P1 P2
2411
+ *
2412
+ * migrate_disable();
2413
+ * <preempted>
2414
+ * set_cpus_allowed_ptr(P0, [1]);
2415
+ * <blocks>
2416
+ * set_cpus_allowed_ptr(P0, [0, 1]);
2417
+ * <signal completion>
2418
+ * <awakes>
2419
+ *
2420
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
2421
+ * pending affinity completion is preceded by an uninstallation of
2422
+ * p->migration_pending done with p->pi_lock held.
2423
+ */
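For orientation, the pending-affinity bookkeeping the comment above refers to has roughly this shape; the field list is a sketch inferred from how affine_move_task() below uses it, not the authoritative definition:

struct set_affinity_pending {
	refcount_t		refs;		/* waiters still interested in the outcome */
	unsigned int		stop_pending;	/* a migration_cpu_stop() is already queued */
	struct completion	done;		/* completed once the task sits on an allowed CPU */
	struct cpu_stop_work	stop_work;	/* work item handed to the CPU stopper */
	struct migration_arg	arg;		/* .task, .dest_cpu and a back-pointer to this pending */
};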
2424
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2425
+ int dest_cpu, unsigned int flags)
2426
+{
2427
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
2428
+ bool stop_pending, complete = false;
2429
+
2430
+ /* Can the task run on the task's current CPU? If so, we're done */
2431
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2432
+ struct task_struct *push_task = NULL;
2433
+
2434
+ if ((flags & SCA_MIGRATE_ENABLE) &&
2435
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2436
+ rq->push_busy = true;
2437
+ push_task = get_task_struct(p);
2438
+ }
2439
+
2440
+ /*
2441
+ * If there are pending waiters, but no pending stop_work,
2442
+ * then complete now.
2443
+ */
2444
+ pending = p->migration_pending;
2445
+ if (pending && !pending->stop_pending) {
2446
+ p->migration_pending = NULL;
2447
+ complete = true;
2448
+ }
2449
+
2450
+ task_rq_unlock(rq, p, rf);
2451
+
2452
+ if (push_task) {
2453
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2454
+ p, &rq->push_work);
2455
+ }
2456
+
2457
+ if (complete)
2458
+ complete_all(&pending->done);
2459
+
2460
+ return 0;
2461
+ }
2462
+
2463
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2464
+ /* serialized by p->pi_lock */
2465
+ if (!p->migration_pending) {
2466
+ /* Install the request */
2467
+ refcount_set(&my_pending.refs, 1);
2468
+ init_completion(&my_pending.done);
2469
+ my_pending.arg = (struct migration_arg) {
2470
+ .task = p,
2471
+ .dest_cpu = dest_cpu,
2472
+ .pending = &my_pending,
2473
+ };
2474
+
2475
+ p->migration_pending = &my_pending;
2476
+ } else {
2477
+ pending = p->migration_pending;
2478
+ refcount_inc(&pending->refs);
2479
+ /*
2480
+ * Affinity has changed, but we've already installed a
2481
+ * pending. migration_cpu_stop() *must* see this, else
2482
+ * we risk a completion of the pending despite having a
2483
+ * task on a disallowed CPU.
2484
+ *
2485
+ * Serialized by p->pi_lock, so this is safe.
2486
+ */
2487
+ pending->arg.dest_cpu = dest_cpu;
2488
+ }
2489
+ }
2490
+ pending = p->migration_pending;
2491
+ /*
2492
+ * - !MIGRATE_ENABLE:
2493
+ * we'll have installed a pending if there wasn't one already.
2494
+ *
2495
+ * - MIGRATE_ENABLE:
2496
+ * we're here because the current CPU isn't matching anymore,
2497
+ * the only way that can happen is because of a concurrent
2498
+ * set_cpus_allowed_ptr() call, which should then still be
2499
+ * pending completion.
2500
+ *
2501
+ * Either way, we really should have a @pending here.
2502
+ */
2503
+ if (WARN_ON_ONCE(!pending)) {
2504
+ task_rq_unlock(rq, p, rf);
2505
+ return -EINVAL;
2506
+ }
2507
+
2508
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
2509
+ /*
2510
+ * MIGRATE_ENABLE gets here because 'p == current', but for
2511
+ * anything else we cannot do is_migration_disabled(), punt
2512
+ * and have the stopper function handle it all race-free.
2513
+ */
2514
+ stop_pending = pending->stop_pending;
2515
+ if (!stop_pending)
2516
+ pending->stop_pending = true;
2517
+
2518
+ if (flags & SCA_MIGRATE_ENABLE)
2519
+ p->migration_flags &= ~MDF_PUSH;
2520
+
2521
+ task_rq_unlock(rq, p, rf);
2522
+
2523
+ if (!stop_pending) {
2524
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2525
+ &pending->arg, &pending->stop_work);
2526
+ }
2527
+
2528
+ if (flags & SCA_MIGRATE_ENABLE)
2529
+ return 0;
2530
+ } else {
2531
+
2532
+ if (!is_migration_disabled(p)) {
2533
+ if (task_on_rq_queued(p))
2534
+ rq = move_queued_task(rq, rf, p, dest_cpu);
2535
+
2536
+ if (!pending->stop_pending) {
2537
+ p->migration_pending = NULL;
2538
+ complete = true;
2539
+ }
2540
+ }
2541
+ task_rq_unlock(rq, p, rf);
2542
+
2543
+ if (complete)
2544
+ complete_all(&pending->done);
2545
+ }
2546
+
2547
+ wait_for_completion(&pending->done);
2548
+
2549
+ if (refcount_dec_and_test(&pending->refs))
2550
+ wake_up_var(&pending->refs); /* No UaF, just an address */
2551
+
2552
+ /*
2553
+ * Block the original owner of &pending until all subsequent callers
2554
+ * have seen the completion and decremented the refcount
2555
+ */
2556
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2557
+
2558
+ /* ARGH */
2559
+ WARN_ON_ONCE(my_pending.stop_pending);
2560
+
2561
+ return 0;
16412562 }
16422563
16432564 /*
....@@ -1650,83 +2571,89 @@
16502571 * call is not atomic; no spinlocks may be held.
16512572 */
16522573 static int __set_cpus_allowed_ptr(struct task_struct *p,
1653
- const struct cpumask *new_mask, bool check)
2574
+ const struct cpumask *new_mask,
2575
+ u32 flags)
16542576 {
1655
- const struct cpumask *cpu_valid_mask = cpu_active_mask;
1656
- unsigned int dest_cpu;
16572577 struct rq_flags rf;
16582578 struct rq *rq;
1659
- int ret = 0;
16602579
16612580 rq = task_rq_lock(p, &rf);
1662
- update_rq_clock(rq);
1663
-
1664
- if (p->flags & PF_KTHREAD) {
1665
- /*
1666
- * Kernel threads are allowed on online && !active CPUs
1667
- */
1668
- cpu_valid_mask = cpu_online_mask;
1669
- }
1670
-
1671
- /*
1672
- * Must re-check here, to close a race against __kthread_bind(),
1673
- * sched_setaffinity() is not guaranteed to observe the flag.
1674
- */
1675
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
1676
- ret = -EINVAL;
1677
- goto out;
1678
- }
1679
-
1680
- if (cpumask_equal(&p->cpus_allowed, new_mask))
1681
- goto out;
1682
-
1683
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1684
- if (dest_cpu >= nr_cpu_ids) {
1685
- ret = -EINVAL;
1686
- goto out;
1687
- }
1688
-
1689
- do_set_cpus_allowed(p, new_mask);
1690
-
1691
- if (p->flags & PF_KTHREAD) {
1692
- /*
1693
- * For kernel threads that do indeed end up on online &&
1694
- * !active we want to ensure they are strict per-CPU threads.
1695
- */
1696
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1697
- !cpumask_intersects(new_mask, cpu_active_mask) &&
1698
- p->nr_cpus_allowed != 1);
1699
- }
1700
-
1701
- /* Can the task run on the task's current CPU? If so, we're done */
1702
- if (cpumask_test_cpu(task_cpu(p), new_mask))
1703
- goto out;
1704
-
1705
- if (task_running(rq, p) || p->state == TASK_WAKING) {
1706
- struct migration_arg arg = { p, dest_cpu };
1707
- /* Need help from migration thread: drop lock and wait. */
1708
- task_rq_unlock(rq, p, &rf);
1709
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1710
- tlb_migrate_finish(p->mm);
1711
- return 0;
1712
- } else if (task_on_rq_queued(p)) {
1713
- /*
1714
- * OK, since we're going to drop the lock immediately
1715
- * afterwards anyway.
1716
- */
1717
- rq = move_queued_task(rq, &rf, p, dest_cpu);
1718
- }
1719
-out:
1720
- task_rq_unlock(rq, p, &rf);
1721
-
1722
- return ret;
2581
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
17232582 }
17242583
17252584 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
17262585 {
1727
- return __set_cpus_allowed_ptr(p, new_mask, false);
2586
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
17282587 }
17292588 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
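For reference, a minimal (hypothetical) caller of the exported interface; the CPU numbers and the pin_worker() wrapper are made up for illustration:

/* Hypothetical example: pin a worker kthread to CPUs 2 and 3. */
static int pin_worker(struct task_struct *worker)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(2, mask);
	cpumask_set_cpu(3, mask);

	/* Blocks until @worker is enqueued on (or running on) an allowed CPU. */
	ret = set_cpus_allowed_ptr(worker, mask);

	free_cpumask_var(mask);
	return ret;
}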
2589
+
2590
+/*
2591
+ * Change a given task's CPU affinity to the intersection of its current
2592
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2593
+ * If the resulting mask is empty, leave the affinity unchanged and return
2594
+ * -EINVAL.
2595
+ */
2596
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2597
+ struct cpumask *new_mask,
2598
+ const struct cpumask *subset_mask)
2599
+{
2600
+ struct rq_flags rf;
2601
+ struct rq *rq;
2602
+
2603
+ rq = task_rq_lock(p, &rf);
2604
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2605
+ task_rq_unlock(rq, p, &rf);
2606
+ return -EINVAL;
2607
+ }
2608
+
2609
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
2610
+}
2611
+
2612
+/*
2613
+ * Restrict a given task's CPU affinity so that it is a subset of
2614
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2615
+ * walk up the cpuset hierarchy until we find a suitable mask.
2616
+ */
2617
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2618
+{
2619
+ cpumask_var_t new_mask;
2620
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2621
+
2622
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2623
+
2624
+ /*
2625
+ * __migrate_task() can fail silently in the face of concurrent
2626
+ * offlining of the chosen destination CPU, so take the hotplug
2627
+ * lock to ensure that the migration succeeds.
2628
+ */
2629
+ trace_android_rvh_force_compatible_pre(NULL);
2630
+ cpus_read_lock();
2631
+ if (!cpumask_available(new_mask))
2632
+ goto out_set_mask;
2633
+
2634
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2635
+ goto out_free_mask;
2636
+
2637
+ /*
2638
+ * We failed to find a valid subset of the affinity mask for the
2639
+ * task, so override it based on its cpuset hierarchy.
2640
+ */
2641
+ cpuset_cpus_allowed(p, new_mask);
2642
+ override_mask = new_mask;
2643
+
2644
+out_set_mask:
2645
+ if (printk_ratelimit()) {
2646
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2647
+ task_pid_nr(p), p->comm,
2648
+ cpumask_pr_args(override_mask));
2649
+ }
2650
+
2651
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2652
+out_free_mask:
2653
+ cpus_read_unlock();
2654
+ trace_android_rvh_force_compatible_post(NULL);
2655
+ free_cpumask_var(new_mask);
2656
+}
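A hypothetical call site for the helper above; the wrapper and the trigger condition are illustrative only (the real callers live in architecture code):

/*
 * Hypothetical example: the task has entered an execution mode that only a
 * subset of CPUs supports, so clamp its affinity to task_cpu_possible_mask().
 */
static void clamp_task_affinity(struct task_struct *p)
{
	if (!cpumask_subset(p->cpus_ptr, task_cpu_possible_mask(p)))
		force_compatible_cpus_allowed_ptr(p);
}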
17302657
17312658 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
17322659 {
....@@ -1765,6 +2692,8 @@
17652692 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
17662693 */
17672694 WARN_ON_ONCE(!cpu_online(new_cpu));
2695
+
2696
+ WARN_ON_ONCE(is_migration_disabled(p));
17682697 #endif
17692698
17702699 trace_sched_migrate_task(p, new_cpu);
....@@ -1775,12 +2704,13 @@
17752704 p->se.nr_migrations++;
17762705 rseq_migrate(p);
17772706 perf_event_task_migrate(p);
2707
+ trace_android_rvh_set_task_cpu(p, new_cpu);
17782708 }
17792709
17802710 __set_task_cpu(p, new_cpu);
17812711 }
2712
+EXPORT_SYMBOL_GPL(set_task_cpu);
17822713
1783
-#ifdef CONFIG_NUMA_BALANCING
17842714 static void __migrate_swap_task(struct task_struct *p, int cpu)
17852715 {
17862716 if (task_on_rq_queued(p)) {
....@@ -1793,11 +2723,9 @@
17932723 rq_pin_lock(src_rq, &srf);
17942724 rq_pin_lock(dst_rq, &drf);
17952725
1796
- p->on_rq = TASK_ON_RQ_MIGRATING;
17972726 deactivate_task(src_rq, p, 0);
17982727 set_task_cpu(p, cpu);
17992728 activate_task(dst_rq, p, 0);
1800
- p->on_rq = TASK_ON_RQ_QUEUED;
18012729 check_preempt_curr(dst_rq, p, 0);
18022730
18032731 rq_unpin_lock(dst_rq, &drf);
....@@ -1840,10 +2768,10 @@
18402768 if (task_cpu(arg->src_task) != arg->src_cpu)
18412769 goto unlock;
18422770
1843
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
2771
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
18442772 goto unlock;
18452773
1846
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
2774
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
18472775 goto unlock;
18482776
18492777 __migrate_swap_task(arg->src_task, arg->dst_cpu);
....@@ -1885,10 +2813,10 @@
18852813 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
18862814 goto out;
18872815
1888
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
2816
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
18892817 goto out;
18902818
1891
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
2819
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
18922820 goto out;
18932821
18942822 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
....@@ -1897,7 +2825,19 @@
18972825 out:
18982826 return ret;
18992827 }
1900
-#endif /* CONFIG_NUMA_BALANCING */
2828
+EXPORT_SYMBOL_GPL(migrate_swap);
2829
+
2830
+static bool check_task_state(struct task_struct *p, long match_state)
2831
+{
2832
+ bool match = false;
2833
+
2834
+ raw_spin_lock_irq(&p->pi_lock);
2835
+ if (p->state == match_state || p->saved_state == match_state)
2836
+ match = true;
2837
+ raw_spin_unlock_irq(&p->pi_lock);
2838
+
2839
+ return match;
2840
+}
19012841
19022842 /*
19032843 * wait_task_inactive - wait for a thread to unschedule.
....@@ -1943,7 +2883,7 @@
19432883 * is actually now running somewhere else!
19442884 */
19452885 while (task_running(rq, p)) {
1946
- if (match_state && unlikely(p->state != match_state))
2886
+ if (match_state && !check_task_state(p, match_state))
19472887 return 0;
19482888 cpu_relax();
19492889 }
....@@ -1958,7 +2898,8 @@
19582898 running = task_running(rq, p);
19592899 queued = task_on_rq_queued(p);
19602900 ncsw = 0;
1961
- if (!match_state || p->state == match_state)
2901
+ if (!match_state || p->state == match_state ||
2902
+ p->saved_state == match_state)
19622903 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
19632904 task_rq_unlock(rq, p, &rf);
19642905
....@@ -1992,7 +2933,7 @@
19922933 ktime_t to = NSEC_PER_SEC / HZ;
19932934
19942935 set_current_state(TASK_UNINTERRUPTIBLE);
1995
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2936
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
19962937 continue;
19972938 }
19982939
....@@ -2033,7 +2974,7 @@
20332974 EXPORT_SYMBOL_GPL(kick_process);
20342975
20352976 /*
2036
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2977
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
20372978 *
20382979 * A few notes on cpu_active vs cpu_online:
20392980 *
....@@ -2059,7 +3000,11 @@
20593000 int nid = cpu_to_node(cpu);
20603001 const struct cpumask *nodemask = NULL;
20613002 enum { cpuset, possible, fail } state = cpuset;
2062
- int dest_cpu;
3003
+ int dest_cpu = -1;
3004
+
3005
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
3006
+ if (dest_cpu >= 0)
3007
+ return dest_cpu;
20633008
20643009 /*
20653010 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2071,16 +3016,14 @@
20713016
20723017 /* Look for allowed, online CPU in same node. */
20733018 for_each_cpu(dest_cpu, nodemask) {
2074
- if (!cpu_active(dest_cpu))
2075
- continue;
2076
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
3019
+ if (is_cpu_allowed(p, dest_cpu))
20773020 return dest_cpu;
20783021 }
20793022 }
20803023
20813024 for (;;) {
20823025 /* Any allowed, online CPU? */
2083
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
3026
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
20843027 if (!is_cpu_allowed(p, dest_cpu))
20853028 continue;
20863029
....@@ -2095,12 +3038,17 @@
20953038 state = possible;
20963039 break;
20973040 }
2098
- /* Fall-through */
3041
+ fallthrough;
20993042 case possible:
2100
- do_set_cpus_allowed(p, cpu_possible_mask);
3043
+ /*
3044
+ * XXX When called from select_task_rq() we only
3045
+ * hold p->pi_lock and again violate locking order.
3046
+ *
3047
+ * More yuck to audit.
3048
+ */
3049
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21013050 state = fail;
21023051 break;
2103
-
21043052 case fail:
21053053 BUG();
21063054 break;
....@@ -2124,23 +3072,21 @@
21243072 }
21253073
21263074 /*
2127
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
3075
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21283076 */
21293077 static inline
2130
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2131
- int sibling_count_hint)
3078
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21323079 {
21333080 lockdep_assert_held(&p->pi_lock);
21343081
2135
- if (p->nr_cpus_allowed > 1)
2136
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2137
- sibling_count_hint);
3082
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3083
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21383084 else
2139
- cpu = cpumask_any(&p->cpus_allowed);
3085
+ cpu = cpumask_any(p->cpus_ptr);
21403086
21413087 /*
21423088 * In order not to call set_task_cpu() on a blocking task we need
2143
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
3089
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
21443090 * CPU.
21453091 *
21463092 * Since this is common to all placement strategies, this lives here.
....@@ -2154,14 +3100,9 @@
21543100 return cpu;
21553101 }
21563102
2157
-static void update_avg(u64 *avg, u64 sample)
2158
-{
2159
- s64 diff = sample - *avg;
2160
- *avg += diff >> 3;
2161
-}
2162
-
21633103 void sched_set_stop_task(int cpu, struct task_struct *stop)
21643104 {
3105
+ static struct lock_class_key stop_pi_lock;
21653106 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
21663107 struct task_struct *old_stop = cpu_rq(cpu)->stop;
21673108
....@@ -2177,6 +3118,20 @@
21773118 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
21783119
21793120 stop->sched_class = &stop_sched_class;
3121
+
3122
+ /*
3123
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3124
+ * adjust the effective priority of a task. As a result,
3125
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
3126
+ * which can then trigger wakeups of the stop thread to push
3127
+ * around the current task.
3128
+ *
3129
+ * The stop task itself will never be part of the PI-chain, it
3130
+ * never blocks, therefore that ->pi_lock recursion is safe.
3131
+ * Tell lockdep about this by placing the stop->pi_lock in its
3132
+ * own class.
3133
+ */
3134
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
21803135 }
21813136
21823137 cpu_rq(cpu)->stop = stop;
....@@ -2190,15 +3145,23 @@
21903145 }
21913146 }
21923147
2193
-#else
3148
+#else /* CONFIG_SMP */
21943149
21953150 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2196
- const struct cpumask *new_mask, bool check)
3151
+ const struct cpumask *new_mask,
3152
+ u32 flags)
21973153 {
21983154 return set_cpus_allowed_ptr(p, new_mask);
21993155 }
22003156
2201
-#endif /* CONFIG_SMP */
3157
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3158
+
3159
+static inline bool rq_has_pinned_tasks(struct rq *rq)
3160
+{
3161
+ return false;
3162
+}
3163
+
3164
+#endif /* !CONFIG_SMP */
22023165
22033166 static void
22043167 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
....@@ -2237,16 +3200,6 @@
22373200
22383201 if (wake_flags & WF_SYNC)
22393202 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2240
-}
2241
-
2242
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2243
-{
2244
- activate_task(rq, p, en_flags);
2245
- p->on_rq = TASK_ON_RQ_QUEUED;
2246
-
2247
- /* If a worker is waking up, notify the workqueue: */
2248
- if (p->flags & PF_WQ_WORKER)
2249
- wq_worker_waking_up(p, cpu_of(rq));
22503203 }
22513204
22523205 /*
....@@ -2290,27 +3243,54 @@
22903243 {
22913244 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
22923245
3246
+ if (wake_flags & WF_SYNC)
3247
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
3248
+
22933249 lockdep_assert_held(&rq->lock);
22943250
2295
-#ifdef CONFIG_SMP
22963251 if (p->sched_contributes_to_load)
22973252 rq->nr_uninterruptible--;
22983253
3254
+#ifdef CONFIG_SMP
22993255 if (wake_flags & WF_MIGRATED)
23003256 en_flags |= ENQUEUE_MIGRATED;
3257
+ else
23013258 #endif
3259
+ if (p->in_iowait) {
3260
+ delayacct_blkio_end(p);
3261
+ atomic_dec(&task_rq(p)->nr_iowait);
3262
+ }
23023263
2303
- ttwu_activate(rq, p, en_flags);
3264
+ activate_task(rq, p, en_flags);
23043265 ttwu_do_wakeup(rq, p, wake_flags, rf);
23053266 }
23063267
23073268 /*
2308
- * Called in case the task @p isn't fully descheduled from its runqueue,
2309
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2310
- * since all we need to do is flip p->state to TASK_RUNNING, since
2311
- * the task is still ->on_rq.
3269
+ * Consider @p being inside a wait loop:
3270
+ *
3271
+ * for (;;) {
3272
+ * set_current_state(TASK_UNINTERRUPTIBLE);
3273
+ *
3274
+ * if (CONDITION)
3275
+ * break;
3276
+ *
3277
+ * schedule();
3278
+ * }
3279
+ * __set_current_state(TASK_RUNNING);
3280
+ *
3281
+ * A wakeup that arrives between set_current_state() and schedule() finds @p still
3281
+ * runnable, so all that needs doing is to change p->state back to TASK_RUNNING in
3283
+ * an atomic manner.
3284
+ *
3285
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3286
+ * then schedule() must still happen and p->state can be changed to
3287
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3288
+ * need to do a full wakeup with enqueue.
3289
+ *
3290
+ * Returns: %true when the wakeup is done,
3291
+ * %false otherwise.
23123292 */
2313
-static int ttwu_remote(struct task_struct *p, int wake_flags)
3293
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23143294 {
23153295 struct rq_flags rf;
23163296 struct rq *rq;
....@@ -2329,75 +3309,63 @@
23293309 }
23303310
23313311 #ifdef CONFIG_SMP
2332
-void sched_ttwu_pending(void)
3312
+void sched_ttwu_pending(void *arg)
23333313 {
3314
+ struct llist_node *llist = arg;
23343315 struct rq *rq = this_rq();
2335
- struct llist_node *llist = llist_del_all(&rq->wake_list);
23363316 struct task_struct *p, *t;
23373317 struct rq_flags rf;
23383318
23393319 if (!llist)
23403320 return;
23413321
3322
+ /*
3323
+ * rq::ttwu_pending is a racy indication of outstanding wakeups.
3324
+ * Races such that false-negatives are possible, since they
3325
+ * are shorter lived than false-positives would be.
3326
+ */
3327
+ WRITE_ONCE(rq->ttwu_pending, 0);
3328
+
23423329 rq_lock_irqsave(rq, &rf);
23433330 update_rq_clock(rq);
23443331
2345
- llist_for_each_entry_safe(p, t, llist, wake_entry)
3332
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3333
+ if (WARN_ON_ONCE(p->on_cpu))
3334
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
3335
+
3336
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3337
+ set_task_cpu(p, cpu_of(rq));
3338
+
23463339 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3340
+ }
23473341
23483342 rq_unlock_irqrestore(rq, &rf);
23493343 }
23503344
2351
-void scheduler_ipi(void)
3345
+void send_call_function_single_ipi(int cpu)
23523346 {
2353
- /*
2354
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2355
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2356
- * this IPI.
2357
- */
2358
- preempt_fold_need_resched();
3347
+ struct rq *rq = cpu_rq(cpu);
23593348
2360
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2361
- return;
2362
-
2363
- /*
2364
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2365
- * traditionally all their work was done from the interrupt return
2366
- * path. Now that we actually do some work, we need to make sure
2367
- * we do call them.
2368
- *
2369
- * Some archs already do call them, luckily irq_enter/exit nest
2370
- * properly.
2371
- *
2372
- * Arguably we should visit all archs and update all handlers,
2373
- * however a fair share of IPIs are still resched only so this would
2374
- * somewhat pessimize the simple resched case.
2375
- */
2376
- irq_enter();
2377
- sched_ttwu_pending();
2378
-
2379
- /*
2380
- * Check if someone kicked us for doing the nohz idle load balance.
2381
- */
2382
- if (unlikely(got_nohz_idle_kick())) {
2383
- this_rq()->idle_balance = 1;
2384
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2385
- }
2386
- irq_exit();
3349
+ if (!set_nr_if_polling(rq->idle))
3350
+ arch_send_call_function_single_ipi(cpu);
3351
+ else
3352
+ trace_sched_wake_idle_without_ipi(cpu);
23873353 }
23883354
2389
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
3355
+/*
3356
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3357
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
3358
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
3359
+ * of the wakeup instead of the waker.
3360
+ */
3361
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
23903362 {
23913363 struct rq *rq = cpu_rq(cpu);
23923364
23933365 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
23943366
2395
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2396
- if (!set_nr_if_polling(rq->idle))
2397
- smp_send_reschedule(cpu);
2398
- else
2399
- trace_sched_wake_idle_without_ipi(cpu);
2400
- }
3367
+ WRITE_ONCE(rq->ttwu_pending, 1);
3368
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24013369 }
24023370
24033371 void wake_up_if_idle(int cpu)
....@@ -2423,6 +3391,7 @@
24233391 out:
24243392 rcu_read_unlock();
24253393 }
3394
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
24263395
24273396 bool cpus_share_cache(int this_cpu, int that_cpu)
24283397 {
....@@ -2431,6 +3400,58 @@
24313400
24323401 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
24333402 }
3403
+
3404
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3405
+{
3406
+ /*
3407
+ * If the CPU does not share cache, then queue the task on the
3408
+ * remote rqs wakelist to avoid accessing remote data.
3409
+ */
3410
+ if (!cpus_share_cache(smp_processor_id(), cpu))
3411
+ return true;
3412
+
3413
+ /*
3414
+ * If the task is descheduling and the only running task on the
3415
+ * CPU then use the wakelist to offload the task activation to
3416
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
3417
+ * nr_running is checked to avoid unnecessary task stacking.
3418
+ *
3419
+ * Note that we can only get here with (wakee) p->on_rq=0,
3420
+ * p->on_cpu can be whatever, we've done the dequeue, so
3421
+ * the wakee has been accounted out of ->nr_running.
3422
+ */
3423
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
3424
+ return true;
3425
+
3426
+ return false;
3427
+}
3428
+
3429
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3430
+{
3431
+ bool cond = false;
3432
+
3433
+ trace_android_rvh_ttwu_cond(&cond);
3434
+
3435
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
3436
+ cond) {
3437
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
3438
+ return false;
3439
+
3440
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3441
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
3442
+ return true;
3443
+ }
3444
+
3445
+ return false;
3446
+}
3447
+
3448
+#else /* !CONFIG_SMP */
3449
+
3450
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3451
+{
3452
+ return false;
3453
+}
3454
+
24343455 #endif /* CONFIG_SMP */
24353456
24363457 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2438,13 +3459,8 @@
24383459 struct rq *rq = cpu_rq(cpu);
24393460 struct rq_flags rf;
24403461
2441
-#if defined(CONFIG_SMP)
2442
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2443
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2444
- ttwu_queue_remote(p, cpu, wake_flags);
3462
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
24453463 return;
2446
- }
2447
-#endif
24483464
24493465 rq_lock(rq, &rf);
24503466 update_rq_clock(rq);
....@@ -2500,8 +3516,8 @@
25003516 * migration. However the means are completely different as there is no lock
25013517 * chain to provide order. Instead we do:
25023518 *
2503
- * 1) smp_store_release(X->on_cpu, 0)
2504
- * 2) smp_cond_load_acquire(!X->on_cpu)
3519
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3520
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25053521 *
25063522 * Example:
25073523 *
....@@ -2540,45 +3556,113 @@
25403556 * @p: the thread to be awakened
25413557 * @state: the mask of task states that can be woken
25423558 * @wake_flags: wake modifier flags (WF_*)
2543
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2544
- * in this event.
25453559 *
2546
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3560
+ * Conceptually does:
3561
+ *
3562
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
25473563 *
25483564 * If the task was not queued/runnable, also place it back on a runqueue.
25493565 *
2550
- * Atomic against schedule() which would dequeue a task, also see
2551
- * set_current_state().
3566
+ * This function is atomic against schedule() which would dequeue the task.
25523567 *
2553
- * This function executes a full memory barrier before accessing the task
2554
- * state; see set_current_state().
3568
+ * It issues a full memory barrier before accessing @p->state, see the comment
3569
+ * with set_current_state().
3570
+ *
3571
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3572
+ *
3573
+ * Relies on p->pi_lock stabilizing:
3574
+ * - p->sched_class
3575
+ * - p->cpus_ptr
3576
+ * - p->sched_task_group
3577
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3578
+ *
3579
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3580
+ * Takes rq->lock in:
3581
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3582
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3583
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3584
+ *
3585
+ * As a consequence we race really badly with just about everything. See the
3586
+ * many memory barriers and their comments for details.
25553587 *
25563588 * Return: %true if @p->state changes (an actual wakeup was done),
25573589 * %false otherwise.
25583590 */
25593591 static int
2560
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2561
- int sibling_count_hint)
3592
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
25623593 {
25633594 unsigned long flags;
25643595 int cpu, success = 0;
25653596
3597
+ preempt_disable();
3598
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) {
3599
+ /*
3600
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3601
+ * == smp_processor_id()'. Together this means we can special
3602
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3603
+ * without taking any locks.
3604
+ *
3605
+ * In particular:
3606
+ * - we rely on Program-Order guarantees for all the ordering,
3607
+ * - we're serialized against set_special_state() by virtue of
3608
+ * it disabling IRQs (this allows not taking ->pi_lock).
3609
+ */
3610
+ if (!(p->state & state))
3611
+ goto out;
3612
+
3613
+ success = 1;
3614
+ trace_sched_waking(p);
3615
+ p->state = TASK_RUNNING;
3616
+ trace_sched_wakeup(p);
3617
+ goto out;
3618
+ }
3619
+
25663620 /*
25673621 * If we are going to wake up a thread waiting for CONDITION we
25683622 * need to ensure that CONDITION=1 done by the caller can not be
2569
- * reordered with p->state check below. This pairs with mb() in
2570
- * set_current_state() the waiting thread does.
3623
+ * reordered with p->state check below. This pairs with smp_store_mb()
3624
+ * in set_current_state() that the waiting thread does.
25713625 */
25723626 raw_spin_lock_irqsave(&p->pi_lock, flags);
25733627 smp_mb__after_spinlock();
2574
- if (!(p->state & state))
2575
- goto out;
3628
+ if (!(p->state & state)) {
3629
+ /*
3630
+ * The task might be running due to a spinlock sleeper
3631
+ * wakeup. Check the saved state and set it to running
3632
+ * if the wakeup condition is true.
3633
+ */
3634
+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
3635
+ if (p->saved_state & state) {
3636
+ p->saved_state = TASK_RUNNING;
3637
+ success = 1;
3638
+ }
3639
+ }
3640
+ goto unlock;
3641
+ }
3642
+ /*
3643
+ * If this is a regular wakeup, then we can unconditionally
3644
+ * clear the saved state of a "lock sleeper".
3645
+ */
3646
+ if (!(wake_flags & WF_LOCK_SLEEPER))
3647
+ p->saved_state = TASK_RUNNING;
3648
+
3649
+#ifdef CONFIG_FREEZER
3650
+ /*
3651
+ * If we're going to wake up a thread which may be frozen, then
3652
+ * we can only do so if we have an active CPU which is capable of
3653
+ * running it. This may not be the case when resuming from suspend,
3654
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3655
+ * for the actual wakeup.
3656
+ */
3657
+ if (unlikely(frozen_or_skipped(p)) &&
3658
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3659
+ goto unlock;
3660
+#endif
25763661
25773662 trace_sched_waking(p);
25783663
25793664 /* We're going to change ->state: */
25803665 success = 1;
2581
- cpu = task_cpu(p);
25823666
25833667 /*
25843668 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2599,10 +3683,15 @@
25993683 *
26003684 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26013685 * __schedule(). See the comment for smp_mb__after_spinlock().
3686
+ *
3687
+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
26023688 */
26033689 smp_rmb();
2604
- if (p->on_rq && ttwu_remote(p, wake_flags))
2605
- goto stat;
3690
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3691
+ goto unlock;
3692
+
3693
+ if (p->state & TASK_UNINTERRUPTIBLE)
3694
+ trace_sched_blocked_reason(p);
26063695
26073696 #ifdef CONFIG_SMP
26083697 /*
....@@ -2623,8 +3712,43 @@
26233712 *
26243713 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26253714 * __schedule(). See the comment for smp_mb__after_spinlock().
3715
+ *
3716
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3717
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3718
+ * care about its own p->state. See the comment in __schedule().
26263719 */
2627
- smp_rmb();
3720
+ smp_acquire__after_ctrl_dep();
3721
+
3722
+ /*
3723
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3724
+ * == 0), which means we need to do an enqueue, change p->state to
3725
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3726
+ * enqueue, such as ttwu_queue_wakelist().
3727
+ */
3728
+ p->state = TASK_WAKING;
3729
+
3730
+ /*
3731
+ * If the owning (remote) CPU is still in the middle of schedule() with
3732
+ * this task as prev, considering queueing p on the remote CPUs wake_list
3733
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3734
+ * let the waker make forward progress. This is safe because IRQs are
3735
+ * disabled and the IPI will deliver after on_cpu is cleared.
3736
+ *
3737
+ * Ensure we load task_cpu(p) after p->on_cpu:
3738
+ *
3739
+ * set_task_cpu(p, cpu);
3740
+ * STORE p->cpu = @cpu
3741
+ * __schedule() (switch to task 'p')
3742
+ * LOCK rq->lock
3743
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3744
+ * STORE p->on_cpu = 1 LOAD p->cpu
3745
+ *
3746
+ * to ensure we observe the correct CPU on which the task is currently
3747
+ * scheduling.
3748
+ */
3749
+ if (smp_load_acquire(&p->on_cpu) &&
3750
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3751
+ goto unlock;
26283752
26293753 /*
26303754 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2637,88 +3761,79 @@
26373761 */
26383762 smp_cond_load_acquire(&p->on_cpu, !VAL);
26393763
2640
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2641
- p->state = TASK_WAKING;
3764
+ trace_android_rvh_try_to_wake_up(p);
26423765
2643
- if (p->in_iowait) {
2644
- delayacct_blkio_end(p);
2645
- atomic_dec(&task_rq(p)->nr_iowait);
2646
- }
2647
-
2648
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2649
- sibling_count_hint);
3766
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
26503767 if (task_cpu(p) != cpu) {
3768
+ if (p->in_iowait) {
3769
+ delayacct_blkio_end(p);
3770
+ atomic_dec(&task_rq(p)->nr_iowait);
3771
+ }
3772
+
26513773 wake_flags |= WF_MIGRATED;
26523774 psi_ttwu_dequeue(p);
26533775 set_task_cpu(p, cpu);
26543776 }
2655
-
2656
-#else /* CONFIG_SMP */
2657
-
2658
- if (p->in_iowait) {
2659
- delayacct_blkio_end(p);
2660
- atomic_dec(&task_rq(p)->nr_iowait);
2661
- }
2662
-
3777
+#else
3778
+ cpu = task_cpu(p);
26633779 #endif /* CONFIG_SMP */
26643780
26653781 ttwu_queue(p, cpu, wake_flags);
2666
-stat:
2667
- ttwu_stat(p, cpu, wake_flags);
2668
-out:
3782
+unlock:
26693783 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3784
+out:
3785
+ if (success) {
3786
+ trace_android_rvh_try_to_wake_up_success(p);
3787
+ ttwu_stat(p, task_cpu(p), wake_flags);
3788
+ }
3789
+ preempt_enable();
26703790
26713791 return success;
26723792 }
26733793
26743794 /**
2675
- * try_to_wake_up_local - try to wake up a local task with rq lock held
2676
- * @p: the thread to be awakened
2677
- * @rf: request-queue flags for pinning
3795
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3796
+ * @p: Process for which the function is to be invoked, can be @current.
3797
+ * @func: Function to invoke.
3798
+ * @arg: Argument to function.
26783799 *
2679
- * Put @p on the run-queue if it's not already there. The caller must
2680
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
2681
- * the current task.
3800
+ * If the specified task can be quickly locked into a definite state
3801
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3802
+ * state while invoking @func(@arg). This function can use ->on_rq and
3803
+ * task_curr() to work out what the state is, if required. Given that
3804
+ * @func can be invoked with a runqueue lock held, it had better be quite
3805
+ * lightweight.
3806
+ *
3807
+ * Returns:
3808
+ * @false if the task slipped out from under the locks.
3809
+ * @true if the task was locked onto a runqueue or is sleeping.
3810
+ * However, @func can override this by returning @false.
26823811 */
2683
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
3812
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
26843813 {
2685
- struct rq *rq = task_rq(p);
3814
+ struct rq_flags rf;
3815
+ bool ret = false;
3816
+ struct rq *rq;
26863817
2687
- if (WARN_ON_ONCE(rq != this_rq()) ||
2688
- WARN_ON_ONCE(p == current))
2689
- return;
2690
-
2691
- lockdep_assert_held(&rq->lock);
2692
-
2693
- if (!raw_spin_trylock(&p->pi_lock)) {
2694
- /*
2695
- * This is OK, because current is on_cpu, which avoids it being
2696
- * picked for load-balance and preemption/IRQs are still
2697
- * disabled avoiding further scheduler activity on it and we've
2698
- * not yet picked a replacement task.
2699
- */
2700
- rq_unlock(rq, rf);
2701
- raw_spin_lock(&p->pi_lock);
2702
- rq_relock(rq, rf);
2703
- }
2704
-
2705
- if (!(p->state & TASK_NORMAL))
2706
- goto out;
2707
-
2708
- trace_sched_waking(p);
2709
-
2710
- if (!task_on_rq_queued(p)) {
2711
- if (p->in_iowait) {
2712
- delayacct_blkio_end(p);
2713
- atomic_dec(&rq->nr_iowait);
3818
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3819
+ if (p->on_rq) {
3820
+ rq = __task_rq_lock(p, &rf);
3821
+ if (task_rq(p) == rq)
3822
+ ret = func(p, arg);
3823
+ rq_unlock(rq, &rf);
3824
+ } else {
3825
+ switch (p->state) {
3826
+ case TASK_RUNNING:
3827
+ case TASK_WAKING:
3828
+ break;
3829
+ default:
3830
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3831
+ if (!p->on_rq)
3832
+ ret = func(p, arg);
27143833 }
2715
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
27163834 }
2717
-
2718
- ttwu_do_wakeup(rq, p, 0, rf);
2719
- ttwu_stat(p, smp_processor_id(), 0);
2720
-out:
2721
- raw_spin_unlock(&p->pi_lock);
3835
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3836
+ return ret;
27223837 }
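A minimal (hypothetical) use of the helper above; the callback and the reporting wrapper are illustrative only:

/* Hypothetical callback: record whether @t is currently queued on a runqueue. */
static bool note_queued(struct task_struct *t, void *arg)
{
	*(bool *)arg = !!t->on_rq;
	return true;	/* keep the "task was locked down" result */
}

static void report_task(struct task_struct *p)
{
	bool queued = false;

	if (try_invoke_on_locked_down_task(p, note_queued, &queued))
		pr_info("pid %d: queued=%d\n", task_pid_nr(p), queued);
}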
27233838
27243839 /**
....@@ -2734,13 +3849,25 @@
27343849 */
27353850 int wake_up_process(struct task_struct *p)
27363851 {
2737
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3852
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27383853 }
27393854 EXPORT_SYMBOL(wake_up_process);
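Paired with the wait-loop pattern documented above ttwu_runnable(), a typical producer/consumer use of wake_up_process() looks roughly like this; the queue type and its helpers are hypothetical:

/* Consumer (hypothetical kthread): sleep until there is work. */
static int consumer_fn(void *data)
{
	struct my_queue *q = data;			/* illustrative type */

	while (!kthread_should_stop()) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (my_queue_empty(q) && !kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);
		my_queue_drain(q);			/* illustrative helper */
	}
	return 0;
}

/* Producer: publish the item first, then wake the consumer. */
static void producer_add(struct my_queue *q, struct item *it)
{
	my_queue_push(q, it);				/* illustrative helper */
	wake_up_process(q->consumer);			/* field name is illustrative */
}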
27403855
3856
+/**
3857
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
3858
+ * @p: The process to be woken up.
3859
+ *
3860
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
3861
+ * the nature of the wakeup.
3862
+ */
3863
+int wake_up_lock_sleeper(struct task_struct *p)
3864
+{
3865
+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
3866
+}
3867
+
27413868 int wake_up_state(struct task_struct *p, unsigned int state)
27423869 {
2743
- return try_to_wake_up(p, state, 0, 1);
3870
+ return try_to_wake_up(p, state, 0);
27443871 }
27453872
27463873 /*
....@@ -2765,6 +3892,8 @@
27653892 p->se.cfs_rq = NULL;
27663893 #endif
27673894
3895
+ trace_android_rvh_sched_fork_init(p);
3896
+
27683897 #ifdef CONFIG_SCHEDSTATS
27693898 /* Even if schedstat is disabled, there should not be garbage */
27703899 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2785,7 +3914,14 @@
27853914 INIT_HLIST_HEAD(&p->preempt_notifiers);
27863915 #endif
27873916
3917
+#ifdef CONFIG_COMPACTION
3918
+ p->capture_control = NULL;
3919
+#endif
27883920 init_numa_balancing(clone_flags, p);
3921
+#ifdef CONFIG_SMP
3922
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3923
+ p->migration_pending = NULL;
3924
+#endif
27893925 }
27903926
27913927 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2802,7 +3938,7 @@
28023938
28033939 #ifdef CONFIG_PROC_SYSCTL
28043940 int sysctl_numa_balancing(struct ctl_table *table, int write,
2805
- void __user *buffer, size_t *lenp, loff_t *ppos)
3941
+ void *buffer, size_t *lenp, loff_t *ppos)
28063942 {
28073943 struct ctl_table t;
28083944 int err;
....@@ -2876,8 +4012,8 @@
28764012 }
28774013
28784014 #ifdef CONFIG_PROC_SYSCTL
2879
-int sysctl_schedstats(struct ctl_table *table, int write,
2880
- void __user *buffer, size_t *lenp, loff_t *ppos)
4015
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4016
+ size_t *lenp, loff_t *ppos)
28814017 {
28824018 struct ctl_table t;
28834019 int err;
....@@ -2905,7 +4041,7 @@
29054041 */
29064042 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29074043 {
2908
- unsigned long flags;
4044
+ trace_android_rvh_sched_fork(p);
29094045
29104046 __sched_fork(clone_flags, p);
29114047 /*
....@@ -2919,6 +4055,7 @@
29194055 * Make sure we do not leak PI boosting priority to the child.
29204056 */
29214057 p->prio = current->normal_prio;
4058
+ trace_android_rvh_prepare_prio_fork(p);
29224059
29234060 uclamp_fork(p);
29244061
....@@ -2933,8 +4070,8 @@
29334070 } else if (PRIO_TO_NICE(p->static_prio) < 0)
29344071 p->static_prio = NICE_TO_PRIO(0);
29354072
2936
- p->prio = p->normal_prio = __normal_prio(p);
2937
- set_load_weight(p, false);
4073
+ p->prio = p->normal_prio = p->static_prio;
4074
+ set_load_weight(p);
29384075
29394076 /*
29404077 * We don't need the reset flag anymore after the fork. It has
....@@ -2951,24 +4088,8 @@
29514088 p->sched_class = &fair_sched_class;
29524089
29534090 init_entity_runnable_average(&p->se);
4091
+ trace_android_rvh_finish_prio_fork(p);
29544092
2955
- /*
2956
- * The child is not yet in the pid-hash so no cgroup attach races,
2957
- * and the cgroup is pinned to this child due to cgroup_fork()
2958
- * is ran before sched_fork().
2959
- *
2960
- * Silence PROVE_RCU.
2961
- */
2962
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2963
- rseq_migrate(p);
2964
- /*
2965
- * We're setting the CPU for the first time, we don't migrate,
2966
- * so use __set_task_cpu().
2967
- */
2968
- __set_task_cpu(p, smp_processor_id());
2969
- if (p->sched_class->task_fork)
2970
- p->sched_class->task_fork(p);
2971
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
29724093
29734094 #ifdef CONFIG_SCHED_INFO
29744095 if (likely(sched_info_on()))
....@@ -2978,11 +4099,49 @@
29784099 p->on_cpu = 0;
29794100 #endif
29804101 init_task_preempt_count(p);
4102
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
4103
+ task_thread_info(p)->preempt_lazy_count = 0;
4104
+#endif
29814105 #ifdef CONFIG_SMP
29824106 plist_node_init(&p->pushable_tasks, MAX_PRIO);
29834107 RB_CLEAR_NODE(&p->pushable_dl_tasks);
29844108 #endif
29854109 return 0;
4110
+}
4111
+
4112
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
4113
+{
4114
+ unsigned long flags;
4115
+
4116
+ /*
4117
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
4118
+ * required yet, but lockdep gets upset if rules are violated.
4119
+ */
4120
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
4121
+#ifdef CONFIG_CGROUP_SCHED
4122
+ if (1) {
4123
+ struct task_group *tg;
4124
+
4125
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4126
+ struct task_group, css);
4127
+ tg = autogroup_task_group(p, tg);
4128
+ p->sched_task_group = tg;
4129
+ }
4130
+#endif
4131
+ rseq_migrate(p);
4132
+ /*
4133
+ * We're setting the CPU for the first time, we don't migrate,
4134
+ * so use __set_task_cpu().
4135
+ */
4136
+ __set_task_cpu(p, smp_processor_id());
4137
+ if (p->sched_class->task_fork)
4138
+ p->sched_class->task_fork(p);
4139
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4140
+}
4141
+
4142
+void sched_post_fork(struct task_struct *p)
4143
+{
4144
+ uclamp_post_fork(p);
29864145 }
29874146
29884147 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3013,12 +4172,14 @@
30134172 struct rq_flags rf;
30144173 struct rq *rq;
30154174
4175
+ trace_android_rvh_wake_up_new_task(p);
4176
+
30164177 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30174178 p->state = TASK_RUNNING;
30184179 #ifdef CONFIG_SMP
30194180 /*
30204181 * Fork balancing, do it here and not earlier because:
3021
- * - cpus_allowed can change in the fork path
4182
+ * - cpus_ptr can change in the fork path
30224183 * - any previously selected CPU might disappear through hotplug
30234184 *
30244185 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
....@@ -3026,14 +4187,14 @@
30264187 */
30274188 p->recent_used_cpu = task_cpu(p);
30284189 rseq_migrate(p);
3029
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
4190
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30304191 #endif
30314192 rq = __task_rq_lock(p, &rf);
30324193 update_rq_clock(rq);
3033
- post_init_entity_util_avg(&p->se);
4194
+ post_init_entity_util_avg(p);
4195
+ trace_android_rvh_new_task_stats(p);
30344196
30354197 activate_task(rq, p, ENQUEUE_NOCLOCK);
3036
- p->on_rq = TASK_ON_RQ_QUEUED;
30374198 trace_sched_wakeup_new(p);
30384199 check_preempt_curr(rq, p, WF_FORK);
30394200 #ifdef CONFIG_SMP
....@@ -3143,8 +4304,10 @@
31434304 /*
31444305 * Claim the task as running, we do this before switching to it
31454306 * such that any running task will have this set.
4307
+ *
4308
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
31464309 */
3147
- next->on_cpu = 1;
4310
+ WRITE_ONCE(next->on_cpu, 1);
31484311 #endif
31494312 }
31504313
....@@ -3152,8 +4315,9 @@
31524315 {
31534316 #ifdef CONFIG_SMP
31544317 /*
3155
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3156
- * We must ensure this doesn't happen until the switch is completely
4318
+ * This must be the very last reference to @prev from this CPU. After
4319
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
4320
+ * must ensure this doesn't happen until the switch is completely
31574321 * finished.
31584322 *
31594323 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3165,6 +4329,90 @@
31654329 #endif
31664330 }
31674331
4332
+#ifdef CONFIG_SMP
4333
+
4334
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4335
+{
4336
+ void (*func)(struct rq *rq);
4337
+ struct callback_head *next;
4338
+
4339
+ lockdep_assert_held(&rq->lock);
4340
+
4341
+ while (head) {
4342
+ func = (void (*)(struct rq *))head->func;
4343
+ next = head->next;
4344
+ head->next = NULL;
4345
+ head = next;
4346
+
4347
+ func(rq);
4348
+ }
4349
+}
4350
+
4351
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4352
+{
4353
+ struct callback_head *head = rq->balance_callback;
4354
+
4355
+ lockdep_assert_held(&rq->lock);
4356
+ if (head) {
4357
+ rq->balance_callback = NULL;
4358
+ rq->balance_flags &= ~BALANCE_WORK;
4359
+ }
4360
+
4361
+ return head;
4362
+}
4363
+
4364
+static void __balance_callbacks(struct rq *rq)
4365
+{
4366
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
4367
+}
4368
+
4369
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4370
+{
4371
+ unsigned long flags;
4372
+
4373
+ if (unlikely(head)) {
4374
+ raw_spin_lock_irqsave(&rq->lock, flags);
4375
+ do_balance_callbacks(rq, head);
4376
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
4377
+ }
4378
+}
4379
+
4380
+static void balance_push(struct rq *rq);
4381
+
4382
+static inline void balance_switch(struct rq *rq)
4383
+{
4384
+ if (likely(!rq->balance_flags))
4385
+ return;
4386
+
4387
+ if (rq->balance_flags & BALANCE_PUSH) {
4388
+ balance_push(rq);
4389
+ return;
4390
+ }
4391
+
4392
+ __balance_callbacks(rq);
4393
+}
4394
+
4395
+#else
4396
+
4397
+static inline void __balance_callbacks(struct rq *rq)
4398
+{
4399
+}
4400
+
4401
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4402
+{
4403
+ return NULL;
4404
+}
4405
+
4406
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4407
+{
4408
+}
4409
+
4410
+static inline void balance_switch(struct rq *rq)
4411
+{
4412
+}
4413
+
4414
+#endif
4415
+
31684416 static inline void
31694417 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
31704418 {
....@@ -3175,7 +4423,7 @@
31754423 * do an early lockdep release here:
31764424 */
31774425 rq_unpin_lock(rq, rf);
3178
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4426
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
31794427 #ifdef CONFIG_DEBUG_SPINLOCK
31804428 /* this is a valid case when another task releases the spinlock */
31814429 rq->lock.owner = next;
....@@ -3190,6 +4438,7 @@
31904438 * prev into current:
31914439 */
31924440 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4441
+ balance_switch(rq);
31934442 raw_spin_unlock_irq(&rq->lock);
31944443 }
31954444
....@@ -3204,6 +4453,22 @@
32044453 #ifndef finish_arch_post_lock_switch
32054454 # define finish_arch_post_lock_switch() do { } while (0)
32064455 #endif
4456
+
4457
+static inline void kmap_local_sched_out(void)
4458
+{
4459
+#ifdef CONFIG_KMAP_LOCAL
4460
+ if (unlikely(current->kmap_ctrl.idx))
4461
+ __kmap_local_sched_out();
4462
+#endif
4463
+}
4464
+
4465
+static inline void kmap_local_sched_in(void)
4466
+{
4467
+#ifdef CONFIG_KMAP_LOCAL
4468
+ if (unlikely(current->kmap_ctrl.idx))
4469
+ __kmap_local_sched_in();
4470
+#endif
4471
+}
32074472
32084473 /**
32094474 * prepare_task_switch - prepare to switch tasks
....@@ -3227,6 +4492,7 @@
32274492 perf_event_task_sched_out(prev, next);
32284493 rseq_preempt(prev);
32294494 fire_sched_out_preempt_notifiers(prev, next);
4495
+ kmap_local_sched_out();
32304496 prepare_task(next);
32314497 prepare_arch_switch(next);
32324498 }
....@@ -3293,6 +4559,7 @@
32934559 finish_lock_switch(rq);
32944560 finish_arch_post_lock_switch();
32954561 kcov_finish_switch(current);
4562
+ kmap_local_sched_in();
32964563
32974564 fire_sched_in_preempt_notifiers(current);
32984565 /*
....@@ -3307,66 +4574,24 @@
33074574 * provided by mmdrop(),
33084575 * - a sync_core for SYNC_CORE.
33094576 */
4577
+ /*
4578
+ * We use mmdrop_delayed() here so we don't have to do the
4579
+ * full __mmdrop() when we are the last user.
4580
+ */
33104581 if (mm) {
33114582 membarrier_mm_sync_core_before_usermode(mm);
3312
- mmdrop(mm);
4583
+ mmdrop_delayed(mm);
33134584 }
33144585 if (unlikely(prev_state == TASK_DEAD)) {
33154586 if (prev->sched_class->task_dead)
33164587 prev->sched_class->task_dead(prev);
33174588
3318
- /*
3319
- * Remove function-return probe instances associated with this
3320
- * task and put them back on the free list.
3321
- */
3322
- kprobe_flush_task(prev);
3323
-
3324
- /* Task is done with its stack. */
3325
- put_task_stack(prev);
3326
-
3327
- put_task_struct(prev);
4589
+ put_task_struct_rcu_user(prev);
33284590 }
33294591
33304592 tick_nohz_task_switch();
33314593 return rq;
33324594 }
3333
-
3334
-#ifdef CONFIG_SMP
3335
-
3336
-/* rq->lock is NOT held, but preemption is disabled */
3337
-static void __balance_callback(struct rq *rq)
3338
-{
3339
- struct callback_head *head, *next;
3340
- void (*func)(struct rq *rq);
3341
- unsigned long flags;
3342
-
3343
- raw_spin_lock_irqsave(&rq->lock, flags);
3344
- head = rq->balance_callback;
3345
- rq->balance_callback = NULL;
3346
- while (head) {
3347
- func = (void (*)(struct rq *))head->func;
3348
- next = head->next;
3349
- head->next = NULL;
3350
- head = next;
3351
-
3352
- func(rq);
3353
- }
3354
- raw_spin_unlock_irqrestore(&rq->lock, flags);
3355
-}
3356
-
3357
-static inline void balance_callback(struct rq *rq)
3358
-{
3359
- if (unlikely(rq->balance_callback))
3360
- __balance_callback(rq);
3361
-}
3362
-
3363
-#else
3364
-
3365
-static inline void balance_callback(struct rq *rq)
3366
-{
3367
-}
3368
-
3369
-#endif
33704595
33714596 /**
33724597 * schedule_tail - first thing a freshly forked thread must call.
....@@ -3387,7 +4612,6 @@
33874612 */
33884613
33894614 rq = finish_task_switch(prev);
3390
- balance_callback(rq);
33914615 preempt_enable();
33924616
33934617 if (current->set_child_tid)
....@@ -3403,12 +4627,8 @@
34034627 context_switch(struct rq *rq, struct task_struct *prev,
34044628 struct task_struct *next, struct rq_flags *rf)
34054629 {
3406
- struct mm_struct *mm, *oldmm;
3407
-
34084630 prepare_task_switch(rq, prev, next);
34094631
3410
- mm = next->mm;
3411
- oldmm = prev->active_mm;
34124632 /*
34134633 * For paravirt, this is coupled with an exit in switch_to to
34144634 * combine the page table reload and the switch backend into
....@@ -3417,22 +4637,37 @@
34174637 arch_start_context_switch(prev);
34184638
34194639 /*
3420
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3421
- * NULL, we will pass through mmdrop() in finish_task_switch().
3422
- * Both of these contain the full memory barrier required by
3423
- * membarrier after storing to rq->curr, before returning to
3424
- * user-space.
4640
+ * kernel -> kernel lazy + transfer active
4641
+ * user -> kernel lazy + mmgrab() active
4642
+ *
4643
+ * kernel -> user switch + mmdrop() active
4644
+ * user -> user switch
34254645 */
3426
- if (!mm) {
3427
- next->active_mm = oldmm;
3428
- mmgrab(oldmm);
3429
- enter_lazy_tlb(oldmm, next);
3430
- } else
3431
- switch_mm_irqs_off(oldmm, mm, next);
4646
+ if (!next->mm) { // to kernel
4647
+ enter_lazy_tlb(prev->active_mm, next);
34324648
3433
- if (!prev->mm) {
3434
- prev->active_mm = NULL;
3435
- rq->prev_mm = oldmm;
4649
+ next->active_mm = prev->active_mm;
4650
+ if (prev->mm) // from user
4651
+ mmgrab(prev->active_mm);
4652
+ else
4653
+ prev->active_mm = NULL;
4654
+ } else { // to user
4655
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4656
+ /*
4657
+ * sys_membarrier() requires an smp_mb() between setting
4658
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4659
+ *
4660
+ * The below provides this either through switch_mm(), or in
4661
+ * case 'prev->active_mm == next->mm' through
4662
+ * finish_task_switch()'s mmdrop().
4663
+ */
4664
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4665
+
4666
+ if (!prev->mm) { // from kernel
4667
+ /* will mmdrop() in finish_task_switch(). */
4668
+ rq->prev_mm = prev->active_mm;
4669
+ prev->active_mm = NULL;
4670
+ }
34364671 }
34374672
34384673 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
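/*
 * A stand-alone model of the four cases listed in the comment above: 'mm' is
 * the task's own address space (NULL for a kernel thread), 'active_mm' is
 * whatever it currently borrows, and 'users' stands in for mm_count.  All
 * names here are invented for the sketch, and the real kernel defers the
 * final drop to finish_task_switch() instead of doing it inline.
 */
#include <stddef.h>

struct mm   { int users; };
struct task { struct mm *mm, *active_mm; };

static void mm_grab(struct mm *mm) { mm->users++; }
static void mm_drop(struct mm *mm) { mm->users--; /* real mmdrop() frees at zero */ }

void switch_mm_refs(struct task *prev, struct task *next)
{
	if (!next->mm) {			/* to kernel */
		next->active_mm = prev->active_mm;
		if (prev->mm)			/* from user: take a reference */
			mm_grab(prev->active_mm);
		else				/* from kernel: transfer the reference */
			prev->active_mm = NULL;
	} else {				/* to user */
		next->active_mm = next->mm;
		if (!prev->mm) {		/* from kernel: give the reference back */
			mm_drop(prev->active_mm);
			prev->active_mm = NULL;
		}
	}
}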
....@@ -3469,7 +4704,7 @@
34694704 * preemption, thus the result might have a time-of-check-to-time-of-use
34704705 * race. The caller is responsible to use it correctly, for example:
34714706 *
3472
- * - from a non-preemptable section (of course)
4707
+ * - from a non-preemptible section (of course)
34734708 *
34744709 * - from a thread that is bound to a single CPU
34754710 *
....@@ -3490,6 +4725,18 @@
34904725 sum += cpu_rq(i)->nr_switches;
34914726
34924727 return sum;
4728
+}
4729
+
4730
+/*
4731
+ * Consumers of these two interfaces, like for example the cpuidle menu
4732
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4733
+ * for a CPU that has IO-wait which might not even end up running the task when
4734
+ * it does become runnable.
4735
+ */
4736
+
4737
+unsigned long nr_iowait_cpu(int cpu)
4738
+{
4739
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
34934740 }
34944741
34954742 /*
....@@ -3527,29 +4774,9 @@
35274774 unsigned long i, sum = 0;
35284775
35294776 for_each_possible_cpu(i)
3530
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4777
+ sum += nr_iowait_cpu(i);
35314778
35324779 return sum;
3533
-}
3534
-
3535
-/*
3536
- * Consumers of these two interfaces, like for example the cpufreq menu
3537
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3538
- * IO-wait which might not even end up running the task when it does become
3539
- * runnable.
3540
- */
3541
-
3542
-unsigned long nr_iowait_cpu(int cpu)
3543
-{
3544
- struct rq *this = cpu_rq(cpu);
3545
- return atomic_read(&this->nr_iowait);
3546
-}
3547
-
3548
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3549
-{
3550
- struct rq *rq = this_rq();
3551
- *nr_waiters = atomic_read(&rq->nr_iowait);
3552
- *load = rq->load.weight;
35534780 }
35544781
35554782 #ifdef CONFIG_SMP
....@@ -3563,9 +4790,14 @@
35634790 struct task_struct *p = current;
35644791 unsigned long flags;
35654792 int dest_cpu;
4793
+ bool cond = false;
4794
+
4795
+ trace_android_rvh_sched_exec(&cond);
4796
+ if (cond)
4797
+ return;
35664798
35674799 raw_spin_lock_irqsave(&p->pi_lock, flags);
3568
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4800
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
35694801 if (dest_cpu == smp_processor_id())
35704802 goto unlock;
35714803
....@@ -3648,6 +4880,7 @@
36484880
36494881 return ns;
36504882 }
4883
+EXPORT_SYMBOL_GPL(task_sched_runtime);
36514884
36524885 /*
36534886 * This function gets called by the timer code, with HZ frequency.
....@@ -3659,14 +4892,18 @@
36594892 struct rq *rq = cpu_rq(cpu);
36604893 struct task_struct *curr = rq->curr;
36614894 struct rq_flags rf;
4895
+ unsigned long thermal_pressure;
36624896
4897
+ arch_scale_freq_tick();
36634898 sched_clock_tick();
36644899
36654900 rq_lock(rq, &rf);
36664901
4902
+ trace_android_rvh_tick_entry(rq);
36674903 update_rq_clock(rq);
4904
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4905
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
36684906 curr->sched_class->task_tick(rq, curr, 0);
3669
- cpu_load_update_active(rq);
36704907 calc_global_load_tick(rq);
36714908 psi_task_tick(rq);
36724909
....@@ -3678,6 +4915,8 @@
36784915 rq->idle_balance = idle_cpu(cpu);
36794916 trigger_load_balance(rq);
36804917 #endif
4918
+
4919
+ trace_android_vh_scheduler_tick(rq);
36814920 }
36824921
36834922 #ifdef CONFIG_NO_HZ_FULL
....@@ -3735,28 +4974,31 @@
37354974 * statistics and checks timeslices in a time-independent way, regardless
37364975 * of when exactly it is running.
37374976 */
3738
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4977
+ if (!tick_nohz_tick_stopped_cpu(cpu))
37394978 goto out_requeue;
37404979
37414980 rq_lock_irq(rq, &rf);
37424981 curr = rq->curr;
3743
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4982
+ if (cpu_is_offline(cpu))
37444983 goto out_unlock;
37454984
37464985 update_rq_clock(rq);
3747
- delta = rq_clock_task(rq) - curr->se.exec_start;
37484986
3749
- /*
3750
- * Make sure the next tick runs within a reasonable
3751
- * amount of time.
3752
- */
3753
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4987
+ if (!is_idle_task(curr)) {
4988
+ /*
4989
+ * Make sure the next tick runs within a reasonable
4990
+ * amount of time.
4991
+ */
4992
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4993
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4994
+ }
37544995 curr->sched_class->task_tick(rq, curr, 0);
37554996
4997
+ calc_load_nohz_remote(rq);
37564998 out_unlock:
37574999 rq_unlock_irq(rq, &rf);
3758
-
37595000 out_requeue:
5001
+
37605002 /*
37615003 * Run the remote tick once per second (1Hz). This arbitrary
37625004 * frequency is large enough to avoid overload but short enough
....@@ -3820,7 +5062,7 @@
38205062 static inline void sched_tick_stop(int cpu) { }
38215063 #endif
38225064
3823
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5065
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38245066 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38255067 /*
38265068 * If the value passed in is equal to the current preempt count
....@@ -3926,11 +5168,12 @@
39265168 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39275169 && in_atomic_preempt_off()) {
39285170 pr_err("Preemption disabled at:");
3929
- print_ip_sym(preempt_disable_ip);
3930
- pr_cont("\n");
5171
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39315172 }
39325173 if (panic_on_warn)
39335174 panic("scheduling while atomic\n");
5175
+
5176
+ trace_android_rvh_schedule_bug(prev);
39345177
39355178 dump_stack();
39365179 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -3939,11 +5182,23 @@
39395182 /*
39405183 * Various schedule()-time debugging checks and statistics:
39415184 */
3942
-static inline void schedule_debug(struct task_struct *prev)
5185
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
39435186 {
39445187 #ifdef CONFIG_SCHED_STACK_END_CHECK
39455188 if (task_stack_end_corrupted(prev))
39465189 panic("corrupted stack end detected inside scheduler\n");
5190
+
5191
+ if (task_scs_end_corrupted(prev))
5192
+ panic("corrupted shadow stack detected inside scheduler\n");
5193
+#endif
5194
+
5195
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5196
+ if (!preempt && prev->state && prev->non_block_count) {
5197
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5198
+ prev->comm, prev->pid, prev->non_block_count);
5199
+ dump_stack();
5200
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5201
+ }
39475202 #endif
39485203
39495204 if (unlikely(in_atomic_preempt_off())) {
....@@ -3955,6 +5210,28 @@
39555210 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
39565211
39575212 schedstat_inc(this_rq()->sched_count);
5213
+}
5214
+
5215
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5216
+ struct rq_flags *rf)
5217
+{
5218
+#ifdef CONFIG_SMP
5219
+ const struct sched_class *class;
5220
+ /*
5221
+ * We must do the balancing pass before put_prev_task(), such
5222
+ * that when we release the rq->lock the task is in the same
5223
+ * state as before we took rq->lock.
5224
+ *
5225
+ * We can terminate the balance pass as soon as we know there is
5226
+ * a runnable task of @class priority or higher.
5227
+ */
5228
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
5229
+ if (class->balance(rq, prev, rf))
5230
+ break;
5231
+ }
5232
+#endif
5233
+
5234
+ put_prev_task(rq, prev);
39585235 }
39595236
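/*
 * pick_next_task() below asks each scheduling class, from highest to lowest
 * priority, for a runnable task and takes the first one offered.  A
 * stand-alone sketch of that shape, using an invented 'struct klass' array
 * in place of the kernel's linked sched_class hierarchy:
 */
#include <stddef.h>

struct task;

struct klass {
	struct task *(*pick)(void);	/* returns NULL if nothing runnable */
};

struct task *pick_first(const struct klass *classes, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		struct task *p = classes[i].pick();

		if (p)
			return p;
	}
	/* the kernel never gets here: the idle class always has a task */
	return NULL;
}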
39605237 /*
....@@ -3972,29 +5249,29 @@
39725249 * higher scheduling class, because otherwise those lose the
39735250 * opportunity to pull in more work from other CPUs.
39745251 */
3975
- if (likely((prev->sched_class == &idle_sched_class ||
3976
- prev->sched_class == &fair_sched_class) &&
5252
+ if (likely(prev->sched_class <= &fair_sched_class &&
39775253 rq->nr_running == rq->cfs.h_nr_running)) {
39785254
3979
- p = fair_sched_class.pick_next_task(rq, prev, rf);
5255
+ p = pick_next_task_fair(rq, prev, rf);
39805256 if (unlikely(p == RETRY_TASK))
3981
- goto again;
5257
+ goto restart;
39825258
39835259 /* Assumes fair_sched_class->next == idle_sched_class */
3984
- if (unlikely(!p))
3985
- p = idle_sched_class.pick_next_task(rq, prev, rf);
5260
+ if (!p) {
5261
+ put_prev_task(rq, prev);
5262
+ p = pick_next_task_idle(rq);
5263
+ }
39865264
39875265 return p;
39885266 }
39895267
3990
-again:
5268
+restart:
5269
+ put_prev_task_balance(rq, prev, rf);
5270
+
39915271 for_each_class(class) {
3992
- p = class->pick_next_task(rq, prev, rf);
3993
- if (p) {
3994
- if (unlikely(p == RETRY_TASK))
3995
- goto again;
5272
+ p = class->pick_next_task(rq);
5273
+ if (p)
39965274 return p;
3997
- }
39985275 }
39995276
40005277 /* The idle class should always have a runnable task: */
....@@ -4021,7 +5298,7 @@
40215298 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40225299 * called on the nearest possible occasion:
40235300 *
4024
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
5301
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40255302 *
40265303 * - in syscall or exception context, at the next outmost
40275304 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4030,7 +5307,7 @@
40305307 * - in IRQ context, return from interrupt-handler to
40315308 * preemptible context
40325309 *
4033
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
5310
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
40345311 * then at the next:
40355312 *
40365313 * - cond_resched() call
....@@ -4040,10 +5317,11 @@
40405317 *
40415318 * WARNING: must be called with preemption disabled!
40425319 */
4043
-static void __sched notrace __schedule(bool preempt)
5320
+static void __sched notrace __schedule(bool preempt, bool spinning_lock)
40445321 {
40455322 struct task_struct *prev, *next;
40465323 unsigned long *switch_count;
5324
+ unsigned long prev_state;
40475325 struct rq_flags rf;
40485326 struct rq *rq;
40495327 int cpu;
....@@ -4052,7 +5330,7 @@
40525330 rq = cpu_rq(cpu);
40535331 prev = rq->curr;
40545332
4055
- schedule_debug(prev);
5333
+ schedule_debug(prev, preempt);
40565334
40575335 if (sched_feat(HRTICK))
40585336 hrtick_clear(rq);
....@@ -4063,9 +5341,16 @@
40635341 /*
40645342 * Make sure that signal_pending_state()->signal_pending() below
40655343 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4066
- * done by the caller to avoid the race with signal_wake_up().
5344
+ * done by the caller to avoid the race with signal_wake_up():
40675345 *
4068
- * The membarrier system call requires a full memory barrier
5346
+ * __set_current_state(@state) signal_wake_up()
5347
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
5348
+ * wake_up_state(p, state)
5349
+ * LOCK rq->lock LOCK p->pi_lock
5350
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
5351
+ * if (signal_pending_state()) if (p->state & @state)
5352
+ *
5353
+ * Also, the membarrier system call requires a full memory barrier
40695354 * after coming from user-space, before storing to rq->curr.
40705355 */
40715356 rq_lock(rq, &rf);
....@@ -4076,29 +5361,43 @@
40765361 update_rq_clock(rq);
40775362
40785363 switch_count = &prev->nivcsw;
4079
- if (!preempt && prev->state) {
4080
- if (unlikely(signal_pending_state(prev->state, prev))) {
5364
+
5365
+ /*
5366
+ * We must load prev->state once (task_struct::state is volatile), such
5367
+ * that:
5368
+ *
5369
+ * - we form a control dependency vs deactivate_task() below.
5370
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
5371
+ */
5372
+ prev_state = prev->state;
5373
+ if ((!preempt || spinning_lock) && prev_state) {
5374
+ if (signal_pending_state(prev_state, prev)) {
40815375 prev->state = TASK_RUNNING;
40825376 } else {
5377
+ prev->sched_contributes_to_load =
5378
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
5379
+ !(prev_state & TASK_NOLOAD) &&
5380
+ !(prev->flags & PF_FROZEN);
5381
+
5382
+ if (prev->sched_contributes_to_load)
5383
+ rq->nr_uninterruptible++;
5384
+
5385
+ /*
5386
+ * __schedule() ttwu()
5387
+ * prev_state = prev->state; if (p->on_rq && ...)
5388
+ * if (prev_state) goto out;
5389
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
5390
+ * p->state = TASK_WAKING
5391
+ *
5392
+ * Where __schedule() and ttwu() have matching control dependencies.
5393
+ *
5394
+ * After this, schedule() must not care about p->state any more.
5395
+ */
40835396 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4084
- prev->on_rq = 0;
40855397
40865398 if (prev->in_iowait) {
40875399 atomic_inc(&rq->nr_iowait);
40885400 delayacct_blkio_start();
4089
- }
4090
-
4091
- /*
4092
- * If a worker went to sleep, notify and ask workqueue
4093
- * whether it wants to wake up a task to maintain
4094
- * concurrency.
4095
- */
4096
- if (prev->flags & PF_WQ_WORKER) {
4097
- struct task_struct *to_wakeup;
4098
-
4099
- to_wakeup = wq_worker_sleeping(prev);
4100
- if (to_wakeup)
4101
- try_to_wake_up_local(to_wakeup, &rf);
41025401 }
41035402 }
41045403 switch_count = &prev->nvcsw;
....@@ -4106,11 +5405,17 @@
41065405
41075406 next = pick_next_task(rq, prev, &rf);
41085407 clear_tsk_need_resched(prev);
5408
+ clear_tsk_need_resched_lazy(prev);
41095409 clear_preempt_need_resched();
41105410
5411
+ trace_android_rvh_schedule(prev, next, rq);
41115412 if (likely(prev != next)) {
41125413 rq->nr_switches++;
4113
- rq->curr = next;
5414
+ /*
5415
+ * RCU users of rcu_dereference(rq->curr) may not see
5416
+ * changes to task_struct made by pick_next_task().
5417
+ */
5418
+ RCU_INIT_POINTER(rq->curr, next);
41145419 /*
41155420 * The membarrier system call requires each architecture
41165421 * to have a full memory barrier after updating
....@@ -4127,16 +5432,20 @@
41275432 */
41285433 ++*switch_count;
41295434
5435
+ migrate_disable_switch(rq, prev);
5436
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5437
+
41305438 trace_sched_switch(preempt, prev, next);
41315439
41325440 /* Also unlocks the rq: */
41335441 rq = context_switch(rq, prev, next, &rf);
41345442 } else {
41355443 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4136
- rq_unlock_irq(rq, &rf);
4137
- }
41385444
4139
- balance_callback(rq);
5445
+ rq_unpin_lock(rq, &rf);
5446
+ __balance_callbacks(rq);
5447
+ raw_spin_unlock_irq(&rq->lock);
5448
+ }
41405449 }
41415450
41425451 void __noreturn do_task_dead(void)
....@@ -4147,7 +5456,7 @@
41475456 /* Tell freezer to ignore us: */
41485457 current->flags |= PF_NOFREEZE;
41495458
4150
- __schedule(false);
5459
+ __schedule(false, false);
41515460 BUG();
41525461
41535462 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
....@@ -4157,14 +5466,45 @@
41575466
41585467 static inline void sched_submit_work(struct task_struct *tsk)
41595468 {
4160
- if (!tsk->state || tsk_is_pi_blocked(tsk))
5469
+ unsigned int task_flags;
5470
+
5471
+ if (!tsk->state)
41615472 return;
5473
+
5474
+ task_flags = tsk->flags;
5475
+ /*
5476
+ * If a worker went to sleep, notify and ask workqueue whether
5477
+ * it wants to wake up a task to maintain concurrency.
5478
+ * As this function is called inside the schedule() context,
5479
+ * we disable preemption to avoid it calling schedule() again
5480
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
5481
+ * requires it.
5482
+ */
5483
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5484
+ preempt_disable();
5485
+ if (task_flags & PF_WQ_WORKER)
5486
+ wq_worker_sleeping(tsk);
5487
+ else
5488
+ io_wq_worker_sleeping(tsk);
5489
+ preempt_enable_no_resched();
5490
+ }
5491
+
41625492 /*
41635493 * If we are going to sleep and we have plugged IO queued,
41645494 * make sure to submit it to avoid deadlocks.
41655495 */
41665496 if (blk_needs_flush_plug(tsk))
41675497 blk_schedule_flush_plug(tsk);
5498
+}
5499
+
5500
+static void sched_update_worker(struct task_struct *tsk)
5501
+{
5502
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5503
+ if (tsk->flags & PF_WQ_WORKER)
5504
+ wq_worker_running(tsk);
5505
+ else
5506
+ io_wq_worker_running(tsk);
5507
+ }
41685508 }
41695509
41705510 asmlinkage __visible void __sched schedule(void)
....@@ -4174,9 +5514,10 @@
41745514 sched_submit_work(tsk);
41755515 do {
41765516 preempt_disable();
4177
- __schedule(false);
5517
+ __schedule(false, false);
41785518 sched_preempt_enable_no_resched();
41795519 } while (need_resched());
5520
+ sched_update_worker(tsk);
41805521 }
41815522 EXPORT_SYMBOL(schedule);
41825523
....@@ -4201,7 +5542,7 @@
42015542 */
42025543 WARN_ON_ONCE(current->state);
42035544 do {
4204
- __schedule(false);
5545
+ __schedule(false, false);
42055546 } while (need_resched());
42065547 }
42075548
....@@ -4254,7 +5595,7 @@
42545595 */
42555596 preempt_disable_notrace();
42565597 preempt_latency_start(1);
4257
- __schedule(true);
5598
+ __schedule(true, false);
42585599 preempt_latency_stop(1);
42595600 preempt_enable_no_resched_notrace();
42605601
....@@ -4265,11 +5606,34 @@
42655606 } while (need_resched());
42665607 }
42675608
4268
-#ifdef CONFIG_PREEMPT
5609
+#ifdef CONFIG_PREEMPT_LAZY
42695610 /*
4270
- * this is the entry point to schedule() from in-kernel preemption
4271
- * off of preempt_enable. Kernel preemptions off return from interrupt
4272
- * occur there and call schedule directly.
5611
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
5612
+ * set by an RT task. Otherwise we try to avoid being scheduled out as long as
5613
+ * the preempt_lazy_count counter is > 0.
5614
+ */
5615
+static __always_inline int preemptible_lazy(void)
5616
+{
5617
+ if (test_thread_flag(TIF_NEED_RESCHED))
5618
+ return 1;
5619
+ if (current_thread_info()->preempt_lazy_count)
5620
+ return 0;
5621
+ return 1;
5622
+}
5623
+
5624
+#else
5625
+
5626
+static inline int preemptible_lazy(void)
5627
+{
5628
+ return 1;
5629
+}
5630
+
5631
+#endif
5632
+
5633
+#ifdef CONFIG_PREEMPTION
5634
+/*
5635
+ * This is the entry point to schedule() from in-kernel preemption
5636
+ * off of preempt_enable.
42735637 */
42745638 asmlinkage __visible void __sched notrace preempt_schedule(void)
42755639 {
....@@ -4279,11 +5643,25 @@
42795643 */
42805644 if (likely(!preemptible()))
42815645 return;
4282
-
5646
+ if (!preemptible_lazy())
5647
+ return;
42835648 preempt_schedule_common();
42845649 }
42855650 NOKPROBE_SYMBOL(preempt_schedule);
42865651 EXPORT_SYMBOL(preempt_schedule);
5652
+
5653
+#ifdef CONFIG_PREEMPT_RT
5654
+void __sched notrace preempt_schedule_lock(void)
5655
+{
5656
+ do {
5657
+ preempt_disable();
5658
+ __schedule(true, true);
5659
+ sched_preempt_enable_no_resched();
5660
+ } while (need_resched());
5661
+}
5662
+NOKPROBE_SYMBOL(preempt_schedule_lock);
5663
+EXPORT_SYMBOL(preempt_schedule_lock);
5664
+#endif
42875665
42885666 /**
42895667 * preempt_schedule_notrace - preempt_schedule called by tracing
....@@ -4304,6 +5682,9 @@
43045682 enum ctx_state prev_ctx;
43055683
43065684 if (likely(!preemptible()))
5685
+ return;
5686
+
5687
+ if (!preemptible_lazy())
43075688 return;
43085689
43095690 do {
....@@ -4328,7 +5709,7 @@
43285709 * an infinite recursion.
43295710 */
43305711 prev_ctx = exception_enter();
4331
- __schedule(true);
5712
+ __schedule(true, false);
43325713 exception_exit(prev_ctx);
43335714
43345715 preempt_latency_stop(1);
....@@ -4337,10 +5718,10 @@
43375718 }
43385719 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
43395720
4340
-#endif /* CONFIG_PREEMPT */
5721
+#endif /* CONFIG_PREEMPTION */
43415722
43425723 /*
4343
- * this is the entry point to schedule() from kernel preemption
5724
+ * This is the entry point to schedule() from kernel preemption
43445725 * off of irq context.
43455726 * Note, that this is called and return with irqs disabled. This will
43465727 * protect us against recursive calling from irq.
....@@ -4357,7 +5738,7 @@
43575738 do {
43585739 preempt_disable();
43595740 local_irq_enable();
4360
- __schedule(true);
5741
+ __schedule(true, false);
43615742 local_irq_disable();
43625743 sched_preempt_enable_no_resched();
43635744 } while (need_resched());
....@@ -4368,9 +5749,22 @@
43685749 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
43695750 void *key)
43705751 {
4371
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5752
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5753
+ return try_to_wake_up(curr->private, mode, wake_flags);
43725754 }
43735755 EXPORT_SYMBOL(default_wake_function);
5756
+
5757
+static void __setscheduler_prio(struct task_struct *p, int prio)
5758
+{
5759
+ if (dl_prio(prio))
5760
+ p->sched_class = &dl_sched_class;
5761
+ else if (rt_prio(prio))
5762
+ p->sched_class = &rt_sched_class;
5763
+ else
5764
+ p->sched_class = &fair_sched_class;
5765
+
5766
+ p->prio = prio;
5767
+}
43745768
43755769 #ifdef CONFIG_RT_MUTEXES
43765770
....@@ -4408,6 +5802,7 @@
44085802 struct rq_flags rf;
44095803 struct rq *rq;
44105804
5805
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
44115806 /* XXX used to be waiter->prio, not waiter->task->prio */
44125807 prio = __rt_effective_prio(pi_task, p->normal_prio);
44135808
....@@ -4482,39 +5877,39 @@
44825877 if (!dl_prio(p->normal_prio) ||
44835878 (pi_task && dl_prio(pi_task->prio) &&
44845879 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4485
- p->dl.dl_boosted = 1;
5880
+ p->dl.pi_se = pi_task->dl.pi_se;
44865881 queue_flag |= ENQUEUE_REPLENISH;
4487
- } else
4488
- p->dl.dl_boosted = 0;
4489
- p->sched_class = &dl_sched_class;
5882
+ } else {
5883
+ p->dl.pi_se = &p->dl;
5884
+ }
44905885 } else if (rt_prio(prio)) {
44915886 if (dl_prio(oldprio))
4492
- p->dl.dl_boosted = 0;
5887
+ p->dl.pi_se = &p->dl;
44935888 if (oldprio < prio)
44945889 queue_flag |= ENQUEUE_HEAD;
4495
- p->sched_class = &rt_sched_class;
44965890 } else {
44975891 if (dl_prio(oldprio))
4498
- p->dl.dl_boosted = 0;
5892
+ p->dl.pi_se = &p->dl;
44995893 if (rt_prio(oldprio))
45005894 p->rt.timeout = 0;
4501
- p->sched_class = &fair_sched_class;
45025895 }
45035896
4504
- p->prio = prio;
5897
+ __setscheduler_prio(p, prio);
45055898
45065899 if (queued)
45075900 enqueue_task(rq, p, queue_flag);
45085901 if (running)
4509
- set_curr_task(rq, p);
5902
+ set_next_task(rq, p);
45105903
45115904 check_class_changed(rq, p, prev_class, oldprio);
45125905 out_unlock:
45135906 /* Avoid rq from going away on us: */
45145907 preempt_disable();
4515
- __task_rq_unlock(rq, &rf);
45165908
4517
- balance_callback(rq);
5909
+ rq_unpin_lock(rq, &rf);
5910
+ __balance_callbacks(rq);
5911
+ raw_spin_unlock(&rq->lock);
5912
+
45185913 preempt_enable();
45195914 }
45205915 #else
....@@ -4526,12 +5921,13 @@
45265921
45275922 void set_user_nice(struct task_struct *p, long nice)
45285923 {
4529
- bool queued, running;
4530
- int old_prio, delta;
5924
+ bool queued, running, allowed = false;
5925
+ int old_prio;
45315926 struct rq_flags rf;
45325927 struct rq *rq;
45335928
4534
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5929
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5930
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
45355931 return;
45365932 /*
45375933 * We have to be careful, if called from sys_setpriority(),
....@@ -4558,22 +5954,21 @@
45585954 put_prev_task(rq, p);
45595955
45605956 p->static_prio = NICE_TO_PRIO(nice);
4561
- set_load_weight(p, true);
5957
+ set_load_weight(p);
45625958 old_prio = p->prio;
45635959 p->prio = effective_prio(p);
4564
- delta = p->prio - old_prio;
45655960
4566
- if (queued) {
5961
+ if (queued)
45675962 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4568
- /*
4569
- * If the task increased its priority or is running and
4570
- * lowered its priority, then reschedule its CPU:
4571
- */
4572
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4573
- resched_curr(rq);
4574
- }
45755963 if (running)
4576
- set_curr_task(rq, p);
5964
+ set_next_task(rq, p);
5965
+
5966
+ /*
5967
+ * If the task increased its priority or is running and
5968
+ * lowered its priority, then reschedule its CPU:
5969
+ */
5970
+ p->sched_class->prio_changed(rq, p, old_prio);
5971
+
45775972 out_unlock:
45785973 task_rq_unlock(rq, p, &rf);
45795974 }
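/*
 * set_user_nice() above is what eventually runs when a task changes its own
 * nice value from user space.  A minimal example using the standard
 * setpriority()/getpriority() interface; raising the nice value (lowering
 * priority) needs no privilege, lowering it does:
 */
#include <sys/resource.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	if (setpriority(PRIO_PROCESS, 0, 5) == -1) {
		fprintf(stderr, "setpriority: %s\n", strerror(errno));
		return 1;
	}
	errno = 0;	/* getpriority() may legitimately return -1 */
	printf("nice is now %d\n", getpriority(PRIO_PROCESS, 0));
	return 0;
}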
....@@ -4658,7 +6053,7 @@
46586053 return 0;
46596054
46606055 #ifdef CONFIG_SMP
4661
- if (!llist_empty(&rq->wake_list))
6056
+ if (rq->ttwu_pending)
46626057 return 0;
46636058 #endif
46646059
....@@ -4681,6 +6076,7 @@
46816076
46826077 return 1;
46836078 }
6079
+EXPORT_SYMBOL_GPL(available_idle_cpu);
46846080
46856081 /**
46866082 * idle_task - return the idle task for a given CPU.
....@@ -4732,36 +6128,7 @@
47326128 */
47336129 p->rt_priority = attr->sched_priority;
47346130 p->normal_prio = normal_prio(p);
4735
- set_load_weight(p, true);
4736
-}
4737
-
4738
-/* Actually do priority change: must hold pi & rq lock. */
4739
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4740
- const struct sched_attr *attr, bool keep_boost)
4741
-{
4742
- /*
4743
- * If params can't change scheduling class changes aren't allowed
4744
- * either.
4745
- */
4746
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4747
- return;
4748
-
4749
- __setscheduler_params(p, attr);
4750
-
4751
- /*
4752
- * Keep a potential priority boosting if called from
4753
- * sched_setscheduler().
4754
- */
4755
- p->prio = normal_prio(p);
4756
- if (keep_boost)
4757
- p->prio = rt_effective_prio(p, p->prio);
4758
-
4759
- if (dl_prio(p->prio))
4760
- p->sched_class = &dl_sched_class;
4761
- else if (rt_prio(p->prio))
4762
- p->sched_class = &rt_sched_class;
4763
- else
4764
- p->sched_class = &fair_sched_class;
6131
+ set_load_weight(p);
47656132 }
47666133
47676134 /*
....@@ -4784,11 +6151,10 @@
47846151 const struct sched_attr *attr,
47856152 bool user, bool pi)
47866153 {
4787
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4788
- MAX_RT_PRIO - 1 - attr->sched_priority;
4789
- int retval, oldprio, oldpolicy = -1, queued, running;
4790
- int new_effective_prio, policy = attr->sched_policy;
6154
+ int oldpolicy = -1, policy = attr->sched_policy;
6155
+ int retval, oldprio, newprio, queued, running;
47916156 const struct sched_class *prev_class;
6157
+ struct callback_head *head;
47926158 struct rq_flags rf;
47936159 int reset_on_fork;
47946160 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
....@@ -4860,7 +6226,7 @@
48606226 * Treat SCHED_IDLE as nice 20. Only allow a switch to
48616227 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
48626228 */
4863
- if (idle_policy(p->policy) && !idle_policy(policy)) {
6229
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
48646230 if (!can_nice(p, task_nice(p)))
48656231 return -EPERM;
48666232 }
....@@ -4871,6 +6237,10 @@
48716237
48726238 /* Normal users shall not reset the sched_reset_on_fork flag: */
48736239 if (p->sched_reset_on_fork && !reset_on_fork)
6240
+ return -EPERM;
6241
+
6242
+ /* Can't change util-clamps */
6243
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
48746244 return -EPERM;
48756245 }
48766246
....@@ -4904,8 +6274,8 @@
49056275 * Changing the policy of the stop threads is a very bad idea:
49056275 */
49066276 if (p == rq->stop) {
4907
- task_rq_unlock(rq, p, &rf);
4908
- return -EINVAL;
6277
+ retval = -EINVAL;
6278
+ goto unlock;
49096279 }
49106280
49116281 /*
....@@ -4923,8 +6293,8 @@
49236293 goto change;
49246294
49256295 p->sched_reset_on_fork = reset_on_fork;
4926
- task_rq_unlock(rq, p, &rf);
4927
- return 0;
6296
+ retval = 0;
6297
+ goto unlock;
49286298 }
49296299 change:
49306300
....@@ -4937,8 +6307,8 @@
49376307 if (rt_bandwidth_enabled() && rt_policy(policy) &&
49386308 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
49396309 !task_group_is_autogroup(task_group(p))) {
4940
- task_rq_unlock(rq, p, &rf);
4941
- return -EPERM;
6310
+ retval = -EPERM;
6311
+ goto unlock;
49426312 }
49436313 #endif
49446314 #ifdef CONFIG_SMP
....@@ -4951,10 +6321,10 @@
49516321 * the entire root_domain to become SCHED_DEADLINE. We
49526322 * will also fail if there's no bandwidth available.
49536323 */
4954
- if (!cpumask_subset(span, &p->cpus_allowed) ||
6324
+ if (!cpumask_subset(span, p->cpus_ptr) ||
49556325 rq->rd->dl_bw.bw == 0) {
4956
- task_rq_unlock(rq, p, &rf);
4957
- return -EPERM;
6326
+ retval = -EPERM;
6327
+ goto unlock;
49586328 }
49596329 }
49606330 #endif
....@@ -4973,13 +6343,14 @@
49736343 * is available.
49746344 */
49756345 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4976
- task_rq_unlock(rq, p, &rf);
4977
- return -EBUSY;
6346
+ retval = -EBUSY;
6347
+ goto unlock;
49786348 }
49796349
49806350 p->sched_reset_on_fork = reset_on_fork;
49816351 oldprio = p->prio;
49826352
6353
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
49836354 if (pi) {
49846355 /*
49856356 * Take priority boosted tasks into account. If the new
....@@ -4988,8 +6359,8 @@
49886359 * the runqueue. This will be done when the task deboost
49896360 * itself.
49906361 */
4991
- new_effective_prio = rt_effective_prio(p, newprio);
4992
- if (new_effective_prio == oldprio)
6362
+ newprio = rt_effective_prio(p, newprio);
6363
+ if (newprio == oldprio)
49936364 queue_flags &= ~DEQUEUE_MOVE;
49946365 }
49956366
....@@ -5002,7 +6373,11 @@
50026373
50036374 prev_class = p->sched_class;
50046375
5005
- __setscheduler(rq, p, attr, pi);
6376
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
6377
+ __setscheduler_params(p, attr);
6378
+ __setscheduler_prio(p, newprio);
6379
+ trace_android_rvh_setscheduler(p);
6380
+ }
50066381 __setscheduler_uclamp(p, attr);
50076382
50086383 if (queued) {
....@@ -5016,22 +6391,27 @@
50166391 enqueue_task(rq, p, queue_flags);
50176392 }
50186393 if (running)
5019
- set_curr_task(rq, p);
6394
+ set_next_task(rq, p);
50206395
50216396 check_class_changed(rq, p, prev_class, oldprio);
50226397
50236398 /* Avoid rq from going away on us: */
50246399 preempt_disable();
6400
+ head = splice_balance_callbacks(rq);
50256401 task_rq_unlock(rq, p, &rf);
50266402
50276403 if (pi)
50286404 rt_mutex_adjust_pi(p);
50296405
50306406 /* Run balance callbacks after we've adjusted the PI chain: */
5031
- balance_callback(rq);
6407
+ balance_callbacks(rq, head);
50326408 preempt_enable();
50336409
50346410 return 0;
6411
+
6412
+unlock:
6413
+ task_rq_unlock(rq, p, &rf);
6414
+ return retval;
50356415 }
50366416
50376417 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5043,6 +6423,14 @@
50436423 .sched_nice = PRIO_TO_NICE(p->static_prio),
50446424 };
50456425
6426
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
6427
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
6428
+ attr.sched_priority /= 2;
6429
+ if (!check)
6430
+ attr.sched_priority += MAX_RT_PRIO / 2;
6431
+ if (!attr.sched_priority)
6432
+ attr.sched_priority = 1;
6433
+ }
50466434 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
50476435 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
50486436 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
....@@ -5057,6 +6445,8 @@
50576445 * @p: the task in question.
50586446 * @policy: new policy.
50596447 * @param: structure containing the new RT priority.
6448
+ *
6449
+ * Use sched_set_fifo(), read its comment.
50606450 *
50616451 * Return: 0 on success. An error code otherwise.
50626452 *
....@@ -5079,6 +6469,7 @@
50796469 {
50806470 return __sched_setscheduler(p, attr, false, true);
50816471 }
6472
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
50826473
50836474 /**
50846475 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5099,6 +6490,51 @@
50996490 return _sched_setscheduler(p, policy, param, false);
51006491 }
51016492 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
6493
+
6494
+/*
6495
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6496
+ * incapable of resource management, which is the one thing an OS really should
6497
+ * be doing.
6498
+ *
6499
+ * This is of course the reason it is limited to privileged users only.
6500
+ *
6501
+ * Worse still; it is fundamentally impossible to compose static priority
6502
+ * workloads. You cannot take two correctly working static prio workloads
6503
+ * and smash them together and still expect them to work.
6504
+ *
6505
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
6506
+ *
6507
+ * MAX_RT_PRIO / 2
6508
+ *
6509
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
6510
+ * know enough information to make a sensible choice.
6511
+ */
6512
+void sched_set_fifo(struct task_struct *p)
6513
+{
6514
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6515
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6516
+}
6517
+EXPORT_SYMBOL_GPL(sched_set_fifo);
6518
+
6519
+/*
6520
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6521
+ */
6522
+void sched_set_fifo_low(struct task_struct *p)
6523
+{
6524
+ struct sched_param sp = { .sched_priority = 1 };
6525
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6526
+}
6527
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6528
+
6529
+void sched_set_normal(struct task_struct *p, int nice)
6530
+{
6531
+ struct sched_attr attr = {
6532
+ .sched_policy = SCHED_NORMAL,
6533
+ .sched_nice = nice,
6534
+ };
6535
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6536
+}
6537
+EXPORT_SYMBOL_GPL(sched_set_normal);
51026538
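/*
 * The user-space counterpart of the helpers above: sched_setscheduler(2)
 * with SCHED_FIFO.  Needs CAP_SYS_NICE (or an RLIMIT_RTPRIO allowance); the
 * priority 50 below is only an example value, roughly the "middle of the RT
 * range" convention the comment above describes for in-kernel users.
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}
	printf("running SCHED_FIFO at priority %d\n", sp.sched_priority);
	return 0;
}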
51036539 static int
51046540 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
....@@ -5130,9 +6566,6 @@
51306566 u32 size;
51316567 int ret;
51326568
5133
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5134
- return -EFAULT;
5135
-
51366569 /* Zero the full structure, so that a short copy will be nice: */
51376570 memset(attr, 0, sizeof(*attr));
51386571
....@@ -5140,44 +6573,18 @@
51406573 if (ret)
51416574 return ret;
51426575
5143
- /* Bail out on silly large: */
5144
- if (size > PAGE_SIZE)
5145
- goto err_size;
5146
-
51476576 /* ABI compatibility quirk: */
51486577 if (!size)
51496578 size = SCHED_ATTR_SIZE_VER0;
5150
-
5151
- if (size < SCHED_ATTR_SIZE_VER0)
6579
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
51526580 goto err_size;
51536581
5154
- /*
5155
- * If we're handed a bigger struct than we know of,
5156
- * ensure all the unknown bits are 0 - i.e. new
5157
- * user-space does not rely on any kernel feature
5158
- * extensions we dont know about yet.
5159
- */
5160
- if (size > sizeof(*attr)) {
5161
- unsigned char __user *addr;
5162
- unsigned char __user *end;
5163
- unsigned char val;
5164
-
5165
- addr = (void __user *)uattr + sizeof(*attr);
5166
- end = (void __user *)uattr + size;
5167
-
5168
- for (; addr < end; addr++) {
5169
- ret = get_user(val, addr);
5170
- if (ret)
5171
- return ret;
5172
- if (val)
5173
- goto err_size;
5174
- }
5175
- size = sizeof(*attr);
6582
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6583
+ if (ret) {
6584
+ if (ret == -E2BIG)
6585
+ goto err_size;
6586
+ return ret;
51766587 }
5177
-
5178
- ret = copy_from_user(attr, uattr, size);
5179
- if (ret)
5180
- return -EFAULT;
51816588
51826589 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
51836590 size < SCHED_ATTR_SIZE_VER1)
....@@ -5194,6 +6601,16 @@
51946601 err_size:
51956602 put_user(sizeof(*attr), &uattr->size);
51966603 return -E2BIG;
6604
+}
6605
+
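/*
 * What sched_copy_attr() above parses: user space passes an extensible
 * struct whose first field is its own size, so old binaries and newer
 * kernels (and vice versa) can interoperate.  A sketch using the raw
 * syscall, since glibc has no wrapper; the struct layout below matches the
 * UAPI sched_attr up to SCHED_ATTR_SIZE_VER1, and SYS_sched_setattr is
 * assumed to be defined by the libc headers.  SCHED_NORMAL plus a positive
 * nice value needs no privilege.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct sched_attr_example {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* SCHED_ATTR_SIZE_VER1 fields */
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr_example attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_nice = 5;

	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
		fprintf(stderr, "sched_setattr: %s\n", strerror(errno));
		return 1;
	}
	printf("policy set, nice %d\n", (int)attr.sched_nice);
	return 0;
}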
6606
+static void get_params(struct task_struct *p, struct sched_attr *attr)
6607
+{
6608
+ if (task_has_dl_policy(p))
6609
+ __getparam_dl(p, attr);
6610
+ else if (task_has_rt_policy(p))
6611
+ attr->sched_priority = p->rt_priority;
6612
+ else
6613
+ attr->sched_nice = task_nice(p);
51976614 }
51986615
51996616 /**
....@@ -5257,6 +6674,8 @@
52576674 rcu_read_unlock();
52586675
52596676 if (likely(p)) {
6677
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6678
+ get_params(p, &attr);
52606679 retval = sched_setattr(p, &attr);
52616680 put_task_struct(p);
52626681 }
....@@ -5350,7 +6769,7 @@
53506769 {
53516770 unsigned int ksize = sizeof(*kattr);
53526771
5353
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6772
+ if (!access_ok(uattr, usize))
53546773 return -EFAULT;
53556774
53566775 /*
....@@ -5378,7 +6797,7 @@
53786797 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
53796798 * @pid: the pid in question.
53806799 * @uattr: structure containing the extended parameters.
5381
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6800
+ * @usize: sizeof(attr) for fwd/bwd comp.
53826801 * @flags: for future extension.
53836802 */
53846803 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5405,14 +6824,15 @@
54056824 kattr.sched_policy = p->policy;
54066825 if (p->sched_reset_on_fork)
54076826 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5408
- if (task_has_dl_policy(p))
5409
- __getparam_dl(p, &kattr);
5410
- else if (task_has_rt_policy(p))
5411
- kattr.sched_priority = p->rt_priority;
5412
- else
5413
- kattr.sched_nice = task_nice(p);
6827
+ get_params(p, &kattr);
6828
+ kattr.sched_flags &= SCHED_FLAG_ALL;
54146829
54156830 #ifdef CONFIG_UCLAMP_TASK
6831
+ /*
6832
+ * This could race with another potential updater, but this is fine
6833
+ * because it'll correctly read the old or the new value. We don't need
6834
+ * to guarantee who wins the race as long as it doesn't return garbage.
6835
+ */
54166836 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
54176837 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
54186838 #endif
....@@ -5431,6 +6851,7 @@
54316851 cpumask_var_t cpus_allowed, new_mask;
54326852 struct task_struct *p;
54336853 int retval;
6854
+ int skip = 0;
54346855
54356856 rcu_read_lock();
54366857
....@@ -5466,6 +6887,9 @@
54666887 rcu_read_unlock();
54676888 }
54686889
6890
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6891
+ if (skip)
6892
+ goto out_free_new_mask;
54696893 retval = security_task_setscheduler(p);
54706894 if (retval)
54716895 goto out_free_new_mask;
....@@ -5492,7 +6916,7 @@
54926916 }
54936917 #endif
54946918 again:
5495
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
6919
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
54966920
54976921 if (!retval) {
54986922 cpuset_cpus_allowed(p, cpus_allowed);
....@@ -5506,6 +6930,9 @@
55066930 goto again;
55076931 }
55086932 }
6933
+
6934
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6935
+
55096936 out_free_new_mask:
55106937 free_cpumask_var(new_mask);
55116938 out_free_cpus_allowed:
....@@ -5514,7 +6941,6 @@
55146941 put_task_struct(p);
55156942 return retval;
55166943 }
5517
-EXPORT_SYMBOL_GPL(sched_setaffinity);
55186944
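/*
 * The user-space view of the kernel-side sched_setaffinity() above: pin the
 * calling thread to CPU 0 and read the mask back.  Uses the glibc wrappers
 * from <sched.h>, which need _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
		return 1;
	}

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("now allowed on %d CPU(s)\n", CPU_COUNT(&set));
	return 0;
}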
55196945 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
55206946 struct cpumask *new_mask)
....@@ -5569,7 +6995,7 @@
55696995 goto out_unlock;
55706996
55716997 raw_spin_lock_irqsave(&p->pi_lock, flags);
5572
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
6998
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
55736999 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
55747000
55757001 out_unlock:
....@@ -5633,6 +7059,8 @@
56337059 schedstat_inc(rq->yld_count);
56347060 current->sched_class->yield_task(rq);
56357061
7062
+ trace_android_rvh_do_sched_yield(rq);
7063
+
56367064 preempt_disable();
56377065 rq_unlock_irq(rq, &rf);
56387066 sched_preempt_enable_no_resched();
....@@ -5646,7 +7074,7 @@
56467074 return 0;
56477075 }
56487076
5649
-#ifndef CONFIG_PREEMPT
7077
+#ifndef CONFIG_PREEMPTION
56507078 int __sched _cond_resched(void)
56517079 {
56527080 if (should_resched(0)) {
....@@ -5663,7 +7091,7 @@
56637091 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
56647092 * call schedule, and on return reacquire the lock.
56657093 *
5666
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
7094
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
56677095 * operations here to prevent schedule() from being called twice (once via
56687096 * spin_unlock(), once by hand).
56697097 */
....@@ -5767,7 +7195,7 @@
57677195 if (task_running(p_rq, p) || p->state)
57687196 goto out_unlock;
57697197
5770
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
7198
+ yielded = curr->sched_class->yield_to_task(rq, p);
57717199 if (yielded) {
57727200 schedstat_inc(rq->yld_count);
57737201 /*
....@@ -5933,7 +7361,7 @@
59337361 * an error code.
59347362 */
59357363 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5936
- struct timespec __user *, interval)
7364
+ struct __kernel_timespec __user *, interval)
59377365 {
59387366 struct timespec64 t;
59397367 int retval = sched_rr_get_interval(pid, &t);
....@@ -5944,16 +7372,15 @@
59447372 return retval;
59457373 }
59467374
5947
-#ifdef CONFIG_COMPAT
5948
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
5949
- compat_pid_t, pid,
5950
- struct compat_timespec __user *, interval)
7375
+#ifdef CONFIG_COMPAT_32BIT_TIME
7376
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7377
+ struct old_timespec32 __user *, interval)
59517378 {
59527379 struct timespec64 t;
59537380 int retval = sched_rr_get_interval(pid, &t);
59547381
59557382 if (retval == 0)
5956
- retval = compat_put_timespec64(&t, interval);
7383
+ retval = put_old_timespec32(&t, interval);
59577384 return retval;
59587385 }
59597386 #endif
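/*
 * User-space consumer of the syscall above: query the round-robin timeslice
 * of the calling thread via the POSIX sched_rr_get_interval() wrapper from
 * <sched.h>.  The reported interval is 0 for SCHED_FIFO tasks, so treat the
 * value as informational.
 */
#include <sched.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		fprintf(stderr, "sched_rr_get_interval: %s\n", strerror(errno));
		return 1;
	}
	printf("round-robin interval: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}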
....@@ -5966,10 +7393,10 @@
59667393 if (!try_get_task_stack(p))
59677394 return;
59687395
5969
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
7396
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
59707397
59717398 if (p->state == TASK_RUNNING)
5972
- printk(KERN_CONT " running task ");
7399
+ pr_cont(" running task ");
59737400 #ifdef CONFIG_DEBUG_STACK_USAGE
59747401 free = stack_not_used(p);
59757402 #endif
....@@ -5978,12 +7405,13 @@
59787405 if (pid_alive(p))
59797406 ppid = task_pid_nr(rcu_dereference(p->real_parent));
59807407 rcu_read_unlock();
5981
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5982
- task_pid_nr(p), ppid,
7408
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
7409
+ free, task_pid_nr(p), ppid,
59837410 (unsigned long)task_thread_info(p)->flags);
59847411
59857412 print_worker_info(KERN_INFO, p);
5986
- show_stack(p, NULL);
7413
+ trace_android_vh_sched_show_task(p);
7414
+ show_stack(p, NULL, KERN_INFO);
59877415 put_task_stack(p);
59887416 }
59897417 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6014,13 +7442,6 @@
60147442 {
60157443 struct task_struct *g, *p;
60167444
6017
-#if BITS_PER_LONG == 32
6018
- printk(KERN_INFO
6019
- " task PC stack pid father\n");
6020
-#else
6021
- printk(KERN_INFO
6022
- " task PC stack pid father\n");
6023
-#endif
60247445 rcu_read_lock();
60257446 for_each_process_thread(g, p) {
60267447 /*
....@@ -6056,7 +7477,7 @@
60567477 * NOTE: this function does not set the idle thread's NEED_RESCHED
60577478 * flag, to make booting more robust.
60587479 */
6059
-void init_idle(struct task_struct *idle, int cpu)
7480
+void __init init_idle(struct task_struct *idle, int cpu)
60607481 {
60617482 struct rq *rq = cpu_rq(cpu);
60627483 unsigned long flags;
....@@ -6070,9 +7491,6 @@
60707491 idle->se.exec_start = sched_clock();
60717492 idle->flags |= PF_IDLE;
60727493
6073
- scs_task_reset(idle);
6074
- kasan_unpoison_task_stack(idle);
6075
-
60767494 #ifdef CONFIG_SMP
60777495 /*
60777495 * It's possible that init_idle() gets called multiple times on a task,
....@@ -6080,7 +7498,7 @@
60807498 *
60817499 * And since this is boot we can forgo the serialization.
60827500 */
6083
- set_cpus_allowed_common(idle, cpumask_of(cpu));
7501
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
60847502 #endif
60857503 /*
60867504 * We're having a chicken and egg problem, even though we are
....@@ -6096,7 +7514,8 @@
60967514 __set_task_cpu(idle, cpu);
60977515 rcu_read_unlock();
60987516
6099
- rq->curr = rq->idle = idle;
7517
+ rq->idle = idle;
7518
+ rcu_assign_pointer(rq->curr, idle);
61007519 idle->on_rq = TASK_ON_RQ_QUEUED;
61017520 #ifdef CONFIG_SMP
61027521 idle->on_cpu = 1;
....@@ -6106,7 +7525,9 @@
61067525
61077526 /* Set the preempt count _outside_ the spinlocks! */
61087527 init_idle_preempt_count(idle, cpu);
6109
-
7528
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
7529
+ task_thread_info(idle)->preempt_lazy_count = 0;
7530
+#endif
61107531 /*
61117532 * The idle tasks have their own, simple scheduling class:
61127533 */
....@@ -6134,7 +7555,7 @@
61347555 }
61357556
61367557 int task_can_attach(struct task_struct *p,
6137
- const struct cpumask *cs_cpus_allowed)
7558
+ const struct cpumask *cs_effective_cpus)
61387559 {
61397560 int ret = 0;
61407561
....@@ -6145,7 +7566,7 @@
61457566 * allowed nodes is unnecessary. Thus, cpusets are not
61467567 * applicable for such threads. This prevents checking for
61477568 * success of set_cpus_allowed_ptr() on all attached tasks
6148
- * before cpus_allowed may be changed.
7569
+ * before cpus_mask may be changed.
61497570 */
61507571 if (p->flags & PF_NO_SETAFFINITY) {
61517572 ret = -EINVAL;
....@@ -6153,8 +7574,13 @@
61537574 }
61547575
61557576 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6156
- cs_cpus_allowed))
6157
- ret = dl_task_can_attach(p, cs_cpus_allowed);
7577
+ cs_effective_cpus)) {
7578
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
7579
+
7580
+ if (unlikely(cpu >= nr_cpu_ids))
7581
+ return -EINVAL;
7582
+ ret = dl_cpu_busy(cpu, p);
7583
+ }
61587584
61597585 out:
61607586 return ret;
....@@ -6172,7 +7598,7 @@
61727598 if (curr_cpu == target_cpu)
61737599 return 0;
61747600
6175
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
7601
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
61767602 return -EINVAL;
61777603
61787604 /* TODO: This is not properly updating schedstats */
....@@ -6205,12 +7631,13 @@
62057631 if (queued)
62067632 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
62077633 if (running)
6208
- set_curr_task(rq, p);
7634
+ set_next_task(rq, p);
62097635 task_rq_unlock(rq, p, &rf);
62107636 }
62117637 #endif /* CONFIG_NUMA_BALANCING */
62127638
62137639 #ifdef CONFIG_HOTPLUG_CPU
7640
+
62147641 /*
62157642 * Ensure that the idle task is using init_mm right before its CPU goes
62167643 * offline.
....@@ -6230,123 +7657,163 @@
62307657 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
62317658 }
62327659
6233
-/*
6234
- * Since this CPU is going 'away' for a while, fold any nr_active delta
6235
- * we might have. Assumes we're called after migrate_tasks() so that the
6236
- * nr_active count is stable. We need to take the teardown thread which
6237
- * is calling this into account, so we hand in adjust = 1 to the load
6238
- * calculation.
6239
- *
6240
- * Also see the comment "Global load-average calculations".
6241
- */
6242
-static void calc_load_migrate(struct rq *rq)
7660
+static int __balance_push_cpu_stop(void *arg)
62437661 {
6244
- long delta = calc_load_fold_active(rq, 1);
6245
- if (delta)
6246
- atomic_long_add(delta, &calc_load_tasks);
6247
-}
7662
+ struct task_struct *p = arg;
7663
+ struct rq *rq = this_rq();
7664
+ struct rq_flags rf;
7665
+ int cpu;
62487666
6249
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6250
-{
6251
-}
7667
+ raw_spin_lock_irq(&p->pi_lock);
7668
+ rq_lock(rq, &rf);
62527669
6253
-static const struct sched_class fake_sched_class = {
6254
- .put_prev_task = put_prev_task_fake,
6255
-};
6256
-
6257
-static struct task_struct fake_task = {
6258
- /*
6259
- * Avoid pull_{rt,dl}_task()
6260
- */
6261
- .prio = MAX_PRIO + 1,
6262
- .sched_class = &fake_sched_class,
6263
-};
6264
-
6265
-/*
6266
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
6267
- * try_to_wake_up()->select_task_rq().
6268
- *
6269
- * Called with rq->lock held even though we'er in stop_machine() and
6270
- * there's no concurrency possible, we hold the required locks anyway
6271
- * because of lock validation efforts.
6272
- */
6273
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6274
-{
6275
- struct rq *rq = dead_rq;
6276
- struct task_struct *next, *stop = rq->stop;
6277
- struct rq_flags orf = *rf;
6278
- int dest_cpu;
6279
-
6280
- /*
6281
- * Fudge the rq selection such that the below task selection loop
6282
- * doesn't get stuck on the currently eligible stop task.
6283
- *
6284
- * We're currently inside stop_machine() and the rq is either stuck
6285
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
6286
- * either way we should never end up calling schedule() until we're
6287
- * done here.
6288
- */
6289
- rq->stop = NULL;
6290
-
6291
- /*
6292
- * put_prev_task() and pick_next_task() sched
6293
- * class method both need to have an up-to-date
6294
- * value of rq->clock[_task]
6295
- */
62967670 update_rq_clock(rq);
62977671
6298
- for (;;) {
6299
- /*
6300
- * There's this thread running, bail when that's the only
6301
- * remaining thread:
6302
- */
6303
- if (rq->nr_running == 1)
6304
- break;
6305
-
6306
- /*
6307
- * pick_next_task() assumes pinned rq->lock:
6308
- */
6309
- next = pick_next_task(rq, &fake_task, rf);
6310
- BUG_ON(!next);
6311
- put_prev_task(rq, next);
6312
-
6313
- /*
6314
- * Rules for changing task_struct::cpus_allowed are holding
6315
- * both pi_lock and rq->lock, such that holding either
6316
- * stabilizes the mask.
6317
- *
6318
- * Drop rq->lock is not quite as disastrous as it usually is
6319
- * because !cpu_active at this point, which means load-balance
6320
- * will not interfere. Also, stop-machine.
6321
- */
6322
- rq_unlock(rq, rf);
6323
- raw_spin_lock(&next->pi_lock);
6324
- rq_relock(rq, rf);
6325
-
6326
- /*
6327
- * Since we're inside stop-machine, _nothing_ should have
6328
- * changed the task, WARN if weird stuff happened, because in
6329
- * that case the above rq->lock drop is a fail too.
6330
- */
6331
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6332
- raw_spin_unlock(&next->pi_lock);
6333
- continue;
6334
- }
6335
-
6336
- /* Find suitable destination for @next, with force if needed. */
6337
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6338
- rq = __migrate_task(rq, rf, next, dest_cpu);
6339
- if (rq != dead_rq) {
6340
- rq_unlock(rq, rf);
6341
- rq = dead_rq;
6342
- *rf = orf;
6343
- rq_relock(rq, rf);
6344
- }
6345
- raw_spin_unlock(&next->pi_lock);
7672
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
7673
+ cpu = select_fallback_rq(rq->cpu, p);
7674
+ rq = __migrate_task(rq, &rf, p, cpu);
63467675 }
63477676
6348
- rq->stop = stop;
7677
+ rq_unlock(rq, &rf);
7678
+ raw_spin_unlock_irq(&p->pi_lock);
7679
+
7680
+ put_task_struct(p);
7681
+
7682
+ return 0;
63497683 }
7684
+
7685
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7686
+
7687
+/*
7688
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
7689
+ */
7690
+
7691
+
7692
+static void balance_push(struct rq *rq)
7693
+{
7694
+ struct task_struct *push_task = rq->curr;
7695
+
7696
+ lockdep_assert_held(&rq->lock);
7697
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
7698
+
7699
+ /*
7700
+ * Both the cpu-hotplug and stop task are in this case and are
7701
+ * required to complete the hotplug process.
7702
+ */
7703
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
7704
+ /*
7705
+ * If this is the idle task on the outgoing CPU try to wake
7706
+ * up the hotplug control thread which might wait for the
7707
+ * last task to vanish. The rcuwait_active() check is
7708
+ * accurate here because the waiter is pinned on this CPU
7709
+ * and can't obviously be running in parallel.
7710
+ *
7711
+ * On RT kernels this also has to check whether there are
7712
+ * pinned and scheduled out tasks on the runqueue. They
7713
+ * need to leave the migrate disabled section first.
7714
+ */
7715
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7716
+ rcuwait_active(&rq->hotplug_wait)) {
7717
+ raw_spin_unlock(&rq->lock);
7718
+ rcuwait_wake_up(&rq->hotplug_wait);
7719
+ raw_spin_lock(&rq->lock);
7720
+ }
7721
+ return;
7722
+ }
7723
+
7724
+ get_task_struct(push_task);
7725
+ /*
7726
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
7727
+ * Both preemption and IRQs are still disabled.
7728
+ */
7729
+ raw_spin_unlock(&rq->lock);
7730
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7731
+ this_cpu_ptr(&push_work));
7732
+ /*
7733
+ * At this point need_resched() is true and we'll take the loop in
7734
+ * schedule(). The next pick is obviously going to be the stop task
7735
+ * which is_per_cpu_kthread() and will push this task away.
7736
+ */
7737
+ raw_spin_lock(&rq->lock);
7738
+}
7739
+
7740
+static void balance_push_set(int cpu, bool on)
7741
+{
7742
+ struct rq *rq = cpu_rq(cpu);
7743
+ struct rq_flags rf;
7744
+
7745
+ rq_lock_irqsave(rq, &rf);
7746
+ if (on)
7747
+ rq->balance_flags |= BALANCE_PUSH;
7748
+ else
7749
+ rq->balance_flags &= ~BALANCE_PUSH;
7750
+ rq_unlock_irqrestore(rq, &rf);
7751
+}
7752
+
7753
+/*
7754
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
7755
+ * inactive. All tasks which are not per CPU kernel threads are either
7756
+ * pushed off this CPU now via balance_push() or placed on a different CPU
7757
+ * during wakeup. Wait until the CPU is quiescent.
7758
+ */
7759
+static void balance_hotplug_wait(void)
7760
+{
7761
+ struct rq *rq = this_rq();
7762
+
7763
+ rcuwait_wait_event(&rq->hotplug_wait,
7764
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7765
+ TASK_UNINTERRUPTIBLE);
7766
+}
7767
+
7768
+static int drain_rq_cpu_stop(void *data)
7769
+{
7770
+#ifndef CONFIG_PREEMPT_RT
7771
+ struct rq *rq = this_rq();
7772
+ struct rq_flags rf;
7773
+
7774
+ rq_lock_irqsave(rq, &rf);
7775
+ migrate_tasks(rq, &rf, false);
7776
+ rq_unlock_irqrestore(rq, &rf);
7777
+#endif
7778
+ return 0;
7779
+}
7780
+
7781
+int sched_cpu_drain_rq(unsigned int cpu)
7782
+{
7783
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7784
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7785
+
7786
+ if (idle_cpu(cpu)) {
7787
+ rq_drain->done = NULL;
7788
+ return 0;
7789
+ }
7790
+
7791
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7792
+ rq_drain_done);
7793
+}
7794
+
7795
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7796
+{
7797
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7798
+
7799
+ if (rq_drain->done)
7800
+ cpu_stop_work_wait(rq_drain);
7801
+}
7802
+
7803
+#else
7804
+
7805
+static inline void balance_push(struct rq *rq)
7806
+{
7807
+}
7808
+
7809
+static inline void balance_push_set(int cpu, bool on)
7810
+{
7811
+}
7812
+
7813
+static inline void balance_hotplug_wait(void)
7814
+{
7815
+}
7816
+
63507817 #endif /* CONFIG_HOTPLUG_CPU */
63517818
63527819 void set_rq_online(struct rq *rq)
....@@ -6417,8 +7884,10 @@
64177884 static int cpuset_cpu_inactive(unsigned int cpu)
64187885 {
64197886 if (!cpuhp_tasks_frozen) {
6420
- if (dl_cpu_busy(cpu))
6421
- return -EBUSY;
7887
+ int ret = dl_cpu_busy(cpu, NULL);
7888
+
7889
+ if (ret)
7890
+ return ret;
64227891 cpuset_update_active_cpus();
64237892 } else {
64247893 num_cpus_frozen++;
....@@ -6431,6 +7900,8 @@
64317900 {
64327901 struct rq *rq = cpu_rq(cpu);
64337902 struct rq_flags rf;
7903
+
7904
+ balance_push_set(cpu, false);
64347905
64357906 #ifdef CONFIG_SCHED_SMT
64367907 /*
....@@ -6467,19 +7938,39 @@
64677938 return 0;
64687939 }
64697940
6470
-int sched_cpu_deactivate(unsigned int cpu)
7941
+int sched_cpus_activate(struct cpumask *cpus)
64717942 {
7943
+ unsigned int cpu;
7944
+
7945
+ for_each_cpu(cpu, cpus) {
7946
+ if (sched_cpu_activate(cpu)) {
7947
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7948
+ sched_cpu_deactivate(cpu);
7949
+
7950
+ return -EBUSY;
7951
+ }
7952
+ }
7953
+
7954
+ return 0;
7955
+}
7956
+
7957
+int _sched_cpu_deactivate(unsigned int cpu)
7958
+{
7959
+ struct rq *rq = cpu_rq(cpu);
7960
+ struct rq_flags rf;
64727961 int ret;
64737962
64747963 set_cpu_active(cpu, false);
6475
- /*
6476
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6477
- * users of this state to go away such that all new such users will
6478
- * observe it.
6479
- *
6480
- * Do sync before park smpboot threads to take care the rcu boost case.
6481
- */
6482
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
7964
+
7965
+ balance_push_set(cpu, true);
7966
+
7967
+ rq_lock_irqsave(rq, &rf);
7968
+ if (rq->rd) {
7969
+ update_rq_clock(rq);
7970
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7971
+ set_rq_offline(rq);
7972
+ }
7973
+ rq_unlock_irqrestore(rq, &rf);
64837974
64847975 #ifdef CONFIG_SCHED_SMT
64857976 /*
....@@ -6494,10 +7985,51 @@
64947985
64957986 ret = cpuset_cpu_inactive(cpu);
64967987 if (ret) {
7988
+ balance_push_set(cpu, false);
64977989 set_cpu_active(cpu, true);
64987990 return ret;
64997991 }
65007992 sched_domains_numa_masks_clear(cpu);
7993
+
7994
+ update_max_interval();
7995
+
7996
+ return 0;
7997
+}
7998
+
7999
+int sched_cpu_deactivate(unsigned int cpu)
8000
+{
8001
+ int ret = _sched_cpu_deactivate(cpu);
8002
+
8003
+ if (ret)
8004
+ return ret;
8005
+
8006
+ /*
8007
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
8008
+ * users of this state to go away such that all new such users will
8009
+ * observe it.
8010
+ *
8011
+ * Do the sync before parking smpboot threads to take care of the RCU boost case.
8012
+ */
8013
+ synchronize_rcu();
8014
+
8015
+ return 0;
8016
+}
8017
+
8018
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
8019
+{
8020
+ unsigned int cpu;
8021
+
8022
+ for_each_cpu(cpu, cpus) {
8023
+ if (_sched_cpu_deactivate(cpu)) {
8024
+ for_each_cpu(cpu, cpus) {
8025
+ if (!cpu_active(cpu))
8026
+ sched_cpu_activate(cpu);
8027
+ }
8028
+
8029
+ return -EBUSY;
8030
+ }
8031
+ }
8032
+
65018033 return 0;
65028034 }
65038035
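
The _nosync variant deliberately leaves out the synchronize_rcu() that sched_cpu_deactivate() issues after clearing a CPU from cpu_active_mask, so a caller taking down a whole mask is expected to pay for one combined grace period itself. An illustrative pairing (caller name invented), mirroring the single-CPU path above:

static int take_down_cpus(struct cpumask *cpus)
{
	int ret = sched_cpus_deactivate_nosync(cpus);

	/* One grace period covers every CPU in the mask. */
	if (!ret)
		synchronize_rcu();

	return ret;
}
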
....@@ -6506,37 +8038,67 @@
65068038 struct rq *rq = cpu_rq(cpu);
65078039
65088040 rq->calc_load_update = calc_load_update;
6509
- update_max_interval();
65108041 }
65118042
65128043 int sched_cpu_starting(unsigned int cpu)
65138044 {
65148045 sched_rq_cpu_starting(cpu);
65158046 sched_tick_start(cpu);
8047
+ trace_android_rvh_sched_cpu_starting(cpu);
65168048 return 0;
65178049 }
65188050
65198051 #ifdef CONFIG_HOTPLUG_CPU
8052
+
8053
+/*
8054
+ * Invoked immediately before the stopper thread is invoked to bring the
8055
+ * CPU down completely. At this point all per CPU kthreads except the
8056
+ * hotplug thread (current) and the stopper thread (inactive) have been
8057
+ * either parked or have been unbound from the outgoing CPU. Ensure that
8058
+ * any of those which might be on the way out are gone.
8059
+ *
8060
+ * If after this point a bound task is being woken on this CPU then the
8061
+ * responsible hotplug callback has failed to do it's job.
8062
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
8063
+ */
8064
+int sched_cpu_wait_empty(unsigned int cpu)
8065
+{
8066
+ balance_hotplug_wait();
8067
+ return 0;
8068
+}
8069
+
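
sched_cpu_wait_empty() is a thin hotplug-state wrapper around balance_hotplug_wait(). Mainline wires it up as the teardown callback of a dedicated cpuhp state so it runs on the outgoing CPU after deactivation; the entry below is how upstream kernel/cpu.c does it and is shown only for orientation, this tree may differ:

	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
		.name			= "sched:waitempty",
		.startup.single		= NULL,
		.teardown.single	= sched_cpu_wait_empty,
	},
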
8070
+/*
8071
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
8072
+ * might have. Called from the CPU stopper task after ensuring that the
8073
+ * stopper is the last running task on the CPU, so nr_active count is
8074
+ * stable. We need to take the teardown thread which is calling this into
8075
+ * account, so we hand in adjust = 1 to the load calculation.
8076
+ *
8077
+ * Also see the comment "Global load-average calculations".
8078
+ */
8079
+static void calc_load_migrate(struct rq *rq)
8080
+{
8081
+ long delta = calc_load_fold_active(rq, 1);
8082
+
8083
+ if (delta)
8084
+ atomic_long_add(delta, &calc_load_tasks);
8085
+}
8086
+
65208087 int sched_cpu_dying(unsigned int cpu)
65218088 {
65228089 struct rq *rq = cpu_rq(cpu);
65238090 struct rq_flags rf;
65248091
65258092 /* Handle pending wakeups and then migrate everything off */
6526
- sched_ttwu_pending();
65278093 sched_tick_stop(cpu);
65288094
65298095 rq_lock_irqsave(rq, &rf);
6530
- if (rq->rd) {
6531
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6532
- set_rq_offline(rq);
6533
- }
6534
- migrate_tasks(rq, &rf);
6535
- BUG_ON(rq->nr_running != 1);
8096
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
65368097 rq_unlock_irqrestore(rq, &rf);
65378098
8099
+ trace_android_rvh_sched_cpu_dying(cpu);
8100
+
65388101 calc_load_migrate(rq);
6539
- update_max_interval();
65408102 nohz_balance_exit_idle(rq);
65418103 hrtick_clear(rq);
65428104 return 0;
....@@ -6550,18 +8112,16 @@
65508112 /*
65518113 * There's no userspace yet to cause hotplug operations; hence all the
65528114 * CPU masks are stable and all blatant races in the below code cannot
6553
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6554
- * but there won't be any contention on it.
8115
+ * happen.
65558116 */
6556
- cpus_read_lock();
65578117 mutex_lock(&sched_domains_mutex);
65588118 sched_init_domains(cpu_active_mask);
65598119 mutex_unlock(&sched_domains_mutex);
6560
- cpus_read_unlock();
65618120
65628121 /* Move init over to a non-isolated CPU */
65638122 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
65648123 BUG();
8124
+
65658125 sched_init_granularity();
65668126
65678127 init_sched_rt_class();
....@@ -6572,7 +8132,7 @@
65728132
65738133 static int __init migration_init(void)
65748134 {
6575
- sched_rq_cpu_starting(smp_processor_id());
8135
+ sched_cpu_starting(smp_processor_id());
65768136 return 0;
65778137 }
65788138 early_initcall(migration_init);
....@@ -6597,7 +8157,9 @@
65978157 * Every task in system belongs to this group at bootup.
65988158 */
65998159 struct task_group root_task_group;
8160
+EXPORT_SYMBOL_GPL(root_task_group);
66008161 LIST_HEAD(task_groups);
8162
+EXPORT_SYMBOL_GPL(task_groups);
66018163
66028164 /* Cacheline aligned slab cache for task_group */
66038165 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6608,19 +8170,27 @@
66088170
66098171 void __init sched_init(void)
66108172 {
6611
- int i, j;
6612
- unsigned long alloc_size = 0, ptr;
8173
+ unsigned long ptr = 0;
8174
+ int i;
8175
+
8176
+ /* Make sure the linker didn't screw up */
8177
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8178
+ &fair_sched_class + 1 != &rt_sched_class ||
8179
+ &rt_sched_class + 1 != &dl_sched_class);
8180
+#ifdef CONFIG_SMP
8181
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8182
+#endif
66138183
66148184 wait_bit_init();
66158185
66168186 #ifdef CONFIG_FAIR_GROUP_SCHED
6617
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8187
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66188188 #endif
66198189 #ifdef CONFIG_RT_GROUP_SCHED
6620
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8190
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66218191 #endif
6622
- if (alloc_size) {
6623
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8192
+ if (ptr) {
8193
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
66248194
66258195 #ifdef CONFIG_FAIR_GROUP_SCHED
66268196 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6629,6 +8199,8 @@
66298199 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
66308200 ptr += nr_cpu_ids * sizeof(void **);
66318201
8202
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8203
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
66328204 #endif /* CONFIG_FAIR_GROUP_SCHED */
66338205 #ifdef CONFIG_RT_GROUP_SCHED
66348206 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6681,7 +8253,6 @@
66818253 init_rt_rq(&rq->rt);
66828254 init_dl_rq(&rq->dl);
66838255 #ifdef CONFIG_FAIR_GROUP_SCHED
6684
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
66858256 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
66868257 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
66878258 /*
....@@ -6703,7 +8274,6 @@
67038274 * We achieve this by letting root_task_group's tasks sit
67048275 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
67058276 */
6706
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67078277 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
67088278 #endif /* CONFIG_FAIR_GROUP_SCHED */
67098279
....@@ -6711,10 +8281,6 @@
67118281 #ifdef CONFIG_RT_GROUP_SCHED
67128282 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
67138283 #endif
6714
-
6715
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6716
- rq->cpu_load[j] = 0;
6717
-
67188284 #ifdef CONFIG_SMP
67198285 rq->sd = NULL;
67208286 rq->rd = NULL;
....@@ -6733,16 +8299,20 @@
67338299
67348300 rq_attach_root(rq, &def_root_domain);
67358301 #ifdef CONFIG_NO_HZ_COMMON
6736
- rq->last_load_update_tick = jiffies;
67378302 rq->last_blocked_load_update_tick = jiffies;
67388303 atomic_set(&rq->nohz_flags, 0);
8304
+
8305
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
8306
+#endif
8307
+#ifdef CONFIG_HOTPLUG_CPU
8308
+ rcuwait_init(&rq->hotplug_wait);
67398309 #endif
67408310 #endif /* CONFIG_SMP */
67418311 hrtick_rq_init(rq);
67428312 atomic_set(&rq->nr_iowait, 0);
67438313 }
67448314
6745
- set_load_weight(&init_task, false);
8315
+ set_load_weight(&init_task);
67468316
67478317 /*
67488318 * The boot idle thread does lazy MMU switching as well:
....@@ -6777,7 +8347,7 @@
67778347 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
67788348 static inline int preempt_count_equals(int preempt_offset)
67798349 {
6780
- int nested = preempt_count() + rcu_preempt_depth();
8350
+ int nested = preempt_count() + sched_rcu_preempt_depth();
67818351
67828352 return (nested == preempt_offset);
67838353 }
....@@ -6811,7 +8381,7 @@
68118381 rcu_sleep_check();
68128382
68138383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6814
- !is_idle_task(current)) ||
8384
+ !is_idle_task(current) && !current->non_block_count) ||
68158385 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
68168386 oops_in_progress)
68178387 return;
....@@ -6827,8 +8397,8 @@
68278397 "BUG: sleeping function called from invalid context at %s:%d\n",
68288398 file, line);
68298399 printk(KERN_ERR
6830
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6831
- in_atomic(), irqs_disabled(),
8400
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8401
+ in_atomic(), irqs_disabled(), current->non_block_count,
68328402 current->pid, current->comm);
68338403
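
The non_block_count check above makes ___might_sleep() complain inside regions that merely declare themselves non-blocking, even though preemption and interrupts stay enabled. An illustrative caller, assuming the non_block_start()/non_block_end() helpers that increment and decrement current->non_block_count (the ctx type and worker function are invented):

static void run_nonblocking_section(struct walk_ctx *ctx)
{
	non_block_start();
	/* Any might_sleep() reached from here now produces the splat above. */
	do_strictly_nonblocking_walk(ctx);
	non_block_end();
}
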
68348404 if (task_stack_end_corrupted(current))
....@@ -6840,13 +8410,76 @@
68408410 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
68418411 && !preempt_count_equals(preempt_offset)) {
68428412 pr_err("Preemption disabled at:");
6843
- print_ip_sym(preempt_disable_ip);
6844
- pr_cont("\n");
8413
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
68458414 }
8415
+
8416
+ trace_android_rvh_schedule_bug(NULL);
8417
+
68468418 dump_stack();
68478419 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
68488420 }
68498421 EXPORT_SYMBOL(___might_sleep);
8422
+
8423
+void __cant_sleep(const char *file, int line, int preempt_offset)
8424
+{
8425
+ static unsigned long prev_jiffy;
8426
+
8427
+ if (irqs_disabled())
8428
+ return;
8429
+
8430
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8431
+ return;
8432
+
8433
+ if (preempt_count() > preempt_offset)
8434
+ return;
8435
+
8436
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8437
+ return;
8438
+ prev_jiffy = jiffies;
8439
+
8440
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8441
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8442
+ in_atomic(), irqs_disabled(),
8443
+ current->pid, current->comm);
8444
+
8445
+ debug_show_held_locks(current);
8446
+ dump_stack();
8447
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8448
+}
8449
+EXPORT_SYMBOL_GPL(__cant_sleep);
8450
+
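
__cant_sleep() is the inverse assertion to might_sleep(): it complains when a path that is supposed to already be atomic is reached with preemption and interrupts enabled. An illustrative caller, assuming the cant_sleep() wrapper macro that passes __FILE__, __LINE__ and a zero offset (the snapshot type is invented):

static void collect_cpu_snapshot(void *info)
{
	struct cpu_snapshot *snap = info;

	cant_sleep();	/* e.g. this is only ever run from IPI context */
	snap->ts_ns = ktime_get_mono_fast_ns();
	/* ... fill in the rest without blocking ... */
}
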
8451
+#ifdef CONFIG_SMP
8452
+void __cant_migrate(const char *file, int line)
8453
+{
8454
+ static unsigned long prev_jiffy;
8455
+
8456
+ if (irqs_disabled())
8457
+ return;
8458
+
8459
+ if (is_migration_disabled(current))
8460
+ return;
8461
+
8462
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8463
+ return;
8464
+
8465
+ if (preempt_count() > 0)
8466
+ return;
8467
+
8468
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8469
+ return;
8470
+ prev_jiffy = jiffies;
8471
+
8472
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8473
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8474
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
8475
+ current->pid, current->comm);
8476
+
8477
+ debug_show_held_locks(current);
8478
+ dump_stack();
8479
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8480
+}
8481
+EXPORT_SYMBOL_GPL(__cant_migrate);
8482
+#endif
68508483 #endif
68518484
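
__cant_migrate() accepts either a truly atomic context or an explicit migrate_disable() section, which is the weaker guarantee PREEMPT_RT code relies on when touching per-CPU data. An illustrative caller, assuming a cant_migrate() wrapper that expands to __cant_migrate(__FILE__, __LINE__) on SMP debug builds (the per-CPU stat type is invented):

static u64 read_local_window(struct window_stat __percpu *stats)
{
	u64 val;

	migrate_disable();
	cant_migrate();	/* documents and checks the stay-on-this-CPU requirement */
	val = this_cpu_ptr(stats)->last_sample;
	migrate_enable();

	return val;
}
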
68528485 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -6915,7 +8548,7 @@
69158548
69168549 #ifdef CONFIG_IA64
69178550 /**
6918
- * set_curr_task - set the current task for a given CPU.
8551
+ * ia64_set_curr_task - set the current task for a given CPU.
69198552 * @cpu: the processor in question.
69208553 * @p: the task pointer to set.
69218554 *
....@@ -7081,8 +8714,15 @@
70818714
70828715 if (queued)
70838716 enqueue_task(rq, tsk, queue_flags);
7084
- if (running)
7085
- set_curr_task(rq, tsk);
8717
+ if (running) {
8718
+ set_next_task(rq, tsk);
8719
+ /*
8720
+ * After changing group, the running task may have joined a
8721
+ * throttled one but it's still the running task. Trigger a
8722
+ * resched to make sure that task can still run.
8723
+ */
8724
+ resched_curr(rq);
8725
+ }
70868726
70878727 task_rq_unlock(rq, tsk, &rf);
70888728 }
....@@ -7121,9 +8761,14 @@
71218761
71228762 #ifdef CONFIG_UCLAMP_TASK_GROUP
71238763 /* Propagate the effective uclamp value for the new group */
8764
+ mutex_lock(&uclamp_mutex);
8765
+ rcu_read_lock();
71248766 cpu_util_update_eff(css);
8767
+ rcu_read_unlock();
8768
+ mutex_unlock(&uclamp_mutex);
71258769 #endif
71268770
8771
+ trace_android_rvh_cpu_cgroup_online(css);
71278772 return 0;
71288773 }
71298774
....@@ -7189,6 +8834,9 @@
71898834 if (ret)
71908835 break;
71918836 }
8837
+
8838
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8839
+
71928840 return ret;
71938841 }
71948842
....@@ -7199,6 +8847,8 @@
71998847
72008848 cgroup_taskset_for_each(task, css, tset)
72018849 sched_move_task(task);
8850
+
8851
+ trace_android_rvh_cpu_cgroup_attach(tset);
72028852 }
72038853
72048854 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7210,6 +8860,9 @@
72108860 unsigned int eff[UCLAMP_CNT];
72118861 enum uclamp_id clamp_id;
72128862 unsigned int clamps;
8863
+
8864
+ lockdep_assert_held(&uclamp_mutex);
8865
+ SCHED_WARN_ON(!rcu_read_lock_held());
72138866
72148867 css_for_each_descendant_pre(css, top_css) {
72158868 uc_parent = css_tg(css)->parent
....@@ -7243,7 +8896,7 @@
72438896 }
72448897
72458898 /* Immediately update descendants RUNNABLE tasks */
7246
- uclamp_update_active_tasks(css, clamps);
8899
+ uclamp_update_active_tasks(css);
72478900 }
72488901 }
72498902
....@@ -7300,6 +8953,8 @@
73008953 req = capacity_from_percent(buf);
73018954 if (req.ret)
73028955 return req.ret;
8956
+
8957
+ static_branch_enable(&sched_uclamp_used);
73038958
73048959 mutex_lock(&uclamp_mutex);
73058960 rcu_read_lock();
....@@ -7415,7 +9070,9 @@
74159070 static DEFINE_MUTEX(cfs_constraints_mutex);
74169071
74179072 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7418
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9073
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9074
+/* More than 203 days if BW_SHIFT equals 20. */
9075
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
74199076
74209077 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
74219078
....@@ -7441,6 +9098,12 @@
74419098 * feasibility.
74429099 */
74439100 if (period > max_cfs_quota_period)
9101
+ return -EINVAL;
9102
+
9103
+ /*
9104
+ * Bound quota to defend quota against overflow during bandwidth shift.
9105
+ */
9106
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
74449107 return -EINVAL;
74459108
74469109 /*
....@@ -7495,7 +9158,7 @@
74959158 return ret;
74969159 }
74979160
7498
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9161
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
74999162 {
75009163 u64 quota, period;
75019164
....@@ -7510,7 +9173,7 @@
75109173 return tg_set_cfs_bandwidth(tg, period, quota);
75119174 }
75129175
7513
-long tg_get_cfs_quota(struct task_group *tg)
9176
+static long tg_get_cfs_quota(struct task_group *tg)
75149177 {
75159178 u64 quota_us;
75169179
....@@ -7523,7 +9186,7 @@
75239186 return quota_us;
75249187 }
75259188
7526
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9189
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
75279190 {
75289191 u64 quota, period;
75299192
....@@ -7536,7 +9199,7 @@
75369199 return tg_set_cfs_bandwidth(tg, period, quota);
75379200 }
75389201
7539
-long tg_get_cfs_period(struct task_group *tg)
9202
+static long tg_get_cfs_period(struct task_group *tg)
75409203 {
75419204 u64 cfs_period_us;
75429205
....@@ -8013,4 +9676,7 @@
80139676 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
80149677 };
80159678
8016
-#undef CREATE_TRACE_POINTS
9679
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
9680
+{
9681
+ trace_sched_update_nr_running_tp(rq, count);
9682
+}
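
call_trace_sched_update_nr_running() is a plain wrapper that lets callers outside this file fire the bare sched_update_nr_running_tp tracepoint. A sketch of an external consumer, assuming the register/unregister helpers generated for the tracepoint are available to GPL modules (probe and module names are invented; struct rq is scheduler-private, so an out-of-tree probe should treat the pointer as opaque unless it carries the scheduler headers):

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static void probe_nr_running(void *data, struct rq *rq, int change)
{
	/* Feed a load tracker here; probes must not sleep. */
}

static int __init nr_running_probe_init(void)
{
	return register_trace_sched_update_nr_running_tp(probe_nr_running, NULL);
}

static void __exit nr_running_probe_exit(void)
{
	unregister_trace_sched_update_nr_running_tp(probe_nr_running, NULL);
	tracepoint_synchronize_unregister();
}

module_init(nr_running_probe_init);
module_exit(nr_running_probe_exit);
MODULE_LICENSE("GPL");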