forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
 * kernel/sched/core.c
 *
@@ -5,6 +6,10 @@
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"

 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
 #include <asm/tlb.h>

 #include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"

 #include "pelt.h"
+#include "smp.h"

-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include <trace/hooks/sched.h>
+#include <trace/hooks/dtask.h>
+
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif

 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);

 #ifdef CONFIG_SCHED_DEBUG
 /*
@@ -38,6 +70,7 @@
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 0;
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
 #undef SCHED_FEAT
 #endif

@@ -45,11 +78,7 @@
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
-#ifdef CONFIG_PREEMPT_RT_FULL
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif

 /*
 * period over which we measure -rt task CPU usage in us.
@@ -64,6 +93,100 @@
 * default: 0.95s
 */
 int sysctl_sched_rt_runtime = 950000;
+
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task()/
+ * cpu_cgroup_fork(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */

 /*
 * __task_rq_lock - lock the rq @p resides on.
@@ -88,6 +211,7 @@
 cpu_relax();
 }
 }
+EXPORT_SYMBOL_GPL(__task_rq_lock);

 /*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
@@ -130,6 +254,7 @@
 cpu_relax();
 }
 }
+EXPORT_SYMBOL_GPL(task_rq_lock);

 /*
 * RQ-clock updating methods:
@@ -210,7 +335,15 @@
 rq->clock += delta;
 update_rq_clock_task(rq, delta);
 }
+EXPORT_SYMBOL_GPL(update_rq_clock);

+static inline void
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}

 #ifdef CONFIG_SCHED_HRTICK
 /*
@@ -247,8 +380,9 @@
 static void __hrtick_restart(struct rq *rq)
 {
 struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;

- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
 }

 /*
@@ -261,7 +395,6 @@

 rq_lock(rq, &rf);
 __hrtick_restart(rq);
- rq->hrtick_csd_pending = 0;
 rq_unlock(rq, &rf);
 }

@@ -273,7 +406,6 @@
 void hrtick_start(struct rq *rq, u64 delay)
 {
 struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
 s64 delta;

 /*
@@ -281,16 +413,12 @@
 * doesn't make sense and can cause timer DoS.
 */
 delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

- hrtimer_set_expires(timer, time);
-
- if (rq == this_rq()) {
+ if (rq == this_rq())
 __hrtick_restart(rq);
- } else if (!rq->hrtick_csd_pending) {
+ else
 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
- rq->hrtick_csd_pending = 1;
- }
 }

 #else
@@ -307,20 +435,16 @@
 */
 delay = max_t(u64, delay, 10000LL);
 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
 }
+
 #endif /* CONFIG_SMP */

 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
- rq->hrtick_csd_pending = 0;
-
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
 #endif
-
 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 rq->hrtick_timer.function = hrtick;
 }
@@ -403,15 +527,9 @@
 #endif
 #endif

-void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
- bool sleeper)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
- struct wake_q_node *node;
-
- if (sleeper)
- node = &task->wake_q_sleeper;
- else
- node = &task->wake_q;
+ struct wake_q_node *node = &task->wake_q;

 /*
 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -422,50 +540,79 @@
 * state, even in the failed case, an explicit smp_mb() must be used.
 */
 smp_mb__before_atomic();
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- head->count++;
-
- get_task_struct(task);
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;

 /*
 * The head is context local, there can be no concurrency.
 */
 *head->lastp = node;
 head->lastp = &node->next;
+ head->count++;
+ return true;
 }

-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- int sibling_count_hint);
-void __wake_up_q(struct wake_q_head *head, bool sleeper)
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
 {
 struct wake_q_node *node = head->first;

 while (node != WAKE_Q_TAIL) {
 struct task_struct *task;

- if (sleeper)
- task = container_of(node, struct task_struct, wake_q_sleeper);
- else
- task = container_of(node, struct task_struct, wake_q);
+ task = container_of(node, struct task_struct, wake_q);
 BUG_ON(!task);
 /* Task can safely be re-inserted now: */
 node = node->next;
- if (sleeper)
- task->wake_q_sleeper.next = NULL;
- else
- task->wake_q.next = NULL;
+ task->wake_q.next = NULL;
+ task->wake_q_count = head->count;
+
 /*
 * wake_up_process() executes a full barrier, which pairs with
 * the queueing in wake_q_add() so as not to miss wakeups.
 */
- if (sleeper)
- wake_up_lock_sleeper(task);
- else
- wake_up_process(task);
-
+ wake_up_process(task);
+ task->wake_q_count = 0;
 put_task_struct(task);
 }
 }
@@ -495,57 +642,12 @@
 return;
 }

-#ifdef CONFIG_PREEMPT
 if (set_nr_and_not_polling(curr))
-#else
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
-#endif
 smp_send_reschedule(cpu);
 else
 trace_sched_wake_idle_without_ipi(cpu);
 }
-
-#ifdef CONFIG_PREEMPT_LAZY
-
-static int tsk_is_polling(struct task_struct *p)
-{
-#ifdef TIF_POLLING_NRFLAG
- return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-#else
- return 0;
-#endif
-}
-
-void resched_curr_lazy(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- int cpu;
-
- if (!sched_feat(PREEMPT_LAZY)) {
- resched_curr(rq);
- return;
- }
-
- lockdep_assert_held(&rq->lock);
-
- if (test_tsk_need_resched(curr))
- return;
-
- if (test_tsk_need_resched_lazy(curr))
- return;
-
- set_tsk_need_resched_lazy(curr);
-
- cpu = cpu_of(rq);
- if (cpu == smp_processor_id())
- return;
-
- /* NEED_RESCHED_LAZY must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(curr))
- smp_send_reschedule(cpu);
-}
-#endif
+EXPORT_SYMBOL_GPL(resched_curr);

 void resched_cpu(int cpu)
 {
@@ -570,27 +672,49 @@
 */
 int get_nohz_timer_target(void)
 {
- int i, cpu = smp_processor_id();
+ int i, cpu = smp_processor_id(), default_cpu = -1;
 struct sched_domain *sd;

- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }

 rcu_read_lock();
 for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
 if (cpu == i)
 continue;

- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i)) {
 cpu = i;
 goto unlock;
 }
 }
 }

- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (default_cpu == -1) {
+ for_each_cpu_and(i, cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+
+ /* no active, not-idle, housekeeping CPU found. */
+ default_cpu = cpumask_any(cpu_active_mask);
+
+ if (unlikely(default_cpu >= nr_cpu_ids))
+ goto unlock;
+ }
+
+ cpu = default_cpu;
 unlock:
 rcu_read_unlock();
 return cpu;
@@ -650,29 +774,23 @@
 wake_up_idle_cpu(cpu);
 }

-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
 {
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;

 /*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
 */
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));

-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
 }

 #endif /* CONFIG_NO_HZ_COMMON */
@@ -763,18 +881,18 @@
 }
 #endif

-static void set_load_weight(struct task_struct *p, bool update_load)
+static void set_load_weight(struct task_struct *p)
 {
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
 int prio = p->static_prio - MAX_RT_PRIO;
 struct load_weight *load = &p->se.load;

 /*
 * SCHED_IDLE tasks get minimal weight:
 */
- if (idle_policy(p->policy)) {
+ if (task_has_idle_policy(p)) {
 load->weight = scale_load(WEIGHT_IDLEPRIO);
 load->inv_weight = WMULT_IDLEPRIO;
- p->se.runnable_weight = load->weight;
 return;
 }

@@ -787,7 +905,6 @@
 } else {
 load->weight = scale_load(sched_prio_to_weight[prio]);
 load->inv_weight = sched_prio_to_wmult[prio];
- p->se.runnable_weight = load->weight;
 }
 }

@@ -810,8 +927,46 @@
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+EXPORT_SYMBOL_GPL(sched_uclamp_used);

 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -822,11 +977,6 @@
 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
 {
 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
-}
-
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
 }

 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -892,12 +1042,79 @@
 return uclamp_idle_value(rq, clamp_id, clamp_value);
 }

+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ rq = task_rq_lock(p, &rf);
+ __uclamp_update_util_min_rt_default(p);
+ task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
+{
+ uclamp_sync_util_min_rt_default();
+}
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
+#endif
+
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
+ /* Copy by value as we could modify it */
 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 #ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;

 /*
 * Tasks in autogroups or root task group will be
@@ -908,9 +1125,11 @@
 if (task_group(p) == &root_task_group)
 return uc_req;

- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
 #endif

 return uc_req;
@@ -929,6 +1148,12 @@
 {
 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
 struct uclamp_se uc_max = uclamp_default[clamp_id];
+ struct uclamp_se uc_eff;
+ int ret = 0;
+
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
+ if (ret)
+ return uc_eff;

 /* System default restrictions always apply */
 if (unlikely(uc_req.value > uc_max.value))
@@ -949,6 +1174,7 @@

 return (unsigned long)uc_eff.value;
 }
+EXPORT_SYMBOL_GPL(uclamp_eff_value);

 /*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
@@ -1009,10 +1235,38 @@

 lockdep_assert_held(&rq->lock);

+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * Need to be careful of the following enqueue/dequeue ordering
+ * problem too
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bucket->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such a race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
+
 bucket = &uc_rq->bucket[uc_se->bucket_id];
+
 SCHED_WARN_ON(!bucket->tasks);
 if (likely(bucket->tasks))
 bucket->tasks--;
+
 uc_se->active = false;

 /*
....@@ -1040,6 +1294,15 @@
10401294 {
10411295 enum uclamp_id clamp_id;
10421296
1297
+ /*
1298
+ * Avoid any overhead until uclamp is actually used by the userspace.
1299
+ *
1300
+ * The condition is constructed such that a NOP is generated when
1301
+ * sched_uclamp_used is disabled.
1302
+ */
1303
+ if (!static_branch_unlikely(&sched_uclamp_used))
1304
+ return;
1305
+
10431306 if (unlikely(!p->sched_class->uclamp_enabled))
10441307 return;
10451308
....@@ -1055,6 +1318,15 @@
10551318 {
10561319 enum uclamp_id clamp_id;
10571320
1321
+ /*
1322
+ * Avoid any overhead until uclamp is actually used by the userspace.
1323
+ *
1324
+ * The condition is constructed such that a NOP is generated when
1325
+ * sched_uclamp_used is disabled.
1326
+ */
1327
+ if (!static_branch_unlikely(&sched_uclamp_used))
1328
+ return;
1329
+
10581330 if (unlikely(!p->sched_class->uclamp_enabled))
10591331 return;
10601332
....@@ -1062,9 +1334,27 @@
10621334 uclamp_rq_dec_id(rq, p, clamp_id);
10631335 }
10641336
1065
-static inline void
1066
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1337
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1338
+ enum uclamp_id clamp_id)
10671339 {
1340
+ if (!p->uclamp[clamp_id].active)
1341
+ return;
1342
+
1343
+ uclamp_rq_dec_id(rq, p, clamp_id);
1344
+ uclamp_rq_inc_id(rq, p, clamp_id);
1345
+
1346
+ /*
1347
+ * Make sure to clear the idle flag if we've transiently reached 0
1348
+ * active tasks on rq.
1349
+ */
1350
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1351
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1352
+}
1353
+
1354
+static inline void
1355
+uclamp_update_active(struct task_struct *p)
1356
+{
1357
+ enum uclamp_id clamp_id;
10681358 struct rq_flags rf;
10691359 struct rq *rq;
10701360
....@@ -1084,30 +1374,22 @@
10841374 * affecting a valid clamp bucket, the next time it's enqueued,
10851375 * it will already see the updated clamp bucket value.
10861376 */
1087
- if (p->uclamp[clamp_id].active) {
1088
- uclamp_rq_dec_id(rq, p, clamp_id);
1089
- uclamp_rq_inc_id(rq, p, clamp_id);
1090
- }
1377
+ for_each_clamp_id(clamp_id)
1378
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10911379
10921380 task_rq_unlock(rq, p, &rf);
10931381 }
10941382
10951383 #ifdef CONFIG_UCLAMP_TASK_GROUP
10961384 static inline void
1097
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1098
- unsigned int clamps)
1385
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10991386 {
1100
- enum uclamp_id clamp_id;
11011387 struct css_task_iter it;
11021388 struct task_struct *p;
11031389
11041390 css_task_iter_start(css, 0, &it);
1105
- while ((p = css_task_iter_next(&it))) {
1106
- for_each_clamp_id(clamp_id) {
1107
- if ((0x1 << clamp_id) & clamps)
1108
- uclamp_update_active(p, clamp_id);
1109
- }
1110
- }
1391
+ while ((p = css_task_iter_next(&it)))
1392
+ uclamp_update_active(p);
11111393 css_task_iter_end(&it);
11121394 }
11131395
....@@ -1130,16 +1412,16 @@
11301412 #endif
11311413
11321414 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1133
- void __user *buffer, size_t *lenp,
1134
- loff_t *ppos)
1415
+ void *buffer, size_t *lenp, loff_t *ppos)
11351416 {
11361417 bool update_root_tg = false;
1137
- int old_min, old_max;
1418
+ int old_min, old_max, old_min_rt;
11381419 int result;
11391420
11401421 mutex_lock(&uclamp_mutex);
11411422 old_min = sysctl_sched_uclamp_util_min;
11421423 old_max = sysctl_sched_uclamp_util_max;
1424
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
11431425
11441426 result = proc_dointvec(table, write, buffer, lenp, ppos);
11451427 if (result)
....@@ -1148,7 +1430,9 @@
11481430 goto done;
11491431
11501432 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1151
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1433
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1434
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1435
+
11521436 result = -EINVAL;
11531437 goto undo;
11541438 }
....@@ -1164,8 +1448,15 @@
11641448 update_root_tg = true;
11651449 }
11661450
1167
- if (update_root_tg)
1451
+ if (update_root_tg) {
1452
+ static_branch_enable(&sched_uclamp_used);
11681453 uclamp_update_root_tg();
1454
+ }
1455
+
1456
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1457
+ static_branch_enable(&sched_uclamp_used);
1458
+ uclamp_sync_util_min_rt_default();
1459
+ }
11691460
11701461 /*
11711462 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1178,6 +1469,7 @@
11781469 undo:
11791470 sysctl_sched_uclamp_util_min = old_min;
11801471 sysctl_sched_uclamp_util_max = old_max;
1472
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11811473 done:
11821474 mutex_unlock(&uclamp_mutex);
11831475
....@@ -1187,20 +1479,61 @@
11871479 static int uclamp_validate(struct task_struct *p,
11881480 const struct sched_attr *attr)
11891481 {
1190
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1191
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1482
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1483
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11921484
1193
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1194
- lower_bound = attr->sched_util_min;
1195
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1196
- upper_bound = attr->sched_util_max;
1485
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1486
+ util_min = attr->sched_util_min;
11971487
1198
- if (lower_bound > upper_bound)
1488
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1489
+ return -EINVAL;
1490
+ }
1491
+
1492
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1493
+ util_max = attr->sched_util_max;
1494
+
1495
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1496
+ return -EINVAL;
1497
+ }
1498
+
1499
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11991500 return -EINVAL;
1200
- if (upper_bound > SCHED_CAPACITY_SCALE)
1201
- return -EINVAL;
1501
+
1502
+ /*
1503
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1504
+ *
1505
+ * We need to do that here, because enabling static branches is a
1506
+ * blocking operation which obviously cannot be done while holding
1507
+ * scheduler locks.
1508
+ */
1509
+ static_branch_enable(&sched_uclamp_used);
12021510
12031511 return 0;
1512
+}
1513
+
1514
+static bool uclamp_reset(const struct sched_attr *attr,
1515
+ enum uclamp_id clamp_id,
1516
+ struct uclamp_se *uc_se)
1517
+{
1518
+ /* Reset on sched class change for a non user-defined clamp value. */
1519
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1520
+ !uc_se->user_defined)
1521
+ return true;
1522
+
1523
+ /* Reset on sched_util_{min,max} == -1. */
1524
+ if (clamp_id == UCLAMP_MIN &&
1525
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1526
+ attr->sched_util_min == -1) {
1527
+ return true;
1528
+ }
1529
+
1530
+ if (clamp_id == UCLAMP_MAX &&
1531
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1532
+ attr->sched_util_max == -1) {
1533
+ return true;
1534
+ }
1535
+
1536
+ return false;
12041537 }
12051538
12061539 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1208,40 +1541,41 @@
12081541 {
12091542 enum uclamp_id clamp_id;
12101543
1211
- /*
1212
- * On scheduling class change, reset to default clamps for tasks
1213
- * without a task-specific value.
1214
- */
12151544 for_each_clamp_id(clamp_id) {
12161545 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1217
- unsigned int clamp_value = uclamp_none(clamp_id);
1546
+ unsigned int value;
12181547
1219
- /* Keep using defined clamps across class changes */
1220
- if (uc_se->user_defined)
1548
+ if (!uclamp_reset(attr, clamp_id, uc_se))
12211549 continue;
12221550
1223
- /* By default, RT tasks always get 100% boost */
1224
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1225
- unlikely(rt_task(p) &&
1226
- clamp_id == UCLAMP_MIN)) {
1551
+ /*
1552
+ * RT tasks by default have a 100% boost value that can be modified
1553
+ * at runtime.
1554
+ */
1555
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1556
+ value = sysctl_sched_uclamp_util_min_rt_default;
1557
+ else
1558
+ value = uclamp_none(clamp_id);
12271559
1228
- clamp_value = uclamp_none(UCLAMP_MAX);
1229
- }
1560
+ uclamp_se_set(uc_se, value, false);
12301561
1231
- uclamp_se_set(uc_se, clamp_value, false);
12321562 }
12331563
12341564 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
12351565 return;
12361566
1237
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1568
+ attr->sched_util_min != -1) {
12381569 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
12391570 attr->sched_util_min, true);
1571
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
12401572 }
12411573
1242
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1574
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1575
+ attr->sched_util_max != -1) {
12431576 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
12441577 attr->sched_util_max, true);
1578
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
12451579 }
12461580 }
12471581
....@@ -1249,6 +1583,10 @@
12491583 {
12501584 enum uclamp_id clamp_id;
12511585
1586
+ /*
1587
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1588
+ * as the task is still at its early fork stages.
1589
+ */
12521590 for_each_clamp_id(clamp_id)
12531591 p->uclamp[clamp_id].active = false;
12541592
....@@ -1261,39 +1599,24 @@
12611599 }
12621600 }
12631601
1264
-#ifdef CONFIG_SMP
1265
-unsigned int uclamp_task(struct task_struct *p)
1602
+static void uclamp_post_fork(struct task_struct *p)
12661603 {
1267
- unsigned long util;
1268
-
1269
- util = task_util_est(p);
1270
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1271
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1272
-
1273
- return util;
1604
+ uclamp_update_util_min_rt_default(p);
12741605 }
12751606
1276
-bool uclamp_boosted(struct task_struct *p)
1607
+static void __init init_uclamp_rq(struct rq *rq)
12771608 {
1278
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1609
+ enum uclamp_id clamp_id;
1610
+ struct uclamp_rq *uc_rq = rq->uclamp;
1611
+
1612
+ for_each_clamp_id(clamp_id) {
1613
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1614
+ .value = uclamp_none(clamp_id)
1615
+ };
1616
+ }
1617
+
1618
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12791619 }
1280
-
1281
-bool uclamp_latency_sensitive(struct task_struct *p)
1282
-{
1283
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1284
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1285
- struct task_group *tg;
1286
-
1287
- if (!css)
1288
- return false;
1289
- tg = container_of(css, struct task_group, css);
1290
-
1291
- return tg->latency_sensitive;
1292
-#else
1293
- return false;
1294
-#endif
1295
-}
1296
-#endif /* CONFIG_SMP */
12971620
12981621 static void __init init_uclamp(void)
12991622 {
....@@ -1301,13 +1624,8 @@
13011624 enum uclamp_id clamp_id;
13021625 int cpu;
13031626
1304
- mutex_init(&uclamp_mutex);
1305
-
1306
- for_each_possible_cpu(cpu) {
1307
- memset(&cpu_rq(cpu)->uclamp, 0,
1308
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1309
- cpu_rq(cpu)->uclamp_flags = 0;
1310
- }
1627
+ for_each_possible_cpu(cpu)
1628
+ init_uclamp_rq(cpu_rq(cpu));
13111629
13121630 for_each_clamp_id(clamp_id) {
13131631 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1336,41 +1654,7 @@
13361654 static void __setscheduler_uclamp(struct task_struct *p,
13371655 const struct sched_attr *attr) { }
13381656 static inline void uclamp_fork(struct task_struct *p) { }
1339
-
1340
-long schedtune_task_margin(struct task_struct *task);
1341
-
1342
-#ifdef CONFIG_SMP
1343
-unsigned int uclamp_task(struct task_struct *p)
1344
-{
1345
- unsigned long util = task_util_est(p);
1346
-#ifdef CONFIG_SCHED_TUNE
1347
- long margin = schedtune_task_margin(p);
1348
-
1349
- trace_sched_boost_task(p, util, margin);
1350
-
1351
- util += margin;
1352
-#endif
1353
-
1354
- return util;
1355
-}
1356
-
1357
-bool uclamp_boosted(struct task_struct *p)
1358
-{
1359
-#ifdef CONFIG_SCHED_TUNE
1360
- return schedtune_task_boost(p) > 0;
1361
-#endif
1362
- return false;
1363
-}
1364
-
1365
-bool uclamp_latency_sensitive(struct task_struct *p)
1366
-{
1367
-#ifdef CONFIG_SCHED_TUNE
1368
- return schedtune_prefer_idle(p) != 0;
1369
-#endif
1370
- return false;
1371
-}
1372
-#endif /* CONFIG_SMP */
1373
-
1657
+static inline void uclamp_post_fork(struct task_struct *p) { }
13741658 static inline void init_uclamp(void) { }
13751659 #endif /* CONFIG_UCLAMP_TASK */
13761660
....@@ -1385,7 +1669,9 @@
13851669 }
13861670
13871671 uclamp_rq_inc(rq, p);
1672
+ trace_android_rvh_enqueue_task(rq, p, flags);
13881673 p->sched_class->enqueue_task(rq, p, flags);
1674
+ trace_android_rvh_after_enqueue_task(rq, p);
13891675 }
13901676
13911677 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1399,31 +1685,39 @@
13991685 }
14001686
14011687 uclamp_rq_dec(rq, p);
1688
+ trace_android_rvh_dequeue_task(rq, p, flags);
14021689 p->sched_class->dequeue_task(rq, p, flags);
1690
+ trace_android_rvh_after_dequeue_task(rq, p);
14031691 }
14041692
14051693 void activate_task(struct rq *rq, struct task_struct *p, int flags)
14061694 {
1407
- if (task_contributes_to_load(p))
1408
- rq->nr_uninterruptible--;
1409
-
14101695 enqueue_task(rq, p, flags);
1696
+
1697
+ p->on_rq = TASK_ON_RQ_QUEUED;
14111698 }
1699
+EXPORT_SYMBOL_GPL(activate_task);
14121700
14131701 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
14141702 {
1415
- if (task_contributes_to_load(p))
1416
- rq->nr_uninterruptible++;
1703
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
14171704
14181705 dequeue_task(rq, p, flags);
14191706 }
1707
+EXPORT_SYMBOL_GPL(deactivate_task);
14201708
1421
-/*
1422
- * __normal_prio - return the priority that is based on the static prio
1423
- */
1424
-static inline int __normal_prio(struct task_struct *p)
1709
+static inline int __normal_prio(int policy, int rt_prio, int nice)
14251710 {
1426
- return p->static_prio;
1711
+ int prio;
1712
+
1713
+ if (dl_policy(policy))
1714
+ prio = MAX_DL_PRIO - 1;
1715
+ else if (rt_policy(policy))
1716
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1717
+ else
1718
+ prio = NICE_TO_PRIO(nice);
1719
+
1720
+ return prio;
14271721 }
14281722
14291723 /*
....@@ -1435,15 +1729,7 @@
14351729 */
14361730 static inline int normal_prio(struct task_struct *p)
14371731 {
1438
- int prio;
1439
-
1440
- if (task_has_dl_policy(p))
1441
- prio = MAX_DL_PRIO-1;
1442
- else if (task_has_rt_policy(p))
1443
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1444
- else
1445
- prio = __normal_prio(p);
1446
- return prio;
1732
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
14471733 }
14481734
14491735 /*
....@@ -1499,20 +1785,10 @@
14991785
15001786 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
15011787 {
1502
- const struct sched_class *class;
1503
-
1504
- if (p->sched_class == rq->curr->sched_class) {
1788
+ if (p->sched_class == rq->curr->sched_class)
15051789 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1506
- } else {
1507
- for_each_class(class) {
1508
- if (class == rq->curr->sched_class)
1509
- break;
1510
- if (class == p->sched_class) {
1511
- resched_curr(rq);
1512
- break;
1513
- }
1514
- }
1515
- }
1790
+ else if (p->sched_class > rq->curr->sched_class)
1791
+ resched_curr(rq);
15161792
15171793 /*
15181794 * A queue event has occurred, and we're going to schedule. In
....@@ -1521,22 +1797,12 @@
15211797 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
15221798 rq_clock_skip_update(rq);
15231799 }
1800
+EXPORT_SYMBOL_GPL(check_preempt_curr);
15241801
15251802 #ifdef CONFIG_SMP
15261803
1527
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1528
-{
1529
- if (!(p->flags & PF_KTHREAD))
1530
- return false;
1531
-
1532
- if (p->nr_cpus_allowed != 1)
1533
- return false;
1534
-
1535
- return true;
1536
-}
1537
-
15381804 /*
1539
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1805
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
15401806 * __set_cpus_allowed_ptr() and select_fallback_rq().
15411807 */
15421808 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
....@@ -1544,10 +1810,13 @@
15441810 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
15451811 return false;
15461812
1547
- if (is_per_cpu_kthread(p) || __migrate_disabled(p))
1813
+ if (is_per_cpu_kthread(p))
15481814 return cpu_online(cpu);
15491815
1550
- return cpu_active(cpu);
1816
+ if (!cpu_active(cpu))
1817
+ return false;
1818
+
1819
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
15511820 }
15521821
15531822 /*
....@@ -1572,19 +1841,29 @@
15721841 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15731842 struct task_struct *p, int new_cpu)
15741843 {
1844
+ int detached = 0;
1845
+
15751846 lockdep_assert_held(&rq->lock);
15761847
1577
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1578
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1579
- set_task_cpu(p, new_cpu);
1580
- rq_unlock(rq, rf);
1848
+ /*
1849
+ * The vendor hook may drop the lock temporarily, so
1850
+ * pass the rq flags to unpin the lock. We expect the
1851
+ * rq lock to be held after return.
1852
+ */
1853
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1854
+ if (detached)
1855
+ goto attach;
15811856
1857
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1858
+ set_task_cpu(p, new_cpu);
1859
+
1860
+attach:
1861
+ rq_unlock(rq, rf);
15821862 rq = cpu_rq(new_cpu);
15831863
15841864 rq_lock(rq, rf);
15851865 BUG_ON(task_cpu(p) != new_cpu);
1586
- enqueue_task(rq, p, 0);
1587
- p->on_rq = TASK_ON_RQ_QUEUED;
1866
+ activate_task(rq, p, 0);
15881867 check_preempt_curr(rq, p, 0);
15891868
15901869 return rq;
....@@ -1593,7 +1872,6 @@
15931872 struct migration_arg {
15941873 struct task_struct *task;
15951874 int dest_cpu;
1596
- bool done;
15971875 };
15981876
15991877 /*
....@@ -1629,11 +1907,6 @@
16291907 struct task_struct *p = arg->task;
16301908 struct rq *rq = this_rq();
16311909 struct rq_flags rf;
1632
- int dest_cpu = arg->dest_cpu;
1633
-
1634
- /* We don't look at arg after this point. */
1635
- smp_mb();
1636
- arg->done = true;
16371910
16381911 /*
16391912 * The original target CPU might have gone down and we might
....@@ -1645,7 +1918,7 @@
16451918 * __migrate_task() such that we will not miss enforcing cpus_ptr
16461919 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
16471920 */
1648
- sched_ttwu_pending();
1921
+ flush_smp_call_function_from_idle();
16491922
16501923 raw_spin_lock(&p->pi_lock);
16511924 rq_lock(rq, &rf);
....@@ -1656,9 +1929,9 @@
16561929 */
16571930 if (task_rq(p) == rq) {
16581931 if (task_on_rq_queued(p))
1659
- rq = __migrate_task(rq, &rf, p, dest_cpu);
1932
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
16601933 else
1661
- p->wake_cpu = dest_cpu;
1934
+ p->wake_cpu = arg->dest_cpu;
16621935 }
16631936 rq_unlock(rq, &rf);
16641937 raw_spin_unlock(&p->pi_lock);
....@@ -1674,17 +1947,9 @@
16741947 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
16751948 {
16761949 cpumask_copy(&p->cpus_mask, new_mask);
1677
- if (p->cpus_ptr == &p->cpus_mask)
1678
- p->nr_cpus_allowed = cpumask_weight(new_mask);
1950
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
1951
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16791952 }
1680
-
1681
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1682
-int __migrate_disabled(struct task_struct *p)
1683
-{
1684
- return p->migrate_disable;
1685
-}
1686
-EXPORT_SYMBOL_GPL(__migrate_disabled);
1687
-#endif
16881953
16891954 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16901955 {
....@@ -1712,28 +1977,23 @@
17121977 if (queued)
17131978 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
17141979 if (running)
1715
- set_curr_task(rq, p);
1980
+ set_next_task(rq, p);
17161981 }
17171982
17181983 /*
1719
- * Change a given task's CPU affinity. Migrate the thread to a
1720
- * proper CPU and schedule it away if the CPU it's executing on
1721
- * is removed from the allowed bitmask.
1722
- *
1723
- * NOTE: the caller must have a valid reference to the task, the
1724
- * task must not exit() & deallocate itself prematurely. The
1725
- * call is not atomic; no spinlocks may be held.
1984
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
17261985 */
1727
-static int __set_cpus_allowed_ptr(struct task_struct *p,
1728
- const struct cpumask *new_mask, bool check)
1986
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
1987
+ const struct cpumask *new_mask,
1988
+ bool check,
1989
+ struct rq *rq,
1990
+ struct rq_flags *rf)
17291991 {
17301992 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1993
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
17311994 unsigned int dest_cpu;
1732
- struct rq_flags rf;
1733
- struct rq *rq;
17341995 int ret = 0;
17351996
1736
- rq = task_rq_lock(p, &rf);
17371997 update_rq_clock(rq);
17381998
17391999 if (p->flags & PF_KTHREAD) {
....@@ -1741,6 +2001,9 @@
17412001 * Kernel threads are allowed on online && !active CPUs
17422002 */
17432003 cpu_valid_mask = cpu_online_mask;
2004
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2005
+ ret = -EINVAL;
2006
+ goto out;
17442007 }
17452008
17462009 /*
....@@ -1755,7 +2018,12 @@
17552018 if (cpumask_equal(&p->cpus_mask, new_mask))
17562019 goto out;
17572020
1758
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2021
+ /*
2022
+ * Picking a ~random cpu helps in cases where we are changing affinity
2023
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2024
+ * immediately required to distribute the tasks within their new mask.
2025
+ */
2026
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
17592027 if (dest_cpu >= nr_cpu_ids) {
17602028 ret = -EINVAL;
17612029 goto out;
....@@ -1774,28 +2042,45 @@
17742042 }
17752043
17762044 /* Can the task run on the task's current CPU? If so, we're done */
1777
- if (cpumask_test_cpu(task_cpu(p), new_mask) ||
1778
- p->cpus_ptr != &p->cpus_mask)
2045
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
17792046 goto out;
17802047
17812048 if (task_running(rq, p) || p->state == TASK_WAKING) {
17822049 struct migration_arg arg = { p, dest_cpu };
17832050 /* Need help from migration thread: drop lock and wait. */
1784
- task_rq_unlock(rq, p, &rf);
2051
+ task_rq_unlock(rq, p, rf);
17852052 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1786
- tlb_migrate_finish(p->mm);
17872053 return 0;
17882054 } else if (task_on_rq_queued(p)) {
17892055 /*
17902056 * OK, since we're going to drop the lock immediately
17912057 * afterwards anyway.
17922058 */
1793
- rq = move_queued_task(rq, &rf, p, dest_cpu);
2059
+ rq = move_queued_task(rq, rf, p, dest_cpu);
17942060 }
17952061 out:
1796
- task_rq_unlock(rq, p, &rf);
2062
+ task_rq_unlock(rq, p, rf);
17972063
17982064 return ret;
2065
+}
2066
+
2067
+/*
2068
+ * Change a given task's CPU affinity. Migrate the thread to a
2069
+ * proper CPU and schedule it away if the CPU it's executing on
2070
+ * is removed from the allowed bitmask.
2071
+ *
2072
+ * NOTE: the caller must have a valid reference to the task, the
2073
+ * task must not exit() & deallocate itself prematurely. The
2074
+ * call is not atomic; no spinlocks may be held.
2075
+ */
2076
+static int __set_cpus_allowed_ptr(struct task_struct *p,
2077
+ const struct cpumask *new_mask, bool check)
2078
+{
2079
+ struct rq_flags rf;
2080
+ struct rq *rq;
2081
+
2082
+ rq = task_rq_lock(p, &rf);
2083
+ return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
17992084 }
18002085
18012086 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1803,6 +2088,74 @@
18032088 return __set_cpus_allowed_ptr(p, new_mask, false);
18042089 }
18052090 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2091
+
2092
+/*
2093
+ * Change a given task's CPU affinity to the intersection of its current
2094
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2095
+ * If the resulting mask is empty, leave the affinity unchanged and return
2096
+ * -EINVAL.
2097
+ */
2098
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2099
+ struct cpumask *new_mask,
2100
+ const struct cpumask *subset_mask)
2101
+{
2102
+ struct rq_flags rf;
2103
+ struct rq *rq;
2104
+
2105
+ rq = task_rq_lock(p, &rf);
2106
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2107
+ task_rq_unlock(rq, p, &rf);
2108
+ return -EINVAL;
2109
+ }
2110
+
2111
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2112
+}
2113
+
2114
+/*
2115
+ * Restrict a given task's CPU affinity so that it is a subset of
2116
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2117
+ * walk up the cpuset hierarchy until we find a suitable mask.
2118
+ */
2119
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2120
+{
2121
+ cpumask_var_t new_mask;
2122
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2123
+
2124
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2125
+
2126
+ /*
2127
+ * __migrate_task() can fail silently in the face of concurrent
2128
+ * offlining of the chosen destination CPU, so take the hotplug
2129
+ * lock to ensure that the migration succeeds.
2130
+ */
2131
+ trace_android_rvh_force_compatible_pre(NULL);
2132
+ cpus_read_lock();
2133
+ if (!cpumask_available(new_mask))
2134
+ goto out_set_mask;
2135
+
2136
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2137
+ goto out_free_mask;
2138
+
2139
+ /*
2140
+ * We failed to find a valid subset of the affinity mask for the
2141
+ * task, so override it based on its cpuset hierarchy.
2142
+ */
2143
+ cpuset_cpus_allowed(p, new_mask);
2144
+ override_mask = new_mask;
2145
+
2146
+out_set_mask:
2147
+ if (printk_ratelimit()) {
2148
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2149
+ task_pid_nr(p), p->comm,
2150
+ cpumask_pr_args(override_mask));
2151
+ }
2152
+
2153
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2154
+out_free_mask:
2155
+ cpus_read_unlock();
2156
+ trace_android_rvh_force_compatible_post(NULL);
2157
+ free_cpumask_var(new_mask);
2158
+}
18062159
18072160 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
18082161 {
....@@ -1851,12 +2204,13 @@
18512204 p->se.nr_migrations++;
18522205 rseq_migrate(p);
18532206 perf_event_task_migrate(p);
2207
+ trace_android_rvh_set_task_cpu(p, new_cpu);
18542208 }
18552209
18562210 __set_task_cpu(p, new_cpu);
18572211 }
2212
+EXPORT_SYMBOL_GPL(set_task_cpu);
18582213
1859
-#ifdef CONFIG_NUMA_BALANCING
18602214 static void __migrate_swap_task(struct task_struct *p, int cpu)
18612215 {
18622216 if (task_on_rq_queued(p)) {
....@@ -1869,11 +2223,9 @@
18692223 rq_pin_lock(src_rq, &srf);
18702224 rq_pin_lock(dst_rq, &drf);
18712225
1872
- p->on_rq = TASK_ON_RQ_MIGRATING;
18732226 deactivate_task(src_rq, p, 0);
18742227 set_task_cpu(p, cpu);
18752228 activate_task(dst_rq, p, 0);
1876
- p->on_rq = TASK_ON_RQ_QUEUED;
18772229 check_preempt_curr(dst_rq, p, 0);
18782230
18792231 rq_unpin_lock(dst_rq, &drf);
....@@ -1973,19 +2325,7 @@
19732325 out:
19742326 return ret;
19752327 }
1976
-#endif /* CONFIG_NUMA_BALANCING */
1977
-
1978
-static bool check_task_state(struct task_struct *p, long match_state)
1979
-{
1980
- bool match = false;
1981
-
1982
- raw_spin_lock_irq(&p->pi_lock);
1983
- if (p->state == match_state || p->saved_state == match_state)
1984
- match = true;
1985
- raw_spin_unlock_irq(&p->pi_lock);
1986
-
1987
- return match;
1988
-}
2328
+EXPORT_SYMBOL_GPL(migrate_swap);
19892329
19902330 /*
19912331 * wait_task_inactive - wait for a thread to unschedule.
....@@ -2031,7 +2371,7 @@
20312371 * is actually now running somewhere else!
20322372 */
20332373 while (task_running(rq, p)) {
2034
- if (match_state && !check_task_state(p, match_state))
2374
+ if (match_state && unlikely(p->state != match_state))
20352375 return 0;
20362376 cpu_relax();
20372377 }
....@@ -2046,8 +2386,7 @@
20462386 running = task_running(rq, p);
20472387 queued = task_on_rq_queued(p);
20482388 ncsw = 0;
2049
- if (!match_state || p->state == match_state ||
2050
- p->saved_state == match_state)
2389
+ if (!match_state || p->state == match_state)
20512390 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
20522391 task_rq_unlock(rq, p, &rf);
20532392
....@@ -2148,7 +2487,11 @@
21482487 int nid = cpu_to_node(cpu);
21492488 const struct cpumask *nodemask = NULL;
21502489 enum { cpuset, possible, fail } state = cpuset;
2151
- int dest_cpu;
2490
+ int dest_cpu = -1;
2491
+
2492
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
2493
+ if (dest_cpu >= 0)
2494
+ return dest_cpu;
21522495
21532496 /*
21542497 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2160,9 +2503,7 @@
21602503
21612504 /* Look for allowed, online CPU in same node. */
21622505 for_each_cpu(dest_cpu, nodemask) {
2163
- if (!cpu_active(dest_cpu))
2164
- continue;
2165
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2506
+ if (is_cpu_allowed(p, dest_cpu))
21662507 return dest_cpu;
21672508 }
21682509 }
....@@ -2184,12 +2525,11 @@
21842525 state = possible;
21852526 break;
21862527 }
2187
- /* Fall-through */
2528
+ fallthrough;
21882529 case possible:
2189
- do_set_cpus_allowed(p, cpu_possible_mask);
2530
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21902531 state = fail;
21912532 break;
2192
-
21932533 case fail:
21942534 BUG();
21952535 break;
....@@ -2216,14 +2556,12 @@
22162556 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22172557 */
22182558 static inline
2219
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2220
- int sibling_count_hint)
2559
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22212560 {
22222561 lockdep_assert_held(&p->pi_lock);
22232562
22242563 if (p->nr_cpus_allowed > 1)
2225
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2226
- sibling_count_hint);
2564
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22272565 else
22282566 cpu = cpumask_any(p->cpus_ptr);
22292567
....@@ -2241,12 +2579,6 @@
22412579 cpu = select_fallback_rq(task_cpu(p), p);
22422580
22432581 return cpu;
2244
-}
2245
-
2246
-static void update_avg(u64 *avg, u64 sample)
2247
-{
2248
- s64 diff = sample - *avg;
2249
- *avg += diff >> 3;
22502582 }
22512583
22522584 void sched_set_stop_task(int cpu, struct task_struct *stop)
....@@ -2328,12 +2660,6 @@
23282660 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
23292661 }
23302662
2331
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2332
-{
2333
- activate_task(rq, p, en_flags);
2334
- p->on_rq = TASK_ON_RQ_QUEUED;
2335
-}
2336
-
23372663 /*
23382664 * Mark the task runnable and perform wakeup-preemption.
23392665 */
....@@ -2375,27 +2701,54 @@
23752701 {
23762702 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
23772703
2704
+ if (wake_flags & WF_SYNC)
2705
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
2706
+
23782707 lockdep_assert_held(&rq->lock);
23792708
2380
-#ifdef CONFIG_SMP
23812709 if (p->sched_contributes_to_load)
23822710 rq->nr_uninterruptible--;
23832711
2712
+#ifdef CONFIG_SMP
23842713 if (wake_flags & WF_MIGRATED)
23852714 en_flags |= ENQUEUE_MIGRATED;
2715
+ else
23862716 #endif
2717
+ if (p->in_iowait) {
2718
+ delayacct_blkio_end(p);
2719
+ atomic_dec(&task_rq(p)->nr_iowait);
2720
+ }
23872721
2388
- ttwu_activate(rq, p, en_flags);
2722
+ activate_task(rq, p, en_flags);
23892723 ttwu_do_wakeup(rq, p, wake_flags, rf);
23902724 }
23912725
23922726 /*
2393
- * Called in case the task @p isn't fully descheduled from its runqueue,
2394
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2395
- * since all we need to do is flip p->state to TASK_RUNNING, since
2396
- * the task is still ->on_rq.
2727
+ * Consider @p being inside a wait loop:
2728
+ *
2729
+ * for (;;) {
2730
+ * set_current_state(TASK_UNINTERRUPTIBLE);
2731
+ *
2732
+ * if (CONDITION)
2733
+ * break;
2734
+ *
2735
+ * schedule();
2736
+ * }
2737
+ * __set_current_state(TASK_RUNNING);
2738
+ *
2739
+ * between set_current_state() and schedule(). In this case @p is still
2740
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2741
+ * an atomic manner.
2742
+ *
2743
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2744
+ * then schedule() must still happen and p->state can be changed to
2745
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2746
+ * need to do a full wakeup with enqueue.
2747
+ *
2748
+ * Returns: %true when the wakeup is done,
2749
+ * %false otherwise.
23972750 */
2398
-static int ttwu_remote(struct task_struct *p, int wake_flags)
2751
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23992752 {
24002753 struct rq_flags rf;
24012754 struct rq *rq;
....@@ -2414,75 +2767,63 @@
24142767 }
24152768
24162769 #ifdef CONFIG_SMP
2417
-void sched_ttwu_pending(void)
2770
+void sched_ttwu_pending(void *arg)
24182771 {
2772
+ struct llist_node *llist = arg;
24192773 struct rq *rq = this_rq();
2420
- struct llist_node *llist = llist_del_all(&rq->wake_list);
24212774 struct task_struct *p, *t;
24222775 struct rq_flags rf;
24232776
24242777 if (!llist)
24252778 return;
24262779
2780
+ /*
2781
+	 * rq::ttwu_pending is a racy indication of outstanding wakeups.
2782
+ * Races such that false-negatives are possible, since they
2783
+	 * are shorter lived than false-positives would be.
2784
+ */
2785
+ WRITE_ONCE(rq->ttwu_pending, 0);
2786
+
24272787 rq_lock_irqsave(rq, &rf);
24282788 update_rq_clock(rq);
24292789
2430
- llist_for_each_entry_safe(p, t, llist, wake_entry)
2790
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2791
+ if (WARN_ON_ONCE(p->on_cpu))
2792
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
2793
+
2794
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2795
+ set_task_cpu(p, cpu_of(rq));
2796
+
24312797 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2798
+ }
24322799
24332800 rq_unlock_irqrestore(rq, &rf);
24342801 }
24352802
2436
-void scheduler_ipi(void)
2803
+void send_call_function_single_ipi(int cpu)
24372804 {
2438
- /*
2439
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2440
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2441
- * this IPI.
2442
- */
2443
- preempt_fold_need_resched();
2805
+ struct rq *rq = cpu_rq(cpu);
24442806
2445
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2446
- return;
2447
-
2448
- /*
2449
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2450
- * traditionally all their work was done from the interrupt return
2451
- * path. Now that we actually do some work, we need to make sure
2452
- * we do call them.
2453
- *
2454
- * Some archs already do call them, luckily irq_enter/exit nest
2455
- * properly.
2456
- *
2457
- * Arguably we should visit all archs and update all handlers,
2458
- * however a fair share of IPIs are still resched only so this would
2459
- * somewhat pessimize the simple resched case.
2460
- */
2461
- irq_enter();
2462
- sched_ttwu_pending();
2463
-
2464
- /*
2465
- * Check if someone kicked us for doing the nohz idle load balance.
2466
- */
2467
- if (unlikely(got_nohz_idle_kick())) {
2468
- this_rq()->idle_balance = 1;
2469
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2470
- }
2471
- irq_exit();
2807
+ if (!set_nr_if_polling(rq->idle))
2808
+ arch_send_call_function_single_ipi(cpu);
2809
+ else
2810
+ trace_sched_wake_idle_without_ipi(cpu);
24722811 }
24732812
2474
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
2813
+/*
2814
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2815
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
2816
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
2817
+ * of the wakeup instead of the waker.
2818
+ */
2819
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
24752820 {
24762821 struct rq *rq = cpu_rq(cpu);
24772822
24782823 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
24792824
2480
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2481
- if (!set_nr_if_polling(rq->idle))
2482
- smp_send_reschedule(cpu);
2483
- else
2484
- trace_sched_wake_idle_without_ipi(cpu);
2485
- }
2825
+ WRITE_ONCE(rq->ttwu_pending, 1);
2826
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24862827 }
24872828
24882829 void wake_up_if_idle(int cpu)
....@@ -2508,6 +2849,7 @@
25082849 out:
25092850 rcu_read_unlock();
25102851 }
2852
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
25112853
25122854 bool cpus_share_cache(int this_cpu, int that_cpu)
25132855 {
....@@ -2516,6 +2858,58 @@
25162858
25172859 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
25182860 }
2861
+
2862
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2863
+{
2864
+ /*
2865
+ * If the CPU does not share cache, then queue the task on the
2866
+ * remote rqs wakelist to avoid accessing remote data.
2867
+ */
2868
+ if (!cpus_share_cache(smp_processor_id(), cpu))
2869
+ return true;
2870
+
2871
+ /*
2872
+ * If the task is descheduling and the only running task on the
2873
+ * CPU then use the wakelist to offload the task activation to
2874
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
2875
+ * nr_running is checked to avoid unnecessary task stacking.
2876
+ *
2877
+ * Note that we can only get here with (wakee) p->on_rq=0,
2878
+ * p->on_cpu can be whatever, we've done the dequeue, so
2879
+ * the wakee has been accounted out of ->nr_running.
2880
+ */
2881
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
2882
+ return true;
2883
+
2884
+ return false;
2885
+}
2886
+
2887
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2888
+{
2889
+ bool cond = false;
2890
+
2891
+ trace_android_rvh_ttwu_cond(&cond);
2892
+
2893
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
2894
+ cond) {
2895
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2896
+ return false;
2897
+
2898
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2899
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
2900
+ return true;
2901
+ }
2902
+
2903
+ return false;
2904
+}
2905
+
2906
+#else /* !CONFIG_SMP */
2907
+
2908
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2909
+{
2910
+ return false;
2911
+}
2912
+
25192913 #endif /* CONFIG_SMP */
25202914
25212915 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2523,13 +2917,8 @@
25232917 struct rq *rq = cpu_rq(cpu);
25242918 struct rq_flags rf;
25252919
2526
-#if defined(CONFIG_SMP)
2527
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2528
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2529
- ttwu_queue_remote(p, cpu, wake_flags);
2920
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
25302921 return;
2531
- }
2532
-#endif
25332922
25342923 rq_lock(rq, &rf);
25352924 update_rq_clock(rq);
....@@ -2585,8 +2974,8 @@
25852974 * migration. However the means are completely different as there is no lock
25862975 * chain to provide order. Instead we do:
25872976 *
2588
- * 1) smp_store_release(X->on_cpu, 0)
2589
- * 2) smp_cond_load_acquire(!X->on_cpu)
2977
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2978
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25902979 *
25912980 * Example:
25922981 *
....@@ -2625,64 +3014,95 @@
26253014 * @p: the thread to be awakened
26263015 * @state: the mask of task states that can be woken
26273016 * @wake_flags: wake modifier flags (WF_*)
2628
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2629
- * in this event.
26303017 *
2631
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3018
+ * Conceptually does:
3019
+ *
3020
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
26323021 *
26333022 * If the task was not queued/runnable, also place it back on a runqueue.
26343023 *
2635
- * Atomic against schedule() which would dequeue a task, also see
2636
- * set_current_state().
3024
+ * This function is atomic against schedule() which would dequeue the task.
26373025 *
2638
- * This function executes a full memory barrier before accessing the task
2639
- * state; see set_current_state().
3026
+ * It issues a full memory barrier before accessing @p->state, see the comment
3027
+ * with set_current_state().
3028
+ *
3029
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3030
+ *
3031
+ * Relies on p->pi_lock stabilizing:
3032
+ * - p->sched_class
3033
+ * - p->cpus_ptr
3034
+ * - p->sched_task_group
3035
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3036
+ *
3037
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3038
+ * Takes rq->lock in:
3039
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3040
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3041
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3042
+ *
3043
+ * As a consequence we race really badly with just about everything. See the
3044
+ * many memory barriers and their comments for details.
26403045 *
26413046 * Return: %true if @p->state changes (an actual wakeup was done),
26423047 * %false otherwise.
26433048 */
26443049 static int
2645
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2646
- int sibling_count_hint)
3050
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26473051 {
26483052 unsigned long flags;
26493053 int cpu, success = 0;
26503054
2651
- /*
2652
- * If we are going to wake up a thread waiting for CONDITION we
2653
- * need to ensure that CONDITION=1 done by the caller can not be
2654
- * reordered with p->state check below. This pairs with mb() in
2655
- * set_current_state() the waiting thread does.
2656
- */
2657
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2658
- smp_mb__after_spinlock();
2659
- if (!(p->state & state)) {
3055
+ preempt_disable();
3056
+ if (p == current) {
26603057 /*
2661
- * The task might be running due to a spinlock sleeper
2662
- * wakeup. Check the saved state and set it to running
2663
- * if the wakeup condition is true.
3058
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3059
+ * == smp_processor_id()'. Together this means we can special
3060
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3061
+ * without taking any locks.
3062
+ *
3063
+ * In particular:
3064
+ * - we rely on Program-Order guarantees for all the ordering,
3065
+ * - we're serialized against set_special_state() by virtue of
3066
+ * it disabling IRQs (this allows not taking ->pi_lock).
26643067 */
2665
- if (!(wake_flags & WF_LOCK_SLEEPER)) {
2666
- if (p->saved_state & state) {
2667
- p->saved_state = TASK_RUNNING;
2668
- success = 1;
2669
- }
2670
- }
3068
+ if (!(p->state & state))
3069
+ goto out;
3070
+
3071
+ success = 1;
3072
+ trace_sched_waking(p);
3073
+ p->state = TASK_RUNNING;
3074
+ trace_sched_wakeup(p);
26713075 goto out;
26723076 }
26733077
26743078 /*
2675
- * If this is a regular wakeup, then we can unconditionally
2676
- * clear the saved state of a "lock sleeper".
3079
+ * If we are going to wake up a thread waiting for CONDITION we
3080
+ * need to ensure that CONDITION=1 done by the caller can not be
3081
+ * reordered with p->state check below. This pairs with smp_store_mb()
3082
+ * in set_current_state() that the waiting thread does.
26773083 */
2678
- if (!(wake_flags & WF_LOCK_SLEEPER))
2679
- p->saved_state = TASK_RUNNING;
3084
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3085
+ smp_mb__after_spinlock();
3086
+ if (!(p->state & state))
3087
+ goto unlock;
3088
+
3089
+#ifdef CONFIG_FREEZER
3090
+ /*
3091
+ * If we're going to wake up a thread which may be frozen, then
3092
+ * we can only do so if we have an active CPU which is capable of
3093
+ * running it. This may not be the case when resuming from suspend,
3094
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3095
+ * for the actual wakeup.
3096
+ */
3097
+ if (unlikely(frozen_or_skipped(p)) &&
3098
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3099
+ goto unlock;
3100
+#endif
26803101
26813102 trace_sched_waking(p);
26823103
26833104 /* We're going to change ->state: */
26843105 success = 1;
2685
- cpu = task_cpu(p);
26863106
26873107 /*
26883108 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2703,10 +3123,15 @@
27033123 *
27043124 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27053125 * __schedule(). See the comment for smp_mb__after_spinlock().
3126
+ *
3127
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
27063128 */
27073129 smp_rmb();
2708
- if (p->on_rq && ttwu_remote(p, wake_flags))
2709
- goto stat;
3130
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3131
+ goto unlock;
3132
+
3133
+ if (p->state & TASK_UNINTERRUPTIBLE)
3134
+ trace_sched_blocked_reason(p);
27103135
27113136 #ifdef CONFIG_SMP
27123137 /*
....@@ -2727,8 +3152,43 @@
27273152 *
27283153 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27293154 * __schedule(). See the comment for smp_mb__after_spinlock().
3155
+ *
3156
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3157
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3158
+ * care about its own p->state. See the comment in __schedule().
27303159 */
2731
- smp_rmb();
3160
+ smp_acquire__after_ctrl_dep();
3161
+
3162
+ /*
3163
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3164
+ * == 0), which means we need to do an enqueue, change p->state to
3165
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3166
+ * enqueue, such as ttwu_queue_wakelist().
3167
+ */
3168
+ p->state = TASK_WAKING;
3169
+
3170
+ /*
3171
+ * If the owning (remote) CPU is still in the middle of schedule() with
3172
+ * this task as prev, consider queueing p on the remote CPU's wake_list
3173
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3174
+ * let the waker make forward progress. This is safe because IRQs are
3175
+ * disabled and the IPI will deliver after on_cpu is cleared.
3176
+ *
3177
+ * Ensure we load task_cpu(p) after p->on_cpu:
3178
+ *
3179
+ * set_task_cpu(p, cpu);
3180
+ * STORE p->cpu = @cpu
3181
+ * __schedule() (switch to task 'p')
3182
+ * LOCK rq->lock
3183
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3184
+ * STORE p->on_cpu = 1 LOAD p->cpu
3185
+ *
3186
+ * to ensure we observe the correct CPU on which the task is currently
3187
+ * scheduling.
3188
+ */
3189
+ if (smp_load_acquire(&p->on_cpu) &&
3190
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3191
+ goto unlock;
27323192
27333193 /*
27343194 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2741,38 +3201,79 @@
27413201 */
27423202 smp_cond_load_acquire(&p->on_cpu, !VAL);
27433203
2744
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2745
- p->state = TASK_WAKING;
3204
+ trace_android_rvh_try_to_wake_up(p);
27463205
2747
- if (p->in_iowait) {
2748
- delayacct_blkio_end(p);
2749
- atomic_dec(&task_rq(p)->nr_iowait);
2750
- }
2751
-
2752
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2753
- sibling_count_hint);
3206
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
27543207 if (task_cpu(p) != cpu) {
3208
+ if (p->in_iowait) {
3209
+ delayacct_blkio_end(p);
3210
+ atomic_dec(&task_rq(p)->nr_iowait);
3211
+ }
3212
+
27553213 wake_flags |= WF_MIGRATED;
27563214 psi_ttwu_dequeue(p);
27573215 set_task_cpu(p, cpu);
27583216 }
2759
-
2760
-#else /* CONFIG_SMP */
2761
-
2762
- if (p->in_iowait) {
2763
- delayacct_blkio_end(p);
2764
- atomic_dec(&task_rq(p)->nr_iowait);
2765
- }
2766
-
3217
+#else
3218
+ cpu = task_cpu(p);
27673219 #endif /* CONFIG_SMP */
27683220
27693221 ttwu_queue(p, cpu, wake_flags);
2770
-stat:
2771
- ttwu_stat(p, cpu, wake_flags);
2772
-out:
3222
+unlock:
27733223 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3224
+out:
3225
+ if (success) {
3226
+ trace_android_rvh_try_to_wake_up_success(p);
3227
+ ttwu_stat(p, task_cpu(p), wake_flags);
3228
+ }
3229
+ preempt_enable();
27743230
27753231 return success;
3232
+}
3233
+
3234
+/**
3235
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3236
+ * @p: Process for which the function is to be invoked, can be @current.
3237
+ * @func: Function to invoke.
3238
+ * @arg: Argument to function.
3239
+ *
3240
+ * If the specified task can be quickly locked into a definite state
3241
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3242
+ * state while invoking @func(@arg). This function can use ->on_rq and
3243
+ * task_curr() to work out what the state is, if required. Given that
3244
+ * @func can be invoked with a runqueue lock held, it had better be quite
3245
+ * lightweight.
3246
+ *
3247
+ * Returns:
3248
+ * @false if the task slipped out from under the locks.
3249
+ * @true if the task was locked onto a runqueue or is sleeping.
3250
+ * However, @func can override this by returning @false.
3251
+ */
3252
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3253
+{
3254
+ struct rq_flags rf;
3255
+ bool ret = false;
3256
+ struct rq *rq;
3257
+
3258
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3259
+ if (p->on_rq) {
3260
+ rq = __task_rq_lock(p, &rf);
3261
+ if (task_rq(p) == rq)
3262
+ ret = func(p, arg);
3263
+ rq_unlock(rq, &rf);
3264
+ } else {
3265
+ switch (p->state) {
3266
+ case TASK_RUNNING:
3267
+ case TASK_WAKING:
3268
+ break;
3269
+ default:
3270
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3271
+ if (!p->on_rq)
3272
+ ret = func(p, arg);
3273
+ }
3274
+ }
3275
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3276
+ return ret;
27763277 }
27773278
27783279 /**
....@@ -2788,25 +3289,13 @@
27883289 */
27893290 int wake_up_process(struct task_struct *p)
27903291 {
2791
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3292
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27923293 }
27933294 EXPORT_SYMBOL(wake_up_process);
27943295
2795
-/**
2796
- * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
2797
- * @p: The process to be woken up.
2798
- *
2799
- * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
2800
- * the nature of the wakeup.
2801
- */
2802
-int wake_up_lock_sleeper(struct task_struct *p)
2803
-{
2804
- return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
2805
-}
2806
-
28073296 int wake_up_state(struct task_struct *p, unsigned int state)
28083297 {
2809
- return try_to_wake_up(p, state, 0, 1);
3298
+ return try_to_wake_up(p, state, 0);
28103299 }
28113300
28123301 /*
....@@ -2831,6 +3320,8 @@
28313320 p->se.cfs_rq = NULL;
28323321 #endif
28333322
3323
+ trace_android_rvh_sched_fork_init(p);
3324
+
28343325 #ifdef CONFIG_SCHEDSTATS
28353326 /* Even if schedstat is disabled, there should not be garbage */
28363327 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2851,7 +3342,13 @@
28513342 INIT_HLIST_HEAD(&p->preempt_notifiers);
28523343 #endif
28533344
3345
+#ifdef CONFIG_COMPACTION
3346
+ p->capture_control = NULL;
3347
+#endif
28543348 init_numa_balancing(clone_flags, p);
3349
+#ifdef CONFIG_SMP
3350
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3351
+#endif
28553352 }
28563353
28573354 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2868,7 +3365,7 @@
28683365
28693366 #ifdef CONFIG_PROC_SYSCTL
28703367 int sysctl_numa_balancing(struct ctl_table *table, int write,
2871
- void __user *buffer, size_t *lenp, loff_t *ppos)
3368
+ void *buffer, size_t *lenp, loff_t *ppos)
28723369 {
28733370 struct ctl_table t;
28743371 int err;
....@@ -2942,8 +3439,8 @@
29423439 }
29433440
29443441 #ifdef CONFIG_PROC_SYSCTL
2945
-int sysctl_schedstats(struct ctl_table *table, int write,
2946
- void __user *buffer, size_t *lenp, loff_t *ppos)
3442
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3443
+ size_t *lenp, loff_t *ppos)
29473444 {
29483445 struct ctl_table t;
29493446 int err;
....@@ -2971,7 +3468,7 @@
29713468 */
29723469 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29733470 {
2974
- unsigned long flags;
3471
+ trace_android_rvh_sched_fork(p);
29753472
29763473 __sched_fork(clone_flags, p);
29773474 /*
....@@ -2985,6 +3482,7 @@
29853482 * Make sure we do not leak PI boosting priority to the child.
29863483 */
29873484 p->prio = current->normal_prio;
3485
+ trace_android_rvh_prepare_prio_fork(p);
29883486
29893487 uclamp_fork(p);
29903488
....@@ -2999,8 +3497,8 @@
29993497 } else if (PRIO_TO_NICE(p->static_prio) < 0)
30003498 p->static_prio = NICE_TO_PRIO(0);
30013499
3002
- p->prio = p->normal_prio = __normal_prio(p);
3003
- set_load_weight(p, false);
3500
+ p->prio = p->normal_prio = p->static_prio;
3501
+ set_load_weight(p);
30043502
30053503 /*
30063504 * We don't need the reset flag anymore after the fork. It has
....@@ -3017,24 +3515,8 @@
30173515 p->sched_class = &fair_sched_class;
30183516
30193517 init_entity_runnable_average(&p->se);
3518
+ trace_android_rvh_finish_prio_fork(p);
30203519
3021
- /*
3022
- * The child is not yet in the pid-hash so no cgroup attach races,
3023
- * and the cgroup is pinned to this child due to cgroup_fork()
3024
- * is ran before sched_fork().
3025
- *
3026
- * Silence PROVE_RCU.
3027
- */
3028
- raw_spin_lock_irqsave(&p->pi_lock, flags);
3029
- rseq_migrate(p);
3030
- /*
3031
- * We're setting the CPU for the first time, we don't migrate,
3032
- * so use __set_task_cpu().
3033
- */
3034
- __set_task_cpu(p, smp_processor_id());
3035
- if (p->sched_class->task_fork)
3036
- p->sched_class->task_fork(p);
3037
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
30383520
30393521 #ifdef CONFIG_SCHED_INFO
30403522 if (likely(sched_info_on()))
....@@ -3044,14 +3526,46 @@
30443526 p->on_cpu = 0;
30453527 #endif
30463528 init_task_preempt_count(p);
3047
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
3048
- task_thread_info(p)->preempt_lazy_count = 0;
3049
-#endif
30503529 #ifdef CONFIG_SMP
30513530 plist_node_init(&p->pushable_tasks, MAX_PRIO);
30523531 RB_CLEAR_NODE(&p->pushable_dl_tasks);
30533532 #endif
30543533 return 0;
3534
+}
3535
+
3536
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3537
+{
3538
+ unsigned long flags;
3539
+
3540
+ /*
3541
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3542
+ * required yet, but lockdep gets upset if rules are violated.
3543
+ */
3544
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3545
+#ifdef CONFIG_CGROUP_SCHED
3546
+ if (1) {
3547
+ struct task_group *tg;
3548
+
3549
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
3550
+ struct task_group, css);
3551
+ tg = autogroup_task_group(p, tg);
3552
+ p->sched_task_group = tg;
3553
+ }
3554
+#endif
3555
+ rseq_migrate(p);
3556
+ /*
3557
+ * We're setting the CPU for the first time, we don't migrate,
3558
+ * so use __set_task_cpu().
3559
+ */
3560
+ __set_task_cpu(p, smp_processor_id());
3561
+ if (p->sched_class->task_fork)
3562
+ p->sched_class->task_fork(p);
3563
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3564
+}
3565
+
3566
+void sched_post_fork(struct task_struct *p)
3567
+{
3568
+ uclamp_post_fork(p);
30553569 }
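/*
 * Editor's note (not part of this patch): with sched_cgroup_fork() and
 * sched_post_fork() split out above, the fork path is expected to call
 * these hooks roughly in the following order (sketch; the exact call
 * sites live in kernel/fork.c and may differ in detail):
 *
 *	sched_fork(clone_flags, p);	// prio/class setup, __sched_fork()
 *	sched_cgroup_fork(p, kargs);	// pin task_group, set first CPU
 *	sched_post_fork(p);		// uclamp initialization
 *	...
 *	wake_up_new_task(p);		// first activation of the child
 */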
30563570
30573571 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3082,6 +3596,8 @@
30823596 struct rq_flags rf;
30833597 struct rq *rq;
30843598
3599
+ trace_android_rvh_wake_up_new_task(p);
3600
+
30853601 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30863602 p->state = TASK_RUNNING;
30873603 #ifdef CONFIG_SMP
....@@ -3095,14 +3611,14 @@
30953611 */
30963612 p->recent_used_cpu = task_cpu(p);
30973613 rseq_migrate(p);
3098
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
3614
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30993615 #endif
31003616 rq = __task_rq_lock(p, &rf);
31013617 update_rq_clock(rq);
3102
- post_init_entity_util_avg(&p->se);
3618
+ post_init_entity_util_avg(p);
3619
+ trace_android_rvh_new_task_stats(p);
31033620
31043621 activate_task(rq, p, ENQUEUE_NOCLOCK);
3105
- p->on_rq = TASK_ON_RQ_QUEUED;
31063622 trace_sched_wakeup_new(p);
31073623 check_preempt_curr(rq, p, WF_FORK);
31083624 #ifdef CONFIG_SMP
....@@ -3212,8 +3728,10 @@
32123728 /*
32133729 * Claim the task as running, we do this before switching to it
32143730 * such that any running task will have this set.
3731
+ *
3732
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
32153733 */
3216
- next->on_cpu = 1;
3734
+ WRITE_ONCE(next->on_cpu, 1);
32173735 #endif
32183736 }
32193737
....@@ -3221,8 +3739,9 @@
32213739 {
32223740 #ifdef CONFIG_SMP
32233741 /*
3224
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3225
- * We must ensure this doesn't happen until the switch is completely
3742
+ * This must be the very last reference to @prev from this CPU. After
3743
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3744
+ * must ensure this doesn't happen until the switch is completely
32263745 * finished.
32273746 *
32283747 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3244,7 +3763,7 @@
32443763 * do an early lockdep release here:
32453764 */
32463765 rq_unpin_lock(rq, rf);
3247
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3766
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
32483767 #ifdef CONFIG_DEBUG_SPINLOCK
32493768 /* this is a valid case when another task releases the spinlock */
32503769 rq->lock.owner = next;
....@@ -3376,19 +3895,25 @@
33763895 * provided by mmdrop(),
33773896 * - a sync_core for SYNC_CORE.
33783897 */
3379
- /*
3380
- * We use mmdrop_delayed() here so we don't have to do the
3381
- * full __mmdrop() when we are the last user.
3382
- */
33833898 if (mm) {
33843899 membarrier_mm_sync_core_before_usermode(mm);
3385
- mmdrop_delayed(mm);
3900
+ mmdrop(mm);
33863901 }
33873902 if (unlikely(prev_state == TASK_DEAD)) {
33883903 if (prev->sched_class->task_dead)
33893904 prev->sched_class->task_dead(prev);
33903905
3391
- put_task_struct(prev);
3906
+ /*
3907
+ * Remove function-return probe instances associated with this
3908
+ * task and put them back on the free list.
3909
+ */
3910
+ kprobe_flush_task(prev);
3911
+ trace_android_rvh_flush_task(prev);
3912
+
3913
+ /* Task is done with its stack. */
3914
+ put_task_stack(prev);
3915
+
3916
+ put_task_struct_rcu_user(prev);
33923917 }
33933918
33943919 tick_nohz_task_switch();
....@@ -3467,12 +3992,8 @@
34673992 context_switch(struct rq *rq, struct task_struct *prev,
34683993 struct task_struct *next, struct rq_flags *rf)
34693994 {
3470
- struct mm_struct *mm, *oldmm;
3471
-
34723995 prepare_task_switch(rq, prev, next);
34733996
3474
- mm = next->mm;
3475
- oldmm = prev->active_mm;
34763997 /*
34773998 * For paravirt, this is coupled with an exit in switch_to to
34783999 * combine the page table reload and the switch backend into
....@@ -3481,22 +4002,37 @@
34814002 arch_start_context_switch(prev);
34824003
34834004 /*
3484
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3485
- * NULL, we will pass through mmdrop() in finish_task_switch().
3486
- * Both of these contain the full memory barrier required by
3487
- * membarrier after storing to rq->curr, before returning to
3488
- * user-space.
4005
+ * kernel -> kernel lazy + transfer active
4006
+ * user -> kernel lazy + mmgrab() active
4007
+ *
4008
+ * kernel -> user switch + mmdrop() active
4009
+ * user -> user switch
34894010 */
3490
- if (!mm) {
3491
- next->active_mm = oldmm;
3492
- mmgrab(oldmm);
3493
- enter_lazy_tlb(oldmm, next);
3494
- } else
3495
- switch_mm_irqs_off(oldmm, mm, next);
4011
+ if (!next->mm) { // to kernel
4012
+ enter_lazy_tlb(prev->active_mm, next);
34964013
3497
- if (!prev->mm) {
3498
- prev->active_mm = NULL;
3499
- rq->prev_mm = oldmm;
4014
+ next->active_mm = prev->active_mm;
4015
+ if (prev->mm) // from user
4016
+ mmgrab(prev->active_mm);
4017
+ else
4018
+ prev->active_mm = NULL;
4019
+ } else { // to user
4020
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4021
+ /*
4022
+ * sys_membarrier() requires an smp_mb() between setting
4023
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4024
+ *
4025
+ * The below provides this either through switch_mm(), or in
4026
+ * case 'prev->active_mm == next->mm' through
4027
+ * finish_task_switch()'s mmdrop().
4028
+ */
4029
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4030
+
4031
+ if (!prev->mm) { // from kernel
4032
+ /* will mmdrop() in finish_task_switch(). */
4033
+ rq->prev_mm = prev->active_mm;
4034
+ prev->active_mm = NULL;
4035
+ }
35004036 }
35014037
35024038 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
....@@ -3533,7 +4069,7 @@
35334069 * preemption, thus the result might have a time-of-check-to-time-of-use
35344070 * race. The caller is responsible to use it correctly, for example:
35354071 *
3536
- * - from a non-preemptable section (of course)
4072
+ * - from a non-preemptible section (of course)
35374073 *
35384074 * - from a thread that is bound to a single CPU
35394075 *
....@@ -3554,6 +4090,18 @@
35544090 sum += cpu_rq(i)->nr_switches;
35554091
35564092 return sum;
4093
+}
4094
+
4095
+/*
4096
+ * Consumers of these two interfaces, like for example the cpuidle menu
4097
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4098
+ * for a CPU that has IO-wait which might not even end up running the task when
4099
+ * it does become runnable.
4100
+ */
4101
+
4102
+unsigned long nr_iowait_cpu(int cpu)
4103
+{
4104
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
35574105 }
35584106
35594107 /*
....@@ -3591,29 +4139,9 @@
35914139 unsigned long i, sum = 0;
35924140
35934141 for_each_possible_cpu(i)
3594
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4142
+ sum += nr_iowait_cpu(i);
35954143
35964144 return sum;
3597
-}
3598
-
3599
-/*
3600
- * Consumers of these two interfaces, like for example the cpufreq menu
3601
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3602
- * IO-wait which might not even end up running the task when it does become
3603
- * runnable.
3604
- */
3605
-
3606
-unsigned long nr_iowait_cpu(int cpu)
3607
-{
3608
- struct rq *this = cpu_rq(cpu);
3609
- return atomic_read(&this->nr_iowait);
3610
-}
3611
-
3612
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3613
-{
3614
- struct rq *rq = this_rq();
3615
- *nr_waiters = atomic_read(&rq->nr_iowait);
3616
- *load = rq->load.weight;
36174145 }
36184146
36194147 #ifdef CONFIG_SMP
....@@ -3627,9 +4155,14 @@
36274155 struct task_struct *p = current;
36284156 unsigned long flags;
36294157 int dest_cpu;
4158
+ bool cond = false;
4159
+
4160
+ trace_android_rvh_sched_exec(&cond);
4161
+ if (cond)
4162
+ return;
36304163
36314164 raw_spin_lock_irqsave(&p->pi_lock, flags);
3632
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4165
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
36334166 if (dest_cpu == smp_processor_id())
36344167 goto unlock;
36354168
....@@ -3712,6 +4245,7 @@
37124245
37134246 return ns;
37144247 }
4248
+EXPORT_SYMBOL_GPL(task_sched_runtime);
37154249
37164250 /*
37174251 * This function gets called by the timer code, with HZ frequency.
....@@ -3723,14 +4257,18 @@
37234257 struct rq *rq = cpu_rq(cpu);
37244258 struct task_struct *curr = rq->curr;
37254259 struct rq_flags rf;
4260
+ unsigned long thermal_pressure;
37264261
4262
+ arch_scale_freq_tick();
37274263 sched_clock_tick();
37284264
37294265 rq_lock(rq, &rf);
37304266
4267
+ trace_android_rvh_tick_entry(rq);
37314268 update_rq_clock(rq);
4269
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4270
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
37324271 curr->sched_class->task_tick(rq, curr, 0);
3733
- cpu_load_update_active(rq);
37344272 calc_global_load_tick(rq);
37354273 psi_task_tick(rq);
37364274
....@@ -3742,6 +4280,8 @@
37424280 rq->idle_balance = idle_cpu(cpu);
37434281 trigger_load_balance(rq);
37444282 #endif
4283
+
4284
+ trace_android_vh_scheduler_tick(rq);
37454285 }
37464286
37474287 #ifdef CONFIG_NO_HZ_FULL
....@@ -3799,28 +4339,31 @@
37994339 * statistics and checks timeslices in a time-independent way, regardless
38004340 * of when exactly it is running.
38014341 */
3802
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4342
+ if (!tick_nohz_tick_stopped_cpu(cpu))
38034343 goto out_requeue;
38044344
38054345 rq_lock_irq(rq, &rf);
38064346 curr = rq->curr;
3807
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4347
+ if (cpu_is_offline(cpu))
38084348 goto out_unlock;
38094349
38104350 update_rq_clock(rq);
3811
- delta = rq_clock_task(rq) - curr->se.exec_start;
38124351
3813
- /*
3814
- * Make sure the next tick runs within a reasonable
3815
- * amount of time.
3816
- */
3817
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4352
+ if (!is_idle_task(curr)) {
4353
+ /*
4354
+ * Make sure the next tick runs within a reasonable
4355
+ * amount of time.
4356
+ */
4357
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4358
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4359
+ }
38184360 curr->sched_class->task_tick(rq, curr, 0);
38194361
4362
+ calc_load_nohz_remote(rq);
38204363 out_unlock:
38214364 rq_unlock_irq(rq, &rf);
3822
-
38234365 out_requeue:
4366
+
38244367 /*
38254368 * Run the remote tick once per second (1Hz). This arbitrary
38264369 * frequency is large enough to avoid overload but short enough
....@@ -3884,7 +4427,7 @@
38844427 static inline void sched_tick_stop(int cpu) { }
38854428 #endif
38864429
3887
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4430
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38884431 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38894432 /*
38904433 * If the value passed in is equal to the current preempt count
....@@ -3990,11 +4533,12 @@
39904533 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39914534 && in_atomic_preempt_off()) {
39924535 pr_err("Preemption disabled at:");
3993
- print_ip_sym(preempt_disable_ip);
3994
- pr_cont("\n");
4536
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39954537 }
39964538 if (panic_on_warn)
39974539 panic("scheduling while atomic\n");
4540
+
4541
+ trace_android_rvh_schedule_bug(prev);
39984542
39994543 dump_stack();
40004544 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -4003,11 +4547,23 @@
40034547 /*
40044548 * Various schedule()-time debugging checks and statistics:
40054549 */
4006
-static inline void schedule_debug(struct task_struct *prev)
4550
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
40074551 {
40084552 #ifdef CONFIG_SCHED_STACK_END_CHECK
40094553 if (task_stack_end_corrupted(prev))
40104554 panic("corrupted stack end detected inside scheduler\n");
4555
+
4556
+ if (task_scs_end_corrupted(prev))
4557
+ panic("corrupted shadow stack detected inside scheduler\n");
4558
+#endif
4559
+
4560
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4561
+ if (!preempt && prev->state && prev->non_block_count) {
4562
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4563
+ prev->comm, prev->pid, prev->non_block_count);
4564
+ dump_stack();
4565
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4566
+ }
40114567 #endif
40124568
40134569 if (unlikely(in_atomic_preempt_off())) {
....@@ -4019,6 +4575,28 @@
40194575 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
40204576
40214577 schedstat_inc(this_rq()->sched_count);
4578
+}
4579
+
4580
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4581
+ struct rq_flags *rf)
4582
+{
4583
+#ifdef CONFIG_SMP
4584
+ const struct sched_class *class;
4585
+ /*
4586
+ * We must do the balancing pass before put_prev_task(), such
4587
+ * that when we release the rq->lock the task is in the same
4588
+ * state as before we took rq->lock.
4589
+ *
4590
+ * We can terminate the balance pass as soon as we know there is
4591
+ * a runnable task of @class priority or higher.
4592
+ */
4593
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
4594
+ if (class->balance(rq, prev, rf))
4595
+ break;
4596
+ }
4597
+#endif
4598
+
4599
+ put_prev_task(rq, prev);
40224600 }
40234601
40244602 /*
....@@ -4036,36 +4614,34 @@
40364614 * higher scheduling class, because otherwise those lose the
40374615 * opportunity to pull in more work from other CPUs.
40384616 */
4039
- if (likely((prev->sched_class == &idle_sched_class ||
4040
- prev->sched_class == &fair_sched_class) &&
4617
+ if (likely(prev->sched_class <= &fair_sched_class &&
40414618 rq->nr_running == rq->cfs.h_nr_running)) {
40424619
4043
- p = fair_sched_class.pick_next_task(rq, prev, rf);
4620
+ p = pick_next_task_fair(rq, prev, rf);
40444621 if (unlikely(p == RETRY_TASK))
4045
- goto again;
4622
+ goto restart;
40464623
40474624 /* Assumes fair_sched_class->next == idle_sched_class */
4048
- if (unlikely(!p))
4049
- p = idle_sched_class.pick_next_task(rq, prev, rf);
4625
+ if (!p) {
4626
+ put_prev_task(rq, prev);
4627
+ p = pick_next_task_idle(rq);
4628
+ }
40504629
40514630 return p;
40524631 }
40534632
4054
-again:
4633
+restart:
4634
+ put_prev_task_balance(rq, prev, rf);
4635
+
40554636 for_each_class(class) {
4056
- p = class->pick_next_task(rq, prev, rf);
4057
- if (p) {
4058
- if (unlikely(p == RETRY_TASK))
4059
- goto again;
4637
+ p = class->pick_next_task(rq);
4638
+ if (p)
40604639 return p;
4061
- }
40624640 }
40634641
40644642 /* The idle class should always have a runnable task: */
40654643 BUG();
40664644 }
4067
-
4068
-static void migrate_disabled_sched(struct task_struct *p);
40694645
40704646 /*
40714647 * __schedule() is the main scheduler function.
....@@ -4087,7 +4663,7 @@
40874663 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40884664 * called on the nearest possible occasion:
40894665 *
4090
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
4666
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40914667 *
40924668 * - in syscall or exception context, at the next outmost
40934669 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4096,7 +4672,7 @@
40964672 * - in IRQ context, return from interrupt-handler to
40974673 * preemptible context
40984674 *
4099
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4675
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
41004676 * then at the next:
41014677 *
41024678 * - cond_resched() call
....@@ -4110,6 +4686,7 @@
41104686 {
41114687 struct task_struct *prev, *next;
41124688 unsigned long *switch_count;
4689
+ unsigned long prev_state;
41134690 struct rq_flags rf;
41144691 struct rq *rq;
41154692 int cpu;
....@@ -4118,7 +4695,7 @@
41184695 rq = cpu_rq(cpu);
41194696 prev = rq->curr;
41204697
4121
- schedule_debug(prev);
4698
+ schedule_debug(prev, preempt);
41224699
41234700 if (sched_feat(HRTICK))
41244701 hrtick_clear(rq);
....@@ -4129,28 +4706,59 @@
41294706 /*
41304707 * Make sure that signal_pending_state()->signal_pending() below
41314708 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4132
- * done by the caller to avoid the race with signal_wake_up().
4709
+ * done by the caller to avoid the race with signal_wake_up():
41334710 *
4134
- * The membarrier system call requires a full memory barrier
4711
+ * __set_current_state(@state) signal_wake_up()
4712
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4713
+ * wake_up_state(p, state)
4714
+ * LOCK rq->lock LOCK p->pi_lock
4715
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
4716
+ * if (signal_pending_state()) if (p->state & @state)
4717
+ *
4718
+ * Also, the membarrier system call requires a full memory barrier
41354719 * after coming from user-space, before storing to rq->curr.
41364720 */
41374721 rq_lock(rq, &rf);
41384722 smp_mb__after_spinlock();
4139
-
4140
- if (__migrate_disabled(prev))
4141
- migrate_disabled_sched(prev);
41424723
41434724 /* Promote REQ to ACT */
41444725 rq->clock_update_flags <<= 1;
41454726 update_rq_clock(rq);
41464727
41474728 switch_count = &prev->nivcsw;
4148
- if (!preempt && prev->state) {
4149
- if (unlikely(signal_pending_state(prev->state, prev))) {
4729
+
4730
+ /*
4731
+ * We must load prev->state once (task_struct::state is volatile), such
4732
+ * that:
4733
+ *
4734
+ * - we form a control dependency vs deactivate_task() below.
4735
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4736
+ */
4737
+ prev_state = prev->state;
4738
+ if (!preempt && prev_state) {
4739
+ if (signal_pending_state(prev_state, prev)) {
41504740 prev->state = TASK_RUNNING;
41514741 } else {
4742
+ prev->sched_contributes_to_load =
4743
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
4744
+ !(prev_state & TASK_NOLOAD) &&
4745
+ !(prev->flags & PF_FROZEN);
4746
+
4747
+ if (prev->sched_contributes_to_load)
4748
+ rq->nr_uninterruptible++;
4749
+
4750
+ /*
4751
+ * __schedule() ttwu()
4752
+ * prev_state = prev->state; if (p->on_rq && ...)
4753
+ * if (prev_state) goto out;
4754
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4755
+ * p->state = TASK_WAKING
4756
+ *
4757
+ * Where __schedule() and ttwu() have matching control dependencies.
4758
+ *
4759
+ * After this, schedule() must not care about p->state any more.
4760
+ */
41524761 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4153
- prev->on_rq = 0;
41544762
41554763 if (prev->in_iowait) {
41564764 atomic_inc(&rq->nr_iowait);
....@@ -4162,12 +4770,16 @@
41624770
41634771 next = pick_next_task(rq, prev, &rf);
41644772 clear_tsk_need_resched(prev);
4165
- clear_tsk_need_resched_lazy(prev);
41664773 clear_preempt_need_resched();
41674774
4775
+ trace_android_rvh_schedule(prev, next, rq);
41684776 if (likely(prev != next)) {
41694777 rq->nr_switches++;
4170
- rq->curr = next;
4778
+ /*
4779
+ * RCU users of rcu_dereference(rq->curr) may not see
4780
+ * changes to task_struct made by pick_next_task().
4781
+ */
4782
+ RCU_INIT_POINTER(rq->curr, next);
41714783 /*
41724784 * The membarrier system call requires each architecture
41734785 * to have a full memory barrier after updating
....@@ -4183,6 +4795,8 @@
41834795 * is a RELEASE barrier),
41844796 */
41854797 ++*switch_count;
4798
+
4799
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
41864800
41874801 trace_sched_switch(preempt, prev, next);
41884802
....@@ -4214,19 +4828,26 @@
42144828
42154829 static inline void sched_submit_work(struct task_struct *tsk)
42164830 {
4831
+ unsigned int task_flags;
4832
+
42174833 if (!tsk->state)
42184834 return;
42194835
4836
+ task_flags = tsk->flags;
42204837 /*
42214838 * If a worker went to sleep, notify and ask workqueue whether
42224839 * it wants to wake up a task to maintain concurrency.
42234840 * As this function is called inside the schedule() context,
42244841 * we disable preemption to avoid it calling schedule() again
4225
- * in the possible wakeup of a kworker.
4842
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
4843
+ * requires it.
42264844 */
4227
- if (tsk->flags & PF_WQ_WORKER) {
4845
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
42284846 preempt_disable();
4229
- wq_worker_sleeping(tsk);
4847
+ if (task_flags & PF_WQ_WORKER)
4848
+ wq_worker_sleeping(tsk);
4849
+ else
4850
+ io_wq_worker_sleeping(tsk);
42304851 preempt_enable_no_resched();
42314852 }
42324853
....@@ -4243,8 +4864,12 @@
42434864
42444865 static void sched_update_worker(struct task_struct *tsk)
42454866 {
4246
- if (tsk->flags & PF_WQ_WORKER)
4247
- wq_worker_running(tsk);
4867
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4868
+ if (tsk->flags & PF_WQ_WORKER)
4869
+ wq_worker_running(tsk);
4870
+ else
4871
+ io_wq_worker_running(tsk);
4872
+ }
42484873 }
42494874
42504875 asmlinkage __visible void __sched schedule(void)
....@@ -4346,35 +4971,10 @@
43464971 } while (need_resched());
43474972 }
43484973
4349
-#ifdef CONFIG_PREEMPT_LAZY
4974
+#ifdef CONFIG_PREEMPTION
43504975 /*
4351
- * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
4352
- * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
4353
- * preempt_lazy_count counter >0.
4354
- */
4355
-static __always_inline int preemptible_lazy(void)
4356
-{
4357
- if (test_thread_flag(TIF_NEED_RESCHED))
4358
- return 1;
4359
- if (current_thread_info()->preempt_lazy_count)
4360
- return 0;
4361
- return 1;
4362
-}
4363
-
4364
-#else
4365
-
4366
-static inline int preemptible_lazy(void)
4367
-{
4368
- return 1;
4369
-}
4370
-
4371
-#endif
4372
-
4373
-#ifdef CONFIG_PREEMPT
4374
-/*
4375
- * this is the entry point to schedule() from in-kernel preemption
4376
- * off of preempt_enable. Kernel preemptions off return from interrupt
4377
- * occur there and call schedule directly.
4976
+ * This is the entry point to schedule() from in-kernel preemption
4977
+ * off of preempt_enable.
43784978 */
43794979 asmlinkage __visible void __sched notrace preempt_schedule(void)
43804980 {
....@@ -4384,8 +4984,7 @@
43844984 */
43854985 if (likely(!preemptible()))
43864986 return;
4387
- if (!preemptible_lazy())
4388
- return;
4987
+
43894988 preempt_schedule_common();
43904989 }
43914990 NOKPROBE_SYMBOL(preempt_schedule);
....@@ -4410,9 +5009,6 @@
44105009 enum ctx_state prev_ctx;
44115010
44125011 if (likely(!preemptible()))
4413
- return;
4414
-
4415
- if (!preemptible_lazy())
44165012 return;
44175013
44185014 do {
....@@ -4446,10 +5042,10 @@
44465042 }
44475043 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
44485044
4449
-#endif /* CONFIG_PREEMPT */
5045
+#endif /* CONFIG_PREEMPTION */
44505046
44515047 /*
4452
- * this is the entry point to schedule() from kernel preemption
5048
+ * This is the entry point to schedule() from kernel preemption
44535049 * off of irq context.
44545050 * Note, that this is called and return with irqs disabled. This will
44555051 * protect us against recursive calling from irq.
....@@ -4477,9 +5073,22 @@
44775073 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
44785074 void *key)
44795075 {
4480
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5076
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5077
+ return try_to_wake_up(curr->private, mode, wake_flags);
44815078 }
44825079 EXPORT_SYMBOL(default_wake_function);
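/*
 * Illustrative sketch (editor's addition, not part of this patch):
 * default_wake_function() is what DECLARE_WAITQUEUE() installs as the
 * wait-queue entry's ->func, so a classic open-coded wait reaches
 * try_to_wake_up() through it. The helper below is hypothetical.
 */
static int wait_for_flag(wait_queue_head_t *wq, int *flag)
{
	DECLARE_WAITQUEUE(wait, current);	/* ->func = default_wake_function */

	add_wait_queue(wq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(*flag) || signal_pending(current))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(wq, &wait);

	return READ_ONCE(*flag) ? 0 : -ERESTARTSYS;
}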
5080
+
5081
+static void __setscheduler_prio(struct task_struct *p, int prio)
5082
+{
5083
+ if (dl_prio(prio))
5084
+ p->sched_class = &dl_sched_class;
5085
+ else if (rt_prio(prio))
5086
+ p->sched_class = &rt_sched_class;
5087
+ else
5088
+ p->sched_class = &fair_sched_class;
5089
+
5090
+ p->prio = prio;
5091
+}
44835092
44845093 #ifdef CONFIG_RT_MUTEXES
44855094
....@@ -4517,6 +5126,7 @@
45175126 struct rq_flags rf;
45185127 struct rq *rq;
45195128
5129
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
45205130 /* XXX used to be waiter->prio, not waiter->task->prio */
45215131 prio = __rt_effective_prio(pi_task, p->normal_prio);
45225132
....@@ -4591,31 +5201,29 @@
45915201 if (!dl_prio(p->normal_prio) ||
45925202 (pi_task && dl_prio(pi_task->prio) &&
45935203 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4594
- p->dl.dl_boosted = 1;
5204
+ p->dl.pi_se = pi_task->dl.pi_se;
45955205 queue_flag |= ENQUEUE_REPLENISH;
4596
- } else
4597
- p->dl.dl_boosted = 0;
4598
- p->sched_class = &dl_sched_class;
5206
+ } else {
5207
+ p->dl.pi_se = &p->dl;
5208
+ }
45995209 } else if (rt_prio(prio)) {
46005210 if (dl_prio(oldprio))
4601
- p->dl.dl_boosted = 0;
5211
+ p->dl.pi_se = &p->dl;
46025212 if (oldprio < prio)
46035213 queue_flag |= ENQUEUE_HEAD;
4604
- p->sched_class = &rt_sched_class;
46055214 } else {
46065215 if (dl_prio(oldprio))
4607
- p->dl.dl_boosted = 0;
5216
+ p->dl.pi_se = &p->dl;
46085217 if (rt_prio(oldprio))
46095218 p->rt.timeout = 0;
4610
- p->sched_class = &fair_sched_class;
46115219 }
46125220
4613
- p->prio = prio;
5221
+ __setscheduler_prio(p, prio);
46145222
46155223 if (queued)
46165224 enqueue_task(rq, p, queue_flag);
46175225 if (running)
4618
- set_curr_task(rq, p);
5226
+ set_next_task(rq, p);
46195227
46205228 check_class_changed(rq, p, prev_class, oldprio);
46215229 out_unlock:
....@@ -4635,12 +5243,13 @@
46355243
46365244 void set_user_nice(struct task_struct *p, long nice)
46375245 {
4638
- bool queued, running;
4639
- int old_prio, delta;
5246
+ bool queued, running, allowed = false;
5247
+ int old_prio;
46405248 struct rq_flags rf;
46415249 struct rq *rq;
46425250
4643
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5251
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5252
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
46445253 return;
46455254 /*
46465255 * We have to be careful, if called from sys_setpriority(),
....@@ -4667,22 +5276,21 @@
46675276 put_prev_task(rq, p);
46685277
46695278 p->static_prio = NICE_TO_PRIO(nice);
4670
- set_load_weight(p, true);
5279
+ set_load_weight(p);
46715280 old_prio = p->prio;
46725281 p->prio = effective_prio(p);
4673
- delta = p->prio - old_prio;
46745282
4675
- if (queued) {
5283
+ if (queued)
46765284 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4677
- /*
4678
- * If the task increased its priority or is running and
4679
- * lowered its priority, then reschedule its CPU:
4680
- */
4681
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4682
- resched_curr(rq);
4683
- }
46845285 if (running)
4685
- set_curr_task(rq, p);
5286
+ set_next_task(rq, p);
5287
+
5288
+ /*
5289
+ * If the task increased its priority or is running and
5290
+ * lowered its priority, then reschedule its CPU:
5291
+ */
5292
+ p->sched_class->prio_changed(rq, p, old_prio);
5293
+
46865294 out_unlock:
46875295 task_rq_unlock(rq, p, &rf);
46885296 }
....@@ -4767,7 +5375,7 @@
47675375 return 0;
47685376
47695377 #ifdef CONFIG_SMP
4770
- if (!llist_empty(&rq->wake_list))
5378
+ if (rq->ttwu_pending)
47715379 return 0;
47725380 #endif
47735381
....@@ -4790,6 +5398,7 @@
47905398
47915399 return 1;
47925400 }
5401
+EXPORT_SYMBOL_GPL(available_idle_cpu);
47935402
47945403 /**
47955404 * idle_task - return the idle task for a given CPU.
....@@ -4841,36 +5450,7 @@
48415450 */
48425451 p->rt_priority = attr->sched_priority;
48435452 p->normal_prio = normal_prio(p);
4844
- set_load_weight(p, true);
4845
-}
4846
-
4847
-/* Actually do priority change: must hold pi & rq lock. */
4848
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4849
- const struct sched_attr *attr, bool keep_boost)
4850
-{
4851
- /*
4852
- * If params can't change scheduling class changes aren't allowed
4853
- * either.
4854
- */
4855
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4856
- return;
4857
-
4858
- __setscheduler_params(p, attr);
4859
-
4860
- /*
4861
- * Keep a potential priority boosting if called from
4862
- * sched_setscheduler().
4863
- */
4864
- p->prio = normal_prio(p);
4865
- if (keep_boost)
4866
- p->prio = rt_effective_prio(p, p->prio);
4867
-
4868
- if (dl_prio(p->prio))
4869
- p->sched_class = &dl_sched_class;
4870
- else if (rt_prio(p->prio))
4871
- p->sched_class = &rt_sched_class;
4872
- else
4873
- p->sched_class = &fair_sched_class;
5453
+ set_load_weight(p);
48745454 }
48755455
48765456 /*
....@@ -4893,10 +5473,8 @@
48935473 const struct sched_attr *attr,
48945474 bool user, bool pi)
48955475 {
4896
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4897
- MAX_RT_PRIO - 1 - attr->sched_priority;
4898
- int retval, oldprio, oldpolicy = -1, queued, running;
4899
- int new_effective_prio, policy = attr->sched_policy;
5476
+ int oldpolicy = -1, policy = attr->sched_policy;
5477
+ int retval, oldprio, newprio, queued, running;
49005478 const struct sched_class *prev_class;
49015479 struct rq_flags rf;
49025480 int reset_on_fork;
....@@ -4969,7 +5547,7 @@
49695547 * Treat SCHED_IDLE as nice 20. Only allow a switch to
49705548 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
49715549 */
4972
- if (idle_policy(p->policy) && !idle_policy(policy)) {
5550
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
49735551 if (!can_nice(p, task_nice(p)))
49745552 return -EPERM;
49755553 }
....@@ -4980,6 +5558,10 @@
49805558
49815559 /* Normal users shall not reset the sched_reset_on_fork flag: */
49825560 if (p->sched_reset_on_fork && !reset_on_fork)
5561
+ return -EPERM;
5562
+
5563
+ /* Can't change util-clamps */
5564
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
49835565 return -EPERM;
49845566 }
49855567
....@@ -5013,8 +5595,8 @@
50135595 * Changing the policy of the stop threads its a very bad idea:
50145596 */
50155597 if (p == rq->stop) {
5016
- task_rq_unlock(rq, p, &rf);
5017
- return -EINVAL;
5598
+ retval = -EINVAL;
5599
+ goto unlock;
50185600 }
50195601
50205602 /*
....@@ -5032,8 +5614,8 @@
50325614 goto change;
50335615
50345616 p->sched_reset_on_fork = reset_on_fork;
5035
- task_rq_unlock(rq, p, &rf);
5036
- return 0;
5617
+ retval = 0;
5618
+ goto unlock;
50375619 }
50385620 change:
50395621
....@@ -5046,8 +5628,8 @@
50465628 if (rt_bandwidth_enabled() && rt_policy(policy) &&
50475629 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50485630 !task_group_is_autogroup(task_group(p))) {
5049
- task_rq_unlock(rq, p, &rf);
5050
- return -EPERM;
5631
+ retval = -EPERM;
5632
+ goto unlock;
50515633 }
50525634 #endif
50535635 #ifdef CONFIG_SMP
....@@ -5062,8 +5644,8 @@
50625644 */
50635645 if (!cpumask_subset(span, p->cpus_ptr) ||
50645646 rq->rd->dl_bw.bw == 0) {
5065
- task_rq_unlock(rq, p, &rf);
5066
- return -EPERM;
5647
+ retval = -EPERM;
5648
+ goto unlock;
50675649 }
50685650 }
50695651 #endif
....@@ -5082,13 +5664,14 @@
50825664 * is available.
50835665 */
50845666 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5085
- task_rq_unlock(rq, p, &rf);
5086
- return -EBUSY;
5667
+ retval = -EBUSY;
5668
+ goto unlock;
50875669 }
50885670
50895671 p->sched_reset_on_fork = reset_on_fork;
50905672 oldprio = p->prio;
50915673
5674
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
50925675 if (pi) {
50935676 /*
50945677 * Take priority boosted tasks into account. If the new
....@@ -5097,8 +5680,8 @@
50975680 * the runqueue. This will be done when the task deboost
50985681 * itself.
50995682 */
5100
- new_effective_prio = rt_effective_prio(p, newprio);
5101
- if (new_effective_prio == oldprio)
5683
+ newprio = rt_effective_prio(p, newprio);
5684
+ if (newprio == oldprio)
51025685 queue_flags &= ~DEQUEUE_MOVE;
51035686 }
51045687
....@@ -5111,7 +5694,11 @@
51115694
51125695 prev_class = p->sched_class;
51135696
5114
- __setscheduler(rq, p, attr, pi);
5697
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5698
+ __setscheduler_params(p, attr);
5699
+ __setscheduler_prio(p, newprio);
5700
+ trace_android_rvh_setscheduler(p);
5701
+ }
51155702 __setscheduler_uclamp(p, attr);
51165703
51175704 if (queued) {
....@@ -5125,7 +5712,7 @@
51255712 enqueue_task(rq, p, queue_flags);
51265713 }
51275714 if (running)
5128
- set_curr_task(rq, p);
5715
+ set_next_task(rq, p);
51295716
51305717 check_class_changed(rq, p, prev_class, oldprio);
51315718
....@@ -5141,6 +5728,10 @@
51415728 preempt_enable();
51425729
51435730 return 0;
5731
+
5732
+unlock:
5733
+ task_rq_unlock(rq, p, &rf);
5734
+ return retval;
51445735 }
51455736
51465737 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5152,6 +5743,14 @@
51525743 .sched_nice = PRIO_TO_NICE(p->static_prio),
51535744 };
51545745
5746
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
5747
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
5748
+ attr.sched_priority /= 2;
5749
+ if (!check)
5750
+ attr.sched_priority += MAX_RT_PRIO / 2;
5751
+ if (!attr.sched_priority)
5752
+ attr.sched_priority = 1;
5753
+ }
51555754 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
51565755 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
51575756 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
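
A quick check of the CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO remapping above, assuming MAX_RT_PRIO == 100: a checked request (the legacy sched_setscheduler() syscall path) for SCHED_FIFO priority 90 is halved to 45, so that path is squeezed into the lower band [1, 49]; a nocheck request (in-kernel callers of sched_setscheduler_nocheck(), such as sched_set_fifo() below) for priority 50 becomes 25 + 50 = 75, landing in the upper band [50, 99]; and a checked request for priority 1 would truncate to 0 and is bumped back up to 1. On this vendor tree, RT priorities set through the nocheck path therefore always outrank those set through the checked path.
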
....@@ -5166,6 +5765,8 @@
51665765 * @p: the task in question.
51675766 * @policy: new policy.
51685767 * @param: structure containing the new RT priority.
5768
+ *
5769
+ * Use sched_set_fifo(), read its comment.
51695770 *
51705771 * Return: 0 on success. An error code otherwise.
51715772 *
....@@ -5188,6 +5789,7 @@
51885789 {
51895790 return __sched_setscheduler(p, attr, false, true);
51905791 }
5792
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
51915793
51925794 /**
51935795 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5208,6 +5810,51 @@
52085810 return _sched_setscheduler(p, policy, param, false);
52095811 }
52105812 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
5813
+
5814
+/*
5815
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5816
+ * incapable of resource management, which is the one thing an OS really should
5817
+ * be doing.
5818
+ *
5819
+ * This is of course the reason it is limited to privileged users only.
5820
+ *
5821
+ * Worse still; it is fundamentally impossible to compose static priority
5822
+ * workloads. You cannot take two correctly working static prio workloads
5823
+ * and smash them together and still expect them to work.
5824
+ *
5825
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5826
+ *
5827
+ * MAX_RT_PRIO / 2
5828
+ *
5829
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
5830
+ * know enough information to make a sensible choice.
5831
+ */
5832
+void sched_set_fifo(struct task_struct *p)
5833
+{
5834
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5835
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5836
+}
5837
+EXPORT_SYMBOL_GPL(sched_set_fifo);
5838
+
5839
+/*
5840
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5841
+ */
5842
+void sched_set_fifo_low(struct task_struct *p)
5843
+{
5844
+ struct sched_param sp = { .sched_priority = 1 };
5845
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5846
+}
5847
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5848
+
5849
+void sched_set_normal(struct task_struct *p, int nice)
5850
+{
5851
+ struct sched_attr attr = {
5852
+ .sched_policy = SCHED_NORMAL,
5853
+ .sched_nice = nice,
5854
+ };
5855
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5856
+}
5857
+EXPORT_SYMBOL_GPL(sched_set_normal);
52115858
52125859 static int
52135860 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
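
The helpers above exist so that drivers stop hard-coding "priority 99" style requests and leave the exact value to the administrator. A minimal sketch of the intended use from a hypothetical module (the worker function and names are invented for illustration; kthread_run() and sched_set_fifo() are the real APIs):

        #include <linux/err.h>
        #include <linux/jiffies.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static int my_poll_fn(void *data)               /* hypothetical worker */
        {
                while (!kthread_should_stop())
                        schedule_timeout_interruptible(HZ);
                return 0;
        }

        static int my_start_worker(void)
        {
                struct task_struct *t = kthread_run(my_poll_fn, NULL, "my_poll");

                if (IS_ERR(t))
                        return PTR_ERR(t);
                /* asks for MAX_RT_PRIO / 2; the vendor remap above turns that into 75 */
                sched_set_fifo(t);
                return 0;
        }

If the thread only needs to stay ahead of SCHED_NORMAL work, sched_set_fifo_low() is the lighter default; either choice can still be re-tuned from userspace with chrt(1).
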
....@@ -5239,9 +5886,6 @@
52395886 u32 size;
52405887 int ret;
52415888
5242
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5243
- return -EFAULT;
5244
-
52455889 /* Zero the full structure, so that a short copy will be nice: */
52465890 memset(attr, 0, sizeof(*attr));
52475891
....@@ -5249,44 +5893,18 @@
52495893 if (ret)
52505894 return ret;
52515895
5252
- /* Bail out on silly large: */
5253
- if (size > PAGE_SIZE)
5254
- goto err_size;
5255
-
52565896 /* ABI compatibility quirk: */
52575897 if (!size)
52585898 size = SCHED_ATTR_SIZE_VER0;
5259
-
5260
- if (size < SCHED_ATTR_SIZE_VER0)
5899
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
52615900 goto err_size;
52625901
5263
- /*
5264
- * If we're handed a bigger struct than we know of,
5265
- * ensure all the unknown bits are 0 - i.e. new
5266
- * user-space does not rely on any kernel feature
5267
- * extensions we dont know about yet.
5268
- */
5269
- if (size > sizeof(*attr)) {
5270
- unsigned char __user *addr;
5271
- unsigned char __user *end;
5272
- unsigned char val;
5273
-
5274
- addr = (void __user *)uattr + sizeof(*attr);
5275
- end = (void __user *)uattr + size;
5276
-
5277
- for (; addr < end; addr++) {
5278
- ret = get_user(val, addr);
5279
- if (ret)
5280
- return ret;
5281
- if (val)
5282
- goto err_size;
5283
- }
5284
- size = sizeof(*attr);
5902
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5903
+ if (ret) {
5904
+ if (ret == -E2BIG)
5905
+ goto err_size;
5906
+ return ret;
52855907 }
5286
-
5287
- ret = copy_from_user(attr, uattr, size);
5288
- if (ret)
5289
- return -EFAULT;
52905908
52915909 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
52925910 size < SCHED_ATTR_SIZE_VER1)
....@@ -5303,6 +5921,16 @@
53035921 err_size:
53045922 put_user(sizeof(*attr), &uattr->size);
53055923 return -E2BIG;
5924
+}
5925
+
5926
+static void get_params(struct task_struct *p, struct sched_attr *attr)
5927
+{
5928
+ if (task_has_dl_policy(p))
5929
+ __getparam_dl(p, attr);
5930
+ else if (task_has_rt_policy(p))
5931
+ attr->sched_priority = p->rt_priority;
5932
+ else
5933
+ attr->sched_nice = task_nice(p);
53065934 }
53075935
53085936 /**
....@@ -5366,6 +5994,8 @@
53665994 rcu_read_unlock();
53675995
53685996 if (likely(p)) {
5997
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
5998
+ get_params(p, &attr);
53695999 retval = sched_setattr(p, &attr);
53706000 put_task_struct(p);
53716001 }
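
With the SCHED_FLAG_KEEP_PARAMS handling added above, userspace can update only a task's utilization clamps without touching its policy or priority (per the "Can't change util-clamps" check added earlier in this diff, callers without CAP_SYS_NICE get -EPERM). A hedged userspace sketch using the raw syscall, since glibc provides no wrapper; the flag names are the uapi ones and the pid/clamp values are arbitrary:

        #define _GNU_SOURCE
        #include <stdint.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <sys/types.h>
        #include <unistd.h>
        #include <linux/sched.h>                /* SCHED_FLAG_*      */
        #include <linux/sched/types.h>          /* struct sched_attr */

        static int set_util_clamp(pid_t pid, uint32_t umin, uint32_t umax)
        {
                struct sched_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.size = sizeof(attr);       /* checked by copy_struct_from_user() */
                attr.sched_flags = SCHED_FLAG_KEEP_ALL |
                                   SCHED_FLAG_UTIL_CLAMP_MIN |
                                   SCHED_FLAG_UTIL_CLAMP_MAX;
                attr.sched_util_min = umin;     /* 0..1024 */
                attr.sched_util_max = umax;
                return syscall(SYS_sched_setattr, pid, &attr, 0);
        }

KEEP_POLICY keeps the current policy, KEEP_PARAMS makes the kernel refill the current nice/RT/DL parameters via get_params(), and only the clamp values change.
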
....@@ -5459,7 +6089,7 @@
54596089 {
54606090 unsigned int ksize = sizeof(*kattr);
54616091
5462
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6092
+ if (!access_ok(uattr, usize))
54636093 return -EFAULT;
54646094
54656095 /*
....@@ -5487,7 +6117,7 @@
54876117 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
54886118 * @pid: the pid in question.
54896119 * @uattr: structure containing the extended parameters.
5490
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6120
+ * @usize: sizeof(attr) for fwd/bwd comp.
54916121 * @flags: for future extension.
54926122 */
54936123 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5514,14 +6144,15 @@
55146144 kattr.sched_policy = p->policy;
55156145 if (p->sched_reset_on_fork)
55166146 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5517
- if (task_has_dl_policy(p))
5518
- __getparam_dl(p, &kattr);
5519
- else if (task_has_rt_policy(p))
5520
- kattr.sched_priority = p->rt_priority;
5521
- else
5522
- kattr.sched_nice = task_nice(p);
6147
+ get_params(p, &kattr);
6148
+ kattr.sched_flags &= SCHED_FLAG_ALL;
55236149
55246150 #ifdef CONFIG_UCLAMP_TASK
6151
+ /*
6152
+ * This could race with another potential updater, but this is fine
6153
+ * because it'll correctly read the old or the new value. We don't need
6154
+ * to guarantee who wins the race as long as it doesn't return garbage.
6155
+ */
55256156 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
55266157 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
55276158 #endif
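
Reading the clamps back goes through sched_getattr(), whose third argument is the structure size the caller was built against; that is what the ksize/usize trimming above implements. A small fragment (same caveats and headers as the sched_setattr() sketch earlier, plus <stdio.h>, with pid as before):

        struct sched_attr attr;

        if (syscall(SYS_sched_getattr, pid, &attr, sizeof(attr), 0) == 0)
                printf("policy=%u util_min=%u util_max=%u\n",
                       attr.sched_policy, attr.sched_util_min, attr.sched_util_max);

As the race comment above notes, the values reported are the last requested clamps (uclamp_req), read without any lock: a concurrent writer may win, but the reader sees either the old or the new value, never garbage.
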
....@@ -5540,6 +6171,7 @@
55406171 cpumask_var_t cpus_allowed, new_mask;
55416172 struct task_struct *p;
55426173 int retval;
6174
+ int skip = 0;
55436175
55446176 rcu_read_lock();
55456177
....@@ -5575,6 +6207,9 @@
55756207 rcu_read_unlock();
55766208 }
55776209
6210
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6211
+ if (skip)
6212
+ goto out_free_new_mask;
55786213 retval = security_task_setscheduler(p);
55796214 if (retval)
55806215 goto out_free_new_mask;
....@@ -5615,6 +6250,9 @@
56156250 goto again;
56166251 }
56176252 }
6253
+
6254
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6255
+
56186256 out_free_new_mask:
56196257 free_cpumask_var(new_mask);
56206258 out_free_cpus_allowed:
....@@ -5623,7 +6261,6 @@
56236261 put_task_struct(p);
56246262 return retval;
56256263 }
5626
-EXPORT_SYMBOL_GPL(sched_setaffinity);
56276264
56286265 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
56296266 struct cpumask *new_mask)
....@@ -5742,6 +6379,8 @@
57426379 schedstat_inc(rq->yld_count);
57436380 current->sched_class->yield_task(rq);
57446381
6382
+ trace_android_rvh_do_sched_yield(rq);
6383
+
57456384 preempt_disable();
57466385 rq_unlock_irq(rq, &rf);
57476386 sched_preempt_enable_no_resched();
....@@ -5755,7 +6394,7 @@
57556394 return 0;
57566395 }
57576396
5758
-#ifndef CONFIG_PREEMPT
6397
+#ifndef CONFIG_PREEMPTION
57596398 int __sched _cond_resched(void)
57606399 {
57616400 if (should_resched(0)) {
....@@ -5772,7 +6411,7 @@
57726411 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
57736412 * call schedule, and on return reacquire the lock.
57746413 *
5775
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6414
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
57766415 * operations here to prevent schedule() from being called twice (once via
57776416 * spin_unlock(), once by hand).
57786417 */
....@@ -5876,7 +6515,7 @@
58766515 if (task_running(p_rq, p) || p->state)
58776516 goto out_unlock;
58786517
5879
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
6518
+ yielded = curr->sched_class->yield_to_task(rq, p);
58806519 if (yielded) {
58816520 schedstat_inc(rq->yld_count);
58826521 /*
....@@ -6042,7 +6681,7 @@
60426681 * an error code.
60436682 */
60446683 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6045
- struct timespec __user *, interval)
6684
+ struct __kernel_timespec __user *, interval)
60466685 {
60476686 struct timespec64 t;
60486687 int retval = sched_rr_get_interval(pid, &t);
....@@ -6053,16 +6692,15 @@
60536692 return retval;
60546693 }
60556694
6056
-#ifdef CONFIG_COMPAT
6057
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
6058
- compat_pid_t, pid,
6059
- struct compat_timespec __user *, interval)
6695
+#ifdef CONFIG_COMPAT_32BIT_TIME
6696
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6697
+ struct old_timespec32 __user *, interval)
60606698 {
60616699 struct timespec64 t;
60626700 int retval = sched_rr_get_interval(pid, &t);
60636701
60646702 if (retval == 0)
6065
- retval = compat_put_timespec64(&t, interval);
6703
+ retval = put_old_timespec32(&t, interval);
60666704 return retval;
60676705 }
60686706 #endif
....@@ -6075,10 +6713,10 @@
60756713 if (!try_get_task_stack(p))
60766714 return;
60776715
6078
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
6716
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
60796717
60806718 if (p->state == TASK_RUNNING)
6081
- printk(KERN_CONT " running task ");
6719
+ pr_cont(" running task ");
60826720 #ifdef CONFIG_DEBUG_STACK_USAGE
60836721 free = stack_not_used(p);
60846722 #endif
....@@ -6087,12 +6725,13 @@
60876725 if (pid_alive(p))
60886726 ppid = task_pid_nr(rcu_dereference(p->real_parent));
60896727 rcu_read_unlock();
6090
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6091
- task_pid_nr(p), ppid,
6728
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6729
+ free, task_pid_nr(p), ppid,
60926730 (unsigned long)task_thread_info(p)->flags);
60936731
60946732 print_worker_info(KERN_INFO, p);
6095
- show_stack(p, NULL);
6733
+ trace_android_vh_sched_show_task(p);
6734
+ show_stack(p, NULL, KERN_INFO);
60966735 put_task_stack(p);
60976736 }
60986737 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6123,13 +6762,6 @@
61236762 {
61246763 struct task_struct *g, *p;
61256764
6126
-#if BITS_PER_LONG == 32
6127
- printk(KERN_INFO
6128
- " task PC stack pid father\n");
6129
-#else
6130
- printk(KERN_INFO
6131
- " task PC stack pid father\n");
6132
-#endif
61336765 rcu_read_lock();
61346766 for_each_process_thread(g, p) {
61356767 /*
....@@ -6165,7 +6797,7 @@
61656797 * NOTE: this function does not set the idle thread's NEED_RESCHED
61666798 * flag, to make booting more robust.
61676799 */
6168
-void init_idle(struct task_struct *idle, int cpu)
6800
+void __init init_idle(struct task_struct *idle, int cpu)
61696801 {
61706802 struct rq *rq = cpu_rq(cpu);
61716803 unsigned long flags;
....@@ -6178,9 +6810,6 @@
61786810 idle->state = TASK_RUNNING;
61796811 idle->se.exec_start = sched_clock();
61806812 idle->flags |= PF_IDLE;
6181
-
6182
- scs_task_reset(idle);
6183
- kasan_unpoison_task_stack(idle);
61846813
61856814 #ifdef CONFIG_SMP
61866815 /*
....@@ -6205,7 +6834,8 @@
62056834 __set_task_cpu(idle, cpu);
62066835 rcu_read_unlock();
62076836
6208
- rq->curr = rq->idle = idle;
6837
+ rq->idle = idle;
6838
+ rcu_assign_pointer(rq->curr, idle);
62096839 idle->on_rq = TASK_ON_RQ_QUEUED;
62106840 #ifdef CONFIG_SMP
62116841 idle->on_cpu = 1;
....@@ -6215,9 +6845,7 @@
62156845
62166846 /* Set the preempt count _outside_ the spinlocks! */
62176847 init_idle_preempt_count(idle, cpu);
6218
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
6219
- task_thread_info(idle)->preempt_lazy_count = 0;
6220
-#endif
6848
+
62216849 /*
62226850 * The idle tasks have their own, simple scheduling class:
62236851 */
....@@ -6245,7 +6873,7 @@
62456873 }
62466874
62476875 int task_can_attach(struct task_struct *p,
6248
- const struct cpumask *cs_cpus_allowed)
6876
+ const struct cpumask *cs_effective_cpus)
62496877 {
62506878 int ret = 0;
62516879
....@@ -6264,8 +6892,13 @@
62646892 }
62656893
62666894 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6267
- cs_cpus_allowed))
6268
- ret = dl_task_can_attach(p, cs_cpus_allowed);
6895
+ cs_effective_cpus)) {
6896
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
6897
+
6898
+ if (unlikely(cpu >= nr_cpu_ids))
6899
+ return -EINVAL;
6900
+ ret = dl_cpu_busy(cpu, p);
6901
+ }
62696902
62706903 out:
62716904 return ret;
....@@ -6316,13 +6949,12 @@
63166949 if (queued)
63176950 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
63186951 if (running)
6319
- set_curr_task(rq, p);
6952
+ set_next_task(rq, p);
63206953 task_rq_unlock(rq, p, &rf);
63216954 }
63226955 #endif /* CONFIG_NUMA_BALANCING */
63236956
63246957 #ifdef CONFIG_HOTPLUG_CPU
6325
-
63266958 /*
63276959 * Ensure that the idle task is using init_mm right before its CPU goes
63286960 * offline.
....@@ -6358,21 +6990,22 @@
63586990 atomic_long_add(delta, &calc_load_tasks);
63596991 }
63606992
6361
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6993
+static struct task_struct *__pick_migrate_task(struct rq *rq)
63626994 {
6995
+ const struct sched_class *class;
6996
+ struct task_struct *next;
6997
+
6998
+ for_each_class(class) {
6999
+ next = class->pick_next_task(rq);
7000
+ if (next) {
7001
+ next->sched_class->put_prev_task(rq, next);
7002
+ return next;
7003
+ }
7004
+ }
7005
+
7006
+ /* The idle class should always have a runnable task */
7007
+ BUG();
63637008 }
6364
-
6365
-static const struct sched_class fake_sched_class = {
6366
- .put_prev_task = put_prev_task_fake,
6367
-};
6368
-
6369
-static struct task_struct fake_task = {
6370
- /*
6371
- * Avoid pull_{rt,dl}_task()
6372
- */
6373
- .prio = MAX_PRIO + 1,
6374
- .sched_class = &fake_sched_class,
6375
-};
63767009
63777010 /*
63787011 * Migrate all tasks from the rq, sleeping tasks will be migrated by
....@@ -6381,11 +7014,14 @@
63817014 * Called with rq->lock held even though we're in stop_machine() and
63827015 * there's no concurrency possible, we hold the required locks anyway
63837016 * because of lock validation efforts.
7017
+ *
7018
+ * force: if false, the function will skip CPU pinned kthreads.
63847019 */
6385
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
7020
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
63867021 {
63877022 struct rq *rq = dead_rq;
6388
- struct task_struct *next, *stop = rq->stop;
7023
+ struct task_struct *next, *tmp, *stop = rq->stop;
7024
+ LIST_HEAD(percpu_kthreads);
63897025 struct rq_flags orf = *rf;
63907026 int dest_cpu;
63917027
....@@ -6407,6 +7043,11 @@
64077043 */
64087044 update_rq_clock(rq);
64097045
7046
+#ifdef CONFIG_SCHED_DEBUG
7047
+ /* note the clock update in orf */
7048
+ orf.clock_update_flags |= RQCF_UPDATED;
7049
+#endif
7050
+
64107051 for (;;) {
64117052 /*
64127053 * There's this thread running, bail when that's the only
....@@ -6415,14 +7056,21 @@
64157056 if (rq->nr_running == 1)
64167057 break;
64177058
6418
- /*
6419
- * pick_next_task() assumes pinned rq->lock:
6420
- */
6421
- next = pick_next_task(rq, &fake_task, rf);
6422
- BUG_ON(!next);
6423
- put_prev_task(rq, next);
7059
+ next = __pick_migrate_task(rq);
64247060
6425
- WARN_ON_ONCE(__migrate_disabled(next));
7061
+ /*
7062
+ * Argh ... no iterator for tasks, we need to remove the
7063
+ * kthread from the run-queue to continue.
7064
+ */
7065
+ if (!force && is_per_cpu_kthread(next)) {
7066
+ INIT_LIST_HEAD(&next->percpu_kthread_node);
7067
+ list_add(&next->percpu_kthread_node, &percpu_kthreads);
7068
+
7069
+ /* DEQUEUE_SAVE not used due to move_entity in rt */
7070
+ deactivate_task(rq, next,
7071
+ DEQUEUE_NOCLOCK);
7072
+ continue;
7073
+ }
64267074
64277075 /*
64287076 * Rules for changing task_struct::cpus_mask are holding
....@@ -6442,7 +7090,14 @@
64427090 * changed the task, WARN if weird stuff happened, because in
64437091 * that case the above rq->lock drop is a fail too.
64447092 */
6445
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
7093
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7094
+ /*
7095
+ * In the !force case, there is a hole between
7096
+ * rq_unlock() and rq_relock(), where another CPU might
7097
+ * not observe an up to date cpu_active_mask and try to
7098
+ * move tasks around.
7099
+ */
7100
+ WARN_ON(force);
64467101 raw_spin_unlock(&next->pi_lock);
64477102 continue;
64487103 }
....@@ -6459,7 +7114,49 @@
64597114 raw_spin_unlock(&next->pi_lock);
64607115 }
64617116
7117
+ list_for_each_entry_safe(next, tmp, &percpu_kthreads,
7118
+ percpu_kthread_node) {
7119
+
7120
+ /* ENQUEUE_RESTORE not used due to move_entity in rt */
7121
+ activate_task(rq, next, ENQUEUE_NOCLOCK);
7122
+ list_del(&next->percpu_kthread_node);
7123
+ }
7124
+
64627125 rq->stop = stop;
7126
+}
7127
+
7128
+static int drain_rq_cpu_stop(void *data)
7129
+{
7130
+ struct rq *rq = this_rq();
7131
+ struct rq_flags rf;
7132
+
7133
+ rq_lock_irqsave(rq, &rf);
7134
+ migrate_tasks(rq, &rf, false);
7135
+ rq_unlock_irqrestore(rq, &rf);
7136
+
7137
+ return 0;
7138
+}
7139
+
7140
+int sched_cpu_drain_rq(unsigned int cpu)
7141
+{
7142
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7143
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7144
+
7145
+ if (idle_cpu(cpu)) {
7146
+ rq_drain->done = NULL;
7147
+ return 0;
7148
+ }
7149
+
7150
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7151
+ rq_drain_done);
7152
+}
7153
+
7154
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7155
+{
7156
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7157
+
7158
+ if (rq_drain->done)
7159
+ cpu_stop_work_wait(rq_drain);
64637160 }
64647161 #endif /* CONFIG_HOTPLUG_CPU */
64657162
....@@ -6531,8 +7228,10 @@
65317228 static int cpuset_cpu_inactive(unsigned int cpu)
65327229 {
65337230 if (!cpuhp_tasks_frozen) {
6534
- if (dl_cpu_busy(cpu))
6535
- return -EBUSY;
7231
+ int ret = dl_cpu_busy(cpu, NULL);
7232
+
7233
+ if (ret)
7234
+ return ret;
65367235 cpuset_update_active_cpus();
65377236 } else {
65387237 num_cpus_frozen++;
....@@ -6581,19 +7280,27 @@
65817280 return 0;
65827281 }
65837282
6584
-int sched_cpu_deactivate(unsigned int cpu)
7283
+int sched_cpus_activate(struct cpumask *cpus)
7284
+{
7285
+ unsigned int cpu;
7286
+
7287
+ for_each_cpu(cpu, cpus) {
7288
+ if (sched_cpu_activate(cpu)) {
7289
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7290
+ sched_cpu_deactivate(cpu);
7291
+
7292
+ return -EBUSY;
7293
+ }
7294
+ }
7295
+
7296
+ return 0;
7297
+}
7298
+
7299
+int _sched_cpu_deactivate(unsigned int cpu)
65857300 {
65867301 int ret;
65877302
65887303 set_cpu_active(cpu, false);
6589
- /*
6590
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6591
- * users of this state to go away such that all new such users will
6592
- * observe it.
6593
- *
6594
- * Do sync before park smpboot threads to take care the rcu boost case.
6595
- */
6596
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
65977304
65987305 #ifdef CONFIG_SCHED_SMT
65997306 /*
....@@ -6612,6 +7319,46 @@
66127319 return ret;
66137320 }
66147321 sched_domains_numa_masks_clear(cpu);
7322
+
7323
+ update_max_interval();
7324
+
7325
+ return 0;
7326
+}
7327
+
7328
+int sched_cpu_deactivate(unsigned int cpu)
7329
+{
7330
+ int ret = _sched_cpu_deactivate(cpu);
7331
+
7332
+ if (ret)
7333
+ return ret;
7334
+
7335
+ /*
7336
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7337
+ * users of this state to go away such that all new such users will
7338
+ * observe it.
7339
+ *
7340
+ * Do sync before parking smpboot threads to take care of the rcu boost case.
7341
+ */
7342
+ synchronize_rcu();
7343
+
7344
+ return 0;
7345
+}
7346
+
7347
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
7348
+{
7349
+ unsigned int cpu;
7350
+
7351
+ for_each_cpu(cpu, cpus) {
7352
+ if (_sched_cpu_deactivate(cpu)) {
7353
+ for_each_cpu(cpu, cpus) {
7354
+ if (!cpu_active(cpu))
7355
+ sched_cpu_activate(cpu);
7356
+ }
7357
+
7358
+ return -EBUSY;
7359
+ }
7360
+ }
7361
+
66157362 return 0;
66167363 }
66177364
....@@ -6620,13 +7367,13 @@
66207367 struct rq *rq = cpu_rq(cpu);
66217368
66227369 rq->calc_load_update = calc_load_update;
6623
- update_max_interval();
66247370 }
66257371
66267372 int sched_cpu_starting(unsigned int cpu)
66277373 {
66287374 sched_rq_cpu_starting(cpu);
66297375 sched_tick_start(cpu);
7376
+ trace_android_rvh_sched_cpu_starting(cpu);
66307377 return 0;
66317378 }
66327379
....@@ -6637,7 +7384,6 @@
66377384 struct rq_flags rf;
66387385
66397386 /* Handle pending wakeups and then migrate everything off */
6640
- sched_ttwu_pending();
66417387 sched_tick_stop(cpu);
66427388
66437389 rq_lock_irqsave(rq, &rf);
....@@ -6645,12 +7391,13 @@
66457391 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
66467392 set_rq_offline(rq);
66477393 }
6648
- migrate_tasks(rq, &rf);
7394
+ migrate_tasks(rq, &rf, true);
66497395 BUG_ON(rq->nr_running != 1);
66507396 rq_unlock_irqrestore(rq, &rf);
66517397
7398
+ trace_android_rvh_sched_cpu_dying(cpu);
7399
+
66527400 calc_load_migrate(rq);
6653
- update_max_interval();
66547401 nohz_balance_exit_idle(rq);
66557402 hrtick_clear(rq);
66567403 return 0;
....@@ -6664,18 +7411,16 @@
66647411 /*
66657412 * There's no userspace yet to cause hotplug operations; hence all the
66667413 * CPU masks are stable and all blatant races in the below code cannot
6667
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6668
- * but there won't be any contention on it.
7414
+ * happen.
66697415 */
6670
- cpus_read_lock();
66717416 mutex_lock(&sched_domains_mutex);
66727417 sched_init_domains(cpu_active_mask);
66737418 mutex_unlock(&sched_domains_mutex);
6674
- cpus_read_unlock();
66757419
66767420 /* Move init over to a non-isolated CPU */
66777421 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
66787422 BUG();
7423
+
66797424 sched_init_granularity();
66807425
66817426 init_sched_rt_class();
....@@ -6686,7 +7431,7 @@
66867431
66877432 static int __init migration_init(void)
66887433 {
6689
- sched_rq_cpu_starting(smp_processor_id());
7434
+ sched_cpu_starting(smp_processor_id());
66907435 return 0;
66917436 }
66927437 early_initcall(migration_init);
....@@ -6711,7 +7456,9 @@
67117456 * Every task in system belongs to this group at bootup.
67127457 */
67137458 struct task_group root_task_group;
7459
+EXPORT_SYMBOL_GPL(root_task_group);
67147460 LIST_HEAD(task_groups);
7461
+EXPORT_SYMBOL_GPL(task_groups);
67157462
67167463 /* Cacheline aligned slab cache for task_group */
67177464 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6722,19 +7469,27 @@
67227469
67237470 void __init sched_init(void)
67247471 {
6725
- int i, j;
6726
- unsigned long alloc_size = 0, ptr;
7472
+ unsigned long ptr = 0;
7473
+ int i;
7474
+
7475
+ /* Make sure the linker didn't screw up */
7476
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7477
+ &fair_sched_class + 1 != &rt_sched_class ||
7478
+ &rt_sched_class + 1 != &dl_sched_class);
7479
+#ifdef CONFIG_SMP
7480
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7481
+#endif
67277482
67287483 wait_bit_init();
67297484
67307485 #ifdef CONFIG_FAIR_GROUP_SCHED
6731
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7486
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67327487 #endif
67337488 #ifdef CONFIG_RT_GROUP_SCHED
6734
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7489
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67357490 #endif
6736
- if (alloc_size) {
6737
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7491
+ if (ptr) {
7492
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
67387493
67397494 #ifdef CONFIG_FAIR_GROUP_SCHED
67407495 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6743,6 +7498,8 @@
67437498 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
67447499 ptr += nr_cpu_ids * sizeof(void **);
67457500
7501
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7502
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67467503 #endif /* CONFIG_FAIR_GROUP_SCHED */
67477504 #ifdef CONFIG_RT_GROUP_SCHED
67487505 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6795,7 +7552,6 @@
67957552 init_rt_rq(&rq->rt);
67967553 init_dl_rq(&rq->dl);
67977554 #ifdef CONFIG_FAIR_GROUP_SCHED
6798
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
67997555 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
68007556 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
68017557 /*
....@@ -6817,7 +7573,6 @@
68177573 * We achieve this by letting root_task_group's tasks sit
68187574 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
68197575 */
6820
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
68217576 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
68227577 #endif /* CONFIG_FAIR_GROUP_SCHED */
68237578
....@@ -6825,10 +7580,6 @@
68257580 #ifdef CONFIG_RT_GROUP_SCHED
68267581 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
68277582 #endif
6828
-
6829
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6830
- rq->cpu_load[j] = 0;
6831
-
68327583 #ifdef CONFIG_SMP
68337584 rq->sd = NULL;
68347585 rq->rd = NULL;
....@@ -6847,16 +7598,17 @@
68477598
68487599 rq_attach_root(rq, &def_root_domain);
68497600 #ifdef CONFIG_NO_HZ_COMMON
6850
- rq->last_load_update_tick = jiffies;
68517601 rq->last_blocked_load_update_tick = jiffies;
68527602 atomic_set(&rq->nohz_flags, 0);
7603
+
7604
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
68537605 #endif
68547606 #endif /* CONFIG_SMP */
68557607 hrtick_rq_init(rq);
68567608 atomic_set(&rq->nr_iowait, 0);
68577609 }
68587610
6859
- set_load_weight(&init_task, false);
7611
+ set_load_weight(&init_task);
68607612
68617613 /*
68627614 * The boot idle thread does lazy MMU switching as well:
....@@ -6891,7 +7643,7 @@
68917643 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
68927644 static inline int preempt_count_equals(int preempt_offset)
68937645 {
6894
- int nested = preempt_count() + sched_rcu_preempt_depth();
7646
+ int nested = preempt_count() + rcu_preempt_depth();
68957647
68967648 return (nested == preempt_offset);
68977649 }
....@@ -6925,7 +7677,7 @@
69257677 rcu_sleep_check();
69267678
69277679 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6928
- !is_idle_task(current)) ||
7680
+ !is_idle_task(current) && !current->non_block_count) ||
69297681 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
69307682 oops_in_progress)
69317683 return;
....@@ -6941,8 +7693,8 @@
69417693 "BUG: sleeping function called from invalid context at %s:%d\n",
69427694 file, line);
69437695 printk(KERN_ERR
6944
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6945
- in_atomic(), irqs_disabled(),
7696
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7697
+ in_atomic(), irqs_disabled(), current->non_block_count,
69467698 current->pid, current->comm);
69477699
69487700 if (task_stack_end_corrupted(current))
....@@ -6954,13 +7706,43 @@
69547706 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
69557707 && !preempt_count_equals(preempt_offset)) {
69567708 pr_err("Preemption disabled at:");
6957
- print_ip_sym(preempt_disable_ip);
6958
- pr_cont("\n");
7709
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
69597710 }
7711
+
7712
+ trace_android_rvh_schedule_bug(NULL);
7713
+
69607714 dump_stack();
69617715 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
69627716 }
69637717 EXPORT_SYMBOL(___might_sleep);
7718
+
7719
+void __cant_sleep(const char *file, int line, int preempt_offset)
7720
+{
7721
+ static unsigned long prev_jiffy;
7722
+
7723
+ if (irqs_disabled())
7724
+ return;
7725
+
7726
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7727
+ return;
7728
+
7729
+ if (preempt_count() > preempt_offset)
7730
+ return;
7731
+
7732
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7733
+ return;
7734
+ prev_jiffy = jiffies;
7735
+
7736
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7737
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7738
+ in_atomic(), irqs_disabled(),
7739
+ current->pid, current->comm);
7740
+
7741
+ debug_show_held_locks(current);
7742
+ dump_stack();
7743
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7744
+}
7745
+EXPORT_SYMBOL_GPL(__cant_sleep);
69647746 #endif
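
__cant_sleep() is normally reached through the cant_sleep() annotation in <linux/kernel.h>. A minimal sketch of where such an annotation belongs (the function and per-CPU counter are invented for illustration):

        #include <linux/kernel.h>
        #include <linux/percpu.h>

        static DEFINE_PER_CPU(u64, my_hits);            /* hypothetical counter */

        /* Callers must have preemption (or interrupts) disabled. */
        static void my_hits_inc(void)
        {
                cant_sleep();   /* prints "BUG: assuming atomic context" if preemptible */
                __this_cpu_inc(my_hits);
        }

Where ___might_sleep() complains about sleeping calls made from atomic context, __cant_sleep() complains about the opposite: being entered from a fully preemptible context when the caller relies on running atomically.
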
69657747
69667748 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -7029,7 +7811,7 @@
70297811
70307812 #ifdef CONFIG_IA64
70317813 /**
7032
- * set_curr_task - set the current task for a given CPU.
7814
+ * ia64_set_curr_task - set the current task for a given CPU.
70337815 * @cpu: the processor in question.
70347816 * @p: the task pointer to set.
70357817 *
....@@ -7195,8 +7977,15 @@
71957977
71967978 if (queued)
71977979 enqueue_task(rq, tsk, queue_flags);
7198
- if (running)
7199
- set_curr_task(rq, tsk);
7980
+ if (running) {
7981
+ set_next_task(rq, tsk);
7982
+ /*
7983
+ * After changing group, the running task may have joined a
7984
+ * throttled one but it's still the running task. Trigger a
7985
+ * resched to make sure that task can still run.
7986
+ */
7987
+ resched_curr(rq);
7988
+ }
72007989
72017990 task_rq_unlock(rq, tsk, &rf);
72027991 }
....@@ -7235,9 +8024,14 @@
72358024
72368025 #ifdef CONFIG_UCLAMP_TASK_GROUP
72378026 /* Propagate the effective uclamp value for the new group */
8027
+ mutex_lock(&uclamp_mutex);
8028
+ rcu_read_lock();
72388029 cpu_util_update_eff(css);
8030
+ rcu_read_unlock();
8031
+ mutex_unlock(&uclamp_mutex);
72398032 #endif
72408033
8034
+ trace_android_rvh_cpu_cgroup_online(css);
72418035 return 0;
72428036 }
72438037
....@@ -7303,6 +8097,9 @@
73038097 if (ret)
73048098 break;
73058099 }
8100
+
8101
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8102
+
73068103 return ret;
73078104 }
73088105
....@@ -7313,6 +8110,8 @@
73138110
73148111 cgroup_taskset_for_each(task, css, tset)
73158112 sched_move_task(task);
8113
+
8114
+ trace_android_rvh_cpu_cgroup_attach(tset);
73168115 }
73178116
73188117 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7324,6 +8123,9 @@
73248123 unsigned int eff[UCLAMP_CNT];
73258124 enum uclamp_id clamp_id;
73268125 unsigned int clamps;
8126
+
8127
+ lockdep_assert_held(&uclamp_mutex);
8128
+ SCHED_WARN_ON(!rcu_read_lock_held());
73278129
73288130 css_for_each_descendant_pre(css, top_css) {
73298131 uc_parent = css_tg(css)->parent
....@@ -7357,7 +8159,7 @@
73578159 }
73588160
73598161 /* Immediately update descendants RUNNABLE tasks */
7360
- uclamp_update_active_tasks(css, clamps);
8162
+ uclamp_update_active_tasks(css);
73618163 }
73628164 }
73638165
....@@ -7414,6 +8216,8 @@
74148216 req = capacity_from_percent(buf);
74158217 if (req.ret)
74168218 return req.ret;
8219
+
8220
+ static_branch_enable(&sched_uclamp_used);
74178221
74188222 mutex_lock(&uclamp_mutex);
74198223 rcu_read_lock();
....@@ -7529,7 +8333,9 @@
75298333 static DEFINE_MUTEX(cfs_constraints_mutex);
75308334
75318335 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7532
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8336
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8337
+/* More than 203 days if BW_SHIFT equals 20. */
8338
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
75338339
75348340 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
75358341
....@@ -7555,6 +8361,12 @@
75558361 * feasibility.
75568362 */
75578363 if (period > max_cfs_quota_period)
8364
+ return -EINVAL;
8365
+
8366
+ /*
8367
+ * Bound quota to defend quota against overflow during bandwidth shift.
8368
+ */
8369
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
75588370 return -EINVAL;
75598371
75608372 /*
....@@ -7609,7 +8421,7 @@
76098421 return ret;
76108422 }
76118423
7612
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8424
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
76138425 {
76148426 u64 quota, period;
76158427
....@@ -7624,7 +8436,7 @@
76248436 return tg_set_cfs_bandwidth(tg, period, quota);
76258437 }
76268438
7627
-long tg_get_cfs_quota(struct task_group *tg)
8439
+static long tg_get_cfs_quota(struct task_group *tg)
76288440 {
76298441 u64 quota_us;
76308442
....@@ -7637,7 +8449,7 @@
76378449 return quota_us;
76388450 }
76398451
7640
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8452
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
76418453 {
76428454 u64 quota, period;
76438455
....@@ -7650,7 +8462,7 @@
76508462 return tg_set_cfs_bandwidth(tg, period, quota);
76518463 }
76528464
7653
-long tg_get_cfs_period(struct task_group *tg)
8465
+static long tg_get_cfs_period(struct task_group *tg)
76548466 {
76558467 u64 cfs_period_us;
76568468
....@@ -8127,172 +8939,7 @@
81278939 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
81288940 };
81298941
8130
-#undef CREATE_TRACE_POINTS
8131
-
8132
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8133
-
8134
-static inline void
8135
-update_nr_migratory(struct task_struct *p, long delta)
8942
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
81368943 {
8137
- if (unlikely((p->sched_class == &rt_sched_class ||
8138
- p->sched_class == &dl_sched_class) &&
8139
- p->nr_cpus_allowed > 1)) {
8140
- if (p->sched_class == &rt_sched_class)
8141
- task_rq(p)->rt.rt_nr_migratory += delta;
8142
- else
8143
- task_rq(p)->dl.dl_nr_migratory += delta;
8144
- }
8944
+ trace_sched_update_nr_running_tp(rq, count);
81458945 }
8146
-
8147
-static inline void
8148
-migrate_disable_update_cpus_allowed(struct task_struct *p)
8149
-{
8150
- p->cpus_ptr = cpumask_of(smp_processor_id());
8151
- update_nr_migratory(p, -1);
8152
- p->nr_cpus_allowed = 1;
8153
-}
8154
-
8155
-static inline void
8156
-migrate_enable_update_cpus_allowed(struct task_struct *p)
8157
-{
8158
- struct rq *rq;
8159
- struct rq_flags rf;
8160
-
8161
- rq = task_rq_lock(p, &rf);
8162
- p->cpus_ptr = &p->cpus_mask;
8163
- p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
8164
- update_nr_migratory(p, 1);
8165
- task_rq_unlock(rq, p, &rf);
8166
-}
8167
-
8168
-void migrate_disable(void)
8169
-{
8170
- preempt_disable();
8171
-
8172
- if (++current->migrate_disable == 1) {
8173
- this_rq()->nr_pinned++;
8174
- preempt_lazy_disable();
8175
-#ifdef CONFIG_SCHED_DEBUG
8176
- WARN_ON_ONCE(current->pinned_on_cpu >= 0);
8177
- current->pinned_on_cpu = smp_processor_id();
8178
-#endif
8179
- }
8180
-
8181
- preempt_enable();
8182
-}
8183
-EXPORT_SYMBOL(migrate_disable);
8184
-
8185
-static void migrate_disabled_sched(struct task_struct *p)
8186
-{
8187
- if (p->migrate_disable_scheduled)
8188
- return;
8189
-
8190
- migrate_disable_update_cpus_allowed(p);
8191
- p->migrate_disable_scheduled = 1;
8192
-}
8193
-
8194
-static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
8195
-static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
8196
-
8197
-void migrate_enable(void)
8198
-{
8199
- struct task_struct *p = current;
8200
- struct rq *rq = this_rq();
8201
- int cpu = task_cpu(p);
8202
-
8203
- WARN_ON_ONCE(p->migrate_disable <= 0);
8204
- if (p->migrate_disable > 1) {
8205
- p->migrate_disable--;
8206
- return;
8207
- }
8208
-
8209
- preempt_disable();
8210
-
8211
-#ifdef CONFIG_SCHED_DEBUG
8212
- WARN_ON_ONCE(current->pinned_on_cpu != cpu);
8213
- current->pinned_on_cpu = -1;
8214
-#endif
8215
-
8216
- WARN_ON_ONCE(rq->nr_pinned < 1);
8217
-
8218
- p->migrate_disable = 0;
8219
- rq->nr_pinned--;
8220
-#ifdef CONFIG_HOTPLUG_CPU
8221
- if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
8222
- takedown_cpu_task)
8223
- wake_up_process(takedown_cpu_task);
8224
-#endif
8225
-
8226
- if (!p->migrate_disable_scheduled)
8227
- goto out;
8228
-
8229
- p->migrate_disable_scheduled = 0;
8230
-
8231
- migrate_enable_update_cpus_allowed(p);
8232
-
8233
- WARN_ON(smp_processor_id() != cpu);
8234
- if (!is_cpu_allowed(p, cpu)) {
8235
- struct migration_arg __percpu *arg;
8236
- struct cpu_stop_work __percpu *work;
8237
- struct rq_flags rf;
8238
-
8239
- work = this_cpu_ptr(&migrate_work);
8240
- arg = this_cpu_ptr(&migrate_arg);
8241
- WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
8242
-
8243
- arg->task = p;
8244
- arg->done = false;
8245
-
8246
- rq = task_rq_lock(p, &rf);
8247
- update_rq_clock(rq);
8248
- arg->dest_cpu = select_fallback_rq(cpu, p);
8249
- task_rq_unlock(rq, p, &rf);
8250
-
8251
- stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
8252
- arg, work);
8253
- tlb_migrate_finish(p->mm);
8254
- }
8255
-
8256
-out:
8257
- preempt_lazy_enable();
8258
- preempt_enable();
8259
-}
8260
-EXPORT_SYMBOL(migrate_enable);
8261
-
8262
-int cpu_nr_pinned(int cpu)
8263
-{
8264
- struct rq *rq = cpu_rq(cpu);
8265
-
8266
- return rq->nr_pinned;
8267
-}
8268
-
8269
-#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8270
-static void migrate_disabled_sched(struct task_struct *p)
8271
-{
8272
-}
8273
-
8274
-void migrate_disable(void)
8275
-{
8276
-#ifdef CONFIG_SCHED_DEBUG
8277
- current->migrate_disable++;
8278
-#endif
8279
- barrier();
8280
-}
8281
-EXPORT_SYMBOL(migrate_disable);
8282
-
8283
-void migrate_enable(void)
8284
-{
8285
-#ifdef CONFIG_SCHED_DEBUG
8286
- struct task_struct *p = current;
8287
-
8288
- WARN_ON_ONCE(p->migrate_disable <= 0);
8289
- p->migrate_disable--;
8290
-#endif
8291
- barrier();
8292
-}
8293
-EXPORT_SYMBOL(migrate_enable);
8294
-#else
8295
-static void migrate_disabled_sched(struct task_struct *p)
8296
-{
8297
-}
8298
-#endif