forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-11 297b60346df8beafee954a0fd7c2d64f33f3b9bc
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  kernel/sched/core.c
  *
@@ -5,6 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"
 
 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
 #include <asm/tlb.h>
 
 #include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"
 
 #include "pelt.h"
+#include "smp.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include <trace/hooks/sched.h>
+#include <trace/hooks/dtask.h>
+
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);
 
 #ifdef CONFIG_SCHED_DEBUG
 /*
@@ -38,6 +70,7 @@
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
 #undef SCHED_FEAT
 #endif
 
@@ -45,11 +78,7 @@
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
-#ifdef CONFIG_PREEMPT_RT_FULL
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
 
 /*
  * period over which we measure -rt task CPU usage in us.
@@ -64,6 +93,100 @@
  * default: 0.95s
  */
 int sysctl_sched_rt_runtime = 950000;
+
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ *  rq1->lock
+ *    rq2->lock  where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ *  - sched_setaffinity()/
+ *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
+ *  - set_user_nice():		p->se.load, p->*prio
+ *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
+ *				p->se.load, p->rt_priority,
+ *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ *  - sched_setnuma():		p->numa_preferred_nid
+ *  - sched_move_task()/
+ *    cpu_cgroup_fork():	p->sched_task_group
+ *  - uclamp_update_active()	p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ *   is changed locklessly using set_current_state(), __set_current_state() or
+ *   set_special_state(), see their respective comments, or by
+ *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ *   concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ *   is set by activate_task() and cleared by deactivate_task(), under
+ *   rq->lock. Non-zero indicates the task is runnable, the special
+ *   ON_RQ_MIGRATING state is used for migration without holding both
+ *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ *   is set by prepare_task() and cleared by finish_task() such that it will be
+ *   set before p is scheduled-in and cleared after p is scheduled-out, both
+ *   under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ *   [ The astute reader will observe that it is possible for two tasks on one
+ *     CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ *  - Don't call set_task_cpu() on a blocked task:
+ *
+ *    We don't care what CPU we're not running on, this simplifies hotplug,
+ *    the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ *  - for try_to_wake_up(), called under p->pi_lock:
+ *
+ *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ *  - for migration called under rq->lock:
+ *    [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ *    o move_queued_task()
+ *    o detach_task()
+ *
+ *  - for migration called under double_rq_lock():
+ *
+ *    o __migrate_swap_task()
+ *    o push_rt_task() / pull_rt_task()
+ *    o push_dl_task() / pull_dl_task()
+ *    o dl_task_offline_migration()
+ *
+ */
 
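
Editor's illustration (not part of the patch): a minimal sketch of how external code is expected to follow the lock order documented above. The helper name is hypothetical; task_rq_lock() takes p->pi_lock first and then rq->lock, and only then is the "special state" stable to modify.

static void example_update_special_state(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	/* Acquires p->pi_lock first, then rq->lock, matching the documented order. */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/* ... modify p->policy, p->*prio, p->sched_task_group, etc. here ... */

	task_rq_unlock(rq, p, &rf);	/* drops rq->lock, then p->pi_lock */
}
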
68191 /*
69192 * __task_rq_lock - lock the rq @p resides on.
....@@ -88,6 +211,7 @@
88211 cpu_relax();
89212 }
90213 }
214
+EXPORT_SYMBOL_GPL(__task_rq_lock);
91215
92216 /*
93217 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
....@@ -130,6 +254,7 @@
130254 cpu_relax();
131255 }
132256 }
257
+EXPORT_SYMBOL_GPL(task_rq_lock);
133258
134259 /*
135260 * RQ-clock updating methods:
....@@ -210,7 +335,15 @@
210335 rq->clock += delta;
211336 update_rq_clock_task(rq, delta);
212337 }
338
+EXPORT_SYMBOL_GPL(update_rq_clock);
213339
340
+static inline void
341
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
342
+{
343
+ csd->flags = 0;
344
+ csd->func = func;
345
+ csd->info = rq;
346
+}
214347
215348 #ifdef CONFIG_SCHED_HRTICK
216349 /*
....@@ -247,8 +380,9 @@
247380 static void __hrtick_restart(struct rq *rq)
248381 {
249382 struct hrtimer *timer = &rq->hrtick_timer;
383
+ ktime_t time = rq->hrtick_time;
250384
251
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
385
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
252386 }
253387
254388 /*
....@@ -261,7 +395,6 @@
261395
262396 rq_lock(rq, &rf);
263397 __hrtick_restart(rq);
264
- rq->hrtick_csd_pending = 0;
265398 rq_unlock(rq, &rf);
266399 }
267400
....@@ -273,7 +406,6 @@
273406 void hrtick_start(struct rq *rq, u64 delay)
274407 {
275408 struct hrtimer *timer = &rq->hrtick_timer;
276
- ktime_t time;
277409 s64 delta;
278410
279411 /*
....@@ -281,16 +413,12 @@
281413 * doesn't make sense and can cause timer DoS.
282414 */
283415 delta = max_t(s64, delay, 10000LL);
284
- time = ktime_add_ns(timer->base->get_time(), delta);
416
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
285417
286
- hrtimer_set_expires(timer, time);
287
-
288
- if (rq == this_rq()) {
418
+ if (rq == this_rq())
289419 __hrtick_restart(rq);
290
- } else if (!rq->hrtick_csd_pending) {
420
+ else
291421 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
292
- rq->hrtick_csd_pending = 1;
293
- }
294422 }
295423
296424 #else
....@@ -307,20 +435,16 @@
307435 */
308436 delay = max_t(u64, delay, 10000LL);
309437 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
310
- HRTIMER_MODE_REL_PINNED);
438
+ HRTIMER_MODE_REL_PINNED_HARD);
311439 }
440
+
312441 #endif /* CONFIG_SMP */
313442
314443 static void hrtick_rq_init(struct rq *rq)
315444 {
316445 #ifdef CONFIG_SMP
317
- rq->hrtick_csd_pending = 0;
318
-
319
- rq->hrtick_csd.flags = 0;
320
- rq->hrtick_csd.func = __hrtick_start;
321
- rq->hrtick_csd.info = rq;
446
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
322447 #endif
323
-
324448 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
325449 rq->hrtick_timer.function = hrtick;
326450 }
....@@ -403,15 +527,9 @@
403527 #endif
404528 #endif
405529
406
-void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
407
- bool sleeper)
530
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
408531 {
409
- struct wake_q_node *node;
410
-
411
- if (sleeper)
412
- node = &task->wake_q_sleeper;
413
- else
414
- node = &task->wake_q;
532
+ struct wake_q_node *node = &task->wake_q;
415533
416534 /*
417535 * Atomically grab the task, if ->wake_q is !nil already it means
....@@ -422,50 +540,79 @@
422540 * state, even in the failed case, an explicit smp_mb() must be used.
423541 */
424542 smp_mb__before_atomic();
425
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
426
- return;
427
-
428
- head->count++;
429
-
430
- get_task_struct(task);
543
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
544
+ return false;
431545
432546 /*
433547 * The head is context local, there can be no concurrency.
434548 */
435549 *head->lastp = node;
436550 head->lastp = &node->next;
551
+ head->count++;
552
+ return true;
437553 }
438554
439
-static int
440
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
441
- int sibling_count_hint);
442
-void __wake_up_q(struct wake_q_head *head, bool sleeper)
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	if (__wake_q_add(head, task))
+		get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+	if (!__wake_q_add(head, task))
+		put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
443597 {
444598 struct wake_q_node *node = head->first;
445599
446600 while (node != WAKE_Q_TAIL) {
447601 struct task_struct *task;
448602
449
- if (sleeper)
450
- task = container_of(node, struct task_struct, wake_q_sleeper);
451
- else
452
- task = container_of(node, struct task_struct, wake_q);
603
+ task = container_of(node, struct task_struct, wake_q);
453604 BUG_ON(!task);
454605 /* Task can safely be re-inserted now: */
455606 node = node->next;
456
- if (sleeper)
457
- task->wake_q_sleeper.next = NULL;
458
- else
459
- task->wake_q.next = NULL;
607
+ task->wake_q.next = NULL;
608
+ task->wake_q_count = head->count;
609
+
460610 /*
461611 * wake_up_process() executes a full barrier, which pairs with
462612 * the queueing in wake_q_add() so as not to miss wakeups.
463613 */
464
- if (sleeper)
465
- wake_up_lock_sleeper(task);
466
- else
467
- wake_up_process(task);
468
-
614
+ wake_up_process(task);
615
+ task->wake_q_count = 0;
469616 put_task_struct(task);
470617 }
471618 }
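
For context, a minimal sketch of how the wake_q API documented above is typically used by callers (not part of the patch; my_lock and waiter are placeholder names): wakeups are queued while a lock is held and only issued after it is dropped.

static void example_wake_one(spinlock_t *my_lock, struct task_struct *waiter)
{
	DEFINE_WAKE_Q(wake_q);

	spin_lock(my_lock);
	/* Takes a task reference; no wakeup happens yet. */
	wake_q_add(&wake_q, waiter);
	spin_unlock(my_lock);

	/* The deferred wake_up_process() calls happen here, outside the lock. */
	wake_up_q(&wake_q);
}
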
....@@ -495,57 +642,12 @@
495642 return;
496643 }
497644
498
-#ifdef CONFIG_PREEMPT
499645 if (set_nr_and_not_polling(curr))
500
-#else
501
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
502
-#endif
503646 smp_send_reschedule(cpu);
504647 else
505648 trace_sched_wake_idle_without_ipi(cpu);
506649 }
507
-
508
-#ifdef CONFIG_PREEMPT_LAZY
509
-
510
-static int tsk_is_polling(struct task_struct *p)
511
-{
512
-#ifdef TIF_POLLING_NRFLAG
513
- return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
514
-#else
515
- return 0;
516
-#endif
517
-}
518
-
519
-void resched_curr_lazy(struct rq *rq)
520
-{
521
- struct task_struct *curr = rq->curr;
522
- int cpu;
523
-
524
- if (!sched_feat(PREEMPT_LAZY)) {
525
- resched_curr(rq);
526
- return;
527
- }
528
-
529
- lockdep_assert_held(&rq->lock);
530
-
531
- if (test_tsk_need_resched(curr))
532
- return;
533
-
534
- if (test_tsk_need_resched_lazy(curr))
535
- return;
536
-
537
- set_tsk_need_resched_lazy(curr);
538
-
539
- cpu = cpu_of(rq);
540
- if (cpu == smp_processor_id())
541
- return;
542
-
543
- /* NEED_RESCHED_LAZY must be visible before we test polling */
544
- smp_mb();
545
- if (!tsk_is_polling(curr))
546
- smp_send_reschedule(cpu);
547
-}
548
-#endif
650
+EXPORT_SYMBOL_GPL(resched_curr);
549651
550652 void resched_cpu(int cpu)
551653 {
....@@ -570,27 +672,49 @@
570672 */
571673 int get_nohz_timer_target(void)
572674 {
573
- int i, cpu = smp_processor_id();
675
+ int i, cpu = smp_processor_id(), default_cpu = -1;
574676 struct sched_domain *sd;
575677
576
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
577
- return cpu;
678
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
679
+ if (!idle_cpu(cpu))
680
+ return cpu;
681
+ default_cpu = cpu;
682
+ }
578683
579684 rcu_read_lock();
580685 for_each_domain(cpu, sd) {
581
- for_each_cpu(i, sched_domain_span(sd)) {
686
+ for_each_cpu_and(i, sched_domain_span(sd),
687
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
582688 if (cpu == i)
583689 continue;
584690
585
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
691
+ if (!idle_cpu(i)) {
586692 cpu = i;
587693 goto unlock;
588694 }
589695 }
590696 }
591697
592
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
593
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
698
+ if (default_cpu == -1) {
699
+ for_each_cpu_and(i, cpu_active_mask,
700
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
701
+ if (cpu == i)
702
+ continue;
703
+
704
+ if (!idle_cpu(i)) {
705
+ cpu = i;
706
+ goto unlock;
707
+ }
708
+ }
709
+
710
+ /* no active, not-idle, housekeeping CPU found. */
711
+ default_cpu = cpumask_any(cpu_active_mask);
712
+
713
+ if (unlikely(default_cpu >= nr_cpu_ids))
714
+ goto unlock;
715
+ }
716
+
717
+ cpu = default_cpu;
594718 unlock:
595719 rcu_read_unlock();
596720 return cpu;
....@@ -650,29 +774,23 @@
650774 wake_up_idle_cpu(cpu);
651775 }
652776
653
-static inline bool got_nohz_idle_kick(void)
777
+static void nohz_csd_func(void *info)
654778 {
655
- int cpu = smp_processor_id();
656
-
657
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
658
- return false;
659
-
660
- if (idle_cpu(cpu) && !need_resched())
661
- return true;
779
+ struct rq *rq = info;
780
+ int cpu = cpu_of(rq);
781
+ unsigned int flags;
662782
663783 /*
664
- * We can't run Idle Load Balance on this CPU for this time so we
665
- * cancel it and clear NOHZ_BALANCE_KICK
784
+ * Release the rq::nohz_csd.
666785 */
667
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
668
- return false;
669
-}
786
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
787
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
670788
671
-#else /* CONFIG_NO_HZ_COMMON */
672
-
673
-static inline bool got_nohz_idle_kick(void)
674
-{
675
- return false;
789
+ rq->idle_balance = idle_cpu(cpu);
790
+ if (rq->idle_balance && !need_resched()) {
791
+ rq->nohz_idle_balance = flags;
792
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
793
+ }
676794 }
677795
678796 #endif /* CONFIG_NO_HZ_COMMON */
....@@ -763,18 +881,18 @@
763881 }
764882 #endif
765883
766
-static void set_load_weight(struct task_struct *p, bool update_load)
884
+static void set_load_weight(struct task_struct *p)
767885 {
886
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
768887 int prio = p->static_prio - MAX_RT_PRIO;
769888 struct load_weight *load = &p->se.load;
770889
771890 /*
772891 * SCHED_IDLE tasks get minimal weight:
773892 */
774
- if (idle_policy(p->policy)) {
893
+ if (task_has_idle_policy(p)) {
775894 load->weight = scale_load(WEIGHT_IDLEPRIO);
776895 load->inv_weight = WMULT_IDLEPRIO;
777
- p->se.runnable_weight = load->weight;
778896 return;
779897 }
780898
....@@ -787,7 +905,6 @@
787905 } else {
788906 load->weight = scale_load(sched_prio_to_weight[prio]);
789907 load->inv_weight = sched_prio_to_wmult[prio];
790
- p->se.runnable_weight = load->weight;
791908 }
792909 }
793910
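
A worked example of the weight lookup in set_load_weight() above (not part of the patch), assuming the standard sched_prio_to_weight[] table:

/*
 *   nice   0: prio = 120 - MAX_RT_PRIO(100) = 20 -> sched_prio_to_weight[20] = 1024
 *   nice -10: prio = 10                          -> sched_prio_to_weight[10] = 9548
 *   nice  19: prio = 39                          -> sched_prio_to_weight[39] = 15
 *   SCHED_IDLE tasks skip the table and use WEIGHT_IDLEPRIO (3) instead.
 */
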
....@@ -810,8 +927,46 @@
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we then have actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ *   * A task modifying its uclamp value with sched_setattr().
+ *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ *   * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
 
816971 /* Integer rounded range for each bucket */
817972 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
....@@ -822,11 +977,6 @@
822977 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
823978 {
824979 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
825
-}
826
-
827
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
828
-{
829
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
830980 }
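
A worked example of the bucket mapping above (not part of the patch), assuming the default CONFIG_UCLAMP_BUCKETS_COUNT of 5:

/*
 *   UCLAMP_BUCKET_DELTA    = DIV_ROUND_CLOSEST(1024, 5) = 205
 *   uclamp_bucket_id(0)    = min(   0 / 205, 4) = 0
 *   uclamp_bucket_id(512)  = min( 512 / 205, 4) = 2
 *   uclamp_bucket_id(1024) = min(1024 / 205, 4) = 4   (the top bucket)
 */
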
831981
832982 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
....@@ -868,7 +1018,7 @@
8681018 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
8691019 return;
8701020
871
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1021
+ uclamp_rq_set(rq, clamp_id, clamp_value);
8721022 }
8731023
8741024 static inline
....@@ -892,12 +1042,79 @@
8921042 return uclamp_idle_value(rq, clamp_id, clamp_value);
8931043 }
8941044
1045
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1046
+{
1047
+ unsigned int default_util_min;
1048
+ struct uclamp_se *uc_se;
1049
+
1050
+ lockdep_assert_held(&p->pi_lock);
1051
+
1052
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1053
+
1054
+ /* Only sync if user didn't override the default */
1055
+ if (uc_se->user_defined)
1056
+ return;
1057
+
1058
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1059
+ uclamp_se_set(uc_se, default_util_min, false);
1060
+}
1061
+
1062
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1063
+{
1064
+ struct rq_flags rf;
1065
+ struct rq *rq;
1066
+
1067
+ if (!rt_task(p))
1068
+ return;
1069
+
1070
+ /* Protect updates to p->uclamp_* */
1071
+ rq = task_rq_lock(p, &rf);
1072
+ __uclamp_update_util_min_rt_default(p);
1073
+ task_rq_unlock(rq, p, &rf);
1074
+}
1075
+
1076
+static void uclamp_sync_util_min_rt_default(void)
1077
+{
1078
+ struct task_struct *g, *p;
1079
+
1080
+ /*
1081
+ * copy_process() sysctl_uclamp
1082
+ * uclamp_min_rt = X;
1083
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1084
+ * // link thread smp_mb__after_spinlock()
1085
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1086
+ * sched_post_fork() for_each_process_thread()
1087
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1088
+ *
1089
+ * Ensures that either sched_post_fork() will observe the new
1090
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1091
+ * task.
1092
+ */
1093
+ read_lock(&tasklist_lock);
1094
+ smp_mb__after_spinlock();
1095
+ read_unlock(&tasklist_lock);
1096
+
1097
+ rcu_read_lock();
1098
+ for_each_process_thread(g, p)
1099
+ uclamp_update_util_min_rt_default(p);
1100
+ rcu_read_unlock();
1101
+}
1102
+
1103
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1104
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1105
+{
1106
+ uclamp_sync_util_min_rt_default();
1107
+}
1108
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1109
+#endif
1110
+
8951111 static inline struct uclamp_se
8961112 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8971113 {
1114
+ /* Copy by value as we could modify it */
8981115 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8991116 #ifdef CONFIG_UCLAMP_TASK_GROUP
900
- struct uclamp_se uc_max;
1117
+ unsigned int tg_min, tg_max, value;
9011118
9021119 /*
9031120 * Tasks in autogroups or root task group will be
....@@ -908,9 +1125,11 @@
9081125 if (task_group(p) == &root_task_group)
9091126 return uc_req;
9101127
911
- uc_max = task_group(p)->uclamp[clamp_id];
912
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
913
- return uc_max;
1128
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1129
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1130
+ value = uc_req.value;
1131
+ value = clamp(value, tg_min, tg_max);
1132
+ uclamp_se_set(&uc_req, value, false);
9141133 #endif
9151134
9161135 return uc_req;
....@@ -929,6 +1148,12 @@
9291148 {
9301149 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
9311150 struct uclamp_se uc_max = uclamp_default[clamp_id];
1151
+ struct uclamp_se uc_eff;
1152
+ int ret = 0;
1153
+
1154
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1155
+ if (ret)
1156
+ return uc_eff;
9321157
9331158 /* System default restrictions always apply */
9341159 if (unlikely(uc_req.value > uc_max.value))
....@@ -949,6 +1174,7 @@
9491174
9501175 return (unsigned long)uc_eff.value;
9511176 }
1177
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
9521178
9531179 /*
9541180 * When a task is enqueued on a rq, the clamp bucket currently defined by the
....@@ -985,8 +1211,8 @@
9851211 if (bucket->tasks == 1 || uc_se->value > bucket->value)
9861212 bucket->value = uc_se->value;
9871213
988
- if (uc_se->value > READ_ONCE(uc_rq->value))
989
- WRITE_ONCE(uc_rq->value, uc_se->value);
1214
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
1215
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
9901216 }
9911217
9921218 /*
....@@ -1009,10 +1235,38 @@
10091235
10101236 lockdep_assert_held(&rq->lock);
10111237
1238
+ /*
1239
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1240
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1241
+ *
1242
+ * In this case the uc_se->active flag should be false since no uclamp
1243
+ * accounting was performed at enqueue time and we can just return
1244
+ * here.
1245
+ *
1246
+ * Need to be careful of the following enqeueue/dequeue ordering
1247
+ * problem too
1248
+ *
1249
+ * enqueue(taskA)
1250
+ * // sched_uclamp_used gets enabled
1251
+ * enqueue(taskB)
1252
+ * dequeue(taskA)
1253
+ * // Must not decrement bucket->tasks here
1254
+ * dequeue(taskB)
1255
+ *
1256
+ * where we could end up with stale data in uc_se and
1257
+ * bucket[uc_se->bucket_id].
1258
+ *
1259
+ * The following check here eliminates the possibility of such race.
1260
+ */
1261
+ if (unlikely(!uc_se->active))
1262
+ return;
1263
+
10121264 bucket = &uc_rq->bucket[uc_se->bucket_id];
1265
+
10131266 SCHED_WARN_ON(!bucket->tasks);
10141267 if (likely(bucket->tasks))
10151268 bucket->tasks--;
1269
+
10161270 uc_se->active = false;
10171271
10181272 /*
....@@ -1024,7 +1278,7 @@
10241278 if (likely(bucket->tasks))
10251279 return;
10261280
1027
- rq_clamp = READ_ONCE(uc_rq->value);
1281
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
10281282 /*
10291283 * Defensive programming: this should never happen. If it happens,
10301284 * e.g. due to future modification, warn and fixup the expected value.
....@@ -1032,13 +1286,22 @@
10321286 SCHED_WARN_ON(bucket->value > rq_clamp);
10331287 if (bucket->value >= rq_clamp) {
10341288 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1035
- WRITE_ONCE(uc_rq->value, bkt_clamp);
1289
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
10361290 }
10371291 }
10381292
10391293 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
10401294 {
10411295 enum uclamp_id clamp_id;
1296
+
1297
+ /*
1298
+ * Avoid any overhead until uclamp is actually used by the userspace.
1299
+ *
1300
+ * The condition is constructed such that a NOP is generated when
1301
+ * sched_uclamp_used is disabled.
1302
+ */
1303
+ if (!static_branch_unlikely(&sched_uclamp_used))
1304
+ return;
10421305
10431306 if (unlikely(!p->sched_class->uclamp_enabled))
10441307 return;
....@@ -1055,6 +1318,15 @@
10551318 {
10561319 enum uclamp_id clamp_id;
10571320
1321
+ /*
1322
+ * Avoid any overhead until uclamp is actually used by the userspace.
1323
+ *
1324
+ * The condition is constructed such that a NOP is generated when
1325
+ * sched_uclamp_used is disabled.
1326
+ */
1327
+ if (!static_branch_unlikely(&sched_uclamp_used))
1328
+ return;
1329
+
10581330 if (unlikely(!p->sched_class->uclamp_enabled))
10591331 return;
10601332
....@@ -1062,9 +1334,27 @@
10621334 uclamp_rq_dec_id(rq, p, clamp_id);
10631335 }
10641336
1065
-static inline void
1066
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1337
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1338
+ enum uclamp_id clamp_id)
10671339 {
1340
+ if (!p->uclamp[clamp_id].active)
1341
+ return;
1342
+
1343
+ uclamp_rq_dec_id(rq, p, clamp_id);
1344
+ uclamp_rq_inc_id(rq, p, clamp_id);
1345
+
1346
+ /*
1347
+ * Make sure to clear the idle flag if we've transiently reached 0
1348
+ * active tasks on rq.
1349
+ */
1350
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1351
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1352
+}
1353
+
1354
+static inline void
1355
+uclamp_update_active(struct task_struct *p)
1356
+{
1357
+ enum uclamp_id clamp_id;
10681358 struct rq_flags rf;
10691359 struct rq *rq;
10701360
....@@ -1084,30 +1374,22 @@
10841374 * affecting a valid clamp bucket, the next time it's enqueued,
10851375 * it will already see the updated clamp bucket value.
10861376 */
1087
- if (p->uclamp[clamp_id].active) {
1088
- uclamp_rq_dec_id(rq, p, clamp_id);
1089
- uclamp_rq_inc_id(rq, p, clamp_id);
1090
- }
1377
+ for_each_clamp_id(clamp_id)
1378
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10911379
10921380 task_rq_unlock(rq, p, &rf);
10931381 }
10941382
10951383 #ifdef CONFIG_UCLAMP_TASK_GROUP
10961384 static inline void
1097
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1098
- unsigned int clamps)
1385
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10991386 {
1100
- enum uclamp_id clamp_id;
11011387 struct css_task_iter it;
11021388 struct task_struct *p;
11031389
11041390 css_task_iter_start(css, 0, &it);
1105
- while ((p = css_task_iter_next(&it))) {
1106
- for_each_clamp_id(clamp_id) {
1107
- if ((0x1 << clamp_id) & clamps)
1108
- uclamp_update_active(p, clamp_id);
1109
- }
1110
- }
1391
+ while ((p = css_task_iter_next(&it)))
1392
+ uclamp_update_active(p);
11111393 css_task_iter_end(&it);
11121394 }
11131395
....@@ -1130,16 +1412,16 @@
11301412 #endif
11311413
11321414 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1133
- void __user *buffer, size_t *lenp,
1134
- loff_t *ppos)
1415
+ void *buffer, size_t *lenp, loff_t *ppos)
11351416 {
11361417 bool update_root_tg = false;
1137
- int old_min, old_max;
1418
+ int old_min, old_max, old_min_rt;
11381419 int result;
11391420
11401421 mutex_lock(&uclamp_mutex);
11411422 old_min = sysctl_sched_uclamp_util_min;
11421423 old_max = sysctl_sched_uclamp_util_max;
1424
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
11431425
11441426 result = proc_dointvec(table, write, buffer, lenp, ppos);
11451427 if (result)
....@@ -1148,7 +1430,9 @@
11481430 goto done;
11491431
11501432 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1151
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1433
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1434
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1435
+
11521436 result = -EINVAL;
11531437 goto undo;
11541438 }
....@@ -1164,8 +1448,15 @@
11641448 update_root_tg = true;
11651449 }
11661450
1167
- if (update_root_tg)
1451
+ if (update_root_tg) {
1452
+ static_branch_enable(&sched_uclamp_used);
11681453 uclamp_update_root_tg();
1454
+ }
1455
+
1456
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1457
+ static_branch_enable(&sched_uclamp_used);
1458
+ uclamp_sync_util_min_rt_default();
1459
+ }
11691460
11701461 /*
11711462 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1178,6 +1469,7 @@
11781469 undo:
11791470 sysctl_sched_uclamp_util_min = old_min;
11801471 sysctl_sched_uclamp_util_max = old_max;
1472
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11811473 done:
11821474 mutex_unlock(&uclamp_mutex);
11831475
....@@ -1187,20 +1479,61 @@
11871479 static int uclamp_validate(struct task_struct *p,
11881480 const struct sched_attr *attr)
11891481 {
1190
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1191
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1482
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1483
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11921484
1193
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1194
- lower_bound = attr->sched_util_min;
1195
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1196
- upper_bound = attr->sched_util_max;
1485
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1486
+ util_min = attr->sched_util_min;
11971487
1198
- if (lower_bound > upper_bound)
1488
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1489
+ return -EINVAL;
1490
+ }
1491
+
1492
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1493
+ util_max = attr->sched_util_max;
1494
+
1495
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1496
+ return -EINVAL;
1497
+ }
1498
+
1499
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11991500 return -EINVAL;
1200
- if (upper_bound > SCHED_CAPACITY_SCALE)
1201
- return -EINVAL;
1501
+
1502
+ /*
1503
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1504
+ *
1505
+ * We need to do that here, because enabling static branches is a
1506
+ * blocking operation which obviously cannot be done while holding
1507
+ * scheduler locks.
1508
+ */
1509
+ static_branch_enable(&sched_uclamp_used);
12021510
12031511 return 0;
1512
+}
1513
+
1514
+static bool uclamp_reset(const struct sched_attr *attr,
1515
+ enum uclamp_id clamp_id,
1516
+ struct uclamp_se *uc_se)
1517
+{
1518
+ /* Reset on sched class change for a non user-defined clamp value. */
1519
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1520
+ !uc_se->user_defined)
1521
+ return true;
1522
+
1523
+ /* Reset on sched_util_{min,max} == -1. */
1524
+ if (clamp_id == UCLAMP_MIN &&
1525
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1526
+ attr->sched_util_min == -1) {
1527
+ return true;
1528
+ }
1529
+
1530
+ if (clamp_id == UCLAMP_MAX &&
1531
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1532
+ attr->sched_util_max == -1) {
1533
+ return true;
1534
+ }
1535
+
1536
+ return false;
12041537 }
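
For reference, a hedged sketch of what the sched_util_{min,max} == -1 "reset" handled by uclamp_reset() above looks like from userspace (not part of the patch; the wrapper name is hypothetical, and the raw syscall is used since glibc does not wrap sched_setattr()):

#include <linux/sched.h>		/* SCHED_FLAG_* */
#include <linux/sched/types.h>		/* struct sched_attr */
#include <sys/syscall.h>
#include <unistd.h>

/* Reset a task's UCLAMP_MIN request back to the system default. */
static int example_reset_uclamp_min(pid_t pid)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_flags	= SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN,
		.sched_util_min	= -1,	/* treated as "reset" by uclamp_reset() */
	};

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}
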
12051538
12061539 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1208,40 +1541,41 @@
12081541 {
12091542 enum uclamp_id clamp_id;
12101543
1211
- /*
1212
- * On scheduling class change, reset to default clamps for tasks
1213
- * without a task-specific value.
1214
- */
12151544 for_each_clamp_id(clamp_id) {
12161545 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1217
- unsigned int clamp_value = uclamp_none(clamp_id);
1546
+ unsigned int value;
12181547
1219
- /* Keep using defined clamps across class changes */
1220
- if (uc_se->user_defined)
1548
+ if (!uclamp_reset(attr, clamp_id, uc_se))
12211549 continue;
12221550
1223
- /* By default, RT tasks always get 100% boost */
1224
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1225
- unlikely(rt_task(p) &&
1226
- clamp_id == UCLAMP_MIN)) {
1551
+ /*
1552
+ * RT by default have a 100% boost value that could be modified
1553
+ * at runtime.
1554
+ */
1555
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1556
+ value = sysctl_sched_uclamp_util_min_rt_default;
1557
+ else
1558
+ value = uclamp_none(clamp_id);
12271559
1228
- clamp_value = uclamp_none(UCLAMP_MAX);
1229
- }
1560
+ uclamp_se_set(uc_se, value, false);
12301561
1231
- uclamp_se_set(uc_se, clamp_value, false);
12321562 }
12331563
12341564 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
12351565 return;
12361566
1237
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1568
+ attr->sched_util_min != -1) {
12381569 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
12391570 attr->sched_util_min, true);
1571
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
12401572 }
12411573
1242
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1574
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1575
+ attr->sched_util_max != -1) {
12431576 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
12441577 attr->sched_util_max, true);
1578
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
12451579 }
12461580 }
12471581
....@@ -1249,6 +1583,10 @@
12491583 {
12501584 enum uclamp_id clamp_id;
12511585
1586
+ /*
1587
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1588
+ * as the task is still at its early fork stages.
1589
+ */
12521590 for_each_clamp_id(clamp_id)
12531591 p->uclamp[clamp_id].active = false;
12541592
....@@ -1261,39 +1599,24 @@
12611599 }
12621600 }
12631601
1264
-#ifdef CONFIG_SMP
1265
-unsigned int uclamp_task(struct task_struct *p)
1602
+static void uclamp_post_fork(struct task_struct *p)
12661603 {
1267
- unsigned long util;
1268
-
1269
- util = task_util_est(p);
1270
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1271
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1272
-
1273
- return util;
1604
+ uclamp_update_util_min_rt_default(p);
12741605 }
12751606
1276
-bool uclamp_boosted(struct task_struct *p)
1607
+static void __init init_uclamp_rq(struct rq *rq)
12771608 {
1278
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1609
+ enum uclamp_id clamp_id;
1610
+ struct uclamp_rq *uc_rq = rq->uclamp;
1611
+
1612
+ for_each_clamp_id(clamp_id) {
1613
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1614
+ .value = uclamp_none(clamp_id)
1615
+ };
1616
+ }
1617
+
1618
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12791619 }
1280
-
1281
-bool uclamp_latency_sensitive(struct task_struct *p)
1282
-{
1283
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1284
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1285
- struct task_group *tg;
1286
-
1287
- if (!css)
1288
- return false;
1289
- tg = container_of(css, struct task_group, css);
1290
-
1291
- return tg->latency_sensitive;
1292
-#else
1293
- return false;
1294
-#endif
1295
-}
1296
-#endif /* CONFIG_SMP */
12971620
12981621 static void __init init_uclamp(void)
12991622 {
....@@ -1301,13 +1624,8 @@
13011624 enum uclamp_id clamp_id;
13021625 int cpu;
13031626
1304
- mutex_init(&uclamp_mutex);
1305
-
1306
- for_each_possible_cpu(cpu) {
1307
- memset(&cpu_rq(cpu)->uclamp, 0,
1308
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1309
- cpu_rq(cpu)->uclamp_flags = 0;
1310
- }
1627
+ for_each_possible_cpu(cpu)
1628
+ init_uclamp_rq(cpu_rq(cpu));
13111629
13121630 for_each_clamp_id(clamp_id) {
13131631 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1336,41 +1654,7 @@
13361654 static void __setscheduler_uclamp(struct task_struct *p,
13371655 const struct sched_attr *attr) { }
13381656 static inline void uclamp_fork(struct task_struct *p) { }
1339
-
1340
-long schedtune_task_margin(struct task_struct *task);
1341
-
1342
-#ifdef CONFIG_SMP
1343
-unsigned int uclamp_task(struct task_struct *p)
1344
-{
1345
- unsigned long util = task_util_est(p);
1346
-#ifdef CONFIG_SCHED_TUNE
1347
- long margin = schedtune_task_margin(p);
1348
-
1349
- trace_sched_boost_task(p, util, margin);
1350
-
1351
- util += margin;
1352
-#endif
1353
-
1354
- return util;
1355
-}
1356
-
1357
-bool uclamp_boosted(struct task_struct *p)
1358
-{
1359
-#ifdef CONFIG_SCHED_TUNE
1360
- return schedtune_task_boost(p) > 0;
1361
-#endif
1362
- return false;
1363
-}
1364
-
1365
-bool uclamp_latency_sensitive(struct task_struct *p)
1366
-{
1367
-#ifdef CONFIG_SCHED_TUNE
1368
- return schedtune_prefer_idle(p) != 0;
1369
-#endif
1370
- return false;
1371
-}
1372
-#endif /* CONFIG_SMP */
1373
-
1657
+static inline void uclamp_post_fork(struct task_struct *p) { }
13741658 static inline void init_uclamp(void) { }
13751659 #endif /* CONFIG_UCLAMP_TASK */
13761660
....@@ -1385,7 +1669,9 @@
13851669 }
13861670
13871671 uclamp_rq_inc(rq, p);
1672
+ trace_android_rvh_enqueue_task(rq, p, flags);
13881673 p->sched_class->enqueue_task(rq, p, flags);
1674
+ trace_android_rvh_after_enqueue_task(rq, p);
13891675 }
13901676
13911677 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1399,31 +1685,42 @@
13991685 }
14001686
14011687 uclamp_rq_dec(rq, p);
1688
+ trace_android_rvh_dequeue_task(rq, p, flags);
14021689 p->sched_class->dequeue_task(rq, p, flags);
1690
+ trace_android_rvh_after_dequeue_task(rq, p);
14031691 }
14041692
14051693 void activate_task(struct rq *rq, struct task_struct *p, int flags)
14061694 {
1407
- if (task_contributes_to_load(p))
1408
- rq->nr_uninterruptible--;
1695
+ if (task_on_rq_migrating(p))
1696
+ flags |= ENQUEUE_MIGRATED;
14091697
14101698 enqueue_task(rq, p, flags);
1699
+
1700
+ p->on_rq = TASK_ON_RQ_QUEUED;
14111701 }
1702
+EXPORT_SYMBOL_GPL(activate_task);
14121703
14131704 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
14141705 {
1415
- if (task_contributes_to_load(p))
1416
- rq->nr_uninterruptible++;
1706
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
14171707
14181708 dequeue_task(rq, p, flags);
14191709 }
1710
+EXPORT_SYMBOL_GPL(deactivate_task);
14201711
1421
-/*
1422
- * __normal_prio - return the priority that is based on the static prio
1423
- */
1424
-static inline int __normal_prio(struct task_struct *p)
1712
+static inline int __normal_prio(int policy, int rt_prio, int nice)
14251713 {
1426
- return p->static_prio;
1714
+ int prio;
1715
+
1716
+ if (dl_policy(policy))
1717
+ prio = MAX_DL_PRIO - 1;
1718
+ else if (rt_policy(policy))
1719
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1720
+ else
1721
+ prio = NICE_TO_PRIO(nice);
1722
+
1723
+ return prio;
14271724 }
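
Worked values for the priority mapping above (not part of the patch), using the usual constants MAX_RT_PRIO = 100 and MAX_DL_PRIO = 0; a lower number means a higher effective priority:

/*
 *   SCHED_NORMAL, nice   0:  prio = NICE_TO_PRIO(0)      = 120
 *   SCHED_NORMAL, nice -20:  prio = NICE_TO_PRIO(-20)    = 100
 *   SCHED_FIFO,  rt_prio 50: prio = MAX_RT_PRIO - 1 - 50 = 49
 *   SCHED_DEADLINE:          prio = MAX_DL_PRIO - 1      = -1
 */
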
14281725
14291726 /*
....@@ -1435,15 +1732,7 @@
14351732 */
14361733 static inline int normal_prio(struct task_struct *p)
14371734 {
1438
- int prio;
1439
-
1440
- if (task_has_dl_policy(p))
1441
- prio = MAX_DL_PRIO-1;
1442
- else if (task_has_rt_policy(p))
1443
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1444
- else
1445
- prio = __normal_prio(p);
1446
- return prio;
1735
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
14471736 }
14481737
14491738 /*
....@@ -1499,20 +1788,10 @@
14991788
15001789 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
15011790 {
1502
- const struct sched_class *class;
1503
-
1504
- if (p->sched_class == rq->curr->sched_class) {
1791
+ if (p->sched_class == rq->curr->sched_class)
15051792 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1506
- } else {
1507
- for_each_class(class) {
1508
- if (class == rq->curr->sched_class)
1509
- break;
1510
- if (class == p->sched_class) {
1511
- resched_curr(rq);
1512
- break;
1513
- }
1514
- }
1515
- }
1793
+ else if (p->sched_class > rq->curr->sched_class)
1794
+ resched_curr(rq);
15161795
15171796 /*
15181797 * A queue event has occurred, and we're going to schedule. In
....@@ -1521,22 +1800,12 @@
15211800 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
15221801 rq_clock_skip_update(rq);
15231802 }
1803
+EXPORT_SYMBOL_GPL(check_preempt_curr);
15241804
15251805 #ifdef CONFIG_SMP
15261806
1527
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1528
-{
1529
- if (!(p->flags & PF_KTHREAD))
1530
- return false;
1531
-
1532
- if (p->nr_cpus_allowed != 1)
1533
- return false;
1534
-
1535
- return true;
1536
-}
1537
-
15381807 /*
1539
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1808
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
15401809 * __set_cpus_allowed_ptr() and select_fallback_rq().
15411810 */
15421811 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
....@@ -1544,10 +1813,13 @@
15441813 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
15451814 return false;
15461815
1547
- if (is_per_cpu_kthread(p) || __migrate_disabled(p))
1816
+ if (is_per_cpu_kthread(p))
15481817 return cpu_online(cpu);
15491818
1550
- return cpu_active(cpu);
1819
+ if (!cpu_active(cpu))
1820
+ return false;
1821
+
1822
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
15511823 }
15521824
15531825 /*
....@@ -1572,19 +1844,29 @@
15721844 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15731845 struct task_struct *p, int new_cpu)
15741846 {
1847
+ int detached = 0;
1848
+
15751849 lockdep_assert_held(&rq->lock);
15761850
1577
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1578
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1579
- set_task_cpu(p, new_cpu);
1580
- rq_unlock(rq, rf);
1851
+ /*
1852
+ * The vendor hook may drop the lock temporarily, so
1853
+ * pass the rq flags to unpin lock. We expect the
1854
+ * rq lock to be held after return.
1855
+ */
1856
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1857
+ if (detached)
1858
+ goto attach;
15811859
1860
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1861
+ set_task_cpu(p, new_cpu);
1862
+
1863
+attach:
1864
+ rq_unlock(rq, rf);
15821865 rq = cpu_rq(new_cpu);
15831866
15841867 rq_lock(rq, rf);
15851868 BUG_ON(task_cpu(p) != new_cpu);
1586
- enqueue_task(rq, p, 0);
1587
- p->on_rq = TASK_ON_RQ_QUEUED;
1869
+ activate_task(rq, p, 0);
15881870 check_preempt_curr(rq, p, 0);
15891871
15901872 return rq;
....@@ -1593,7 +1875,6 @@
15931875 struct migration_arg {
15941876 struct task_struct *task;
15951877 int dest_cpu;
1596
- bool done;
15971878 };
15981879
15991880 /*
....@@ -1629,11 +1910,6 @@
16291910 struct task_struct *p = arg->task;
16301911 struct rq *rq = this_rq();
16311912 struct rq_flags rf;
1632
- int dest_cpu = arg->dest_cpu;
1633
-
1634
- /* We don't look at arg after this point. */
1635
- smp_mb();
1636
- arg->done = true;
16371913
16381914 /*
16391915 * The original target CPU might have gone down and we might
....@@ -1645,7 +1921,7 @@
16451921 * __migrate_task() such that we will not miss enforcing cpus_ptr
16461922 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
16471923 */
1648
- sched_ttwu_pending();
1924
+ flush_smp_call_function_from_idle();
16491925
16501926 raw_spin_lock(&p->pi_lock);
16511927 rq_lock(rq, &rf);
....@@ -1656,9 +1932,9 @@
16561932 */
16571933 if (task_rq(p) == rq) {
16581934 if (task_on_rq_queued(p))
1659
- rq = __migrate_task(rq, &rf, p, dest_cpu);
1935
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
16601936 else
1661
- p->wake_cpu = dest_cpu;
1937
+ p->wake_cpu = arg->dest_cpu;
16621938 }
16631939 rq_unlock(rq, &rf);
16641940 raw_spin_unlock(&p->pi_lock);
....@@ -1674,17 +1950,9 @@
16741950 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
16751951 {
16761952 cpumask_copy(&p->cpus_mask, new_mask);
1677
- if (p->cpus_ptr == &p->cpus_mask)
1678
- p->nr_cpus_allowed = cpumask_weight(new_mask);
1953
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
1954
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16791955 }
1680
-
1681
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1682
-int __migrate_disabled(struct task_struct *p)
1683
-{
1684
- return p->migrate_disable;
1685
-}
1686
-EXPORT_SYMBOL_GPL(__migrate_disabled);
1687
-#endif
16881956
16891957 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16901958 {
....@@ -1712,28 +1980,23 @@
17121980 if (queued)
17131981 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
17141982 if (running)
1715
- set_curr_task(rq, p);
1983
+ set_next_task(rq, p);
17161984 }
17171985
17181986 /*
1719
- * Change a given task's CPU affinity. Migrate the thread to a
1720
- * proper CPU and schedule it away if the CPU it's executing on
1721
- * is removed from the allowed bitmask.
1722
- *
1723
- * NOTE: the caller must have a valid reference to the task, the
1724
- * task must not exit() & deallocate itself prematurely. The
1725
- * call is not atomic; no spinlocks may be held.
1987
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
17261988 */
1727
-static int __set_cpus_allowed_ptr(struct task_struct *p,
1728
- const struct cpumask *new_mask, bool check)
1989
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
1990
+ const struct cpumask *new_mask,
1991
+ bool check,
1992
+ struct rq *rq,
1993
+ struct rq_flags *rf)
17291994 {
17301995 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1996
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
17311997 unsigned int dest_cpu;
1732
- struct rq_flags rf;
1733
- struct rq *rq;
17341998 int ret = 0;
17351999
1736
- rq = task_rq_lock(p, &rf);
17372000 update_rq_clock(rq);
17382001
17392002 if (p->flags & PF_KTHREAD) {
....@@ -1741,6 +2004,9 @@
17412004 * Kernel threads are allowed on online && !active CPUs
17422005 */
17432006 cpu_valid_mask = cpu_online_mask;
2007
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2008
+ ret = -EINVAL;
2009
+ goto out;
17442010 }
17452011
17462012 /*
....@@ -1755,7 +2021,12 @@
17552021 if (cpumask_equal(&p->cpus_mask, new_mask))
17562022 goto out;
17572023
1758
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2024
+ /*
2025
+ * Picking a ~random cpu helps in cases where we are changing affinity
2026
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2027
+ * immediately required to distribute the tasks within their new mask.
2028
+ */
2029
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
17592030 if (dest_cpu >= nr_cpu_ids) {
17602031 ret = -EINVAL;
17612032 goto out;
....@@ -1774,28 +2045,45 @@
17742045 }
17752046
17762047 /* Can the task run on the task's current CPU? If so, we're done */
1777
- if (cpumask_test_cpu(task_cpu(p), new_mask) ||
1778
- p->cpus_ptr != &p->cpus_mask)
2048
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
17792049 goto out;
17802050
17812051 if (task_running(rq, p) || p->state == TASK_WAKING) {
17822052 struct migration_arg arg = { p, dest_cpu };
17832053 /* Need help from migration thread: drop lock and wait. */
1784
- task_rq_unlock(rq, p, &rf);
2054
+ task_rq_unlock(rq, p, rf);
17852055 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1786
- tlb_migrate_finish(p->mm);
17872056 return 0;
17882057 } else if (task_on_rq_queued(p)) {
17892058 /*
17902059 * OK, since we're going to drop the lock immediately
17912060 * afterwards anyway.
17922061 */
1793
- rq = move_queued_task(rq, &rf, p, dest_cpu);
2062
+ rq = move_queued_task(rq, rf, p, dest_cpu);
17942063 }
17952064 out:
1796
- task_rq_unlock(rq, p, &rf);
2065
+ task_rq_unlock(rq, p, rf);
17972066
17982067 return ret;
2068
+}
2069
+
2070
+/*
2071
+ * Change a given task's CPU affinity. Migrate the thread to a
2072
+ * proper CPU and schedule it away if the CPU it's executing on
2073
+ * is removed from the allowed bitmask.
2074
+ *
2075
+ * NOTE: the caller must have a valid reference to the task, the
2076
+ * task must not exit() & deallocate itself prematurely. The
2077
+ * call is not atomic; no spinlocks may be held.
2078
+ */
2079
+static int __set_cpus_allowed_ptr(struct task_struct *p,
2080
+ const struct cpumask *new_mask, bool check)
2081
+{
2082
+ struct rq_flags rf;
2083
+ struct rq *rq;
2084
+
2085
+ rq = task_rq_lock(p, &rf);
2086
+ return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
17992087 }
18002088
18012089 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1803,6 +2091,74 @@
18032091 return __set_cpus_allowed_ptr(p, new_mask, false);
18042092 }
18052093 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2094
+
2095
+/*
2096
+ * Change a given task's CPU affinity to the intersection of its current
2097
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2098
+ * If the resulting mask is empty, leave the affinity unchanged and return
2099
+ * -EINVAL.
2100
+ */
2101
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2102
+ struct cpumask *new_mask,
2103
+ const struct cpumask *subset_mask)
2104
+{
2105
+ struct rq_flags rf;
2106
+ struct rq *rq;
2107
+
2108
+ rq = task_rq_lock(p, &rf);
2109
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2110
+ task_rq_unlock(rq, p, &rf);
2111
+ return -EINVAL;
2112
+ }
2113
+
2114
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2115
+}
2116
+
2117
+/*
2118
+ * Restrict a given task's CPU affinity so that it is a subset of
2119
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2120
+ * walk up the cpuset hierarchy until we find a suitable mask.
2121
+ */
2122
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2123
+{
2124
+ cpumask_var_t new_mask;
2125
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2126
+
2127
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2128
+
2129
+ /*
2130
+ * __migrate_task() can fail silently in the face of concurrent
2131
+ * offlining of the chosen destination CPU, so take the hotplug
2132
+ * lock to ensure that the migration succeeds.
2133
+ */
2134
+ trace_android_rvh_force_compatible_pre(NULL);
2135
+ cpus_read_lock();
2136
+ if (!cpumask_available(new_mask))
2137
+ goto out_set_mask;
2138
+
2139
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2140
+ goto out_free_mask;
2141
+
2142
+ /*
2143
+ * We failed to find a valid subset of the affinity mask for the
2144
+ * task, so override it based on its cpuset hierarchy.
2145
+ */
2146
+ cpuset_cpus_allowed(p, new_mask);
2147
+ override_mask = new_mask;
2148
+
2149
+out_set_mask:
2150
+ if (printk_ratelimit()) {
2151
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2152
+ task_pid_nr(p), p->comm,
2153
+ cpumask_pr_args(override_mask));
2154
+ }
2155
+
2156
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2157
+out_free_mask:
2158
+ cpus_read_unlock();
2159
+ trace_android_rvh_force_compatible_post(NULL);
2160
+ free_cpumask_var(new_mask);
2161
+}
18062162
18072163 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
18082164 {
....@@ -1851,12 +2207,13 @@
18512207 p->se.nr_migrations++;
18522208 rseq_migrate(p);
18532209 perf_event_task_migrate(p);
2210
+ trace_android_rvh_set_task_cpu(p, new_cpu);
18542211 }
18552212
18562213 __set_task_cpu(p, new_cpu);
18572214 }
2215
+EXPORT_SYMBOL_GPL(set_task_cpu);
18582216
1859
-#ifdef CONFIG_NUMA_BALANCING
18602217 static void __migrate_swap_task(struct task_struct *p, int cpu)
18612218 {
18622219 if (task_on_rq_queued(p)) {
....@@ -1869,11 +2226,9 @@
18692226 rq_pin_lock(src_rq, &srf);
18702227 rq_pin_lock(dst_rq, &drf);
18712228
1872
- p->on_rq = TASK_ON_RQ_MIGRATING;
18732229 deactivate_task(src_rq, p, 0);
18742230 set_task_cpu(p, cpu);
18752231 activate_task(dst_rq, p, 0);
1876
- p->on_rq = TASK_ON_RQ_QUEUED;
18772232 check_preempt_curr(dst_rq, p, 0);
18782233
18792234 rq_unpin_lock(dst_rq, &drf);
....@@ -1973,19 +2328,7 @@
19732328 out:
19742329 return ret;
19752330 }
1976
-#endif /* CONFIG_NUMA_BALANCING */
1977
-
1978
-static bool check_task_state(struct task_struct *p, long match_state)
1979
-{
1980
- bool match = false;
1981
-
1982
- raw_spin_lock_irq(&p->pi_lock);
1983
- if (p->state == match_state || p->saved_state == match_state)
1984
- match = true;
1985
- raw_spin_unlock_irq(&p->pi_lock);
1986
-
1987
- return match;
1988
-}
2331
+EXPORT_SYMBOL_GPL(migrate_swap);
19892332
19902333 /*
19912334 * wait_task_inactive - wait for a thread to unschedule.
....@@ -2031,7 +2374,7 @@
20312374 * is actually now running somewhere else!
20322375 */
20332376 while (task_running(rq, p)) {
2034
- if (match_state && !check_task_state(p, match_state))
2377
+ if (match_state && unlikely(p->state != match_state))
20352378 return 0;
20362379 cpu_relax();
20372380 }
....@@ -2046,8 +2389,7 @@
20462389 running = task_running(rq, p);
20472390 queued = task_on_rq_queued(p);
20482391 ncsw = 0;
2049
- if (!match_state || p->state == match_state ||
2050
- p->saved_state == match_state)
2392
+ if (!match_state || p->state == match_state)
20512393 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
20522394 task_rq_unlock(rq, p, &rf);
20532395
....@@ -2148,7 +2490,11 @@
21482490 int nid = cpu_to_node(cpu);
21492491 const struct cpumask *nodemask = NULL;
21502492 enum { cpuset, possible, fail } state = cpuset;
2151
- int dest_cpu;
2493
+ int dest_cpu = -1;
2494
+
2495
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
2496
+ if (dest_cpu >= 0)
2497
+ return dest_cpu;
21522498
21532499 /*
21542500 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2160,9 +2506,7 @@
21602506
21612507 /* Look for allowed, online CPU in same node. */
21622508 for_each_cpu(dest_cpu, nodemask) {
2163
- if (!cpu_active(dest_cpu))
2164
- continue;
2165
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2509
+ if (is_cpu_allowed(p, dest_cpu))
21662510 return dest_cpu;
21672511 }
21682512 }
....@@ -2184,12 +2528,11 @@
21842528 state = possible;
21852529 break;
21862530 }
2187
- /* Fall-through */
2531
+ fallthrough;
21882532 case possible:
2189
- do_set_cpus_allowed(p, cpu_possible_mask);
2533
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21902534 state = fail;
21912535 break;
2192
-
21932536 case fail:
21942537 BUG();
21952538 break;
....@@ -2216,14 +2559,12 @@
22162559 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22172560 */
22182561 static inline
2219
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2220
- int sibling_count_hint)
2562
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22212563 {
22222564 lockdep_assert_held(&p->pi_lock);
22232565
22242566 if (p->nr_cpus_allowed > 1)
2225
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2226
- sibling_count_hint);
2567
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22272568 else
22282569 cpu = cpumask_any(p->cpus_ptr);
22292570
....@@ -2241,12 +2582,6 @@
22412582 cpu = select_fallback_rq(task_cpu(p), p);
22422583
22432584 return cpu;
2244
-}
2245
-
2246
-static void update_avg(u64 *avg, u64 sample)
2247
-{
2248
- s64 diff = sample - *avg;
2249
- *avg += diff >> 3;
22502585 }
22512586
22522587 void sched_set_stop_task(int cpu, struct task_struct *stop)
....@@ -2328,12 +2663,6 @@
23282663 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
23292664 }
23302665
2331
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2332
-{
2333
- activate_task(rq, p, en_flags);
2334
- p->on_rq = TASK_ON_RQ_QUEUED;
2335
-}
2336
-
23372666 /*
23382667 * Mark the task runnable and perform wakeup-preemption.
23392668 */
....@@ -2375,27 +2704,54 @@
23752704 {
23762705 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
23772706
2707
+ if (wake_flags & WF_SYNC)
2708
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
2709
+
23782710 lockdep_assert_held(&rq->lock);
23792711
2380
-#ifdef CONFIG_SMP
23812712 if (p->sched_contributes_to_load)
23822713 rq->nr_uninterruptible--;
23832714
2715
+#ifdef CONFIG_SMP
23842716 if (wake_flags & WF_MIGRATED)
23852717 en_flags |= ENQUEUE_MIGRATED;
2718
+ else
23862719 #endif
2720
+ if (p->in_iowait) {
2721
+ delayacct_blkio_end(p);
2722
+ atomic_dec(&task_rq(p)->nr_iowait);
2723
+ }
23872724
2388
- ttwu_activate(rq, p, en_flags);
2725
+ activate_task(rq, p, en_flags);
23892726 ttwu_do_wakeup(rq, p, wake_flags, rf);
23902727 }
23912728
23922729 /*
2393
- * Called in case the task @p isn't fully descheduled from its runqueue,
2394
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2395
- * since all we need to do is flip p->state to TASK_RUNNING, since
2396
- * the task is still ->on_rq.
2730
+ * Consider @p being inside a wait loop:
2731
+ *
2732
+ * for (;;) {
2733
+ * set_current_state(TASK_UNINTERRUPTIBLE);
2734
+ *
2735
+ * if (CONDITION)
2736
+ * break;
2737
+ *
2738
+ * schedule();
2739
+ * }
2740
+ * __set_current_state(TASK_RUNNING);
2741
+ *
2742
+ * between set_current_state() and schedule(). In this case @p is still
2743
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2744
+ * an atomic manner.
2745
+ *
2746
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2747
+ * then schedule() must still happen and p->state can be changed to
2748
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2749
+ * need to do a full wakeup with enqueue.
2750
+ *
2751
+ * Returns: %true when the wakeup is done,
2752
+ * %false otherwise.
23972753 */
2398
-static int ttwu_remote(struct task_struct *p, int wake_flags)
2754
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23992755 {
24002756 struct rq_flags rf;
24012757 struct rq *rq;
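The wait loop quoted in the comment above is the canonical sleeper pattern; a minimal, self-contained kthread version of it is sketched below (data_ready and sleeper_fn are hypothetical names, not part of this file).

#include <linux/kthread.h>
#include <linux/sched.h>

static bool data_ready;			/* hypothetical wake condition */

static int sleeper_fn(void *unused)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (READ_ONCE(data_ready) || kthread_should_stop())
			break;

		/*
		 * If the wakeup races in between the two statements above,
		 * the task is still on its runqueue and ttwu_runnable()
		 * only has to flip p->state back to TASK_RUNNING.
		 */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}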
....@@ -2414,75 +2770,63 @@
24142770 }
24152771
24162772 #ifdef CONFIG_SMP
2417
-void sched_ttwu_pending(void)
2773
+void sched_ttwu_pending(void *arg)
24182774 {
2775
+ struct llist_node *llist = arg;
24192776 struct rq *rq = this_rq();
2420
- struct llist_node *llist = llist_del_all(&rq->wake_list);
24212777 struct task_struct *p, *t;
24222778 struct rq_flags rf;
24232779
24242780 if (!llist)
24252781 return;
24262782
2783
+ /*
2784
+ * rq::ttwu_pending racy indication of out-standing wakeups.
2785
+ * Races such that false-negatives are possible, since they
2786
+ * are shorter lived than false-positives would be.
2787
+ */
2788
+ WRITE_ONCE(rq->ttwu_pending, 0);
2789
+
24272790 rq_lock_irqsave(rq, &rf);
24282791 update_rq_clock(rq);
24292792
2430
- llist_for_each_entry_safe(p, t, llist, wake_entry)
2793
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2794
+ if (WARN_ON_ONCE(p->on_cpu))
2795
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
2796
+
2797
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2798
+ set_task_cpu(p, cpu_of(rq));
2799
+
24312800 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2801
+ }
24322802
24332803 rq_unlock_irqrestore(rq, &rf);
24342804 }
24352805
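The wakelist machinery above is built on the lockless llist primitives (the removed line still shows the old direct llist_del_all() on rq->wake_list; the new code receives an already detached list through the smp call-function queue). A generic sketch of that producer/consumer pattern, with a made-up item type, is given below.

#include <linux/llist.h>
#include <linux/printk.h>

struct demo_item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(demo_list);

/* Producer side: lock-free, usable from any context. */
static void demo_queue(struct demo_item *item)
{
	/* llist_add() returns true if the list was previously empty ... */
	if (llist_add(&item->node, &demo_list))
		;	/* ... which is the point where the ttwu path sends its IPI. */
}

/* Consumer side: detach the whole batch at once, then walk it. */
static void demo_drain(void)
{
	struct llist_node *first = llist_del_all(&demo_list);
	struct demo_item *item, *tmp;

	llist_for_each_entry_safe(item, tmp, first, node)
		pr_debug("payload %d\n", item->payload);
}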
2436
-void scheduler_ipi(void)
2806
+void send_call_function_single_ipi(int cpu)
24372807 {
2438
- /*
2439
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2440
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2441
- * this IPI.
2442
- */
2443
- preempt_fold_need_resched();
2808
+ struct rq *rq = cpu_rq(cpu);
24442809
2445
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2446
- return;
2447
-
2448
- /*
2449
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2450
- * traditionally all their work was done from the interrupt return
2451
- * path. Now that we actually do some work, we need to make sure
2452
- * we do call them.
2453
- *
2454
- * Some archs already do call them, luckily irq_enter/exit nest
2455
- * properly.
2456
- *
2457
- * Arguably we should visit all archs and update all handlers,
2458
- * however a fair share of IPIs are still resched only so this would
2459
- * somewhat pessimize the simple resched case.
2460
- */
2461
- irq_enter();
2462
- sched_ttwu_pending();
2463
-
2464
- /*
2465
- * Check if someone kicked us for doing the nohz idle load balance.
2466
- */
2467
- if (unlikely(got_nohz_idle_kick())) {
2468
- this_rq()->idle_balance = 1;
2469
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2470
- }
2471
- irq_exit();
2810
+ if (!set_nr_if_polling(rq->idle))
2811
+ arch_send_call_function_single_ipi(cpu);
2812
+ else
2813
+ trace_sched_wake_idle_without_ipi(cpu);
24722814 }
24732815
2474
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
2816
+/*
2817
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2818
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
2819
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
2820
+ * of the wakeup instead of the waker.
2821
+ */
2822
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
24752823 {
24762824 struct rq *rq = cpu_rq(cpu);
24772825
24782826 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
24792827
2480
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2481
- if (!set_nr_if_polling(rq->idle))
2482
- smp_send_reschedule(cpu);
2483
- else
2484
- trace_sched_wake_idle_without_ipi(cpu);
2485
- }
2828
+ WRITE_ONCE(rq->ttwu_pending, 1);
2829
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24862830 }
24872831
24882832 void wake_up_if_idle(int cpu)
....@@ -2508,6 +2852,7 @@
25082852 out:
25092853 rcu_read_unlock();
25102854 }
2855
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
25112856
25122857 bool cpus_share_cache(int this_cpu, int that_cpu)
25132858 {
....@@ -2516,6 +2861,58 @@
25162861
25172862 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
25182863 }
2864
+
2865
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2866
+{
2867
+ /*
2868
+ * If the CPU does not share cache, then queue the task on the
2869
+ * remote rqs wakelist to avoid accessing remote data.
2870
+ */
2871
+ if (!cpus_share_cache(smp_processor_id(), cpu))
2872
+ return true;
2873
+
2874
+ /*
2875
+ * If the task is descheduling and the only running task on the
2876
+ * CPU then use the wakelist to offload the task activation to
2877
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
2878
+ * nr_running is checked to avoid unnecessary task stacking.
2879
+ *
2880
+ * Note that we can only get here with (wakee) p->on_rq=0,
2881
+ * p->on_cpu can be whatever, we've done the dequeue, so
2882
+ * the wakee has been accounted out of ->nr_running.
2883
+ */
2884
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
2885
+ return true;
2886
+
2887
+ return false;
2888
+}
2889
+
2890
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2891
+{
2892
+ bool cond = false;
2893
+
2894
+ trace_android_rvh_ttwu_cond(&cond);
2895
+
2896
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
2897
+ cond) {
2898
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2899
+ return false;
2900
+
2901
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2902
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
2903
+ return true;
2904
+ }
2905
+
2906
+ return false;
2907
+}
2908
+
2909
+#else /* !CONFIG_SMP */
2910
+
2911
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2912
+{
2913
+ return false;
2914
+}
2915
+
25192916 #endif /* CONFIG_SMP */
25202917
25212918 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2523,13 +2920,8 @@
25232920 struct rq *rq = cpu_rq(cpu);
25242921 struct rq_flags rf;
25252922
2526
-#if defined(CONFIG_SMP)
2527
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2528
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2529
- ttwu_queue_remote(p, cpu, wake_flags);
2923
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
25302924 return;
2531
- }
2532
-#endif
25332925
25342926 rq_lock(rq, &rf);
25352927 update_rq_clock(rq);
....@@ -2585,8 +2977,8 @@
25852977 * migration. However the means are completely different as there is no lock
25862978 * chain to provide order. Instead we do:
25872979 *
2588
- * 1) smp_store_release(X->on_cpu, 0)
2589
- * 2) smp_cond_load_acquire(!X->on_cpu)
2980
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2981
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25902982 *
25912983 * Example:
25922984 *
....@@ -2625,64 +3017,95 @@
26253017 * @p: the thread to be awakened
26263018 * @state: the mask of task states that can be woken
26273019 * @wake_flags: wake modifier flags (WF_*)
2628
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2629
- * in this event.
26303020 *
2631
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3021
+ * Conceptually does:
3022
+ *
3023
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
26323024 *
26333025 * If the task was not queued/runnable, also place it back on a runqueue.
26343026 *
2635
- * Atomic against schedule() which would dequeue a task, also see
2636
- * set_current_state().
3027
+ * This function is atomic against schedule() which would dequeue the task.
26373028 *
2638
- * This function executes a full memory barrier before accessing the task
2639
- * state; see set_current_state().
3029
+ * It issues a full memory barrier before accessing @p->state, see the comment
3030
+ * with set_current_state().
3031
+ *
3032
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3033
+ *
3034
+ * Relies on p->pi_lock stabilizing:
3035
+ * - p->sched_class
3036
+ * - p->cpus_ptr
3037
+ * - p->sched_task_group
3038
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3039
+ *
3040
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3041
+ * Takes rq->lock in:
3042
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3043
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3044
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3045
+ *
3046
+ * As a consequence we race really badly with just about everything. See the
3047
+ * many memory barriers and their comments for details.
26403048 *
26413049 * Return: %true if @p->state changes (an actual wakeup was done),
26423050 * %false otherwise.
26433051 */
26443052 static int
2645
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2646
- int sibling_count_hint)
3053
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26473054 {
26483055 unsigned long flags;
26493056 int cpu, success = 0;
26503057
2651
- /*
2652
- * If we are going to wake up a thread waiting for CONDITION we
2653
- * need to ensure that CONDITION=1 done by the caller can not be
2654
- * reordered with p->state check below. This pairs with mb() in
2655
- * set_current_state() the waiting thread does.
2656
- */
2657
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2658
- smp_mb__after_spinlock();
2659
- if (!(p->state & state)) {
3058
+ preempt_disable();
3059
+ if (p == current) {
26603060 /*
2661
- * The task might be running due to a spinlock sleeper
2662
- * wakeup. Check the saved state and set it to running
2663
- * if the wakeup condition is true.
3061
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3062
+ * == smp_processor_id()'. Together this means we can special
3063
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3064
+ * without taking any locks.
3065
+ *
3066
+ * In particular:
3067
+ * - we rely on Program-Order guarantees for all the ordering,
3068
+ * - we're serialized against set_special_state() by virtue of
3069
+ * it disabling IRQs (this allows not taking ->pi_lock).
26643070 */
2665
- if (!(wake_flags & WF_LOCK_SLEEPER)) {
2666
- if (p->saved_state & state) {
2667
- p->saved_state = TASK_RUNNING;
2668
- success = 1;
2669
- }
2670
- }
3071
+ if (!(p->state & state))
3072
+ goto out;
3073
+
3074
+ success = 1;
3075
+ trace_sched_waking(p);
3076
+ p->state = TASK_RUNNING;
3077
+ trace_sched_wakeup(p);
26713078 goto out;
26723079 }
26733080
26743081 /*
2675
- * If this is a regular wakeup, then we can unconditionally
2676
- * clear the saved state of a "lock sleeper".
3082
+ * If we are going to wake up a thread waiting for CONDITION we
3083
+ * need to ensure that CONDITION=1 done by the caller can not be
3084
+ * reordered with p->state check below. This pairs with smp_store_mb()
3085
+ * in set_current_state() that the waiting thread does.
26773086 */
2678
- if (!(wake_flags & WF_LOCK_SLEEPER))
2679
- p->saved_state = TASK_RUNNING;
3087
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3088
+ smp_mb__after_spinlock();
3089
+ if (!(p->state & state))
3090
+ goto unlock;
3091
+
3092
+#ifdef CONFIG_FREEZER
3093
+ /*
3094
+ * If we're going to wake up a thread which may be frozen, then
3095
+ * we can only do so if we have an active CPU which is capable of
3096
+ * running it. This may not be the case when resuming from suspend,
3097
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3098
+ * for the actual wakeup.
3099
+ */
3100
+ if (unlikely(frozen_or_skipped(p)) &&
3101
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3102
+ goto unlock;
3103
+#endif
26803104
26813105 trace_sched_waking(p);
26823106
26833107 /* We're going to change ->state: */
26843108 success = 1;
2685
- cpu = task_cpu(p);
26863109
26873110 /*
26883111 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2703,10 +3126,15 @@
27033126 *
27043127 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27053128 * __schedule(). See the comment for smp_mb__after_spinlock().
3129
+ *
3130
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
27063131 */
27073132 smp_rmb();
2708
- if (p->on_rq && ttwu_remote(p, wake_flags))
2709
- goto stat;
3133
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3134
+ goto unlock;
3135
+
3136
+ if (p->state & TASK_UNINTERRUPTIBLE)
3137
+ trace_sched_blocked_reason(p);
27103138
27113139 #ifdef CONFIG_SMP
27123140 /*
....@@ -2727,8 +3155,43 @@
27273155 *
27283156 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27293157 * __schedule(). See the comment for smp_mb__after_spinlock().
3158
+ *
3159
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3160
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3161
+ * care about its own p->state. See the comment in __schedule().
27303162 */
2731
- smp_rmb();
3163
+ smp_acquire__after_ctrl_dep();
3164
+
3165
+ /*
3166
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3167
+ * == 0), which means we need to do an enqueue, change p->state to
3168
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3169
+ * enqueue, such as ttwu_queue_wakelist().
3170
+ */
3171
+ p->state = TASK_WAKING;
3172
+
3173
+ /*
3174
+ * If the owning (remote) CPU is still in the middle of schedule() with
3175
+ * this task as prev, consider queueing p on the remote CPUs wake_list
3176
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3177
+ * let the waker make forward progress. This is safe because IRQs are
3178
+ * disabled and the IPI will deliver after on_cpu is cleared.
3179
+ *
3180
+ * Ensure we load task_cpu(p) after p->on_cpu:
3181
+ *
3182
+ * set_task_cpu(p, cpu);
3183
+ * STORE p->cpu = @cpu
3184
+ * __schedule() (switch to task 'p')
3185
+ * LOCK rq->lock
3186
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3187
+ * STORE p->on_cpu = 1 LOAD p->cpu
3188
+ *
3189
+ * to ensure we observe the correct CPU on which the task is currently
3190
+ * scheduling.
3191
+ */
3192
+ if (smp_load_acquire(&p->on_cpu) &&
3193
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3194
+ goto unlock;
27323195
27333196 /*
27343197 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2741,38 +3204,79 @@
27413204 */
27423205 smp_cond_load_acquire(&p->on_cpu, !VAL);
27433206
2744
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2745
- p->state = TASK_WAKING;
3207
+ trace_android_rvh_try_to_wake_up(p);
27463208
2747
- if (p->in_iowait) {
2748
- delayacct_blkio_end(p);
2749
- atomic_dec(&task_rq(p)->nr_iowait);
2750
- }
2751
-
2752
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2753
- sibling_count_hint);
3209
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
27543210 if (task_cpu(p) != cpu) {
3211
+ if (p->in_iowait) {
3212
+ delayacct_blkio_end(p);
3213
+ atomic_dec(&task_rq(p)->nr_iowait);
3214
+ }
3215
+
27553216 wake_flags |= WF_MIGRATED;
27563217 psi_ttwu_dequeue(p);
27573218 set_task_cpu(p, cpu);
27583219 }
2759
-
2760
-#else /* CONFIG_SMP */
2761
-
2762
- if (p->in_iowait) {
2763
- delayacct_blkio_end(p);
2764
- atomic_dec(&task_rq(p)->nr_iowait);
2765
- }
2766
-
3220
+#else
3221
+ cpu = task_cpu(p);
27673222 #endif /* CONFIG_SMP */
27683223
27693224 ttwu_queue(p, cpu, wake_flags);
2770
-stat:
2771
- ttwu_stat(p, cpu, wake_flags);
2772
-out:
3225
+unlock:
27733226 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3227
+out:
3228
+ if (success) {
3229
+ trace_android_rvh_try_to_wake_up_success(p);
3230
+ ttwu_stat(p, task_cpu(p), wake_flags);
3231
+ }
3232
+ preempt_enable();
27743233
27753234 return success;
3235
+}
3236
+
3237
+/**
3238
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3239
+ * @p: Process for which the function is to be invoked, can be @current.
3240
+ * @func: Function to invoke.
3241
+ * @arg: Argument to function.
3242
+ *
3243
+ * If the specified task can be quickly locked into a definite state
3244
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3245
+ * state while invoking @func(@arg). This function can use ->on_rq and
3246
+ * task_curr() to work out what the state is, if required. Given that
3247
+ * @func can be invoked with a runqueue lock held, it had better be quite
3248
+ * lightweight.
3249
+ *
3250
+ * Returns:
3251
+ * @false if the task slipped out from under the locks.
3252
+ * @true if the task was locked onto a runqueue or is sleeping.
3253
+ * However, @func can override this by returning @false.
3254
+ */
3255
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3256
+{
3257
+ struct rq_flags rf;
3258
+ bool ret = false;
3259
+ struct rq *rq;
3260
+
3261
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3262
+ if (p->on_rq) {
3263
+ rq = __task_rq_lock(p, &rf);
3264
+ if (task_rq(p) == rq)
3265
+ ret = func(p, arg);
3266
+ rq_unlock(rq, &rf);
3267
+ } else {
3268
+ switch (p->state) {
3269
+ case TASK_RUNNING:
3270
+ case TASK_WAKING:
3271
+ break;
3272
+ default:
3273
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3274
+ if (!p->on_rq)
3275
+ ret = func(p, arg);
3276
+ }
3277
+ }
3278
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3279
+ return ret;
27763280 }
27773281
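A sketch of how a caller might use try_invoke_on_locked_down_task(); the callback and the sleeping check it performs are illustrative only, not an existing in-tree user.

static bool demo_check_sleeping(struct task_struct *t, void *arg)
{
	/* Runs under p->pi_lock (and possibly rq->lock): keep it cheap. */
	*(bool *)arg = !t->on_rq && (t->state & TASK_UNINTERRUPTIBLE);
	return true;			/* keep the "locked down" return value */
}

static bool demo_task_is_sleeping(struct task_struct *t)
{
	bool sleeping = false;

	/* A false return means the task slipped out from under the locks. */
	if (!try_invoke_on_locked_down_task(t, demo_check_sleeping, &sleeping))
		return false;

	return sleeping;
}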
27783282 /**
....@@ -2788,25 +3292,13 @@
27883292 */
27893293 int wake_up_process(struct task_struct *p)
27903294 {
2791
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3295
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27923296 }
27933297 EXPORT_SYMBOL(wake_up_process);
27943298
2795
-/**
2796
- * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
2797
- * @p: The process to be woken up.
2798
- *
2799
- * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
2800
- * the nature of the wakeup.
2801
- */
2802
-int wake_up_lock_sleeper(struct task_struct *p)
2803
-{
2804
- return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
2805
-}
2806
-
28073299 int wake_up_state(struct task_struct *p, unsigned int state)
28083300 {
2809
- return try_to_wake_up(p, state, 0, 1);
3301
+ return try_to_wake_up(p, state, 0);
28103302 }
28113303
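This is the waker side of the sleeper sketch shown earlier at ttwu_runnable(): wake_up_process()/wake_up_state() return the try_to_wake_up() result, i.e. true only when they actually changed p->state. The flag and worker names below mirror that earlier hypothetical sketch.

#include <linux/printk.h>
#include <linux/sched.h>

static bool data_ready;			/* the sleeper's hypothetical condition */

static void demo_publish(struct task_struct *worker)
{
	WRITE_ONCE(data_ready, true);	/* the CONDITION store ... */

	/* ... which try_to_wake_up() orders against the sleeper's state check. */
	if (!wake_up_process(worker))
		pr_debug("worker was already runnable\n");
}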
28123304 /*
....@@ -2831,6 +3323,8 @@
28313323 p->se.cfs_rq = NULL;
28323324 #endif
28333325
3326
+ trace_android_rvh_sched_fork_init(p);
3327
+
28343328 #ifdef CONFIG_SCHEDSTATS
28353329 /* Even if schedstat is disabled, there should not be garbage */
28363330 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2851,7 +3345,13 @@
28513345 INIT_HLIST_HEAD(&p->preempt_notifiers);
28523346 #endif
28533347
3348
+#ifdef CONFIG_COMPACTION
3349
+ p->capture_control = NULL;
3350
+#endif
28543351 init_numa_balancing(clone_flags, p);
3352
+#ifdef CONFIG_SMP
3353
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3354
+#endif
28553355 }
28563356
28573357 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2868,7 +3368,7 @@
28683368
28693369 #ifdef CONFIG_PROC_SYSCTL
28703370 int sysctl_numa_balancing(struct ctl_table *table, int write,
2871
- void __user *buffer, size_t *lenp, loff_t *ppos)
3371
+ void *buffer, size_t *lenp, loff_t *ppos)
28723372 {
28733373 struct ctl_table t;
28743374 int err;
....@@ -2942,8 +3442,8 @@
29423442 }
29433443
29443444 #ifdef CONFIG_PROC_SYSCTL
2945
-int sysctl_schedstats(struct ctl_table *table, int write,
2946
- void __user *buffer, size_t *lenp, loff_t *ppos)
3445
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3446
+ size_t *lenp, loff_t *ppos)
29473447 {
29483448 struct ctl_table t;
29493449 int err;
....@@ -2971,7 +3471,7 @@
29713471 */
29723472 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29733473 {
2974
- unsigned long flags;
3474
+ trace_android_rvh_sched_fork(p);
29753475
29763476 __sched_fork(clone_flags, p);
29773477 /*
....@@ -2985,6 +3485,7 @@
29853485 * Make sure we do not leak PI boosting priority to the child.
29863486 */
29873487 p->prio = current->normal_prio;
3488
+ trace_android_rvh_prepare_prio_fork(p);
29883489
29893490 uclamp_fork(p);
29903491
....@@ -2999,8 +3500,8 @@
29993500 } else if (PRIO_TO_NICE(p->static_prio) < 0)
30003501 p->static_prio = NICE_TO_PRIO(0);
30013502
3002
- p->prio = p->normal_prio = __normal_prio(p);
3003
- set_load_weight(p, false);
3503
+ p->prio = p->normal_prio = p->static_prio;
3504
+ set_load_weight(p);
30043505
30053506 /*
30063507 * We don't need the reset flag anymore after the fork. It has
....@@ -3017,24 +3518,8 @@
30173518 p->sched_class = &fair_sched_class;
30183519
30193520 init_entity_runnable_average(&p->se);
3521
+ trace_android_rvh_finish_prio_fork(p);
30203522
3021
- /*
3022
- * The child is not yet in the pid-hash so no cgroup attach races,
3023
- * and the cgroup is pinned to this child due to cgroup_fork()
3024
- * is ran before sched_fork().
3025
- *
3026
- * Silence PROVE_RCU.
3027
- */
3028
- raw_spin_lock_irqsave(&p->pi_lock, flags);
3029
- rseq_migrate(p);
3030
- /*
3031
- * We're setting the CPU for the first time, we don't migrate,
3032
- * so use __set_task_cpu().
3033
- */
3034
- __set_task_cpu(p, smp_processor_id());
3035
- if (p->sched_class->task_fork)
3036
- p->sched_class->task_fork(p);
3037
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
30383523
30393524 #ifdef CONFIG_SCHED_INFO
30403525 if (likely(sched_info_on()))
....@@ -3044,14 +3529,46 @@
30443529 p->on_cpu = 0;
30453530 #endif
30463531 init_task_preempt_count(p);
3047
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
3048
- task_thread_info(p)->preempt_lazy_count = 0;
3049
-#endif
30503532 #ifdef CONFIG_SMP
30513533 plist_node_init(&p->pushable_tasks, MAX_PRIO);
30523534 RB_CLEAR_NODE(&p->pushable_dl_tasks);
30533535 #endif
30543536 return 0;
3537
+}
3538
+
3539
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3540
+{
3541
+ unsigned long flags;
3542
+
3543
+ /*
3544
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3545
+ * required yet, but lockdep gets upset if rules are violated.
3546
+ */
3547
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3548
+#ifdef CONFIG_CGROUP_SCHED
3549
+ if (1) {
3550
+ struct task_group *tg;
3551
+
3552
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
3553
+ struct task_group, css);
3554
+ tg = autogroup_task_group(p, tg);
3555
+ p->sched_task_group = tg;
3556
+ }
3557
+#endif
3558
+ rseq_migrate(p);
3559
+ /*
3560
+ * We're setting the CPU for the first time, we don't migrate,
3561
+ * so use __set_task_cpu().
3562
+ */
3563
+ __set_task_cpu(p, smp_processor_id());
3564
+ if (p->sched_class->task_fork)
3565
+ p->sched_class->task_fork(p);
3566
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3567
+}
3568
+
3569
+void sched_post_fork(struct task_struct *p)
3570
+{
3571
+ uclamp_post_fork(p);
30553572 }
30563573
30573574 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3082,6 +3599,8 @@
30823599 struct rq_flags rf;
30833600 struct rq *rq;
30843601
3602
+ trace_android_rvh_wake_up_new_task(p);
3603
+
30853604 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30863605 p->state = TASK_RUNNING;
30873606 #ifdef CONFIG_SMP
....@@ -3095,14 +3614,14 @@
30953614 */
30963615 p->recent_used_cpu = task_cpu(p);
30973616 rseq_migrate(p);
3098
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
3617
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30993618 #endif
31003619 rq = __task_rq_lock(p, &rf);
31013620 update_rq_clock(rq);
3102
- post_init_entity_util_avg(&p->se);
3621
+ post_init_entity_util_avg(p);
3622
+ trace_android_rvh_new_task_stats(p);
31033623
31043624 activate_task(rq, p, ENQUEUE_NOCLOCK);
3105
- p->on_rq = TASK_ON_RQ_QUEUED;
31063625 trace_sched_wakeup_new(p);
31073626 check_preempt_curr(rq, p, WF_FORK);
31083627 #ifdef CONFIG_SMP
....@@ -3212,8 +3731,10 @@
32123731 /*
32133732 * Claim the task as running, we do this before switching to it
32143733 * such that any running task will have this set.
3734
+ *
3735
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
32153736 */
3216
- next->on_cpu = 1;
3737
+ WRITE_ONCE(next->on_cpu, 1);
32173738 #endif
32183739 }
32193740
....@@ -3221,8 +3742,9 @@
32213742 {
32223743 #ifdef CONFIG_SMP
32233744 /*
3224
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3225
- * We must ensure this doesn't happen until the switch is completely
3745
+ * This must be the very last reference to @prev from this CPU. After
3746
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3747
+ * must ensure this doesn't happen until the switch is completely
32263748 * finished.
32273749 *
32283750 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3244,7 +3766,7 @@
32443766 * do an early lockdep release here:
32453767 */
32463768 rq_unpin_lock(rq, rf);
3247
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3769
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
32483770 #ifdef CONFIG_DEBUG_SPINLOCK
32493771 /* this is a valid case when another task releases the spinlock */
32503772 rq->lock.owner = next;
....@@ -3376,19 +3898,25 @@
33763898 * provided by mmdrop(),
33773899 * - a sync_core for SYNC_CORE.
33783900 */
3379
- /*
3380
- * We use mmdrop_delayed() here so we don't have to do the
3381
- * full __mmdrop() when we are the last user.
3382
- */
33833901 if (mm) {
33843902 membarrier_mm_sync_core_before_usermode(mm);
3385
- mmdrop_delayed(mm);
3903
+ mmdrop(mm);
33863904 }
33873905 if (unlikely(prev_state == TASK_DEAD)) {
33883906 if (prev->sched_class->task_dead)
33893907 prev->sched_class->task_dead(prev);
33903908
3391
- put_task_struct(prev);
3909
+ /*
3910
+ * Remove function-return probe instances associated with this
3911
+ * task and put them back on the free list.
3912
+ */
3913
+ kprobe_flush_task(prev);
3914
+ trace_android_rvh_flush_task(prev);
3915
+
3916
+ /* Task is done with its stack. */
3917
+ put_task_stack(prev);
3918
+
3919
+ put_task_struct_rcu_user(prev);
33923920 }
33933921
33943922 tick_nohz_task_switch();
....@@ -3467,12 +3995,8 @@
34673995 context_switch(struct rq *rq, struct task_struct *prev,
34683996 struct task_struct *next, struct rq_flags *rf)
34693997 {
3470
- struct mm_struct *mm, *oldmm;
3471
-
34723998 prepare_task_switch(rq, prev, next);
34733999
3474
- mm = next->mm;
3475
- oldmm = prev->active_mm;
34764000 /*
34774001 * For paravirt, this is coupled with an exit in switch_to to
34784002 * combine the page table reload and the switch backend into
....@@ -3481,22 +4005,37 @@
34814005 arch_start_context_switch(prev);
34824006
34834007 /*
3484
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3485
- * NULL, we will pass through mmdrop() in finish_task_switch().
3486
- * Both of these contain the full memory barrier required by
3487
- * membarrier after storing to rq->curr, before returning to
3488
- * user-space.
4008
+ * kernel -> kernel lazy + transfer active
4009
+ * user -> kernel lazy + mmgrab() active
4010
+ *
4011
+ * kernel -> user switch + mmdrop() active
4012
+ * user -> user switch
34894013 */
3490
- if (!mm) {
3491
- next->active_mm = oldmm;
3492
- mmgrab(oldmm);
3493
- enter_lazy_tlb(oldmm, next);
3494
- } else
3495
- switch_mm_irqs_off(oldmm, mm, next);
4014
+ if (!next->mm) { // to kernel
4015
+ enter_lazy_tlb(prev->active_mm, next);
34964016
3497
- if (!prev->mm) {
3498
- prev->active_mm = NULL;
3499
- rq->prev_mm = oldmm;
4017
+ next->active_mm = prev->active_mm;
4018
+ if (prev->mm) // from user
4019
+ mmgrab(prev->active_mm);
4020
+ else
4021
+ prev->active_mm = NULL;
4022
+ } else { // to user
4023
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4024
+ /*
4025
+ * sys_membarrier() requires an smp_mb() between setting
4026
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4027
+ *
4028
+ * The below provides this either through switch_mm(), or in
4029
+ * case 'prev->active_mm == next->mm' through
4030
+ * finish_task_switch()'s mmdrop().
4031
+ */
4032
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4033
+
4034
+ if (!prev->mm) { // from kernel
4035
+ /* will mmdrop() in finish_task_switch(). */
4036
+ rq->prev_mm = prev->active_mm;
4037
+ prev->active_mm = NULL;
4038
+ }
35004039 }
35014040
35024041 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
....@@ -3533,7 +4072,7 @@
35334072 * preemption, thus the result might have a time-of-check-to-time-of-use
35344073 * race. The caller is responsible to use it correctly, for example:
35354074 *
3536
- * - from a non-preemptable section (of course)
4075
+ * - from a non-preemptible section (of course)
35374076 *
35384077 * - from a thread that is bound to a single CPU
35394078 *
....@@ -3554,6 +4093,18 @@
35544093 sum += cpu_rq(i)->nr_switches;
35554094
35564095 return sum;
4096
+}
4097
+
4098
+/*
4099
+ * Consumers of these two interfaces, like for example the cpuidle menu
4100
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4101
+ * for a CPU that has IO-wait which might not even end up running the task when
4102
+ * it does become runnable.
4103
+ */
4104
+
4105
+unsigned long nr_iowait_cpu(int cpu)
4106
+{
4107
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
35574108 }
35584109
35594110 /*
....@@ -3591,29 +4142,9 @@
35914142 unsigned long i, sum = 0;
35924143
35934144 for_each_possible_cpu(i)
3594
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4145
+ sum += nr_iowait_cpu(i);
35954146
35964147 return sum;
3597
-}
3598
-
3599
-/*
3600
- * Consumers of these two interfaces, like for example the cpufreq menu
3601
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3602
- * IO-wait which might not even end up running the task when it does become
3603
- * runnable.
3604
- */
3605
-
3606
-unsigned long nr_iowait_cpu(int cpu)
3607
-{
3608
- struct rq *this = cpu_rq(cpu);
3609
- return atomic_read(&this->nr_iowait);
3610
-}
3611
-
3612
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3613
-{
3614
- struct rq *rq = this_rq();
3615
- *nr_waiters = atomic_read(&rq->nr_iowait);
3616
- *load = rq->load.weight;
36174148 }
36184149
36194150 #ifdef CONFIG_SMP
....@@ -3627,9 +4158,14 @@
36274158 struct task_struct *p = current;
36284159 unsigned long flags;
36294160 int dest_cpu;
4161
+ bool cond = false;
4162
+
4163
+ trace_android_rvh_sched_exec(&cond);
4164
+ if (cond)
4165
+ return;
36304166
36314167 raw_spin_lock_irqsave(&p->pi_lock, flags);
3632
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4168
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
36334169 if (dest_cpu == smp_processor_id())
36344170 goto unlock;
36354171
....@@ -3712,6 +4248,7 @@
37124248
37134249 return ns;
37144250 }
4251
+EXPORT_SYMBOL_GPL(task_sched_runtime);
37154252
37164253 /*
37174254 * This function gets called by the timer code, with HZ frequency.
....@@ -3723,14 +4260,18 @@
37234260 struct rq *rq = cpu_rq(cpu);
37244261 struct task_struct *curr = rq->curr;
37254262 struct rq_flags rf;
4263
+ unsigned long thermal_pressure;
37264264
4265
+ arch_scale_freq_tick();
37274266 sched_clock_tick();
37284267
37294268 rq_lock(rq, &rf);
37304269
4270
+ trace_android_rvh_tick_entry(rq);
37314271 update_rq_clock(rq);
4272
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4273
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
37324274 curr->sched_class->task_tick(rq, curr, 0);
3733
- cpu_load_update_active(rq);
37344275 calc_global_load_tick(rq);
37354276 psi_task_tick(rq);
37364277
....@@ -3742,6 +4283,8 @@
37424283 rq->idle_balance = idle_cpu(cpu);
37434284 trigger_load_balance(rq);
37444285 #endif
4286
+
4287
+ trace_android_vh_scheduler_tick(rq);
37454288 }
37464289
37474290 #ifdef CONFIG_NO_HZ_FULL
....@@ -3799,28 +4342,31 @@
37994342 * statistics and checks timeslices in a time-independent way, regardless
38004343 * of when exactly it is running.
38014344 */
3802
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4345
+ if (!tick_nohz_tick_stopped_cpu(cpu))
38034346 goto out_requeue;
38044347
38054348 rq_lock_irq(rq, &rf);
38064349 curr = rq->curr;
3807
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4350
+ if (cpu_is_offline(cpu))
38084351 goto out_unlock;
38094352
38104353 update_rq_clock(rq);
3811
- delta = rq_clock_task(rq) - curr->se.exec_start;
38124354
3813
- /*
3814
- * Make sure the next tick runs within a reasonable
3815
- * amount of time.
3816
- */
3817
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4355
+ if (!is_idle_task(curr)) {
4356
+ /*
4357
+ * Make sure the next tick runs within a reasonable
4358
+ * amount of time.
4359
+ */
4360
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4361
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4362
+ }
38184363 curr->sched_class->task_tick(rq, curr, 0);
38194364
4365
+ calc_load_nohz_remote(rq);
38204366 out_unlock:
38214367 rq_unlock_irq(rq, &rf);
3822
-
38234368 out_requeue:
4369
+
38244370 /*
38254371 * Run the remote tick once per second (1Hz). This arbitrary
38264372 * frequency is large enough to avoid overload but short enough
....@@ -3884,7 +4430,7 @@
38844430 static inline void sched_tick_stop(int cpu) { }
38854431 #endif
38864432
3887
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4433
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38884434 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38894435 /*
38904436 * If the value passed in is equal to the current preempt count
....@@ -3990,11 +4536,11 @@
39904536 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39914537 && in_atomic_preempt_off()) {
39924538 pr_err("Preemption disabled at:");
3993
- print_ip_sym(preempt_disable_ip);
3994
- pr_cont("\n");
4539
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39954540 }
3996
- if (panic_on_warn)
3997
- panic("scheduling while atomic\n");
4541
+ check_panic_on_warn("scheduling while atomic");
4542
+
4543
+ trace_android_rvh_schedule_bug(prev);
39984544
39994545 dump_stack();
40004546 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -4003,11 +4549,23 @@
40034549 /*
40044550 * Various schedule()-time debugging checks and statistics:
40054551 */
4006
-static inline void schedule_debug(struct task_struct *prev)
4552
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
40074553 {
40084554 #ifdef CONFIG_SCHED_STACK_END_CHECK
40094555 if (task_stack_end_corrupted(prev))
40104556 panic("corrupted stack end detected inside scheduler\n");
4557
+
4558
+ if (task_scs_end_corrupted(prev))
4559
+ panic("corrupted shadow stack detected inside scheduler\n");
4560
+#endif
4561
+
4562
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4563
+ if (!preempt && prev->state && prev->non_block_count) {
4564
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4565
+ prev->comm, prev->pid, prev->non_block_count);
4566
+ dump_stack();
4567
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4568
+ }
40114569 #endif
40124570
40134571 if (unlikely(in_atomic_preempt_off())) {
....@@ -4019,6 +4577,28 @@
40194577 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
40204578
40214579 schedstat_inc(this_rq()->sched_count);
4580
+}
4581
+
4582
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4583
+ struct rq_flags *rf)
4584
+{
4585
+#ifdef CONFIG_SMP
4586
+ const struct sched_class *class;
4587
+ /*
4588
+ * We must do the balancing pass before put_prev_task(), such
4589
+ * that when we release the rq->lock the task is in the same
4590
+ * state as before we took rq->lock.
4591
+ *
4592
+ * We can terminate the balance pass as soon as we know there is
4593
+ * a runnable task of @class priority or higher.
4594
+ */
4595
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
4596
+ if (class->balance(rq, prev, rf))
4597
+ break;
4598
+ }
4599
+#endif
4600
+
4601
+ put_prev_task(rq, prev);
40224602 }
40234603
40244604 /*
....@@ -4036,36 +4616,34 @@
40364616 * higher scheduling class, because otherwise those loose the
40374617 * opportunity to pull in more work from other CPUs.
40384618 */
4039
- if (likely((prev->sched_class == &idle_sched_class ||
4040
- prev->sched_class == &fair_sched_class) &&
4619
+ if (likely(prev->sched_class <= &fair_sched_class &&
40414620 rq->nr_running == rq->cfs.h_nr_running)) {
40424621
4043
- p = fair_sched_class.pick_next_task(rq, prev, rf);
4622
+ p = pick_next_task_fair(rq, prev, rf);
40444623 if (unlikely(p == RETRY_TASK))
4045
- goto again;
4624
+ goto restart;
40464625
40474626 /* Assumes fair_sched_class->next == idle_sched_class */
4048
- if (unlikely(!p))
4049
- p = idle_sched_class.pick_next_task(rq, prev, rf);
4627
+ if (!p) {
4628
+ put_prev_task(rq, prev);
4629
+ p = pick_next_task_idle(rq);
4630
+ }
40504631
40514632 return p;
40524633 }
40534634
4054
-again:
4635
+restart:
4636
+ put_prev_task_balance(rq, prev, rf);
4637
+
40554638 for_each_class(class) {
4056
- p = class->pick_next_task(rq, prev, rf);
4057
- if (p) {
4058
- if (unlikely(p == RETRY_TASK))
4059
- goto again;
4639
+ p = class->pick_next_task(rq);
4640
+ if (p)
40604641 return p;
4061
- }
40624642 }
40634643
40644644 /* The idle class should always have a runnable task: */
40654645 BUG();
40664646 }
4067
-
4068
-static void migrate_disabled_sched(struct task_struct *p);
40694647
40704648 /*
40714649 * __schedule() is the main scheduler function.
....@@ -4087,7 +4665,7 @@
40874665 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40884666 * called on the nearest possible occasion:
40894667 *
4090
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
4668
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40914669 *
40924670 * - in syscall or exception context, at the next outmost
40934671 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4096,7 +4674,7 @@
40964674 * - in IRQ context, return from interrupt-handler to
40974675 * preemptible context
40984676 *
4099
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4677
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
41004678 * then at the next:
41014679 *
41024680 * - cond_resched() call
....@@ -4110,6 +4688,7 @@
41104688 {
41114689 struct task_struct *prev, *next;
41124690 unsigned long *switch_count;
4691
+ unsigned long prev_state;
41134692 struct rq_flags rf;
41144693 struct rq *rq;
41154694 int cpu;
....@@ -4118,7 +4697,7 @@
41184697 rq = cpu_rq(cpu);
41194698 prev = rq->curr;
41204699
4121
- schedule_debug(prev);
4700
+ schedule_debug(prev, preempt);
41224701
41234702 if (sched_feat(HRTICK))
41244703 hrtick_clear(rq);
....@@ -4129,28 +4708,59 @@
41294708 /*
41304709 * Make sure that signal_pending_state()->signal_pending() below
41314710 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4132
- * done by the caller to avoid the race with signal_wake_up().
4711
+ * done by the caller to avoid the race with signal_wake_up():
41334712 *
4134
- * The membarrier system call requires a full memory barrier
4713
+ * __set_current_state(@state) signal_wake_up()
4714
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4715
+ * wake_up_state(p, state)
4716
+ * LOCK rq->lock LOCK p->pi_state
4717
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
4718
+ * if (signal_pending_state()) if (p->state & @state)
4719
+ *
4720
+ * Also, the membarrier system call requires a full memory barrier
41354721 * after coming from user-space, before storing to rq->curr.
41364722 */
41374723 rq_lock(rq, &rf);
41384724 smp_mb__after_spinlock();
4139
-
4140
- if (__migrate_disabled(prev))
4141
- migrate_disabled_sched(prev);
41424725
41434726 /* Promote REQ to ACT */
41444727 rq->clock_update_flags <<= 1;
41454728 update_rq_clock(rq);
41464729
41474730 switch_count = &prev->nivcsw;
4148
- if (!preempt && prev->state) {
4149
- if (unlikely(signal_pending_state(prev->state, prev))) {
4731
+
4732
+ /*
4733
+ * We must load prev->state once (task_struct::state is volatile), such
4734
+ * that:
4735
+ *
4736
+ * - we form a control dependency vs deactivate_task() below.
4737
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4738
+ */
4739
+ prev_state = prev->state;
4740
+ if (!preempt && prev_state) {
4741
+ if (signal_pending_state(prev_state, prev)) {
41504742 prev->state = TASK_RUNNING;
41514743 } else {
4744
+ prev->sched_contributes_to_load =
4745
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
4746
+ !(prev_state & TASK_NOLOAD) &&
4747
+ !(prev->flags & PF_FROZEN);
4748
+
4749
+ if (prev->sched_contributes_to_load)
4750
+ rq->nr_uninterruptible++;
4751
+
4752
+ /*
4753
+ * __schedule() ttwu()
4754
+ * prev_state = prev->state; if (p->on_rq && ...)
4755
+ * if (prev_state) goto out;
4756
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4757
+ * p->state = TASK_WAKING
4758
+ *
4759
+ * Where __schedule() and ttwu() have matching control dependencies.
4760
+ *
4761
+ * After this, schedule() must not care about p->state any more.
4762
+ */
41524763 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4153
- prev->on_rq = 0;
41544764
41554765 if (prev->in_iowait) {
41564766 atomic_inc(&rq->nr_iowait);
....@@ -4162,12 +4772,16 @@
41624772
41634773 next = pick_next_task(rq, prev, &rf);
41644774 clear_tsk_need_resched(prev);
4165
- clear_tsk_need_resched_lazy(prev);
41664775 clear_preempt_need_resched();
41674776
4777
+ trace_android_rvh_schedule(prev, next, rq);
41684778 if (likely(prev != next)) {
41694779 rq->nr_switches++;
4170
- rq->curr = next;
4780
+ /*
4781
+ * RCU users of rcu_dereference(rq->curr) may not see
4782
+ * changes to task_struct made by pick_next_task().
4783
+ */
4784
+ RCU_INIT_POINTER(rq->curr, next);
41714785 /*
41724786 * The membarrier system call requires each architecture
41734787 * to have a full memory barrier after updating
....@@ -4183,6 +4797,8 @@
41834797 * is a RELEASE barrier),
41844798 */
41854799 ++*switch_count;
4800
+
4801
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
41864802
41874803 trace_sched_switch(preempt, prev, next);
41884804
....@@ -4214,19 +4830,26 @@
42144830
42154831 static inline void sched_submit_work(struct task_struct *tsk)
42164832 {
4833
+ unsigned int task_flags;
4834
+
42174835 if (!tsk->state)
42184836 return;
42194837
4838
+ task_flags = tsk->flags;
42204839 /*
42214840 * If a worker went to sleep, notify and ask workqueue whether
42224841 * it wants to wake up a task to maintain concurrency.
42234842 * As this function is called inside the schedule() context,
42244843 * we disable preemption to avoid it calling schedule() again
4225
- * in the possible wakeup of a kworker.
4844
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
4845
+ * requires it.
42264846 */
4227
- if (tsk->flags & PF_WQ_WORKER) {
4847
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
42284848 preempt_disable();
4229
- wq_worker_sleeping(tsk);
4849
+ if (task_flags & PF_WQ_WORKER)
4850
+ wq_worker_sleeping(tsk);
4851
+ else
4852
+ io_wq_worker_sleeping(tsk);
42304853 preempt_enable_no_resched();
42314854 }
42324855
....@@ -4243,8 +4866,12 @@
42434866
42444867 static void sched_update_worker(struct task_struct *tsk)
42454868 {
4246
- if (tsk->flags & PF_WQ_WORKER)
4247
- wq_worker_running(tsk);
4869
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4870
+ if (tsk->flags & PF_WQ_WORKER)
4871
+ wq_worker_running(tsk);
4872
+ else
4873
+ io_wq_worker_running(tsk);
4874
+ }
42484875 }
42494876
42504877 asmlinkage __visible void __sched schedule(void)
....@@ -4346,35 +4973,10 @@
43464973 } while (need_resched());
43474974 }
43484975
4349
-#ifdef CONFIG_PREEMPT_LAZY
4976
+#ifdef CONFIG_PREEMPTION
43504977 /*
4351
- * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
4352
- * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
4353
- * preempt_lazy_count counter >0.
4354
- */
4355
-static __always_inline int preemptible_lazy(void)
4356
-{
4357
- if (test_thread_flag(TIF_NEED_RESCHED))
4358
- return 1;
4359
- if (current_thread_info()->preempt_lazy_count)
4360
- return 0;
4361
- return 1;
4362
-}
4363
-
4364
-#else
4365
-
4366
-static inline int preemptible_lazy(void)
4367
-{
4368
- return 1;
4369
-}
4370
-
4371
-#endif
4372
-
4373
-#ifdef CONFIG_PREEMPT
4374
-/*
4375
- * this is the entry point to schedule() from in-kernel preemption
4376
- * off of preempt_enable. Kernel preemptions off return from interrupt
4377
- * occur there and call schedule directly.
4978
+ * This is the entry point to schedule() from in-kernel preemption
4979
+ * off of preempt_enable.
43784980 */
43794981 asmlinkage __visible void __sched notrace preempt_schedule(void)
43804982 {
....@@ -4384,8 +4986,7 @@
43844986 */
43854987 if (likely(!preemptible()))
43864988 return;
4387
- if (!preemptible_lazy())
4388
- return;
4989
+
43894990 preempt_schedule_common();
43904991 }
43914992 NOKPROBE_SYMBOL(preempt_schedule);
....@@ -4410,9 +5011,6 @@
44105011 enum ctx_state prev_ctx;
44115012
44125013 if (likely(!preemptible()))
4413
- return;
4414
-
4415
- if (!preemptible_lazy())
44165014 return;
44175015
44185016 do {
....@@ -4446,10 +5044,10 @@
44465044 }
44475045 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
44485046
4449
-#endif /* CONFIG_PREEMPT */
5047
+#endif /* CONFIG_PREEMPTION */
44505048
44515049 /*
4452
- * this is the entry point to schedule() from kernel preemption
5050
+ * This is the entry point to schedule() from kernel preemption
44535051 * off of irq context.
44545052 * Note, that this is called and return with irqs disabled. This will
44555053 * protect us against recursive calling from irq.
....@@ -4477,9 +5075,22 @@
44775075 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
44785076 void *key)
44795077 {
4480
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5078
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5079
+ return try_to_wake_up(curr->private, mode, wake_flags);
44815080 }
44825081 EXPORT_SYMBOL(default_wake_function);
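default_wake_function() is what a plain wait-queue entry points at; a minimal sketch of that usage follows (the queue head and the condition are made up).

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static bool demo_cond;

static void demo_wait(void)
{
	wait_queue_entry_t wq_entry;

	init_waitqueue_entry(&wq_entry, current);	/* .func = default_wake_function */
	add_wait_queue(&demo_waitq, &wq_entry);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(demo_cond))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&demo_waitq, &wq_entry);
}

/* Waker side: wake_up() walks the queue and invokes each entry's ->func. */
static void demo_signal(void)
{
	WRITE_ONCE(demo_cond, true);
	wake_up(&demo_waitq);
}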
5082
+
5083
+static void __setscheduler_prio(struct task_struct *p, int prio)
5084
+{
5085
+ if (dl_prio(prio))
5086
+ p->sched_class = &dl_sched_class;
5087
+ else if (rt_prio(prio))
5088
+ p->sched_class = &rt_sched_class;
5089
+ else
5090
+ p->sched_class = &fair_sched_class;
5091
+
5092
+ p->prio = prio;
5093
+}
44835094
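The usual caller-visible path into __setscheduler_prio() is sched_setscheduler_nocheck(); a hedged sketch of a driver promoting one of its kthreads to SCHED_FIFO is given below (the thread pointer and the priority value are arbitrary).

#include <linux/printk.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

static void demo_make_fifo(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Ends up in __sched_setscheduler() -> __setscheduler_prio(). */
	if (sched_setscheduler_nocheck(tsk, SCHED_FIFO, &sp))
		pr_warn("%s: could not switch %s to SCHED_FIFO\n",
			__func__, tsk->comm);
}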
44845095 #ifdef CONFIG_RT_MUTEXES
44855096
....@@ -4517,6 +5128,7 @@
45175128 struct rq_flags rf;
45185129 struct rq *rq;
45195130
5131
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
45205132 /* XXX used to be waiter->prio, not waiter->task->prio */
45215133 prio = __rt_effective_prio(pi_task, p->normal_prio);
45225134
....@@ -4591,31 +5203,29 @@
45915203 if (!dl_prio(p->normal_prio) ||
45925204 (pi_task && dl_prio(pi_task->prio) &&
45935205 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4594
- p->dl.dl_boosted = 1;
5206
+ p->dl.pi_se = pi_task->dl.pi_se;
45955207 queue_flag |= ENQUEUE_REPLENISH;
4596
- } else
4597
- p->dl.dl_boosted = 0;
4598
- p->sched_class = &dl_sched_class;
5208
+ } else {
5209
+ p->dl.pi_se = &p->dl;
5210
+ }
45995211 } else if (rt_prio(prio)) {
46005212 if (dl_prio(oldprio))
4601
- p->dl.dl_boosted = 0;
5213
+ p->dl.pi_se = &p->dl;
46025214 if (oldprio < prio)
46035215 queue_flag |= ENQUEUE_HEAD;
4604
- p->sched_class = &rt_sched_class;
46055216 } else {
46065217 if (dl_prio(oldprio))
4607
- p->dl.dl_boosted = 0;
5218
+ p->dl.pi_se = &p->dl;
46085219 if (rt_prio(oldprio))
46095220 p->rt.timeout = 0;
4610
- p->sched_class = &fair_sched_class;
46115221 }
46125222
4613
- p->prio = prio;
5223
+ __setscheduler_prio(p, prio);
46145224
46155225 if (queued)
46165226 enqueue_task(rq, p, queue_flag);
46175227 if (running)
4618
- set_curr_task(rq, p);
5228
+ set_next_task(rq, p);
46195229
46205230 check_class_changed(rq, p, prev_class, oldprio);
46215231 out_unlock:
....@@ -4635,12 +5245,13 @@
46355245
46365246 void set_user_nice(struct task_struct *p, long nice)
46375247 {
4638
- bool queued, running;
4639
- int old_prio, delta;
5248
+ bool queued, running, allowed = false;
5249
+ int old_prio;
46405250 struct rq_flags rf;
46415251 struct rq *rq;
46425252
4643
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5253
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5254
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
46445255 return;
46455256 /*
46465257 * We have to be careful, if called from sys_setpriority(),
....@@ -4667,22 +5278,21 @@
46675278 put_prev_task(rq, p);
46685279
46695280 p->static_prio = NICE_TO_PRIO(nice);
4670
- set_load_weight(p, true);
5281
+ set_load_weight(p);
46715282 old_prio = p->prio;
46725283 p->prio = effective_prio(p);
4673
- delta = p->prio - old_prio;
46745284
4675
- if (queued) {
5285
+ if (queued)
46765286 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4677
- /*
4678
- * If the task increased its priority or is running and
4679
- * lowered its priority, then reschedule its CPU:
4680
- */
4681
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4682
- resched_curr(rq);
4683
- }
46845287 if (running)
4685
- set_curr_task(rq, p);
5288
+ set_next_task(rq, p);
5289
+
5290
+ /*
5291
+ * If the task increased its priority or is running and
5292
+ * lowered its priority, then reschedule its CPU:
5293
+ */
5294
+ p->sched_class->prio_changed(rq, p, old_prio);
5295
+
46865296 out_unlock:
46875297 task_rq_unlock(rq, p, &rf);
46885298 }
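set_user_nice() is the in-kernel analogue of nice(2); the usual pattern is a kthread lowering its own priority right after it starts, as in the illustrative sketch below.

#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_background_thread(void *unused)
{
	set_user_nice(current, MAX_NICE);	/* nice 19: lowest CFS weight */

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}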
....@@ -4767,7 +5377,7 @@
47675377 return 0;
47685378
47695379 #ifdef CONFIG_SMP
4770
- if (!llist_empty(&rq->wake_list))
5380
+ if (rq->ttwu_pending)
47715381 return 0;
47725382 #endif
47735383
....@@ -4790,6 +5400,7 @@
47905400
47915401 return 1;
47925402 }
5403
+EXPORT_SYMBOL_GPL(available_idle_cpu);
47935404
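The GPL export above lets modules ask the same question the scheduler's own placement code asks; the scan below is purely illustrative, not an existing in-tree user.

#include <linux/cpumask.h>
#include <linux/sched.h>

static int demo_find_idle_cpu(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		/* Idle and not already targeted by a queued remote wakeup. */
		if (available_idle_cpu(cpu))
			return cpu;
	}
	return -1;
}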
47945405 /**
47955406 * idle_task - return the idle task for a given CPU.
....@@ -4841,36 +5452,7 @@
48415452 */
48425453 p->rt_priority = attr->sched_priority;
48435454 p->normal_prio = normal_prio(p);
4844
- set_load_weight(p, true);
4845
-}
4846
-
4847
-/* Actually do priority change: must hold pi & rq lock. */
4848
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4849
- const struct sched_attr *attr, bool keep_boost)
4850
-{
4851
- /*
4852
- * If params can't change scheduling class changes aren't allowed
4853
- * either.
4854
- */
4855
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4856
- return;
4857
-
4858
- __setscheduler_params(p, attr);
4859
-
4860
- /*
4861
- * Keep a potential priority boosting if called from
4862
- * sched_setscheduler().
4863
- */
4864
- p->prio = normal_prio(p);
4865
- if (keep_boost)
4866
- p->prio = rt_effective_prio(p, p->prio);
4867
-
4868
- if (dl_prio(p->prio))
4869
- p->sched_class = &dl_sched_class;
4870
- else if (rt_prio(p->prio))
4871
- p->sched_class = &rt_sched_class;
4872
- else
4873
- p->sched_class = &fair_sched_class;
5455
+ set_load_weight(p);
48745456 }
48755457
48765458 /*
....@@ -4893,15 +5475,14 @@
48935475 const struct sched_attr *attr,
48945476 bool user, bool pi)
48955477 {
4896
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4897
- MAX_RT_PRIO - 1 - attr->sched_priority;
4898
- int retval, oldprio, oldpolicy = -1, queued, running;
4899
- int new_effective_prio, policy = attr->sched_policy;
5478
+ int oldpolicy = -1, policy = attr->sched_policy;
5479
+ int retval, oldprio, newprio, queued, running;
49005480 const struct sched_class *prev_class;
49015481 struct rq_flags rf;
49025482 int reset_on_fork;
49035483 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
49045484 struct rq *rq;
5485
+ bool cpuset_locked = false;
49055486
49065487 /* The pi code expects interrupts enabled */
49075488 BUG_ON(pi && in_interrupt());
....@@ -4969,7 +5550,7 @@
49695550 * Treat SCHED_IDLE as nice 20. Only allow a switch to
49705551 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
49715552 */
4972
- if (idle_policy(p->policy) && !idle_policy(policy)) {
5553
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
49735554 if (!can_nice(p, task_nice(p)))
49745555 return -EPERM;
49755556 }
....@@ -4980,6 +5561,10 @@
49805561
49815562 /* Normal users shall not reset the sched_reset_on_fork flag: */
49825563 if (p->sched_reset_on_fork && !reset_on_fork)
5564
+ return -EPERM;
5565
+
5566
+ /* Can't change util-clamps */
5567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
49835568 return -EPERM;
49845569 }
49855570
....@@ -5000,6 +5585,15 @@
50005585 }
50015586
50025587 /*
5588
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
5589
+ * information.
5590
+ */
5591
+ if (dl_policy(policy) || dl_policy(p->policy)) {
5592
+ cpuset_locked = true;
5593
+ cpuset_lock();
5594
+ }
5595
+
5596
+ /*
50035597 * Make sure no PI-waiters arrive (or leave) while we are
50045598 * changing the priority of the task:
50055599 *
....@@ -5013,8 +5607,8 @@
50135607 * Changing the policy of the stop threads its a very bad idea:
50145608 */
50155609 if (p == rq->stop) {
5016
- task_rq_unlock(rq, p, &rf);
5017
- return -EINVAL;
5610
+ retval = -EINVAL;
5611
+ goto unlock;
50185612 }
50195613
50205614 /*
....@@ -5032,8 +5626,8 @@
50325626 goto change;
50335627
50345628 p->sched_reset_on_fork = reset_on_fork;
5035
- task_rq_unlock(rq, p, &rf);
5036
- return 0;
5629
+ retval = 0;
5630
+ goto unlock;
50375631 }
50385632 change:
50395633
....@@ -5046,8 +5640,8 @@
50465640 if (rt_bandwidth_enabled() && rt_policy(policy) &&
50475641 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50485642 !task_group_is_autogroup(task_group(p))) {
5049
- task_rq_unlock(rq, p, &rf);
5050
- return -EPERM;
5643
+ retval = -EPERM;
5644
+ goto unlock;
50515645 }
50525646 #endif
50535647 #ifdef CONFIG_SMP
....@@ -5062,8 +5656,8 @@
50625656 */
50635657 if (!cpumask_subset(span, p->cpus_ptr) ||
50645658 rq->rd->dl_bw.bw == 0) {
5065
- task_rq_unlock(rq, p, &rf);
5066
- return -EPERM;
5659
+ retval = -EPERM;
5660
+ goto unlock;
50675661 }
50685662 }
50695663 #endif
....@@ -5073,6 +5667,8 @@
50735667 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
50745668 policy = oldpolicy = -1;
50755669 task_rq_unlock(rq, p, &rf);
5670
+ if (cpuset_locked)
5671
+ cpuset_unlock();
50765672 goto recheck;
50775673 }
50785674
....@@ -5082,13 +5678,14 @@
50825678 * is available.
50835679 */
50845680 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5085
- task_rq_unlock(rq, p, &rf);
5086
- return -EBUSY;
5681
+ retval = -EBUSY;
5682
+ goto unlock;
50875683 }
50885684
50895685 p->sched_reset_on_fork = reset_on_fork;
50905686 oldprio = p->prio;
50915687
5688
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
50925689 if (pi) {
50935690 /*
50945691 * Take priority boosted tasks into account. If the new
....@@ -5097,8 +5694,8 @@
50975694 * the runqueue. This will be done when the task deboost
50985695 * itself.
50995696 */
5100
- new_effective_prio = rt_effective_prio(p, newprio);
5101
- if (new_effective_prio == oldprio)
5697
+ newprio = rt_effective_prio(p, newprio);
5698
+ if (newprio == oldprio)
51025699 queue_flags &= ~DEQUEUE_MOVE;
51035700 }
51045701
....@@ -5111,7 +5708,11 @@
51115708
51125709 prev_class = p->sched_class;
51135710
5114
- __setscheduler(rq, p, attr, pi);
5711
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5712
+ __setscheduler_params(p, attr);
5713
+ __setscheduler_prio(p, newprio);
5714
+ trace_android_rvh_setscheduler(p);
5715
+ }
51155716 __setscheduler_uclamp(p, attr);
51165717
51175718 if (queued) {
....@@ -5125,7 +5726,7 @@
51255726 enqueue_task(rq, p, queue_flags);
51265727 }
51275728 if (running)
5128
- set_curr_task(rq, p);
5729
+ set_next_task(rq, p);
51295730
51305731 check_class_changed(rq, p, prev_class, oldprio);
51315732
....@@ -5133,14 +5734,23 @@
51335734 preempt_disable();
51345735 task_rq_unlock(rq, p, &rf);
51355736
5136
- if (pi)
5737
+ if (pi) {
5738
+ if (cpuset_locked)
5739
+ cpuset_unlock();
51375740 rt_mutex_adjust_pi(p);
5741
+ }
51385742
51395743 /* Run balance callbacks after we've adjusted the PI chain: */
51405744 balance_callback(rq);
51415745 preempt_enable();
51425746
51435747 return 0;
5748
+
5749
+unlock:
5750
+ task_rq_unlock(rq, p, &rf);
5751
+ if (cpuset_locked)
5752
+ cpuset_unlock();
5753
+ return retval;
51445754 }
51455755
51465756 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5152,6 +5762,14 @@
51525762 .sched_nice = PRIO_TO_NICE(p->static_prio),
51535763 };
51545764
5765
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
5766
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
5767
+ attr.sched_priority /= 2;
5768
+ if (!check)
5769
+ attr.sched_priority += MAX_RT_PRIO / 2;
5770
+ if (!attr.sched_priority)
5771
+ attr.sched_priority = 1;
5772
+ }
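A quick worked example of what the CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO remapping above does, assuming the stock MAX_RT_PRIO of 100 (so MAX_RT_PRIO / 2 == 50); the concrete priorities are illustrative only:

	/* check == true:  userspace request via sched_setscheduler()          */
	/*     sched_priority 99 -> 99 / 2              = 49                   */
	/*     sched_priority  1 ->  1 / 2 = 0, bumped back up to 1            */
	/* check == false: in-kernel request via sched_setscheduler_nocheck()  */
	/*     sched_priority 99 -> 99 / 2 + 50         = 99                   */
	/*     sched_priority  1 ->  1 / 2 + 50         = 50                   */

In other words, user-requested RT priorities get compressed into [1, 49] while in-kernel requests land in [50, 99]; the apparent intent is that vendor kthreads configured through the nocheck path always sit above application RT tasks.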
51555773 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
51565774 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
51575775 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
....@@ -5166,6 +5784,8 @@
51665784 * @p: the task in question.
51675785 * @policy: new policy.
51685786 * @param: structure containing the new RT priority.
5787
+ *
5788
+ * Use sched_set_fifo(), read its comment.
51695789 *
51705790 * Return: 0 on success. An error code otherwise.
51715791 *
....@@ -5188,6 +5808,7 @@
51885808 {
51895809 return __sched_setscheduler(p, attr, false, true);
51905810 }
5811
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
51915812
51925813 /**
51935814 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5208,6 +5829,51 @@
52085829 return _sched_setscheduler(p, policy, param, false);
52095830 }
52105831 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
5832
+
5833
+/*
5834
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5835
+ * incapable of resource management, which is the one thing an OS really should
5836
+ * be doing.
5837
+ *
5838
+ * This is of course the reason it is limited to privileged users only.
5839
+ *
5840
+ * Worse still; it is fundamentally impossible to compose static priority
5841
+ * workloads. You cannot take two correctly working static prio workloads
5842
+ * and smash them together and still expect them to work.
5843
+ *
5844
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5845
+ *
5846
+ * MAX_RT_PRIO / 2
5847
+ *
5848
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
5849
+ * know enough information to make a sensible choice.
5850
+ */
5851
+void sched_set_fifo(struct task_struct *p)
5852
+{
5853
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5854
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5855
+}
5856
+EXPORT_SYMBOL_GPL(sched_set_fifo);
5857
+
5858
+/*
5859
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5860
+ */
5861
+void sched_set_fifo_low(struct task_struct *p)
5862
+{
5863
+ struct sched_param sp = { .sched_priority = 1 };
5864
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5865
+}
5866
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5867
+
5868
+void sched_set_normal(struct task_struct *p, int nice)
5869
+{
5870
+ struct sched_attr attr = {
5871
+ .sched_policy = SCHED_NORMAL,
5872
+ .sched_nice = nice,
5873
+ };
5874
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5875
+}
5876
+EXPORT_SYMBOL_GPL(sched_set_normal);
52115877
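As a usage illustration for the helpers just added (hypothetical driver code, not from this tree): in-kernel users are expected to stop hard-coding RT priorities and instead ask for "FIFO, details up to the administrator" via sched_set_fifo():

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical example module code, for illustration only. */
static struct task_struct *worker;

static int my_worker_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();			/* real work elided */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int start_worker(void)
{
	worker = kthread_run(my_worker_fn, NULL, "my-rt-worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* Lands at MAX_RT_PRIO / 2, per the comment above. */
	sched_set_fifo(worker);
	return 0;
}

sched_set_fifo_low() is the variant for when "just above SCHED_NORMAL" is enough, and sched_set_normal() drops a task back to CFS at a given nice value.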
52125878 static int
52135879 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
....@@ -5239,9 +5905,6 @@
52395905 u32 size;
52405906 int ret;
52415907
5242
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5243
- return -EFAULT;
5244
-
52455908 /* Zero the full structure, so that a short copy will be nice: */
52465909 memset(attr, 0, sizeof(*attr));
52475910
....@@ -5249,44 +5912,18 @@
52495912 if (ret)
52505913 return ret;
52515914
5252
- /* Bail out on silly large: */
5253
- if (size > PAGE_SIZE)
5254
- goto err_size;
5255
-
52565915 /* ABI compatibility quirk: */
52575916 if (!size)
52585917 size = SCHED_ATTR_SIZE_VER0;
5259
-
5260
- if (size < SCHED_ATTR_SIZE_VER0)
5918
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
52615919 goto err_size;
52625920
5263
- /*
5264
- * If we're handed a bigger struct than we know of,
5265
- * ensure all the unknown bits are 0 - i.e. new
5266
- * user-space does not rely on any kernel feature
5267
- * extensions we dont know about yet.
5268
- */
5269
- if (size > sizeof(*attr)) {
5270
- unsigned char __user *addr;
5271
- unsigned char __user *end;
5272
- unsigned char val;
5273
-
5274
- addr = (void __user *)uattr + sizeof(*attr);
5275
- end = (void __user *)uattr + size;
5276
-
5277
- for (; addr < end; addr++) {
5278
- ret = get_user(val, addr);
5279
- if (ret)
5280
- return ret;
5281
- if (val)
5282
- goto err_size;
5283
- }
5284
- size = sizeof(*attr);
5921
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5922
+ if (ret) {
5923
+ if (ret == -E2BIG)
5924
+ goto err_size;
5925
+ return ret;
52855926 }
5286
-
5287
- ret = copy_from_user(attr, uattr, size);
5288
- if (ret)
5289
- return -EFAULT;
52905927
52915928 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
52925929 size < SCHED_ATTR_SIZE_VER1)
....@@ -5303,6 +5940,16 @@
53035940 err_size:
53045941 put_user(sizeof(*attr), &uattr->size);
53055942 return -E2BIG;
5943
+}
5944
+
5945
+static void get_params(struct task_struct *p, struct sched_attr *attr)
5946
+{
5947
+ if (task_has_dl_policy(p))
5948
+ __getparam_dl(p, attr);
5949
+ else if (task_has_rt_policy(p))
5950
+ attr->sched_priority = p->rt_priority;
5951
+ else
5952
+ attr->sched_nice = task_nice(p);
53065953 }
53075954
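For context on the sched_copy_attr() rewrite above: copy_struct_from_user() packages the whole "extensible uAPI struct" dance that the removed open-coded loop used to do by hand. It copies min(ksize, usize) bytes, zero-fills the kernel-side remainder when userspace passed a short struct, and rejects a larger struct with -E2BIG unless every trailing byte is zero. A minimal sketch of the same pattern for a hypothetical ioctl-style struct:

#include <linux/types.h>
#include <linux/uaccess.h>

struct foo_attr {		/* hypothetical, grows across ABI versions */
	u32 size;		/* sizeof() that userspace was built against */
	u32 flags;
	u64 value;		/* field added in a later version */
};

static int copy_foo_attr(struct foo_attr *attr,
			 const struct foo_attr __user *uattr, u32 usize)
{
	int ret;

	/*
	 * Old userspace (usize < sizeof(*attr)): missing fields read as 0.
	 * New userspace (usize > sizeof(*attr)): unknown trailing bytes must
	 * all be zero, otherwise -E2BIG tells it the kernel is too old.
	 */
	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
	if (ret)
		return ret;	/* -EFAULT or -E2BIG */

	return 0;
}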
53085955 /**
....@@ -5366,6 +6013,8 @@
53666013 rcu_read_unlock();
53676014
53686015 if (likely(p)) {
6016
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6017
+ get_params(p, &attr);
53696018 retval = sched_setattr(p, &attr);
53706019 put_task_struct(p);
53716020 }
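The SCHED_FLAG_KEEP_PARAMS handling above is what lets userspace adjust a single attribute (typically the util clamps) without re-stating policy parameters it never queried: get_params() backfills them from the task before __sched_setscheduler() runs. A hypothetical userspace sketch of that usage; the struct layout and flag values mirror include/uapi/linux/sched/types.h and include/uapi/linux/sched.h, and on this tree the caller needs CAP_SYS_NICE because of the clamp permission check added earlier in this patch:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {			/* mirrors include/uapi/linux/sched/types.h */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime, sched_deadline, sched_period;
	uint32_t sched_util_min, sched_util_max;
};

#define SCHED_FLAG_KEEP_POLICY		0x08
#define SCHED_FLAG_KEEP_PARAMS		0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS |
			   SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = 512;	/* ask for ~50% of CPU capacity */

	/* pid 0 == calling thread; policy and RT/DL params stay untouched */
	return syscall(__NR_sched_setattr, 0, &attr, 0);
}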
....@@ -5459,7 +6108,7 @@
54596108 {
54606109 unsigned int ksize = sizeof(*kattr);
54616110
5462
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6111
+ if (!access_ok(uattr, usize))
54636112 return -EFAULT;
54646113
54656114 /*
....@@ -5487,7 +6136,7 @@
54876136 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
54886137 * @pid: the pid in question.
54896138 * @uattr: structure containing the extended parameters.
5490
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6139
+ * @usize: sizeof(attr) for fwd/bwd comp.
54916140 * @flags: for future extension.
54926141 */
54936142 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5514,14 +6163,15 @@
55146163 kattr.sched_policy = p->policy;
55156164 if (p->sched_reset_on_fork)
55166165 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5517
- if (task_has_dl_policy(p))
5518
- __getparam_dl(p, &kattr);
5519
- else if (task_has_rt_policy(p))
5520
- kattr.sched_priority = p->rt_priority;
5521
- else
5522
- kattr.sched_nice = task_nice(p);
6166
+ get_params(p, &kattr);
6167
+ kattr.sched_flags &= SCHED_FLAG_ALL;
55236168
55246169 #ifdef CONFIG_UCLAMP_TASK
6170
+ /*
6171
+ * This could race with another potential updater, but this is fine
6172
+ * because it'll correctly read the old or the new value. We don't need
6173
+ * to guarantee who wins the race as long as it doesn't return garbage.
6174
+ */
55256175 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
55266176 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
55276177 #endif
....@@ -5540,6 +6190,7 @@
55406190 cpumask_var_t cpus_allowed, new_mask;
55416191 struct task_struct *p;
55426192 int retval;
6193
+ int skip = 0;
55436194
55446195 rcu_read_lock();
55456196
....@@ -5575,6 +6226,9 @@
55756226 rcu_read_unlock();
55766227 }
55776228
6229
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6230
+ if (skip)
6231
+ goto out_free_new_mask;
55786232 retval = security_task_setscheduler(p);
55796233 if (retval)
55806234 goto out_free_new_mask;
....@@ -5615,6 +6269,9 @@
56156269 goto again;
56166270 }
56176271 }
6272
+
6273
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6274
+
56186275 out_free_new_mask:
56196276 free_cpumask_var(new_mask);
56206277 out_free_cpus_allowed:
....@@ -5623,7 +6280,6 @@
56236280 put_task_struct(p);
56246281 return retval;
56256282 }
5626
-EXPORT_SYMBOL_GPL(sched_setaffinity);
56276283
56286284 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
56296285 struct cpumask *new_mask)
....@@ -5707,14 +6363,14 @@
57076363 if (len & (sizeof(unsigned long)-1))
57086364 return -EINVAL;
57096365
5710
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6366
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
57116367 return -ENOMEM;
57126368
57136369 ret = sched_getaffinity(pid, mask);
57146370 if (ret == 0) {
57156371 unsigned int retlen = min(len, cpumask_size());
57166372
5717
- if (copy_to_user(user_mask_ptr, mask, retlen))
6373
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
57186374 ret = -EFAULT;
57196375 else
57206376 ret = retlen;
....@@ -5742,6 +6398,8 @@
57426398 schedstat_inc(rq->yld_count);
57436399 current->sched_class->yield_task(rq);
57446400
6401
+ trace_android_rvh_do_sched_yield(rq);
6402
+
57456403 preempt_disable();
57466404 rq_unlock_irq(rq, &rf);
57476405 sched_preempt_enable_no_resched();
....@@ -5755,7 +6413,7 @@
57556413 return 0;
57566414 }
57576415
5758
-#ifndef CONFIG_PREEMPT
6416
+#ifndef CONFIG_PREEMPTION
57596417 int __sched _cond_resched(void)
57606418 {
57616419 if (should_resched(0)) {
....@@ -5772,7 +6430,7 @@
57726430 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
57736431 * call schedule, and on return reacquire the lock.
57746432 *
5775
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6433
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
57766434 * operations here to prevent schedule() from being called twice (once via
57776435 * spin_unlock(), once by hand).
57786436 */
....@@ -5876,7 +6534,7 @@
58766534 if (task_running(p_rq, p) || p->state)
58776535 goto out_unlock;
58786536
5879
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
6537
+ yielded = curr->sched_class->yield_to_task(rq, p);
58806538 if (yielded) {
58816539 schedstat_inc(rq->yld_count);
58826540 /*
....@@ -6042,7 +6700,7 @@
60426700 * an error code.
60436701 */
60446702 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6045
- struct timespec __user *, interval)
6703
+ struct __kernel_timespec __user *, interval)
60466704 {
60476705 struct timespec64 t;
60486706 int retval = sched_rr_get_interval(pid, &t);
....@@ -6053,16 +6711,15 @@
60536711 return retval;
60546712 }
60556713
6056
-#ifdef CONFIG_COMPAT
6057
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
6058
- compat_pid_t, pid,
6059
- struct compat_timespec __user *, interval)
6714
+#ifdef CONFIG_COMPAT_32BIT_TIME
6715
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6716
+ struct old_timespec32 __user *, interval)
60606717 {
60616718 struct timespec64 t;
60626719 int retval = sched_rr_get_interval(pid, &t);
60636720
60646721 if (retval == 0)
6065
- retval = compat_put_timespec64(&t, interval);
6722
+ retval = put_old_timespec32(&t, interval);
60666723 return retval;
60676724 }
60686725 #endif
....@@ -6075,10 +6732,10 @@
60756732 if (!try_get_task_stack(p))
60766733 return;
60776734
6078
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
6735
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
60796736
60806737 if (p->state == TASK_RUNNING)
6081
- printk(KERN_CONT " running task ");
6738
+ pr_cont(" running task ");
60826739 #ifdef CONFIG_DEBUG_STACK_USAGE
60836740 free = stack_not_used(p);
60846741 #endif
....@@ -6087,12 +6744,13 @@
60876744 if (pid_alive(p))
60886745 ppid = task_pid_nr(rcu_dereference(p->real_parent));
60896746 rcu_read_unlock();
6090
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6091
- task_pid_nr(p), ppid,
6747
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6748
+ free, task_pid_nr(p), ppid,
60926749 (unsigned long)task_thread_info(p)->flags);
60936750
60946751 print_worker_info(KERN_INFO, p);
6095
- show_stack(p, NULL);
6752
+ trace_android_vh_sched_show_task(p);
6753
+ show_stack(p, NULL, KERN_INFO);
60966754 put_task_stack(p);
60976755 }
60986756 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6123,13 +6781,6 @@
61236781 {
61246782 struct task_struct *g, *p;
61256783
6126
-#if BITS_PER_LONG == 32
6127
- printk(KERN_INFO
6128
- " task PC stack pid father\n");
6129
-#else
6130
- printk(KERN_INFO
6131
- " task PC stack pid father\n");
6132
-#endif
61336784 rcu_read_lock();
61346785 for_each_process_thread(g, p) {
61356786 /*
....@@ -6165,7 +6816,7 @@
61656816 * NOTE: this function does not set the idle thread's NEED_RESCHED
61666817 * flag, to make booting more robust.
61676818 */
6168
-void init_idle(struct task_struct *idle, int cpu)
6819
+void __init init_idle(struct task_struct *idle, int cpu)
61696820 {
61706821 struct rq *rq = cpu_rq(cpu);
61716822 unsigned long flags;
....@@ -6178,9 +6829,6 @@
61786829 idle->state = TASK_RUNNING;
61796830 idle->se.exec_start = sched_clock();
61806831 idle->flags |= PF_IDLE;
6181
-
6182
- scs_task_reset(idle);
6183
- kasan_unpoison_task_stack(idle);
61846832
61856833 #ifdef CONFIG_SMP
61866834 /*
....@@ -6205,7 +6853,8 @@
62056853 __set_task_cpu(idle, cpu);
62066854 rcu_read_unlock();
62076855
6208
- rq->curr = rq->idle = idle;
6856
+ rq->idle = idle;
6857
+ rcu_assign_pointer(rq->curr, idle);
62096858 idle->on_rq = TASK_ON_RQ_QUEUED;
62106859 #ifdef CONFIG_SMP
62116860 idle->on_cpu = 1;
....@@ -6215,9 +6864,7 @@
62156864
62166865 /* Set the preempt count _outside_ the spinlocks! */
62176866 init_idle_preempt_count(idle, cpu);
6218
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
6219
- task_thread_info(idle)->preempt_lazy_count = 0;
6220
-#endif
6867
+
62216868 /*
62226869 * The idle tasks have their own, simple scheduling class:
62236870 */
....@@ -6244,8 +6891,7 @@
62446891 return ret;
62456892 }
62466893
6247
-int task_can_attach(struct task_struct *p,
6248
- const struct cpumask *cs_cpus_allowed)
6894
+int task_can_attach(struct task_struct *p)
62496895 {
62506896 int ret = 0;
62516897
....@@ -6258,16 +6904,9 @@
62586904 * success of set_cpus_allowed_ptr() on all attached tasks
62596905 * before cpus_mask may be changed.
62606906 */
6261
- if (p->flags & PF_NO_SETAFFINITY) {
6907
+ if (p->flags & PF_NO_SETAFFINITY)
62626908 ret = -EINVAL;
6263
- goto out;
6264
- }
62656909
6266
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6267
- cs_cpus_allowed))
6268
- ret = dl_task_can_attach(p, cs_cpus_allowed);
6269
-
6270
-out:
62716910 return ret;
62726911 }
62736912
....@@ -6316,13 +6955,12 @@
63166955 if (queued)
63176956 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
63186957 if (running)
6319
- set_curr_task(rq, p);
6958
+ set_next_task(rq, p);
63206959 task_rq_unlock(rq, p, &rf);
63216960 }
63226961 #endif /* CONFIG_NUMA_BALANCING */
63236962
63246963 #ifdef CONFIG_HOTPLUG_CPU
6325
-
63266964 /*
63276965 * Ensure that the idle task is using init_mm right before its CPU goes
63286966 * offline.
....@@ -6358,21 +6996,22 @@
63586996 atomic_long_add(delta, &calc_load_tasks);
63596997 }
63606998
6361
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6999
+static struct task_struct *__pick_migrate_task(struct rq *rq)
63627000 {
7001
+ const struct sched_class *class;
7002
+ struct task_struct *next;
7003
+
7004
+ for_each_class(class) {
7005
+ next = class->pick_next_task(rq);
7006
+ if (next) {
7007
+ next->sched_class->put_prev_task(rq, next);
7008
+ return next;
7009
+ }
7010
+ }
7011
+
7012
+ /* The idle class should always have a runnable task */
7013
+ BUG();
63637014 }
6364
-
6365
-static const struct sched_class fake_sched_class = {
6366
- .put_prev_task = put_prev_task_fake,
6367
-};
6368
-
6369
-static struct task_struct fake_task = {
6370
- /*
6371
- * Avoid pull_{rt,dl}_task()
6372
- */
6373
- .prio = MAX_PRIO + 1,
6374
- .sched_class = &fake_sched_class,
6375
-};
63767015
63777016 /*
63787017 * Migrate all tasks from the rq, sleeping tasks will be migrated by
....@@ -6381,11 +7020,14 @@
63817020	 * Called with rq->lock held even though we're in stop_machine() and
63827021 * there's no concurrency possible, we hold the required locks anyway
63837022 * because of lock validation efforts.
7023
+ *
7024
+ * force: if false, the function will skip CPU-pinned kthreads.
63847025 */
6385
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
7026
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
63867027 {
63877028 struct rq *rq = dead_rq;
6388
- struct task_struct *next, *stop = rq->stop;
7029
+ struct task_struct *next, *tmp, *stop = rq->stop;
7030
+ LIST_HEAD(percpu_kthreads);
63897031 struct rq_flags orf = *rf;
63907032 int dest_cpu;
63917033
....@@ -6407,6 +7049,11 @@
64077049 */
64087050 update_rq_clock(rq);
64097051
7052
+#ifdef CONFIG_SCHED_DEBUG
7053
+ /* note the clock update in orf */
7054
+ orf.clock_update_flags |= RQCF_UPDATED;
7055
+#endif
7056
+
64107057 for (;;) {
64117058 /*
64127059 * There's this thread running, bail when that's the only
....@@ -6415,14 +7062,21 @@
64157062 if (rq->nr_running == 1)
64167063 break;
64177064
6418
- /*
6419
- * pick_next_task() assumes pinned rq->lock:
6420
- */
6421
- next = pick_next_task(rq, &fake_task, rf);
6422
- BUG_ON(!next);
6423
- put_prev_task(rq, next);
7065
+ next = __pick_migrate_task(rq);
64247066
6425
- WARN_ON_ONCE(__migrate_disabled(next));
7067
+ /*
7068
+ * Argh ... no iterator for tasks, we need to remove the
7069
+ * kthread from the run-queue to continue.
7070
+ */
7071
+ if (!force && is_per_cpu_kthread(next)) {
7072
+ INIT_LIST_HEAD(&next->percpu_kthread_node);
7073
+ list_add(&next->percpu_kthread_node, &percpu_kthreads);
7074
+
7075
+ /* DEQUEUE_SAVE not used due to move_entity in rt */
7076
+ deactivate_task(rq, next,
7077
+ DEQUEUE_NOCLOCK);
7078
+ continue;
7079
+ }
64267080
64277081 /*
64287082 * Rules for changing task_struct::cpus_mask are holding
....@@ -6442,7 +7096,14 @@
64427096 * changed the task, WARN if weird stuff happened, because in
64437097 * that case the above rq->lock drop is a fail too.
64447098 */
6445
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
7099
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7100
+ /*
7101
+ * In the !force case, there is a hole between
7102
+ * rq_unlock() and rq_relock(), where another CPU might
7103
+ * not observe an up to date cpu_active_mask and try to
7104
+ * move tasks around.
7105
+ */
7106
+ WARN_ON(force);
64467107 raw_spin_unlock(&next->pi_lock);
64477108 continue;
64487109 }
....@@ -6459,7 +7120,49 @@
64597120 raw_spin_unlock(&next->pi_lock);
64607121 }
64617122
7123
+ list_for_each_entry_safe(next, tmp, &percpu_kthreads,
7124
+ percpu_kthread_node) {
7125
+
7126
+ /* ENQUEUE_RESTORE not used due to move_entity in rt */
7127
+ activate_task(rq, next, ENQUEUE_NOCLOCK);
7128
+ list_del(&next->percpu_kthread_node);
7129
+ }
7130
+
64627131 rq->stop = stop;
7132
+}
7133
+
7134
+static int drain_rq_cpu_stop(void *data)
7135
+{
7136
+ struct rq *rq = this_rq();
7137
+ struct rq_flags rf;
7138
+
7139
+ rq_lock_irqsave(rq, &rf);
7140
+ migrate_tasks(rq, &rf, false);
7141
+ rq_unlock_irqrestore(rq, &rf);
7142
+
7143
+ return 0;
7144
+}
7145
+
7146
+int sched_cpu_drain_rq(unsigned int cpu)
7147
+{
7148
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7149
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7150
+
7151
+ if (idle_cpu(cpu)) {
7152
+ rq_drain->done = NULL;
7153
+ return 0;
7154
+ }
7155
+
7156
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7157
+ rq_drain_done);
7158
+}
7159
+
7160
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7161
+{
7162
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7163
+
7164
+ if (rq_drain->done)
7165
+ cpu_stop_work_wait(rq_drain);
64637166 }
64647167 #endif /* CONFIG_HOTPLUG_CPU */
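sched_cpu_drain_rq() / sched_cpu_drain_rq_wait() split the old "migrate everything from the dying CPU" step into an asynchronous kick plus an explicit wait, with the !force variant of migrate_tasks() parking per-CPU kthreads on a temporary list instead of moving them. The real caller is presumably the vendor CPU pause path; a hypothetical sketch of the expected calling pattern (error handling elided):

#include <linux/cpumask.h>

/* Hypothetical pause-path helper: kick the drain on every CPU first so the
 * per-CPU stopper works can run in parallel, then wait for each of them. */
static void drain_cpus(const struct cpumask *cpus)
{
	unsigned int cpu;

	for_each_cpu(cpu, cpus)
		sched_cpu_drain_rq(cpu);	/* async; no-op if the CPU is idle */

	for_each_cpu(cpu, cpus)
		sched_cpu_drain_rq_wait(cpu);	/* blocks until that drain ran */
}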
64657168
....@@ -6531,8 +7234,10 @@
65317234 static int cpuset_cpu_inactive(unsigned int cpu)
65327235 {
65337236 if (!cpuhp_tasks_frozen) {
6534
- if (dl_cpu_busy(cpu))
6535
- return -EBUSY;
7237
+ int ret = dl_bw_check_overflow(cpu);
7238
+
7239
+ if (ret)
7240
+ return ret;
65367241 cpuset_update_active_cpus();
65377242 } else {
65387243 num_cpus_frozen++;
....@@ -6581,19 +7286,27 @@
65817286 return 0;
65827287 }
65837288
6584
-int sched_cpu_deactivate(unsigned int cpu)
7289
+int sched_cpus_activate(struct cpumask *cpus)
7290
+{
7291
+ unsigned int cpu;
7292
+
7293
+ for_each_cpu(cpu, cpus) {
7294
+ if (sched_cpu_activate(cpu)) {
7295
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7296
+ sched_cpu_deactivate(cpu);
7297
+
7298
+ return -EBUSY;
7299
+ }
7300
+ }
7301
+
7302
+ return 0;
7303
+}
7304
+
7305
+int _sched_cpu_deactivate(unsigned int cpu)
65857306 {
65867307 int ret;
65877308
65887309 set_cpu_active(cpu, false);
6589
- /*
6590
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6591
- * users of this state to go away such that all new such users will
6592
- * observe it.
6593
- *
6594
- * Do sync before park smpboot threads to take care the rcu boost case.
6595
- */
6596
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
65977310
65987311 #ifdef CONFIG_SCHED_SMT
65997312 /*
....@@ -6612,6 +7325,46 @@
66127325 return ret;
66137326 }
66147327 sched_domains_numa_masks_clear(cpu);
7328
+
7329
+ update_max_interval();
7330
+
7331
+ return 0;
7332
+}
7333
+
7334
+int sched_cpu_deactivate(unsigned int cpu)
7335
+{
7336
+ int ret = _sched_cpu_deactivate(cpu);
7337
+
7338
+ if (ret)
7339
+ return ret;
7340
+
7341
+ /*
7342
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7343
+ * users of this state to go away such that all new such users will
7344
+ * observe it.
7345
+ *
7346
+ * Do sync before park smpboot threads to take care the rcu boost case.
7347
+ */
7348
+ synchronize_rcu();
7349
+
7350
+ return 0;
7351
+}
7352
+
7353
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
7354
+{
7355
+ unsigned int cpu;
7356
+
7357
+ for_each_cpu(cpu, cpus) {
7358
+ if (_sched_cpu_deactivate(cpu)) {
7359
+ for_each_cpu(cpu, cpus) {
7360
+ if (!cpu_active(cpu))
7361
+ sched_cpu_activate(cpu);
7362
+ }
7363
+
7364
+ return -EBUSY;
7365
+ }
7366
+ }
7367
+
66157368 return 0;
66167369 }
66177370
....@@ -6620,13 +7373,13 @@
66207373 struct rq *rq = cpu_rq(cpu);
66217374
66227375 rq->calc_load_update = calc_load_update;
6623
- update_max_interval();
66247376 }
66257377
66267378 int sched_cpu_starting(unsigned int cpu)
66277379 {
66287380 sched_rq_cpu_starting(cpu);
66297381 sched_tick_start(cpu);
7382
+ trace_android_rvh_sched_cpu_starting(cpu);
66307383 return 0;
66317384 }
66327385
....@@ -6637,7 +7390,6 @@
66377390 struct rq_flags rf;
66387391
66397392 /* Handle pending wakeups and then migrate everything off */
6640
- sched_ttwu_pending();
66417393 sched_tick_stop(cpu);
66427394
66437395 rq_lock_irqsave(rq, &rf);
....@@ -6645,12 +7397,13 @@
66457397 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
66467398 set_rq_offline(rq);
66477399 }
6648
- migrate_tasks(rq, &rf);
7400
+ migrate_tasks(rq, &rf, true);
66497401 BUG_ON(rq->nr_running != 1);
66507402 rq_unlock_irqrestore(rq, &rf);
66517403
7404
+ trace_android_rvh_sched_cpu_dying(cpu);
7405
+
66527406 calc_load_migrate(rq);
6653
- update_max_interval();
66547407 nohz_balance_exit_idle(rq);
66557408 hrtick_clear(rq);
66567409 return 0;
....@@ -6664,18 +7417,16 @@
66647417 /*
66657418 * There's no userspace yet to cause hotplug operations; hence all the
66667419 * CPU masks are stable and all blatant races in the below code cannot
6667
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6668
- * but there won't be any contention on it.
7420
+ * happen.
66697421 */
6670
- cpus_read_lock();
66717422 mutex_lock(&sched_domains_mutex);
66727423 sched_init_domains(cpu_active_mask);
66737424 mutex_unlock(&sched_domains_mutex);
6674
- cpus_read_unlock();
66757425
66767426 /* Move init over to a non-isolated CPU */
66777427 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
66787428 BUG();
7429
+
66797430 sched_init_granularity();
66807431
66817432 init_sched_rt_class();
....@@ -6686,7 +7437,7 @@
66867437
66877438 static int __init migration_init(void)
66887439 {
6689
- sched_rq_cpu_starting(smp_processor_id());
7440
+ sched_cpu_starting(smp_processor_id());
66907441 return 0;
66917442 }
66927443 early_initcall(migration_init);
....@@ -6711,7 +7462,9 @@
67117462 * Every task in system belongs to this group at bootup.
67127463 */
67137464 struct task_group root_task_group;
7465
+EXPORT_SYMBOL_GPL(root_task_group);
67147466 LIST_HEAD(task_groups);
7467
+EXPORT_SYMBOL_GPL(task_groups);
67157468
67167469 /* Cacheline aligned slab cache for task_group */
67177470 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6722,19 +7475,27 @@
67227475
67237476 void __init sched_init(void)
67247477 {
6725
- int i, j;
6726
- unsigned long alloc_size = 0, ptr;
7478
+ unsigned long ptr = 0;
7479
+ int i;
7480
+
7481
+ /* Make sure the linker didn't screw up */
7482
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7483
+ &fair_sched_class + 1 != &rt_sched_class ||
7484
+ &rt_sched_class + 1 != &dl_sched_class);
7485
+#ifdef CONFIG_SMP
7486
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7487
+#endif
67277488
67287489 wait_bit_init();
67297490
67307491 #ifdef CONFIG_FAIR_GROUP_SCHED
6731
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7492
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67327493 #endif
67337494 #ifdef CONFIG_RT_GROUP_SCHED
6734
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7495
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67357496 #endif
6736
- if (alloc_size) {
6737
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7497
+ if (ptr) {
7498
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
67387499
67397500 #ifdef CONFIG_FAIR_GROUP_SCHED
67407501 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6743,6 +7504,8 @@
67437504 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
67447505 ptr += nr_cpu_ids * sizeof(void **);
67457506
7507
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7508
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67467509 #endif /* CONFIG_FAIR_GROUP_SCHED */
67477510 #ifdef CONFIG_RT_GROUP_SCHED
67487511 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6795,7 +7558,6 @@
67957558 init_rt_rq(&rq->rt);
67967559 init_dl_rq(&rq->dl);
67977560 #ifdef CONFIG_FAIR_GROUP_SCHED
6798
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
67997561 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
68007562 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
68017563 /*
....@@ -6817,7 +7579,6 @@
68177579 * We achieve this by letting root_task_group's tasks sit
68187580 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
68197581 */
6820
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
68217582 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
68227583 #endif /* CONFIG_FAIR_GROUP_SCHED */
68237584
....@@ -6825,10 +7586,6 @@
68257586 #ifdef CONFIG_RT_GROUP_SCHED
68267587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
68277588 #endif
6828
-
6829
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6830
- rq->cpu_load[j] = 0;
6831
-
68327589 #ifdef CONFIG_SMP
68337590 rq->sd = NULL;
68347591 rq->rd = NULL;
....@@ -6847,16 +7604,17 @@
68477604
68487605 rq_attach_root(rq, &def_root_domain);
68497606 #ifdef CONFIG_NO_HZ_COMMON
6850
- rq->last_load_update_tick = jiffies;
68517607 rq->last_blocked_load_update_tick = jiffies;
68527608 atomic_set(&rq->nohz_flags, 0);
7609
+
7610
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
68537611 #endif
68547612 #endif /* CONFIG_SMP */
68557613 hrtick_rq_init(rq);
68567614 atomic_set(&rq->nr_iowait, 0);
68577615 }
68587616
6859
- set_load_weight(&init_task, false);
7617
+ set_load_weight(&init_task);
68607618
68617619 /*
68627620 * The boot idle thread does lazy MMU switching as well:
....@@ -6891,7 +7649,7 @@
68917649 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
68927650 static inline int preempt_count_equals(int preempt_offset)
68937651 {
6894
- int nested = preempt_count() + sched_rcu_preempt_depth();
7652
+ int nested = preempt_count() + rcu_preempt_depth();
68957653
68967654 return (nested == preempt_offset);
68977655 }
....@@ -6925,7 +7683,7 @@
69257683 rcu_sleep_check();
69267684
69277685 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6928
- !is_idle_task(current)) ||
7686
+ !is_idle_task(current) && !current->non_block_count) ||
69297687 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
69307688 oops_in_progress)
69317689 return;
....@@ -6941,8 +7699,8 @@
69417699 "BUG: sleeping function called from invalid context at %s:%d\n",
69427700 file, line);
69437701 printk(KERN_ERR
6944
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6945
- in_atomic(), irqs_disabled(),
7702
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7703
+ in_atomic(), irqs_disabled(), current->non_block_count,
69467704 current->pid, current->comm);
69477705
69487706 if (task_stack_end_corrupted(current))
....@@ -6954,13 +7712,43 @@
69547712 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
69557713 && !preempt_count_equals(preempt_offset)) {
69567714 pr_err("Preemption disabled at:");
6957
- print_ip_sym(preempt_disable_ip);
6958
- pr_cont("\n");
7715
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
69597716 }
7717
+
7718
+ trace_android_rvh_schedule_bug(NULL);
7719
+
69607720 dump_stack();
69617721 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
69627722 }
69637723 EXPORT_SYMBOL(___might_sleep);
7724
+
7725
+void __cant_sleep(const char *file, int line, int preempt_offset)
7726
+{
7727
+ static unsigned long prev_jiffy;
7728
+
7729
+ if (irqs_disabled())
7730
+ return;
7731
+
7732
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7733
+ return;
7734
+
7735
+ if (preempt_count() > preempt_offset)
7736
+ return;
7737
+
7738
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7739
+ return;
7740
+ prev_jiffy = jiffies;
7741
+
7742
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7743
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7744
+ in_atomic(), irqs_disabled(),
7745
+ current->pid, current->comm);
7746
+
7747
+ debug_show_held_locks(current);
7748
+ dump_stack();
7749
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7750
+}
7751
+EXPORT_SYMBOL_GPL(__cant_sleep);
69647752 #endif
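__cant_sleep() backs the cant_sleep() annotation (the inverse of might_sleep()): it fires when code that is documented as running in atomic context is in fact entered preemptible, which is why it bails out quietly when IRQs are off or preempt_count() is already elevated. A small hypothetical use:

#include <linux/kernel.h>
#include <linux/percpu-defs.h>

static DEFINE_PER_CPU(unsigned long, my_fastpath_hits);

/* Hypothetical fast path: callers must have preemption (or IRQs) disabled.
 * cant_sleep() documents that contract and, with CONFIG_DEBUG_ATOMIC_SLEEP,
 * enforces it at runtime via __cant_sleep(). */
static void my_fastpath_account(void)
{
	cant_sleep();

	__this_cpu_inc(my_fastpath_hits);	/* only safe while non-preemptible */
}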
69657753
69667754 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -7029,7 +7817,7 @@
70297817
70307818 #ifdef CONFIG_IA64
70317819 /**
7032
- * set_curr_task - set the current task for a given CPU.
7820
+ * ia64_set_curr_task - set the current task for a given CPU.
70337821 * @cpu: the processor in question.
70347822 * @p: the task pointer to set.
70357823 *
....@@ -7195,8 +7983,15 @@
71957983
71967984 if (queued)
71977985 enqueue_task(rq, tsk, queue_flags);
7198
- if (running)
7199
- set_curr_task(rq, tsk);
7986
+ if (running) {
7987
+ set_next_task(rq, tsk);
7988
+ /*
7989
+ * After changing group, the running task may have joined a
7990
+ * throttled one but it's still the running task. Trigger a
7991
+ * resched to make sure that task can still run.
7992
+ */
7993
+ resched_curr(rq);
7994
+ }
72007995
72017996 task_rq_unlock(rq, tsk, &rf);
72027997 }
....@@ -7235,9 +8030,14 @@
72358030
72368031 #ifdef CONFIG_UCLAMP_TASK_GROUP
72378032 /* Propagate the effective uclamp value for the new group */
8033
+ mutex_lock(&uclamp_mutex);
8034
+ rcu_read_lock();
72388035 cpu_util_update_eff(css);
8036
+ rcu_read_unlock();
8037
+ mutex_unlock(&uclamp_mutex);
72398038 #endif
72408039
8040
+ trace_android_rvh_cpu_cgroup_online(css);
72418041 return 0;
72428042 }
72438043
....@@ -7303,6 +8103,9 @@
73038103 if (ret)
73048104 break;
73058105 }
8106
+
8107
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8108
+
73068109 return ret;
73078110 }
73088111
....@@ -7313,6 +8116,8 @@
73138116
73148117 cgroup_taskset_for_each(task, css, tset)
73158118 sched_move_task(task);
8119
+
8120
+ trace_android_rvh_cpu_cgroup_attach(tset);
73168121 }
73178122
73188123 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7324,6 +8129,9 @@
73248129 unsigned int eff[UCLAMP_CNT];
73258130 enum uclamp_id clamp_id;
73268131 unsigned int clamps;
8132
+
8133
+ lockdep_assert_held(&uclamp_mutex);
8134
+ SCHED_WARN_ON(!rcu_read_lock_held());
73278135
73288136 css_for_each_descendant_pre(css, top_css) {
73298137 uc_parent = css_tg(css)->parent
....@@ -7357,7 +8165,7 @@
73578165 }
73588166
73598167 /* Immediately update descendants RUNNABLE tasks */
7360
- uclamp_update_active_tasks(css, clamps);
8168
+ uclamp_update_active_tasks(css);
73618169 }
73628170 }
73638171
....@@ -7414,6 +8222,8 @@
74148222 req = capacity_from_percent(buf);
74158223 if (req.ret)
74168224 return req.ret;
8225
+
8226
+ static_branch_enable(&sched_uclamp_used);
74178227
74188228 mutex_lock(&uclamp_mutex);
74198229 rcu_read_lock();
....@@ -7529,7 +8339,9 @@
75298339 static DEFINE_MUTEX(cfs_constraints_mutex);
75308340
75318341 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7532
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8342
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8343
+/* More than 203 days if BW_SHIFT equals 20. */
8344
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
75338345
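To make the "203 days" remark above concrete (assuming the usual definitions in kernel/sched/sched.h, where BW_SHIFT is 20 and MAX_BW is (1ULL << (64 - BW_SHIFT)) - 1, expressed in microseconds):

/*
 * MAX_BW          = 2^44 - 1 us
 * 2^44 us         = 17,592,186,044,416 us ~= 1.76e7 s ~= 203.6 days
 * max_cfs_runtime = MAX_BW * NSEC_PER_USEC  (the same bound in nanoseconds)
 *
 * The schedulability check converts quota back to microseconds and computes
 * quota_us << BW_SHIFT; keeping quota_us below 2^44 keeps that product
 * within 64 bits, which appears to be the overflow the comment below guards
 * against.
 */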
75348346 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
75358347
....@@ -7555,6 +8367,12 @@
75558367 * feasibility.
75568368 */
75578369 if (period > max_cfs_quota_period)
8370
+ return -EINVAL;
8371
+
8372
+ /*
8373
+ * Bound quota to defend quota against overflow during bandwidth shift.
8374
+ */
8375
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
75588376 return -EINVAL;
75598377
75608378 /*
....@@ -7609,7 +8427,7 @@
76098427 return ret;
76108428 }
76118429
7612
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8430
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
76138431 {
76148432 u64 quota, period;
76158433
....@@ -7624,7 +8442,7 @@
76248442 return tg_set_cfs_bandwidth(tg, period, quota);
76258443 }
76268444
7627
-long tg_get_cfs_quota(struct task_group *tg)
8445
+static long tg_get_cfs_quota(struct task_group *tg)
76288446 {
76298447 u64 quota_us;
76308448
....@@ -7637,7 +8455,7 @@
76378455 return quota_us;
76388456 }
76398457
7640
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8458
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
76418459 {
76428460 u64 quota, period;
76438461
....@@ -7650,7 +8468,7 @@
76508468 return tg_set_cfs_bandwidth(tg, period, quota);
76518469 }
76528470
7653
-long tg_get_cfs_period(struct task_group *tg)
8471
+static long tg_get_cfs_period(struct task_group *tg)
76548472 {
76558473 u64 cfs_period_us;
76568474
....@@ -8127,172 +8945,7 @@
81278945 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
81288946 };
81298947
8130
-#undef CREATE_TRACE_POINTS
8131
-
8132
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8133
-
8134
-static inline void
8135
-update_nr_migratory(struct task_struct *p, long delta)
8948
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
81368949 {
8137
- if (unlikely((p->sched_class == &rt_sched_class ||
8138
- p->sched_class == &dl_sched_class) &&
8139
- p->nr_cpus_allowed > 1)) {
8140
- if (p->sched_class == &rt_sched_class)
8141
- task_rq(p)->rt.rt_nr_migratory += delta;
8142
- else
8143
- task_rq(p)->dl.dl_nr_migratory += delta;
8144
- }
8950
+ trace_sched_update_nr_running_tp(rq, count);
81458951 }
8146
-
8147
-static inline void
8148
-migrate_disable_update_cpus_allowed(struct task_struct *p)
8149
-{
8150
- p->cpus_ptr = cpumask_of(smp_processor_id());
8151
- update_nr_migratory(p, -1);
8152
- p->nr_cpus_allowed = 1;
8153
-}
8154
-
8155
-static inline void
8156
-migrate_enable_update_cpus_allowed(struct task_struct *p)
8157
-{
8158
- struct rq *rq;
8159
- struct rq_flags rf;
8160
-
8161
- rq = task_rq_lock(p, &rf);
8162
- p->cpus_ptr = &p->cpus_mask;
8163
- p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
8164
- update_nr_migratory(p, 1);
8165
- task_rq_unlock(rq, p, &rf);
8166
-}
8167
-
8168
-void migrate_disable(void)
8169
-{
8170
- preempt_disable();
8171
-
8172
- if (++current->migrate_disable == 1) {
8173
- this_rq()->nr_pinned++;
8174
- preempt_lazy_disable();
8175
-#ifdef CONFIG_SCHED_DEBUG
8176
- WARN_ON_ONCE(current->pinned_on_cpu >= 0);
8177
- current->pinned_on_cpu = smp_processor_id();
8178
-#endif
8179
- }
8180
-
8181
- preempt_enable();
8182
-}
8183
-EXPORT_SYMBOL(migrate_disable);
8184
-
8185
-static void migrate_disabled_sched(struct task_struct *p)
8186
-{
8187
- if (p->migrate_disable_scheduled)
8188
- return;
8189
-
8190
- migrate_disable_update_cpus_allowed(p);
8191
- p->migrate_disable_scheduled = 1;
8192
-}
8193
-
8194
-static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
8195
-static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
8196
-
8197
-void migrate_enable(void)
8198
-{
8199
- struct task_struct *p = current;
8200
- struct rq *rq = this_rq();
8201
- int cpu = task_cpu(p);
8202
-
8203
- WARN_ON_ONCE(p->migrate_disable <= 0);
8204
- if (p->migrate_disable > 1) {
8205
- p->migrate_disable--;
8206
- return;
8207
- }
8208
-
8209
- preempt_disable();
8210
-
8211
-#ifdef CONFIG_SCHED_DEBUG
8212
- WARN_ON_ONCE(current->pinned_on_cpu != cpu);
8213
- current->pinned_on_cpu = -1;
8214
-#endif
8215
-
8216
- WARN_ON_ONCE(rq->nr_pinned < 1);
8217
-
8218
- p->migrate_disable = 0;
8219
- rq->nr_pinned--;
8220
-#ifdef CONFIG_HOTPLUG_CPU
8221
- if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
8222
- takedown_cpu_task)
8223
- wake_up_process(takedown_cpu_task);
8224
-#endif
8225
-
8226
- if (!p->migrate_disable_scheduled)
8227
- goto out;
8228
-
8229
- p->migrate_disable_scheduled = 0;
8230
-
8231
- migrate_enable_update_cpus_allowed(p);
8232
-
8233
- WARN_ON(smp_processor_id() != cpu);
8234
- if (!is_cpu_allowed(p, cpu)) {
8235
- struct migration_arg __percpu *arg;
8236
- struct cpu_stop_work __percpu *work;
8237
- struct rq_flags rf;
8238
-
8239
- work = this_cpu_ptr(&migrate_work);
8240
- arg = this_cpu_ptr(&migrate_arg);
8241
- WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
8242
-
8243
- arg->task = p;
8244
- arg->done = false;
8245
-
8246
- rq = task_rq_lock(p, &rf);
8247
- update_rq_clock(rq);
8248
- arg->dest_cpu = select_fallback_rq(cpu, p);
8249
- task_rq_unlock(rq, p, &rf);
8250
-
8251
- stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
8252
- arg, work);
8253
- tlb_migrate_finish(p->mm);
8254
- }
8255
-
8256
-out:
8257
- preempt_lazy_enable();
8258
- preempt_enable();
8259
-}
8260
-EXPORT_SYMBOL(migrate_enable);
8261
-
8262
-int cpu_nr_pinned(int cpu)
8263
-{
8264
- struct rq *rq = cpu_rq(cpu);
8265
-
8266
- return rq->nr_pinned;
8267
-}
8268
-
8269
-#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8270
-static void migrate_disabled_sched(struct task_struct *p)
8271
-{
8272
-}
8273
-
8274
-void migrate_disable(void)
8275
-{
8276
-#ifdef CONFIG_SCHED_DEBUG
8277
- current->migrate_disable++;
8278
-#endif
8279
- barrier();
8280
-}
8281
-EXPORT_SYMBOL(migrate_disable);
8282
-
8283
-void migrate_enable(void)
8284
-{
8285
-#ifdef CONFIG_SCHED_DEBUG
8286
- struct task_struct *p = current;
8287
-
8288
- WARN_ON_ONCE(p->migrate_disable <= 0);
8289
- p->migrate_disable--;
8290
-#endif
8291
- barrier();
8292
-}
8293
-EXPORT_SYMBOL(migrate_enable);
8294
-#else
8295
-static void migrate_disabled_sched(struct task_struct *p)
8296
-{
8297
-}
8298
-#endif