2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/kernel/sched/fair.c
....@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
....@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
....@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
....@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
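Reviewer note: the setup_sched_thermal_decay_shift() handler added above parses the sched_thermal_decay_shift= boot argument with kstrtoint() and clamps the result to [0, 10], warning and keeping 0 when parsing fails. A minimal userspace sketch of the same parse-and-clamp flow, using strtol() in place of the kernel helpers (clamp_int() below is an illustrative stand-in for clamp()):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for the kernel's clamp() macro. */
static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	const char *arg = "3";   /* as if booted with sched_thermal_decay_shift=3 */
	char *end;
	int shift = (int)strtol(arg, &end, 0);

	if (end == arg) {
		fprintf(stderr, "Unable to parse thermal pressure decay shift\n");
		shift = 0;       /* same effect as the kernel path: warn, keep 0 */
	}

	printf("sched_thermal_decay_shift = %d\n", clamp_int(shift, 0, 10));
	return 0;
}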
....@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
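The fits_capacity() macro introduced above encodes the ~20% headroom purely in integer math: utilization fits a CPU only while util * 1280 < capacity * 1024, i.e. util stays below capacity / 1.25. A self-contained demo of where the cut-off lands for a full-size CPU (the utilization values are made up):

#include <stdio.h>

/* Same margin as the hunk above: util must stay below ~80% of capacity. */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

int main(void)
{
	unsigned long capacity = 1024;  /* a full-capacity CPU */

	/* 800 * 1280 = 1024000 <  1048576 -> fits   */
	printf("util 800 fits: %d\n", fits_capacity(800UL, capacity));
	/* 820 * 1280 = 1049600 >= 1048576 -> too big */
	printf("util 820 fits: %d\n", fits_capacity(820UL, capacity));
	return 0;
}

The break-even point is 1048576 / 1280 = 819.2, so anything at or above 820 is treated as not fitting.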
....@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
....@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
....@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
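The replaced line spelled the 32x32->64 multiplication out as a cast; mul_u32_u32() names the intent and lets architectures supply an optimized version. A plain-C equivalent of what the generic helper amounts to, applied to the same kind of fact * inv_weight operands:

#include <stdint.h>
#include <stdio.h>

/* Both operands are 32-bit, the result is 64-bit, so the product cannot
 * overflow and 32-bit targets can use a single widening multiply. */
static inline uint64_t mul_u32_u32(uint32_t a, uint32_t b)
{
	return (uint64_t)a * b;
}

int main(void)
{
	uint32_t fact = 0xffffffffu;        /* worst-case scale factor */
	uint32_t inv_weight = 0xffffffffu;  /* worst-case inverse weight */

	printf("%llu\n", (unsigned long long)mul_u32_u32(fact, inv_weight));
	return 0;
}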
....@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
....@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
....@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
....@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
....@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
....@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
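With the two hunks above, sched_slice() can size the period from the root runqueue's h_nr_running instead of the local cfs_rq (ALT_PERIOD) and refuses to hand out a slice shorter than sysctl_sched_min_granularity (BASE_SLICE). A single-level sketch of that arithmetic, skipping the for_each_sched_entity() walk; sched_period() below mirrors what __sched_period() is commonly defined as, and the constants follow the defaults shown earlier in this file:

#include <stdio.h>

typedef unsigned long long u64;

static const u64 sysctl_sched_latency         = 6000000ULL;  /* 6 ms    */
static const u64 sysctl_sched_min_granularity = 750000ULL;   /* 0.75 ms */
static const unsigned int sched_nr_latency    = 8;           /* latency / min_gran */

/* Stretch the period once more tasks are runnable than fit in the
 * default latency target. */
static u64 sched_period(unsigned int nr_running)
{
	if (nr_running > sched_nr_latency)
		return (u64)nr_running * sysctl_sched_min_granularity;
	return sysctl_sched_latency;
}

/* One entity's weight share of the period, with the BASE_SLICE floor. */
static u64 slice(unsigned int nr_running, unsigned long weight,
		 unsigned long total_weight, int base_slice)
{
	u64 s = sched_period(nr_running) * weight / total_weight;

	if (base_slice && s < sysctl_sched_min_granularity)
		s = sysctl_sched_min_granularity;
	return s;
}

int main(void)
{
	/* 16 runnable tasks and a task holding 1/32 of the weight: the raw
	 * share (375 us) is lifted to the 750 us floor by BASE_SLICE. */
	printf("%llu ns\n", slice(16, 1024, 32 * 1024, 1));
	return 0;
}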
....@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
....@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
....@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
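The path above seeds a freshly created task's util_avg from its cfs_rq: half of the CPU's remaining capacity acts as the cap, and when the queue already carries utilization the seed is the task's weight share of it. A worked numeric example of that arithmetic (all inputs invented):

#include <stdio.h>

int main(void)
{
	long cpu_scale    = 1024;  /* arch_scale_cpu_capacity() of this CPU */
	long cfs_util_avg = 400;   /* utilization already on the queue      */
	long cfs_load_avg = 800;   /* load already on the queue             */
	long se_weight    = 1024;  /* new task at nice 0                    */

	/* cap = half of the spare capacity left on the CPU */
	long cap = (cpu_scale - cfs_util_avg) / 2;                      /* 312 */

	/* seed proportional to the task's share of the queue load */
	long util_avg = cfs_util_avg * se_weight / (cfs_load_avg + 1);  /* 511 */

	if (util_avg > cap)
		util_avg = cap;                         /* clamped down to 312 */

	printf("initial util_avg = %ld\n", util_avg);
	return 0;
}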
....@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
....@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
....@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
....@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
....@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
....@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
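Both scan-period helpers above stretch the period by how the numa_group's faults split between shared and private pages, scaled by how many tasks hold a reference on the group. A quick numeric illustration with invented fault counts:

#include <stdio.h>

int main(void)
{
	unsigned long base    = 1000;  /* base scan period in ms (smin/smax in the kernel) */
	unsigned long refs    = 4;     /* tasks holding a reference on the numa_group      */
	unsigned long shared  = 300;   /* faults on pages other tasks also touch           */
	unsigned long private = 100;   /* faults on pages only this task touches           */

	unsigned long period = base;
	period *= refs;                    /* more tasks in the group -> scan less often each */
	period *= shared + 1;
	period /= private + shared + 1;    /* mostly shared faults -> longer period           */

	printf("scan period = %lu ms\n", period);   /* 4000 * 301 / 401 = 3002 */
	return 0;
}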
....@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
....@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
....@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
....@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
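The block above replaces the old borrowed update_sg_lb_stats() logic: update_numa_stats() now sums cpu_load()/cpu_runnable()/cpu_util() over the node, remembers an idle CPU (preferring a fully idle SMT core), and numa_classify() buckets the node as has-spare, fully-busy, or overloaded using the load balancer's imbalance_pct. A reduced sketch of just the classification rule; the struct layout and the imbalance_pct value of 115 are illustrative, not taken from this hunk:

#include <stdio.h>

enum numa_type { node_has_spare, node_fully_busy, node_overloaded };

struct numa_stats {
	unsigned long runnable, util, compute_capacity;
	unsigned int nr_running, weight;   /* weight == number of CPUs on the node */
};

static enum numa_type numa_classify(unsigned int imbalance_pct,
				    const struct numa_stats *ns)
{
	/* More tasks than CPUs and the capacity headroom gone: overloaded. */
	if (ns->nr_running > ns->weight &&
	    ((ns->compute_capacity * 100 < ns->util * imbalance_pct) ||
	     (ns->compute_capacity * imbalance_pct < ns->runnable * 100)))
		return node_overloaded;

	/* Fewer tasks than CPUs, or plenty of headroom left: spare capacity. */
	if (ns->nr_running < ns->weight ||
	    ((ns->compute_capacity * 100 > ns->util * imbalance_pct) &&
	     (ns->compute_capacity * imbalance_pct > ns->runnable * 100)))
		return node_has_spare;

	return node_fully_busy;
}

int main(void)
{
	struct numa_stats ns = {
		.runnable = 3000, .util = 3900, .compute_capacity = 4096,
		.nr_running = 6, .weight = 4,
	};

	/* 6 tasks on 4 CPUs at ~95% utilization: prints 2 (node_overloaded). */
	printf("%d\n", numa_classify(115, &ns));
	return 0;
}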
....@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
....@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
....@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
....@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
....@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
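The two adjustments above bias swap-candidate selection with plain fault-score arithmetic: a candidate already sitting on its preferred node loses imp/16 (about 6%), while one that would be moved onto its preferred node gains imp/8 (12.5%). A tiny demonstration of the effect on a made-up score:

#include <stdio.h>

int main(void)
{
	long imp = 1000;                    /* made-up fault-score differential */

	long discouraged = imp - imp / 16;  /* already on preferred dst node: 938    */
	long encouraged  = imp + imp / 8;   /* would move to preferred src node: 1125 */

	printf("discouraged: %ld, encouraged: %ld\n", discouraged, encouraged);
	return 0;
}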
....@@ -1764,42 +1883,95 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
....@@ -1807,7 +1979,8 @@
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
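task_numa_find_cpu() now forks on the destination node's classification: with spare capacity it only checks whether moving one task would create an imbalance the load balancer would immediately undo, and the old load_too_imbalanced() arithmetic is kept for the busy case. The sketch below walks the spare-capacity branch; adjust_numa_imbalance() is not shown in this hunk, so the tolerance rule here is an explicit stand-in, not the kernel's definition:

#include <stdio.h>

/* Stand-in policy: tolerate a small imbalance while the destination still
 * runs only a couple of tasks (the real helper lives elsewhere in fair.c). */
static int adjust_numa_imbalance_sketch(int imbalance, int dst_running)
{
	return (dst_running <= 2) ? 0 : imbalance;
}

int main(void)
{
	int src_running = 5, dst_running = 1;

	/* Pretend env->p moves: the source loses a task, the destination gains one. */
	int imbalance = (dst_running + 1) - (src_running - 1);
	if (imbalance < 0)
		imbalance = 0;

	imbalance = adjust_numa_imbalance_sketch(imbalance, dst_running + 1);

	printf(imbalance ? "look for a swap candidate\n"
			 : "plain move to an idle CPU is fine\n");
	return 0;
}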
18131986
....@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
....@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
....@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
....@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
....@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
....@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
....@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
....@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
....@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
....@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
....@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
....@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
....@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
....@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
....@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
....@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
....@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
....@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
....@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
....@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
....@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
....@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
....@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
....@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
....@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
....@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
....@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
....@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
....@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
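With the force flag gone, update_tg_load_avg() only touches the shared atomic tg->load_avg once the local contribution has drifted by more than 1/64 (about 1.6%) of what was last published, which keeps the cross-CPU cacheline traffic bounded. A small numeric illustration of the filter:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long contrib = 6400;   /* tg_load_avg_contrib last pushed to the tg */
	long samples[] = { 6450, 6499, 6520 };

	for (int i = 0; i < 3; i++) {
		long delta = samples[i] - contrib;
		/* same filter as the hunk above: skip changes under contrib/64 */
		int publish = labs(delta) > contrib / 64;

		printf("load_avg %ld -> %s\n", samples[i],
		       publish ? "update tg->load_avg" : "skip");
	}
	return 0;
}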
....@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
....@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
....@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
....@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
....@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
....@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
....@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
....@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
....@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
....@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
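The new clamp above protects against rounding drift: after enough detaches, util_sum could hit zero while util_avg is still non-zero, so the sum is floored at util_avg * PELT_MIN_DIVIDER, the smallest divider any entity can see. Treating PELT_MIN_DIVIDER as LOAD_AVG_MAX - 1024 is an assumption consistent with the divider formula sketched earlier; a compact illustration:

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX     47742
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)   /* assumed definition */

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

int main(void)
{
	unsigned long util_avg = 37;
	uint32_t util_sum = 1500;   /* drifted far too low after repeated detaches */

	/* Same lower bound as the hunk: keep the sum consistent with the avg
	 * even though period_contrib may have moved between syncs. */
	util_sum = max_u32(util_sum, util_avg * PELT_MIN_DIVIDER);

	printf("util_sum floored to %u\n", util_sum);   /* 37 * 46718 = 1728566 */
	return 0;
}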
....@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
....@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
....@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
....@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
....@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,23 +3929,25 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
37433939
37443940 #ifdef CONFIG_UCLAMP_TASK
3745
-static inline unsigned long uclamp_task_util(struct task_struct *p)
3941
+static inline unsigned long uclamp_task_util(struct task_struct *p,
3942
+ unsigned long uclamp_min,
3943
+ unsigned long uclamp_max)
37463944 {
3747
- return clamp(task_util_est(p),
3748
- uclamp_eff_value(p, UCLAMP_MIN),
3749
- uclamp_eff_value(p, UCLAMP_MAX));
3945
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
37503946 }
37513947 #else
3752
-static inline unsigned long uclamp_task_util(struct task_struct *p)
3948
+static inline unsigned long uclamp_task_util(struct task_struct *p,
3949
+ unsigned long uclamp_min,
3950
+ unsigned long uclamp_max)
37533951 {
37543952 return task_util_est(p);
37553953 }
....@@ -3765,13 +3963,29 @@
37653963
37663964 /* Update root cfs_rq's estimated utilization */
37673965 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3966
+ enqueued += _task_util_est(p);
37693967 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703968
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3969
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743970 }
3971
+
3972
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3973
+ struct task_struct *p)
3974
+{
3975
+ unsigned int enqueued;
3976
+
3977
+ if (!sched_feat(UTIL_EST))
3978
+ return;
3979
+
3980
+ /* Update root cfs_rq's estimated utilization */
3981
+ enqueued = cfs_rq->avg.util_est.enqueued;
3982
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3983
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3984
+
3985
+ trace_sched_util_est_cfs_tp(cfs_rq);
3986
+}
3987
+
3988
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753989
37763990 /*
37773991 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +4000,20 @@
37864000 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37874001 }
37884002
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4003
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4004
+ struct task_struct *p,
4005
+ bool task_sleep)
37914006 {
3792
- long last_ewma_diff;
4007
+ long last_ewma_diff, last_enqueued_diff;
37934008 struct util_est ue;
3794
- int cpu;
4009
+ int ret = 0;
4010
+
4011
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4012
+ if (ret)
4013
+ return;
37954014
37964015 if (!sched_feat(UTIL_EST))
37974016 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074017
38084018 /*
38094019 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4030,13 @@
38204030 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214031 return;
38224032
4033
+ last_enqueued_diff = ue.enqueued;
4034
+
38234035 /*
38244036 * Reset EWMA on utilization increases, the moving average is used only
38254037 * to smooth utilization decreases.
38264038 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4039
+ ue.enqueued = task_util(p);
38284040 if (sched_feat(UTIL_EST_FASTUP)) {
38294041 if (ue.ewma < ue.enqueued) {
38304042 ue.ewma = ue.enqueued;
....@@ -3833,19 +4045,23 @@
38334045 }
38344046
38354047 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4048
+ * Skip update of task's estimated utilization when its members are
38374049 * already ~1% close to its last activation value.
38384050 */
38394051 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4052
+ last_enqueued_diff -= ue.enqueued;
4053
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4054
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4055
+ goto done;
4056
+
38414057 return;
4058
+ }
38424059
38434060 /*
38444061 * To avoid overestimation of actual task utilization, skip updates if
38454062 * we cannot grant there is idle time in this CPU.
38464063 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4064
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494065 return;
38504066
38514067 /*
....@@ -3869,49 +4085,166 @@
38694085 ue.ewma += last_ewma_diff;
38704086 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714087 done:
4088
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724089 WRITE_ONCE(p->se.avg.util_est, ue);
38734090
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4091
+ trace_sched_util_est_se_tp(&p->se);
38764092 }
38774093
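For reference, the estimator above reduces to a small amount of fixed-point arithmetic. A minimal model, assuming UTIL_EST_WEIGHT_SHIFT is 2 (new samples weighted 1/4) and UTIL_AVG_UNCHANGED occupies bit 0 of the enqueued value; the helper and struct names are invented for the sketch.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED    0x1  /* assumed: flag kept in bit 0 of enqueued */
#define UTIL_EST_WEIGHT_SHIFT 2    /* assumed: new sample weight w = 1/4 */

struct util_est { unsigned int enqueued, ewma; };

static void util_est_model(struct util_est *ue, unsigned int task_util)
{
    ue->enqueued = task_util;
    if (ue->ewma < ue->enqueued) {
        /* UTIL_EST_FASTUP behaviour: jump straight up on increases */
        ue->ewma = ue->enqueued;
    } else {
        /* ewma' = ewma + (sample - ewma) / 2^shift, smoothing decreases */
        long diff = (long)ue->enqueued - (long)ue->ewma;
        long scaled = ((long)ue->ewma << UTIL_EST_WEIGHT_SHIFT) + diff;

        ue->ewma = (unsigned int)(scaled >> UTIL_EST_WEIGHT_SHIFT);
    }
    ue->enqueued |= UTIL_AVG_UNCHANGED;  /* no new sample until next dequeue */
}

int main(void)
{
    struct util_est ue = { .enqueued = 0, .ewma = 400 };

    util_est_model(&ue, 200);  /* decrease: smoothed, ewma becomes 350 */
    printf("ewma=%u enqueued(masked)=%u\n",
           ue.ewma, ue.enqueued & ~UTIL_AVG_UNCHANGED);

    util_est_model(&ue, 500);  /* increase: ewma jumps straight to 500 */
    printf("ewma=%u\n", ue.ewma);
    return 0;
}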
3878
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
4094
+static inline int util_fits_cpu(unsigned long util,
4095
+ unsigned long uclamp_min,
4096
+ unsigned long uclamp_max,
4097
+ int cpu)
38794098 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
3881
-}
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
4099
+ unsigned long capacity_orig, capacity_orig_thermal;
38864100 unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
4101
+ bool fits, uclamp_max_fits;
38884102
3889
- if (capacity == max_capacity)
3890
- return true;
4103
+ /*
4104
+ * Check if the real util fits without any uclamp boost/cap applied.
4105
+ */
4106
+ fits = fits_capacity(util, capacity);
38914107
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
4108
+ if (!uclamp_is_used())
4109
+ return fits;
38944110
3895
- return task_fits_capacity(p, capacity);
4111
+ /*
4112
+ * We must use capacity_orig_of() for comparing against uclamp_min and
4113
+ * uclamp_max. We only care about capacity pressure (by using
4114
+ * capacity_of()) for comparing against the real util.
4115
+ *
4116
+ * If a task is boosted to 1024 for example, we don't want a tiny
4117
+ * pressure to skew the check whether it fits a CPU or not.
4118
+ *
4119
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
4120
+ * should fit a little cpu even if there's some pressure.
4121
+ *
4122
+ * Only exception is for thermal pressure since it has a direct impact
4123
+ * on available OPP of the system.
4124
+ *
4125
+ * We honour it for uclamp_min only as a drop in performance level
4126
+ * could result in not getting the requested minimum performance level.
4127
+ *
4128
+ * For uclamp_max, we can tolerate a drop in performance level as the
4129
+ * goal is to cap the task. So it's okay if it's getting less.
4130
+ *
4131
+ * In case of capacity inversion, which is not handled yet, we should
4132
+ * honour the inverted capacity for both uclamp_min and uclamp_max all
4133
+ * the time.
4134
+ */
4135
+ capacity_orig = capacity_orig_of(cpu);
4136
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
4137
+
4138
+ /*
4139
+ * We want to force a task to fit a cpu as implied by uclamp_max.
4140
+ * But we do have some corner cases to cater for..
4141
+ *
4142
+ *
4143
+ * C=z
4144
+ * | ___
4145
+ * | C=y | |
4146
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
4147
+ * | C=x | | | |
4148
+ * | ___ | | | |
4149
+ * | | | | | | | (util somewhere in this region)
4150
+ * | | | | | | |
4151
+ * | | | | | | |
4152
+ * +----------------------------------------
4153
+ * cpu0 cpu1 cpu2
4154
+ *
4155
+ * In the above example if a task is capped to a specific performance
4156
+ * point, y, then when:
4157
+ *
4158
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
4159
+ * to cpu1
4160
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
4161
+ * uclamp_max request.
4162
+ *
4163
+ * which is what we're enforcing here. A task always fits if
4164
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
4165
+ * the normal upmigration rules should still hold.
4166
+ *
4167
+ * Only exception is when we are on max capacity, then we need to be
4168
+ * careful not to block overutilized state. This is so because:
4169
+ *
4170
+ * 1. There's no concept of capping at max_capacity! We can't go
4171
+ * beyond this performance level anyway.
4172
+ * 2. The system is being saturated when we're operating near
4173
+ * max capacity, it doesn't make sense to block overutilized.
4174
+ */
4175
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
4176
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
4177
+ fits = fits || uclamp_max_fits;
4178
+
4179
+ /*
4180
+ *
4181
+ * C=z
4182
+ * | ___ (region a, capped, util >= uclamp_max)
4183
+ * | C=y | |
4184
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
4185
+ * | C=x | | | |
4186
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
4187
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
4188
+ * | | | | | | |
4189
+ * | | | | | | | (region c, boosted, util < uclamp_min)
4190
+ * +----------------------------------------
4191
+ * cpu0 cpu1 cpu2
4192
+ *
4193
+ * a) If util > uclamp_max, then we're capped, we don't care about
4194
+ * actual fitness value here. We only care if uclamp_max fits
4195
+ * capacity without taking margin/pressure into account.
4196
+ * See comment above.
4197
+ *
4198
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
4199
+ * fits_capacity() rules apply. Except we need to ensure that we
4200
+ * remain within uclamp_max, see comment above.
4201
+ *
4202
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
4203
+ * need to take into account the boosted value fits the CPU without
4204
+ * taking margin/pressure into account.
4205
+ *
4206
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
4207
+ * just need to consider an extra check for case (c) after ensuring we
4208
+ * handle the case uclamp_min > uclamp_max.
4209
+ */
4210
+ uclamp_min = min(uclamp_min, uclamp_max);
4211
+ if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
4212
+ fits = fits && (uclamp_min <= capacity_orig_thermal);
4213
+
4214
+ return fits;
38964215 }
3897
-#endif
4216
+
4217
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
4218
+{
4219
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
4220
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
4221
+ unsigned long util = task_util_est(p);
4222
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
4223
+}
38984224
38994225 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004226 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4227
+ bool need_update = true;
4228
+
4229
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4230
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024231 return;
39034232
3904
- if (!p) {
4233
+ if (!p || p->nr_cpus_allowed == 1) {
39054234 rq->misfit_task_load = 0;
39064235 return;
39074236 }
39084237
3909
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4238
+ if (task_fits_cpu(p, cpu_of(rq))) {
39104239 rq->misfit_task_load = 0;
39114240 return;
39124241 }
39134242
3914
- rq->misfit_task_load = task_h_load(p);
4243
+ /*
4244
+ * Make sure that misfit_task_load will not be null even if
4245
+ * task_h_load() returns 0.
4246
+ */
4247
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154248 }
39164249
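The fitting rules spelled out in the comments above can be exercised outside the kernel. A sketch under the assumptions that fits_capacity() keeps its usual ~20% headroom form (util * 1280 < capacity * 1024), that SCHED_CAPACITY_SCALE is 1024, and that thermal pressure is passed in directly instead of via arch_scale_thermal_pressure(); all names are local to the sketch.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
/* assumed ~20% margin, matching the fits_capacity() use above */
#define fits_capacity(util, cap) ((util) * 1280 < (cap) * 1024)
#define min(a, b)                ((a) < (b) ? (a) : (b))

static bool util_fits_cpu_model(unsigned long util,
                                unsigned long uclamp_min, unsigned long uclamp_max,
                                unsigned long capacity_orig, unsigned long capacity,
                                unsigned long thermal_pressure)
{
    unsigned long capacity_orig_thermal = capacity_orig - thermal_pressure;
    bool fits = fits_capacity(util, capacity);
    bool uclamp_max_fits;

    /*
     * (a)/(b): a capped task is forced to fit when uclamp_max <= capacity_orig,
     * except on a full-size CPU, where overutilized must not be masked.
     */
    uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
                      (uclamp_max == SCHED_CAPACITY_SCALE);
    uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
    fits = fits || uclamp_max_fits;

    /*
     * (c): a boosted task additionally needs uclamp_min to fit the
     * thermally reduced original capacity.
     */
    uclamp_min = min(uclamp_min, uclamp_max);
    if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
        fits = fits && (uclamp_min <= capacity_orig_thermal);

    return fits;
}

int main(void)
{
    /* little CPU of capacity 512: a task capped to 384 fits even at util 500 */
    printf("capped:  %d\n", util_fits_cpu_model(500, 0, 384, 512, 512, 0));
    /* same CPU: a task boosted to 768 does not fit even at util 100 */
    printf("boosted: %d\n", util_fits_cpu_model(100, 768, 1024, 512, 512, 0));
    return 0;
}

With a little CPU of capacity 512, the capped task is reported as fitting (case a/b) while the boosted one is rejected (case c), matching the rules documented above.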
39174250 #else /* CONFIG_SMP */
....@@ -3928,11 +4261,11 @@
39284261 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294262
39304263 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4264
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324265 static inline void
39334266 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344267
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4268
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364269 {
39374270 return 0;
39384271 }
....@@ -3941,8 +4274,11 @@
39414274 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424275
39434276 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4277
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4278
+
4279
+static inline void
4280
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4281
+ bool task_sleep) {}
39464282 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474283
39484284 #endif /* CONFIG_SMP */
....@@ -3958,6 +4294,29 @@
39584294 if (d > 3*sysctl_sched_latency)
39594295 schedstat_inc(cfs_rq->nr_spread_over);
39604296 #endif
4297
+}
4298
+
4299
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
4300
+{
4301
+ struct cfs_rq *cfs_rq;
4302
+ u64 sleep_time;
4303
+
4304
+ if (se->exec_start == 0)
4305
+ return false;
4306
+
4307
+ cfs_rq = cfs_rq_of(se);
4308
+
4309
+ sleep_time = rq_clock_task(rq_of(cfs_rq));
4310
+
4311
+ /* Can happen while migrating because of clock task divergence */
4312
+ if (sleep_time <= se->exec_start)
4313
+ return false;
4314
+
4315
+ sleep_time -= se->exec_start;
4316
+ if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
4317
+ return true;
4318
+
4319
+ return false;
39614320 }
39624321
39634322 static void
....@@ -3988,8 +4347,30 @@
39884347 vruntime -= thresh;
39894348 }
39904349
3991
- /* ensure we never gain time by being placed backwards. */
3992
- se->vruntime = max_vruntime(se->vruntime, vruntime);
4350
+ /*
4351
+ * Pull vruntime of the entity being placed to the base level of
4352
+ * cfs_rq, to prevent boosting it if placed backwards.
4353
+ * However, min_vruntime can advance much faster than real time, with
4354
+ * the extreme being when an entity with the minimal weight always runs
4355
+ * on the cfs_rq. If the waking entity slept for a long time, its
4356
+ * vruntime difference from min_vruntime may overflow s64 and their
4357
+ * comparison may get inverted, so ignore the entity's original
4358
+ * vruntime in that case.
4359
+ * The maximal vruntime speedup is given by the ratio of normal to
4360
+ * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
4361
+ * When placing a migrated waking entity, its exec_start has been set
4362
+ * from a different rq. In order to take into account a possible
4363
+ * divergence between new and prev rq's clocks task because of irq and
4364
+ * stolen time, we take an additional margin.
4365
+ * So, cutting off on the sleep time of
4366
+ * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
4367
+ * should be safe.
4368
+ */
4369
+ if (entity_is_long_sleeper(se))
4370
+ se->vruntime = vruntime;
4371
+ else
4372
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
4373
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934374 }
39944375
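A quick check of the cutoff quoted in the comment above, taking scale_load_down(NICE_0_LOAD) to be its usual value of 1024:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t nice_0_weight = 1024;  /* assumed scale_load_down(NICE_0_LOAD) */
    uint64_t cutoff_ns = (1ULL << 63) / nice_0_weight;
    double days = (double)cutoff_ns / 1e9 / 3600.0 / 24.0;

    /*
     * ~9007199254740992 ns, about 104.2 days: a sleep longer than this could
     * let vruntime - min_vruntime overflow s64, so place_entity() drops the
     * stale vruntime instead of feeding it to max_vruntime().
     */
    printf("cutoff = %llu ns (~%.1f days)\n",
           (unsigned long long)cutoff_ns, days);
    return 0;
}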
39954376 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4395,7 @@
40144395 #endif
40154396 }
40164397
4398
+static inline bool cfs_bandwidth_used(void);
40174399
40184400 /*
40194401 * MIGRATION
....@@ -4078,12 +4460,15 @@
40784460 * - Add its new weight to cfs_rq->load.weight
40794461 */
40804462 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4463
+ se_update_runnable(se);
40814464 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834465 account_entity_enqueue(cfs_rq, se);
40844466
40854467 if (flags & ENQUEUE_WAKEUP)
40864468 place_entity(cfs_rq, se, 0);
4469
+ /* Entity has migrated, no longer consider this task hot */
4470
+ if (flags & ENQUEUE_MIGRATED)
4471
+ se->exec_start = 0;
40874472
40884473 check_schedstat_required();
40894474 update_stats_enqueue(cfs_rq, se, flags);
....@@ -4092,10 +4477,16 @@
40924477 __enqueue_entity(cfs_rq, se);
40934478 se->on_rq = 1;
40944479
4095
- if (cfs_rq->nr_running == 1) {
4480
+ /*
4481
+ * When bandwidth control is enabled, cfs might have been removed
4482
+ * because of a parent being throttled but cfs->nr_running > 1. Try to
4483
+ * add it unconditionally.
4484
+ */
4485
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964486 list_add_leaf_cfs_rq(cfs_rq);
4487
+
4488
+ if (cfs_rq->nr_running == 1)
40974489 check_enqueue_throttle(cfs_rq);
4098
- }
40994490 }
41004491
41014492 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4547,13 @@
41564547 /*
41574548 * When dequeuing a sched_entity, we must:
41584549 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4550
+ * - Subtract its load from the cfs_rq->runnable_avg.
4551
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614552 * - For group entity, update its weight to reflect the new share
41624553 * of its group cfs_rq.
41634554 */
41644555 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4556
+ se_update_runnable(se);
41664557
41674558 update_stats_dequeue(cfs_rq, se, flags);
41684559
....@@ -4206,11 +4597,16 @@
42064597 unsigned long ideal_runtime, delta_exec;
42074598 struct sched_entity *se;
42084599 s64 delta;
4600
+ bool skip_preempt = false;
42094601
42104602 ideal_runtime = sched_slice(cfs_rq, curr);
42114603 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4604
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4605
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4606
+ if (skip_preempt)
4607
+ return;
42124608 if (delta_exec > ideal_runtime) {
4213
- resched_curr_lazy(rq_of(cfs_rq));
4609
+ resched_curr(rq_of(cfs_rq));
42144610 /*
42154611 * The current task ran long enough, ensure it doesn't get
42164612 * re-elected due to buddy favours.
....@@ -4234,11 +4630,10 @@
42344630 return;
42354631
42364632 if (delta > ideal_runtime)
4237
- resched_curr_lazy(rq_of(cfs_rq));
4633
+ resched_curr(rq_of(cfs_rq));
42384634 }
42394635
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4636
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424637 {
42434638 /* 'current' is not kept within the tree. */
42444639 if (se->on_rq) {
....@@ -4260,7 +4655,8 @@
42604655 * least twice that of our own weight (i.e. dont track it
42614656 * when there are only lesser-weight tasks around):
42624657 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4658
+ if (schedstat_enabled() &&
4659
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644660 schedstat_set(se->statistics.slice_max,
42654661 max((u64)schedstat_val(se->statistics.slice_max),
42664662 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4664,8 @@
42684664
42694665 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704666 }
4667
+EXPORT_SYMBOL_GPL(set_next_entity);
4668
+
42714669
42724670 static int
42734671 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4681,11 @@
42834681 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844682 {
42854683 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4684
+ struct sched_entity *se = NULL;
4685
+
4686
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4687
+ if (se)
4688
+ goto done;
42874689
42884690 /*
42894691 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4715,19 @@
43134715 se = second;
43144716 }
43154717
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4718
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4719
+ /*
4720
+ * Someone really wants this to run. If it's not unfair, run it.
4721
+ */
43264722 se = cfs_rq->next;
4723
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4724
+ /*
4725
+ * Prefer last buddy, try to return the CPU to a preempted task.
4726
+ */
4727
+ se = cfs_rq->last;
4728
+ }
43274729
4730
+done:
43284731 clear_buddies(cfs_rq, se);
43294732
43304733 return se;
....@@ -4376,7 +4779,7 @@
43764779 * validating it and just reschedule.
43774780 */
43784781 if (queued) {
4379
- resched_curr_lazy(rq_of(cfs_rq));
4782
+ resched_curr(rq_of(cfs_rq));
43804783 return;
43814784 }
43824785 /*
....@@ -4457,26 +4860,17 @@
44574860 return &tg->cfs_bandwidth;
44584861 }
44594862
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694863 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4864
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4865
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714866 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4867
+ u64 min_amount, amount = 0;
4868
+
4869
+ lockdep_assert_held(&cfs_b->lock);
44754870
44764871 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4872
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784873
4479
- raw_spin_lock(&cfs_b->lock);
44804874 if (cfs_b->quota == RUNTIME_INF)
44814875 amount = min_amount;
44824876 else {
....@@ -4488,11 +4882,23 @@
44884882 cfs_b->idle = 0;
44894883 }
44904884 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924885
44934886 cfs_rq->runtime_remaining += amount;
44944887
44954888 return cfs_rq->runtime_remaining > 0;
4889
+}
4890
+
4891
+/* returns 0 on failure to allocate runtime */
4892
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4893
+{
4894
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4895
+ int ret;
4896
+
4897
+ raw_spin_lock(&cfs_b->lock);
4898
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4899
+ raw_spin_unlock(&cfs_b->lock);
4900
+
4901
+ return ret;
44964902 }
44974903
44984904 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4510,7 +4916,7 @@
45104916 * hierarchy can be throttled
45114917 */
45124918 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4513
- resched_curr_lazy(rq_of(cfs_rq));
4919
+ resched_curr(rq_of(cfs_rq));
45144920 }
45154921
45164922 static __always_inline
....@@ -4557,9 +4963,8 @@
45574963
45584964 cfs_rq->throttle_count--;
45594965 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4966
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4967
+ cfs_rq->throttled_clock_pelt;
45634968
45644969 /* Add cfs_rq with already running entity in the list */
45654970 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4981,7 @@
45764981
45774982 /* group is entering throttled state, stop time */
45784983 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4984
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804985 list_del_leaf_cfs_rq(cfs_rq);
45814986 }
45824987 cfs_rq->throttle_count++;
....@@ -4584,13 +4989,33 @@
45844989 return 0;
45854990 }
45864991
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4992
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884993 {
45894994 struct rq *rq = rq_of(cfs_rq);
45904995 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914996 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4997
+ long task_delta, idle_task_delta, dequeue = 1;
4998
+
4999
+ raw_spin_lock(&cfs_b->lock);
5000
+ /* This will start the period timer if necessary */
5001
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5002
+ /*
5003
+ * We have raced with bandwidth becoming available, and if we
5004
+ * actually throttled the timer might not unthrottle us for an
5005
+ * entire period. We additionally needed to make sure that any
5006
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
5007
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
5008
+ * for 1ns of runtime rather than just check cfs_b.
5009
+ */
5010
+ dequeue = 0;
5011
+ } else {
5012
+ list_add_tail_rcu(&cfs_rq->throttled_list,
5013
+ &cfs_b->throttled_cfs_rq);
5014
+ }
5015
+ raw_spin_unlock(&cfs_b->lock);
5016
+
5017
+ if (!dequeue)
5018
+ return false; /* Throttle no longer required. */
45945019
45955020 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45965021
....@@ -4600,15 +5025,22 @@
46005025 rcu_read_unlock();
46015026
46025027 task_delta = cfs_rq->h_nr_running;
5028
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46035029 for_each_sched_entity(se) {
46045030 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46055031 /* throttled entity or throttle-on-deactivate */
46065032 if (!se->on_rq)
46075033 break;
46085034
4609
- if (dequeue)
5035
+ if (dequeue) {
46105036 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5037
+ } else {
5038
+ update_load_avg(qcfs_rq, se, 0);
5039
+ se_update_runnable(se);
5040
+ }
5041
+
46115042 qcfs_rq->h_nr_running -= task_delta;
5043
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46125044
46135045 if (qcfs_rq->load.weight)
46145046 dequeue = 0;
....@@ -4617,29 +5049,13 @@
46175049 if (!se)
46185050 sub_nr_running(rq, task_delta);
46195051
5052
+ /*
5053
+ * Note: distribution will already see us throttled via the
5054
+ * throttled-list. rq->lock protects completion.
5055
+ */
46205056 cfs_rq->throttled = 1;
46215057 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
5058
+ return true;
46435059 }
46445060
46455061 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +5063,7 @@
46475063 struct rq *rq = rq_of(cfs_rq);
46485064 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46495065 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
5066
+ long task_delta, idle_task_delta;
46525067
46535068 se = cfs_rq->tg->se[cpu_of(rq)];
46545069
....@@ -4668,34 +5083,70 @@
46685083 return;
46695084
46705085 task_delta = cfs_rq->h_nr_running;
5086
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46715087 for_each_sched_entity(se) {
46725088 if (se->on_rq)
4673
- enqueue = 0;
4674
-
5089
+ break;
46755090 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
5091
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46795092
5093
+ cfs_rq->h_nr_running += task_delta;
5094
+ cfs_rq->idle_h_nr_running += idle_task_delta;
5095
+
5096
+ /* end evaluation on encountering a throttled cfs_rq */
46805097 if (cfs_rq_throttled(cfs_rq))
5098
+ goto unthrottle_throttle;
5099
+ }
5100
+
5101
+ for_each_sched_entity(se) {
5102
+ cfs_rq = cfs_rq_of(se);
5103
+
5104
+ update_load_avg(cfs_rq, se, UPDATE_TG);
5105
+ se_update_runnable(se);
5106
+
5107
+ cfs_rq->h_nr_running += task_delta;
5108
+ cfs_rq->idle_h_nr_running += idle_task_delta;
5109
+
5110
+
5111
+ /* end evaluation on encountering a throttled cfs_rq */
5112
+ if (cfs_rq_throttled(cfs_rq))
5113
+ goto unthrottle_throttle;
5114
+
5115
+ /*
5116
+ * One parent has been throttled and cfs_rq removed from the
5117
+ * list. Add it back to not break the leaf list.
5118
+ */
5119
+ if (throttled_hierarchy(cfs_rq))
5120
+ list_add_leaf_cfs_rq(cfs_rq);
5121
+ }
5122
+
5123
+ /* At this point se is NULL and we are at root level*/
5124
+ add_nr_running(rq, task_delta);
5125
+
5126
+unthrottle_throttle:
5127
+ /*
5128
+ * The cfs_rq_throttled() breaks in the above iteration can result in
5129
+ * incomplete leaf list maintenance, resulting in triggering the
5130
+ * assertion below.
5131
+ */
5132
+ for_each_sched_entity(se) {
5133
+ cfs_rq = cfs_rq_of(se);
5134
+
5135
+ if (list_add_leaf_cfs_rq(cfs_rq))
46815136 break;
46825137 }
46835138
46845139 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46885140
46895141 /* Determine whether we need to wake up potentially idle CPU: */
46905142 if (rq->curr == rq->idle && rq->cfs.nr_running)
46915143 resched_curr(rq);
46925144 }
46935145
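Both throttle_cfs_rq() and unthrottle_cfs_rq() above carry the same task_delta/idle_task_delta pair into every ancestor's h_nr_running and idle_h_nr_running, stopping early at a boundary level. A toy walk of just that counter bookkeeping, with the enqueue/dequeue and PELT work omitted, the struct invented for the sketch, and the stop condition simplified to an already-throttled ancestor:

#include <stdio.h>

struct toy_cfs_rq {
    struct toy_cfs_rq *parent;  /* NULL at the root */
    int h_nr_running;           /* all hierarchical tasks */
    int idle_h_nr_running;      /* hierarchical SCHED_IDLE tasks */
    int throttled;
};

/* propagate a group's task counts up to the root, stopping early if an
 * ancestor is itself throttled (it already excludes this subtree) */
static void toy_propagate(struct toy_cfs_rq *cfs_rq, int task_delta, int idle_delta)
{
    for (; cfs_rq; cfs_rq = cfs_rq->parent) {
        cfs_rq->h_nr_running += task_delta;
        cfs_rq->idle_h_nr_running += idle_delta;
        if (cfs_rq->throttled)
            break;
    }
}

int main(void)
{
    struct toy_cfs_rq root  = { .parent = NULL,  .h_nr_running = 7, .idle_h_nr_running = 1 };
    struct toy_cfs_rq group = { .parent = &root, .h_nr_running = 3, .idle_h_nr_running = 1 };

    /* throttling "group" removes its 3 tasks (1 SCHED_IDLE) from every level above */
    toy_propagate(group.parent, -group.h_nr_running, -group.idle_h_nr_running);
    printf("root: h_nr_running=%d idle_h_nr_running=%d\n",
           root.h_nr_running, root.idle_h_nr_running);
    return 0;
}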
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
5146
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46955147 {
46965148 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
5149
+ u64 runtime, remaining = 1;
46995150
47005151 rcu_read_lock();
47015152 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4710,10 +5161,13 @@
47105161 /* By the above check, this should never be true */
47115162 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47125163
5164
+ raw_spin_lock(&cfs_b->lock);
47135165 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
5166
+ if (runtime > cfs_b->runtime)
5167
+ runtime = cfs_b->runtime;
5168
+ cfs_b->runtime -= runtime;
5169
+ remaining = cfs_b->runtime;
5170
+ raw_spin_unlock(&cfs_b->lock);
47175171
47185172 cfs_rq->runtime_remaining += runtime;
47195173
....@@ -4728,8 +5182,6 @@
47285182 break;
47295183 }
47305184 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335185 }
47345186
47355187 /*
....@@ -4740,7 +5192,6 @@
47405192 */
47415193 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425194 {
4743
- u64 runtime;
47445195 int throttled;
47455196
47465197 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5220,15 @@
47695220 cfs_b->nr_throttled += overrun;
47705221
47715222 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5223
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775224 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
5225
+ while (throttled && cfs_b->runtime > 0) {
47815226 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825227 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5228
+ distribute_cfs_runtime(cfs_b);
47845229 raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855230
4786
- cfs_b->distribute_running = 0;
47875231 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905232 }
47915233
47925234 /*
....@@ -4842,6 +5284,11 @@
48425284 if (runtime_refresh_within(cfs_b, min_left))
48435285 return;
48445286
5287
+ /* don't push forwards an existing deferred unthrottle */
5288
+ if (cfs_b->slack_started)
5289
+ return;
5290
+ cfs_b->slack_started = true;
5291
+
48455292 hrtimer_start(&cfs_b->slack_timer,
48465293 ns_to_ktime(cfs_bandwidth_slack_period),
48475294 HRTIMER_MODE_REL);
....@@ -4893,10 +5340,7 @@
48935340
48945341 /* confirm we're still not at a refresh boundary */
48955342 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4896
- if (cfs_b->distribute_running) {
4897
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4898
- return;
4899
- }
5343
+ cfs_b->slack_started = false;
49005344
49015345 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
49025346 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
....@@ -4906,26 +5350,21 @@
49065350 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49075351 runtime = cfs_b->runtime;
49085352
4909
- if (runtime)
4910
- cfs_b->distribute_running = 1;
4911
-
49125353 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49135354
49145355 if (!runtime)
49155356 return;
49165357
4917
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5358
+ distribute_cfs_runtime(cfs_b);
49185359
49195360 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4920
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4921
- cfs_b->distribute_running = 0;
49225361 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49235362 }
49245363
49255364 /*
49265365 * When a group wakes up we want to make sure that its quota is not already
49275366 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4928
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5367
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49295368 */
49305369 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49315370 {
....@@ -4960,7 +5399,7 @@
49605399 pcfs_rq = tg->parent->cfs_rq[cpu];
49615400
49625401 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4963
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5402
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49645403 }
49655404
49665405 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4979,8 +5418,7 @@
49795418 if (cfs_rq_throttled(cfs_rq))
49805419 return true;
49815420
4982
- throttle_cfs_rq(cfs_rq);
4983
- return true;
5421
+ return throttle_cfs_rq(cfs_rq);
49845422 }
49855423
49865424 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -5009,6 +5447,8 @@
50095447 overrun = hrtimer_forward_now(timer, cfs_b->period);
50105448 if (!overrun)
50115449 break;
5450
+
5451
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50125452
50135453 if (++count > 3) {
50145454 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5039,8 +5479,6 @@
50395479 /* reset count so we don't come right back in here */
50405480 count = 0;
50415481 }
5042
-
5043
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50445482 }
50455483 if (idle)
50465484 cfs_b->period_active = 0;
....@@ -5061,7 +5499,7 @@
50615499 cfs_b->period_timer.function = sched_cfs_period_timer;
50625500 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50635501 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5064
- cfs_b->distribute_running = 0;
5502
+ cfs_b->slack_started = false;
50655503 }
50665504
50675505 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5156,11 +5594,6 @@
51565594 return false;
51575595 }
51585596
5159
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5160
-{
5161
- return rq_clock_task(rq_of(cfs_rq));
5162
-}
5163
-
51645597 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51655598 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51665599 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5218,7 +5651,7 @@
52185651
52195652 if (delta < 0) {
52205653 if (rq->curr == p)
5221
- resched_curr_lazy(rq);
5654
+ resched_curr(rq);
52225655 return;
52235656 }
52245657 hrtick_start(rq, delta);
....@@ -5253,22 +5686,43 @@
52535686
52545687 #ifdef CONFIG_SMP
52555688 static inline unsigned long cpu_util(int cpu);
5256
-static unsigned long capacity_of(int cpu);
52575689
52585690 static inline bool cpu_overutilized(int cpu)
52595691 {
5260
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5692
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
5693
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
5694
+ int overutilized = -1;
5695
+
5696
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5697
+ if (overutilized != -1)
5698
+ return overutilized;
5699
+
5700
+ return !util_fits_cpu(cpu_util(cpu), rq_util_min, rq_util_max, cpu);
52615701 }
52625702
52635703 static inline void update_overutilized_status(struct rq *rq)
52645704 {
52655705 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52665706 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5267
- trace_sched_overutilized(1);
5707
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52685708 }
52695709 }
52705710 #else
52715711 static inline void update_overutilized_status(struct rq *rq) { }
5712
+#endif
5713
+
5714
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5715
+static int sched_idle_rq(struct rq *rq)
5716
+{
5717
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5718
+ rq->nr_running);
5719
+}
5720
+
5721
+#ifdef CONFIG_SMP
5722
+static int sched_idle_cpu(int cpu)
5723
+{
5724
+ return sched_idle_rq(cpu_rq(cpu));
5725
+}
52725726 #endif
52735727
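sched_idle_rq() answers "does this runqueue hold only SCHED_IDLE tasks?" in O(1) because idle_h_nr_running is maintained on every enqueue and dequeue, and dequeue_task_fair() below uses the transition into that state to pull the next balance forward. A minimal stand-alone model with invented types:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
    int nr_running;            /* every runnable task on the CPU */
    int idle_h_nr_running;     /* of those, the SCHED_IDLE ones (hierarchical) */
    unsigned long next_balance, jiffies;
};

/* non-empty and made up exclusively of SCHED_IDLE tasks */
static bool toy_sched_idle_rq(const struct toy_rq *rq)
{
    return rq->nr_running && rq->nr_running == rq->idle_h_nr_running;
}

static void toy_dequeue(struct toy_rq *rq, bool dequeued_task_is_idle)
{
    bool was_sched_idle = toy_sched_idle_rq(rq);

    rq->nr_running--;
    if (dequeued_task_is_idle)
        rq->idle_h_nr_running--;

    /* became SCHED_IDLE-only: ask for an immediate balance to pull real work */
    if (!was_sched_idle && toy_sched_idle_rq(rq))
        rq->next_balance = rq->jiffies;
}

int main(void)
{
    struct toy_rq rq = { .nr_running = 2, .idle_h_nr_running = 1,
                         .next_balance = 1000, .jiffies = 500 };

    toy_dequeue(&rq, false);  /* the last non-idle task leaves */
    printf("sched_idle=%d next_balance=%lu\n",
           toy_sched_idle_rq(&rq), rq.next_balance);
    return 0;
}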
52745728 /*
....@@ -5281,12 +5735,9 @@
52815735 {
52825736 struct cfs_rq *cfs_rq;
52835737 struct sched_entity *se = &p->se;
5738
+ int idle_h_nr_running = task_has_idle_policy(p);
52845739 int task_new = !(flags & ENQUEUE_WAKEUP);
5285
-
5286
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5287
- if (sysctl_sched_performance_bias)
5288
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5289
-#endif
5740
+ int should_iowait_boost;
52905741
52915742 /*
52925743 * The code below (indirectly) updates schedutil which looks at
....@@ -5297,29 +5748,13 @@
52975748 util_est_enqueue(&rq->cfs, p);
52985749
52995750 /*
5300
- * The code below (indirectly) updates schedutil which looks at
5301
- * the cfs_rq utilization to select a frequency.
5302
- * Let's update schedtune here to ensure the boost value of the
5303
- * current task is accounted for in the selection of the OPP.
5304
- *
5305
- * We do it also in the case where we enqueue a throttled task;
5306
- * we could argue that a throttled task should not boost a CPU,
5307
- * however:
5308
- * a) properly implementing CPU boosting considering throttled
5309
- * tasks will increase a lot the complexity of the solution
5310
- * b) it's not easy to quantify the benefits introduced by
5311
- * such a more complex solution.
5312
- * Thus, for the time being we go for the simple solution and boost
5313
- * also for throttled RQs.
5314
- */
5315
- schedtune_enqueue_task(p, cpu_of(rq));
5316
-
5317
- /*
53185751 * If in_iowait is set, the code below may not trigger any cpufreq
53195752 * utilization updates, so do it here explicitly with the IOWAIT flag
53205753 * passed.
53215754 */
5322
- if (p->in_iowait)
5755
+ should_iowait_boost = p->in_iowait;
5756
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5757
+ if (should_iowait_boost)
53235758 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53245759
53255760 for_each_sched_entity(se) {
....@@ -5328,51 +5763,60 @@
53285763 cfs_rq = cfs_rq_of(se);
53295764 enqueue_entity(cfs_rq, se, flags);
53305765
5331
- /*
5332
- * end evaluation on encountering a throttled cfs_rq
5333
- *
5334
- * note: in the case of encountering a throttled cfs_rq we will
5335
- * post the final h_nr_running increment below.
5336
- */
5337
- if (cfs_rq_throttled(cfs_rq))
5338
- break;
53395766 cfs_rq->h_nr_running++;
5767
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5768
+
5769
+ /* end evaluation on encountering a throttled cfs_rq */
5770
+ if (cfs_rq_throttled(cfs_rq))
5771
+ goto enqueue_throttle;
53405772
53415773 flags = ENQUEUE_WAKEUP;
53425774 }
53435775
5776
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53445777 for_each_sched_entity(se) {
53455778 cfs_rq = cfs_rq_of(se);
5346
- cfs_rq->h_nr_running++;
5347
-
5348
- if (cfs_rq_throttled(cfs_rq))
5349
- break;
53505779
53515780 update_load_avg(cfs_rq, se, UPDATE_TG);
5781
+ se_update_runnable(se);
53525782 update_cfs_group(se);
5783
+
5784
+ cfs_rq->h_nr_running++;
5785
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5786
+
5787
+ /* end evaluation on encountering a throttled cfs_rq */
5788
+ if (cfs_rq_throttled(cfs_rq))
5789
+ goto enqueue_throttle;
5790
+
5791
+ /*
5792
+ * One parent has been throttled and cfs_rq removed from the
5793
+ * list. Add it back to not break the leaf list.
5794
+ */
5795
+ if (throttled_hierarchy(cfs_rq))
5796
+ list_add_leaf_cfs_rq(cfs_rq);
53535797 }
53545798
5355
- if (!se) {
5356
- add_nr_running(rq, 1);
5357
- /*
5358
- * Since new tasks are assigned an initial util_avg equal to
5359
- * half of the spare capacity of their CPU, tiny tasks have the
5360
- * ability to cross the overutilized threshold, which will
5361
- * result in the load balancer ruining all the task placement
5362
- * done by EAS. As a way to mitigate that effect, do not account
5363
- * for the first enqueue operation of new tasks during the
5364
- * overutilized flag detection.
5365
- *
5366
- * A better way of solving this problem would be to wait for
5367
- * the PELT signals of tasks to converge before taking them
5368
- * into account, but that is not straightforward to implement,
5369
- * and the following generally works well enough in practice.
5370
- */
5371
- if (!task_new)
5372
- update_overutilized_status(rq);
5799
+ /* At this point se is NULL and we are at root level*/
5800
+ add_nr_running(rq, 1);
53735801
5374
- }
5802
+ /*
5803
+ * Since new tasks are assigned an initial util_avg equal to
5804
+ * half of the spare capacity of their CPU, tiny tasks have the
5805
+ * ability to cross the overutilized threshold, which will
5806
+ * result in the load balancer ruining all the task placement
5807
+ * done by EAS. As a way to mitigate that effect, do not account
5808
+ * for the first enqueue operation of new tasks during the
5809
+ * overutilized flag detection.
5810
+ *
5811
+ * A better way of solving this problem would be to wait for
5812
+ * the PELT signals of tasks to converge before taking them
5813
+ * into account, but that is not straightforward to implement,
5814
+ * and the following generally works well enough in practice.
5815
+ */
5816
+ if (!task_new)
5817
+ update_overutilized_status(rq);
53755818
5819
+enqueue_throttle:
53765820 if (cfs_bandwidth_used()) {
53775821 /*
53785822 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5405,28 +5849,21 @@
54055849 struct cfs_rq *cfs_rq;
54065850 struct sched_entity *se = &p->se;
54075851 int task_sleep = flags & DEQUEUE_SLEEP;
5852
+ int idle_h_nr_running = task_has_idle_policy(p);
5853
+ bool was_sched_idle = sched_idle_rq(rq);
54085854
5409
- /*
5410
- * The code below (indirectly) updates schedutil which looks at
5411
- * the cfs_rq utilization to select a frequency.
5412
- * Let's update schedtune here to ensure the boost value of the
5413
- * current task is not more accounted for in the selection of the OPP.
5414
- */
5415
- schedtune_dequeue_task(p, cpu_of(rq));
5855
+ util_est_dequeue(&rq->cfs, p);
54165856
54175857 for_each_sched_entity(se) {
54185858 cfs_rq = cfs_rq_of(se);
54195859 dequeue_entity(cfs_rq, se, flags);
54205860
5421
- /*
5422
- * end evaluation on encountering a throttled cfs_rq
5423
- *
5424
- * note: in the case of encountering a throttled cfs_rq we will
5425
- * post the final h_nr_running decrement below.
5426
- */
5427
- if (cfs_rq_throttled(cfs_rq))
5428
- break;
54295861 cfs_rq->h_nr_running--;
5862
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5863
+
5864
+ /* end evaluation on encountering a throttled cfs_rq */
5865
+ if (cfs_rq_throttled(cfs_rq))
5866
+ goto dequeue_throttle;
54305867
54315868 /* Don't dequeue parent if it has other entities besides us */
54325869 if (cfs_rq->load.weight) {
....@@ -5443,21 +5880,32 @@
54435880 flags |= DEQUEUE_SLEEP;
54445881 }
54455882
5883
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54465884 for_each_sched_entity(se) {
54475885 cfs_rq = cfs_rq_of(se);
5448
- cfs_rq->h_nr_running--;
5449
-
5450
- if (cfs_rq_throttled(cfs_rq))
5451
- break;
54525886
54535887 update_load_avg(cfs_rq, se, UPDATE_TG);
5888
+ se_update_runnable(se);
54545889 update_cfs_group(se);
5890
+
5891
+ cfs_rq->h_nr_running--;
5892
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5893
+
5894
+ /* end evaluation on encountering a throttled cfs_rq */
5895
+ if (cfs_rq_throttled(cfs_rq))
5896
+ goto dequeue_throttle;
5897
+
54555898 }
54565899
5457
- if (!se)
5458
- sub_nr_running(rq, 1);
5900
+ /* At this point se is NULL and we are at root level*/
5901
+ sub_nr_running(rq, 1);
54595902
5460
- util_est_dequeue(&rq->cfs, p, task_sleep);
5903
+ /* balance early to pull high priority tasks */
5904
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5905
+ rq->next_balance = jiffies;
5906
+
5907
+dequeue_throttle:
5908
+ util_est_update(&rq->cfs, p, task_sleep);
54615909 hrtick_update(rq);
54625910 }
54635911
....@@ -5468,71 +5916,6 @@
54685916 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54695917
54705918 #ifdef CONFIG_NO_HZ_COMMON
5471
-/*
5472
- * per rq 'load' arrray crap; XXX kill this.
5473
- */
5474
-
5475
-/*
5476
- * The exact cpuload calculated at every tick would be:
5477
- *
5478
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5479
- *
5480
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5481
- * called on the n+1-th tick when CPU may be busy, then we have:
5482
- *
5483
- * load_n = (1 - 1/2^i)^n * load_0
5484
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5485
- *
5486
- * decay_load_missed() below does efficient calculation of
5487
- *
5488
- * load' = (1 - 1/2^i)^n * load
5489
- *
5490
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5491
- * This allows us to precompute the above in said factors, thereby allowing the
5492
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5493
- * fixed_power_int())
5494
- *
5495
- * The calculation is approximated on a 128 point scale.
5496
- */
5497
-#define DEGRADE_SHIFT 7
5498
-
5499
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5500
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5501
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5502
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5503
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5504
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5505
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5506
-};
5507
-
5508
-/*
5509
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5510
- * would be when CPU is idle and so we just decay the old load without
5511
- * adding any new load.
5512
- */
5513
-static unsigned long
5514
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5515
-{
5516
- int j = 0;
5517
-
5518
- if (!missed_updates)
5519
- return load;
5520
-
5521
- if (missed_updates >= degrade_zero_ticks[idx])
5522
- return 0;
5523
-
5524
- if (idx == 1)
5525
- return load >> missed_updates;
5526
-
5527
- while (missed_updates) {
5528
- if (missed_updates % 2)
5529
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5530
-
5531
- missed_updates >>= 1;
5532
- j++;
5533
- }
5534
- return load;
5535
-}
55365919
55375920 static struct {
55385921 cpumask_var_t idle_cpus_mask;
....@@ -5544,249 +5927,68 @@
55445927
55455928 #endif /* CONFIG_NO_HZ_COMMON */
55465929
5547
-/**
5548
- * __cpu_load_update - update the rq->cpu_load[] statistics
5549
- * @this_rq: The rq to update statistics for
5550
- * @this_load: The current load
5551
- * @pending_updates: The number of missed updates
5552
- *
5553
- * Update rq->cpu_load[] statistics. This function is usually called every
5554
- * scheduler tick (TICK_NSEC).
5555
- *
5556
- * This function computes a decaying average:
5557
- *
5558
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5559
- *
5560
- * Because of NOHZ it might not get called on every tick which gives need for
5561
- * the @pending_updates argument.
5562
- *
5563
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5564
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5565
- * = A * (A * load[i]_n-2 + B) + B
5566
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5567
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5568
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5569
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5570
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5571
- *
5572
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5573
- * any change in load would have resulted in the tick being turned back on.
5574
- *
5575
- * For regular NOHZ, this reduces to:
5576
- *
5577
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5578
- *
5579
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5580
- * term.
5581
- */
5582
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5583
- unsigned long pending_updates)
5930
+static unsigned long cpu_load(struct rq *rq)
55845931 {
5585
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5586
- int i, scale;
5587
-
5588
- this_rq->nr_load_updates++;
5589
-
5590
- /* Update our load: */
5591
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5592
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5593
- unsigned long old_load, new_load;
5594
-
5595
- /* scale is effectively 1 << i now, and >> i divides by scale */
5596
-
5597
- old_load = this_rq->cpu_load[i];
5598
-#ifdef CONFIG_NO_HZ_COMMON
5599
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5600
- if (tickless_load) {
5601
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5602
- /*
5603
- * old_load can never be a negative value because a
5604
- * decayed tickless_load cannot be greater than the
5605
- * original tickless_load.
5606
- */
5607
- old_load += tickless_load;
5608
- }
5609
-#endif
5610
- new_load = this_load;
5611
- /*
5612
- * Round up the averaging division if load is increasing. This
5613
- * prevents us from getting stuck on 9 if the load is 10, for
5614
- * example.
5615
- */
5616
- if (new_load > old_load)
5617
- new_load += scale - 1;
5618
-
5619
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5620
- }
5621
-}
5622
-
5623
-/* Used instead of source_load when we know the type == 0 */
5624
-static unsigned long weighted_cpuload(struct rq *rq)
5625
-{
5626
- return cfs_rq_runnable_load_avg(&rq->cfs);
5627
-}
5628
-
5629
-#ifdef CONFIG_NO_HZ_COMMON
5630
-/*
5631
- * There is no sane way to deal with nohz on smp when using jiffies because the
5632
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5633
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5634
- *
5635
- * Therefore we need to avoid the delta approach from the regular tick when
5636
- * possible since that would seriously skew the load calculation. This is why we
5637
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5638
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5639
- * loop exit, nohz_idle_balance, nohz full exit...)
5640
- *
5641
- * This means we might still be one tick off for nohz periods.
5642
- */
5643
-
5644
-static void cpu_load_update_nohz(struct rq *this_rq,
5645
- unsigned long curr_jiffies,
5646
- unsigned long load)
5647
-{
5648
- unsigned long pending_updates;
5649
-
5650
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5651
- if (pending_updates) {
5652
- this_rq->last_load_update_tick = curr_jiffies;
5653
- /*
5654
- * In the regular NOHZ case, we were idle, this means load 0.
5655
- * In the NOHZ_FULL case, we were non-idle, we should consider
5656
- * its weighted load.
5657
- */
5658
- cpu_load_update(this_rq, load, pending_updates);
5659
- }
5932
+ return cfs_rq_load_avg(&rq->cfs);
56605933 }
56615934
56625935 /*
5663
- * Called from nohz_idle_balance() to update the load ratings before doing the
5664
- * idle balance.
5665
- */
5666
-static void cpu_load_update_idle(struct rq *this_rq)
5667
-{
5668
- /*
5669
- * bail if there's load or we're actually up-to-date.
5670
- */
5671
- if (weighted_cpuload(this_rq))
5672
- return;
5673
-
5674
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5675
-}
5676
-
5677
-/*
5678
- * Record CPU load on nohz entry so we know the tickless load to account
5679
- * on nohz exit. cpu_load[0] then happens to be updated more frequently
5680
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5681
- * shouldn't rely on synchronized cpu_load[*] updates.
5682
- */
5683
-void cpu_load_update_nohz_start(void)
5684
-{
5685
- struct rq *this_rq = this_rq();
5686
-
5687
- /*
5688
- * This is all lockless but should be fine. If weighted_cpuload changes
5689
- * concurrently we'll exit nohz. And cpu_load write can race with
5690
- * cpu_load_update_idle(), but both updaters would be writing the same value.
5691
- */
5692
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5693
-}
5694
-
5695
-/*
5696
- * Account the tickless load in the end of a nohz frame.
5697
- */
5698
-void cpu_load_update_nohz_stop(void)
5699
-{
5700
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5701
- struct rq *this_rq = this_rq();
5702
- unsigned long load;
5703
- struct rq_flags rf;
5704
-
5705
- if (curr_jiffies == this_rq->last_load_update_tick)
5706
- return;
5707
-
5708
- load = weighted_cpuload(this_rq);
5709
- rq_lock(this_rq, &rf);
5710
- update_rq_clock(this_rq);
5711
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5712
- rq_unlock(this_rq, &rf);
5713
-}
5714
-#else /* !CONFIG_NO_HZ_COMMON */
5715
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5716
- unsigned long curr_jiffies,
5717
- unsigned long load) { }
5718
-#endif /* CONFIG_NO_HZ_COMMON */
5719
-
5720
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5721
-{
5722
-#ifdef CONFIG_NO_HZ_COMMON
5723
- /* See the mess around cpu_load_update_nohz(). */
5724
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5725
-#endif
5726
- cpu_load_update(this_rq, load, 1);
5727
-}
5728
-
5729
-/*
5730
- * Called from scheduler_tick()
5731
- */
5732
-void cpu_load_update_active(struct rq *this_rq)
5733
-{
5734
- unsigned long load = weighted_cpuload(this_rq);
5735
-
5736
- if (tick_nohz_tick_stopped())
5737
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5738
- else
5739
- cpu_load_update_periodic(this_rq, load);
5740
-}
5741
-
5742
-/*
5743
- * Return a low guess at the load of a migration-source CPU weighted
5744
- * according to the scheduling class and "nice" value.
5936
+ * cpu_load_without - compute CPU load without any contributions from *p
5937
+ * @cpu: the CPU whose load is requested
5938
+ * @p: the task whose load should be discounted
57455939 *
5746
- * We want to under-estimate the load of migration sources, to
5747
- * balance conservatively.
5940
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5941
+ * CPU as well as tasks which are currently sleeping after an execution on that
5942
+ * CPU.
5943
+ *
5944
+ * This method returns the load of the specified CPU by discounting the load of
5945
+ * the specified task, whenever the task is currently contributing to the CPU
5946
+ * load.
57485947 */
5749
-static unsigned long source_load(int cpu, int type)
5948
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57505949 {
5751
- struct rq *rq = cpu_rq(cpu);
5752
- unsigned long total = weighted_cpuload(rq);
5950
+ struct cfs_rq *cfs_rq;
5951
+ unsigned int load;
57535952
5754
- if (type == 0 || !sched_feat(LB_BIAS))
5755
- return total;
5953
+ /* Task has no contribution or is new */
5954
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5955
+ return cpu_load(rq);
57565956
5757
- return min(rq->cpu_load[type-1], total);
5957
+ cfs_rq = &rq->cfs;
5958
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5959
+
5960
+ /* Discount task's load from CPU's load */
5961
+ lsub_positive(&load, task_h_load(p));
5962
+
5963
+ return load;
57585964 }
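lsub_positive() (defined elsewhere in this file) is an underflow-safe subtraction on a local variable; conceptually it behaves like the sketch below, so a task contributing more than the tracked load_avg drives the result to zero instead of wrapping the unsigned value:

        /* Conceptual equivalent of the clamped subtraction used above. */
        static inline unsigned long sub_clamped(unsigned long acc, unsigned long val)
        {
                return acc > val ? acc - val : 0;
        }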
57595965
5760
-/*
5761
- * Return a high guess at the load of a migration-target CPU weighted
5762
- * according to the scheduling class and "nice" value.
5763
- */
5764
-static unsigned long target_load(int cpu, int type)
5966
+static unsigned long cpu_runnable(struct rq *rq)
57655967 {
5766
- struct rq *rq = cpu_rq(cpu);
5767
- unsigned long total = weighted_cpuload(rq);
5968
+ return cfs_rq_runnable_avg(&rq->cfs);
5969
+}
57685970
5769
- if (type == 0 || !sched_feat(LB_BIAS))
5770
- return total;
5971
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5972
+{
5973
+ struct cfs_rq *cfs_rq;
5974
+ unsigned int runnable;
57715975
5772
- return max(rq->cpu_load[type-1], total);
5976
+ /* Task has no contribution or is new */
5977
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5978
+ return cpu_runnable(rq);
5979
+
5980
+ cfs_rq = &rq->cfs;
5981
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5982
+
5983
+ /* Discount task's runnable from CPU's runnable */
5984
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5985
+
5986
+ return runnable;
57735987 }
57745988
57755989 static unsigned long capacity_of(int cpu)
57765990 {
57775991 return cpu_rq(cpu)->cpu_capacity;
5778
-}
5779
-
5780
-static unsigned long cpu_avg_load_per_task(int cpu)
5781
-{
5782
- struct rq *rq = cpu_rq(cpu);
5783
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5784
- unsigned long load_avg = weighted_cpuload(rq);
5785
-
5786
- if (nr_running)
5787
- return load_avg / nr_running;
5788
-
5789
- return 0;
57905992 }
57915993
57925994 static void record_wakee(struct task_struct *p)
....@@ -5823,18 +6025,15 @@
58236025 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58246026 * socket size.
58256027 */
5826
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
6028
+static int wake_wide(struct task_struct *p)
58276029 {
58286030 unsigned int master = current->wakee_flips;
58296031 unsigned int slave = p->wakee_flips;
5830
- int llc_size = this_cpu_read(sd_llc_size);
5831
-
5832
- if (sibling_count_hint >= llc_size)
5833
- return 1;
6032
+ int factor = __this_cpu_read(sd_llc_size);
58346033
58356034 if (master < slave)
58366035 swap(master, slave);
5837
- if (slave < llc_size || master < slave * llc_size)
6036
+ if (slave < factor || master < slave * factor)
58386037 return 0;
58396038 return 1;
58406039 }
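A standalone restatement of the decision above, with hypothetical flip counts (factor is the LLC size read just before):

        /* Illustration only: mirrors the wake_wide() test on plain integers. */
        static int wake_wide_example(unsigned int master, unsigned int slave,
                                     unsigned int factor)
        {
                if (master < slave) {
                        unsigned int tmp = master;

                        master = slave;
                        slave = tmp;
                }
                /* Spread only when both flip counts look large for this LLC. */
                return !(slave < factor || master < slave * factor);
        }

With factor = 8, (master, slave) = (100, 10) returns 1 and the wakeup is spread, while (100, 4) returns 0 and the affine fast path is still considered.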
....@@ -5882,7 +6081,7 @@
58826081 s64 this_eff_load, prev_eff_load;
58836082 unsigned long task_load;
58846083
5885
- this_eff_load = target_load(this_cpu, sd->wake_idx);
6084
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58866085
58876086 if (sync) {
58886087 unsigned long current_load = task_h_load(current);
....@@ -5900,7 +6099,7 @@
59006099 this_eff_load *= 100;
59016100 this_eff_load *= capacity_of(prev_cpu);
59026101
5903
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
6102
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59046103 prev_eff_load -= task_load;
59056104 if (sched_feat(WA_BIAS))
59066105 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
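As a worked example of the WA_BIAS weighting with a hypothetical sd->imbalance_pct of 117: the waking CPU's side is scaled by 100 while prev_cpu's side is scaled by 100 + (117 - 100) / 2 = 108, so prev_cpu's effective load is inflated by roughly 8% and the comparison is biased toward pulling the task to the waking CPU.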
....@@ -5938,242 +6137,8 @@
59386137 return target;
59396138 }
59406139
5941
-#ifdef CONFIG_SCHED_TUNE
5942
-struct reciprocal_value schedtune_spc_rdiv;
5943
-
5944
-static long
5945
-schedtune_margin(unsigned long signal, long boost)
5946
-{
5947
- long long margin = 0;
5948
-
5949
- /*
5950
- * Signal proportional compensation (SPC)
5951
- *
5952
- * The Boost (B) value is used to compute a Margin (M) which is
5953
- * proportional to the complement of the original Signal (S):
5954
- * M = B * (SCHED_CAPACITY_SCALE - S)
5955
- * The obtained M could be used by the caller to "boost" S.
5956
- */
5957
- if (boost >= 0) {
5958
- margin = SCHED_CAPACITY_SCALE - signal;
5959
- margin *= boost;
5960
- } else
5961
- margin = -signal * boost;
5962
-
5963
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5964
-
5965
- if (boost < 0)
5966
- margin *= -1;
5967
- return margin;
5968
-}
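Assuming boost is a percentage and schedtune_spc_rdiv encodes a reciprocal divide by 100, a worked case of the SPC rule above: with S = 256 and B = 50, M = 50 * (1024 - 256) / 100 = 384 and the boosted signal becomes 256 + 384 = 640; a negative boost of -50 instead yields M = -(256 * 50) / 100 = -128.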
5969
-
5970
-inline long
5971
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5972
-{
5973
- int boost = schedtune_cpu_boost_with(cpu, p);
5974
- long margin;
5975
-
5976
- if (boost == 0)
5977
- margin = 0;
5978
- else
5979
- margin = schedtune_margin(util, boost);
5980
-
5981
- trace_sched_boost_cpu(cpu, util, margin);
5982
-
5983
- return margin;
5984
-}
5985
-
5986
-long schedtune_task_margin(struct task_struct *task)
5987
-{
5988
- int boost = schedtune_task_boost(task);
5989
- unsigned long util;
5990
- long margin;
5991
-
5992
- if (boost == 0)
5993
- return 0;
5994
-
5995
- util = task_util_est(task);
5996
- margin = schedtune_margin(util, boost);
5997
-
5998
- return margin;
5999
-}
6000
-
6001
-#else /* CONFIG_SCHED_TUNE */
6002
-
6003
-inline long
6004
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6005
-{
6006
- return 0;
6007
-}
6008
-
6009
-#endif /* CONFIG_SCHED_TUNE */
6010
-
6011
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6012
-
6013
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6014
-{
6015
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6016
-}
6017
-
6018
-/*
6019
- * find_idlest_group finds and returns the least busy CPU group within the
6020
- * domain.
6021
- *
6022
- * Assumes p is allowed on at least one CPU in sd.
6023
- */
60246140 static struct sched_group *
6025
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6026
- int this_cpu, int sd_flag)
6027
-{
6028
- struct sched_group *idlest = NULL, *group = sd->groups;
6029
- struct sched_group *most_spare_sg = NULL;
6030
- unsigned long min_runnable_load = ULONG_MAX;
6031
- unsigned long this_runnable_load = ULONG_MAX;
6032
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6033
- unsigned long most_spare = 0, this_spare = 0;
6034
- int load_idx = sd->forkexec_idx;
6035
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6036
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6037
- (sd->imbalance_pct-100) / 100;
6038
-
6039
- if (sd_flag & SD_BALANCE_WAKE)
6040
- load_idx = sd->wake_idx;
6041
-
6042
- do {
6043
- unsigned long load, avg_load, runnable_load;
6044
- unsigned long spare_cap, max_spare_cap;
6045
- int local_group;
6046
- int i;
6047
-
6048
- /* Skip over this group if it has no CPUs allowed */
6049
- if (!cpumask_intersects(sched_group_span(group),
6050
- p->cpus_ptr))
6051
- continue;
6052
-
6053
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6054
- if (sysctl_sched_performance_bias) {
6055
- if (!task_fits_max(p, group_first_cpu(group)))
6056
- continue;
6057
- }
6058
-#endif
6059
-
6060
- local_group = cpumask_test_cpu(this_cpu,
6061
- sched_group_span(group));
6062
-
6063
- /*
6064
- * Tally up the load of all CPUs in the group and find
6065
- * the group containing the CPU with most spare capacity.
6066
- */
6067
- avg_load = 0;
6068
- runnable_load = 0;
6069
- max_spare_cap = 0;
6070
-
6071
- for_each_cpu(i, sched_group_span(group)) {
6072
- /* Bias balancing toward CPUs of our domain */
6073
- if (local_group)
6074
- load = source_load(i, load_idx);
6075
- else
6076
- load = target_load(i, load_idx);
6077
-
6078
- runnable_load += load;
6079
-
6080
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6081
-
6082
- spare_cap = capacity_spare_without(i, p);
6083
-
6084
- if (spare_cap > max_spare_cap)
6085
- max_spare_cap = spare_cap;
6086
- }
6087
-
6088
- /* Adjust by relative CPU capacity of the group */
6089
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6092
- group->sgc->capacity;
6093
-
6094
- if (local_group) {
6095
- this_runnable_load = runnable_load;
6096
- this_avg_load = avg_load;
6097
- this_spare = max_spare_cap;
6098
- } else {
6099
- if (min_runnable_load > (runnable_load + imbalance)) {
6100
- /*
6101
- * The runnable load is significantly smaller
6102
- * so we can pick this new CPU:
6103
- */
6104
- min_runnable_load = runnable_load;
6105
- min_avg_load = avg_load;
6106
- idlest = group;
6107
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6108
- (100*min_avg_load > imbalance_scale*avg_load)) {
6109
- /*
6110
- * The runnable loads are close so take the
6111
- * blocked load into account through avg_load:
6112
- */
6113
- min_avg_load = avg_load;
6114
- idlest = group;
6115
- }
6116
-
6117
- if (most_spare < max_spare_cap) {
6118
- most_spare = max_spare_cap;
6119
- most_spare_sg = group;
6120
- }
6121
- }
6122
- } while (group = group->next, group != sd->groups);
6123
-
6124
- /*
6125
- * The cross-over point between using spare capacity or least load
6126
- * is too conservative for high utilization tasks on partially
6127
- * utilized systems if we require spare_capacity > task_util(p),
6128
- * so we allow for some task stuffing by using
6129
- * spare_capacity > task_util(p)/2.
6130
- *
6131
- * Spare capacity can't be used for fork because the utilization has
6132
- * not been set yet, we must first select a rq to compute the initial
6133
- * utilization.
6134
- */
6135
- if (sd_flag & SD_BALANCE_FORK)
6136
- goto skip_spare;
6137
-
6138
- if (this_spare > task_util(p) / 2 &&
6139
- imbalance_scale*this_spare > 100*most_spare)
6140
- return NULL;
6141
-
6142
- if (most_spare > task_util(p) / 2)
6143
- return most_spare_sg;
6144
-
6145
-skip_spare:
6146
- if (!idlest)
6147
- return NULL;
6148
-
6149
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6150
- if (sysctl_sched_performance_bias) {
6151
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6152
- return idlest;
6153
- }
6154
-#endif
6155
-
6156
- /*
6157
- * When comparing groups across NUMA domains, it's possible for the
6158
- * local domain to be very lightly loaded relative to the remote
6159
- * domains but "imbalance" skews the comparison making remote CPUs
6160
- * look much more favourable. When considering cross-domain, add
6161
- * imbalance to the runnable load on the remote node and consider
6162
- * staying local.
6163
- */
6164
- if ((sd->flags & SD_NUMA) &&
6165
- min_runnable_load + imbalance >= this_runnable_load)
6166
- return NULL;
6167
-
6168
- if (min_runnable_load > (this_runnable_load + imbalance))
6169
- return NULL;
6170
-
6171
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6172
- (100*this_avg_load < imbalance_scale*min_avg_load))
6173
- return NULL;
6174
-
6175
- return idlest;
6176
-}
6141
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
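To make the spare-capacity cross-over in the removed code concrete (hypothetical numbers): for a waking task with task_util(p) = 400, a group could win on spare capacity alone once most_spare > 400 / 2 = 200, and the local group kept the task when this_spare > 200 and imbalance_scale * this_spare > 100 * most_spare; forks skipped this path entirely and always fell back to the load comparison because a new task has no utilization yet.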
61776142
61786143 /*
61796144 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6194,6 +6159,9 @@
61946159
61956160 /* Traverse only the allowed CPUs */
61966161 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
6162
+ if (sched_idle_cpu(i))
6163
+ return i;
6164
+
61976165 if (available_idle_cpu(i)) {
61986166 struct rq *rq = cpu_rq(i);
61996167 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6217,7 +6185,7 @@
62176185 shallowest_idle_cpu = i;
62186186 }
62196187 } else if (shallowest_idle_cpu == -1) {
6220
- load = weighted_cpuload(cpu_rq(i));
6188
+ load = cpu_load(cpu_rq(i));
62216189 if (load < min_load) {
62226190 min_load = load;
62236191 least_loaded_cpu = i;
....@@ -6237,7 +6205,7 @@
62376205 return prev_cpu;
62386206
62396207 /*
6240
- * We need task's util for capacity_spare_without, sync it up to
6208
+ * We need task's util for cpu_util_without, sync it up to
62416209 * prev_cpu's last_update_time.
62426210 */
62436211 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6253,7 +6221,7 @@
62536221 continue;
62546222 }
62556223
6256
- group = find_idlest_group(sd, p, cpu, sd_flag);
6224
+ group = find_idlest_group(sd, p, cpu);
62576225 if (!group) {
62586226 sd = sd->child;
62596227 continue;
....@@ -6356,10 +6324,12 @@
63566324 bool idle = true;
63576325
63586326 for_each_cpu(cpu, cpu_smt_mask(core)) {
6359
- cpumask_clear_cpu(cpu, cpus);
6360
- if (!available_idle_cpu(cpu))
6327
+ if (!available_idle_cpu(cpu)) {
63616328 idle = false;
6329
+ break;
6330
+ }
63626331 }
6332
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63636333
63646334 if (idle)
63656335 return core;
....@@ -6384,9 +6354,10 @@
63846354 return -1;
63856355
63866356 for_each_cpu(cpu, cpu_smt_mask(target)) {
6387
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6357
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6358
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63886359 continue;
6389
- if (available_idle_cpu(cpu))
6360
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63906361 return cpu;
63916362 }
63926363
....@@ -6417,8 +6388,8 @@
64176388 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64186389 struct sched_domain *this_sd;
64196390 u64 avg_cost, avg_idle;
6420
- u64 time, cost;
6421
- s64 delta;
6391
+ u64 time;
6392
+ int this = smp_processor_id();
64226393 int cpu, nr = INT_MAX;
64236394
64246395 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6443,23 +6414,68 @@
64436414 nr = 4;
64446415 }
64456416
6446
- time = local_clock();
6417
+ time = cpu_clock(this);
64476418
64486419 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64496420
64506421 for_each_cpu_wrap(cpu, cpus, target) {
64516422 if (!--nr)
64526423 return -1;
6453
- if (available_idle_cpu(cpu))
6424
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64546425 break;
64556426 }
64566427
6457
- time = local_clock() - time;
6458
- cost = this_sd->avg_scan_cost;
6459
- delta = (s64)(time - cost) / 8;
6460
- this_sd->avg_scan_cost += delta;
6428
+ time = cpu_clock(this) - time;
6429
+ update_avg(&this_sd->avg_scan_cost, time);
64616430
64626431 return cpu;
6432
+}
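update_avg() keeps the same 1/8-weight running average that the removed lines computed by hand; assuming its usual definition elsewhere in the scheduler code, it amounts to:

        /* Running average with a 1/8 weight for the newest scan-cost sample. */
        static inline void scan_cost_avg_example(u64 *avg, u64 sample)
        {
                s64 diff = (s64)(sample - *avg);

                *avg += diff / 8;
        }

Starting from avg = 800ns, a 1600ns scan moves the average to 800 + (1600 - 800) / 8 = 900ns.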
6433
+
6434
+/*
6435
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6436
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6437
+ * maximize capacity.
6438
+ */
6439
+static int
6440
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6441
+{
6442
+ unsigned long task_util, util_min, util_max, best_cap = 0;
6443
+ int cpu, best_cpu = -1;
6444
+ struct cpumask *cpus;
6445
+
6446
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6447
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6448
+
6449
+ task_util = task_util_est(p);
6450
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
6451
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
6452
+
6453
+ for_each_cpu_wrap(cpu, cpus, target) {
6454
+ unsigned long cpu_cap = capacity_of(cpu);
6455
+
6456
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6457
+ continue;
6458
+ if (util_fits_cpu(task_util, util_min, util_max, cpu))
6459
+ return cpu;
6460
+
6461
+ if (cpu_cap > best_cap) {
6462
+ best_cap = cpu_cap;
6463
+ best_cpu = cpu;
6464
+ }
6465
+ }
6466
+
6467
+ return best_cpu;
6468
+}
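For example, with hypothetical capacities of 430 for the LITTLE CPUs and 1024 for the big CPUs: a task whose clamped utilization needs around 600 skips every idle LITTLE because util_fits_cpu() fails there, returns the first idle big CPU that fits, and only when nothing fits falls back to best_cpu, the idle CPU with the largest capacity seen during the scan.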
6469
+
6470
+static inline bool asym_fits_cpu(unsigned long util,
6471
+ unsigned long util_min,
6472
+ unsigned long util_max,
6473
+ int cpu)
6474
+{
6475
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6476
+ return util_fits_cpu(util, util_min, util_max, cpu);
6477
+
6478
+ return true;
64636479 }
64646480
64656481 /*
....@@ -6468,24 +6484,56 @@
64686484 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64696485 {
64706486 struct sched_domain *sd;
6487
+ unsigned long task_util, util_min, util_max;
64716488 int i, recent_used_cpu;
64726489
6473
- if (available_idle_cpu(target))
6490
+ /*
6491
+ * On asymmetric system, update task utilization because we will check
6492
+ * that the task fits with cpu's capacity.
6493
+ */
6494
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6495
+ sync_entity_load_avg(&p->se);
6496
+ task_util = task_util_est(p);
6497
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
6498
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
6499
+ }
6500
+
6501
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6502
+ asym_fits_cpu(task_util, util_min, util_max, target))
64746503 return target;
64756504
64766505 /*
64776506 * If the previous CPU is cache affine and idle, don't be stupid:
64786507 */
6479
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6508
+ if (prev != target && cpus_share_cache(prev, target) &&
6509
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6510
+ asym_fits_cpu(task_util, util_min, util_max, prev))
64806511 return prev;
6512
+
6513
+ /*
6514
+ * Allow a per-cpu kthread to stack with the wakee if the
6515
+ * kworker thread and the task's previous CPU are the same.
6516
+ * The assumption is that the wakee queued work for the
6517
+ * per-cpu kthread that is now complete and the wakeup is
6518
+ * essentially a sync wakeup. An obvious example of this
6519
+ * pattern is IO completions.
6520
+ */
6521
+ if (is_per_cpu_kthread(current) &&
6522
+ in_task() &&
6523
+ prev == smp_processor_id() &&
6524
+ this_rq()->nr_running <= 1 &&
6525
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
6526
+ return prev;
6527
+ }
64816528
64826529 /* Check a recently used CPU as a potential idle candidate: */
64836530 recent_used_cpu = p->recent_used_cpu;
64846531 if (recent_used_cpu != prev &&
64856532 recent_used_cpu != target &&
64866533 cpus_share_cache(recent_used_cpu, target) &&
6487
- available_idle_cpu(recent_used_cpu) &&
6488
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6534
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6535
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6536
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
64896537 /*
64906538 * Replace recent_used_cpu with prev as it is a potential
64916539 * candidate for the next wake:
....@@ -6494,6 +6542,32 @@
64946542 return recent_used_cpu;
64956543 }
64966544
6545
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6546
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6547
+ goto sd_llc;
6548
+ }
6549
+
6550
+ /*
6551
+ * For asymmetric CPU capacity systems, our domain of interest is
6552
+ * sd_asym_cpucapacity rather than sd_llc.
6553
+ */
6554
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6555
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6556
+ /*
6557
+ * On an asymmetric CPU capacity system where an exclusive
6558
+ * cpuset defines a symmetric island (i.e. one unique
6559
+ * capacity_orig value through the cpuset), the key will be set
6560
+ * but the CPUs within that cpuset will not have a domain with
6561
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6562
+ * capacity path.
6563
+ */
6564
+ if (sd) {
6565
+ i = select_idle_capacity(p, sd, target);
6566
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6567
+ }
6568
+ }
6569
+
6570
+sd_llc:
64976571 sd = rcu_dereference(per_cpu(sd_llc, target));
64986572 if (!sd)
64996573 return target;
....@@ -6591,7 +6665,7 @@
65916665 util = READ_ONCE(cfs_rq->avg.util_avg);
65926666
65936667 /* Discount task's util from CPU's util */
6594
- util -= min_t(unsigned int, util, task_util(p));
6668
+ lsub_positive(&util, task_util(p));
65956669
65966670 /*
65976671 * Covered cases:
....@@ -6640,10 +6714,9 @@
66406714 * properly fix the execl regression and it helps in further
66416715 * reducing the chances for the above race.
66426716 */
6643
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6644
- estimated -= min_t(unsigned int, estimated,
6645
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6646
- }
6717
+ if (unlikely(task_on_rq_queued(p) || current == p))
6718
+ lsub_positive(&estimated, _task_util_est(p));
6719
+
66476720 util = max(util, estimated);
66486721 }
66496722
....@@ -6653,350 +6726,6 @@
66536726 * the cpu_util call.
66546727 */
66556728 return min_t(unsigned long, util, capacity_orig_of(cpu));
6656
-}
6657
-
6658
-/*
6659
- * Returns the current capacity of cpu after applying both
6660
- * cpu and freq scaling.
6661
- */
6662
-unsigned long capacity_curr_of(int cpu)
6663
-{
6664
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6665
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6666
-
6667
- return cap_scale(max_cap, scale_freq);
6668
-}
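cap_scale() is the usual (value * scale) >> SCHED_CAPACITY_SHIFT scaling, so with a hypothetical cpu_capacity_orig of 1024 and the CPU running at 60% of its maximum frequency (arch_scale_freq_capacity() of about 614), the removed helper reported a current capacity of roughly (1024 * 614) >> 10 = 614.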
6669
-
6670
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6671
- struct task_struct *p)
6672
-{
6673
- unsigned long min_util = uclamp_task(p);
6674
- unsigned long target_capacity = ULONG_MAX;
6675
- unsigned long min_wake_util = ULONG_MAX;
6676
- unsigned long target_max_spare_cap = 0;
6677
- unsigned long target_util = ULONG_MAX;
6678
- /* Initialise with deepest possible cstate (INT_MAX) */
6679
- int shallowest_idle_cstate = INT_MAX;
6680
- struct sched_group *sg;
6681
- int best_active_cpu = -1;
6682
- int best_idle_cpu = -1;
6683
- int target_cpu = -1;
6684
- int backup_cpu = -1;
6685
- bool prefer_idle;
6686
- bool boosted;
6687
- int i;
6688
-
6689
- /*
6690
- * In most cases, target_capacity tracks capacity_orig of the most
6691
- * energy efficient CPU candidate, thus requiring to minimise
6692
- * target_capacity. For these cases target_capacity is already
6693
- * initialized to ULONG_MAX.
6694
- * However, for prefer_idle and boosted tasks we look for a high
6695
- * performance CPU, thus requiring to maximise target_capacity. In this
6696
- * case we initialise target_capacity to 0.
6697
- */
6698
- prefer_idle = uclamp_latency_sensitive(p);
6699
- boosted = uclamp_boosted(p);
6700
- if (prefer_idle && boosted)
6701
- target_capacity = 0;
6702
-
6703
- /* Scan CPUs in all SDs */
6704
- sg = sd->groups;
6705
- do {
6706
- for_each_cpu_and(i, p->cpus_ptr, sched_group_span(sg)) {
6707
- unsigned long capacity_curr = capacity_curr_of(i);
6708
- unsigned long capacity_orig = capacity_orig_of(i);
6709
- unsigned long wake_util, new_util;
6710
- long spare_cap;
6711
- int idle_idx = INT_MAX;
6712
-
6713
- if (!cpu_online(i))
6714
- continue;
6715
-
6716
- /*
6717
- * p's blocked utilization is still accounted for on prev_cpu
6718
- * so prev_cpu will receive a negative bias due to the double
6719
- * accounting. However, the blocked utilization may be zero.
6720
- */
6721
- wake_util = cpu_util_without(i, p);
6722
- new_util = wake_util + task_util_est(p);
6723
-
6724
- /*
6725
- * Ensure minimum capacity to grant the required boost.
6726
- * The target CPU can be already at a capacity level higher
6727
- * than the one required to boost the task.
6728
- */
6729
- new_util = max(min_util, new_util);
6730
- if (new_util > capacity_orig)
6731
- continue;
6732
-
6733
- /*
6734
- * Pre-compute the maximum possible capacity we expect
6735
- * to have available on this CPU once the task is
6736
- * enqueued here.
6737
- */
6738
- spare_cap = capacity_orig - new_util;
6739
-
6740
- if (idle_cpu(i))
6741
- idle_idx = idle_get_state_idx(cpu_rq(i));
6742
-
6743
-
6744
- /*
6745
- * Case A) Latency sensitive tasks
6746
- *
6747
- * Unconditionally favoring tasks that prefer idle CPU to
6748
- * improve latency.
6749
- *
6750
- * Looking for:
6751
- * - an idle CPU, whatever its idle_state is, since
6752
- * the first CPUs we explore are more likely to be
6753
- * reserved for latency sensitive tasks.
6754
- * - a non idle CPU where the task fits in its current
6755
- * capacity and has the maximum spare capacity.
6756
- * - a non idle CPU with lower contention from other
6757
- * tasks and running at the lowest possible OPP.
6758
- *
6759
- * The last two goals try to favor a non idle CPU
6760
- * where the task can run as if it is "almost alone".
6761
- * A maximum spare capacity CPU is favoured since
6762
- * the task already fits into that CPU's capacity
6763
- * without waiting for an OPP chance.
6764
- *
6765
- * The following code path is the only one in the CPUs
6766
- * exploration loop which is always used by
6767
- * prefer_idle tasks. It exits the loop with either a
6768
- * best_active_cpu or a target_cpu which should
6769
- * represent an optimal choice for latency sensitive
6770
- * tasks.
6771
- */
6772
- if (prefer_idle) {
6773
-
6774
- /*
6775
- * Case A.1: IDLE CPU
6776
- * Return the best IDLE CPU we find:
6777
- * - for boosted tasks: the CPU with the highest
6778
- * performance (i.e. biggest capacity_orig)
6779
- * - for !boosted tasks: the most energy
6780
- * efficient CPU (i.e. smallest capacity_orig)
6781
- */
6782
- if (idle_cpu(i)) {
6783
- if (boosted &&
6784
- capacity_orig < target_capacity)
6785
- continue;
6786
- if (!boosted &&
6787
- capacity_orig > target_capacity)
6788
- continue;
6789
- /*
6790
- * Minimise value of idle state: skip
6791
- * deeper idle states and pick the
6792
- * shallowest.
6793
- */
6794
- if (capacity_orig == target_capacity &&
6795
- sysctl_sched_cstate_aware &&
6796
- idle_idx >= shallowest_idle_cstate)
6797
- continue;
6798
-
6799
- target_capacity = capacity_orig;
6800
- shallowest_idle_cstate = idle_idx;
6801
- best_idle_cpu = i;
6802
- continue;
6803
- }
6804
- if (best_idle_cpu != -1)
6805
- continue;
6806
-
6807
- /*
6808
- * Case A.2: Target ACTIVE CPU
6809
- * Favor CPUs with max spare capacity.
6810
- */
6811
- if (capacity_curr > new_util &&
6812
- spare_cap > target_max_spare_cap) {
6813
- target_max_spare_cap = spare_cap;
6814
- target_cpu = i;
6815
- continue;
6816
- }
6817
- if (target_cpu != -1)
6818
- continue;
6819
-
6820
-
6821
- /*
6822
- * Case A.3: Backup ACTIVE CPU
6823
- * Favor CPUs with:
6824
- * - lower utilization due to other tasks
6825
- * - lower utilization with the task in
6826
- */
6827
- if (wake_util > min_wake_util)
6828
- continue;
6829
- min_wake_util = wake_util;
6830
- best_active_cpu = i;
6831
- continue;
6832
- }
6833
-
6834
- /*
6835
- * Enforce EAS mode
6836
- *
6837
- * For non latency sensitive tasks, skip CPUs that
6838
- * will be overutilized by moving the task there.
6839
- *
6840
- * The goal here is to remain in EAS mode as long as
6841
- * possible at least for !prefer_idle tasks.
6842
- */
6843
- if ((new_util * capacity_margin) >
6844
- (capacity_orig * SCHED_CAPACITY_SCALE))
6845
- continue;
6846
-
6847
- /*
6848
- * Favor CPUs with smaller capacity for non latency
6849
- * sensitive tasks.
6850
- */
6851
- if (capacity_orig > target_capacity)
6852
- continue;
6853
-
6854
- /*
6855
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6856
- *
6857
- * Find an optimal backup IDLE CPU for non latency
6858
- * sensitive tasks.
6859
- *
6860
- * Looking for:
6861
- * - minimizing the capacity_orig,
6862
- * i.e. preferring LITTLE CPUs
6863
- * - favoring shallowest idle states
6864
- * i.e. avoid to wakeup deep-idle CPUs
6865
- *
6866
- * The following code path is used by non latency
6867
- * sensitive tasks if IDLE CPUs are available. If at
6868
- * least one of such CPUs are available it sets the
6869
- * best_idle_cpu to the most suitable idle CPU to be
6870
- * selected.
6871
- *
6872
- * If idle CPUs are available, favour these CPUs to
6873
- * improve performances by spreading tasks.
6874
- * Indeed, the energy_diff() computed by the caller
6875
- * will take care to ensure the minimization of energy
6876
- * consumptions without affecting performance.
6877
- */
6878
- if (idle_cpu(i)) {
6879
- /*
6880
- * Skip CPUs in deeper idle state, but only
6881
- * if they are also less energy efficient.
6882
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6883
- * shallow idle big CPU.
6884
- */
6885
- if (capacity_orig == target_capacity &&
6886
- sysctl_sched_cstate_aware &&
6887
- idle_idx >= shallowest_idle_cstate)
6888
- continue;
6889
-
6890
- target_capacity = capacity_orig;
6891
- shallowest_idle_cstate = idle_idx;
6892
- best_idle_cpu = i;
6893
- continue;
6894
- }
6895
-
6896
- /*
6897
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6898
- *
6899
- * Pack tasks in the most energy efficient capacities.
6900
- *
6901
- * This task packing strategy prefers more energy
6902
- * efficient CPUs (i.e. pack on smaller maximum
6903
- * capacity CPUs) while also trying to spread tasks to
6904
- * run them all at the lower OPP.
6905
- *
6906
- * This assumes for example that it's more energy
6907
- * efficient to run two tasks on two CPUs at a lower
6908
- * OPP than packing both on a single CPU but running
6909
- * that CPU at an higher OPP.
6910
- *
6911
- * Thus, this case keep track of the CPU with the
6912
- * smallest maximum capacity and highest spare maximum
6913
- * capacity.
6914
- */
6915
-
6916
- /* Favor CPUs with maximum spare capacity */
6917
- if (capacity_orig == target_capacity &&
6918
- spare_cap < target_max_spare_cap)
6919
- continue;
6920
-
6921
- target_max_spare_cap = spare_cap;
6922
- target_capacity = capacity_orig;
6923
- target_util = new_util;
6924
- target_cpu = i;
6925
- }
6926
-
6927
- } while (sg = sg->next, sg != sd->groups);
6928
-
6929
- /*
6930
- * For non latency sensitive tasks, cases B and C in the previous loop,
6931
- * we pick the best IDLE CPU only if we were not able to find a target
6932
- * ACTIVE CPU.
6933
- *
6934
- * Policies priorities:
6935
- *
6936
- * - prefer_idle tasks:
6937
- *
6938
- * a) IDLE CPU available: best_idle_cpu
6939
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6940
- * capacity (i.e. target_cpu)
6941
- * c) ACTIVE CPU with less contention due to other tasks
6942
- * (i.e. best_active_cpu)
6943
- *
6944
- * - NON prefer_idle tasks:
6945
- *
6946
- * a) ACTIVE CPU: target_cpu
6947
- * b) IDLE CPU: best_idle_cpu
6948
- */
6949
-
6950
- if (prefer_idle && (best_idle_cpu != -1)) {
6951
- target_cpu = best_idle_cpu;
6952
- goto target;
6953
- }
6954
-
6955
- if (target_cpu == -1)
6956
- target_cpu = prefer_idle
6957
- ? best_active_cpu
6958
- : best_idle_cpu;
6959
- else
6960
- backup_cpu = prefer_idle
6961
- ? best_active_cpu
6962
- : best_idle_cpu;
6963
-
6964
- if (backup_cpu >= 0)
6965
- cpumask_set_cpu(backup_cpu, cpus);
6966
- if (target_cpu >= 0) {
6967
-target:
6968
- cpumask_set_cpu(target_cpu, cpus);
6969
- }
6970
-
6971
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6972
- best_active_cpu, target_cpu, backup_cpu);
6973
-}
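One way to read the "Enforce EAS mode" filter in the removed loop: with capacity_margin = 1280, the test new_util * 1280 > capacity_orig * 1024 rejects a CPU once the task would push its utilization above capacity_orig * 1024 / 1280, i.e. above 80% of the CPU's original capacity (819 out of 1024 on the largest CPUs).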
6974
-
6975
-/*
6976
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6977
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6978
- *
6979
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6980
- * BALANCE_WAKE sort things out.
6981
- */
6982
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6983
-{
6984
- long min_cap, max_cap;
6985
-
6986
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6987
- return 0;
6988
-
6989
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6990
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6991
-
6992
- /* Minimum capacity is close to max, no need to abort wake_affine */
6993
- if (max_cap - min_cap < max_cap >> 3)
6994
- return 0;
6995
-
6996
- /* Bring task utilization in sync with prev_cpu */
6997
- sync_entity_load_avg(&p->se);
6998
-
6999
- return !task_fits_capacity(p, min_cap);
70006729 }
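A worked case for the removed capacity check, with hypothetical capacities: for max_cap = 1024 the margin max_cap >> 3 is 128, so a 1024/930 pairing (difference 94 < 128) is treated as close enough to keep WAKE_AFFINE, while a 1024/430 pairing (difference 594) goes on to the task_fits_capacity() test.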
70016730
70026731 /*
....@@ -7038,154 +6767,61 @@
70386767 }
70396768
70406769 /*
7041
- * compute_energy(): Estimates the energy that would be consumed if @p was
6770
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70426771 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7043
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6772
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70446773 * to compute what would be the energy if we decided to actually migrate that
70456774 * task.
70466775 */
70476776 static long
70486777 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70496778 {
7050
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7051
- unsigned long sum_util, energy = 0;
7052
- struct task_struct *tsk;
6779
+ struct cpumask *pd_mask = perf_domain_span(pd);
6780
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6781
+ unsigned long max_util = 0, sum_util = 0;
6782
+ unsigned long energy = 0;
70536783 int cpu;
70546784
7055
- for (; pd; pd = pd->next) {
7056
- struct cpumask *pd_mask = perf_domain_span(pd);
6785
+ /*
6786
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6787
+ * of another rd if they belong to the same pd. So, account for the
6788
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6789
+ * instead of the rd span.
6790
+ *
6791
+ * If an entire pd is outside of the current rd, it will not appear in
6792
+ * its pd list and will not be accounted by compute_energy().
6793
+ */
6794
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6795
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6796
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70576797
70586798 /*
7059
- * The energy model mandates all the CPUs of a performance
7060
- * domain have the same capacity.
6799
+ * Busy time computation: utilization clamping is not
6800
+ * required since the ratio (sum_util / cpu_capacity)
6801
+ * is already enough to scale the EM reported power
6802
+ * consumption at the (eventually clamped) cpu_capacity.
70616803 */
7062
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7063
- max_util = sum_util = 0;
6804
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6805
+ ENERGY_UTIL, NULL);
70646806
70656807 /*
7066
- * The capacity state of CPUs of the current rd can be driven by
7067
- * CPUs of another rd if they belong to the same performance
7068
- * domain. So, account for the utilization of these CPUs too
7069
- * by masking pd with cpu_online_mask instead of the rd span.
7070
- *
7071
- * If an entire performance domain is outside of the current rd,
7072
- * it will not appear in its pd list and will not be accounted
7073
- * by compute_energy().
6808
+ * Performance domain frequency: utilization clamping
6809
+ * must be considered since it affects the selection
6810
+ * of the performance domain frequency.
6811
+ * NOTE: in case RT tasks are running, by default the
6812
+ * FREQUENCY_UTIL's utilization can be max OPP.
70746813 */
7075
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7076
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7077
-
7078
- /*
7079
- * Busy time computation: utilization clamping is not
7080
- * required since the ratio (sum_util / cpu_capacity)
7081
- * is already enough to scale the EM reported power
7082
- * consumption at the (eventually clamped) cpu_capacity.
7083
- */
7084
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7085
- ENERGY_UTIL, NULL);
7086
-
7087
- /*
7088
- * Performance domain frequency: utilization clamping
7089
- * must be considered since it affects the selection
7090
- * of the performance domain frequency.
7091
- * NOTE: in case RT tasks are running, by default the
7092
- * FREQUENCY_UTIL's utilization can be max OPP.
7093
- */
7094
- tsk = cpu == dst_cpu ? p : NULL;
7095
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7096
- FREQUENCY_UTIL, tsk);
7097
- max_util = max(max_util, cpu_util);
7098
- }
7099
-
7100
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6814
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6815
+ FREQUENCY_UTIL, tsk);
6816
+ max_util = max(max_util, cpu_util);
71016817 }
6818
+
6819
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6820
+ if (!energy)
6821
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71026822
71036823 return energy;
71046824 }
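The energy figure returned here comes from the Energy Model: choose the lowest performance state of the domain able to serve max_util, then charge the domain's busy time sum_util at that state's cost. A simplified sketch of that model (the structure and helper below are illustrative, not the real em_cpu_energy() implementation):

        struct ps_example {
                unsigned long capacity; /* capacity delivered at this state */
                unsigned long cost;     /* power * max_freq / freq          */
        };

        /* States are assumed sorted by increasing capacity. */
        static unsigned long pd_energy_example(const struct ps_example *ps, int nr,
                                               unsigned long max_util,
                                               unsigned long sum_util,
                                               unsigned long scale_cpu)
        {
                int i;

                for (i = 0; i < nr - 1; i++) {
                        if (ps[i].capacity >= max_util)
                                break;
                }
                return ps[i].cost * sum_util / scale_cpu;
        }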
7105
-
7106
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7107
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7108
-{
7109
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7110
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7111
- bool prefer_idle = uclamp_latency_sensitive(p);
7112
- bool boosted = uclamp_boosted(p);
7113
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7114
- unsigned long highest_spare_cap = 0;
7115
- unsigned int min_exit_lat = UINT_MAX;
7116
- int cpu, max_spare_cap_cpu;
7117
- struct cpuidle_state *idle;
7118
-
7119
- for (; pd; pd = pd->next) {
7120
- max_spare_cap_cpu = -1;
7121
- max_spare_cap = 0;
7122
-
7123
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7124
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
7125
- continue;
7126
-
7127
- util = cpu_util_next(cpu, p, cpu);
7128
- cpu_cap = capacity_of(cpu);
7129
- spare_cap = cpu_cap - util;
7130
-
7131
- /*
7132
- * Skip CPUs that cannot satisfy the capacity request.
7133
- * IOW, placing the task there would make the CPU
7134
- * overutilized. Take uclamp into account to see how
7135
- * much capacity we can get out of the CPU; this is
7136
- * aligned with schedutil_cpu_util().
7137
- */
7138
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7139
- if (cpu_cap * 1024 < util * capacity_margin)
7140
- continue;
7141
-
7142
- /*
7143
- * Find the CPU with the maximum spare capacity in
7144
- * the performance domain
7145
- */
7146
- if (spare_cap > max_spare_cap) {
7147
- max_spare_cap = spare_cap;
7148
- max_spare_cap_cpu = cpu;
7149
- }
7150
-
7151
- if (!prefer_idle)
7152
- continue;
7153
-
7154
- if (idle_cpu(cpu)) {
7155
- cpu_cap = capacity_orig_of(cpu);
7156
- if (boosted && cpu_cap < target_cap)
7157
- continue;
7158
- if (!boosted && cpu_cap > target_cap)
7159
- continue;
7160
- idle = idle_get_state(cpu_rq(cpu));
7161
- if (idle && idle->exit_latency > min_exit_lat &&
7162
- cpu_cap == target_cap)
7163
- continue;
7164
-
7165
- if (idle)
7166
- min_exit_lat = idle->exit_latency;
7167
- target_cap = cpu_cap;
7168
- best_idle_cpu = cpu;
7169
- } else if (spare_cap > highest_spare_cap) {
7170
- highest_spare_cap = spare_cap;
7171
- highest_spare_cap_cpu = cpu;
7172
- }
7173
- }
7174
-
7175
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7176
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7177
- }
7178
-
7179
- if (!prefer_idle)
7180
- return;
7181
-
7182
- if (best_idle_cpu >= 0)
7183
- cpumask_set_cpu(best_idle_cpu, cpus);
7184
- else
7185
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7186
-}
7187
-
7188
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71896825
71906826 /*
71916827 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
....@@ -7226,27 +6862,41 @@
72266862 * other use-cases too. So, until someone finds a better way to solve this,
72276863 * let's keep things simple by re-using the existing slow path.
72286864 */
7229
-
72306865 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72316866 {
7232
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6867
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6868
+ unsigned long best_delta2 = ULONG_MAX;
6869
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
6870
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
72336871 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7234
- int weight, cpu, best_energy_cpu = prev_cpu;
7235
- unsigned long cur_energy;
7236
- struct perf_domain *pd;
6872
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6873
+ unsigned long max_spare_cap_ls = 0, target_cap;
6874
+ unsigned long cpu_cap, util, base_energy = 0;
6875
+ bool boosted, latency_sensitive = false;
6876
+ unsigned int min_exit_lat = UINT_MAX;
6877
+ int cpu, best_energy_cpu = prev_cpu;
6878
+ struct cpuidle_state *idle;
72376879 struct sched_domain *sd;
7238
- cpumask_t *candidates;
6880
+ struct perf_domain *pd;
6881
+ int new_cpu = INT_MAX;
72396882
7240
- if (sysctl_sched_sync_hint_enable && sync) {
7241
- cpu = smp_processor_id();
7242
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
7243
- return cpu;
7244
- }
6883
+ sync_entity_load_avg(&p->se);
6884
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6885
+ if (new_cpu != INT_MAX)
6886
+ return new_cpu;
72456887
72466888 rcu_read_lock();
72476889 pd = rcu_dereference(rd->pd);
72486890 if (!pd || READ_ONCE(rd->overutilized))
72496891 goto fail;
6892
+
6893
+ cpu = smp_processor_id();
6894
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6895
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6896
+ task_fits_cpu(p, cpu)) {
6897
+ rcu_read_unlock();
6898
+ return cpu;
6899
+ }
72506900
72516901 /*
72526902 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7258,59 +6908,169 @@
72586908 if (!sd)
72596909 goto fail;
72606910
7261
- sync_entity_load_avg(&p->se);
7262
- if (!task_util_est(p))
6911
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
72636912 goto unlock;
72646913
7265
- /* Pre-select a set of candidate CPUs. */
7266
- candidates = this_cpu_ptr(&energy_cpus);
7267
- cpumask_clear(candidates);
6914
+ latency_sensitive = uclamp_latency_sensitive(p);
6915
+ boosted = uclamp_boosted(p);
6916
+ target_cap = boosted ? 0 : ULONG_MAX;
72686917
7269
- if (sched_feat(FIND_BEST_TARGET))
7270
- find_best_target(sd, candidates, p);
7271
- else
7272
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6918
+ for (; pd; pd = pd->next) {
6919
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6920
+ unsigned long rq_util_min, rq_util_max;
6921
+ unsigned long util_min, util_max;
6922
+ unsigned long base_energy_pd;
6923
+ int max_spare_cap_cpu = -1;
72736924
7274
- /* Bail out if no candidate was found. */
7275
- weight = cpumask_weight(candidates);
7276
- if (!weight)
7277
- goto unlock;
6925
+ /* Compute the 'base' energy of the pd, without @p */
6926
+ base_energy_pd = compute_energy(p, -1, pd);
6927
+ base_energy += base_energy_pd;
72786928
7279
- /* If there is only one sensible candidate, select it now. */
7280
- cpu = cpumask_first(candidates);
7281
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7282
- (cpu == prev_cpu))) {
7283
- best_energy_cpu = cpu;
7284
- goto unlock;
7285
- }
6929
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6930
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6931
+ continue;
72866932
7287
- if (cpumask_test_cpu(prev_cpu, p->cpus_ptr))
7288
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7289
- else
7290
- prev_energy = best_energy = ULONG_MAX;
6933
+ util = cpu_util_next(cpu, p, cpu);
6934
+ cpu_cap = capacity_of(cpu);
6935
+ spare_cap = cpu_cap;
6936
+ lsub_positive(&spare_cap, util);
72916937
7292
- /* Select the best candidate energy-wise. */
7293
- for_each_cpu(cpu, candidates) {
7294
- if (cpu == prev_cpu)
7295
- continue;
7296
- cur_energy = compute_energy(p, cpu, pd);
7297
- if (cur_energy < best_energy) {
7298
- best_energy = cur_energy;
7299
- best_energy_cpu = cpu;
6938
+ /*
6939
+ * Skip CPUs that cannot satisfy the capacity request.
6940
+ * IOW, placing the task there would make the CPU
6941
+ * overutilized. Take uclamp into account to see how
6942
+ * much capacity we can get out of the CPU; this is
6943
+ * aligned with schedutil_cpu_util().
6944
+ */
6945
+ if (uclamp_is_used()) {
6946
+ if (uclamp_rq_is_idle(cpu_rq(cpu))) {
6947
+ util_min = p_util_min;
6948
+ util_max = p_util_max;
6949
+ } else {
6950
+ /*
6951
+ * Open code uclamp_rq_util_with() except for
6952
+ * the clamp() part. Ie: apply max aggregation
6953
+ * only. util_fits_cpu() logic requires to
6954
+ * operate on non clamped util but must use the
6955
+ * max-aggregated uclamp_{min, max}.
6956
+ */
6957
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6958
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6959
+
6960
+ util_min = max(rq_util_min, p_util_min);
6961
+ util_max = max(rq_util_max, p_util_max);
6962
+ }
6963
+ }
6964
+ if (!util_fits_cpu(util, util_min, util_max, cpu))
6965
+ continue;
6966
+
6967
+ /* Always use prev_cpu as a candidate. */
6968
+ if (!latency_sensitive && cpu == prev_cpu) {
6969
+ prev_delta = compute_energy(p, prev_cpu, pd);
6970
+ prev_delta -= base_energy_pd;
6971
+ best_delta = min(best_delta, prev_delta);
6972
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6973
+ if (prev_delta == best_delta)
6974
+ best_energy_cpu = prev_cpu;
6975
+ }
6976
+ }
6977
+
6978
+ /*
6979
+ * Find the CPU with the maximum spare capacity in
6980
+ * the performance domain
6981
+ */
6982
+ if (spare_cap > max_spare_cap) {
6983
+ max_spare_cap = spare_cap;
6984
+ max_spare_cap_cpu = cpu;
6985
+ }
6986
+
6987
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6988
+ if (!latency_sensitive)
6989
+ continue;
6990
+ }
6991
+
6992
+ if (idle_cpu(cpu)) {
6993
+ cpu_cap = capacity_orig_of(cpu);
6994
+ if (boosted && cpu_cap < target_cap)
6995
+ continue;
6996
+ if (!boosted && cpu_cap > target_cap)
6997
+ continue;
6998
+ idle = idle_get_state(cpu_rq(cpu));
6999
+ if (idle && idle->exit_latency > min_exit_lat &&
7000
+ cpu_cap == target_cap)
7001
+ continue;
7002
+
7003
+ if (idle)
7004
+ min_exit_lat = idle->exit_latency;
7005
+ target_cap = cpu_cap;
7006
+ best_idle_cpu = cpu;
7007
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7008
+ best_delta2 = compute_energy(p, cpu, pd);
7009
+ best_delta2 -= base_energy_pd;
7010
+ }
7011
+ } else if (spare_cap > max_spare_cap_ls) {
7012
+ max_spare_cap_ls = spare_cap;
7013
+ max_spare_cap_cpu_ls = cpu;
7014
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7015
+ if (best_idle_cpu == -1) {
7016
+ best_delta2 = compute_energy(p, cpu, pd);
7017
+ best_delta2 -= base_energy_pd;
7018
+ }
7019
+ }
7020
+ }
7021
+ }
7022
+
7023
+ /* Evaluate the energy impact of using this CPU. */
7024
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
7025
+ max_spare_cap_cpu != prev_cpu) {
7026
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
7027
+ cur_delta -= base_energy_pd;
7028
+ if (cur_delta < best_delta) {
7029
+ best_delta = cur_delta;
7030
+ best_energy_cpu = max_spare_cap_cpu;
7031
+ }
73007032 }
73017033 }
73027034 unlock:
73037035 rcu_read_unlock();
73047036
7037
+ if (latency_sensitive)
7038
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
7039
+
73057040 /*
73067041 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73077042 * least 6% of the energy used by prev_cpu.
73087043 */
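The right shifts below are where those percentages come from: >> 4 divides by 16, so best_energy_cpu is only preferred when the estimated saving prev_delta - best_delta exceeds (prev_delta + base_energy) / 16, about 6.25% of the total, and the Rockchip-specific >> 5 check further down tolerates an increase of up to (prev_delta + base_energy) / 32, about 3.1%.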
7309
- if (prev_energy == ULONG_MAX)
7044
+ if (prev_delta == ULONG_MAX)
73107045 return best_energy_cpu;
73117046
7312
- if ((prev_energy - best_energy) > (prev_energy >> 4))
7047
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73137048 return best_energy_cpu;
7049
+
7050
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7051
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
7052
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
7053
+ int level = rockchip_perf_get_level();
7054
+
7055
+ /*
7056
+ * When ROCKCHIP_PERFORMANCE_LOW is selected:
7057
+ * pick best_energy_cpu if prev_cpu is a big CPU and best_energy_cpu
7058
+ * is a little CPU, so that tasks can migrate from big CPUs to little
7059
+ * CPUs more easily to save power.
7060
+ */
7061
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
7062
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
7063
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
7064
+ return best_energy_cpu;
7065
+ }
7066
+
7067
+ /*
7068
+ * Pick the idlest CPU if it only increases power slightly (< 3.1%).
7069
+ */
7070
+ if ((best_delta2 <= prev_delta) ||
7071
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
7072
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
7073
+ }
73147074
73157075 return prev_cpu;
73167076
....@@ -7333,39 +7093,44 @@
73337093 * preempt must be disabled.
73347094 */
73357095 static int
7336
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7337
- int sibling_count_hint)
7096
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73387097 {
73397098 struct sched_domain *tmp, *sd = NULL;
73407099 int cpu = smp_processor_id();
73417100 int new_cpu = prev_cpu;
73427101 int want_affine = 0;
73437102 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
7103
+ int target_cpu = -1;
7104
+
7105
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
7106
+ !(sd_flag & SD_BALANCE_FORK))
7107
+ sync_entity_load_avg(&p->se);
7108
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
7109
+ wake_flags, &target_cpu);
7110
+ if (target_cpu >= 0)
7111
+ return target_cpu;
73447112
73457113 if (sd_flag & SD_BALANCE_WAKE) {
73467114 record_wakee(p);
73477115
7348
- if (static_branch_unlikely(&sched_energy_present)) {
7349
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7350
- goto sd_loop;
7116
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7117
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
7118
+ goto no_eas;
7119
+ }
73517120
7121
+ if (sched_energy_enabled()) {
73527122 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73537123 if (new_cpu >= 0)
73547124 return new_cpu;
73557125 new_cpu = prev_cpu;
73567126 }
73577127
7358
- want_affine = !wake_wide(p, sibling_count_hint) &&
7359
- !wake_cap(p, cpu, prev_cpu) &&
7360
- cpumask_test_cpu(cpu, p->cpus_ptr);
7128
+no_eas:
7129
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73617130 }
73627131
7363
-sd_loop:
73647132 rcu_read_lock();
73657133 for_each_domain(cpu, tmp) {
7366
- if (!(tmp->flags & SD_LOAD_BALANCE))
7367
- break;
7368
-
73697134 /*
73707135 * If both 'cpu' and 'prev_cpu' are part of this domain,
73717136 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7392,6 +7157,23 @@
73927157 /* Fast path */
73937158
73947159 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
7160
+
7161
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7162
+ struct root_domain *rd = cpu_rq(cpu)->rd;
7163
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
7164
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
7165
+ int level = rockchip_perf_get_level();
7166
+
7167
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
7168
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
7169
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
7170
+ for_each_domain(cpu, tmp) {
7171
+ sd = tmp;
7172
+ }
7173
+ if (sd)
7174
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
7175
+ }
7176
+ }
73957177
73967178 if (want_affine)
73977179 current->recent_used_cpu = cpu;
....@@ -7459,15 +7241,21 @@
74597241 /* Tell new CPU we are migrated */
74607242 p->se.avg.last_update_time = 0;
74617243
7462
- /* We have migrated, no longer consider this task hot */
7463
- p->se.exec_start = 0;
7464
-
74657244 update_scan_period(p, new_cpu);
74667245 }
74677246
74687247 static void task_dead_fair(struct task_struct *p)
74697248 {
74707249 remove_entity_load_avg(&p->se);
7250
+}
7251
+
7252
+static int
7253
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7254
+{
7255
+ if (rq->nr_running)
7256
+ return 1;
7257
+
7258
+ return newidle_balance(rq, rf) != 0;
74717259 }
74727260 #endif /* CONFIG_SMP */
74737261
....@@ -7522,7 +7310,7 @@
75227310
75237311 static void set_last_buddy(struct sched_entity *se)
75247312 {
7525
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7313
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75267314 return;
75277315
75287316 for_each_sched_entity(se) {
....@@ -7534,7 +7322,7 @@
75347322
75357323 static void set_next_buddy(struct sched_entity *se)
75367324 {
7537
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7325
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75387326 return;
75397327
75407328 for_each_sched_entity(se) {
....@@ -7560,6 +7348,7 @@
75607348 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75617349 int scale = cfs_rq->nr_running >= sched_nr_latency;
75627350 int next_buddy_marked = 0;
7351
+ bool preempt = false, nopreempt = false;
75637352
75647353 if (unlikely(se == pse))
75657354 return;
....@@ -7592,8 +7381,8 @@
75927381 return;
75937382
75947383 /* Idle tasks are by definition preempted by non-idle tasks. */
7595
- if (unlikely(curr->policy == SCHED_IDLE) &&
7596
- likely(p->policy != SCHED_IDLE))
7384
+ if (unlikely(task_has_idle_policy(curr)) &&
7385
+ likely(!task_has_idle_policy(p)))
75977386 goto preempt;
75987387
75997388 /*
....@@ -7605,6 +7394,12 @@
76057394
76067395 find_matching_se(&se, &pse);
76077396 update_curr(cfs_rq_of(se));
7397
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7398
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7399
+ if (preempt)
7400
+ goto preempt;
7401
+ if (nopreempt)
7402
+ return;
76087403 BUG_ON(!pse);
76097404 if (wakeup_preempt_entity(se, pse) == 1) {
76107405 /*
....@@ -7619,7 +7414,7 @@
76197414 return;
76207415
76217416 preempt:
7622
- resched_curr_lazy(rq);
7417
+ resched_curr(rq);
76237418 /*
76247419 * Only set the backward buddy when the current task is still
76257420 * on the rq. This can happen when a wakeup gets interleaved
....@@ -7636,20 +7431,21 @@
76367431 set_last_buddy(se);
76377432 }
76387433
7639
-static struct task_struct *
7434
+struct task_struct *
76407435 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76417436 {
76427437 struct cfs_rq *cfs_rq = &rq->cfs;
7643
- struct sched_entity *se;
7644
- struct task_struct *p;
7438
+ struct sched_entity *se = NULL;
7439
+ struct task_struct *p = NULL;
76457440 int new_tasks;
7441
+ bool repick = false;
76467442
76477443 again:
7648
- if (!cfs_rq->nr_running)
7444
+ if (!sched_fair_runnable(rq))
76497445 goto idle;
76507446
76517447 #ifdef CONFIG_FAIR_GROUP_SCHED
7652
- if (prev->sched_class != &fair_sched_class)
7448
+ if (!prev || prev->sched_class != &fair_sched_class)
76537449 goto simple;
76547450
76557451 /*
....@@ -7696,7 +7492,7 @@
76967492 } while (cfs_rq);
76977493
76987494 p = task_of(se);
7699
-
7495
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
77007496 /*
77017497 * Since we haven't yet done put_prev_entity and if the selected task
77027498 * is a different task than we started out with, try and touch the
....@@ -7726,8 +7522,15 @@
77267522 goto done;
77277523 simple:
77287524 #endif
7525
+ if (prev)
7526
+ put_prev_task(rq, prev);
77297527
7730
- put_prev_task(rq, prev);
7528
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7529
+ if (repick) {
7530
+ for_each_sched_entity(se)
7531
+ set_next_entity(cfs_rq_of(se), se);
7532
+ goto done;
7533
+ }
77317534
77327535 do {
77337536 se = pick_next_entity(cfs_rq, NULL);
....@@ -7755,11 +7558,13 @@
77557558 return p;
77567559
77577560 idle:
7758
- update_misfit_status(NULL, rq);
7759
- new_tasks = idle_balance(rq, rf);
7561
+ if (!rf)
7562
+ return NULL;
7563
+
7564
+ new_tasks = newidle_balance(rq, rf);
77607565
77617566 /*
7762
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7567
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77637568 * possible for any higher priority task to appear. In that case we
77647569 * must re-start the pick_next_entity() loop.
77657570 */
....@@ -7776,6 +7581,11 @@
77767581 update_idle_rq_clock_pelt(rq);
77777582
77787583 return NULL;
7584
+}
7585
+
7586
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7587
+{
7588
+ return pick_next_task_fair(rq, NULL, NULL);
77797589 }
77807590
77817591 /*
....@@ -7828,7 +7638,7 @@
78287638 set_skip_buddy(se);
78297639 }
78307640
7831
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7641
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78327642 {
78337643 struct sched_entity *se = &p->se;
78347644
....@@ -7963,15 +7773,54 @@
79637773 * rewrite all of this once again.]
79647774 */
79657775
7966
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7776
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7777
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79677778
79687779 enum fbq_type { regular, remote, all };
79697780
7781
+/*
7782
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7783
+ *
7784
+ * The enum is ordered by pulling priority, with the group with lowest priority
7785
+ * first so the group_type can simply be compared when selecting the busiest
7786
+ * group. See update_sd_pick_busiest().
7787
+ */
79707788 enum group_type {
7971
- group_other = 0,
7789
+ /* The group has spare capacity that can be used to run more tasks. */
7790
+ group_has_spare = 0,
7791
+ /*
7792
+ * The group is fully used and the tasks don't compete for more CPU
7793
+ * cycles. Nevertheless, some tasks might wait before running.
7794
+ */
7795
+ group_fully_busy,
7796
+ /*
7797
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7798
+ * and must be migrated to a more powerful CPU.
7799
+ */
79727800 group_misfit_task,
7801
+ /*
7802
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7803
+ * and the task should be migrated to it instead of running on the
7804
+ * current CPU.
7805
+ */
7806
+ group_asym_packing,
7807
+ /*
7808
+ * The tasks' affinity constraints previously prevented the scheduler
7809
+ * from balancing the load across the system.
7810
+ */
79737811 group_imbalanced,
7974
- group_overloaded,
7812
+ /*
7813
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7814
+ * tasks.
7815
+ */
7816
+ group_overloaded
7817
+};
7818
+
7819
+enum migration_type {
7820
+ migrate_load = 0,
7821
+ migrate_util,
7822
+ migrate_task,
7823
+ migrate_misfit
79757824 };
79767825
79777826 #define LBF_ALL_PINNED 0x01
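A minimal standalone illustration of why the ordering documented above matters: the enum is re-declared locally in the same order, so "is this candidate busier than the current busiest?" becomes a plain integer comparison before any per-type tie-break (sketch only).

#include <stdio.h>

/* Same ordering as the enum introduced above: lowest pulling priority first. */
enum group_type {
	group_has_spare = 0,
	group_fully_busy,
	group_misfit_task,
	group_asym_packing,
	group_imbalanced,
	group_overloaded
};

/* Thanks to the ordering, the first-level busiest-group test is just '>'. */
static int busier_by_type(enum group_type candidate, enum group_type busiest)
{
	return candidate > busiest;
}

int main(void)
{
	printf("%d\n", busier_by_type(group_overloaded, group_fully_busy)); /* 1 */
	printf("%d\n", busier_by_type(group_has_spare, group_misfit_task)); /* 0 */
	return 0;
}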
....@@ -7994,7 +7843,6 @@
79947843 int new_dst_cpu;
79957844 enum cpu_idle_type idle;
79967845 long imbalance;
7997
- unsigned int src_grp_nr_running;
79987846 /* The set of CPUs under consideration for load-balancing */
79997847 struct cpumask *cpus;
80007848
....@@ -8005,8 +7853,9 @@
80057853 unsigned int loop_max;
80067854
80077855 enum fbq_type fbq_type;
8008
- enum group_type src_grp_type;
7856
+ enum migration_type migration_type;
80097857 struct list_head tasks;
7858
+ struct rq_flags *src_rq_rf;
80107859 };
80117860
80127861 /*
....@@ -8021,7 +7870,11 @@
80217870 if (p->sched_class != &fair_sched_class)
80227871 return 0;
80237872
8024
- if (unlikely(p->policy == SCHED_IDLE))
7873
+ if (unlikely(task_has_idle_policy(p)))
7874
+ return 0;
7875
+
7876
+ /* SMT siblings share cache */
7877
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80257878 return 0;
80267879
80277880 /*
....@@ -8109,8 +7962,13 @@
81097962 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81107963 {
81117964 int tsk_cache_hot;
7965
+ int can_migrate = 1;
81127966
81137967 lockdep_assert_held(&env->src_rq->lock);
7968
+
7969
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7970
+ if (!can_migrate)
7971
+ return 0;
81147972
81157973 /*
81167974 * We do not migrate tasks that are:
....@@ -8120,6 +7978,10 @@
81207978 * 4) are cache-hot on their current CPU.
81217979 */
81227980 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7981
+ return 0;
7982
+
7983
+ /* Disregard pcpu kthreads; they are where they need to be. */
7984
+ if (kthread_is_per_cpu(p))
81237985 return 0;
81247986
81257987 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
....@@ -8188,9 +8050,20 @@
81888050 */
81898051 static void detach_task(struct task_struct *p, struct lb_env *env)
81908052 {
8053
+ int detached = 0;
8054
+
81918055 lockdep_assert_held(&env->src_rq->lock);
81928056
8193
- p->on_rq = TASK_ON_RQ_MIGRATING;
8057
+ /*
8058
+ * The vendor hook may drop the lock temporarily, so
8059
+ * pass the rq flags to unpin lock. We expect the
8060
+ * rq lock to be held after return.
8061
+ */
8062
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
8063
+ env->dst_cpu, &detached);
8064
+ if (detached)
8065
+ return;
8066
+
81948067 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81958068 set_task_cpu(p, env->dst_cpu);
81968069 }
....@@ -8229,7 +8102,7 @@
82298102 static const unsigned int sched_nr_migrate_break = 32;
82308103
82318104 /*
8232
- * detach_tasks() -- tries to detach up to imbalance weighted load from
8105
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82338106 * busiest_rq, as part of a balancing operation within domain "sd".
82348107 *
82358108 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8237,8 +8110,8 @@
82378110 static int detach_tasks(struct lb_env *env)
82388111 {
82398112 struct list_head *tasks = &env->src_rq->cfs_tasks;
8113
+ unsigned long util, load;
82408114 struct task_struct *p;
8241
- unsigned long load;
82428115 int detached = 0;
82438116
82448117 lockdep_assert_held(&env->src_rq->lock);
....@@ -8268,39 +8141,64 @@
82688141 break;
82698142 }
82708143
8271
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8272
- if (sysctl_sched_performance_bias) {
8273
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8274
- goto next;
8275
- }
8276
-#endif
8277
-
82788144 if (!can_migrate_task(p, env))
82798145 goto next;
82808146
8281
- /*
8282
- * Depending of the number of CPUs and tasks and the
8283
- * cgroup hierarchy, task_h_load() can return a null
8284
- * value. Make sure that env->imbalance decreases
8285
- * otherwise detach_tasks() will stop only after
8286
- * detaching up to loop_max tasks.
8287
- */
8288
- load = max_t(unsigned long, task_h_load(p), 1);
8147
+ switch (env->migration_type) {
8148
+ case migrate_load:
8149
+ /*
8150
+ * Depending on the number of CPUs and tasks and the
8151
+ * cgroup hierarchy, task_h_load() can return a null
8152
+ * value. Make sure that env->imbalance decreases
8153
+ * otherwise detach_tasks() will stop only after
8154
+ * detaching up to loop_max tasks.
8155
+ */
8156
+ load = max_t(unsigned long, task_h_load(p), 1);
82898157
8158
+ if (sched_feat(LB_MIN) &&
8159
+ load < 16 && !env->sd->nr_balance_failed)
8160
+ goto next;
82908161
8291
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8292
- goto next;
8162
+ /*
8163
+ * Make sure that we don't migrate too much load.
8164
+ * Nevertheless, let's relax the constraint if
8165
+ * the scheduler fails to find a good waiting task to
8166
+ * migrate.
8167
+ */
8168
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
8169
+ goto next;
82938170
8294
- if ((load / 2) > env->imbalance)
8295
- goto next;
8171
+ env->imbalance -= load;
8172
+ break;
8173
+
8174
+ case migrate_util:
8175
+ util = task_util_est(p);
8176
+
8177
+ if (util > env->imbalance)
8178
+ goto next;
8179
+
8180
+ env->imbalance -= util;
8181
+ break;
8182
+
8183
+ case migrate_task:
8184
+ env->imbalance--;
8185
+ break;
8186
+
8187
+ case migrate_misfit:
8188
+ /* This is not a misfit task */
8189
+ if (task_fits_cpu(p, env->src_cpu))
8190
+ goto next;
8191
+
8192
+ env->imbalance = 0;
8193
+ break;
8194
+ }
82968195
82978196 detach_task(p, env);
82988197 list_add(&p->se.group_node, &env->tasks);
82998198
83008199 detached++;
8301
- env->imbalance -= load;
83028200
8303
-#ifdef CONFIG_PREEMPT
8201
+#ifdef CONFIG_PREEMPTION
83048202 /*
83058203 * NEWIDLE balancing is a source of latency, so preemptible
83068204 * kernels will stop after the first task is detached to minimize
....@@ -8312,7 +8210,7 @@
83128210
83138211 /*
83148212 * We only want to steal up to the prescribed amount of
8315
- * weighted load.
8213
+ * load/util/tasks.
83168214 */
83178215 if (env->imbalance <= 0)
83188216 break;
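A self-contained sketch of how the new switch above decrements env->imbalance per migration_type; shr_bound() is approximated from its apparent meaning (a shift clamped to the type width) and all values are invented.

#include <stdio.h>

enum migration_type { migrate_load, migrate_util, migrate_task, migrate_misfit };

/* Approximation of the kernel's shr_bound(): shift clamped to the type width. */
static long shr_bound(long val, unsigned int shift)
{
	unsigned int max_shift = sizeof(val) * 8 - 1;

	return val >> (shift < max_shift ? shift : max_shift);
}

/*
 * Returns 1 when the task (given its load/util) is detached and the remaining
 * imbalance updated, 0 when it should be skipped ("goto next" above).
 */
static int take_task(enum migration_type type, long *imbalance,
		     long load, long util, unsigned int nr_balance_failed)
{
	switch (type) {
	case migrate_load:
		/* the longer balancing keeps failing, the more load we accept */
		if (shr_bound(load, nr_balance_failed) > *imbalance)
			return 0;
		*imbalance -= load;
		return 1;
	case migrate_util:
		if (util > *imbalance)
			return 0;
		*imbalance -= util;
		return 1;
	case migrate_task:
		(*imbalance)--;
		return 1;
	case migrate_misfit:
		*imbalance = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	long imb = 300;

	/* a 512-load task is too big on the first try ... */
	printf("%d imb=%ld\n", take_task(migrate_load, &imb, 512, 0, 0), imb);
	/* ... but accepted once a balance attempt has failed (512 >> 1 = 256);
	 * the now non-positive imbalance ends the detach loop. */
	printf("%d imb=%ld\n", take_task(migrate_load, &imb, 512, 0, 1), imb);
	return 0;
}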
....@@ -8341,7 +8239,6 @@
83418239
83428240 BUG_ON(task_rq(p) != rq);
83438241 activate_task(rq, p, ENQUEUE_NOCLOCK);
8344
- p->on_rq = TASK_ON_RQ_QUEUED;
83458242 check_preempt_curr(rq, p, 0);
83468243 }
83478244
....@@ -8382,6 +8279,7 @@
83828279 rq_unlock(env->dst_rq, &rf);
83838280 }
83848281
8282
+#ifdef CONFIG_NO_HZ_COMMON
83858283 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83868284 {
83878285 if (cfs_rq->avg.load_avg)
....@@ -8401,12 +8299,54 @@
84018299 if (READ_ONCE(rq->avg_dl.util_avg))
84028300 return true;
84038301
8302
+ if (thermal_load_avg(rq))
8303
+ return true;
8304
+
84048305 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84058306 if (READ_ONCE(rq->avg_irq.util_avg))
84068307 return true;
84078308 #endif
84088309
84098310 return false;
8311
+}
8312
+
8313
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8314
+{
8315
+ rq->last_blocked_load_update_tick = jiffies;
8316
+
8317
+ if (!has_blocked)
8318
+ rq->has_blocked_load = 0;
8319
+}
8320
+#else
8321
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8322
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8323
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8324
+#endif
8325
+
8326
+static bool __update_blocked_others(struct rq *rq, bool *done)
8327
+{
8328
+ const struct sched_class *curr_class;
8329
+ u64 now = rq_clock_pelt(rq);
8330
+ unsigned long thermal_pressure;
8331
+ bool decayed;
8332
+
8333
+ /*
8334
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8335
+ * DL and IRQ signals have been updated before updating CFS.
8336
+ */
8337
+ curr_class = rq->curr->sched_class;
8338
+
8339
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8340
+
8341
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8342
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8343
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8344
+ update_irq_load_avg(rq, 0);
8345
+
8346
+ if (others_have_blocked(rq))
8347
+ *done = false;
8348
+
8349
+ return decayed;
84108350 }
84118351
84128352 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8422,22 +8362,17 @@
84228362 if (cfs_rq->avg.util_sum)
84238363 return false;
84248364
8425
- if (cfs_rq->avg.runnable_load_sum)
8365
+ if (cfs_rq->avg.runnable_sum)
84268366 return false;
84278367
84288368 return true;
84298369 }
84308370
8431
-static void update_blocked_averages(int cpu)
8371
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84328372 {
8433
- struct rq *rq = cpu_rq(cpu);
84348373 struct cfs_rq *cfs_rq, *pos;
8435
- const struct sched_class *curr_class;
8436
- struct rq_flags rf;
8437
- bool done = true;
8438
-
8439
- rq_lock_irqsave(rq, &rf);
8440
- update_rq_clock(rq);
8374
+ bool decayed = false;
8375
+ int cpu = cpu_of(rq);
84418376
84428377 /*
84438378 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8446,8 +8381,12 @@
84468381 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84478382 struct sched_entity *se;
84488383
8449
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8450
- update_tg_load_avg(cfs_rq, 0);
8384
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8385
+ update_tg_load_avg(cfs_rq);
8386
+
8387
+ if (cfs_rq == &rq->cfs)
8388
+ decayed = true;
8389
+ }
84518390
84528391 /* Propagate pending load changes to the parent, if any: */
84538392 se = cfs_rq->tg->se[cpu];
....@@ -8463,23 +8402,10 @@
84638402
84648403 /* Don't need periodic decay once load/util_avg are null */
84658404 if (cfs_rq_has_blocked(cfs_rq))
8466
- done = false;
8405
+ *done = false;
84678406 }
84688407
8469
- curr_class = rq->curr->sched_class;
8470
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8471
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8472
- update_irq_load_avg(rq, 0);
8473
- /* Don't need periodic decay once load/util_avg are null */
8474
- if (others_have_blocked(rq))
8475
- done = false;
8476
-
8477
-#ifdef CONFIG_NO_HZ_COMMON
8478
- rq->last_blocked_load_update_tick = jiffies;
8479
- if (done)
8480
- rq->has_blocked_load = 0;
8481
-#endif
8482
- rq_unlock_irqrestore(rq, &rf);
8408
+ return decayed;
84838409 }
84848410
84858411 /*
....@@ -8529,27 +8455,16 @@
85298455 cfs_rq_load_avg(cfs_rq) + 1);
85308456 }
85318457 #else
8532
-static inline void update_blocked_averages(int cpu)
8458
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85338459 {
8534
- struct rq *rq = cpu_rq(cpu);
85358460 struct cfs_rq *cfs_rq = &rq->cfs;
8536
- const struct sched_class *curr_class;
8537
- struct rq_flags rf;
8461
+ bool decayed;
85388462
8539
- rq_lock_irqsave(rq, &rf);
8540
- update_rq_clock(rq);
8541
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8463
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8464
+ if (cfs_rq_has_blocked(cfs_rq))
8465
+ *done = false;
85428466
8543
- curr_class = rq->curr->sched_class;
8544
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8545
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8546
- update_irq_load_avg(rq, 0);
8547
-#ifdef CONFIG_NO_HZ_COMMON
8548
- rq->last_blocked_load_update_tick = jiffies;
8549
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8550
- rq->has_blocked_load = 0;
8551
-#endif
8552
- rq_unlock_irqrestore(rq, &rf);
8467
+ return decayed;
85538468 }
85548469
85558470 static unsigned long task_h_load(struct task_struct *p)
....@@ -8557,6 +8472,24 @@
85578472 return p->se.avg.load_avg;
85588473 }
85598474 #endif
8475
+
8476
+static void update_blocked_averages(int cpu)
8477
+{
8478
+ bool decayed = false, done = true;
8479
+ struct rq *rq = cpu_rq(cpu);
8480
+ struct rq_flags rf;
8481
+
8482
+ rq_lock_irqsave(rq, &rf);
8483
+ update_rq_clock(rq);
8484
+
8485
+ decayed |= __update_blocked_others(rq, &done);
8486
+ decayed |= __update_blocked_fair(rq, &done);
8487
+
8488
+ update_blocked_load_status(rq, !done);
8489
+ if (decayed)
8490
+ cpufreq_update_util(rq, 0);
8491
+ rq_unlock_irqrestore(rq, &rf);
8492
+}
85608493
85618494 /********** Helpers for find_busiest_group ************************/
85628495
....@@ -8566,15 +8499,15 @@
85668499 struct sg_lb_stats {
85678500 unsigned long avg_load; /*Avg load across the CPUs of the group */
85688501 unsigned long group_load; /* Total load over the CPUs of the group */
8569
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8570
- unsigned long load_per_task;
85718502 unsigned long group_capacity;
8572
- unsigned long group_util; /* Total utilization of the group */
8573
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8503
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8504
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8505
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8506
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85748507 unsigned int idle_cpus;
85758508 unsigned int group_weight;
85768509 enum group_type group_type;
8577
- int group_no_capacity;
8510
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85788511 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85798512 #ifdef CONFIG_NUMA_BALANCING
85808513 unsigned int nr_numa_running;
....@@ -8589,10 +8522,10 @@
85898522 struct sd_lb_stats {
85908523 struct sched_group *busiest; /* Busiest group in this sd */
85918524 struct sched_group *local; /* Local group in this sd */
8592
- unsigned long total_running;
85938525 unsigned long total_load; /* Total load of all groups in sd */
85948526 unsigned long total_capacity; /* Total capacity of all groups in sd */
85958527 unsigned long avg_load; /* Average load across all groups in sd */
8528
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85968529
85978530 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85988531 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8603,54 +8536,26 @@
86038536 /*
86048537 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86058538 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8606
- * We must however clear busiest_stat::avg_load because
8607
- * update_sd_pick_busiest() reads this before assignment.
8539
+ * We must however set busiest_stat::group_type and
8540
+ * busiest_stat::idle_cpus to the worst busiest group because
8541
+ * update_sd_pick_busiest() reads these before assignment.
86088542 */
86098543 *sds = (struct sd_lb_stats){
86108544 .busiest = NULL,
86118545 .local = NULL,
8612
- .total_running = 0UL,
86138546 .total_load = 0UL,
86148547 .total_capacity = 0UL,
86158548 .busiest_stat = {
8616
- .avg_load = 0UL,
8617
- .sum_nr_running = 0,
8618
- .group_type = group_other,
8549
+ .idle_cpus = UINT_MAX,
8550
+ .group_type = group_has_spare,
86198551 },
86208552 };
86218553 }
86228554
8623
-/**
8624
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8625
- * @sd: The sched_domain whose load_idx is to be obtained.
8626
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8627
- *
8628
- * Return: The load index.
8629
- */
8630
-static inline int get_sd_load_idx(struct sched_domain *sd,
8631
- enum cpu_idle_type idle)
8632
-{
8633
- int load_idx;
8634
-
8635
- switch (idle) {
8636
- case CPU_NOT_IDLE:
8637
- load_idx = sd->busy_idx;
8638
- break;
8639
-
8640
- case CPU_NEWLY_IDLE:
8641
- load_idx = sd->newidle_idx;
8642
- break;
8643
- default:
8644
- load_idx = sd->idle_idx;
8645
- break;
8646
- }
8647
-
8648
- return load_idx;
8649
-}
8650
-
8651
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8555
+static unsigned long scale_rt_capacity(int cpu)
86528556 {
86538557 struct rq *rq = cpu_rq(cpu);
8558
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86548559 unsigned long used, free;
86558560 unsigned long irq;
86568561
....@@ -8659,8 +8564,15 @@
86598564 if (unlikely(irq >= max))
86608565 return 1;
86618566
8567
+ /*
8568
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8569
+ * (running and not running) with weights 0 and 1024 respectively.
8570
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8571
+ * average uses the actual delta of max capacity (load).
8572
+ */
86628573 used = READ_ONCE(rq->avg_rt.util_avg);
86638574 used += READ_ONCE(rq->avg_dl.util_avg);
8575
+ used += thermal_load_avg(rq);
86648576
86658577 if (unlikely(used >= max))
86668578 return 1;
....@@ -8670,52 +8582,20 @@
86708582 return scale_irq_capacity(free, irq, max);
86718583 }
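A toy recomputation of the capacity math above: RT, DL and thermal pressure are subtracted from the CPU's original capacity, then the remainder is scaled by the non-IRQ fraction of time. scale_irq_capacity() is reproduced here from its apparent definition (free * (max - irq) / max) and should be read as an assumption; the numbers are made up.

#include <stdio.h>

/* Assumed shape of the kernel helper: scale 'free' by the non-IRQ fraction. */
static unsigned long scale_irq_capacity(unsigned long free, unsigned long irq,
					unsigned long max)
{
	return free * (max - irq) / max;
}

static unsigned long scale_rt_capacity_toy(unsigned long max,
					   unsigned long irq,
					   unsigned long rt_util,
					   unsigned long dl_util,
					   unsigned long thermal)
{
	unsigned long used, free;

	if (irq >= max)
		return 1;

	used = rt_util + dl_util + thermal;	/* pressure from non-CFS work */
	if (used >= max)
		return 1;

	free = max - used;
	return scale_irq_capacity(free, irq, max);
}

int main(void)
{
	/* 1024-capacity CPU, ~5% IRQ time, some RT and thermal pressure */
	printf("cfs capacity ~= %lu\n",
	       scale_rt_capacity_toy(1024, 51, 100, 0, 80)); /* prints 801 */
	return 0;
}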
86728584
8673
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8674
- raw_spin_lock_init(&mcc->lock);
8675
- mcc->val = 0;
8676
- mcc->cpu = -1;
8677
-}
8678
-
86798585 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86808586 {
8681
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8587
+ unsigned long capacity = scale_rt_capacity(cpu);
86828588 struct sched_group *sdg = sd->groups;
8683
- struct max_cpu_capacity *mcc;
8684
- unsigned long max_capacity;
8685
- int max_cap_cpu;
8686
- unsigned long flags;
86878589
8688
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8689
-
8690
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8691
- capacity >>= SCHED_CAPACITY_SHIFT;
8692
-
8693
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8694
-
8695
- raw_spin_lock_irqsave(&mcc->lock, flags);
8696
- max_capacity = mcc->val;
8697
- max_cap_cpu = mcc->cpu;
8698
-
8699
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8700
- (max_capacity < capacity)) {
8701
- mcc->val = capacity;
8702
- mcc->cpu = cpu;
8703
-#ifdef CONFIG_SCHED_DEBUG
8704
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8705
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8706
- // cpu, capacity);
8707
- goto skip_unlock;
8708
-#endif
8709
- }
8710
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8711
-
8712
-skip_unlock: __attribute__ ((unused));
8713
- capacity = scale_rt_capacity(cpu, capacity);
8590
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87148591
87158592 if (!capacity)
87168593 capacity = 1;
87178594
8595
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87188596 cpu_rq(cpu)->cpu_capacity = capacity;
8597
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8598
+
87198599 sdg->sgc->capacity = capacity;
87208600 sdg->sgc->min_capacity = capacity;
87218601 sdg->sgc->max_capacity = capacity;
....@@ -8748,29 +8628,11 @@
87488628 */
87498629
87508630 for_each_cpu(cpu, sched_group_span(sdg)) {
8751
- struct sched_group_capacity *sgc;
8752
- struct rq *rq = cpu_rq(cpu);
8631
+ unsigned long cpu_cap = capacity_of(cpu);
87538632
8754
- /*
8755
- * build_sched_domains() -> init_sched_groups_capacity()
8756
- * gets here before we've attached the domains to the
8757
- * runqueues.
8758
- *
8759
- * Use capacity_of(), which is set irrespective of domains
8760
- * in update_cpu_capacity().
8761
- *
8762
- * This avoids capacity from being 0 and
8763
- * causing divide-by-zero issues on boot.
8764
- */
8765
- if (unlikely(!rq->sd)) {
8766
- capacity += capacity_of(cpu);
8767
- } else {
8768
- sgc = rq->sd->groups->sgc;
8769
- capacity += sgc->capacity;
8770
- }
8771
-
8772
- min_capacity = min(capacity, min_capacity);
8773
- max_capacity = max(capacity, max_capacity);
8633
+ capacity += cpu_cap;
8634
+ min_capacity = min(cpu_cap, min_capacity);
8635
+ max_capacity = max(cpu_cap, max_capacity);
87748636 }
87758637 } else {
87768638 /*
....@@ -8804,6 +8666,18 @@
88048666 {
88058667 return ((rq->cpu_capacity * sd->imbalance_pct) <
88068668 (rq->cpu_capacity_orig * 100));
8669
+}
8670
+
8671
+/*
8672
+ * Check whether a rq has a misfit task and if it looks like we can actually
8673
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8674
+ * the task's current CPU is heavily pressured.
8675
+ */
8676
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8677
+{
8678
+ return rq->misfit_task_load &&
8679
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8680
+ check_cpu_capacity(rq, sd));
88078681 }
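The two predicates above reduce to simple ratio tests; a hedged standalone restatement with invented numbers (imbalance_pct = 117 is a typical value, not taken from this patch).

#include <stdio.h>
#include <stdbool.h>

/* cpu_capacity is "pressured" once it drops below orig scaled by imbalance_pct. */
static bool check_cpu_capacity_toy(unsigned long cap, unsigned long cap_orig,
				   unsigned int imbalance_pct)
{
	return cap * imbalance_pct < cap_orig * 100;
}

/* A misfit task is worth helping if a bigger CPU exists in the root domain,
 * or the task's current CPU is itself capacity-pressured. */
static bool check_misfit_status_toy(unsigned long misfit_load,
				    unsigned long cap, unsigned long cap_orig,
				    unsigned long rd_max_capacity,
				    unsigned int imbalance_pct)
{
	return misfit_load &&
	       (cap_orig < rd_max_capacity ||
		check_cpu_capacity_toy(cap, cap_orig, imbalance_pct));
}

int main(void)
{
	/* little CPU (orig 430) in a system whose biggest CPU has 1024 */
	printf("%d\n", check_misfit_status_toy(300, 430, 430, 1024, 117));   /* 1 */
	/* big unpressured CPU: nothing to gain from moving the task */
	printf("%d\n", check_misfit_status_toy(300, 1024, 1024, 1024, 117)); /* 0 */
	return 0;
}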
88088682
88098683 /*
....@@ -8853,13 +8727,17 @@
88538727 * any benefit for the load balance.
88548728 */
88558729 static inline bool
8856
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8730
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88578731 {
88588732 if (sgs->sum_nr_running < sgs->group_weight)
88598733 return true;
88608734
8735
+ if ((sgs->group_capacity * imbalance_pct) <
8736
+ (sgs->group_runnable * 100))
8737
+ return false;
8738
+
88618739 if ((sgs->group_capacity * 100) >
8862
- (sgs->group_util * env->sd->imbalance_pct))
8740
+ (sgs->group_util * imbalance_pct))
88638741 return true;
88648742
88658743 return false;
....@@ -8874,13 +8752,17 @@
88748752 * false.
88758753 */
88768754 static inline bool
8877
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8755
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88788756 {
88798757 if (sgs->sum_nr_running <= sgs->group_weight)
88808758 return false;
88818759
88828760 if ((sgs->group_capacity * 100) <
8883
- (sgs->group_util * env->sd->imbalance_pct))
8761
+ (sgs->group_util * imbalance_pct))
8762
+ return true;
8763
+
8764
+ if ((sgs->group_capacity * imbalance_pct) <
8765
+ (sgs->group_runnable * 100))
88848766 return true;
88858767
88868768 return false;
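With the group_runnable term added above, both predicates compare capacity against utilization and runnable pressure scaled by imbalance_pct. A standalone restatement with invented figures (imbalance_pct = 117 is an assumption):

#include <stdio.h>
#include <stdbool.h>

struct sg_stats {
	unsigned long group_capacity;
	unsigned long group_util;
	unsigned long group_runnable;
	unsigned int  sum_nr_running;
	unsigned int  group_weight;
};

static bool group_has_capacity_toy(unsigned int imbalance_pct,
				   const struct sg_stats *s)
{
	if (s->sum_nr_running < s->group_weight)
		return true;
	if (s->group_capacity * imbalance_pct < s->group_runnable * 100)
		return false;			/* runnable pressure too high */
	if (s->group_capacity * 100 > s->group_util * imbalance_pct)
		return true;			/* util still leaves headroom */
	return false;
}

static bool group_is_overloaded_toy(unsigned int imbalance_pct,
				    const struct sg_stats *s)
{
	if (s->sum_nr_running <= s->group_weight)
		return false;
	if (s->group_capacity * 100 < s->group_util * imbalance_pct)
		return true;
	if (s->group_capacity * imbalance_pct < s->group_runnable * 100)
		return true;
	return false;
}

int main(void)
{
	/* 2 CPUs, capacity 2048, 5 runnable tasks, heavy utilization */
	struct sg_stats s = { 2048, 1900, 2300, 5, 2 };

	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity_toy(117, &s),
	       group_is_overloaded_toy(117, &s));	/* 0 and 1 */
	return 0;
}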
....@@ -8893,8 +8775,7 @@
88938775 static inline bool
88948776 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88958777 {
8896
- return sg->sgc->min_capacity * capacity_margin <
8897
- ref->sgc->min_capacity * 1024;
8778
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88988779 }
88998780
89008781 /*
....@@ -8904,24 +8785,30 @@
89048785 static inline bool
89058786 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89068787 {
8907
- return sg->sgc->max_capacity * capacity_margin <
8908
- ref->sgc->max_capacity * 1024;
8788
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89098789 }
89108790
89118791 static inline enum
8912
-group_type group_classify(struct sched_group *group,
8792
+group_type group_classify(unsigned int imbalance_pct,
8793
+ struct sched_group *group,
89138794 struct sg_lb_stats *sgs)
89148795 {
8915
- if (sgs->group_no_capacity)
8796
+ if (group_is_overloaded(imbalance_pct, sgs))
89168797 return group_overloaded;
89178798
89188799 if (sg_imbalanced(group))
89198800 return group_imbalanced;
89208801
8802
+ if (sgs->group_asym_packing)
8803
+ return group_asym_packing;
8804
+
89218805 if (sgs->group_misfit_task_load)
89228806 return group_misfit_task;
89238807
8924
- return group_other;
8808
+ if (!group_has_capacity(imbalance_pct, sgs))
8809
+ return group_fully_busy;
8810
+
8811
+ return group_has_spare;
89258812 }
89268813
89278814 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8958,12 +8845,11 @@
89588845 struct sg_lb_stats *sgs,
89598846 int *sg_status)
89608847 {
8961
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8962
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8963
- unsigned long load;
8964
- int i, nr_running;
8848
+ int i, nr_running, local_group;
89658849
89668850 memset(sgs, 0, sizeof(*sgs));
8851
+
8852
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89678853
89688854 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89698855 struct rq *rq = cpu_rq(i);
....@@ -8971,17 +8857,14 @@
89718857 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89728858 env->flags |= LBF_NOHZ_AGAIN;
89738859
8974
- /* Bias balancing toward CPUs of our domain: */
8975
- if (local_group)
8976
- load = target_load(i, load_idx);
8977
- else
8978
- load = source_load(i, load_idx);
8979
-
8980
- sgs->group_load += load;
8860
+ sgs->group_load += cpu_load(rq);
89818861 sgs->group_util += cpu_util(i);
8982
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8862
+ sgs->group_runnable += cpu_runnable(rq);
8863
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89838864
89848865 nr_running = rq->nr_running;
8866
+ sgs->sum_nr_running += nr_running;
8867
+
89858868 if (nr_running > 1)
89868869 *sg_status |= SG_OVERLOAD;
89878870
....@@ -8992,13 +8875,19 @@
89928875 sgs->nr_numa_running += rq->nr_numa_running;
89938876 sgs->nr_preferred_running += rq->nr_preferred_running;
89948877 #endif
8995
- sgs->sum_weighted_load += weighted_cpuload(rq);
89968878 /*
89978879 * No need to call idle_cpu() if nr_running is not 0
89988880 */
8999
- if (!nr_running && idle_cpu(i))
8881
+ if (!nr_running && idle_cpu(i)) {
90008882 sgs->idle_cpus++;
8883
+ /* Idle cpu can't have misfit task */
8884
+ continue;
8885
+ }
90018886
8887
+ if (local_group)
8888
+ continue;
8889
+
8890
+ /* Check for a misfit task on the cpu */
90028891 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90038892 sgs->group_misfit_task_load < rq->misfit_task_load) {
90048893 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9006,17 +8895,24 @@
90068895 }
90078896 }
90088897
9009
- /* Adjust by relative CPU capacity of the group */
9010
- sgs->group_capacity = group->sgc->capacity;
9011
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8898
+ /* Check if dst CPU is idle and preferred to this group */
8899
+ if (env->sd->flags & SD_ASYM_PACKING &&
8900
+ env->idle != CPU_NOT_IDLE &&
8901
+ sgs->sum_h_nr_running &&
8902
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8903
+ sgs->group_asym_packing = 1;
8904
+ }
90128905
9013
- if (sgs->sum_nr_running)
9014
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8906
+ sgs->group_capacity = group->sgc->capacity;
90158907
90168908 sgs->group_weight = group->group_weight;
90178909
9018
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9019
- sgs->group_type = group_classify(group, sgs);
8910
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8911
+
8912
+ /* Computing avg_load makes sense only when group is overloaded */
8913
+ if (sgs->group_type == group_overloaded)
8914
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8915
+ sgs->group_capacity;
90208916 }
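A small worked example of the avg_load expression that is now computed only for overloaded groups (SCHED_CAPACITY_SCALE is 1024; the group figures are invented): normalising by capacity lets big and little groups be compared on equal terms.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* load normalised by group capacity */
static unsigned long sg_avg_load(unsigned long group_load,
				 unsigned long group_capacity)
{
	return group_load * SCHED_CAPACITY_SCALE / group_capacity;
}

int main(void)
{
	/* 4 little CPUs (4 * 430) vs 2 big CPUs (2 * 1024), same raw load */
	printf("little: %lu  big: %lu\n",
	       sg_avg_load(3000, 4 * 430),	/* 1786: relatively busier */
	       sg_avg_load(3000, 2 * 1024));	/* 1500 */
	return 0;
}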
90218917
90228918 /**
....@@ -9039,6 +8935,10 @@
90398935 {
90408936 struct sg_lb_stats *busiest = &sds->busiest_stat;
90418937
8938
+ /* Make sure that there is at least one task to pull */
8939
+ if (!sgs->sum_h_nr_running)
8940
+ return false;
8941
+
90428942 /*
90438943 * Don't try to pull misfit tasks we can't help.
90448944 * We can use max_capacity here as reduction in capacity on some
....@@ -9047,7 +8947,7 @@
90478947 */
90488948 if (sgs->group_type == group_misfit_task &&
90498949 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9050
- !group_has_capacity(env, &sds->local_stat)))
8950
+ sds->local_stat.group_type != group_has_spare))
90518951 return false;
90528952
90538953 if (sgs->group_type > busiest->group_type)
....@@ -9056,62 +8956,92 @@
90568956 if (sgs->group_type < busiest->group_type)
90578957 return false;
90588958
9059
- if (sgs->avg_load <= busiest->avg_load)
8959
+ /*
8960
+ * The candidate and the current busiest group are the same type of
8961
+ * group. Let's check which one is the busiest according to the type.
8962
+ */
8963
+
8964
+ switch (sgs->group_type) {
8965
+ case group_overloaded:
8966
+ /* Select the overloaded group with highest avg_load. */
8967
+ if (sgs->avg_load <= busiest->avg_load)
8968
+ return false;
8969
+ break;
8970
+
8971
+ case group_imbalanced:
8972
+ /*
8973
+ * Select the 1st imbalanced group as we don't have any way to
8974
+ * choose one more than another.
8975
+ */
90608976 return false;
90618977
9062
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9063
- goto asym_packing;
9064
-
9065
- /*
9066
- * Candidate sg has no more than one task per CPU and
9067
- * has higher per-CPU capacity. Migrating tasks to less
9068
- * capable CPUs may harm throughput. Maximize throughput,
9069
- * power/energy consequences are not considered.
9070
- */
9071
- if (sgs->sum_nr_running <= sgs->group_weight &&
9072
- group_smaller_min_cpu_capacity(sds->local, sg))
9073
- return false;
9074
-
9075
- /*
9076
- * If we have more than one misfit sg go with the biggest misfit.
9077
- */
9078
- if (sgs->group_type == group_misfit_task &&
9079
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9080
- return false;
9081
-
9082
-asym_packing:
9083
- /* This is the busiest node in its class. */
9084
- if (!(env->sd->flags & SD_ASYM_PACKING))
9085
- return true;
9086
-
9087
- /* No ASYM_PACKING if target CPU is already busy */
9088
- if (env->idle == CPU_NOT_IDLE)
9089
- return true;
9090
- /*
9091
- * ASYM_PACKING needs to move all the work to the highest
9092
- * prority CPUs in the group, therefore mark all groups
9093
- * of lower priority than ourself as busy.
9094
- */
9095
- if (sgs->sum_nr_running &&
9096
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9097
- if (!sds->busiest)
9098
- return true;
9099
-
8978
+ case group_asym_packing:
91008979 /* Prefer to move from lowest priority CPU's work */
9101
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9102
- sg->asym_prefer_cpu))
9103
- return true;
8980
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8981
+ return false;
8982
+ break;
8983
+
8984
+ case group_misfit_task:
8985
+ /*
8986
+ * If we have more than one misfit sg go with the biggest
8987
+ * misfit.
8988
+ */
8989
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8990
+ return false;
8991
+ break;
8992
+
8993
+ case group_fully_busy:
8994
+ /*
8995
+ * Select the fully busy group with highest avg_load. In
8996
+ * theory, there is no need to pull task from such kind of
8997
+ * group because tasks have all compute capacity that they need
8998
+ * but we can still improve the overall throughput by reducing
8999
+ * contention when accessing shared HW resources.
9000
+ *
9001
+ * XXX for now avg_load is not computed and always 0 so we
9002
+ * select the 1st one.
9003
+ */
9004
+ if (sgs->avg_load <= busiest->avg_load)
9005
+ return false;
9006
+ break;
9007
+
9008
+ case group_has_spare:
9009
+ /*
9010
+ * Select not overloaded group with lowest number of idle cpus
9011
+ * and highest number of running tasks. We could also compare
9012
+ * the spare capacity which is more stable but it can end up
9013
+ * that the group has less spare capacity but finally more idle
9014
+ * CPUs which means less opportunity to pull tasks.
9015
+ */
9016
+ if (sgs->idle_cpus > busiest->idle_cpus)
9017
+ return false;
9018
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
9019
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
9020
+ return false;
9021
+
9022
+ break;
91049023 }
91059024
9106
- return false;
9025
+ /*
9026
+ * Candidate sg has no more than one task per CPU and has higher
9027
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
9028
+ * throughput. Maximize throughput, power/energy consequences are not
9029
+ * considered.
9030
+ */
9031
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
9032
+ (sgs->group_type <= group_fully_busy) &&
9033
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
9034
+ return false;
9035
+
9036
+ return true;
91079037 }
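A compact sketch of the group_has_spare tie-break added above — the candidate with fewer idle CPUs wins, and on a tie the one with more running tasks; the struct is a local stand-in for sg_lb_stats, not the kernel type.

#include <stdio.h>
#include <stdbool.h>

struct spare_stats {
	unsigned int idle_cpus;
	unsigned int sum_nr_running;
};

/* true if 'cand' should replace 'busiest' when both merely have spare capacity */
static bool busier_has_spare(const struct spare_stats *cand,
			     const struct spare_stats *busiest)
{
	if (cand->idle_cpus > busiest->idle_cpus)
		return false;
	if (cand->idle_cpus == busiest->idle_cpus &&
	    cand->sum_nr_running <= busiest->sum_nr_running)
		return false;
	return true;
}

int main(void)
{
	struct spare_stats a = { .idle_cpus = 1, .sum_nr_running = 5 };
	struct spare_stats b = { .idle_cpus = 2, .sum_nr_running = 7 };

	printf("%d\n", busier_has_spare(&a, &b)); /* 1: fewer idle CPUs wins */
	printf("%d\n", busier_has_spare(&b, &a)); /* 0 */
	return 0;
}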
91089038
91099039 #ifdef CONFIG_NUMA_BALANCING
91109040 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91119041 {
9112
- if (sgs->sum_nr_running > sgs->nr_numa_running)
9042
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91139043 return regular;
9114
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
9044
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91159045 return remote;
91169046 return all;
91179047 }
....@@ -9136,18 +9066,338 @@
91369066 }
91379067 #endif /* CONFIG_NUMA_BALANCING */
91389068
9069
+
9070
+struct sg_lb_stats;
9071
+
9072
+/*
9073
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
9074
+ */
9075
+
9076
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
9077
+{
9078
+ /* Task has no contribution or is new */
9079
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
9080
+ return 0;
9081
+
9082
+ if (task_on_rq_queued(p))
9083
+ return 1;
9084
+
9085
+ return 0;
9086
+}
9087
+
9088
+/**
9089
+ * idle_cpu_without - would a given CPU be idle without p ?
9090
+ * @cpu: the processor on which idleness is tested.
9091
+ * @p: task which should be ignored.
9092
+ *
9093
+ * Return: 1 if the CPU would be idle. 0 otherwise.
9094
+ */
9095
+static int idle_cpu_without(int cpu, struct task_struct *p)
9096
+{
9097
+ struct rq *rq = cpu_rq(cpu);
9098
+
9099
+ if (rq->curr != rq->idle && rq->curr != p)
9100
+ return 0;
9101
+
9102
+ /*
9103
+ * rq->nr_running can't be used but an updated version without the
9104
+ * impact of p on cpu must be used instead. The updated nr_running
9105
+ * must be computed and tested before calling idle_cpu_without().
9106
+ */
9107
+
9108
+#ifdef CONFIG_SMP
9109
+ if (rq->ttwu_pending)
9110
+ return 0;
9111
+#endif
9112
+
9113
+ return 1;
9114
+}
9115
+
9116
+/*
9117
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
9118
+ * @sd: The sched_domain level to look for idlest group.
9119
+ * @group: sched_group whose statistics are to be updated.
9120
+ * @sgs: variable to hold the statistics for this group.
9121
+ * @p: The task for which we look for the idlest group/CPU.
9122
+ */
9123
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
9124
+ struct sched_group *group,
9125
+ struct sg_lb_stats *sgs,
9126
+ struct task_struct *p)
9127
+{
9128
+ int i, nr_running;
9129
+
9130
+ memset(sgs, 0, sizeof(*sgs));
9131
+
9132
+ /* Assume that task can't fit any CPU of the group */
9133
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
9134
+ sgs->group_misfit_task_load = 1;
9135
+
9136
+ for_each_cpu(i, sched_group_span(group)) {
9137
+ struct rq *rq = cpu_rq(i);
9138
+ unsigned int local;
9139
+
9140
+ sgs->group_load += cpu_load_without(rq, p);
9141
+ sgs->group_util += cpu_util_without(i, p);
9142
+ sgs->group_runnable += cpu_runnable_without(rq, p);
9143
+ local = task_running_on_cpu(i, p);
9144
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
9145
+
9146
+ nr_running = rq->nr_running - local;
9147
+ sgs->sum_nr_running += nr_running;
9148
+
9149
+ /*
9150
+ * No need to call idle_cpu_without() if nr_running is not 0
9151
+ */
9152
+ if (!nr_running && idle_cpu_without(i, p))
9153
+ sgs->idle_cpus++;
9154
+
9155
+ /* Check if task fits in the CPU */
9156
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
9157
+ sgs->group_misfit_task_load &&
9158
+ task_fits_cpu(p, i))
9159
+ sgs->group_misfit_task_load = 0;
9160
+
9161
+ }
9162
+
9163
+ sgs->group_capacity = group->sgc->capacity;
9164
+
9165
+ sgs->group_weight = group->group_weight;
9166
+
9167
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
9168
+
9169
+ /*
9170
+ * Computing avg_load makes sense only when group is fully busy or
9171
+ * overloaded
9172
+ */
9173
+ if (sgs->group_type == group_fully_busy ||
9174
+ sgs->group_type == group_overloaded)
9175
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9176
+ sgs->group_capacity;
9177
+}
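The *_without() accounting above removes the waking task's own contribution from the CPU it is still counted on; a toy restatement of the nr_running adjustment and the idle test it feeds (all values invented).

#include <stdio.h>
#include <stdbool.h>

/* Does @cpu still count task p in its nr_running? (p queued on that CPU) */
static unsigned int task_running_on_cpu_toy(int cpu, int task_cpu, bool queued)
{
	return (cpu == task_cpu && queued) ? 1 : 0;
}

int main(void)
{
	int cpu = 3, task_cpu = 3;
	bool queued = true;
	unsigned int rq_nr_running = 1;

	unsigned int local = task_running_on_cpu_toy(cpu, task_cpu, queued);
	unsigned int nr_running = rq_nr_running - local;

	/* with its only task ignored, CPU 3 would be idle for this wakeup */
	printf("nr_running without p = %u, idle candidate = %d\n",
	       nr_running, nr_running == 0);
	return 0;
}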
9178
+
9179
+static bool update_pick_idlest(struct sched_group *idlest,
9180
+ struct sg_lb_stats *idlest_sgs,
9181
+ struct sched_group *group,
9182
+ struct sg_lb_stats *sgs)
9183
+{
9184
+ if (sgs->group_type < idlest_sgs->group_type)
9185
+ return true;
9186
+
9187
+ if (sgs->group_type > idlest_sgs->group_type)
9188
+ return false;
9189
+
9190
+ /*
9191
+ * The candidate and the current idlest group are the same type of
9192
+ * group. Let's check which one is the idlest according to the type.
9193
+ */
9194
+
9195
+ switch (sgs->group_type) {
9196
+ case group_overloaded:
9197
+ case group_fully_busy:
9198
+ /* Select the group with lowest avg_load. */
9199
+ if (idlest_sgs->avg_load <= sgs->avg_load)
9200
+ return false;
9201
+ break;
9202
+
9203
+ case group_imbalanced:
9204
+ case group_asym_packing:
9205
+ /* Those types are not used in the slow wakeup path */
9206
+ return false;
9207
+
9208
+ case group_misfit_task:
9209
+ /* Select group with the highest max capacity */
9210
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9211
+ return false;
9212
+ break;
9213
+
9214
+ case group_has_spare:
9215
+ /* Select group with most idle CPUs */
9216
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9217
+ return false;
9218
+
9219
+ /* Select group with lowest group_util */
9220
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9221
+ idlest_sgs->group_util <= sgs->group_util)
9222
+ return false;
9223
+
9224
+ break;
9225
+ }
9226
+
9227
+ return true;
9228
+}
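update_pick_idlest() mirrors the busiest-group pick: a lower group_type always wins, and for group_has_spare the tie-break is most idle CPUs, then lowest utilization. A minimal sketch of just that tie-break (other group types would also need avg_load, which is omitted here).

#include <stdio.h>
#include <stdbool.h>

struct idlest_stats {
	int group_type;			/* lower value = more idle */
	unsigned int idle_cpus;
	unsigned long group_util;
};

/* Only the group_has_spare tie-break is modelled for equal types. */
static bool pick_idlest(const struct idlest_stats *cand,
			const struct idlest_stats *idlest)
{
	if (cand->group_type < idlest->group_type)
		return true;
	if (cand->group_type > idlest->group_type)
		return false;
	if (idlest->idle_cpus > cand->idle_cpus)
		return false;
	if (idlest->idle_cpus == cand->idle_cpus &&
	    idlest->group_util <= cand->group_util)
		return false;
	return true;
}

int main(void)
{
	struct idlest_stats cur  = { 0, 2, 400 };	/* has spare, 2 idle CPUs */
	struct idlest_stats cand = { 0, 2, 250 };	/* same, but less util    */

	printf("%d\n", pick_idlest(&cand, &cur)); /* 1: lower util wins the tie */
	return 0;
}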
9229
+
9230
+/*
9231
+ * find_idlest_group() finds and returns the least busy CPU group within the
9232
+ * domain.
9233
+ *
9234
+ * Assumes p is allowed on at least one CPU in sd.
9235
+ */
9236
+static struct sched_group *
9237
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9238
+{
9239
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9240
+ struct sg_lb_stats local_sgs, tmp_sgs;
9241
+ struct sg_lb_stats *sgs;
9242
+ unsigned long imbalance;
9243
+ struct sg_lb_stats idlest_sgs = {
9244
+ .avg_load = UINT_MAX,
9245
+ .group_type = group_overloaded,
9246
+ };
9247
+
9248
+ imbalance = scale_load_down(NICE_0_LOAD) *
9249
+ (sd->imbalance_pct-100) / 100;
9250
+
9251
+ do {
9252
+ int local_group;
9253
+
9254
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9255
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9256
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9257
+ int level = rockchip_perf_get_level();
9258
+
9259
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9260
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9261
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9262
+ continue;
9263
+ }
9264
+
9265
+ /* Skip over this group if it has no CPUs allowed */
9266
+ if (!cpumask_intersects(sched_group_span(group),
9267
+ p->cpus_ptr))
9268
+ continue;
9269
+
9270
+ local_group = cpumask_test_cpu(this_cpu,
9271
+ sched_group_span(group));
9272
+
9273
+ if (local_group) {
9274
+ sgs = &local_sgs;
9275
+ local = group;
9276
+ } else {
9277
+ sgs = &tmp_sgs;
9278
+ }
9279
+
9280
+ update_sg_wakeup_stats(sd, group, sgs, p);
9281
+
9282
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9283
+ idlest = group;
9284
+ idlest_sgs = *sgs;
9285
+ }
9286
+
9287
+ } while (group = group->next, group != sd->groups);
9288
+
9289
+
9290
+ /* There is no idlest group to push tasks to */
9291
+ if (!idlest)
9292
+ return NULL;
9293
+
9294
+ /* The local group has been skipped because of CPU affinity */
9295
+ if (!local)
9296
+ return idlest;
9297
+
9298
+ /*
9299
+ * If the local group is idler than the selected idlest group
9300
+ * don't try and push the task.
9301
+ */
9302
+ if (local_sgs.group_type < idlest_sgs.group_type)
9303
+ return NULL;
9304
+
9305
+ /*
9306
+ * If the local group is busier than the selected idlest group
9307
+ * try and push the task.
9308
+ */
9309
+ if (local_sgs.group_type > idlest_sgs.group_type)
9310
+ return idlest;
9311
+
9312
+ switch (local_sgs.group_type) {
9313
+ case group_overloaded:
9314
+ case group_fully_busy:
9315
+ /*
9316
+ * When comparing groups across NUMA domains, it's possible for
9317
+ * the local domain to be very lightly loaded relative to the
9318
+ * remote domains but "imbalance" skews the comparison making
9319
+ * remote CPUs look much more favourable. When considering
9320
+ * cross-domain, add imbalance to the load on the remote node
9321
+ * and consider staying local.
9322
+ */
9323
+
9324
+ if ((sd->flags & SD_NUMA) &&
9325
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9326
+ return NULL;
9327
+
9328
+ /*
9329
+ * If the local group is less loaded than the selected
9330
+ * idlest group don't try and push any tasks.
9331
+ */
9332
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9333
+ return NULL;
9334
+
9335
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9336
+ return NULL;
9337
+ break;
9338
+
9339
+ case group_imbalanced:
9340
+ case group_asym_packing:
9341
+ /* Those types are not used in the slow wakeup path */
9342
+ return NULL;
9343
+
9344
+ case group_misfit_task:
9345
+ /* Select group with the highest max capacity */
9346
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9347
+ return NULL;
9348
+ break;
9349
+
9350
+ case group_has_spare:
9351
+ if (sd->flags & SD_NUMA) {
9352
+#ifdef CONFIG_NUMA_BALANCING
9353
+ int idlest_cpu;
9354
+ /*
9355
+ * If there is spare capacity at NUMA, try to select
9356
+ * the preferred node
9357
+ */
9358
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9359
+ return NULL;
9360
+
9361
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9362
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9363
+ return idlest;
9364
+#endif
9365
+ /*
9366
+ * Otherwise, keep the task on this node to stay close
9367
+ * to its wakeup source and improve locality. If there is
9368
+ * a real need of migration, periodic load balance will
9369
+ * take care of it.
9370
+ */
9371
+ if (local_sgs.idle_cpus)
9372
+ return NULL;
9373
+ }
9374
+
9375
+ /*
9376
+ * Select group with highest number of idle CPUs. We could also
9377
+ * compare the utilization which is more stable but it can end
9378
+ * up that the group has less spare capacity but finally more
9379
+ * idle CPUs which means more opportunity to run tasks.
9380
+ */
9381
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9382
+ return NULL;
9383
+ break;
9384
+ }
9385
+
9386
+ return idlest;
9387
+}
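For the overloaded and fully-busy cases above, the task is only pushed away when the idlest group is clearly lighter than the local one. A standalone restatement of those avg_load checks, with the margin computed the same way (NICE_0_LOAD taken as 1024 and imbalance_pct as 117 are assumptions):

#include <stdio.h>
#include <stdbool.h>

#define NICE_0_LOAD 1024UL

/* true: keep the task local (the "return NULL" paths in the code above) */
static bool stay_local(unsigned long local_avg_load,
		       unsigned long idlest_avg_load,
		       unsigned int imbalance_pct, bool numa)
{
	unsigned long margin = NICE_0_LOAD * (imbalance_pct - 100) / 100;

	if (numa && idlest_avg_load + margin >= local_avg_load)
		return true;	/* cross-node move not clearly worth it */
	if (idlest_avg_load >= local_avg_load + margin)
		return true;	/* idlest not lighter by a full margin */
	if (100 * local_avg_load <= imbalance_pct * idlest_avg_load)
		return true;	/* relative difference below imbalance_pct */
	return false;
}

int main(void)
{
	/* margin = 1024 * 17 / 100 = 174 */
	printf("%d\n", stay_local(1500, 1400, 117, false)); /* 1: too close */
	printf("%d\n", stay_local(1500, 1100, 117, false)); /* 0: push away */
	return 0;
}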
9388
+
91399389 /**
91409390 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91419391 * @env: The load balancing environment.
91429392 * @sds: variable to hold the statistics for this sched_domain.
91439393 */
9394
+
91449395 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91459396 {
91469397 struct sched_domain *child = env->sd->child;
91479398 struct sched_group *sg = env->sd->groups;
91489399 struct sg_lb_stats *local = &sds->local_stat;
91499400 struct sg_lb_stats tmp_sgs;
9150
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91519401 int sg_status = 0;
91529402
91539403 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9174,22 +9424,6 @@
91749424 if (local_group)
91759425 goto next_group;
91769426
9177
- /*
9178
- * In case the child domain prefers tasks go to siblings
9179
- * first, lower the sg capacity so that we'll try
9180
- * and move all the excess tasks away. We lower the capacity
9181
- * of a group only if the local group has the capacity to fit
9182
- * these excess tasks. The extra check prevents the case where
9183
- * you always pull from the heaviest group when it is already
9184
- * under-utilized (possible with a large weight task outweighs
9185
- * the tasks on the system).
9186
- */
9187
- if (prefer_sibling && sds->local &&
9188
- group_has_capacity(env, local) &&
9189
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9190
- sgs->group_no_capacity = 1;
9191
- sgs->group_type = group_classify(sg, sgs);
9192
- }
91939427
91949428 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91959429 sds->busiest = sg;
....@@ -9198,12 +9432,14 @@
91989432
91999433 next_group:
92009434 /* Now, start updating sd_lb_stats */
9201
- sds->total_running += sgs->sum_nr_running;
92029435 sds->total_load += sgs->group_load;
92039436 sds->total_capacity += sgs->group_capacity;
92049437
92059438 sg = sg->next;
92069439 } while (sg != env->sd->groups);
9440
+
9441
+ /* Tag domain that child domain prefers tasks go to siblings first */
9442
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92079443
92089444 #ifdef CONFIG_NO_HZ_COMMON
92099445 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9217,8 +9453,6 @@
92179453 if (env->sd->flags & SD_NUMA)
92189454 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92199455
9220
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9221
-
92229456 if (!env->sd->parent) {
92239457 struct root_domain *rd = env->dst_rq->rd;
92249458
....@@ -9227,144 +9461,28 @@
92279461
92289462 /* Update over-utilization (tipping point, U >= 0) indicator */
92299463 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9230
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9464
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92319465 } else if (sg_status & SG_OVERUTILIZED) {
9232
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9233
- trace_sched_overutilized(1);
9234
- }
9466
+ struct root_domain *rd = env->dst_rq->rd;
92359467
9468
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9469
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9470
+ }
92369471 }
92379472
9238
-/**
9239
- * check_asym_packing - Check to see if the group is packed into the
9240
- * sched domain.
9241
- *
9242
- * This is primarily intended to used at the sibling level. Some
9243
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9244
- * case of POWER7, it can move to lower SMT modes only when higher
9245
- * threads are idle. When in lower SMT modes, the threads will
9246
- * perform better since they share less core resources. Hence when we
9247
- * have idle threads, we want them to be the higher ones.
9248
- *
9249
- * This packing function is run on idle threads. It checks to see if
9250
- * the busiest CPU in this domain (core in the P7 case) has a higher
9251
- * CPU number than the packing function is being run on. Here we are
9252
- * assuming lower CPU number will be equivalent to lower a SMT thread
9253
- * number.
9254
- *
9255
- * Return: 1 when packing is required and a task should be moved to
9256
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9257
- *
9258
- * @env: The load balancing environment.
9259
- * @sds: Statistics of the sched_domain which is to be packed
9260
- */
9261
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9473
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92629474 {
9263
- int busiest_cpu;
9264
-
9265
- if (!(env->sd->flags & SD_ASYM_PACKING))
9266
- return 0;
9267
-
9268
- if (env->idle == CPU_NOT_IDLE)
9269
- return 0;
9270
-
9271
- if (!sds->busiest)
9272
- return 0;
9273
-
9274
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9275
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9276
- return 0;
9277
-
9278
- env->imbalance = DIV_ROUND_CLOSEST(
9279
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9280
- SCHED_CAPACITY_SCALE);
9281
-
9282
- return 1;
9283
-}
9284
-
9285
-/**
9286
- * fix_small_imbalance - Calculate the minor imbalance that exists
9287
- * amongst the groups of a sched_domain, during
9288
- * load balancing.
9289
- * @env: The load balancing environment.
9290
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9291
- */
9292
-static inline
9293
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9294
-{
9295
- unsigned long tmp, capa_now = 0, capa_move = 0;
9296
- unsigned int imbn = 2;
9297
- unsigned long scaled_busy_load_per_task;
9298
- struct sg_lb_stats *local, *busiest;
9299
-
9300
- local = &sds->local_stat;
9301
- busiest = &sds->busiest_stat;
9302
-
9303
- if (!local->sum_nr_running)
9304
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9305
- else if (busiest->load_per_task > local->load_per_task)
9306
- imbn = 1;
9307
-
9308
- scaled_busy_load_per_task =
9309
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9310
- busiest->group_capacity;
9311
-
9312
- if (busiest->avg_load + scaled_busy_load_per_task >=
9313
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9314
- env->imbalance = busiest->load_per_task;
9315
- return;
9316
- }
9475
+ unsigned int imbalance_min;
93179476
93189477 /*
9319
- * OK, we don't have enough imbalance to justify moving tasks,
9320
- * however we may be able to increase total CPU capacity used by
9321
- * moving them.
9478
+ * Allow a small imbalance based on a simple pair of communicating
9479
+ * tasks that remain local when the source domain is almost idle.
93229480 */
9481
+ imbalance_min = 2;
9482
+ if (nr_running <= imbalance_min)
9483
+ return 0;
93239484
9324
- capa_now += busiest->group_capacity *
9325
- min(busiest->load_per_task, busiest->avg_load);
9326
- capa_now += local->group_capacity *
9327
- min(local->load_per_task, local->avg_load);
9328
- capa_now /= SCHED_CAPACITY_SCALE;
9329
-
9330
- /* Amount of load we'd subtract */
9331
- if (busiest->avg_load > scaled_busy_load_per_task) {
9332
- capa_move += busiest->group_capacity *
9333
- min(busiest->load_per_task,
9334
- busiest->avg_load - scaled_busy_load_per_task);
9335
- }
9336
-
9337
- /* Amount of load we'd add */
9338
- if (busiest->avg_load * busiest->group_capacity <
9339
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9340
- tmp = (busiest->avg_load * busiest->group_capacity) /
9341
- local->group_capacity;
9342
- } else {
9343
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9344
- local->group_capacity;
9345
- }
9346
- capa_move += local->group_capacity *
9347
- min(local->load_per_task, local->avg_load + tmp);
9348
- capa_move /= SCHED_CAPACITY_SCALE;
9349
-
9350
- /* Move if we gain throughput */
9351
- if (capa_move > capa_now) {
9352
- env->imbalance = busiest->load_per_task;
9353
- return;
9354
- }
9355
-
9356
- /* We can't see throughput improvement with the load-based
9357
- * method, but it is possible depending upon group size and
9358
- * capacity range that there might still be an underutilized
9359
- * cpu available in an asymmetric capacity system. Do one last
9360
- * check just in case.
9361
- */
9362
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9363
- busiest->group_type == group_overloaded &&
9364
- busiest->sum_nr_running > busiest->group_weight &&
9365
- local->sum_nr_running < local->group_weight &&
9366
- local->group_capacity < busiest->group_capacity)
9367
- env->imbalance = busiest->load_per_task;
9485
+ return imbalance;
93689486 }
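adjust_numa_imbalance() above simply forgives a small imbalance when the source node is nearly idle, so a communicating pair of tasks is not split across nodes. A trivial restatement with example values:

#include <stdio.h>

/* Keep up to a pair of tasks local on an almost-idle source node. */
static long adjust_numa_imbalance_toy(long imbalance, int src_nr_running)
{
	const int imbalance_min = 2;

	if (src_nr_running <= imbalance_min)
		return 0;
	return imbalance;
}

int main(void)
{
	printf("%ld\n", adjust_numa_imbalance_toy(3, 2)); /* 0: leave them be     */
	printf("%ld\n", adjust_numa_imbalance_toy(3, 6)); /* 3: balance as usual  */
	return 0;
}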
93699487
93709488 /**
....@@ -9375,96 +9493,180 @@
93759493 */
93769494 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93779495 {
9378
- unsigned long max_pull, load_above_capacity = ~0UL;
93799496 struct sg_lb_stats *local, *busiest;
93809497
93819498 local = &sds->local_stat;
93829499 busiest = &sds->busiest_stat;
93839500
9501
+ if (busiest->group_type == group_misfit_task) {
9502
+ /* Set imbalance to allow misfit tasks to be balanced. */
9503
+ env->migration_type = migrate_misfit;
9504
+ env->imbalance = 1;
9505
+ return;
9506
+ }
9507
+
9508
+ if (busiest->group_type == group_asym_packing) {
9509
+ /*
9510
+ * In case of asym capacity, we will try to migrate all load to
9511
+ * the preferred CPU.
9512
+ */
9513
+ env->migration_type = migrate_task;
9514
+ env->imbalance = busiest->sum_h_nr_running;
9515
+ return;
9516
+ }
9517
+
93849518 if (busiest->group_type == group_imbalanced) {
93859519 /*
93869520 * In the group_imb case we cannot rely on group-wide averages
9387
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9521
+ * to ensure CPU-load equilibrium, try to move any task to fix
9522
+ * the imbalance. The next load balance will take care of
9523
+ * balancing back the system.
93889524 */
9389
- busiest->load_per_task =
9390
- min(busiest->load_per_task, sds->avg_load);
9525
+ env->migration_type = migrate_task;
9526
+ env->imbalance = 1;
9527
+ return;
93919528 }
93929529
93939530 /*
9394
- * Avg load of busiest sg can be less and avg load of local sg can
9395
- * be greater than avg load across all sgs of sd because avg load
9396
- * factors in sg capacity and sgs with smaller group_type are
9397
- * skipped when updating the busiest sg:
9531
+ * Try to use spare capacity of local group without overloading it or
9532
+ * emptying busiest.
93989533 */
9399
- if (busiest->group_type != group_misfit_task &&
9400
- (busiest->avg_load <= sds->avg_load ||
9401
- local->avg_load >= sds->avg_load)) {
9402
- env->imbalance = 0;
9403
- return fix_small_imbalance(env, sds);
9534
+ if (local->group_type == group_has_spare) {
9535
+ if ((busiest->group_type > group_fully_busy) &&
9536
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9537
+ /*
9538
+ * If busiest is overloaded, try to fill spare
9539
+ * capacity. This might end up creating spare capacity
9540
+ * in busiest or busiest still being overloaded but
9541
+ * there is no simple way to directly compute the
9542
+ * amount of load to migrate in order to balance the
9543
+ * system.
9544
+ */
9545
+ env->migration_type = migrate_util;
9546
+ env->imbalance = max(local->group_capacity, local->group_util) -
9547
+ local->group_util;
9548
+
9549
+ /*
9550
+ * In some cases, the group's utilization is max or even
9551
+ * higher than capacity because of migrations but the
9552
+ * local CPU is (newly) idle. There is at least one
9553
+ * waiting task in this overloaded busiest group. Let's
9554
+ * try to pull it.
9555
+ */
9556
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9557
+ env->migration_type = migrate_task;
9558
+ env->imbalance = 1;
9559
+ }
9560
+
9561
+ return;
9562
+ }
9563
+
9564
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9565
+ unsigned int nr_diff = busiest->sum_nr_running;
9566
+ /*
9567
+ * When prefer sibling, evenly spread running tasks on
9568
+ * groups.
9569
+ */
9570
+ env->migration_type = migrate_task;
9571
+ lsub_positive(&nr_diff, local->sum_nr_running);
9572
+ env->imbalance = nr_diff >> 1;
9573
+ } else {
9574
+
9575
+ /*
9576
+ * If there is no overload, we just want to even the number of
9577
+ * idle cpus.
9578
+ */
9579
+ env->migration_type = migrate_task;
9580
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9581
+ busiest->idle_cpus) >> 1);
9582
+ }
9583
+
9584
+ /* Consider allowing a small imbalance between NUMA groups */
9585
+ if (env->sd->flags & SD_NUMA)
9586
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9587
+ busiest->sum_nr_running);
9588
+
9589
+ return;
94049590 }
94059591
94069592 /*
9407
- * If there aren't any idle CPUs, avoid creating some.
9593
+ * Local is fully busy but has to take more load to relieve the
9594
+ * busiest group
94089595 */
9409
- if (busiest->group_type == group_overloaded &&
9410
- local->group_type == group_overloaded) {
9411
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9412
- if (load_above_capacity > busiest->group_capacity) {
9413
- load_above_capacity -= busiest->group_capacity;
9414
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9415
- load_above_capacity /= busiest->group_capacity;
9416
- } else
9417
- load_above_capacity = ~0UL;
9596
+ if (local->group_type < group_overloaded) {
9597
+ /*
9598
+ * Local will become overloaded so the avg_load metrics are
9599
+ * finally needed.
9600
+ */
9601
+
9602
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9603
+ local->group_capacity;
9604
+
9605
+ /*
9606
+ * If the local group is more loaded than the selected
9607
+ * busiest group don't try to pull any tasks.
9608
+ */
9609
+ if (local->avg_load >= busiest->avg_load) {
9610
+ env->imbalance = 0;
9611
+ return;
9612
+ }
9613
+
9614
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9615
+ sds->total_capacity;
9616
+
9617
+ /*
9618
+ * If the local group is more loaded than the average system
9619
+ * load, don't try to pull any tasks.
9620
+ */
9621
+ if (local->avg_load >= sds->avg_load) {
9622
+ env->imbalance = 0;
9623
+ return;
9624
+ }
9625
+
94189626 }
94199627
94209628 /*
9421
- * We're trying to get all the CPUs to the average_load, so we don't
9422
- * want to push ourselves above the average load, nor do we wish to
9423
- * reduce the max loaded CPU below the average load. At the same time,
9424
- * we also don't want to reduce the group load below the group
9425
- * capacity. Thus we look for the minimum possible imbalance.
9629
+ * Both groups are or will become overloaded and we're trying to get all
9630
+ * the CPUs to the average_load, so we don't want to push ourselves
9631
+ * above the average load, nor do we wish to reduce the max loaded CPU
9632
+ * below the average load. At the same time, we also don't want to
9633
+ * reduce the group load below the group capacity. Thus we look for
9634
+ * the minimum possible imbalance.
94269635 */
9427
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9428
-
9429
- /* How much load to actually move to equalise the imbalance */
9636
+ env->migration_type = migrate_load;
94309637 env->imbalance = min(
9431
- max_pull * busiest->group_capacity,
9638
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94329639 (sds->avg_load - local->avg_load) * local->group_capacity
94339640 ) / SCHED_CAPACITY_SCALE;
9434
-
9435
- /* Boost imbalance to allow misfit task to be balanced.
9436
- * Always do this if we are doing a NEWLY_IDLE balance
9437
- * on the assumption that any tasks we have must not be
9438
- * long-running (and hence we cannot rely upon load).
9439
- * However if we are not idle, we should assume the tasks
9440
- * we have are longer running and not override load-based
9441
- * calculations above unless we are sure that the local
9442
- * group is underutilized.
9443
- */
9444
- if (busiest->group_type == group_misfit_task &&
9445
- (env->idle == CPU_NEWLY_IDLE ||
9446
- local->sum_nr_running < local->group_weight)) {
9447
- env->imbalance = max_t(long, env->imbalance,
9448
- busiest->group_misfit_task_load);
9449
- }
9450
-
9451
- /*
9452
- * if *imbalance is less than the average load per runnable task
9453
- * there is no guarantee that any tasks will be moved so we'll have
9454
- * a think about bumping its value to force at least one task to be
9455
- * moved
9456
- */
9457
- if (env->imbalance < busiest->load_per_task)
9458
- return fix_small_imbalance(env, sds);
94599641 }
94609642
94619643 /******* find_busiest_group() helpers end here *********************/
9644
+
9645
+/*
9646
+ * Decision matrix according to the local and busiest group type:
9647
+ *
9648
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9649
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9650
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9651
+ * misfit_task force N/A N/A N/A force force
9652
+ * asym_packing force force N/A N/A force force
9653
+ * imbalanced force force N/A N/A force force
9654
+ * overloaded force force N/A N/A force avg_load
9655
+ *
9656
+ * N/A : Not Applicable because already filtered while updating
9657
+ * statistics.
9658
+ * balanced : The system is balanced for these 2 groups.
9659
+ * force : Calculate the imbalance as load migration is probably needed.
9660
+ * avg_load : Only if imbalance is significant enough.
9661
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9662
+ * different in groups.
9663
+ */
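The matrix reads as a straight lookup: pick the row for the busiest group, the column for the local group. An illustrative encoding as a C table follows; the enum labels mirror the names in the comment but are not the kernel's group_type values.

#include <stdio.h>

/* Illustrative encoding of the decision matrix above. */
enum gtype { HAS_SPARE, FULLY_BUSY, MISFIT, ASYM, IMBALANCED, OVERLOADED, NR_GTYPE };

static const char * const decision[NR_GTYPE][NR_GTYPE] = {
	/* busiest rows, local columns: has_spare, fully_busy, misfit, asym, imbalanced, overloaded */
	[HAS_SPARE]  = { "nr_idle", "balanced", "N/A", "N/A", "balanced", "balanced" },
	[FULLY_BUSY] = { "nr_idle", "nr_idle",  "N/A", "N/A", "balanced", "balanced" },
	[MISFIT]     = { "force",   "N/A",      "N/A", "N/A", "force",    "force"    },
	[ASYM]       = { "force",   "force",    "N/A", "N/A", "force",    "force"    },
	[IMBALANCED] = { "force",   "force",    "N/A", "N/A", "force",    "force"    },
	[OVERLOADED] = { "force",   "force",    "N/A", "N/A", "force",    "avg_load" },
};

int main(void)
{
	printf("%s\n", decision[OVERLOADED][OVERLOADED]);	/* avg_load */
	printf("%s\n", decision[MISFIT][HAS_SPARE]);		/* force    */
	return 0;
}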
94629664
94639665 /**
94649666 * find_busiest_group - Returns the busiest group within the sched_domain
94659667 * if there is an imbalance.
94669668 *
9467
- * Also calculates the amount of weighted load which should be moved
9669
+ * Also calculates the amount of runnable load which should be moved
94689670 * to restore balance.
94699671 *
94709672 * @env: The load balancing environment.
....@@ -9479,32 +9681,36 @@
94799681 init_sd_lb_stats(&sds);
94809682
94819683 /*
9482
- * Compute the various statistics relavent for load balancing at
9684
+ * Compute the various statistics relevant for load balancing at
94839685 * this level.
94849686 */
94859687 update_sd_lb_stats(env, &sds);
94869688
9487
- if (static_branch_unlikely(&sched_energy_present)) {
9689
+ if (sched_energy_enabled()) {
94889690 struct root_domain *rd = env->dst_rq->rd;
9691
+ int out_balance = 1;
94899692
9490
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9693
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9694
+ &out_balance);
9695
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9696
+ && out_balance)
94919697 goto out_balanced;
94929698 }
94939699
94949700 local = &sds.local_stat;
94959701 busiest = &sds.busiest_stat;
94969702
9497
- /* ASYM feature bypasses nice load balance check */
9498
- if (check_asym_packing(env, &sds))
9499
- return sds.busiest;
9500
-
95019703 /* There is no busy sibling group to pull tasks from */
9502
- if (!sds.busiest || busiest->sum_nr_running == 0)
9704
+ if (!sds.busiest)
95039705 goto out_balanced;
95049706
9505
- /* XXX broken for overlapping NUMA groups */
9506
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9507
- / sds.total_capacity;
9707
+ /* Misfit tasks should be dealt with regardless of the avg load */
9708
+ if (busiest->group_type == group_misfit_task)
9709
+ goto force_balance;
9710
+
9711
+ /* ASYM feature bypasses nice load balance check */
9712
+ if (busiest->group_type == group_asym_packing)
9713
+ goto force_balance;
95089714
95099715 /*
95109716 * If the busiest group is imbalanced the below checks don't
....@@ -9515,55 +9721,80 @@
95159721 goto force_balance;
95169722
95179723 /*
9518
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9519
- * capacities from resulting in underutilization due to avg_load.
9520
- */
9521
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9522
- busiest->group_no_capacity)
9523
- goto force_balance;
9524
-
9525
- /* Misfit tasks should be dealt with regardless of the avg load */
9526
- if (busiest->group_type == group_misfit_task)
9527
- goto force_balance;
9528
-
9529
- /*
95309724 * If the local group is busier than the selected busiest group
95319725 * don't try and pull any tasks.
95329726 */
9533
- if (local->avg_load >= busiest->avg_load)
9727
+ if (local->group_type > busiest->group_type)
95349728 goto out_balanced;
95359729
95369730 /*
9537
- * Don't pull any tasks if this group is already above the domain
9538
- * average load.
9731
+ * When groups are overloaded, use the avg_load to ensure fairness
9732
+ * between tasks.
95399733 */
9540
- if (local->avg_load >= sds.avg_load)
9541
- goto out_balanced;
9542
-
9543
- if (env->idle == CPU_IDLE) {
9734
+ if (local->group_type == group_overloaded) {
95449735 /*
9545
- * This CPU is idle. If the busiest group is not overloaded
9546
- * and there is no imbalance between this and busiest group
9547
- * wrt idle CPUs, it is balanced. The imbalance becomes
9548
- * significant if the diff is greater than 1 otherwise we
9549
- * might end up to just move the imbalance on another group
9736
+ * If the local group is more loaded than the selected
9737
+ * busiest group don't try to pull any tasks.
95509738 */
9551
- if ((busiest->group_type != group_overloaded) &&
9552
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9739
+ if (local->avg_load >= busiest->avg_load)
95539740 goto out_balanced;
9554
- } else {
9741
+
9742
+ /* XXX broken for overlapping NUMA groups */
9743
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9744
+ sds.total_capacity;
9745
+
95559746 /*
9556
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9557
- * imbalance_pct to be conservative.
9747
+ * Don't pull any tasks if this group is already above the
9748
+ * domain average load.
9749
+ */
9750
+ if (local->avg_load >= sds.avg_load)
9751
+ goto out_balanced;
9752
+
9753
+ /*
9754
+ * If the busiest group is more loaded, use imbalance_pct to be
9755
+ * conservative.
95589756 */
95599757 if (100 * busiest->avg_load <=
95609758 env->sd->imbalance_pct * local->avg_load)
95619759 goto out_balanced;
95629760 }
95639761
9762
+ /* Try to move all excess tasks to child's sibling domain */
9763
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9764
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9765
+ goto force_balance;
9766
+
9767
+ if (busiest->group_type != group_overloaded) {
9768
+ if (env->idle == CPU_NOT_IDLE)
9769
+ /*
9770
+ * If the busiest group is not overloaded (and as a
9771
+ * result the local one too) but this CPU is already
9772
+ * busy, let another idle CPU try to pull task.
9773
+ */
9774
+ goto out_balanced;
9775
+
9776
+ if (busiest->group_weight > 1 &&
9777
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9778
+ /*
9779
+ * If the busiest group is not overloaded
9780
+ * and there is no imbalance between this and busiest
9781
+ * group wrt idle CPUs, it is balanced. The imbalance
9782
+ * becomes significant if the diff is greater than 1
9783
+ * otherwise we might end up just moving the imbalance
9784
+ * onto another group. Of course this applies only if
9785
+ * there is more than 1 CPU per group.
9786
+ */
9787
+ goto out_balanced;
9788
+
9789
+ if (busiest->sum_h_nr_running == 1)
9790
+ /*
9791
+ * busiest doesn't have any tasks waiting to run
9792
+ */
9793
+ goto out_balanced;
9794
+ }
9795
+
95649796 force_balance:
95659797 /* Looks like there is an imbalance. Compute it */
9566
- env->src_grp_type = busiest->group_type;
95679798 calculate_imbalance(env, &sds);
95689799 return env->imbalance ? sds.busiest : NULL;
95699800
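In the overloaded case above, the "conservative" test requires busiest to exceed local by the domain's imbalance_pct margin before any pull happens. A small sketch of that comparison, using a hypothetical imbalance_pct of 117; real domains carry their own value.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the "use imbalance_pct to be conservative"
 * test above: balance only if busiest exceeds local by the margin. */
static bool worth_pulling(unsigned long busiest_avg, unsigned long local_avg,
			  unsigned int imbalance_pct)
{
	return 100 * busiest_avg > imbalance_pct * local_avg;
}

int main(void)
{
	printf("%d\n", worth_pulling(1100, 1000, 117));	/* 0: within the margin */
	printf("%d\n", worth_pulling(1200, 1000, 117));	/* 1: imbalance is real */
	return 0;
}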
....@@ -9579,11 +9810,18 @@
95799810 struct sched_group *group)
95809811 {
95819812 struct rq *busiest = NULL, *rq;
9582
- unsigned long busiest_load = 0, busiest_capacity = 1;
9583
- int i;
9813
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9814
+ unsigned int busiest_nr = 0;
9815
+ int i, done = 0;
9816
+
9817
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9818
+ &busiest, &done);
9819
+ if (done)
9820
+ return busiest;
95849821
95859822 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9586
- unsigned long capacity, wl;
9823
+ unsigned long capacity, load, util;
9824
+ unsigned int nr_running;
95879825 enum fbq_type rt;
95889826
95899827 rq = cpu_rq(i);
....@@ -9611,20 +9849,8 @@
96119849 if (rt > env->fbq_type)
96129850 continue;
96139851
9614
- /*
9615
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9616
- * seek the "biggest" misfit task.
9617
- */
9618
- if (env->src_grp_type == group_misfit_task) {
9619
- if (rq->misfit_task_load > busiest_load) {
9620
- busiest_load = rq->misfit_task_load;
9621
- busiest = rq;
9622
- }
9623
-
9624
- continue;
9625
- }
9626
-
96279852 capacity = capacity_of(i);
9853
+ nr_running = rq->cfs.h_nr_running;
96289854
96299855 /*
96309856 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9634,35 +9860,77 @@
96349860 */
96359861 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96369862 capacity_of(env->dst_cpu) < capacity &&
9637
- rq->nr_running == 1)
9863
+ nr_running == 1)
96389864 continue;
96399865
9640
- wl = weighted_cpuload(rq);
9866
+ switch (env->migration_type) {
9867
+ case migrate_load:
9868
+ /*
9869
+ * When comparing with load imbalance, use cpu_load()
9870
+ * which is not scaled with the CPU capacity.
9871
+ */
9872
+ load = cpu_load(rq);
96419873
9642
- /*
9643
- * When comparing with imbalance, use weighted_cpuload()
9644
- * which is not scaled with the CPU capacity.
9645
- */
9874
+ if (nr_running == 1 && load > env->imbalance &&
9875
+ !check_cpu_capacity(rq, env->sd))
9876
+ break;
96469877
9647
- if (rq->nr_running == 1 && wl > env->imbalance &&
9648
- !check_cpu_capacity(rq, env->sd))
9649
- continue;
9878
+ /*
9879
+ * For the load comparisons with the other CPUs,
9880
+ * consider the cpu_load() scaled with the CPU
9881
+ * capacity, so that the load can be moved away
9882
+ * from the CPU that is potentially running at a
9883
+ * lower capacity.
9884
+ *
9885
+ * Thus we're looking for max(load_i / capacity_i),
9886
+ * crosswise multiplication to rid ourselves of the
9887
+ * division works out to:
9888
+ * load_i * capacity_j > load_j * capacity_i;
9889
+ * where j is our previous maximum.
9890
+ */
9891
+ if (load * busiest_capacity > busiest_load * capacity) {
9892
+ busiest_load = load;
9893
+ busiest_capacity = capacity;
9894
+ busiest = rq;
9895
+ }
9896
+ break;
96509897
9651
- /*
9652
- * For the load comparisons with the other CPU's, consider
9653
- * the weighted_cpuload() scaled with the CPU capacity, so
9654
- * that the load can be moved away from the CPU that is
9655
- * potentially running at a lower capacity.
9656
- *
9657
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9658
- * multiplication to rid ourselves of the division works out
9659
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9660
- * our previous maximum.
9661
- */
9662
- if (wl * busiest_capacity > busiest_load * capacity) {
9663
- busiest_load = wl;
9664
- busiest_capacity = capacity;
9665
- busiest = rq;
9898
+ case migrate_util:
9899
+ util = cpu_util(cpu_of(rq));
9900
+
9901
+ /*
9902
+ * Don't try to pull utilization from a CPU with one
9903
+ * running task. Whatever its utilization, we will fail
9904
+ * to detach the task.
9905
+ */
9906
+ if (nr_running <= 1)
9907
+ continue;
9908
+
9909
+ if (busiest_util < util) {
9910
+ busiest_util = util;
9911
+ busiest = rq;
9912
+ }
9913
+ break;
9914
+
9915
+ case migrate_task:
9916
+ if (busiest_nr < nr_running) {
9917
+ busiest_nr = nr_running;
9918
+ busiest = rq;
9919
+ }
9920
+ break;
9921
+
9922
+ case migrate_misfit:
9923
+ /*
9924
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9925
+ * simply seek the "biggest" misfit task.
9926
+ */
9927
+ if (rq->misfit_task_load > busiest_load) {
9928
+ busiest_load = rq->misfit_task_load;
9929
+ busiest = rq;
9930
+ }
9931
+
9932
+ break;
9933
+
96669934 }
96679935 }
96689936
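The migrate_load case keeps the runqueue with the highest load-to-capacity ratio, and the comment's cross-multiplication avoids the integer division entirely. The same comparison on plain numbers (values hypothetical):

#include <stdio.h>

/* Illustrative scan for max(load_i / capacity_i) using the same
 * cross-multiplication as the migrate_load case above, so no
 * integer division (and no rounding loss) is involved. */
int main(void)
{
	unsigned long load[]     = { 800, 600, 900 };	/* hypothetical values */
	unsigned long capacity[] = { 1024, 512, 1024 };
	unsigned long best_load = 0, best_cap = 1, best = 0;

	for (unsigned long i = 0; i < 3; i++) {
		/* load[i]/capacity[i] > best_load/best_cap, without dividing */
		if (load[i] * best_cap > best_load * capacity[i]) {
			best_load = load[i];
			best_cap = capacity[i];
			best = i;
		}
	}
	printf("busiest index: %lu\n", best);	/* 1: 600/512 is the highest ratio */
	return 0;
}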
....@@ -9675,21 +9943,25 @@
96759943 */
96769944 #define MAX_PINNED_INTERVAL 512
96779945
9678
-static int need_active_balance(struct lb_env *env)
9946
+static inline bool
9947
+asym_active_balance(struct lb_env *env)
9948
+{
9949
+ /*
9950
+ * ASYM_PACKING needs to force migrate tasks from busy but
9951
+ * lower priority CPUs in order to pack all tasks in the
9952
+ * highest priority CPUs.
9953
+ */
9954
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9955
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9956
+}
9957
+
9958
+static inline bool
9959
+voluntary_active_balance(struct lb_env *env)
96799960 {
96809961 struct sched_domain *sd = env->sd;
96819962
9682
- if (env->idle == CPU_NEWLY_IDLE) {
9683
-
9684
- /*
9685
- * ASYM_PACKING needs to force migrate tasks from busy but
9686
- * lower priority CPUs in order to pack all tasks in the
9687
- * highest priority CPUs.
9688
- */
9689
- if ((sd->flags & SD_ASYM_PACKING) &&
9690
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9691
- return 1;
9692
- }
9963
+ if (asym_active_balance(env))
9964
+ return 1;
96939965
96949966 /*
96959967 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9704,19 +9976,18 @@
97049976 return 1;
97059977 }
97069978
9707
- if (env->src_grp_type == group_misfit_task)
9979
+ if (env->migration_type == migrate_misfit)
97089980 return 1;
97099981
9710
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9711
- env->src_rq->cfs.h_nr_running == 1 &&
9712
- cpu_overutilized(env->src_cpu) &&
9713
- !cpu_overutilized(env->dst_cpu)) {
9714
- return 1;
9715
- }
9982
+ return 0;
9983
+}
97169984
9717
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9718
- return 1;
9985
+static int need_active_balance(struct lb_env *env)
9986
+{
9987
+ struct sched_domain *sd = env->sd;
97199988
9989
+ if (voluntary_active_balance(env))
9990
+ return 1;
97209991
97219992 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97229993 }
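After the refactor, active balancing triggers either for one of the "voluntary" reasons split out above or as an escalation once regular balancing has failed repeatedly. A compact user-space model of that decision; the struct and field names here are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the refactored decision above: active balance
 * is either "voluntary" (asym packing, reduced capacity, misfit) or an
 * escalation after repeated failed regular attempts. */
struct model_env {
	bool voluntary_reason;		/* any of the checks split out above */
	unsigned int nr_balance_failed;
	unsigned int cache_nice_tries;
};

static bool model_need_active_balance(const struct model_env *env)
{
	if (env->voluntary_reason)
		return true;
	return env->nr_balance_failed > env->cache_nice_tries + 2;
}

int main(void)
{
	struct model_env quiet = { false, 1, 1 };
	struct model_env stuck = { false, 4, 1 };

	printf("%d %d\n", model_need_active_balance(&quiet),
			  model_need_active_balance(&stuck));	/* 0 1 */
	return 0;
}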
....@@ -9726,7 +9997,17 @@
97269997 static int should_we_balance(struct lb_env *env)
97279998 {
97289999 struct sched_group *sg = env->sd->groups;
9729
- int cpu, balance_cpu = -1;
10000
+ int cpu;
10001
+
10002
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
10003
+ struct root_domain *rd = env->dst_rq->rd;
10004
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
10005
+ int level = rockchip_perf_get_level();
10006
+
10007
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
10008
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
10009
+ return 0;
10010
+ }
973010011
973110012 /*
973210013 * Ensure the balancing environment is consistent; can happen
....@@ -9747,18 +10028,12 @@
974710028 if (!idle_cpu(cpu))
974810029 continue;
974910030
9750
- balance_cpu = cpu;
9751
- break;
10031
+ /* Are we the first idle CPU? */
10032
+ return cpu == env->dst_cpu;
975210033 }
975310034
9754
- if (balance_cpu == -1)
9755
- balance_cpu = group_balance_cpu(sg);
9756
-
9757
- /*
9758
- * First idle CPU or the first CPU(busiest) in this sched group
9759
- * is eligible for doing load balancing at this and above domains.
9760
- */
9761
- return balance_cpu == env->dst_cpu;
10035
+ /* Are we the first CPU of this group ? */
10036
+ return group_balance_cpu(sg) == env->dst_cpu;
976210037 }
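The rewrite above lets the first idle CPU of the group claim the balancing work and falls back to the group's nominal balance CPU only when nobody is idle. A sketch of the same selection over a small array; the data layout is hypothetical.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the rewritten selection above: the first idle
 * CPU in the group wins; with no idle CPU, a designated fallback does. */
static bool model_should_we_balance(int dst_cpu, const bool *idle, int ncpus,
				    int group_balance_cpu)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		if (!idle[cpu])
			continue;
		return cpu == dst_cpu;		/* are we the first idle CPU? */
	}
	return group_balance_cpu == dst_cpu;	/* no idle CPU in the group   */
}

int main(void)
{
	bool idle[4] = { false, true, true, false };

	printf("%d\n", model_should_we_balance(1, idle, 4, 0));	/* 1: CPU1 is first idle */
	printf("%d\n", model_should_we_balance(2, idle, 4, 0));	/* 0: defer to CPU1      */
	return 0;
}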
976310038
976410039 /*
....@@ -9780,7 +10055,7 @@
978010055 .sd = sd,
978110056 .dst_cpu = this_cpu,
978210057 .dst_rq = this_rq,
9783
- .dst_grpmask = sched_group_span(sd->groups),
10058
+ .dst_grpmask = group_balance_mask(sd->groups),
978410059 .idle = idle,
978510060 .loop_break = sched_nr_migrate_break,
978610061 .cpus = cpus,
....@@ -9830,6 +10105,7 @@
983010105
983110106 more_balance:
983210107 rq_lock_irqsave(busiest, &rf);
10108
+ env.src_rq_rf = &rf;
983310109 update_rq_clock(busiest);
983410110
983510111 /*
....@@ -9882,7 +10158,7 @@
988210158 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
988310159
988410160 /* Prevent to re-select dst_cpu via env's CPUs */
9885
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
10161
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
988610162
988710163 env.dst_rq = cpu_rq(env.new_dst_cpu);
988810164 env.dst_cpu = env.new_dst_cpu;
....@@ -9909,7 +10185,7 @@
990910185
991010186 /* All tasks on this runqueue were pinned by CPU affinity */
991110187 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9912
- cpumask_clear_cpu(cpu_of(busiest), cpus);
10188
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
991310189 /*
991410190 * Attempting to continue load balancing at the current
991510191 * sched_domain level only makes sense if there are
....@@ -9936,8 +10212,7 @@
993610212 * excessive cache_hot migrations and active balances.
993710213 */
993810214 if (idle != CPU_NEWLY_IDLE)
9939
- if (env.src_grp_nr_running > 1)
9940
- sd->nr_balance_failed++;
10215
+ sd->nr_balance_failed++;
994110216
994210217 if (need_active_balance(&env)) {
994310218 unsigned long flags;
....@@ -9980,7 +10255,7 @@
998010255 } else
998110256 sd->nr_balance_failed = 0;
998210257
9983
- if (likely(!active_balance)) {
10258
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998410259 /* We were unbalanced, so reset the balancing interval */
998510260 sd->balance_interval = sd->min_interval;
998610261 } else {
....@@ -10023,18 +10298,18 @@
1002310298 ld_moved = 0;
1002410299
1002510300 /*
10026
- * idle_balance() disregards balance intervals, so we could repeatedly
10027
- * reach this code, which would lead to balance_interval skyrocketting
10028
- * in a short amount of time. Skip the balance_interval increase logic
10029
- * to avoid that.
10301
+ * newidle_balance() disregards balance intervals, so we could
10302
+ * repeatedly reach this code, which would lead to balance_interval
10303
+ * skyrocketing in a short amount of time. Skip the balance_interval
10304
+ * increase logic to avoid that.
1003010305 */
1003110306 if (env.idle == CPU_NEWLY_IDLE)
1003210307 goto out;
1003310308
1003410309 /* tune up the balancing interval */
10035
- if (((env.flags & LBF_ALL_PINNED) &&
10036
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10037
- (sd->balance_interval < sd->max_interval))
10310
+ if ((env.flags & LBF_ALL_PINNED &&
10311
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10312
+ sd->balance_interval < sd->max_interval)
1003810313 sd->balance_interval *= 2;
1003910314 out:
1004010315 return ld_moved;
....@@ -10050,6 +10325,15 @@
1005010325
1005110326 /* scale ms to jiffies */
1005210327 interval = msecs_to_jiffies(interval);
10328
+
10329
+ /*
10330
+ * Reduce likelihood of busy balancing at higher domains racing with
10331
+ * balancing at lower domains by preventing their balancing periods
10332
+ * from being multiples of each other.
10333
+ */
10334
+ if (cpu_busy)
10335
+ interval -= 1;
10336
+
1005310337 interval = clamp(interval, 1UL, max_load_balance_interval);
1005410338
1005510339 return interval;
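The added lines shave one jiffy off the interval on busy CPUs so that balance periods of nested domains stop being exact multiples of each other. A standalone model of the ms-to-jiffies scaling, the busy offset and the clamp; the HZ value is hypothetical.

#include <stdio.h>

#define MODEL_HZ 250UL	/* hypothetical CONFIG_HZ */

/* Illustrative model of the interval handling above: scale ms to
 * jiffies, knock one jiffy off the busy case so nested domain periods
 * are no longer exact multiples of each other, then clamp. */
static unsigned long model_balance_interval(unsigned long interval_ms, int cpu_busy,
					    unsigned long max_interval)
{
	unsigned long interval = interval_ms * MODEL_HZ / 1000;	/* ms -> jiffies */

	if (cpu_busy)
		interval -= 1;

	if (interval < 1)
		interval = 1;
	if (interval > max_interval)
		interval = max_interval;
	return interval;
}

int main(void)
{
	printf("%lu %lu\n", model_balance_interval(8, 0, 200),
			    model_balance_interval(8, 1, 200));	/* 2 1 */
	return 0;
}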
....@@ -10112,9 +10396,8 @@
1011210396 /* Search for an sd spanning us and the target CPU. */
1011310397 rcu_read_lock();
1011410398 for_each_domain(target_cpu, sd) {
10115
- if ((sd->flags & SD_LOAD_BALANCE) &&
10116
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10117
- break;
10399
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10400
+ break;
1011810401 }
1011910402
1012010403 if (likely(sd)) {
....@@ -10132,6 +10415,7 @@
1013210415 * about DST_PINNED.
1013310416 */
1013410417 .flags = LBF_DST_PINNED,
10418
+ .src_rq_rf = &rf,
1013510419 };
1013610420
1013710421 schedstat_inc(sd->alb_count);
....@@ -10167,7 +10451,7 @@
1016710451 */
1016810452 void update_max_interval(void)
1016910453 {
10170
- max_load_balance_interval = HZ*num_online_cpus()/10;
10454
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1017110455 }
1017210456
1017310457 /*
....@@ -10180,6 +10464,7 @@
1018010464 {
1018110465 int continue_balancing = 1;
1018210466 int cpu = rq->cpu;
10467
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018310468 unsigned long interval;
1018410469 struct sched_domain *sd;
1018510470 /* Earliest time when we have to do rebalance again */
....@@ -10187,6 +10472,10 @@
1018710472 int update_next_balance = 0;
1018810473 int need_serialize, need_decay = 0;
1018910474 u64 max_cost = 0;
10475
+
10476
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10477
+ if (!continue_balancing)
10478
+ return;
1019010479
1019110480 rcu_read_lock();
1019210481 for_each_domain(cpu, sd) {
....@@ -10202,9 +10491,6 @@
1020210491 }
1020310492 max_cost += sd->max_newidle_lb_cost;
1020410493
10205
- if (!(sd->flags & SD_LOAD_BALANCE))
10206
- continue;
10207
-
1020810494 /*
1020910495 * Stop the load balance at this level. There is another
1021010496 * CPU in our sched group which is doing load balancing more
....@@ -10216,7 +10502,7 @@
1021610502 break;
1021710503 }
1021810504
10219
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10505
+ interval = get_sd_balance_interval(sd, busy);
1022010506
1022110507 need_serialize = sd->flags & SD_SERIALIZE;
1022210508 if (need_serialize) {
....@@ -10232,9 +10518,10 @@
1023210518 * state even if we migrated tasks. Update it.
1023310519 */
1023410520 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10521
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023510522 }
1023610523 sd->last_balance = jiffies;
10237
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10524
+ interval = get_sd_balance_interval(sd, busy);
1023810525 }
1023910526 if (need_serialize)
1024010527 spin_unlock(&balancing);
....@@ -10294,7 +10581,11 @@
1029410581
1029510582 static inline int find_new_ilb(void)
1029610583 {
10297
- int ilb;
10584
+ int ilb = -1;
10585
+
10586
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10587
+ if (ilb >= 0)
10588
+ return ilb;
1029810589
1029910590 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1030010591 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10325,29 +10616,25 @@
1032510616 if (ilb_cpu >= nr_cpu_ids)
1032610617 return;
1032710618
10619
+ /*
10620
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10621
+ * the first flag owns it; cleared by nohz_csd_func().
10622
+ */
1032810623 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032910624 if (flags & NOHZ_KICK_MASK)
1033010625 return;
1033110626
1033210627 /*
10333
- * Use smp_send_reschedule() instead of resched_cpu().
10334
- * This way we generate a sched IPI on the target CPU which
10628
+ * This way we generate an IPI on the target CPU which
1033510629 * is idle. And the softirq performing nohz idle load balance
1033610630 * will be run before returning from the IPI.
1033710631 */
10338
- smp_send_reschedule(ilb_cpu);
10632
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033910633 }
1034010634
1034110635 /*
10342
- * Current heuristic for kicking the idle load balancer in the presence
10343
- * of an idle cpu in the system.
10344
- * - This rq has more than one task.
10345
- * - This rq has at least one CFS task and the capacity of the CPU is
10346
- * significantly reduced because of RT tasks or IRQs.
10347
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10348
- * multiple busy cpu.
10349
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10350
- * domain span are idle.
10636
+ * Current decision point for kicking the idle load balancer in the presence
10637
+ * of idle CPUs in the system.
1035110638 */
1035210639 static void nohz_balancer_kick(struct rq *rq)
1035310640 {
....@@ -10356,6 +10643,7 @@
1035610643 struct sched_domain *sd;
1035710644 int nr_busy, i, cpu = rq->cpu;
1035810645 unsigned int flags = 0;
10646
+ int done = 0;
1035910647
1036010648 if (unlikely(rq->idle_balance))
1036110649 return;
....@@ -10380,30 +10668,25 @@
1038010668 if (time_before(now, nohz.next_balance))
1038110669 goto out;
1038210670
10383
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10671
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10672
+ if (done)
10673
+ goto out;
10674
+
10675
+ if (rq->nr_running >= 2) {
1038410676 flags = NOHZ_KICK_MASK;
1038510677 goto out;
1038610678 }
1038710679
1038810680 rcu_read_lock();
10389
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10390
- if (sds) {
10391
- /*
10392
- * XXX: write a coherent comment on why we do this.
10393
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10394
- */
10395
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10396
- if (nr_busy > 1) {
10397
- flags = NOHZ_KICK_MASK;
10398
- goto unlock;
10399
- }
10400
-
10401
- }
1040210681
1040310682 sd = rcu_dereference(rq->sd);
1040410683 if (sd) {
10405
- if ((rq->cfs.h_nr_running >= 1) &&
10406
- check_cpu_capacity(rq, sd)) {
10684
+ /*
10685
+ * If there's a CFS task and the current CPU has reduced
10686
+ * capacity; kick the ILB to see if there's a better CPU to run
10687
+ * on.
10688
+ */
10689
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040710690 flags = NOHZ_KICK_MASK;
1040810691 goto unlock;
1040910692 }
....@@ -10411,15 +10694,55 @@
1041110694
1041210695 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041310696 if (sd) {
10414
- for_each_cpu(i, sched_domain_span(sd)) {
10415
- if (i == cpu ||
10416
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10417
- continue;
10418
-
10697
+ /*
10698
+ * When ASYM_PACKING; see if there's a more preferred CPU
10699
+ * currently idle; in which case, kick the ILB to move tasks
10700
+ * around.
10701
+ */
10702
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041910703 if (sched_asym_prefer(i, cpu)) {
1042010704 flags = NOHZ_KICK_MASK;
1042110705 goto unlock;
1042210706 }
10707
+ }
10708
+ }
10709
+
10710
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10711
+ if (sd) {
10712
+ /*
10713
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10714
+ * to run the misfit task on.
10715
+ */
10716
+ if (check_misfit_status(rq, sd)) {
10717
+ flags = NOHZ_KICK_MASK;
10718
+ goto unlock;
10719
+ }
10720
+
10721
+ /*
10722
+ * For asymmetric systems, we do not want to nicely balance
10723
+ * cache use, instead we want to embrace asymmetry and only
10724
+ * ensure tasks have enough CPU capacity.
10725
+ *
10726
+ * Skip the LLC logic because it's not relevant in that case.
10727
+ */
10728
+ goto unlock;
10729
+ }
10730
+
10731
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10732
+ if (sds) {
10733
+ /*
10734
+ * If there is an imbalance between LLC domains (IOW we could
10735
+ * increase the overall cache use), we need some less-loaded LLC
10736
+ * domain to pull some load. Likewise, we may need to spread
10737
+ * load within the current LLC domain (e.g. packed SMT cores but
10738
+ * other CPUs are idle). We can't really know from here how busy
10739
+ * the others are - so just get a nohz balance going if it looks
10740
+ * like this LLC domain has tasks we could move.
10741
+ */
10742
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10743
+ if (nr_busy > 1) {
10744
+ flags = NOHZ_KICK_MASK;
10745
+ goto unlock;
1042310746 }
1042410747 }
1042510748 unlock:
....@@ -10485,9 +10808,20 @@
1048510808
1048610809 SCHED_WARN_ON(cpu != smp_processor_id());
1048710810
10488
- /* If this CPU is going down, then nothing needs to be done: */
10489
- if (!cpu_active(cpu))
10811
+ if (!cpu_active(cpu)) {
10812
+ /*
10813
+ * A CPU can be paused while it is idle with its tick
10814
+ * stopped. nohz_balance_exit_idle() should be called
10815
+ * from the local CPU, so it can't be called during
10816
+ * pause. This results in paused CPU participating in
10817
+ * the nohz idle balance, which should be avoided.
10818
+ *
10819
+ * When the paused CPU exits idle and enters again,
10820
+ * exempt the paused CPU from nohz_balance_exit_idle.
10821
+ */
10822
+ nohz_balance_exit_idle(rq);
1049010823 return;
10824
+ }
1049110825
1049210826 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049310827 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10600,7 +10934,6 @@
1060010934
1060110935 rq_lock_irqsave(rq, &rf);
1060210936 update_rq_clock(rq);
10603
- cpu_load_update_idle(rq);
1060410937 rq_unlock_irqrestore(rq, &rf);
1060510938
1060610939 if (flags & NOHZ_BALANCE_KICK)
....@@ -10650,22 +10983,14 @@
1065010983 */
1065110984 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065210985 {
10653
- int this_cpu = this_rq->cpu;
10654
- unsigned int flags;
10986
+ unsigned int flags = this_rq->nohz_idle_balance;
1065510987
10656
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10988
+ if (!flags)
1065710989 return false;
1065810990
10659
- if (idle != CPU_IDLE) {
10660
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10661
- return false;
10662
- }
10991
+ this_rq->nohz_idle_balance = 0;
1066310992
10664
- /*
10665
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10666
- */
10667
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10668
- if (!(flags & NOHZ_KICK_MASK))
10993
+ if (idle != CPU_IDLE)
1066910994 return false;
1067010995
1067110996 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10719,15 +11044,26 @@
1071911044 /*
1072011045 * idle_balance is called by schedule() if this_cpu is about to become
1072111046 * idle. Attempts to pull tasks from other CPUs.
11047
+ *
11048
+ * Returns:
11049
+ * < 0 - we released the lock and there are !fair tasks present
11050
+ * 0 - failed, no new tasks
11051
+ * > 0 - success, new (fair) tasks present
1072211052 */
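A hypothetical caller would fold the three documented ranges as below; this is a sketch of the convention only, and the real in-kernel call site is not part of this hunk.

#include <stdio.h>

/* Editorial sketch (not part of the patch): how a caller would fold the
 * three return ranges documented above. Names are illustrative only. */
enum pick_result { PICK_RETRY, PICK_NONE, PICK_AGAIN };

static enum pick_result model_handle_newidle(int new_tasks)
{
	if (new_tasks < 0)
		return PICK_RETRY;	/* lock was dropped, !fair tasks appeared */
	if (new_tasks > 0)
		return PICK_AGAIN;	/* fair tasks were pulled, pick again     */
	return PICK_NONE;		/* nothing new to run                     */
}

int main(void)
{
	printf("%d %d %d\n", model_handle_newidle(-1),
			     model_handle_newidle(0),
			     model_handle_newidle(2));	/* 0 1 2 */
	return 0;
}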
10723
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
11053
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072411054 {
1072511055 unsigned long next_balance = jiffies + HZ;
1072611056 int this_cpu = this_rq->cpu;
1072711057 struct sched_domain *sd;
1072811058 int pulled_task = 0;
1072911059 u64 curr_cost = 0;
11060
+ int done = 0;
1073011061
11062
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
11063
+ if (done)
11064
+ return pulled_task;
11065
+
11066
+ update_misfit_status(NULL, this_rq);
1073111067 /*
1073211068 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073311069 * measure the duration of idle_balance() as idle time.
....@@ -10769,9 +11105,6 @@
1076911105 for_each_domain(this_cpu, sd) {
1077011106 int continue_balancing = 1;
1077111107 u64 t0, domain_cost;
10772
-
10773
- if (!(sd->flags & SD_LOAD_BALANCE))
10774
- continue;
1077511108
1077611109 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077711110 update_next_balance(sd, &next_balance);
....@@ -10945,7 +11278,7 @@
1094511278 * 'current' within the tree based on its new key value.
1094611279 */
1094711280 swap(curr->vruntime, se->vruntime);
10948
- resched_curr_lazy(rq);
11281
+ resched_curr(rq);
1094911282 }
1095011283
1095111284 se->vruntime -= cfs_rq->min_vruntime;
....@@ -10962,6 +11295,9 @@
1096211295 if (!task_on_rq_queued(p))
1096311296 return;
1096411297
11298
+ if (rq->cfs.nr_running == 1)
11299
+ return;
11300
+
1096511301 /*
1096611302 * Reschedule if we are currently running on this runqueue and
1096711303 * our priority decreased, or if we are not currently running on
....@@ -10969,7 +11305,7 @@
1096911305 */
1097011306 if (rq->curr == p) {
1097111307 if (p->prio > oldprio)
10972
- resched_curr_lazy(rq);
11308
+ resched_curr(rq);
1097311309 } else
1097411310 check_preempt_curr(rq, p, 0);
1097511311 }
....@@ -11040,7 +11376,7 @@
1104011376 /* Catch up with the cfs_rq and remove our load when we leave */
1104111377 update_load_avg(cfs_rq, se, 0);
1104211378 detach_entity_load_avg(cfs_rq, se);
11043
- update_tg_load_avg(cfs_rq, false);
11379
+ update_tg_load_avg(cfs_rq);
1104411380 propagate_entity_cfs_rq(se);
1104511381 }
1104611382
....@@ -11058,8 +11394,8 @@
1105811394
1105911395 /* Synchronize entity with its cfs_rq */
1106011396 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11061
- attach_entity_load_avg(cfs_rq, se, 0);
11062
- update_tg_load_avg(cfs_rq, false);
11397
+ attach_entity_load_avg(cfs_rq, se);
11398
+ update_tg_load_avg(cfs_rq);
1106311399 propagate_entity_cfs_rq(se);
1106411400 }
1106511401
....@@ -11118,9 +11454,19 @@
1111811454 * This routine is mostly called to set cfs_rq->curr field when a task
1111911455 * migrates between groups/classes.
1112011456 */
11121
-static void set_curr_task_fair(struct rq *rq)
11457
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112211458 {
11123
- struct sched_entity *se = &rq->curr->se;
11459
+ struct sched_entity *se = &p->se;
11460
+
11461
+#ifdef CONFIG_SMP
11462
+ if (task_on_rq_queued(p)) {
11463
+ /*
11464
+ * Move the next running task to the front of the list, so our
11465
+ * cfs_tasks list becomes MRU one.
11466
+ */
11467
+ list_move(&se->group_node, &rq->cfs_tasks);
11468
+ }
11469
+#endif
1112411470
1112511471 for_each_sched_entity(se) {
1112611472 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11381,8 +11727,8 @@
1138111727 /*
1138211728 * All the scheduling class methods:
1138311729 */
11384
-const struct sched_class fair_sched_class = {
11385
- .next = &idle_sched_class,
11730
+const struct sched_class fair_sched_class
11731
+ __section("__fair_sched_class") = {
1138611732 .enqueue_task = enqueue_task_fair,
1138711733 .dequeue_task = dequeue_task_fair,
1138811734 .yield_task = yield_task_fair,
....@@ -11390,10 +11736,12 @@
1139011736
1139111737 .check_preempt_curr = check_preempt_wakeup,
1139211738
11393
- .pick_next_task = pick_next_task_fair,
11739
+ .pick_next_task = __pick_next_task_fair,
1139411740 .put_prev_task = put_prev_task_fair,
11741
+ .set_next_task = set_next_task_fair,
1139511742
1139611743 #ifdef CONFIG_SMP
11744
+ .balance = balance_fair,
1139711745 .select_task_rq = select_task_rq_fair,
1139811746 .migrate_task_rq = migrate_task_rq_fair,
1139911747
....@@ -11404,7 +11752,6 @@
1140411752 .set_cpus_allowed = set_cpus_allowed_common,
1140511753 #endif
1140611754
11407
- .set_curr_task = set_curr_task_fair,
1140811755 .task_tick = task_tick_fair,
1140911756 .task_fork = task_fork_fair,
1141011757
....@@ -11474,3 +11821,101 @@
1147411821 #endif /* SMP */
1147511822
1147611823 }
11824
+
11825
+/*
11826
+ * Helper functions to facilitate extracting info from tracepoints.
11827
+ */
11828
+
11829
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11830
+{
11831
+#ifdef CONFIG_SMP
11832
+ return cfs_rq ? &cfs_rq->avg : NULL;
11833
+#else
11834
+ return NULL;
11835
+#endif
11836
+}
11837
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11838
+
11839
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11840
+{
11841
+ if (!cfs_rq) {
11842
+ if (str)
11843
+ strlcpy(str, "(null)", len);
11844
+ else
11845
+ return NULL;
11846
+ }
11847
+
11848
+ cfs_rq_tg_path(cfs_rq, str, len);
11849
+ return str;
11850
+}
11851
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11852
+
11853
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11854
+{
11855
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11856
+}
11857
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11858
+
11859
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11860
+{
11861
+#ifdef CONFIG_SMP
11862
+ return rq ? &rq->avg_rt : NULL;
11863
+#else
11864
+ return NULL;
11865
+#endif
11866
+}
11867
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11868
+
11869
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11870
+{
11871
+#ifdef CONFIG_SMP
11872
+ return rq ? &rq->avg_dl : NULL;
11873
+#else
11874
+ return NULL;
11875
+#endif
11876
+}
11877
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11878
+
11879
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11880
+{
11881
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11882
+ return rq ? &rq->avg_irq : NULL;
11883
+#else
11884
+ return NULL;
11885
+#endif
11886
+}
11887
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11888
+
11889
+int sched_trace_rq_cpu(struct rq *rq)
11890
+{
11891
+ return rq ? cpu_of(rq) : -1;
11892
+}
11893
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11894
+
11895
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11896
+{
11897
+ return rq ?
11898
+#ifdef CONFIG_SMP
11899
+ rq->cpu_capacity
11900
+#else
11901
+ SCHED_CAPACITY_SCALE
11902
+#endif
11903
+ : -1;
11904
+}
11905
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11906
+
11907
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11908
+{
11909
+#ifdef CONFIG_SMP
11910
+ return rd ? rd->span : NULL;
11911
+#else
11912
+ return NULL;
11913
+#endif
11914
+}
11915
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11916
+
11917
+int sched_trace_rq_nr_running(struct rq *rq)
11918
+{
11919
+ return rq ? rq->nr_running : -1;
11920
+}
11921
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
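All of the helpers above follow one pattern: NULL-check the object, compile the field access in or out with the configuration, and fall back to a sentinel or a fixed scale. A user-space sketch of that accessor pattern; the types and names are invented for illustration.

#include <stdio.h>

/* Editorial sketch (not part of the patch): the accessor pattern used by
 * the helpers above - NULL-check the object, fall back to a sentinel,
 * and let a config switch compile the real field access in or out. */
#define MODEL_HAVE_CAPACITY 1

struct model_rq {
	int cpu;
	unsigned long cpu_capacity;
};

static int model_rq_cpu(const struct model_rq *rq)
{
	return rq ? rq->cpu : -1;
}

static long model_rq_capacity(const struct model_rq *rq)
{
#if MODEL_HAVE_CAPACITY
	return rq ? (long)rq->cpu_capacity : -1;
#else
	return rq ? 1024 : -1;	/* fixed scale when the field is compiled out */
#endif
}

int main(void)
{
	struct model_rq rq = { .cpu = 3, .cpu_capacity = 512 };

	printf("%d %ld\n", model_rq_cpu(&rq), model_rq_capacity(&rq));	/* 3 512 */
	printf("%d %ld\n", model_rq_cpu(NULL), model_rq_capacity(NULL));/* -1 -1 */
	return 0;
}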